def test_shard_bam_file():
    """Check that ``tenx.shard_bam_file`` splits a BAM file losslessly.

    A shard size equal to the total number of alignments must yield one
    shard; half that size must yield two shards whose alignments, read in
    shard order, equal those of the original file.
    """
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    alignment_file = tenx.read_bam_file(bam_path)
    assert isinstance(alignment_file, bs.AlignmentFile)

    expected_alignments = sum(1 for _ in alignment_file)

    # Shard size == total alignment count -> everything fits in one shard.
    with utils.TempDirectory() as location:
        single_shard = tenx.shard_bam_file(
            bam_path, expected_alignments, location)
        assert len(single_shard) == 1

    num_shards = 2
    shard_size = expected_alignments // num_shards
    with utils.TempDirectory() as location:
        shard_paths = tenx.shard_bam_file(bam_path, shard_size, location)
        assert len(shard_paths) == 2

        # No alignment may be dropped or duplicated across the shards.
        total_alignments = sum(
            sum(1 for _ in tenx.read_bam_file(shard_path))
            for shard_path in shard_paths)
        assert total_alignments == expected_alignments

        # Walk the original file in lockstep with the shards to check
        # that both content and order are preserved.
        whole_bam_file = tenx.read_bam_file(bam_path)
        for shard_path in shard_paths:
            for line in tenx.read_bam_file(shard_path):
                assert line == next(whole_bam_file)
def test_barcode_umi_seq_to_fasta():
    """Check the files written by ``tenx.barcode_umi_seq_to_fasta``.

    With the value 10 passed as the fourth positional argument
    (presumably the minimum UMI count per barcode -- confirm against
    ``tenx``), one ``*_bam2fasta.fasta`` file and eight ``*_meta.txt``
    files are expected in the output directory.
    """
    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    with utils.TempDirectory() as location:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=None,
            barcode_renamer=None,
            delimiter="X",
            bam_file=bam_file,
            temp_folder=location)
        # The per-barcode fasta paths are handed on as one
        # comma-separated string.
        joined_fastas = ",".join(temp_fastas)
        all_fastas_sorted = tenx.get_fastas_per_unique_barcodes(
            joined_fastas)

        tenx.barcode_umi_seq_to_fasta(
            location, "X", True, 10, location, all_fastas_sorted)

        fasta_pattern = os.path.join(location, "*_bam2fasta.fasta")
        assert len(glob.glob(fasta_pattern)) == 1

        meta_pattern = os.path.join(location, "*_meta.txt")
        assert len(glob.glob(meta_pattern)) == 8
def test_bam2fasta_valid_args():
    """Check that the bam2fasta parser yields the expected argument values.

    Every attribute of the parsed namespace must be a known argument and
    must carry the value expected for that specific argument (explicitly
    passed values and parser defaults alike).
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data(
            '10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)

        parser = bam2fasta_args.create_parser()
        cli_args = [
            '--filename', testdata1,
            '--min-umi-per-barcode', '10',
            '--write-barcode-meta-csv', csv_path,
            '--barcodes-file', barcodes_path,
            '--rename-10x-barcodes', renamer_path,
            '--save-fastas', fastas_dir,
        ]
        expected_args_vals = {
            "filename": testdata1,
            "min_umi_per_barcode": 10,
            "write_barcode_meta_csv": csv_path,
            "barcodes_file": barcodes_path,
            "rename_10x_barcodes": renamer_path,
            "save_fastas": fastas_dir,
            "processes": bam2fasta_args.DEFAULT_PROCESSES,
            "delimiter": bam2fasta_args.DEFAULT_DELIMITER,
            "line_count": bam2fasta_args.DEFAULT_LINE_COUNT,
        }

        parsed_args = vars(parser.parse_args(cli_args))
        for key, val in parsed_args.items():
            assert key in expected_args_vals, \
                "unexpected argument {!r}".format(key)
            # Compare against the expectation for *this* key; checking
            # membership in the pool of all expected values would let
            # swapped or otherwise wrong values slip through.
            assert val == expected_args_vals[key], \
                "argument {!r}: expected {!r}, got {!r}".format(
                    key, expected_args_vals[key], val)
def test_write_to_barcode_meta_csv():
    """Check the UMI and read counts written by
    ``tenx.write_to_barcode_meta_csv``.

    Each row of the resulting CSV is compared, by row index, against the
    expected UMI count and read count.
    """
    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    with utils.TempDirectory() as location:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=None,
            barcode_renamer=None,
            delimiter="X",
            bam_file=bam_file,
            temp_folder=location)
        joined_fastas = ",".join(temp_fastas)
        all_fastas_sorted = tenx.get_fastas_per_unique_barcodes(
            joined_fastas)
        tenx.barcode_umi_seq_to_fasta(
            location, "X", True, 0, location, all_fastas_sorted)

        meta_csv_path = os.path.join(location, "meta.csv")
        tenx.write_to_barcode_meta_csv(location, meta_csv_path)

        umi_counts = [6, 2, 6, 4, 15, 5, 2, 2]
        read_counts = [312, 36, 251, 194, 594, 153, 2, 68]
        meta_table = pd.read_csv(meta_csv_path)
        for index, row in meta_table.iterrows():
            assert umi_counts[index] == row[tenx.UMI_COUNT]
            assert read_counts[index] == row[tenx.READ_COUNT]
def test_run_bam2fasta_percell_no_shard():
    """Run ``cli.percell`` on the example fastq.gz and expect eight files."""
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')
        cli_args = ['--filename', testdata1, '--save-fastas', location]
        fasta_files = cli.percell(cli_args)
        print(fasta_files)

        barcodes = []
        for filename in fasta_files:
            barcodes.append(filename.replace(".fasta", ""))
        assert len(barcodes) == 8
def test_run_bam2fasta_convert():
    """Run ``cli.convert`` on the example BAM file and expect eight files."""
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.bam')
        cli_args = ['--filename', testdata1, '--save-fastas', location]
        fasta_files = cli.convert(cli_args)

        barcodes = []
        for filename in fasta_files:
            barcodes.append(filename.replace(".fasta", ""))
        assert len(barcodes) == 8
def test_run_bam2fasta_default_args():
    """Run ``bam2fasta percell`` via the shell with only ``--filename``.

    The command must exit with status 0 and leave eight files ending in
    ``_bam2fasta.fasta`` in the directory passed as ``in_directory``.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.bam')
        command = 'bam2fasta percell --filename ' + testdata1
        status, out, err = utils.run_shell_cmd(
            command, in_directory=location)
        assert status == 0

        # Only count the per-barcode fasta outputs; other files may be
        # present in the directory.
        barcodes = []
        for filename in os.listdir(location):
            if filename.endswith("_bam2fasta.fasta"):
                barcodes.append(filename.replace(".fasta", ""))
        assert len(barcodes) == 8
def test_write_fastq():
    """Round-trip fastq records through ``tenx.write_fastq``.

    Records read from the example fastq.gz are written to a new file and
    read back; the two record lists must compare equal.
    """
    path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    with utils.TempDirectory() as location:
        with screed.open(path) as source:
            records = list(source)

        write_path = os.path.join(location, "result.fastq")
        tenx.write_fastq(records, write_path)

        with screed.open(write_path) as written:
            records_written = list(written)
        assert records_written == records
def test_get_fastas_per_unique_barcodes():
    """Check that ``tenx.get_fastas_per_unique_barcodes`` returns eight
    entries for the example BAM file when barcodes and a renamer file
    are supplied.
    """
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    renamer_filename = utils.get_test_data(
        '10x-example/barcodes_renamer.tsv')
    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    barcodes = tenx.read_barcodes_file(barcodes_path)

    with utils.TempDirectory() as location:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=barcodes,
            barcode_renamer=renamer_filename,
            delimiter="X",
            bam_file=bam_file,
            temp_folder=location)
        # The fasta paths are passed on as one comma-separated string.
        joined_fastas = ",".join(temp_fastas)
        fastas_sorted = tenx.get_fastas_per_unique_barcodes(joined_fastas)
        assert len(fastas_sorted) == 8
def test_run_bam2fasta_supply_all_args():
    """Run ``bam2fasta percell`` via the shell with all options supplied.

    Expects exit status 0, nine lines in the barcode meta CSV, and a
    single output fasta named after the renamed barcode
    ``lung_epithelial_cell_AAATGCCCAAACTGCT-1`` whose records contain
    neither ``>`` nor the ``X`` delimiter in their sequences.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data(
            '10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        # NOTE: the intermediate directory lives next to the test data,
        # outside the managed temp directory, so it must be removed
        # explicitly (see the ``finally`` clause below).
        temp_fastas_dir = os.path.join(
            os.path.dirname(testdata1), "temp_fastas/")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)
        if not os.path.exists(temp_fastas_dir):
            os.makedirs(temp_fastas_dir)

        try:
            status, out, err = utils.run_shell_cmd(
                'bam2fasta percell --filename ' + testdata1 +
                ' --min-umi-per-barcode 10' +
                ' --write-barcode-meta-csv ' + csv_path +
                ' --save-intermediate-files ' + temp_fastas_dir +
                ' --barcodes-file ' + barcodes_path +
                ' --rename-10x-barcodes ' + renamer_path +
                ' --save-fastas ' + fastas_dir +
                " --processes 1",
                in_directory=location)
            assert status == 0

            with open(csv_path, 'rb') as f:
                data = [line.split() for line in f]
            assert len(data) == 9

            fasta_files = os.listdir(fastas_dir)
            barcodes = [
                filename.replace(".fasta", "") for filename in fasta_files]
            assert len(barcodes) == 1
            assert len(fasta_files) == 1
            assert barcodes[0] == \
                ('lung_epithelial_cell_AAATGCCCAAACTGCT-1_bam2fasta')

            fasta_file_name = os.path.join(fastas_dir, fasta_files[0])
            # Use the context manager so the reader is closed even when
            # an assertion fails part-way through the file.
            with screed.open(fasta_file_name) as records:
                for record in records:
                    assert record.name.startswith(
                        'lung_epithelial_cell_AAATGCCCAAACTGCT-1')
                    assert record.sequence.count(">") == 0
                    assert record.sequence.count("X") == 0
        finally:
            # Always clean up, otherwise a failing assertion would leave
            # the directory behind in the test-data folder.
            shutil.rmtree(temp_fastas_dir)
def test_make_per_cell_fastq_gzs():
    """Check the file names produced by ``tenx.make_per_cell_fastqs``.

    Every ``*.fastq.gz`` written to the output directory, stripped of
    its extension and the ``possorted_aligned__`` prefix, must be one of
    the barcodes read from the barcodes file.
    """
    path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(barcodes_file)

    with utils.TempDirectory() as location:
        outdir = os.path.join(location, "outdir")
        os.makedirs(outdir)
        tenx.make_per_cell_fastqs(
            path,
            outdir,
            "possorted_aligned_",
            "fastq.gz",
            bam2fasta_args.CELL_BARCODE_PATTERN,
            barcodes_file)

        fastq_paths = glob.glob(os.path.join(outdir, "*.fastq.gz"))
        for fastq_path in fastq_paths:
            base_name = os.path.basename(fastq_path)
            without_extension = base_name.replace(".fastq.gz", "")
            fasta_name = without_extension.replace(
                "possorted_aligned__", "")
            assert fasta_name in barcodes
def test_run_make_fastqs_percell():
    """Run ``bam2fasta make_fastqs_percell`` via the shell.

    The command must exit with status 0 and write exactly one
    ``*.fastq`` file into the ``--save-fastas`` directory.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')
        good_barcodes_path = utils.get_test_data(
            '10x-example/good_barcodes.csv')
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        fastas_dir = os.path.join(location, "fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)

        status, out, err = utils.run_shell_cmd(
            'bam2fasta make_fastqs_percell --filename ' + testdata1 +
            ' --barcodes-file ' + barcodes_path +
            " --barcodes-significant-umis-file " + good_barcodes_path +
            ' --save-fastas ' + fastas_dir,
            in_directory=location)
        assert status == 0

        # Join directory and pattern as separate components instead of
        # concatenating with a hard-coded "/" inside a one-argument join.
        fastqs = glob.glob(os.path.join(fastas_dir, "*.fastq"))
        assert len(fastqs) == 1, "fastas_dir is {}".format(fastas_dir)
def test_bam2fasta_valid_args():
    """Check that the bam2fasta parser yields the expected argument values.

    Every attribute of the parsed namespace must be a known argument and
    must carry the value expected for that specific argument (explicitly
    passed values and parser defaults alike).
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data(
            '10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        save_intermediate_files_dir = os.path.join(location, "temp_fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)

        parser = b2fa_args.create_parser()
        cli_args = [
            '--filename', testdata1,
            '--min-umi-per-barcode', '10',
            '--write-barcode-meta-csv', csv_path,
            '--barcodes-file', barcodes_path,
            '--rename-10x-barcodes', renamer_path,
            '--save-fastas', fastas_dir,
            '--save-intermediate-files', save_intermediate_files_dir,
        ]
        expected_args_vals = {
            "filename": testdata1,
            "min_umi_per_barcode": 10,
            "write_barcode_meta_csv": csv_path,
            "barcodes_file": barcodes_path,
            "rename_10x_barcodes": renamer_path,
            "save_fastas": fastas_dir,
            "barcodes_significant_umis_file": None,
            "channel_id": "",
            "output_format": "fastq",
            "processes": b2fa_args.DEFAULT_PROCESSES,
            "delimiter": b2fa_args.DEFAULT_DELIMITER,
            "shard_size": b2fa_args.DEFAULT_LINE_COUNT,
            "cell_barcode_pattern": b2fa_args.CELL_BARCODE_PATTERN,
            "molecular_barcode_pattern":
                b2fa_args.MOLECULAR_BARCODE_PATTERN,
            "save_intermediate_files": save_intermediate_files_dir,
        }

        parsed_args = vars(parser.parse_args(cli_args))
        for key, val in parsed_args.items():
            assert key in expected_args_vals, \
                "unexpected argument {!r}".format(key)
            # Compare against the expectation for *this* key; checking
            # membership in the pool of all expected values would let
            # swapped or otherwise wrong values slip through.
            assert val == expected_args_vals[key], \
                "argument {!r}: expected {!r}, got {!r}".format(
                    key, expected_args_vals[key], val)
def test_run_bam2fasta_fq_percell_no_shard_nonzero_umi():
    """Run ``cli.percell`` with ``--min-umi-per-barcode 10``.

    Exactly one output file is expected, and every line of the
    ground-truth sequence file must appear among the sequences of that
    output.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')
        cli_args = [
            '--filename', testdata1,
            '--save-fastas', location,
            '--min-umi-per-barcode', '10',
        ]
        fasta_files = cli.percell(cli_args)
        print(fasta_files)

        barcodes = []
        for filename in fasta_files:
            barcodes.append(filename.replace(".fastq", ""))
        assert len(barcodes) == 1

        with screed.open(fasta_files[0]) as records:
            sequences_fastq = [record.sequence for record in records]

        gt_data = utils.get_test_data(
            '10x-example/groundtruth_fasta_sequences.txt')
        with open(gt_data, "r") as gt_file:
            for index, line in enumerate(gt_file):
                assert line.strip() in sequences_fastq, \
                    "failed at index {}".format(index)
def test_run_count_umis_percell():
    """Run ``bam2fasta count_umis_percell`` via the shell.

    The command must exit with status 0, write eight lines to the
    barcode meta CSV and one line to the significant-UMI barcodes file.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        good_barcodes_path = os.path.join(location, "good_barcodes.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data(
            '10x-example/barcodes_renamer.tsv')

        command = (
            'bam2fasta count_umis_percell --filename ' + testdata1 +
            ' --min-umi-per-barcode 10' +
            ' --write-barcode-meta-csv ' + csv_path +
            ' --barcodes-file ' + barcodes_path +
            ' --rename-10x-barcodes ' + renamer_path +
            " --processes 1 " +
            "--barcodes-significant-umis-file " + good_barcodes_path)
        status, out, err = utils.run_shell_cmd(
            command, in_directory=location)
        assert status == 0

        with open(csv_path, 'rb') as meta_file:
            meta_lines = [line.split() for line in meta_file]
        assert len(meta_lines) == 8

        with open(good_barcodes_path, 'rb') as good_file:
            good_lines = [line.split() for line in good_file]
        assert len(good_lines) == 1
def test_count_umis_per_cell(expected_good_barcodes):
    """Check the two CSV files written by ``tenx.count_umis_per_cell``.

    The first column of the meta CSV must list the expected barcodes and
    the second column their expected counts; the first column of the
    second CSV must equal the ``expected_good_barcodes`` fixture.
    """
    path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    with utils.TempDirectory() as location:
        meta = os.path.join(location, "barcode_umi_meta.csv")
        good_barcodes = os.path.join(
            location, "barcodes_with_significant_umi_records.csv")
        # The positional 3 is presumably the minimum UMI count per
        # barcode -- confirm against ``tenx.count_umis_per_cell``.
        tenx.count_umis_per_cell(
            path,
            meta,
            bam2fasta_args.CELL_BARCODE_PATTERN,
            bam2fasta_args.MOLECULAR_BARCODE_PATTERN,
            3,
            good_barcodes)

        expected_meta = [6, 15, 2, 2, 5, 4, 6, 2]
        all_barcodes = [
            'AAAGATGCAGATCTGT-1',
            'AAATGCCCAAACTGCT-1',
            'AACACGTAGTGTACCT-1',
            'AACCATGAGTTGTCGT-1',
            'AAATGCCGTGAACCTT-1',
            'AAATGCCAGATAGTCA-1',
            'AAACGGGAGGATATAC-1',
            'AAACGGGTCTCGTATT-1',
        ]

        meta_table = pd.read_csv(meta, header=None)
        assert all_barcodes == meta_table.iloc[:, 0].values.tolist()
        assert expected_meta == meta_table.iloc[:, 1].values.tolist()

        good_table = pd.read_csv(good_barcodes, header=None)
        assert expected_good_barcodes == \
            good_table.iloc[:, 0].values.tolist()