def test_pass_alignment_qc():
    """Exactly 439 alignments in the example BAM should pass QC."""
    barcode_list = tenx.read_barcodes_file(
        utils.get_test_data('10x-example/barcodes.tsv'))
    alignments = tenx.read_bam_file(
        utils.get_test_data('10x-example/possorted_genome_bam.bam'))
    n_passing = 0
    for aln in alignments:
        if tenx.pass_alignment_qc(aln, barcode_list):
            n_passing += 1
    assert n_passing == 439
def test_bam_to_temp_fasta():
    """bam_to_temp_fasta yields one temporary fasta per unique barcode (8)."""
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)
    temp_fastas = tenx.bam_to_temp_fasta(
        barcodes=known_barcodes,
        barcode_renamer=None,
        delimiter="X",
        bam_file=bam_path)
    assert len(list(temp_fastas)) == 8
def test_get_cell_barcode_umis():
    """Every cell barcode extracted from the fastq must be a known barcode."""
    fastq_path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    known_barcodes = tenx.read_barcodes_file(
        utils.get_test_data('10x-example/barcodes.tsv'))
    umis_per_barcode = tenx.get_cell_barcode_umis(
        fastq_path,
        bam2fasta_args.CELL_BARCODE_PATTERN,
        bam2fasta_args.MOLECULAR_BARCODE_PATTERN)
    for cell_barcode in list(umis_per_barcode.keys()):
        assert cell_barcode in known_barcodes
def test_get_cell_barcode():
    """Any non-None barcode parsed from a fastq record is a known barcode."""
    fastq_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    known_barcodes = tenx.read_barcodes_file(
        utils.get_test_data('10x-example/barcodes.tsv'))
    with screed.open(fastq_path) as records:
        for record in records:
            barcode = tenx.get_cell_barcode(
                record, bam2fasta_args.CELL_BARCODE_PATTERN)
            if barcode is not None:
                assert barcode in known_barcodes
def test_parse_barcode_renamer():
    """parse_barcode_renamer: identity map without a renamer file, and a
    barcode -> renamed-name map when a renamer file is provided."""
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)

    # With no renamer file, every barcode maps to itself.
    identity_map = tenx.parse_barcode_renamer(known_barcodes, None)
    for original, renamed in identity_map.items():
        assert original == renamed
    assert len(identity_map) == len(known_barcodes)

    # With a renamer file, each renamed value embeds the barcode and the
    # cell-type label from the file.
    renamed_map = tenx.parse_barcode_renamer(
        known_barcodes,
        utils.get_test_data('10x-example/barcodes_renamer.tsv'))
    for original, renamed in renamed_map.items():
        assert original in renamed
        assert "epithelial_cell" in renamed
    assert len(renamed_map) == len(known_barcodes)
def test_get_fastas_per_unique_barcodes():
    """Grouping the temp fastas by barcode yields 8 unique barcodes."""
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)
    with utils.TempDirectory() as tmp_dir:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=known_barcodes,
            barcode_renamer=renamer_path,
            delimiter="X",
            bam_file=bam_path,
            temp_folder=tmp_dir)
        joined_fastas = ",".join(itertools.chain(temp_fastas))
        per_barcode = tenx.get_fastas_per_unique_barcodes(joined_fastas)
        assert len(per_barcode) == 8
def test_make_per_cell_fastq_gzs():
    """Per-cell fastq.gz files written by make_per_cell_fastqs are named
    after known barcodes."""
    fastq_path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)
    with utils.TempDirectory() as tmp_dir:
        outdir = os.path.join(tmp_dir, "outdir")
        os.makedirs(outdir)
        tenx.make_per_cell_fastqs(
            fastq_path, outdir, "possorted_aligned_", "fastq.gz",
            bam2fasta_args.CELL_BARCODE_PATTERN, barcodes_path)
        # NOTE(review): the prefix stripped below has a double underscore
        # while the prefix passed above has a single one -- presumably
        # make_per_cell_fastqs appends another "_"; confirm against its
        # implementation.
        for per_cell_file in glob.glob(os.path.join(outdir, "*.fastq.gz")):
            cell_name = os.path.basename(per_cell_file).replace(
                ".fastq.gz", "").replace("possorted_aligned__", "")
            assert cell_name in known_barcodes
def convert(args):
    """CLI tool to convert a 10x BAM file into per-cell fasta files.

    Shards the BAM, converts each shard to per-barcode temp fastas in
    parallel, then merges the shards of each barcode into one fasta under
    ``args.save_fastas`` (optionally filtering barcodes by UMI count and
    writing a per-barcode meta CSV). Returns the list of written fasta paths.
    """
    parser = create_parser()
    args = parser.parse_args(args)
    logger.info(args)
    # UMI filtering is enabled whenever a non-zero threshold was requested.
    umi_filter = True if args.min_umi_per_barcode != 0 else False
    all_fastas_sorted = []
    all_fastas = ""

    def collect_reduce_temp_fastas(index):
        """Merge the temp fastas of barcode number `index`, with or without
        UMI filtering depending on the CLI flags."""
        if umi_filter:
            return filtered_umi_to_fasta(index)
        else:
            return unfiltered_umi_to_fasta(index)

    def unfiltered_umi_to_fasta(index):
        """Concatenate all shard fastas of one barcode into a single fasta
        in args.save_fastas and return its path (no UMI filtering)."""
        # Comma-separated string of shard fasta paths for this barcode.
        single_barcode_fastas = all_fastas_sorted[index]
        count = 0
        # Iterating through fasta files for single barcode from different
        # fastas
        for fasta in iter_split(single_barcode_fastas, ","):
            # Open the output file once, on the first shard; all shards of
            # one barcode share the same basename, so the name is stable.
            if count == 0:
                unique_fasta_file = os.path.basename(fasta)
                barcode_name = unique_fasta_file.replace(".fasta", "")
                f = open(os.path.join(args.save_fastas, unique_fasta_file), "w")
            # Each record is one UMI; its sequence is a delimiter-joined
            # concatenation of reads, split back apart before writing.
            for record in screed.open(fasta):
                sequence = record.sequence
                umi = record.name
                split_seqs = sequence.split(args.delimiter)
                # NOTE(review): this `index` shadows the function parameter;
                # harmless here since the parameter is not used afterwards.
                for index, seq in enumerate(split_seqs):
                    if seq == "":
                        continue
                    f.write(">{}\n{}\n".format(
                        barcode_name + "_" + umi + "_" +
                        '{:03d}'.format(index), seq))
            # Delete fasta file in tmp folder
            if os.path.exists(fasta):
                os.unlink(fasta)
            count += 1
        # close the fasta file
        f.close()
        return os.path.join(args.save_fastas, unique_fasta_file)

    def filtered_umi_to_fasta(index):
        """Like unfiltered_umi_to_fasta, but first count the barcode's UMIs
        and return [] (writing nothing) when the count is below
        args.min_umi_per_barcode."""
        # Comma-separated string of shard fasta paths for this barcode.
        single_barcode_fastas = all_fastas_sorted[index]
        logger.debug("calculating umi counts")
        # Tracking UMI Counts
        umis = defaultdict(int)
        # Iterating through fasta files for single barcode from different
        # fastas
        for fasta in iter_split(single_barcode_fastas, ","):
            # calculate unique umi, sequence counts
            for record in screed.open(fasta):
                umis[record.name] += record.sequence.count(args.delimiter)
        if args.write_barcode_meta_csv:
            # Per-barcode "umi_count read_count" file, merged later by
            # write_to_barcode_meta_csv. Written into the current directory.
            unique_fasta_file = os.path.basename(fasta)
            unique_meta_file = unique_fasta_file.replace(".fasta", "_meta.txt")
            with open(unique_meta_file, "w") as f:
                f.write("{} {}".format(len(umis), sum(list(umis.values()))))
        logger.debug("Completed tracking umi counts")
        # Too few distinct UMIs: skip this barcode entirely.
        if len(umis) < args.min_umi_per_barcode:
            return []
        count = 0
        for fasta in iter_split(single_barcode_fastas, ","):
            # Open the output file once, on the first shard.
            if count == 0:
                unique_fasta_file = os.path.basename(fasta)
                barcode_name = unique_fasta_file.replace(".fasta", "")
                f = open(os.path.join(args.save_fastas, unique_fasta_file), "w")
            # Add sequences of barcodes with more than min-umi-per-barcode umis
            for record in screed.open(fasta):
                sequence = record.sequence
                umi = record.name
                # Appending sequence of a umi to the fasta
                split_seqs = sequence.split(args.delimiter)
                for index, seq in enumerate(split_seqs):
                    if seq == "":
                        continue
                    f.write(">{}\n{}\n".format(
                        barcode_name + "_" + umi + "_" +
                        '{:03d}'.format(index), seq))
            # Delete fasta file in tmp folder
            if os.path.exists(fasta):
                os.unlink(fasta)
            count += 1
        # close the opened fasta file
        f.close()
        return os.path.join(args.save_fastas, unique_fasta_file)

    def write_to_barcode_meta_csv():
        """Merge all the per-barcode *_meta.txt files into one CSV with
        columns CELL_BARCODE, UMI_COUNT, READ_COUNT."""
        barcodes_meta_txts = glob.glob("*_meta.txt")
        with open(args.write_barcode_meta_csv, "w") as fp:
            fp.write("{},{},{}".format(CELL_BARCODE, UMI_COUNT, READ_COUNT))
            fp.write('\n')
            for barcode_meta_txt in barcodes_meta_txts:
                with open(barcode_meta_txt, 'r') as f:
                    umi_count, read_count = f.readline().split()
                    umi_count = int(umi_count)
                    read_count = int(read_count)
                # Barcode name is the meta filename minus its suffix.
                barcode_name = barcode_meta_txt.replace('_meta.txt', '')
                fp.write("{},{},{}\n".format(barcode_name, umi_count,
                                             read_count))

    def get_unique_barcodes(all_fastas):
        """Group shard fasta paths by barcode; return one comma-joined
        string of paths per unique barcode."""
        fasta_files_dict = OrderedDict()
        for fasta in iter_split(all_fastas, ","):
            barcode = os.path.basename(fasta).replace(".fasta", "")
            value = fasta_files_dict.get(barcode, "")
            fasta_files_dict[barcode] = value + fasta + ","
        # Find unique barcodes
        all_fastas_sorted = list(fasta_files_dict.values())
        del fasta_files_dict
        return all_fastas_sorted

    # Initializing time
    startt = time.time()
    # Setting barcodes file, some 10x files don't have a filtered
    # barcode file
    if args.barcodes_file is not None:
        barcodes = tenx_utils.read_barcodes_file(args.barcodes_file)
    else:
        barcodes = None

    # Shard bam file to smaller bam file; the shard filename list is kept
    # in a numpy memmap to lower memory usage across worker processes.
    logger.info('... reading bam file from %s', args.filename)
    n_jobs = args.processes
    filenames, mmap_file = np_utils.to_memmap(
        np.array(
            tenx_utils.shard_bam_file(args.filename, args.line_count,
                                      os.getcwd())))

    # Create a per-cell fasta generator of sequences
    # If the reads should be filtered by barcodes and umis
    # umis are saved in fasta file as record name and name of
    # the fasta file is the barcode
    func = partial(tenx_utils.bam_to_temp_fasta, barcodes,
                   args.rename_10x_barcodes, args.delimiter)
    length_sharded_bam_files = len(filenames)
    chunksize = calculate_chunksize(length_sharded_bam_files, n_jobs)
    pool = multiprocessing.Pool(processes=n_jobs)
    logger.info(
        "multiprocessing pool processes {} and chunksize {} calculated".format(
            n_jobs, chunksize))
    # All the fastas are stored in a string instead of a list
    # This saves memory per element of the list by 8 bits
    # If we have unique barcodes in the order of 10^6 before
    # filtering that would result in a huge list if each barcode
    # is saved as a separate element, hence the string
    # NOTE(review): passing a lambda to imap requires a pickler that can
    # serialize lambdas (e.g. pathos/dill); confirm which multiprocessing
    # module this file imports.
    all_fastas = ",".join(
        itertools.chain(*(pool.imap(
            lambda x: func(x.encode('utf-8')), filenames,
            chunksize=chunksize))))

    # clean up the memmap and sharded intermediary bam files
    [os.unlink(file) for file in filenames if os.path.exists(file)]
    del filenames
    os.unlink(mmap_file)
    logger.info("Deleted intermediary bam and memmap files")
    all_fastas_sorted = get_unique_barcodes(all_fastas)
    unique_barcodes = len(all_fastas_sorted)
    logger.info("Found %d unique barcodes", unique_barcodes)
    # Cleaning up to retrieve memory from unused large variables
    del all_fastas

    chunksize = calculate_chunksize(unique_barcodes, n_jobs)
    logger.info("Pooled %d and chunksize %d mapped", n_jobs, chunksize)
    # Merge each barcode's shard fastas into a single output fasta.
    fastas = list(
        pool.imap(lambda index: collect_reduce_temp_fastas(index),
                  range(unique_barcodes), chunksize=chunksize))

    if args.write_barcode_meta_csv:
        write_to_barcode_meta_csv()
    logger.info("time taken to convert fastas for 10x folder is %.5f seconds",
                time.time() - startt)
    # Drop barcodes that were filtered out (filtered_umi_to_fasta returns []).
    fastas = [fasta for fasta in fastas if fasta != []]
    pool.close()
    pool.join()
    return fastas
def percell(args):
    """CLI tool to convert a BAM or fastq.gz file into per-cell fasta/fastq
    files.

    For ``.bam`` input: shard the BAM, build per-barcode temp fastas in
    parallel, then merge each barcode's shards into one fasta under
    ``args.save_fastas``. For ``.fastq.gz`` input: count UMIs per cell, then
    write per-cell fastqs. Returns the list of produced file paths.
    """
    parser = create_parser()
    args = parser.parse_args(args)
    logger.info(args)
    # Output directory for the final per-cell files.
    save_fastas = os.path.abspath(args.save_fastas)
    if not os.path.exists(save_fastas):
        os.makedirs(save_fastas)
    else:
        logger.info("Path {} already exists, might be overwriting data".format(
            save_fastas))
    # Initializing time
    startt = time.time()
    # Directory for sharded bams / temp fastas / meta files.
    save_intermediate_files = os.path.abspath(args.save_intermediate_files)
    if not os.path.exists(save_intermediate_files):
        os.makedirs(save_intermediate_files)
    else:
        logger.info("Path {} already exists, might be overwriting data".format(
            save_intermediate_files))
    # Setting barcodes file, some 10x files don't have a filtered
    # barcode file
    if args.barcodes_file is not None:
        barcodes = tenx_utils.read_barcodes_file(args.barcodes_file)
    else:
        barcodes = None

    # Shard bam file to smaller bam file
    logger.info('... reading bam file from %s', args.filename)
    n_jobs = args.processes
    # Dispatch on the input file's extension: "bam" or "gz" (fastq.gz).
    input_format = os.path.basename(args.filename).split(".")[-1]
    if input_format == "bam":
        filenames = tenx_utils.shard_bam_file(args.filename, args.shard_size,
                                              save_intermediate_files)
        # Create a per-cell fasta generator of sequences
        # If the reads should be filtered by barcodes and umis
        # umis are saved in fasta file as record name and name of
        # the fasta file is the barcode
        func = partial(tenx_utils.bam_to_temp_fasta, barcodes,
                       args.rename_10x_barcodes, args.delimiter,
                       save_intermediate_files)
        length_sharded_bam_files = len(filenames)
        chunksize = tenx_utils.calculate_chunksize(length_sharded_bam_files,
                                                   n_jobs)
        pool = multiprocessing.Pool(processes=n_jobs)
        logger.info("multiprocessing pool processes {} & chunksize {}".format(
            n_jobs, chunksize))
        # All the fastas are stored in a string instead of a list
        # This saves memory per element of the list by 8 bits
        # If we have unique barcodes in the order of 10^6 before
        # filtering that would result in a huge list if each barcode
        # is saved as a separate element, hence the string
        # NOTE(review): lambdas passed to imap_unordered need a pickler that
        # supports them (e.g. pathos/dill); confirm the imported module.
        all_fastas = ",".join(
            itertools.chain(
                *(pool.imap_unordered(lambda x: func(x.encode('utf-8')),
                                      filenames, chunksize=chunksize))))
        # clean up the memmap and sharded intermediary bam files
        [os.unlink(file) for file in filenames if os.path.exists(file)]
        del filenames
        logger.info("Deleted intermediary bam")
        all_fastas_sorted = tenx_utils.get_fastas_per_unique_barcodes(
            all_fastas)
        unique_barcodes = len(all_fastas_sorted)
        logger.info("Found %d unique barcodes", unique_barcodes)
        # Cleaning up to retrieve memory from unused large variables
        del all_fastas
        # Gather all barcodes per umis to one fasta
        func = partial(tenx_utils.barcode_umi_seq_to_fasta, save_fastas,
                       args.delimiter, args.write_barcode_meta_csv,
                       args.min_umi_per_barcode, save_intermediate_files)
        chunksize = tenx_utils.calculate_chunksize(unique_barcodes, n_jobs)
        logger.info("Pooled %d and chunksize %d mapped for %d lists", n_jobs,
                    chunksize, len(all_fastas_sorted))
        # Run for side effects (writes fastas); the returned list is unused.
        list(
            pool.imap_unordered(lambda fasta: func([fasta]),
                                all_fastas_sorted, chunksize=chunksize))
        pool.close()
        pool.join()
        # Write barcode meta csv
        if args.write_barcode_meta_csv:
            tenx_utils.write_to_barcode_meta_csv(save_intermediate_files,
                                                 args.write_barcode_meta_csv)
        # Gather all the fastas
        fastas = glob.glob(os.path.join(save_fastas, "*_bam2fasta.fasta"))
    else:
        # if the fastq.gz file is already given
        assert input_format == "gz", \
            ("please convert" + "bam files to fastq.gz using samtools")
        # Check if the good barcodes with significant umis is already given
        # NOTE(review): this value is logged but then unconditionally
        # overwritten below -- confirm whether a user-supplied file was
        # meant to be honored.
        barcodes_significant_umis_file = args.barcodes_significant_umis_file
        logger.info("barcodes_significant_umis_file {}".format(
            barcodes_significant_umis_file))
        # Find the good_barcodes file from the aligned sequences and use it
        # for unaligned.fastq.gz
        basename_wo_format = os.path.basename(args.filename).replace(
            ".fastq.gz", "__")
        args.barcodes_significant_umis_file = os.path.join(
            save_fastas, "barcodes_with_significant_umis.tsv")
        count_umis_percell(args)
        args.channel_id = basename_wo_format
        fastas = make_fastqs_percell(args)
    logger.info("time taken to write fastas is %.5f seconds",
                time.time() - startt)
    return fastas
def test_read_barcodes_file():
    """The example barcodes file parses into exactly ten barcodes."""
    parsed = tenx.read_barcodes_file(
        utils.get_test_data('10x-example/barcodes.tsv'))
    assert len(parsed) == 10