def fb2fa_concatenated(x, fasta_file, num_n=0):
    """Generates feature barcode fasta file.

    Parameters
    ----------
    x : str
        The path and name of feature barcode file.
    fasta_file : str
        The path and name of generated fasta file.
    num_n : int, optional
        Number of Ns to use for separating sequences belonging to the
        same feature.

    Returns
    -------
    str
        The path and name of generated fasta file.
    """

    fb = dict()
    with open_by_suffix(file_name=x, mode='r') as f:
        for line in f:
            i = line.rstrip().split('\t')
            if i[0] not in fb:
                fb[i[0]] = []
            fb[i[0]].append(i[1])

    with open_by_suffix(file_name=fasta_file, mode='w') as fo:
        for i in fb:
            fo.write('>' + i + '\n')
            fo.write(('N' * num_n).join(fb[i]) + '\n')

    return fasta_file
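
# A minimal usage sketch for fb2fa_concatenated (hypothetical file names).
# Each line of the input TSV holds a feature name and one barcode sequence;
# barcodes sharing a feature name are concatenated into a single fasta
# record, separated by `num_n` Ns:
#
#     fasta = fb2fa_concatenated(x='feature_ref.tsv',
#                                fasta_file='feature_ref.fasta',
#                                num_n=20)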
def correct_cell_barcodes(cb_file, output_directory, bus_file,
                          corrected_bus_file):
    """Corrects cell barcodes."""

    CELL_BARCODE_FILE = output_directory / 'barcodes_no_suffix.tsv'
    with open_by_suffix(file_name=str(cb_file), mode='r') as f:
        with open_by_suffix(file_name=str(CELL_BARCODE_FILE),
                            mode='w') as fo:
            for line in f:
                i = line.rstrip().split('-')[0]
                fo.write(i + '\n')

    logger.info('Number of whitelisted cell barcodes: '
                + f'{sum(1 for _ in open(CELL_BARCODE_FILE))}')

    cmd = [
        get_binary_path(binary_name='bustools'),
        'correct',
        '-w', str(CELL_BARCODE_FILE),
        '-o', str(corrected_bus_file),
        str(bus_file)
    ]

    outs, errs = run_executable(cmd)
    logger.info(errs)

    return corrected_bus_file
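
# A minimal usage sketch for correct_cell_barcodes, assuming bustools is
# resolvable via get_binary_path and `out_dir` is a pathlib.Path
# (hypothetical file names):
#
#     from pathlib import Path
#     out_dir = Path('kallisto')
#     corrected = correct_cell_barcodes(
#         cb_file='barcodes.tsv.gz',
#         output_directory=out_dir,
#         bus_file=out_dir / 'output.bus',
#         corrected_bus_file=out_dir / 'output_corrected.bus')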
def generate_modified_fastq(read1_file, read2_file, cb_file, read1_coords,
                            modified_read_file, num_mismatches=1,
                            num_n_threshold=3):
    """Matches cell barcodes and generates modified fastq file."""

    cell_barcodes = [
        i.rstrip().split('-')[0]
        for i in open_by_suffix(cb_file, mode='r')
    ]
    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    read_counter = [int(), int()]
    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as f, \
            dnaio.open(file1=modified_read_file,
                       fileformat='fastq',
                       mode='w') as f_out:

        for rec in f:
            read_counter[1] += 1
            read1, read2 = rec

            reads = (read1.name,
                     read1.sequence, read1.qualities,
                     read2.sequence, read2.qualities)
            out = match_cell_barcodes(reads=reads,
                                      barcode_index=cb_index,
                                      read_coords=read1_coords,
                                      num_mismatches=num_mismatches,
                                      num_n_threshold=num_n_threshold)

            if out:
                read_counter[0] += 1

                read_name, read1_seq, _, read2_seq, read2_qual, bc, dist = out
                read_info = '#'.join([read1_seq, bc, str(dist)])
                read_name = ' '.join(
                    [read_name.split(' ')[0], 'RI:Z:' + read_info])

                s2 = dnaio.Sequence(read_name, read2_seq, read2_qual)
                f_out.write(s2)

    return modified_read_file, read_counter
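
# The matched read 1 sequence, cell barcode, and matching distance are
# carried in the read 2 name as a SAM-style "RI:Z:" comment, so a record
# in the modified fastq might be named (hypothetical values):
#
#     @A00001:1:ABCDE:1:1101:1000:1000 RI:Z:TGATCTTAGAACACGTCAGG#TGATCTTAGAACACGT#0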
def fb2fa_kallisto(x, fasta_file, t2g_file):
    """Prepares fasta file, t2g file and returns k-mer length.

    Parameters
    ----------
    x : str
        The path and name of feature barcode file.
        The example content of the file:

        CD3    CTCATTGTAACTCCT
        CD4    TGTTCCCGCTCAACT
        CD8a   GCTGCGCTTTCCATT
        CD11b  GACAAGTGATCTGCA
        CD14   TCTCAGACCTCCGTA
        CD15   TCACCAGTACCTAGT
        CD16   AAGTTCACTCTTTGC
        CD19   CTGGGCAATTACTCG
        CD20   TTCTGGGTCCCTAGA
        CD25   TTTGTCCTGTACGCC

    fasta_file : str
        The path and name of generated fasta file. One mismatch at each
        coordinate. The example content of the file:

        >CD3_CTCATTGTAACTCCT_0_A
        ATCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_0_C
        CTCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_0_G
        GTCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_0_T
        TTCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_1_A
        CACATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_1_C
        CCCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_1_G
        CGCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_1_T
        CTCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_2_A
        CTAATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_2_C
        CTCATTGTAACTCCT

    t2g_file : str
        The path and name of generated t2g file.
        The example content of the file:

        CD3_CTCATTGTAACTCCT_0_A  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_0_C  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_0_G  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_0_T  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_1_A  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_1_C  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_1_G  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_1_T  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_2_A  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_2_C  CD3_CTCATTGTAACTCCT  CD3_CTCATTGTAACTCCT

    Returns
    -------
    int
        Largest odd number not exceeding the minimal length of the
        feature barcodes.
    """

    sequence_lengths = list()
    sequence_names = list()
    with open_by_suffix(file_name=x, mode='r') as f:
        with open_by_suffix(file_name=fasta_file, mode='w') as fo:
            for line in f:
                i = line.rstrip().split('\t')
                sequence = i[1]
                sequence_lengths.append(len(sequence))

                for ii in range(len(sequence)):
                    for iii in 'ACGT':
                        sequence_name = '_'.join(
                            [i[0], sequence, str(ii), iii])
                        sequence_mutated = (sequence[:ii]
                                            + iii
                                            + sequence[ii + 1:])

                        sequence_names.append(
                            '\t'.join(
                                [sequence_name]
                                + [i[0] + '_' + sequence] * 2
                            )
                        )
                        fo.write('>' + sequence_name + '\n'
                                 + sequence_mutated + '\n')

    with open_by_suffix(file_name=t2g_file, mode='w') as foo:
        foo.write('\n'.join(sequence_names))

    num_sequences = set(i.split('\t')[1] for i in sequence_names)
    logger.info(f'Number of feature barcodes: {len(num_sequences)}')

    kmer = min(sequence_lengths)
    if kmer % 2 == 0:
        kmer -= 1
    logger.info(f'k-mer length: {kmer}')

    return kmer
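
# Each barcode of length L expands to 4 * L fasta records; when the
# substituted base equals the reference base, the record reproduces the
# original sequence (see CD3_..._1_T above). A minimal usage sketch
# (hypothetical file names):
#
#     kmer = fb2fa_kallisto(x='feature_ref.tsv',
#                           fasta_file='feature_ref_mismatched.fasta',
#                           t2g_file='feature_ref_t2g.tsv')
#     # with 15 nt barcodes, kmer == 15; with 16 nt barcodes, kmer == 15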
def filter_matching(matching_file,
                    filtered_matching_file,
                    cb_pos_start=0,
                    cb_num_mismatches=1,
                    cb_left_shift=1,
                    cb_right_shift=1,
                    cb_extra_seq=None,
                    cb_extra_seq_num_mismatches=None,
                    fb_pos_start=10,
                    fb_num_mismatches=1,
                    fb_left_shift=1,
                    fb_right_shift=1,
                    fb_extra_seq=None,
                    fb_extra_seq_num_mismatches=None):
    """Filters raw cell and feature barcode matching result."""

    with open_by_suffix(file_name=matching_file) as f:
        header_line = next(f).rstrip().split('\t')
        logger.info('Header line: {}'.format(' '.join(header_line)))

        logger.info(
            f'Cell barcode maximum number of mismatches: {cb_num_mismatches}')
        logger.info('Feature barcode maximum number of mismatches: '
                    f'{fb_num_mismatches}')

        # the extra-sequence regexes do not change between lines;
        # compile them once before the loop
        if cb_extra_seq and cb_extra_seq_num_mismatches:
            cell_barcode_sequence_regex = \
                compile_regex_ref_barcodes_single(
                    cb_extra_seq,
                    num_mismatches=cb_extra_seq_num_mismatches
                )
        else:
            cell_barcode_sequence_regex = None

        if fb_extra_seq and fb_extra_seq_num_mismatches:
            feature_barcode_sequence_regex = \
                compile_regex_ref_barcodes_single(
                    fb_extra_seq,
                    num_mismatches=fb_extra_seq_num_mismatches
                )
        else:
            feature_barcode_sequence_regex = None

        with open_by_suffix(file_name=filtered_matching_file,
                            mode='w') as fo:
            fo.write('\t'.join(header_line) + '\n')

            read_counter = [int(), int()]
            if len(header_line) == 6:
                logger.info(
                    'Skipping arguments: '
                    '"cb_pos_start", "cb_left_shift", "cb_right_shift"')
                logger.info(
                    'Skipping arguments: '
                    '"fb_pos_start", "fb_left_shift", "fb_right_shift"')

            for line in f:
                read_counter[1] += 1
                i = line.rstrip().split('\t')

                if len(header_line) in (8, 12):
                    cell_barcode_matching = i[:4]
                    cell_barcode_passed = is_matched(
                        x=cell_barcode_matching,
                        barcode_pos_start=cb_pos_start,
                        mismatching_threshold=cb_num_mismatches,
                        left_shift=cb_left_shift,
                        right_shift=cb_right_shift,
                        sequence_regex=cell_barcode_sequence_regex)
                elif len(header_line) == 6:
                    cell_barcode_passed = int(i[2]) <= cb_num_mismatches

                if cell_barcode_passed:
                    if len(header_line) in (8, 12):
                        feature_barcode_matching = i[4:8]
                        feature_barcode_passed = is_matched(
                            x=feature_barcode_matching,
                            barcode_pos_start=fb_pos_start,
                            mismatching_threshold=fb_num_mismatches,
                            left_shift=fb_left_shift,
                            right_shift=fb_right_shift,
                            sequence_regex=feature_barcode_sequence_regex)
                    elif len(header_line) == 6:
                        feature_barcode_passed = \
                            int(i[5]) <= fb_num_mismatches

                    if feature_barcode_passed:
                        fo.write(line)
                        read_counter[0] += 1

    logger.info(f'Number of lines processed: {read_counter[1]:,}')
    logger.info(f'Number of lines passed filters: {read_counter[0]:,}')

    return filtered_matching_file
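
# A minimal usage sketch for filter_matching (hypothetical file names).
# The position/shift/extra-seq arguments are only consulted for the 8- or
# 12-column matching format; 6-column input is filtered on mismatch counts
# alone:
#
#     filtered = filter_matching(matching_file='matching.tsv.gz',
#                                filtered_matching_file='filtered.tsv.gz',
#                                cb_num_mismatches=1,
#                                fb_num_mismatches=1)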
def main():
    args = parse_args()

    logger = get_logger(logger_name=__name__)

    banner = """
 █████▒▄▄▄▄    ▄▄▄
▓██   ▒▓█████▄ ▒████▄
▒████ ░▒██▒ ▄██▒██  ▀█▄
░▓█▒  ░▒██░█▀  ░██▄▄▄▄██
░▒█░   ░▓█  ▀█▓ ▓█   ▓██▒
 ▒ ░   ░▒▓███▀▒ ▒▒   ▓▒█░
 ░     ▒░▒   ░   ▒   ▒▒ ░
 ░ ░    ░    ░   ░   ▒
        ░    ░       ░  ░
    """
    logger.info(banner)
    # print(banner)

    logger.info(f'fba version: {__version__}')
    logger.info('Initiating logging ...')
    logger.info(
        f'Python version: {sys.version_info.major}.{sys.version_info.minor}')

    if sys.version_info < (3, 6):
        logger.critical('Please use Python >= 3.6')
        sys.exit(1)

    if args.command == 'extract':
        logger.info('Using extract subcommand ...')
        m = importlib.import_module(name='fba.levenshtein')

        with open_by_suffix(file_name=args.output, mode='w') as f:
            f.write('\t'.join(
                [
                    'read1_seq',
                    'cell_barcode',
                    'cb_num_mismatches',
                    'read2_seq',
                    'feature_barcode',
                    'fb_num_mismatches'
                ]
            ) + '\n')

            for out in m.extract_feature_barcoding_fastss(
                    read1_file=args.read1,
                    read2_file=args.read2,
                    cb_file=args.whitelist,
                    fb_file=args.feature_ref,
                    cb_num_mismatches=args.cell_barcode_mismatches,
                    fb_num_mismatches=args.feature_barcode_mismatches,
                    read1_coords=args.read1_coords,
                    read2_coords=args.read2_coords,
                    cb_num_n_threshold=args.cb_num_n_threshold,
                    fb_num_n_threshold=args.fb_num_n_threshold):
                f.write(out + '\n')

        logger.info('Done.')

    elif args.command == 'map':
        logger.info('Using map subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        matrix_featurecount = m.map_feature_barcoding(
            read1_file=args.read1,
            read2_file=args.read2,
            cb_file=args.whitelist,
            fb_file=args.feature_ref,
            read1_coords=args.read1_coords,
            num_mismatches=args.cell_barcode_mismatches,
            num_n_threshold=args.cb_num_n_threshold,
            num_n_ref=args.num_n_ref,
            umi_pos_start=args.umi_pos_start,
            umi_length=args.umi_length,
            umi_deduplication_method=args.umi_deduplication_method,
            umi_deduplication_threshold=args.umi_mismatches,
            mapq=args.mapq,
            output_directory=args.output_directory,
            num_threads=args.threads,
            aligner=args.aligner
        )

        matrix_featurecount.to_csv(path_or_buf=args.output,
                                   compression='infer')
        logger.info('Done.')

    elif args.command == 'filter':
        logger.info('Using filter subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        _ = m.filter_matching(
            matching_file=args.input,
            filtered_matching_file=args.output,
            cb_pos_start=args.cell_barcode_pos_start,
            cb_num_mismatches=args.cell_barcode_mismatches,
            cb_left_shift=args.cell_barcode_left_shift,
            cb_right_shift=args.cell_barcode_right_shift,
            cb_extra_seq=args.cell_barcode_extra_seq,
            cb_extra_seq_num_mismatches=args.cell_barcode_extra_seq_mismatches,  # noqa
            fb_pos_start=args.feature_barcode_pos_start,
            fb_num_mismatches=args.feature_barcode_mismatches,
            fb_left_shift=args.feature_barcode_left_shift,
            fb_right_shift=args.feature_barcode_right_shift,
            fb_extra_seq=args.feature_barcode_extra_seq,
            fb_extra_seq_num_mismatches=args.feature_barcode_extra_seq_mismatches)  # noqa

        logger.info(f'Filtered feature barcoding result: {_}')
        logger.info('Done.')

    elif args.command == 'count':
        logger.info('Using count subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        matrix_featurecount = m.generate_matrix(
            matching_file=args.input,
            umi_pos_start=args.umi_pos_start,
            umi_length=args.umi_length,
            umi_deduplication_method=args.umi_deduplication_method,
            umi_deduplication_threshold=args.umi_mismatches
        )

        matrix_featurecount.to_csv(
            path_or_buf=args.output,
            compression='infer'
        )
        logger.info('Done.')

    elif args.command == 'demultiplex':
        logger.info('Using demultiplex subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        # keyword spellings below follow the demultiplex_feature_barcoding
        # signature
        _ = m.demultiplex_feature_barcoding(
            matrix_featurecount_file=args.input,
            output_directory=args.output_directory,
            q=args.quantile,
            initial_clustering_methold=args.clustering_method,
            visualization=args.visualization,
            embeding_method=args.visualization_method,
            seed=42
        )
        logger.info('Done.')

    elif args.command == 'qc':
        logger.info('Using qc subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        import pandas as pd
        from pathlib import Path

        if not isinstance(args.num_reads, int):
            if args.num_reads.isdigit():
                num_reads = int(args.num_reads)
            elif args.num_reads.upper() == 'NONE':
                num_reads = None
            else:
                sys.exit(1)
        else:
            num_reads = args.num_reads

        if args.read1:
            _ = m.summarize_sequence_content(
                read1_file=args.read1,
                read2_file=args.read2,
                num_reads=num_reads,
                output_directory=args.output_directory
            )

            OUTPUT_FILE = 'feature_barcoding_output.tsv.gz'
            OUTPUT_FILE = str(Path(args.output_directory) / OUTPUT_FILE)

            with open_by_suffix(file_name=OUTPUT_FILE, mode='w') as f:
                f.write('\t'.join(
                    [
                        'read1_seq',
                        'cell_barcode',
                        'cb_matching_pos',
                        'cb_matching_description',
                        'read2_seq',
                        'feature_barcode',
                        'fb_matching_pos',
                        'fb_matching_description'
                    ]
                ) + '\n')

                n = importlib.import_module(name='fba.regex')
                for out in n.extract_feature_barcoding_regex(
                        read1_file=args.read1,
                        read2_file=args.read2,
                        cb_file=args.whitelist,
                        fb_file=args.feature_ref,
                        cb_num_mismatches=args.cell_barcode_mismatches,
                        fb_num_mismatches=args.feature_barcode_mismatches,
                        cb_num_n_threshold=args.cb_num_n_threshold,
                        fb_num_n_threshold=args.fb_num_n_threshold,
                        read1_coords=args.read1_coords,
                        read2_coords=args.read2_coords,
                        num_threads=args.threads,
                        chunk_size=args.chunk_size,
                        num_reads=num_reads):
                    f.write(out + '\n')

            _ = m.summarize_barcode_positions(
                matching_file=OUTPUT_FILE,
                output_directory=args.output_directory)

        else:
            logger.info('Bulk mode enabled: '
                        'only feature barcodes on reads 2 are analyzed')
            if not args.read2_coords:
                logger.critical('Please specify "-r2_coords" in bulk mode')
                sys.exit(1)
            logger.info(
                'Skipping arguments: "-1", "-w", "-cb_m", "-r1_coords"'
            )

            fb_frequency = m.analyze_bulk(
                read_file=args.read2,
                read_coords=args.read2_coords,
                fb_file=args.feature_ref,
                num_mismatches=args.feature_barcode_mismatches,
                num_n_threshold=args.fb_num_n_threshold,
                num_reads=num_reads
            )

            Path(args.output_directory).mkdir(exist_ok=True)
            OUTPUT_FILE = 'feature_barcode_frequency.csv'
            OUTPUT_FILE = str(Path(args.output_directory) / OUTPUT_FILE)
            logger.info(f'Output file: {OUTPUT_FILE}')

            fb_frequency = pd.DataFrame.from_dict(
                data=fb_frequency,
                orient='index',
                columns=['num_reads']).sort_values(
                    by='num_reads',
                    ascending=False
            )
            fb_frequency['percentage'] = fb_frequency['num_reads'] / sum(
                fb_frequency['num_reads'])

            fb_frequency.to_csv(path_or_buf=OUTPUT_FILE)

        logger.info('Done.')

    elif args.command == 'kallisto_wrapper':
        logger.info('Using kallisto_wrapper subcommand ...')
        m = importlib.import_module(name='fba.kallisto')

        matrix_featurecount = m.run_kallisto(
            read1_file=args.read1,
            read2_file=args.read2,
            cb_file=args.whitelist,
            fb_file=args.feature_ref,
            technology=args.technology,  # '10xv3',
            output_directory=args.output_directory,  # 'kallisto',
            num_threads=args.threads)

        matrix_featurecount.to_csv(
            path_or_buf=args.output,
            compression='infer'
        )
        logger.info('Done.')
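
# A hypothetical command line corresponding to the "extract" branch above.
# Flag spellings other than those logged in main() ("-1", "-w", "-cb_m",
# "-r1_coords", "-r2_coords", "-us") are assumptions:
#
#     fba extract -1 R1.fq.gz -2 R2.fq.gz -w barcodes.tsv \
#         -f feature_ref.tsv -o matching.tsv.gz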
def generate_matrix(matching_file,
                    umi_pos_start=16,
                    umi_length=12,
                    umi_deduplication_method='directional',
                    umi_deduplication_threshold=1):
    """Generates a matrix based on matching results.

    Parameters
    ----------
    matching_file : str
        The path and name of matching result file.
    umi_pos_start : int, optional
        The starting coordinate of UMI on read 1. If the input matching
        result is from the regex method of extract subcommand, the
        starting coordinate will be auto determined.
    umi_length : int, optional
        The length of UMI on read 1 after cell barcode. The default is 12.
    umi_deduplication_method : str, optional
        The UMI deduplication method used in UMI-tools
        (Smith, T., et al. (2017). Genome Res. 27, 491–499.).
        See https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof
    umi_deduplication_threshold : int, optional
        The mismatch tolerance for UMI deduplication.

    Returns
    -------
    DataFrame
        A pandas DataFrame of feature count. The columns are cells and
        the rows are features.
    """  # noqa

    logger.info(f'UMI-tools version: {umi_tools_version}')

    matrix_featurecount = defaultdict(dict)
    line_counter = int()

    with open_by_suffix(file_name=matching_file) as f:
        header_line = next(f)

        if len(header_line.split('\t')) == 6:
            if umi_pos_start:
                logger.info(
                    f'UMI starting position on read 1: {umi_pos_start}')
            else:
                logger.critical(
                    'Need to specify UMI starting position on read 1: -us')
                raise ValueError('need to specify UMI starting position')
        else:
            logger.info('UMI start position on read 1 auto-detected, '
                        'overriding -us')

        logger.info(f'UMI length: {umi_length}')
        logger.info('UMI-tools deduplication threshold: '
                    f'{umi_deduplication_threshold}')
        logger.info('UMI-tools deduplication method: '
                    f'{umi_deduplication_method}')

        logger.info('Header line: {}'.format(
            header_line.rstrip().replace('\t', ' ')))

        for line in f:
            i = line.rstrip().split('\t')
            line_counter += 1

            read_seq = i[0]
            cell_barcode = i[1]

            if len(header_line.split('\t')) == 6:
                feature_barcode = i[4]
            else:
                feature_barcode = i[5]
                umi_pos_start = [int(ii) for ii in i[2].split(':')][1]
            umi_pos_end = umi_pos_start + umi_length

            if len(read_seq) >= umi_pos_end:
                umi_seq = read_seq[
                    umi_pos_start:umi_pos_end].upper().encode()

                if feature_barcode not in matrix_featurecount[cell_barcode]:
                    matrix_featurecount[cell_barcode][feature_barcode] = \
                        list()
                matrix_featurecount[cell_barcode][
                    feature_barcode].append(umi_seq)

    logger.info(f'Number of lines processed: {line_counter:,}')

    cell_barcodes = sorted(matrix_featurecount.keys())
    feature_barcodes = sorted(
        set(ii
            for i in matrix_featurecount
            for ii in matrix_featurecount[i])
    )
    logger.info(f'Number of cell barcodes detected: {len(cell_barcodes):,}')
    logger.info(f'Number of features detected: {len(feature_barcodes):,}')

    clusterer = UMIClusterer(cluster_method=umi_deduplication_method)
    for i in matrix_featurecount:
        for ii in feature_barcodes:
            umis = matrix_featurecount[i].setdefault(ii, 0)
            if umis:
                matrix_featurecount[i][ii] = len(
                    clusterer(Counter(umis),
                              threshold=umi_deduplication_threshold)
                )

    matrix_featurecount = {i: [matrix_featurecount[i][ii]
                               for ii in feature_barcodes]
                           for i in cell_barcodes}
    matrix_featurecount = pd.DataFrame.from_dict(matrix_featurecount,
                                                 orient='columns')
    matrix_featurecount.index = feature_barcodes

    logger.info('Total UMIs after deduplication: '
                f'{matrix_featurecount.values.sum():,}')
    logger.info('Median number of UMIs per cell: '
                f'{np.median(matrix_featurecount.sum(axis=0)):,}')

    return matrix_featurecount
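
# A minimal sketch of the deduplication step in isolation, assuming
# umi_tools is installed (UMI sequences must be bytes, as above):
#
#     from collections import Counter
#     from umi_tools import UMIClusterer
#
#     clusterer = UMIClusterer(cluster_method='directional')
#     clusters = clusterer(Counter([b'ATCG', b'ATCG', b'ATCC']), threshold=1)
#     # with the directional method, the less abundant ATCC should be
#     # merged into ATCG, so len(clusters) == 1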
def extract_feature_barcoding_fastss(read1_file,
                                     read2_file,
                                     cb_file,
                                     fb_file,
                                     read1_coords,
                                     read2_coords,
                                     cb_num_mismatches,
                                     fb_num_mismatches,
                                     cb_num_n_threshold=3,
                                     fb_num_n_threshold=3):
    """Extracts feature barcodes."""

    with open_by_suffix(file_name=cb_file) as f:
        cell_barcodes = [i.split('-')[0].rstrip() for i in f]

    with open_by_suffix(file_name=fb_file) as f:
        feature_barcodes = {
            i.rstrip().split('\t')[-1]: i.rstrip().replace('\t', '_')
            for i in f
        }

    logger.info(f'Number of reference cell barcodes: {len(cell_barcodes):,}')
    logger.info(
        f'Number of reference feature barcodes: {len(feature_barcodes):,}')

    logger.info('Read 1 coordinates to search: ['
                + ', '.join([str(i) for i in read1_coords]) + ')')
    logger.info('Read 2 coordinates to search: ['
                + ', '.join([str(i) for i in read2_coords]) + ')')

    logger.info(
        f'Cell barcode maximum number of mismatches: {cb_num_mismatches}')
    logger.info(
        f'Feature barcode maximum number of mismatches: {fb_num_mismatches}')
    logger.info(f'Read 1 maximum number of N allowed: {cb_num_n_threshold}')
    logger.info(f'Read 2 maximum number of N allowed: {fb_num_n_threshold}')

    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=cb_num_mismatches)
    fb_index = create_index(barcodes=feature_barcodes.keys(),
                            num_mismatches=fb_num_mismatches)

    logger.info('Matching ...')

    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as f:

        read_counter = [int(), int()]
        for rec in f:
            read1, read2 = rec
            read_counter[1] += 1

            if read_counter[1] % 10_000_000 == 0:
                logger.info(f'Read pairs processed: {read_counter[1]:,}')

            out = match_barcodes_paired_fastss(
                read_seqs=(read1.sequence, read1.qualities,
                           read2.sequence, read2.qualities),
                cb_index=cb_index,
                fb_index=fb_index,
                feature_barcodes=feature_barcodes,
                read1_coords=read1_coords,
                read2_coords=read2_coords,
                cb_num_mismatches=cb_num_mismatches,
                fb_num_mismatches=fb_num_mismatches,
                cb_num_n_threshold=cb_num_n_threshold,
                fb_num_n_threshold=fb_num_n_threshold
            )

            if out:
                read_counter[0] += 1
                yield '\t'.join(out)
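
# extract_feature_barcoding_fastss is a generator yielding one tab-separated
# record per matched read pair; a minimal consumption sketch (hypothetical
# file names):
#
#     with open('matching.tsv', 'w') as f:
#         for out in extract_feature_barcoding_fastss(
#                 read1_file='R1.fq.gz', read2_file='R2.fq.gz',
#                 cb_file='barcodes.tsv', fb_file='feature_ref.tsv',
#                 read1_coords=(0, 16), read2_coords=(10, 25),
#                 cb_num_mismatches=1, fb_num_mismatches=1):
#             f.write(out + '\n')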
def map_feature_barcoding(read1_file,
                          read2_file,
                          cb_file,
                          fb_file,
                          read1_coords,
                          num_mismatches=1,
                          num_n_threshold=3,
                          num_n_ref=0,
                          umi_pos_start=16,
                          umi_length=12,
                          umi_deduplication_method='directional',
                          umi_deduplication_threshold=1,
                          mapq=10,
                          output_directory='barcode_mapping',
                          num_threads=None,
                          aligner='bwa'):
    """Maps feature barcoding."""

    output_directory = Path(output_directory)
    output_directory.mkdir(exist_ok=True)

    FB_FASTA_FILE = str(output_directory / 'feature_ref.fasta')
    FEATURE_BARCODE_REF = str(output_directory / 'feature_ref')
    ALIGNMENT_FILE = str(output_directory / 'aligned.bam')

    fasta_file = fb2fa_concatenated(x=fb_file,
                                    fasta_file=FB_FASTA_FILE,
                                    num_n=num_n_ref)

    if aligner == 'bowtie2':
        FEATURE_BARCODE_INDEX_LOG = str(
            output_directory / 'bowtie2-build.log')
        UNALIGNED_BAM_FILE = str(output_directory / 'unaligned.bam')
        ALIGNMENT_LOG = str(output_directory / 'bowtie2.log')

        logger.info(f'bowtie2 version: {parse_bowtie2_version()}')
        if version.parse(parse_bowtie2_version()) < version.parse('2.4.0'):
            logger.critical('Please use bowtie2 >= 2.4.0')
            sys.exit(1)

        feature_barcode_ref, _ = build_bt2_index(
            fasta_file=fasta_file, index_base=FEATURE_BARCODE_REF)

    elif aligner == 'bwa':
        FEATURE_BARCODE_INDEX_LOG = str(output_directory / 'bwa-index.log')
        MODIFIED_READ_FILE = str(output_directory / 'modified.fq.gz')
        ALIGNMENT_LOG = str(output_directory / 'bwa.log')

        logger.info(f'bwa version: {parse_bwa_version()}')
        if version.parse(parse_bwa_version()) < version.parse('0.7.0'):
            logger.critical('Please use bwa >= 0.7.0')
            sys.exit(1)

        fasta_file, _ = build_bwa_index(fasta_file=fasta_file)

    logger.info(f'samtools version: {parse_samtools_version()}')

    with open_by_suffix(file_name=FEATURE_BARCODE_INDEX_LOG, mode='w') as f:
        f.write(_)

    num_cb = sum(1 for line in open_by_suffix(cb_file))
    logger.info(f'Number of reference cell barcodes: {num_cb:,}')
    logger.info('Read 1 coordinates to search: ['
                + ', '.join([str(i) for i in read1_coords]) + ')')
    logger.info(f'Cell barcode maximum number of mismatches: {num_mismatches}')
    logger.info(f'Read 1 maximum number of N allowed: {num_n_threshold}')

    logger.info('Matching cell barcodes, read 1 ...')
    if aligner == 'bowtie2':
        unaligned_bam_file, read_counter = generate_unaligned_bam(
            read1_file=read1_file,
            read2_file=read2_file,
            cb_file=cb_file,
            fb_file=fb_file,
            unaligned_bam_file=UNALIGNED_BAM_FILE,
            read1_coords=read1_coords,
            num_mismatches=num_mismatches,
            num_n_threshold=num_n_threshold,
            num_n_ref=num_n_ref)
    elif aligner == 'bwa':
        modified_read_file, read_counter = generate_modified_fastq(
            read1_file=read1_file,
            read2_file=read2_file,
            cb_file=cb_file,
            read1_coords=read1_coords,
            modified_read_file=MODIFIED_READ_FILE,
            num_mismatches=num_mismatches,
            num_n_threshold=num_n_threshold)

    logger.info(f'Number of read pairs processed: {read_counter[1]:,}')
    logger.info('Number of read pairs w/ valid cell barcodes: '
                f'{read_counter[0]:,}')

    num_fb = len(set([i.split('\t')[0] for i in open_by_suffix(fb_file)]))
    logger.info(f'Number of reference features: {num_fb:,}')

    if not num_threads:
        num_threads = cpu_count()
    logger.info(f'Number of threads: {num_threads}')

    logger.info('Aligning read 2 ...')
    if aligner == 'bowtie2':
        alignment_file, _ = align_reads_bowtie2(
            unaligned_bam_file=unaligned_bam_file,
            index_base=feature_barcode_ref,
            alignment_file=ALIGNMENT_FILE,
            temp_prefix=next(_get_candidate_names()),
            num_threads=num_threads)
    elif aligner == 'bwa':
        alignment_file, _ = align_reads_bwa(
            modified_read_file=modified_read_file,
            index_base=fasta_file,
            alignment_file=ALIGNMENT_FILE,
            temp_prefix=next(_get_candidate_names()),
            num_threads=num_threads)

    pysam.index(alignment_file, alignment_file + '.bai')

    with open_by_suffix(file_name=ALIGNMENT_LOG, mode='w') as f:
        f.write(_)
    logger.info(f'\n{_.rstrip()}')

    logger.info('Generating matrix (UMI deduplication) ...')
    logger.info(f'UMI-tools version: {umi_tools_version}')
    logger.info(f'Mapping quality threshold: {mapq}')
    logger.info(f'UMI starting position on read 1: {umi_pos_start}')
    logger.info(f'UMI length: {umi_length}')
    logger.info('UMI-tools deduplication threshold: '
                f'{umi_deduplication_threshold}')
    logger.info('UMI-tools deduplication method: '
                f'{umi_deduplication_method}')

    matrix_featurecount = generate_matrix_from_alignment(
        alignment_file=alignment_file,
        umi_pos_start=umi_pos_start,
        umi_length=umi_length,
        umi_deduplication_method=umi_deduplication_method,
        umi_deduplication_threshold=umi_deduplication_threshold)

    logger.info(
        f'Number of cell barcodes detected: {matrix_featurecount.shape[1]:,}')
    logger.info(
        f'Number of features detected: {matrix_featurecount.shape[0]:,}')
    logger.info('Total UMIs after deduplication: '
                f'{matrix_featurecount.values.sum():,}')
    logger.info('Median number of UMIs per cell: '
                f'{np.median(matrix_featurecount.sum(axis=0)):,}')

    return matrix_featurecount
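
# A minimal usage sketch for map_feature_barcoding (hypothetical file
# names; requires bwa or bowtie2 plus samtools on the path):
#
#     matrix = map_feature_barcoding(read1_file='R1.fq.gz',
#                                    read2_file='R2.fq.gz',
#                                    cb_file='barcodes.tsv',
#                                    fb_file='feature_ref.tsv',
#                                    read1_coords=(0, 16),
#                                    aligner='bwa')
#     matrix.to_csv('matrix_featurecount.csv.gz')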
def generate_unaligned_bam(read1_file,
                           read2_file,
                           cb_file,
                           fb_file,
                           unaligned_bam_file,
                           read1_coords,
                           num_mismatches=1,
                           num_n_threshold=3,
                           num_n_ref=0):
    """Matches cell barcodes and generates unaligned bam.

    Parameters
    ----------
    read1_file : str
        The path and name of read 1 file.
    read2_file : str
        The path and name of read 2 file.
    cb_file : str
        The path and name of cell barcode file.
    fb_file : str
        The path and name of feature barcode file.
    unaligned_bam_file : str
        The path and name of unaligned bam file.
    read1_coords : tuple or list
        The positions of read 1 to compare against cell barcodes.
    num_mismatches : int, optional
        Maximum Levenshtein distance allowed.
    num_n_threshold : int, optional
        Maximum Ns allowed for read 1. Read 1 with more Ns than this
        threshold will be skipped.
    num_n_ref : int, optional
        Number of Ns to use for separating sequences belonging to the
        same feature. Needed for correctly constructing bam header.

    Returns
    -------
    str
        The path and name of unaligned bam file.
    list
        Counts of read pairs with valid cell barcodes and of read pairs
        processed.
    """

    cell_barcodes = [
        i.rstrip().split('-')[0]
        for i in open_by_suffix(cb_file, mode='r')
    ]
    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    # create bam header
    feature_barcodes = dict()
    with open_by_suffix(file_name=fb_file, mode='r') as f:
        for line in f:
            i = line.rstrip().split('\t')
            if i[0] not in feature_barcodes:
                feature_barcodes[i[0]] = []
            feature_barcodes[i[0]].append(i[1])
    feature_barcodes = [{
        'LN': len(('N' * num_n_ref).join(feature_barcodes[i])),
        'SN': i
    } for i in feature_barcodes]

    rg = {
        'ID': 'fba',
        'LB': 'null',
        'PL': 'illumina',
        'PU': 'null',
        'SM': 'null'
    }
    pg = {
        'ID': 'fba',
        'PN': 'fba',
        'VN': __version__,
        'CL': ' '.join(sys.argv)
    }
    fb_bam_header = {
        'HD': {
            'VN': '1.6'
        },
        'SQ': feature_barcodes,
        'RG': [rg],
        'PG': [pg]
    }

    def _get_sequence(read1_file, read2_file):
        """Gets sequences and qualities."""

        with dnaio.open(file1=read1_file,
                        file2=read2_file,
                        fileformat='fastq',
                        mode='r') as f:
            for rec in f:
                read1, read2 = rec
                yield read1.name, read1.sequence, read1.qualities, \
                    read2.sequence, read2.qualities

    read_counter = [int(), int()]
    with pysam.AlignmentFile(unaligned_bam_file,
                             'wb',
                             header=fb_bam_header) as outf:
        for i in _get_sequence(read1_file, read2_file):
            read_counter[1] += 1

            out = match_cell_barcodes(reads=i,
                                      barcode_index=cb_index,
                                      read_coords=read1_coords,
                                      num_mismatches=num_mismatches,
                                      num_n_threshold=num_n_threshold)
            if out:
                read_counter[0] += 1
                outf.write(compose_aln(out))

    return unaligned_bam_file, read_counter
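
# A quick way to inspect the resulting bam header (a sketch assuming pysam
# is available; check_sq=False is needed because the records are unaligned):
#
#     with pysam.AlignmentFile('unaligned.bam', 'rb', check_sq=False) as bam:
#         print(bam.header.to_dict()['SQ'][:2])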
def analyze_bulk(read_file,
                 read_coords,
                 fb_file,
                 num_mismatches=1,
                 num_n_threshold=3,
                 num_reads=None):
    """Searches feature barcodes on reads 2 and generates matrix.

    Parameters
    ----------
    read_file : str
        The path and name of read 2 file.
    read_coords : tuple or list
        The positions on read 2 to search.
    fb_file : str
        The path and name of feature barcoding file.
    num_mismatches : int, optional
        Maximum Levenshtein distance allowed.
    num_n_threshold : int, optional
        Maximum Ns allowed for reads.
    num_reads : int, optional
        Number of reads to analyze.

    Returns
    -------
    dict
        Count of each feature barcode in the provided fastq file.
    """

    with open_by_suffix(file_name=fb_file) as f:
        feature_barcodes = {
            i.rstrip().split('\t')[-1]: i.rstrip().replace('\t', '_')
            for i in f
        }
    fb_index = create_index(barcodes=feature_barcodes.keys(),
                            num_mismatches=num_mismatches)
    feature_barcode_count = {i: int() for i in feature_barcodes}

    logger.info('Number of reference feature barcodes: '
                f'{len(feature_barcode_count):,}')
    logger.info('Read 2 coordinates to search: ['
                + ', '.join([str(i) for i in read_coords]) + ')')
    logger.info(
        f'Feature barcode maximum number of mismatches: {num_mismatches}')
    logger.info(f'Read 2 maximum number of N allowed: {num_n_threshold}')

    if num_reads:
        logger.info(f'Number of read pairs to analyze: {num_reads:,}')
    else:
        logger.info('Number of read pairs to analyze: all')

    def _get_sequence(read_file):
        """Gets sequences."""

        with dnaio.open(file1=read_file,
                        file2=None,
                        fileformat='fastq',
                        mode='r') as f:
            for read in f:
                yield read.sequence, read.qualities

    _reads = islice(_get_sequence(read_file), 0, num_reads)

    logger.info('Matching ...')
    read_counter = int()
    for read_seq, read_qual in _reads:
        read_counter += 1
        if read_counter % 10_000_000 == 0:
            logger.info(f'Reads processed: {read_counter:,}')

        if read_seq.count('N') <= num_n_threshold:
            x2, y2 = read_coords
            fb_queries = query_index(read_seq[x2:y2],
                                     barcode_index=fb_index,
                                     num_mismatches=num_mismatches)
            fb_matched = select_query(fb_queries,
                                      read_seq[x2:y2],
                                      read_qual[x2:y2])
            if fb_matched:
                feature_barcode_count[fb_matched[0]] += 1

    logger.info(f'Number of reads processed: {read_counter:,}')

    return feature_barcode_count
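
# A minimal usage sketch for analyze_bulk (hypothetical file names):
#
#     fb_count = analyze_bulk(read_file='R2.fq.gz',
#                             read_coords=(10, 25),
#                             fb_file='feature_ref.tsv',
#                             num_mismatches=1)
#     # fb_count maps each reference feature barcode to its read count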
def summarize_barcode_positions(matching_file, output_directory='qc'):
    """Summarizes barcode positions for reads 1 and reads 2.

    Parameters
    ----------
    matching_file : str
        The path and name of matching result.
    output_directory : str, optional
        The path and name for the output directory.

    Returns
    -------
    str
        The path and name for the output directory.
    """

    logger.info('Summarizing barcode coordinates ...')
    logger.info(f'Output directory: {output_directory}')

    # read1
    Path(output_directory).mkdir(exist_ok=True)
    R1_BC_STARTING_FILE = \
        Path(output_directory) / 'Read1_barcodes_starting.csv'
    R1_BC_ENDING_FILE = \
        Path(output_directory) / 'Read1_barcodes_ending.csv'
    R1_BC_STARTING_ENDING_PLOT = \
        Path(output_directory) / 'Pyplot_read1_barcodes_starting_ending.pdf'

    # read2
    R2_BC_STARTING_FILE = \
        Path(output_directory) / 'Read2_barcodes_starting.csv'
    R2_BC_ENDING_FILE = \
        Path(output_directory) / 'Read2_barcodes_ending.csv'
    R2_BC_STARTING_ENDING_PLOT = \
        Path(output_directory) / 'Pyplot_read2_barcodes_starting_ending.pdf'

    # summary
    CB_MISMATCHES_FILE = \
        Path(output_directory) / 'Read1_barcodes_mismatches.csv'
    FB_MISMATCHES_FILE = \
        Path(output_directory) / 'Read2_barcodes_mismatches.csv'
    MATCHED_BC_RATIO_FILE = Path(
        output_directory) / 'matched_barcode_ratio.csv'

    # read lengths from the first data line
    with open_by_suffix(file_name=matching_file) as f:
        next(f)
        first_line = next(f)
        read1_length = len(first_line.split('\t')[0])
        read2_length = len(first_line.split('\t')[4])

    # barcode starts and ends
    barcode_counter = [int(), int()]
    cb_matching_pos = list()
    cb_matching_description = list()
    cb_mismatches = list()
    fb_matching_pos = list()
    fb_matching_description = list()
    fb_mismatches = list()

    with open_by_suffix(file_name=matching_file) as f:
        next(f)
        for line in f:
            i = line.rstrip().split('\t')
            barcode_counter[1] += 1

            if (i[2] not in {'no_match', 'n_skipping'}
                    and i[5] not in {'no_match', 'NA'}):
                barcode_counter[0] += 1

                cb_matching_pos.append(i[2])
                cb_matching_description.append(i[3])
                _ = [int(ii) for ii in i[2].split(':')]
                cb_mismatches.append(
                    len(i[1]) - (_[1] - _[0])
                    + sum([int(ii) for ii in i[3].split(':')]))

                fb_matching_pos.append(i[6])
                fb_matching_description.append(i[7])
                _ = [int(ii) for ii in i[6].split(':')]
                fb_mismatches.append(
                    len(i[5]) - (_[1] - _[0])
                    + sum([int(ii) for ii in i[7].split(':')]))

    barcode_counter.append(barcode_counter[0] / barcode_counter[1])
    with open_by_suffix(file_name=MATCHED_BC_RATIO_FILE, mode='w') as f:
        f.write(','.join(['valid', 'total', 'ratio']) + '\n'
                + ','.join([str(i) for i in barcode_counter]) + '\n')

    cb_mismatches = pd.Series(cb_mismatches).value_counts().to_frame(
        name='count')
    cb_mismatches['ratio'] = cb_mismatches['count'] / \
        sum(cb_mismatches['count'])
    cb_mismatches.sort_index().to_csv(CB_MISMATCHES_FILE)

    fb_mismatches = pd.Series(fb_mismatches).value_counts().to_frame(
        name='count')
    fb_mismatches['ratio'] = fb_mismatches['count'] / \
        sum(fb_mismatches['count'])
    fb_mismatches.sort_index().to_csv(FB_MISMATCHES_FILE)

    # cell barcode
    cb_s = [int(i.split(':')[0]) for i in cb_matching_pos]
    cb_e = [int(i.split(':')[1]) - 1 for i in cb_matching_pos]

    cb_start_dist = pd.Series(cb_s).value_counts().to_frame(
        name='count').reindex(list(range(read1_length))).fillna(0).astype(
            np.int64)
    cb_start_dist.to_csv(R1_BC_STARTING_FILE)
    cb_end_dist = pd.Series(cb_e).value_counts().to_frame(
        name='count').reindex(list(range(read1_length))).fillna(0).astype(
            np.int64)
    cb_end_dist.to_csv(R1_BC_ENDING_FILE)

    fig, ax = plt.subplots(nrows=1,
                           ncols=1,
                           figsize=(max(2.8, read1_length / 15), 2.5))
    plot_barcode_startend(
        s=cb_start_dist['count'] / sum(cb_start_dist['count']),
        e=cb_end_dist['count'] / sum(cb_end_dist['count']),
        bases=cb_start_dist.index.values,
        title='Distribution of cell barcode positions',
        ax=ax)
    plt.tight_layout()
    fig.savefig(fname=R1_BC_STARTING_ENDING_PLOT,
                transparent=None,
                bbox_inches='tight')

    # feature barcode
    fb_s = [int(i.split(':')[0]) for i in fb_matching_pos]
    fb_e = [int(i.split(':')[1]) - 1 for i in fb_matching_pos]

    fb_start_dist = pd.Series(fb_s).value_counts().to_frame(
        name='count').reindex(list(range(read2_length))).fillna(0).astype(
            np.int64)
    fb_start_dist.to_csv(R2_BC_STARTING_FILE)
    fb_end_dist = pd.Series(fb_e).value_counts().to_frame(
        name='count').reindex(list(range(read2_length))).fillna(0).astype(
            np.int64)
    fb_end_dist.to_csv(R2_BC_ENDING_FILE)

    fig, ax = plt.subplots(nrows=1,
                           ncols=1,
                           figsize=(max(2.8, read2_length / 15), 2.5))
    plot_barcode_startend(
        s=fb_start_dist['count'] / sum(fb_start_dist['count']),
        e=fb_end_dist['count'] / sum(fb_end_dist['count']),
        bases=fb_start_dist.index.values,
        title='Distribution of feature barcode positions',
        ax=ax)
    plt.tight_layout()
    fig.savefig(fname=R2_BC_STARTING_ENDING_PLOT,
                transparent=None,
                bbox_inches='tight')

    return output_directory
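
# A minimal usage sketch for summarize_barcode_positions; the input must be
# an 8-column matching file from the regex-based extract method
# (hypothetical file names):
#
#     summarize_barcode_positions(
#         matching_file='feature_barcoding_output.tsv.gz',
#         output_directory='qc')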
def extract_feature_barcoding_regex(read1_file,
                                    read2_file,
                                    cb_file,
                                    fb_file,
                                    cb_num_mismatches,
                                    fb_num_mismatches,
                                    cb_num_n_threshold=3,
                                    fb_num_n_threshold=3,
                                    read1_coords=None,
                                    read2_coords=None,
                                    num_threads=None,
                                    chunk_size=1000,
                                    num_reads=None):
    """Extracts feature barcodes."""

    logger.info(f'regex version: {regex.__version__}')

    with open_by_suffix(file_name=cb_file) as f:
        cell_barcodes = [i.split('-')[0].rstrip() for i in f]

    with open_by_suffix(file_name=fb_file) as f:
        feature_barcodes = {
            i.rstrip().split('\t')[1]: i.split('\t')[0] for i in f
        }

    logger.info(f'Number of reference cell barcodes: {len(cell_barcodes):,}')
    logger.info('Number of reference feature barcodes: '
                f'{len(feature_barcodes):,}')

    if read1_coords:
        logger.info('Read 1 coordinates to search: ['
                    + ', '.join([str(i) for i in read1_coords]) + ')')
    if read2_coords:
        logger.info('Read 2 coordinates to search: ['
                    + ', '.join([str(i) for i in read2_coords]) + ')')

    logger.info(
        f'Cell barcode maximum number of mismatches: {cb_num_mismatches}')
    logger.info(
        f'Feature barcode maximum number of mismatches: {fb_num_mismatches}')
    logger.info(f'Read 1 maximum number of N allowed: {cb_num_n_threshold}')
    logger.info(f'Read 2 maximum number of N allowed: {fb_num_n_threshold}')

    if num_reads:
        logger.info(f'Number of read pairs to analyze: {num_reads:,}')
        if chunk_size > num_reads:
            chunk_size = num_reads
    else:
        logger.info('Number of read pairs to analyze: all')

    cell_barcodes_compiled_exact = compile_regex_ref_barcodes_exact(
        barcodes=cell_barcodes
    )
    feature_barcodes_compiled_exact = compile_regex_ref_barcodes_exact(
        barcodes=feature_barcodes.keys()
    )

    if cb_num_mismatches:
        cell_barcodes_compiled_fuzzy = compile_regex_ref_barcodes_fuzzy(
            barcodes=cell_barcodes,
            num_mismatches=cb_num_mismatches
        )
    else:
        cell_barcodes_compiled_fuzzy = None

    if fb_num_mismatches:
        feature_barcodes_compiled_fuzzy = compile_regex_ref_barcodes_fuzzy(
            barcodes=feature_barcodes.keys(),
            num_mismatches=fb_num_mismatches
        )
    else:
        feature_barcodes_compiled_fuzzy = None

    def get_sequence(read1_file,
                     read2_file,
                     read1_coords=read1_coords,
                     read2_coords=read2_coords):
        """Gets sequences."""

        with dnaio.open(file1=read1_file,
                        file2=read2_file,
                        fileformat='fastq',
                        mode='r') as f:
            for rec in f:
                read1, read2 = rec
                read1_seq = read1.sequence
                read2_seq = read2.sequence

                if read1_coords:
                    r1_start, r1_end = read1_coords
                    r1 = read1_seq[r1_start:min(r1_end, len(read1_seq))]
                else:
                    r1 = read1_seq

                if read2_coords:
                    r2_start, r2_end = read2_coords
                    r2 = read2_seq[r2_start:min(r2_end, len(read2_seq))]
                else:
                    r2 = read2_seq

                yield r1, r2, read1_seq, read2_seq

    def _restore_orig_seq(x,
                          read1_coords=read1_coords,
                          read2_coords=read2_coords):
        """Formats matching output, restores original seqs and coordinates."""

        """
        ['TGATCTTAGAACACGT', 'TGATCTTAGAACACGT', '0:16', '2:0:0',
         'GGGGGGGGGGGGGGGGAGGGGGCCGGAAAAGAACCCCGAGAGGCCAGCGCCAAACAAAAAAGAACAAAAAAGAGGAAAAAAAAAAAAAAA',
         'no_match', 'NA', 'NA',
         'TGATCTTAGAACACGTCAGGGTCCTGAA',
         'GGGGGGGGGGGGGGGGAGGGGGCCGGAAAAGAACCCCGAGAGGCCAGCGCCAAACAAAAAAGAACAAAAAAGAGGAAAAAAAAAAAAAAA']

        ['TCTCAGCGTATAGTCC', 'TCTCAGCGTATAGTCC', '0:16', '2:0:0',
         'AGCGGGCGCATGTTCCCGCTCAACTATACGAACGGCTTTAAGGCCGGTCCTAGCAACCTGAAGGCTTAGGACTATACGCTGAGACTGTCT',
         'TGTTCCCGCTCAACT', '10:25', '0:0:0',
         'TCTCAGCGTATAGTCCTAAGCCTTCAGG',
         'AGCGGGCGCATGTTCCCGCTCAACTATACGAACGGCTTTAAGGCCGGTCCTAGCAACCTGAAGGCTTAGGACTATACGCTGAGACTGTCT']

        ['CGATCGGGTGTGCGCT', 'no_match', 'NA', 'NA',
         'CGATCGGCAGTGCGCTCACCTATTAGCGGCTAAGGCGATCTTGAGAGAGCGCACACCCGATCGCTGTCTCTTATACACATCTGACGCTGC',
         'NA', 'NA', 'NA',
         'CGATCGGGTGTGCGCTCTCTCAAGATCG',
         'CGATCGGCAGTGCGCTCACCTATTAGCGGCTAAGGCGATCTTGAGAGAGCGCACACCCGATCGCTGTCTCTTATACACATCTGACGCTGC']

        ['CAACAGTGTAACTAAG', 'CAACAGTGTAACTAAG', '0:16', '2:0:0',
         'GGGCAATGTAGCTGCGCTTTCCATTCGAGGCCGGGATTTAAGGCCGGTCCTAGCAANNCGGCTACCCTCTTAGTTACACTGTNGCTGTCT',
         'n_skipping', 'NA', 'NA',
         'CAACAGTGTAACTAAGAGGGTAGCCGTA',
         'GGGCAATGTAGCTGCGCTTTCCATTCGAGGCCGGGATTTAAGGCCGGTCCTAGCAANNCGGCTACCCTCTTAGTTACACTGTNGCTGTCT']
        """  # noqa

        # read1
        if read1_coords:
            r1_start, r1_end = read1_coords
            if ':' in x[2]:
                x[2] = ':'.join(
                    [str(int(i) + r1_start) for i in x[2].split(':')]
                )
            x[0] = x[-2][:r1_start].lower() + x[0] + x[-2][r1_end:].lower()

        # read2
        if read2_coords:
            r2_start, r2_end = read2_coords
            if ':' in x[6]:
                x[6] = ':'.join(
                    [str(int(i) + r2_start) for i in x[6].split(':')]
                )
            x[4] = x[-1][:r2_start].lower() + x[4] + x[-1][r2_end:].lower()

        if x[5] not in {'no_match', 'n_skipping', 'NA'}:
            x[5] = feature_barcodes[x[5]] + '_' + x[5]

        return '\t'.join(x[:-2])

    if not num_threads:
        num_threads = cpu_count()
    logger.info(f'Number of threads: {num_threads}')
    if num_threads > 1:
        logger.info(f'Chunk size: {chunk_size:,}')

    logger.info('Matching ...')
    _reads = islice(
        get_sequence(read1_file, read2_file), 0, num_reads
    )

    read_counter = int()
    if num_threads == 1:
        for r1, r2, read1_seq, read2_seq in _reads:
            read_counter += 1
            if read_counter % chunk_size == 0:
                logger.info(f'Read pairs processed: {read_counter:,}')

            out = match_barcodes_paired(
                read_seqs=(r1, r2, read1_seq, read2_seq),
                cb_compiled_exact=cell_barcodes_compiled_exact,
                fb_compiled_exact=feature_barcodes_compiled_exact,
                cb_compiled_fuzzy=cell_barcodes_compiled_fuzzy,
                fb_compiled_fuzzy=feature_barcodes_compiled_fuzzy,
                cb_num_n_threshold=cb_num_n_threshold,
                fb_num_n_threshold=fb_num_n_threshold
            )
            out = _restore_orig_seq(x=out,
                                    read1_coords=read1_coords,
                                    read2_coords=read2_coords)
            yield out

    else:
        items = list(islice(_reads, chunk_size))
        with Pool(processes=num_threads) as p:
            while items:
                read_counter += len(items)

                outs = p.starmap(
                    match_barcodes_paired,
                    zip(items,
                        repeat(cell_barcodes_compiled_exact),
                        repeat(feature_barcodes_compiled_exact),
                        repeat(cell_barcodes_compiled_fuzzy),
                        repeat(feature_barcodes_compiled_fuzzy),
                        repeat(cb_num_n_threshold),
                        repeat(fb_num_n_threshold))
                )
                outs = [
                    _restore_orig_seq(x=i,
                                      read1_coords=read1_coords,
                                      read2_coords=read2_coords)
                    for i in outs
                ]
                logger.info(f'Read pairs processed: {read_counter:,}')

                yield '\n'.join(outs)
                items = list(islice(_reads, chunk_size))
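
# extract_feature_barcoding_regex is a generator; with num_threads > 1 each
# yielded item is a newline-joined chunk rather than a single record, so the
# consumer can write items verbatim either way (hypothetical file names):
#
#     with open('matching.tsv', 'w') as f:
#         for out in extract_feature_barcoding_regex(
#                 read1_file='R1.fq.gz', read2_file='R2.fq.gz',
#                 cb_file='barcodes.tsv', fb_file='feature_ref.tsv',
#                 cb_num_mismatches=1, fb_num_mismatches=1,
#                 num_threads=2):
#             f.write(out + '\n')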