示例#1
0
def fb2fa_concatenated(x, fasta_file, num_n=0):
    """Generates feature barcode fasta file.

    Sequences that share a feature name are joined into one fasta record,
    with ``num_n`` Ns inserted between consecutive sequences.

    Parameters
    ----------
    x : str
        The path and name of feature barcode file. Each non-blank line is
        expected to hold a feature name and a sequence separated by a tab.
    fasta_file : str
        The path and name of generated fasta file.
    num_n : int, optional
        Number of Ns to use for separating sequences belonging to
        the same feature.

    Returns
    -------
    str
        The path and name of generated fasta file.
    """

    fb = dict()
    with open_by_suffix(file_name=x, mode='r') as f:
        for line in f:
            line = line.rstrip()
            # A blank line (typically a trailing one) carries no record;
            # splitting it would fail on the missing sequence column.
            if not line:
                continue

            i = line.split('\t')
            fb.setdefault(i[0], []).append(i[1])

    with open_by_suffix(file_name=fasta_file, mode='w') as fo:
        for name, sequences in fb.items():
            fo.write('>' + name + '\n')
            fo.write(('N' * num_n).join(sequences) + '\n')

    return fasta_file
示例#2
0
文件: kallisto.py 项目: jlduan/fba
def correct_cell_barcodes(cb_file,
                          output_directory,
                          bus_file,
                          corrected_bus_file):
    """Corrects cell barcodes.

    Writes a copy of the cell barcode whitelist in which each line is
    truncated at its first '-', then runs ``bustools correct`` with that
    copy as the whitelist.

    Parameters
    ----------
    cb_file : str or path-like
        The cell barcode whitelist file.
    output_directory : pathlib.Path
        Directory receiving 'barcodes_no_suffix.tsv'. It is combined with
        the file name through the ``/`` operator.
    bus_file : str or path-like
        The input bus file handed to bustools.
    corrected_bus_file : str or path-like
        The output file handed to bustools via '-o'.

    Returns
    -------
    str or path-like
        ``corrected_bus_file``, unchanged.
    """

    CELL_BARCODE_FILE = output_directory / 'barcodes_no_suffix.tsv'

    # Count the barcodes while writing them, instead of re-opening the
    # output afterwards through a file handle that is never closed.
    num_barcodes = 0
    with open_by_suffix(file_name=str(cb_file), mode='r') as f, \
            open_by_suffix(file_name=str(CELL_BARCODE_FILE),
                           mode='w') as fo:
        for line in f:
            fo.write(line.rstrip().split('-')[0] + '\n')
            num_barcodes += 1
    logger.info('Number of whitelisted cell barcodes: '
                + f'{num_barcodes}')

    cmd = [
        get_binary_path(binary_name='bustools'),
        'correct',
        '-w',
        str(CELL_BARCODE_FILE),
        '-o',
        str(corrected_bus_file),
        str(bus_file)
    ]

    # Only the second element returned by run_executable is logged.
    outs, errs = run_executable(cmd)
    logger.info(errs)

    return corrected_bus_file
示例#3
0
def generate_modified_fastq(read1_file,
                            read2_file,
                            cb_file,
                            read1_coords,
                            modified_read_file,
                            num_mismatches=1,
                            num_n_threshold=3):
    """Matches cell barcodes and generates modified fastq file.

    For every read pair whose read 1 matches a cell barcode, read 2 is
    written to ``modified_read_file`` with the matching information
    appended to its name as an 'RI:Z:' tag of the form
    ``<read1 sequence>#<barcode>#<distance>``.

    Parameters
    ----------
    read1_file : str
        The fastq file of read 1.
    read2_file : str
        The fastq file of read 2.
    cb_file : str
        The cell barcode file. Everything from the first '-' on is
        dropped from each line.
    read1_coords
        Passed through to ``match_cell_barcodes`` as ``read_coords``.
    modified_read_file : str
        The fastq file to write.
    num_mismatches : int, optional
        Used both to build the barcode index and for matching.
    num_n_threshold : int, optional
        Passed through to ``match_cell_barcodes``.

    Returns
    -------
    tuple
        ``(modified_read_file, read_counter)`` where ``read_counter`` is
        ``[number of read pairs written, number of read pairs read]``.
    """

    # Close the whitelist deterministically rather than leaving the handle
    # to the garbage collector.
    with open_by_suffix(cb_file, mode='r') as f:
        cell_barcodes = [i.rstrip().split('-')[0] for i in f]

    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    read_counter = [0, 0]
    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as f, dnaio.open(file1=modified_read_file,
                                               fileformat='fastq',
                                               mode='w') as f_out:

        for read1, read2 in f:
            read_counter[1] += 1

            reads = (read1.name, read1.sequence, read1.qualities,
                     read2.sequence, read2.qualities)
            out = match_cell_barcodes(reads=reads,
                                      barcode_index=cb_index,
                                      read_coords=read1_coords,
                                      num_mismatches=num_mismatches,
                                      num_n_threshold=num_n_threshold)
            if not out:
                continue

            read_counter[0] += 1

            read_name, read1_seq, _, read2_seq, read2_qual, bc, dist = out
            read_info = '#'.join([read1_seq, bc, str(dist)])

            # Keep only the part of the name before the first space and
            # attach the matching information as a tag.
            read_name = ' '.join(
                [read_name.split(' ')[0], 'RI:Z:' + read_info])

            f_out.write(dnaio.Sequence(read_name, read2_seq, read2_qual))

    return modified_read_file, read_counter
示例#4
0
文件: kallisto.py 项目: jlduan/fba
def fb2fa_kallisto(x, fasta_file, t2g_file):
    """Writes the fasta and t2g files and returns the k-mer length.

    For every feature barcode, one fasta record is produced per position
    and per base in 'ACGT', with that position replaced by that base
    (substituting the original base reproduces the unmodified barcode).

    Parameters
    ----------
    x : str
        The path and name of feature barcode file; a feature name and a
        sequence separated by a tab on each line, e.g.:
        CD3     CTCATTGTAACTCCT
        CD4     TGTTCCCGCTCAACT
        CD8a    GCTGCGCTTTCCATT

    fasta_file : str
        The path and name of generated fasta file. Records are named
        ``<feature>_<sequence>_<position>_<base>``, e.g.:
        >CD3_CTCATTGTAACTCCT_0_A
        ATCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_0_C
        CTCATTGTAACTCCT
        >CD3_CTCATTGTAACTCCT_1_A
        CACATTGTAACTCCT

    t2g_file : str
        The path and name of generated t2g file. Each line holds the
        record name followed by ``<feature>_<sequence>`` twice, separated
        by tabs, e.g.:
        CD3_CTCATTGTAACTCCT_0_A CD3_CTCATTGTAACTCCT     CD3_CTCATTGTAACTCCT
        CD3_CTCATTGTAACTCCT_0_C CD3_CTCATTGTAACTCCT     CD3_CTCATTGTAACTCCT

    Returns
    -------
    int
        The length of the shortest feature barcode, lowered by one when
        that length is even.
    """

    barcode_lengths = []
    t2g_records = []

    with open_by_suffix(file_name=x, mode='r') as f, \
            open_by_suffix(file_name=fasta_file, mode='w') as fo:

        for line in f:
            fields = line.rstrip().split('\t')
            feature_name, barcode = fields[0], fields[1]

            barcode_lengths.append(len(barcode))
            target = feature_name + '_' + barcode

            for pos, _ in enumerate(barcode):
                for base in 'ACGT':
                    variant_name = f'{target}_{pos}_{base}'
                    variant_seq = barcode[:pos] + base + barcode[pos + 1:]

                    t2g_records.append(
                        '\t'.join((variant_name, target, target)))
                    fo.write(f'>{variant_name}\n{variant_seq}\n')

    with open_by_suffix(file_name=t2g_file, mode='w') as foo:
        foo.write('\n'.join(t2g_records))

    unique_targets = {record.split('\t')[1] for record in t2g_records}
    logger.info(f'Number of feature barcodes: {len(unique_targets)}')

    shortest = min(barcode_lengths)
    kmer = shortest if shortest % 2 else shortest - 1
    logger.info(f'k-mer length: {kmer}')

    return kmer
示例#5
0
def filter_matching(matching_file,
                    filtered_matching_file,
                    cb_pos_start=0,
                    cb_num_mismatches=1,
                    cb_left_shift=1,
                    cb_right_shift=1,
                    cb_extra_seq=None,
                    cb_extra_seq_num_mismatches=None,
                    fb_pos_start=10,
                    fb_num_mismatches=1,
                    fb_left_shift=1,
                    fb_right_shift=1,
                    fb_extra_seq=None,
                    fb_extra_seq_num_mismatches=None):
    """Filters raw cell and feature barcode matching result.

    The header line is copied to the output, followed by every line whose
    cell barcode and feature barcode both pass. The test applied depends
    on the number of header columns:

    * 8 or 12 columns: columns 0-3 (cell barcode) and 4-7 (feature
      barcode) are each checked with ``is_matched`` against the position,
      shift, mismatch and extra-sequence arguments.
    * 6 columns: only the mismatch counts in columns 2 and 5 are compared
      with ``cb_num_mismatches`` and ``fb_num_mismatches``; the position
      and shift arguments are ignored.

    Parameters
    ----------
    matching_file : str
        The path and name of the raw matching result.
    filtered_matching_file : str
        The path and name of the filtered result to write.
    cb_pos_start, fb_pos_start : int, optional
        Expected barcode starting positions.
    cb_num_mismatches, fb_num_mismatches : int, optional
        Maximum number of mismatches allowed.
    cb_left_shift, cb_right_shift, fb_left_shift, fb_right_shift : int, optional
        Passed to ``is_matched`` as ``left_shift`` / ``right_shift``.
    cb_extra_seq, fb_extra_seq : str, optional
        Extra sequences compiled into a regex for ``is_matched``.
    cb_extra_seq_num_mismatches, fb_extra_seq_num_mismatches : int, optional
        Mismatches allowed in the extra sequences.

    Returns
    -------
    str
        ``filtered_matching_file``, unchanged.

    Raises
    ------
    ValueError
        If a data line is reached and the header does not have 6, 8 or
        12 columns.
    """  # noqa

    # The extra-sequence patterns do not depend on the input lines, so
    # they are compiled once instead of once per line.
    # NOTE(review): a mismatch count of 0 is falsy and therefore disables
    # the extra sequence check, as in the original code -- confirm intent.
    if cb_extra_seq and cb_extra_seq_num_mismatches:
        cell_barcode_sequence_regex = compile_regex_ref_barcodes_single(
            cb_extra_seq,
            num_mismatches=cb_extra_seq_num_mismatches
        )
    else:
        cell_barcode_sequence_regex = None

    if fb_extra_seq and fb_extra_seq_num_mismatches:
        feature_barcode_sequence_regex = compile_regex_ref_barcodes_single(
            fb_extra_seq,
            num_mismatches=fb_extra_seq_num_mismatches
        )
    else:
        feature_barcode_sequence_regex = None

    read_counter = [0, 0]

    with open_by_suffix(file_name=matching_file) as f:
        header_line = next(f).rstrip().split('\t')
        num_columns = len(header_line)
        has_positions = num_columns in (8, 12)

        logger.info('Header line: {}'.format(' '.join(header_line)))
        logger.info(
            f'Cell barcode maximum number of mismatches: {cb_num_mismatches}')
        logger.info(('Feature barcode maximum number of mismatches: ' +
                     f'{fb_num_mismatches}'))

        with open_by_suffix(file_name=filtered_matching_file, mode='w') as fo:
            fo.write('\t'.join(header_line) + '\n')

            if num_columns == 6:
                logger.info(
                    'Skipping arguments: ' +
                    '"cb_pos_start", "cb_left_shift", "cb_right_shift"')

                logger.info(
                    'Skipping arguments: ' +
                    '"fb_pos_start", "fb_left_shift", "fb_right_shift"')

            for line in f:
                read_counter[1] += 1

                i = line.rstrip().split('\t')

                if has_positions:
                    cell_barcode_passed = is_matched(
                        x=i[:4],
                        barcode_pos_start=cb_pos_start,
                        mismatching_threshold=cb_num_mismatches,
                        left_shift=cb_left_shift,
                        right_shift=cb_right_shift,
                        sequence_regex=cell_barcode_sequence_regex)
                elif num_columns == 6:
                    cell_barcode_passed = int(i[2]) <= cb_num_mismatches
                else:
                    # Previously this surfaced as an unbound local variable.
                    raise ValueError(
                        'unsupported number of columns in matching file: '
                        f'{num_columns}')

                if not cell_barcode_passed:
                    continue

                if has_positions:
                    feature_barcode_passed = is_matched(
                        x=i[4:8],
                        barcode_pos_start=fb_pos_start,
                        mismatching_threshold=fb_num_mismatches,
                        left_shift=fb_left_shift,
                        right_shift=fb_right_shift,
                        sequence_regex=feature_barcode_sequence_regex)
                else:
                    feature_barcode_passed = int(i[5]) <= fb_num_mismatches

                if feature_barcode_passed:
                    fo.write(line)
                    read_counter[0] += 1

    logger.info(f'Number of lines processed: {read_counter[1]:,}')
    logger.info(f'Number of lines passed filters: {read_counter[0]:,}')

    return filtered_matching_file
示例#6
0
文件: __main__.py 项目: jlduan/fba
def main():
    """Runs the command line interface.

    Parses the command line arguments, logs the banner and version
    information, then dispatches on ``args.command`` to one of the
    subcommands: extract, map, filter, count, demultiplex, qc or
    kallisto_wrapper. The module implementing a subcommand is imported
    only when that subcommand is selected.

    Exits with status 1 when the Python version is older than 3.6, when
    the qc subcommand receives an invalid number of reads, or when bulk
    mode of qc is used without read 2 coordinates.
    """

    args = parse_args()

    logger = get_logger(logger_name=__name__)
    banner = """


    █████▒▄▄▄▄    ▄▄▄
    ▓██   ▒▓█████▄ ▒████▄
    ▒████ ░▒██▒ ▄██▒██  ▀█▄
    ░▓█▒  ░▒██░█▀  ░██▄▄▄▄██
    ░▒█░   ░▓█  ▀█▓ ▓█   ▓██▒
    ▒ ░   ░▒▓███▀▒ ▒▒   ▓▒█░
    ░     ▒░▒   ░   ▒   ▒▒ ░
    ░ ░    ░    ░   ░   ▒
            ░            ░  ░
                ░
    """
    logger.info(banner)

    logger.info(f'fba version: {__version__}')
    logger.info('Initiating logging ...')
    logger.info(
        f'Python version: {sys.version_info.major}.{sys.version_info.minor}')

    # Tuple comparison; the former `not major == 3 and minor >= 6` parsed as
    # `(not major == 3) and ...` and never rejected Python 3.0 - 3.5.
    if sys.version_info < (3, 6):
        logger.critical('Please use Python >= 3.6')
        sys.exit(1)

    if args.command == 'extract':
        logger.info('Using extract subcommand ...')
        m = importlib.import_module(name='fba.levenshtein')

        with open_by_suffix(file_name=args.output, mode='w') as f:

            f.write('\t'.join(
                [
                    'read1_seq',
                    'cell_barcode',
                    'cb_num_mismatches',
                    'read2_seq',
                    'feature_barcode',
                    'fb_num_mismatches'
                ]
            ) + '\n')

            for out in m.extract_feature_barcoding_fastss(
                    read1_file=args.read1,
                    read2_file=args.read2,
                    cb_file=args.whitelist,
                    fb_file=args.feature_ref,
                    cb_num_mismatches=args.cell_barcode_mismatches,
                    fb_num_mismatches=args.feature_barcode_mismatches,
                    read1_coords=args.read1_coords,
                    read2_coords=args.read2_coords,
                    cb_num_n_threshold=args.cb_num_n_threshold,
                    fb_num_n_threshold=args.fb_num_n_threshold
            ):
                f.write(out + '\n')
        logger.info('Done.')

    elif args.command == 'map':
        logger.info('Using map subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        matrix_featurecount = m.map_feature_barcoding(
            read1_file=args.read1,
            read2_file=args.read2,
            cb_file=args.whitelist,
            fb_file=args.feature_ref,
            read1_coords=args.read1_coords,
            num_mismatches=args.cell_barcode_mismatches,
            num_n_threshold=args.cb_num_n_threshold,
            num_n_ref=args.num_n_ref,
            umi_pos_start=args.umi_pos_start,
            umi_length=args.umi_length,
            umi_deduplication_method=args.umi_deduplication_method,
            umi_deduplication_threshold=args.umi_mismatches,
            mapq=args.mapq,
            output_directory=args.output_directory,
            num_threads=args.threads,
            aligner=args.aligner
        )

        matrix_featurecount.to_csv(path_or_buf=args.output,
                                   compression='infer')
        logger.info('Done.')

    elif args.command == 'filter':
        logger.info('Using filter subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        # The feature barcode extra sequence comes from its own argument;
        # it used to be filled from the cell barcode one by mistake.
        filtered_file = m.filter_matching(
            matching_file=args.input,
            filtered_matching_file=args.output,
            cb_pos_start=args.cell_barcode_pos_start,
            cb_num_mismatches=args.cell_barcode_mismatches,
            cb_left_shift=args.cell_barcode_left_shift,
            cb_right_shift=args.cell_barcode_right_shift,
            cb_extra_seq=args.cell_barcode_extra_seq,
            cb_extra_seq_num_mismatches=(
                args.cell_barcode_extra_seq_mismatches),
            fb_pos_start=args.feature_barcode_pos_start,
            fb_num_mismatches=args.feature_barcode_mismatches,
            fb_left_shift=args.feature_barcode_left_shift,
            fb_right_shift=args.feature_barcode_right_shift,
            fb_extra_seq=args.feature_barcode_extra_seq,
            fb_extra_seq_num_mismatches=(
                args.feature_barcode_extra_seq_mismatches)
        )
        logger.info(f'Filtered feature barcoding result: {filtered_file}')
        logger.info('Done.')

    elif args.command == 'count':
        logger.info('Using count subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        matrix_featurecount = m.generate_matrix(
            matching_file=args.input,
            umi_pos_start=args.umi_pos_start,
            umi_length=args.umi_length,
            umi_deduplication_method=args.umi_deduplication_method,
            umi_deduplication_threshold=args.umi_mismatches
        )

        matrix_featurecount.to_csv(
            path_or_buf=args.output,
            compression='infer'
        )
        logger.info('Done.')

    elif args.command == 'demultiplex':
        logger.info('Using demultiplex subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        # Keyword names (including their spelling) are those of the callee.
        m.demultiplex_feature_barcoding(
            matrix_featurecount_file=args.input,
            output_directory=args.output_directory,
            q=args.quantile,
            initial_clustering_methold=args.clustering_method,
            visualization=args.visualization,
            embeding_method=args.visualization_method,
            seed=42
        )
        logger.info('Done.')

    elif args.command == 'qc':
        logger.info('Using qc subcommand ...')
        m = importlib.import_module(name=f'fba.{args.command}')

        import pandas as pd
        from pathlib import Path

        # The number of reads may arrive as an int, as a string of digits,
        # or as the (case-insensitive) string 'none'.
        if args.num_reads is None or isinstance(args.num_reads, int):
            num_reads = args.num_reads
        elif args.num_reads.isdigit():
            num_reads = int(args.num_reads)
        elif args.num_reads.upper() == 'NONE':
            num_reads = None
        else:
            logger.critical(
                f'Invalid number of reads: {args.num_reads}')
            sys.exit(1)

        if args.read1:
            m.summarize_sequence_content(
                read1_file=args.read1,
                read2_file=args.read2,
                num_reads=num_reads,
                output_directory=args.output_directory
            )

            OUTPUT_FILE = 'feature_barcoding_output.tsv.gz'
            OUTPUT_FILE = str(Path(args.output_directory) / OUTPUT_FILE)
            with open_by_suffix(file_name=OUTPUT_FILE, mode='w') as f:

                f.write('\t'.join(
                    [
                        'read1_seq',
                        'cell_barcode',
                        'cb_matching_pos',
                        'cb_matching_description',
                        'read2_seq',
                        'feature_barcode',
                        'fb_matching_pos',
                        'fb_matching_description'
                    ]
                ) + '\n')

                n = importlib.import_module(name='fba.regex')
                for out in n.extract_feature_barcoding_regex(
                        read1_file=args.read1,
                        read2_file=args.read2,
                        cb_file=args.whitelist,
                        fb_file=args.feature_ref,
                        cb_num_mismatches=args.cell_barcode_mismatches,
                        fb_num_mismatches=args.feature_barcode_mismatches,
                        cb_num_n_threshold=args.cb_num_n_threshold,
                        fb_num_n_threshold=args.fb_num_n_threshold,
                        read1_coords=args.read1_coords,
                        read2_coords=args.read2_coords,
                        num_threads=args.threads,
                        chunk_size=args.chunk_size,
                        num_reads=num_reads):

                    f.write(out + '\n')

            m.summarize_barcode_positions(
                matching_file=OUTPUT_FILE,
                output_directory=args.output_directory)

        else:
            logger.info('Bulk mode enabled: '
                        'only feature barcodes on reads 2 are analyzed')
            if not args.read2_coords:
                logger.critical('Please specify "-r2_coords" in bulk mode')
                sys.exit(1)

            logger.info(
                'Skipping arguments: "-1", "-w", "-cb_m", "-r1_coords"'
            )

            fb_frequency = m.analyze_bulk(
                read_file=args.read2,
                read_coords=args.read2_coords,
                fb_file=args.feature_ref,
                num_mismatches=args.feature_barcode_mismatches,
                num_n_threshold=args.fb_num_n_threshold,
                num_reads=num_reads
            )

            Path(args.output_directory).mkdir(exist_ok=True)
            OUTPUT_FILE = 'feature_barcode_frequency.csv'
            OUTPUT_FILE = str(Path(args.output_directory) / OUTPUT_FILE)
            logger.info(f'Output file: {OUTPUT_FILE}')

            fb_frequency = pd.DataFrame.from_dict(
                data=fb_frequency,
                orient='index',
                columns=['num_reads']).sort_values(
                by='num_reads',
                ascending=False
            )
            fb_frequency['percentage'] = fb_frequency['num_reads'] / sum(
                fb_frequency['num_reads'])
            fb_frequency.to_csv(path_or_buf=OUTPUT_FILE)
        logger.info('Done.')

    elif args.command == 'kallisto_wrapper':
        logger.info('Using kallisto_wrapper subcommand ...')
        m = importlib.import_module(name='fba.kallisto')

        matrix_featurecount = m.run_kallisto(
            read1_file=args.read1,
            read2_file=args.read2,
            cb_file=args.whitelist,
            fb_file=args.feature_ref,
            technology=args.technology,  # '10xv3',
            output_directory=args.output_directory,  # 'kallisto',
            num_threads=args.threads)

        matrix_featurecount.to_csv(
            path_or_buf=args.output,
            compression='infer'
        )
        logger.info('Done.')
示例#7
0
def generate_matrix(matching_file,
                    umi_pos_start=16,
                    umi_length=12,
                    umi_deduplication_method='directional',
                    umi_deduplication_threshold=1):
    """Generates a matrix based on matching results.

    UMIs are collected per cell barcode and feature barcode, then
    deduplicated with the UMI-tools clusterer; each matrix entry is the
    number of UMI clusters. Lines whose read 1 sequence is too short to
    contain the whole UMI are counted as processed but contribute no UMI.

    Parameters
    ----------
    matching_file : str
        The path and name of matching result file.
    umi_pos_start : int, optional
        The starting coordinate of UMI on read 1. It is only used when the
        header of the matching file has 6 columns. Otherwise the starting
        coordinate is determined for each line from the second of the
        ':'-separated integers in the third column, overriding this value.
    umi_length : int, optional
        The length of UMI on read 1 after cell barcode. The default is 12.
    umi_deduplication_method : str, optional
        The UMI deduplication method used in UMI-tools
        (Smith, T., et al. (2017). Genome Res. 27, 491–499.).
        See https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof
    umi_deduplication_threshold : int, optional
        The mismatch tolerance for UMI deduplication.

    Returns
    -------
    DataFrame
        A pandas DataFrame of feature count. The columns are cells and
        the rows are features, both in sorted order.

    Raises
    ------
    ValueError
        If the header has 6 columns and ``umi_pos_start`` is falsy.
    """  # noqa

    logger.info(f'UMI-tools version: {umi_tools_version}')

    matrix_featurecount = defaultdict(dict)
    line_counter = 0

    with open_by_suffix(file_name=matching_file) as f:
        header_line = next(f)
        # The layout is fixed by the header, so decide once rather than
        # re-splitting the header for every data line.
        fixed_umi_start = len(header_line.split('\t')) == 6

        if fixed_umi_start:
            if umi_pos_start:
                logger.info(
                    f'UMI starting position on read 1: {umi_pos_start}'
                )
            else:
                logger.critical(
                    'Need to specify UMI starting position on read 1: -us'
                )
                raise ValueError('need to specify UMI starting position')
        else:
            logger.info('UMI start position on read 1 auto-detected, '
                        'overriding -us')
        logger.info(f'UMI length: {umi_length}')
        logger.info('UMI-tools deduplication threshold: '
                    f'{umi_deduplication_threshold}')
        logger.info('UMI-tools deduplication method: '
                    f'{umi_deduplication_method}')

        logger.info('Header line: {}'.format(
            header_line.rstrip().replace('\t', ' ')))

        for line in f:
            i = line.rstrip().split('\t')
            line_counter += 1

            read_seq = i[0]
            cell_barcode = i[1]

            if fixed_umi_start:
                feature_barcode = i[4]
            else:
                feature_barcode = i[5]
                # The UMI starts at the second coordinate of the cell
                # barcode matching position.
                umi_pos_start = int(i[2].split(':')[1])

            umi_pos_end = umi_pos_start + umi_length

            if len(read_seq) >= umi_pos_end:
                # The UMIs are handed to the clusterer as upper-case bytes.
                umi_seq = read_seq[
                    umi_pos_start:umi_pos_end].upper().encode()

                matrix_featurecount[cell_barcode].setdefault(
                    feature_barcode, []).append(umi_seq)

    logger.info(f'Number of lines processed: {line_counter:,}')

    cell_barcodes = sorted(matrix_featurecount.keys())
    feature_barcodes = sorted(
        {ii
         for features in matrix_featurecount.values()
         for ii in features}
    )
    logger.info(f'Number of cell barcodes detected: {len(cell_barcodes):,}')
    logger.info(f'Number of features detected: {len(feature_barcodes):,}')

    clusterer = UMIClusterer(cluster_method=umi_deduplication_method)
    for features in matrix_featurecount.values():
        for ii in feature_barcodes:

            # Features never seen in this cell get an explicit 0; the
            # others have their UMI list replaced by its cluster count.
            umis = features.setdefault(ii, 0)
            if umis:
                features[ii] = len(
                    clusterer(Counter(umis),
                              threshold=umi_deduplication_threshold)
                )

    matrix_featurecount = {i: [matrix_featurecount[i][ii]
                               for ii in feature_barcodes]
                           for i in cell_barcodes}
    matrix_featurecount = pd.DataFrame.from_dict(matrix_featurecount,
                                                 orient='columns')
    matrix_featurecount.index = feature_barcodes

    logger.info('Total UMIs after deduplication: '
                f'{matrix_featurecount.values.sum():,}')
    logger.info('Median number of UMIs per cell: '
                f'{np.median(matrix_featurecount.sum(axis=0)):,}')

    return matrix_featurecount
示例#8
0
文件: levenshtein.py 项目: jlduan/fba
def extract_feature_barcoding_fastss(read1_file,
                                     read2_file,
                                     cb_file,
                                     fb_file,
                                     read1_coords,
                                     read2_coords,
                                     cb_num_mismatches,
                                     fb_num_mismatches,
                                     cb_num_n_threshold=3,
                                     fb_num_n_threshold=3):
    """Extracts feature barcodes.

    This is a generator: nothing is read or logged until iteration
    starts, and the summary counts are logged once the input reads are
    exhausted.

    Parameters
    ----------
    read1_file : str
        The fastq file of read 1.
    read2_file : str
        The fastq file of read 2.
    cb_file : str
        The cell barcode file. Everything from the first '-' on is
        dropped from each line.
    fb_file : str
        The tab-separated feature barcode file. The last column is used
        as the barcode sequence, and the whole line with tabs replaced by
        '_' as its name.
    read1_coords, read2_coords : sequence
        Coordinates to search on read 1 and read 2.
    cb_num_mismatches, fb_num_mismatches : int
        Maximum number of mismatches for cell and feature barcodes.
    cb_num_n_threshold, fb_num_n_threshold : int, optional
        Maximum number of Ns allowed on read 1 and read 2.

    Yields
    ------
    str
        One tab-joined matching result per read pair for which
        ``match_barcodes_paired_fastss`` returns a result.
    """

    with open_by_suffix(file_name=cb_file) as f:
        cell_barcodes = [i.split('-')[0].rstrip() for i in f]

    with open_by_suffix(file_name=fb_file) as f:
        feature_barcodes = {
            i.rstrip().split('\t')[-1]: i.rstrip().replace('\t', '_')
            for i in f
        }

    logger.info(f'Number of reference cell barcodes: {len(cell_barcodes):,}')
    logger.info(
        f'Number of reference feature barcodes: {len(feature_barcodes):,}'
    )

    logger.info('Read 1 coordinates to search: [' +
                ', '.join([str(i) for i in read1_coords]) + ')')
    logger.info('Read 2 coordinates to search: [' +
                ', '.join([str(i) for i in read2_coords]) + ')')

    logger.info(
        f'Cell barcode maximum number of mismatches: {cb_num_mismatches}')
    logger.info(
        f'Feature barcode maximum number of mismatches: {fb_num_mismatches}')
    logger.info(
        f'Read 1 maximum number of N allowed: {cb_num_n_threshold}')
    logger.info(
        f'Read 2 maximum number of N allowed: {fb_num_n_threshold}')

    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=cb_num_mismatches)

    fb_index = create_index(barcodes=feature_barcodes.keys(),
                            num_mismatches=fb_num_mismatches)

    logger.info('Matching ...')

    # [read pairs with a matching result, read pairs read]
    read_counter = [0, 0]

    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as f:

        for read1, read2 in f:

            read_counter[1] += 1
            if read_counter[1] % 10_000_000 == 0:
                logger.info(f'Read pairs processed: {read_counter[1]:,}')

            out = match_barcodes_paired_fastss(
                read_seqs=(read1.sequence, read1.qualities,
                           read2.sequence, read2.qualities),
                cb_index=cb_index,
                fb_index=fb_index,
                feature_barcodes=feature_barcodes,
                read1_coords=read1_coords,
                read2_coords=read2_coords,
                cb_num_mismatches=cb_num_mismatches,
                fb_num_mismatches=fb_num_mismatches,
                cb_num_n_threshold=cb_num_n_threshold,
                fb_num_n_threshold=fb_num_n_threshold
            )
            if out:
                read_counter[0] += 1
                yield '\t'.join(out)

    # Report the totals; the counters were previously kept but never used.
    logger.info(f'Number of read pairs processed: {read_counter[1]:,}')
    logger.info('Number of read pairs w/ valid barcodes: '
                f'{read_counter[0]:,}')
示例#9
0
def map_feature_barcoding(read1_file,
                          read2_file,
                          cb_file,
                          fb_file,
                          read1_coords,
                          num_mismatches=1,
                          num_n_threshold=3,
                          num_n_ref=0,
                          umi_pos_start=16,
                          umi_length=12,
                          umi_deduplication_method='directional',
                          umi_deduplication_threshold=1,
                          mapq=10,
                          output_directory='barcode_mapping',
                          num_threads=None,
                          aligner='bwa'):
    """Maps feature barcoding.

    Builds an aligner index from the feature barcodes, matches cell
    barcodes on read 1, aligns read 2 against the feature reference and
    generates a count matrix from the alignment.

    Parameters
    ----------
    read1_file : str
        The path and name of read 1 file.
    read2_file : str
        The path and name of read 2 file.
    cb_file : str
        The path and name of cell barcode file.
    fb_file : str
        The path and name of feature barcode file.
    read1_coords : tuple or list
        The positions of read 1 to compare against cell barcodes.
    num_mismatches : int, optional
        Maximum number of mismatches allowed for cell barcodes.
    num_n_threshold : int, optional
        Maximum Ns allowed for read 1.
    num_n_ref : int, optional
        Number of Ns to use for separating sequences belonging to
        the same feature in the reference.
    umi_pos_start : int, optional
        UMI starting position on read 1.
    umi_length : int, optional
        UMI length.
    umi_deduplication_method : str, optional
        UMI-tools deduplication method.
    umi_deduplication_threshold : int, optional
        UMI-tools deduplication threshold.
    mapq : int, optional
        Mapping quality threshold.
    output_directory : str, optional
        The path and name of the directory for intermediate files
        (reference fasta, index/alignment logs, bam file).
    num_threads : int, optional
        Number of threads for alignment. Defaults to ``cpu_count()``.
    aligner : {'bwa', 'bowtie2'}, optional
        Aligner used for read 2.

    Returns
    -------
    object
        The matrix returned by ``generate_matrix_from_alignment``. It is
        logged here with features on axis 0 and cell barcodes on axis 1.

    Raises
    ------
    ValueError
        If ``aligner`` is neither 'bowtie2' nor 'bwa'.

    Notes
    -----
    The process exits with status 1 if the detected bowtie2 version is
    older than 2.4.0 or the detected bwa version is older than 0.7.0.
    """

    # Fail before any file is written; otherwise an unknown aligner only
    # surfaces later as an unbound local variable.
    if aligner not in {'bowtie2', 'bwa'}:
        raise ValueError(
            f"Unsupported aligner: {aligner!r}; "
            "expected 'bowtie2' or 'bwa'")

    output_directory = Path(output_directory)
    output_directory.mkdir(exist_ok=True)

    FB_FASTA_FILE = str(output_directory / 'feature_ref.fasta')
    FEATURE_BARCODE_REF = str(output_directory / 'feature_ref')
    ALIGNMENT_FILE = str(output_directory / 'aligned.bam')

    fasta_file = fb2fa_concatenated(x=fb_file,
                                    fasta_file=FB_FASTA_FILE,
                                    num_n=num_n_ref)

    if aligner == 'bowtie2':
        FEATURE_BARCODE_INDEX_LOG = str(output_directory / 'bowtie2-build.log')
        UNALIGNED_BAM_FILE = str(output_directory / 'unaligned.bam')
        ALIGNMENT_LOG = str(output_directory / 'bowtie2.log')

        bowtie2_version = parse_bowtie2_version()
        logger.info(f'bowtie2 version: {bowtie2_version}')

        if version.parse(bowtie2_version) < version.parse('2.4.0'):
            logger.critical('Please use bowtie2 >= 2.4.0')
            sys.exit(1)

        feature_barcode_ref, index_log = build_bt2_index(
            fasta_file=fasta_file, index_base=FEATURE_BARCODE_REF)

    elif aligner == 'bwa':
        FEATURE_BARCODE_INDEX_LOG = str(output_directory / 'bwa-index.log')
        MODIFIED_READ_FILE = str(output_directory / 'modified.fq.gz')
        ALIGNMENT_LOG = str(output_directory / 'bwa.log')

        bwa_version = parse_bwa_version()
        logger.info(f'bwa version: {bwa_version}')

        if version.parse(bwa_version) < version.parse('0.7.0'):
            logger.critical('Please use bwa >= 0.7.0')
            sys.exit(1)

        fasta_file, index_log = build_bwa_index(fasta_file=fasta_file)

    logger.info(f'samtools version: {parse_samtools_version()}')
    with open_by_suffix(file_name=FEATURE_BARCODE_INDEX_LOG, mode='w') as f:
        f.write(index_log)

    # Count lines inside a context manager so the handle gets closed.
    with open_by_suffix(file_name=cb_file) as f:
        num_cb = sum(1 for _ in f)
    logger.info(f'Number of reference cell barcodes: {num_cb:,}')
    logger.info('Read 1 coordinates to search: [' +
                ', '.join([str(i) for i in read1_coords]) + ')')
    logger.info(f'Cell barcode maximum number of mismatches: {num_mismatches}')
    logger.info(f'Read 1 maximum number of N allowed: {num_n_threshold}')

    logger.info('Matching cell barcodes, read 1 ...')

    if aligner == 'bowtie2':
        unaligned_bam_file, read_counter = generate_unaligned_bam(
            read1_file=read1_file,
            read2_file=read2_file,
            cb_file=cb_file,
            fb_file=fb_file,
            unaligned_bam_file=UNALIGNED_BAM_FILE,
            read1_coords=read1_coords,
            num_mismatches=num_mismatches,
            num_n_threshold=num_n_threshold,
            num_n_ref=num_n_ref)

    elif aligner == 'bwa':
        modified_read_file, read_counter = generate_modified_fastq(
            read1_file=read1_file,
            read2_file=read2_file,
            cb_file=cb_file,
            read1_coords=read1_coords,
            modified_read_file=MODIFIED_READ_FILE,
            num_mismatches=num_mismatches,
            num_n_threshold=num_n_threshold)

    # read_counter is [pairs with a valid cell barcode, pairs processed].
    logger.info(f'number of read pairs processed: {read_counter[1]:,}')
    logger.info('Number of read pairs w/ valid cell barcodes: '
                f'{read_counter[0]:,}')

    # A feature may span several lines; count distinct names (column 1).
    with open_by_suffix(file_name=fb_file) as f:
        num_fb = len({line.split('\t')[0] for line in f})
    logger.info(f'Number of reference features: {num_fb:,}')

    if not num_threads:
        num_threads = cpu_count()
    logger.info(f'Number of threads: {num_threads}')

    logger.info('Aligning read 2 ...')

    if aligner == 'bowtie2':
        alignment_file, alignment_log = align_reads_bowtie2(
            unaligned_bam_file=unaligned_bam_file,
            index_base=feature_barcode_ref,
            alignment_file=ALIGNMENT_FILE,
            temp_prefix=next(_get_candidate_names()),
            num_threads=num_threads)

    elif aligner == 'bwa':
        alignment_file, alignment_log = align_reads_bwa(
            modified_read_file=modified_read_file,
            index_base=fasta_file,
            alignment_file=ALIGNMENT_FILE,
            temp_prefix=next(_get_candidate_names()),
            num_threads=num_threads)

    pysam.index(alignment_file, alignment_file + '.bai')
    with open_by_suffix(file_name=ALIGNMENT_LOG, mode='w') as f:
        f.write(alignment_log)
    logger.info(f'\n{alignment_log.rstrip()}')

    logger.info('Generating matrix (UMI deduplication) ...')
    logger.info(f'UMI-tools version: {umi_tools_version}')
    # NOTE(review): mapq is logged but not forwarded to any call in this
    # function -- confirm where the threshold is actually applied.
    logger.info(f'Mapping quality threshold: {mapq}')

    logger.info(f'UMI starting position on read 1: {umi_pos_start}')
    logger.info(f'UMI length: {umi_length}')
    logger.info('UMI-tools deduplication threshold: '
                f'{umi_deduplication_threshold}')
    logger.info('UMI-tools deduplication method: '
                f'{umi_deduplication_method}')

    # Forward the requested method; it used to be hard-coded to
    # 'directional' regardless of the argument (and of the log line above).
    matrix_featurecount = generate_matrix_from_alignment(
        alignment_file=alignment_file,
        umi_pos_start=umi_pos_start,
        umi_length=umi_length,
        umi_deduplication_method=umi_deduplication_method,
        umi_deduplication_threshold=umi_deduplication_threshold)

    logger.info(
        f'Number of cell barcodes detected: {matrix_featurecount.shape[1]:,}')
    logger.info(
        f'Number of features detected: {matrix_featurecount.shape[0]:,}')

    logger.info('Total UMIs after deduplication: '
                f'{matrix_featurecount.values.sum():,}')
    logger.info('Median number of UMIs per cell: '
                f'{np.median(matrix_featurecount.sum(axis=0)):,}')

    return matrix_featurecount
示例#10
0
def generate_unaligned_bam(read1_file,
                           read2_file,
                           cb_file,
                           fb_file,
                           unaligned_bam_file,
                           read1_coords,
                           num_mismatches=1,
                           num_n_threshold=3,
                           num_n_ref=0):
    """Matches cell barcodes and generates unaligned bam.

    Parameters
    ----------
    read1_file : str
        The path and name of read 1 file.
    read2_file : str
        The path and name of read 2 file.
    cb_file : str
        The path and name of cell barcode file.
    fb_file : str
        The path and name of feature barcode file.
    unaligned_bam_file : str
        The path and name of unaligned file.
    read1_coords : tuple or list
        The positions of read 1 to compare against cell barcodes.
    num_mismatches : int, optional
        Maximum levenshtein distance allowed.
    num_n_threshold : int, optional
        Maximum Ns allowed for read 1. Read 1 with more Ns than this
        threshold will be skipped.
    num_n_ref : int, optional
        Number of Ns to use for separating sequences belonging to
        the same feature. Needed for correctly constructing bam header.

    Returns
    -------
    str
        The path and name of unaligned file.
    list of int
        Two counters: the number of read pairs written to the bam file
        (those for which cell barcode matching returned a result) and
        the number of read pairs processed.
    """

    # Keep only the part before the first '-' of each line; read inside a
    # context manager so the file handle is closed.
    with open_by_suffix(file_name=cb_file, mode='r') as f:
        cell_barcodes = [line.rstrip().split('-')[0] for line in f]

    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    # create bam header
    # Group barcode sequences (column 2) by feature name (column 1).
    feature_seqs = dict()
    with open_by_suffix(file_name=fb_file, mode='r') as f:
        for line in f:
            fields = line.rstrip().split('\t')
            feature_seqs.setdefault(fields[0], []).append(fields[1])

    # One reference entry per feature. Its length is that of the
    # feature's sequences joined by num_n_ref Ns, the same way
    # fb2fa_concatenated writes the reference fasta.
    sq_records = [{
        'LN': len(('N' * num_n_ref).join(seqs)),
        'SN': name
    } for name, seqs in feature_seqs.items()]

    rg = {
        'ID': 'fba',
        'LB': 'null',
        'PL': 'illumina',
        'PU': 'null',
        'SM': 'null'
    }

    pg = {
        'ID': 'fba',
        'PN': 'fba',
        'VN': __version__,
        'CL': ' '.join(sys.argv)
    }

    fb_bam_header = {
        'HD': {
            'VN': '1.6'
        },
        'SQ': sq_records,
        'RG': [rg],
        'PG': [pg]
    }

    def _get_sequence(read1_file, read2_file):
        """Gets sequences and qualities."""

        with dnaio.open(file1=read1_file,
                        file2=read2_file,
                        fileformat='fastq',
                        mode='r') as f:
            for rec in f:
                read1, read2 = rec

                yield read1.name, read1.sequence, read1.qualities, \
                    read2.sequence, read2.qualities

    # [read pairs written, read pairs processed]
    read_counter = [0, 0]
    with pysam.AlignmentFile(unaligned_bam_file, 'wb',
                             header=fb_bam_header) as outf:

        for reads in _get_sequence(read1_file, read2_file):
            read_counter[1] += 1

            out = match_cell_barcodes(reads=reads,
                                      barcode_index=cb_index,
                                      read_coords=read1_coords,
                                      num_mismatches=num_mismatches,
                                      num_n_threshold=num_n_threshold)
            # Only read pairs with a matching result are written.
            if out:
                read_counter[0] += 1
                outf.write(compose_aln(out))

    return unaligned_bam_file, read_counter
示例#11
0
文件: qc.py 项目: jlduan/fba
def analyze_bulk(read_file,
                 read_coords,
                 fb_file,
                 num_mismatches=1,
                 num_n_threshold=3,
                 num_reads=None):
    """Searches feature barcodes on reads 2 and counts the matches.

    Parameters
    ----------
    read_file : str
        The path and name of read 2 file.
    read_coords : tuple or list
        The start and end positions on read 2 to search. Reads are
        sliced as ``[start:end]`` before matching.
    fb_file : str
        The path and name of feature barcoding file.
    num_mismatches : int, optional
        Maximum number of mismatches allowed when querying the index.
    num_n_threshold : int, optional
        Maximum Ns allowed for reads. Reads with more Ns are skipped.
    num_reads : int, optional
        Number of reads to analyze. All reads are analyzed if not set.

    Returns
    -------
    None
        As shown here the function ends after the counting loop and
        the counts stay in a local dict.
        NOTE(review): the previous docstring described a dict with the
        count and frequency of each feature barcode -- the tail of the
        function appears to be missing from this excerpt.
    """

    # Map the last tab-separated field of every line (used as the
    # barcode to index) to the whole line with tabs replaced by '_'.
    feature_barcodes = dict()
    with open_by_suffix(file_name=fb_file) as handle:
        for line in handle:
            record = line.rstrip()
            feature_barcodes[record.split('\t')[-1]] = \
                record.replace('\t', '_')

    fb_index = create_index(barcodes=feature_barcodes.keys(),
                            num_mismatches=num_mismatches)
    feature_barcode_count = dict.fromkeys(feature_barcodes, 0)

    logger.info('Number of reference feature barcodes: '
                f'{len(feature_barcode_count):,}')

    coords_text = ', '.join(map(str, read_coords))
    logger.info('Read 2 coordinates to search: [' + coords_text + ')')

    logger.info(
        f'Feature barcode maximum number of mismatches: {num_mismatches}')
    logger.info(f'Read 2 maximum number of N allowed: {num_n_threshold}')

    reads_to_analyze = f'{num_reads:,}' if num_reads else 'all'
    logger.info(f'Number of read pairs to analyze: {reads_to_analyze}')

    def _iter_reads(fastq_file):
        """Yields sequence and qualities of every read."""

        with dnaio.open(file1=fastq_file,
                        file2=None,
                        fileformat='fastq',
                        mode='r') as reader:
            for record in reader:
                yield record.sequence, record.qualities

    # islice with a stop of None iterates over all reads.
    selected_reads = islice(_iter_reads(read_file), 0, num_reads)

    logger.info('Matching ...')

    read_counter = 0
    for read_counter, (seq, qual) in enumerate(selected_reads, start=1):

        if read_counter % 10_000_000 == 0:
            logger.info(f'Reads processed: {read_counter:,}')

        if seq.count('N') <= num_n_threshold:
            start, end = read_coords
            target_seq = seq[start:end]

            candidates = query_index(target_seq,
                                     barcode_index=fb_index,
                                     num_mismatches=num_mismatches)

            best_hit = select_query(candidates, target_seq, qual[start:end])
            if best_hit:
                feature_barcode_count[best_hit[0]] += 1
示例#12
0
文件: qc.py 项目: jlduan/fba
# Placeholders the matching output carries instead of a barcode or a
# 'start:end' position when nothing usable was found on a read.
_UNMATCHED_FLAGS = frozenset({'no_match', 'n_skipping', 'NA'})


def _count_barcode_mismatches(barcode, matching_pos, matching_description):
    """Computes the mismatch total of one matched barcode.

    The total is the barcode length minus the matched span
    (``end - start`` of ``matching_pos``) plus the sum of the
    colon-separated integers in ``matching_description``.
    """

    pos = [int(k) for k in matching_pos.split(':')]
    return (len(barcode) - (pos[1] - pos[0]) +
            sum(int(k) for k in matching_description.split(':')))


def _write_mismatch_table(mismatches, csv_file):
    """Writes count and ratio of every mismatch total to a csv file."""

    table = pd.Series(mismatches).value_counts().to_frame(name='count')
    table['ratio'] = table['count'] / sum(table['count'])
    table.sort_index().to_csv(csv_file)


def _summarize_positions(matching_pos, read_length, starting_file,
                         ending_file, plot_file, title):
    """Writes start/end position distributions and plots them."""

    starts = [int(k.split(':')[0]) for k in matching_pos]
    # Positions are 'start:end'; subtract 1 to report the last matched
    # base instead of the exclusive end.
    ends = [int(k.split(':')[1]) - 1 for k in matching_pos]

    def _distribution(positions):
        # One row per base of the read; positions never seen count 0.
        return pd.Series(positions).value_counts().to_frame(
            name='count').reindex(list(range(read_length))).fillna(
                0).astype(np.int64)

    start_dist = _distribution(starts)
    start_dist.to_csv(starting_file)
    end_dist = _distribution(ends)
    end_dist.to_csv(ending_file)

    fig, ax = plt.subplots(nrows=1,
                           ncols=1,
                           figsize=(max(2.8, read_length / 15), 2.5))
    plot_barcode_startend(s=start_dist['count'] / sum(start_dist['count']),
                          e=end_dist['count'] / sum(end_dist['count']),
                          bases=start_dist.index.values,
                          title=title,
                          ax=ax)
    plt.tight_layout()
    fig.savefig(fname=plot_file,
                transparent=None,
                bbox_inches='tight')


def summarize_barcode_positions(matching_file, output_directory='qc'):
    """Summarizes barcode positions for reads 1 and reads 2.

    Reads the tab-separated matching result (first line is skipped as
    header) and, for read pairs with both barcodes matched, writes to
    ``output_directory``: the ratio of matched read pairs, the mismatch
    distributions, the distributions of barcode starting and ending
    positions, and one plot per read.

    Parameters
    ----------
    matching_file : str
        The path and name of matching result. Columns used (0-based):
        0 read 1 sequence, 1 cell barcode, 2 cell barcode position,
        3 cell barcode matching description, 4 read 2 sequence,
        5 feature barcode, 6 feature barcode position, 7 feature
        barcode matching description.
    output_directory : str, optional
        The path and name for the output directory.

    Returns
    -------
    str
        The path and name for the output directory.
    """

    logger.info('Summarizing barcode coordinates ...')
    logger.info(f'Output directory: {output_directory}')

    out_dir = Path(output_directory)
    out_dir.mkdir(exist_ok=True)

    # read1
    R1_BC_STARTING_FILE = out_dir / 'Read1_barcodes_starting.csv'
    R1_BC_ENDING_FILE = out_dir / 'Read1_barcodes_ending.csv'
    R1_BC_STARTING_ENDING_PLOT = \
        out_dir / 'Pyplot_read1_barcodes_starting_ending.pdf'
    # read2
    R2_BC_STARTING_FILE = out_dir / 'Read2_barcodes_starting.csv'
    R2_BC_ENDING_FILE = out_dir / 'Read2_barcodes_ending.csv'
    R2_BC_STARTING_ENDING_PLOT = \
        out_dir / 'Pyplot_read2_barcodes_starting_ending.pdf'
    # summary
    CB_MISMATCHES_FILE = out_dir / 'Read1_barcodes_mismatches.csv'
    FB_MISMATCHES_FILE = out_dir / 'Read2_barcodes_mismatches.csv'
    MATCHED_BC_RATIO_FILE = out_dir / 'matched_barcode_ratio.csv'

    # Read lengths are taken from the first record after the header.
    with open_by_suffix(file_name=matching_file) as f:
        next(f)
        first_record = next(f).split('\t')

    read1_length = len(first_record[0])
    read2_length = len(first_record[4])

    # barcode starts and ends
    barcode_counter = [0, 0]  # [both barcodes matched, total records]
    cb_matching_pos = list()
    cb_mismatches = list()
    fb_matching_pos = list()
    fb_mismatches = list()

    with open_by_suffix(file_name=matching_file) as f:
        next(f)
        for line in f:
            i = line.rstrip().split('\t')
            barcode_counter[1] += 1

            # A record is usable only if both positions are real
            # coordinates. The earlier check let 'NA' positions and a
            # feature barcode of 'n_skipping' through, which then failed
            # in int().
            if any(i[k] in _UNMATCHED_FLAGS for k in (2, 5, 6)):
                continue

            barcode_counter[0] += 1

            cb_matching_pos.append(i[2])
            cb_mismatches.append(
                _count_barcode_mismatches(i[1], i[2], i[3]))
            fb_matching_pos.append(i[6])
            fb_mismatches.append(
                _count_barcode_mismatches(i[5], i[6], i[7]))

    # The first pass above guarantees at least one record, so the
    # denominator cannot be 0.
    barcode_counter.append(barcode_counter[0] / barcode_counter[1])
    with open_by_suffix(file_name=MATCHED_BC_RATIO_FILE, mode='w') as f:
        f.write(','.join(['valid', 'total', 'ratio']) + '\n' +
                ','.join([str(i) for i in barcode_counter]) + '\n')

    _write_mismatch_table(cb_mismatches, CB_MISMATCHES_FILE)
    _write_mismatch_table(fb_mismatches, FB_MISMATCHES_FILE)

    # cell barcode
    _summarize_positions(matching_pos=cb_matching_pos,
                         read_length=read1_length,
                         starting_file=R1_BC_STARTING_FILE,
                         ending_file=R1_BC_ENDING_FILE,
                         plot_file=R1_BC_STARTING_ENDING_PLOT,
                         title='Distribution of cell barcode positions')

    # feature barcode
    _summarize_positions(matching_pos=fb_matching_pos,
                         read_length=read2_length,
                         starting_file=R2_BC_STARTING_FILE,
                         ending_file=R2_BC_ENDING_FILE,
                         plot_file=R2_BC_STARTING_ENDING_PLOT,
                         title='Distribution of feature barcode positions')

    return output_directory
示例#13
0
def extract_feature_barcoding_regex(read1_file,
                                    read2_file,
                                    cb_file,
                                    fb_file,
                                    cb_num_mismatches,
                                    fb_num_mismatches,
                                    cb_num_n_threshold=3,
                                    fb_num_n_threshold=3,
                                    read1_coords=None,
                                    read2_coords=None,
                                    num_threads=None,
                                    chunk_size=1000,
                                    num_reads=None):
    """Extracts feature barcodes.

    Generator. Matches reference cell barcodes on read 1 and reference
    feature barcodes on read 2 with compiled regex patterns and yields
    the formatted matching results.

    Parameters
    ----------
    read1_file : str
        The path and name of read 1 file (fastq).
    read2_file : str
        The path and name of read 2 file (fastq).
    cb_file : str
        The path and name of cell barcode file. The part before the
        first '-' of each line is used as the barcode.
    fb_file : str
        The path and name of feature barcode file. Tab-separated; the
        second column is used as the barcode and the first column as
        the feature name.
    cb_num_mismatches : int
        Cell barcode maximum number of mismatches. If 0, no fuzzy
        pattern is compiled for cell barcodes.
    fb_num_mismatches : int
        Feature barcode maximum number of mismatches. If 0, no fuzzy
        pattern is compiled for feature barcodes.
    cb_num_n_threshold : int, optional
        Read 1 maximum number of N allowed.
    fb_num_n_threshold : int, optional
        Read 2 maximum number of N allowed.
    read1_coords : tuple or list, optional
        Start and end positions on read 1 to search. The whole read is
        searched if not set.
    read2_coords : tuple or list, optional
        Start and end positions on read 2 to search. The whole read is
        searched if not set.
    num_threads : int, optional
        Number of worker processes. Defaults to ``cpu_count()``. With 1,
        read pairs are matched sequentially in this process.
    chunk_size : int, optional
        Number of read pairs handed to the pool at once; with a single
        thread, the progress logging interval. Capped at ``num_reads``.
    num_reads : int, optional
        Number of read pairs to analyze. All if not set.

    Yields
    ------
    str
        With one thread, one tab-separated record per read pair. With
        more threads, the records of one chunk joined by newlines.
    """

    logger.info(f'regex version: {regex.__version__}')

    # Keep only the part before the first '-' of each line.
    with open_by_suffix(file_name=cb_file) as f:
        cell_barcodes = [i.split('-')[0].rstrip() for i in f]

    # Second column (barcode) -> first column (feature name).
    with open_by_suffix(file_name=fb_file) as f:
        feature_barcodes = {
            i.rstrip().split('\t')[1]: i.split('\t')[0] for i in f
        }

    logger.info(f'Number of reference cell barcodes: {len(cell_barcodes):,}')
    logger.info('Number of refernece feature barcodes: '
                f'{len(feature_barcodes):,}')

    if read1_coords:
        logger.info('Read 1 coordinates to search: [' +
                    ', '.join([str(i) for i in read1_coords]) + ')')
    if read2_coords:
        logger.info('Read 2 coordinates to search: [' +
                    ', '.join([str(i) for i in read2_coords]) + ')')

    logger.info(
        f'Cell barcode maximum number of mismatches: {cb_num_mismatches}')
    logger.info(
        f'Feature barcode maximum number of mismatches: {fb_num_mismatches}')
    logger.info(
        f'Read 1 maximum number of N allowed: {cb_num_n_threshold}')
    logger.info(
        f'Read 2 maximum number of N allowed: {fb_num_n_threshold}'
    )

    if num_reads:
        logger.info(f'Number of read pairs to analyze: {num_reads:,}')
        # No point in chunks larger than the number of reads requested.
        if chunk_size > num_reads:
            chunk_size = num_reads
    else:
        logger.info('Number of read pairs to analyze: all')

    cell_barcodes_compiled_exact = compile_regex_ref_barcodes_exact(
        barcodes=cell_barcodes
    )
    feature_barcodes_compiled_exact = compile_regex_ref_barcodes_exact(
        barcodes=feature_barcodes.keys()
    )

    # Fuzzy patterns are only compiled when mismatches are allowed.
    if cb_num_mismatches:
        cell_barcodes_compiled_fuzzy = compile_regex_ref_barcodes_fuzzy(
            barcodes=cell_barcodes,
            num_mismatches=cb_num_mismatches
        )
    else:
        cell_barcodes_compiled_fuzzy = None

    if fb_num_mismatches:
        # Iterating the dict yields its keys, i.e. the barcodes.
        feature_barcodes_compiled_fuzzy = compile_regex_ref_barcodes_fuzzy(
            barcodes=feature_barcodes,
            num_mismatches=fb_num_mismatches
        )
    else:
        feature_barcodes_compiled_fuzzy = None

    def get_sequence(read1_file, read2_file,
                     read1_coords=read1_coords, read2_coords=read2_coords):
        """Gets sequences.

        Yields, for every read pair, the regions of read 1 and read 2
        to search followed by the complete read 1 and read 2 sequences.
        """

        with dnaio.open(file1=read1_file,
                        file2=read2_file,
                        fileformat='fastq',
                        mode='r') as f:

            for rec in f:
                read1, read2 = rec

                read1_seq = read1.sequence
                read2_seq = read2.sequence

                # Restrict to the requested region, or use the whole read.
                if read1_coords:
                    r1_start, r1_end = read1_coords
                    r1 = read1_seq[r1_start: min(r1_end, len(read1_seq))]
                else:
                    r1 = read1_seq

                if read2_coords:
                    r2_start, r2_end = read2_coords
                    r2 = read2_seq[r2_start: min(r2_end, len(read2_seq))]
                else:
                    r2 = read2_seq

                yield r1, r2, read1_seq, read2_seq

    def _restore_orig_seq(x,
                          read1_coords=read1_coords,
                          read2_coords=read2_coords):
        """Formats matching output, restores original seqs and coordinates."""

        # Example records (no-op string literal). The last two elements
        # appear to be the complete read 1 and read 2 sequences; they are
        # used below for the flanks and dropped from the returned line.
        """
        ['TGATCTTAGAACACGT', 'TGATCTTAGAACACGT', '0:16', '2:0:0', 'GGGGGGGGGGGGGGGGAGGGGGCCGGAAAAGAACCCCGAGAGGCCAGCGCCAAACAAAAAAGAACAAAAAAGAGGAAAAAAAAAAAAAAA', 'no_match', 'NA', 'NA', 'TGATCTTAGAACACGTCAGGGTCCTGAA', 'GGGGGGGGGGGGGGGGAGGGGGCCGGAAAAGAACCCCGAGAGGCCAGCGCCAAACAAAAAAGAACAAAAAAGAGGAAAAAAAAAAAAAAA']
        ['TCTCAGCGTATAGTCC', 'TCTCAGCGTATAGTCC', '0:16', '2:0:0', 'AGCGGGCGCATGTTCCCGCTCAACTATACGAACGGCTTTAAGGCCGGTCCTAGCAACCTGAAGGCTTAGGACTATACGCTGAGACTGTCT', 'TGTTCCCGCTCAACT', '10:25', '0:0:0', 'TCTCAGCGTATAGTCCTAAGCCTTCAGG', 'AGCGGGCGCATGTTCCCGCTCAACTATACGAACGGCTTTAAGGCCGGTCCTAGCAACCTGAAGGCTTAGGACTATACGCTGAGACTGTCT']
        ['CGATCGGGTGTGCGCT', 'no_match', 'NA', 'NA', 'CGATCGGCAGTGCGCTCACCTATTAGCGGCTAAGGCGATCTTGAGAGAGCGCACACCCGATCGCTGTCTCTTATACACATCTGACGCTGC', 'NA', 'NA', 'NA', 'CGATCGGGTGTGCGCTCTCTCAAGATCG', 'CGATCGGCAGTGCGCTCACCTATTAGCGGCTAAGGCGATCTTGAGAGAGCGCACACCCGATCGCTGTCTCTTATACACATCTGACGCTGC']
        ['CAACAGTGTAACTAAG', 'CAACAGTGTAACTAAG', '0:16', '2:0:0', 'GGGCAATGTAGCTGCGCTTTCCATTCGAGGCCGGGATTTAAGGCCGGTCCTAGCAANNCGGCTACCCTCTTAGTTACACTGTNGCTGTCT', 'n_skipping', 'NA', 'NA', 'CAACAGTGTAACTAAGAGGGTAGCCGTA', 'GGGCAATGTAGCTGCGCTTTCCATTCGAGGCCGGGATTTAAGGCCGGTCCTAGCAANNCGGCTACCCTCTTAGTTACACTGTNGCTGTCT']
        """  # noqa

        # read1
        if read1_coords:
            r1_start, r1_end = read1_coords
            # Shift 'start:end' back to coordinates on the complete read
            # (placeholders such as 'NA' contain no ':' and are kept).
            if ':' in x[2]:
                x[2] = ':'.join(
                    [str(int(i) + r1_start) for i in x[2].split(':')]
                )
            # Re-attach the flanks outside the searched region, lowercased.
            x[0] = x[-2][:r1_start].lower() + x[0] + x[-2][r1_end:].lower()

        # read2
        if read2_coords:
            r2_start, r2_end = read2_coords
            if ':' in x[6]:
                x[6] = ':'.join(
                    [str(int(i) + r2_start) for i in x[6].split(':')]
                )
            x[4] = x[-1][:r2_start].lower() + x[4] + x[-1][r2_end:].lower()

        # Prefix a matched feature barcode with its feature name.
        if x[5] not in {'no_match', 'n_skipping', 'NA'}:
            x[5] = feature_barcodes[x[5]] + '_' + x[5]

        # Drop the two trailing complete read sequences.
        return '\t'.join(x[:-2])

    if not num_threads:
        num_threads = cpu_count()

    logger.info(f'Number of threads: {num_threads}')
    if num_threads > 1:
        logger.info(f'Chunk size: {chunk_size:,}')

    logger.info('Matching ...')

    # Lazily limit the read pairs; a stop of None means all of them.
    _reads = islice(
        get_sequence(read1_file, read2_file),
        0,
        num_reads
    )

    read_counter = int()
    if num_threads == 1:
        # Sequential mode: one record is yielded per read pair.
        for r1, r2, read1_seq, read2_seq in _reads:

            read_counter += 1
            # chunk_size doubles as the progress logging interval here.
            if read_counter % chunk_size == 0:
                logger.info(f'Read pairs processed: {read_counter:,}')

            out = match_barcodes_paired(
                read_seqs=(r1, r2, read1_seq, read2_seq),
                cb_compiled_exact=cell_barcodes_compiled_exact,
                fb_compiled_exact=feature_barcodes_compiled_exact,
                cb_compiled_fuzzy=cell_barcodes_compiled_fuzzy,
                fb_compiled_fuzzy=feature_barcodes_compiled_fuzzy,
                cb_num_n_threshold=cb_num_n_threshold,
                fb_num_n_threshold=fb_num_n_threshold
            )

            out = _restore_orig_seq(x=out,
                                    read1_coords=read1_coords,
                                    read2_coords=read2_coords)
            yield out

    else:
        # Parallel mode: read pairs are pulled in chunks, matched by the
        # pool, and each chunk is yielded as one newline-joined string.
        items = list(islice(_reads, chunk_size))

        with Pool(processes=num_threads) as p:
            while items:
                read_counter += len(items)

                # Each item is the (r1, r2, read1_seq, read2_seq) tuple
                # and becomes the first positional argument; the other
                # arguments are the same for every read pair.
                outs = p.starmap(
                    match_barcodes_paired,
                    zip(items,
                        repeat(cell_barcodes_compiled_exact),
                        repeat(feature_barcodes_compiled_exact),
                        repeat(cell_barcodes_compiled_fuzzy),
                        repeat(feature_barcodes_compiled_fuzzy),
                        repeat(cb_num_n_threshold),
                        repeat(fb_num_n_threshold),
                        )
                )
                outs = [
                    _restore_orig_seq(x=i,
                                      read1_coords=read1_coords,
                                      read2_coords=read2_coords) for i in outs
                ]

                logger.info(f'Read pairs processed: {read_counter:,}')
                yield '\n'.join(outs)

                # An empty chunk ends the loop.
                items = list(islice(_reads, chunk_size))