Example #1
def process_bam_barcode(bam, pair_iter, bc, corrected_umis, reporter,
                        gene_umi_counts_per_bc, strand, out_bam, paired_end):
    """ Process all readpairs from pair_iter, all having the same bc """

    # Note: "gene" in this function is actually "chain"
    # Readpair counts per UMI (per-gene); {gene: {UMI: count}}

    # Note: Using a lambda here breaks cPickle for some reason
    gene_umi_counts = defaultdict(default_dict_int)

    read_pairs_written = 0

    for header, (read1, read2) in pair_iter:
        gene1, gene2 = reporter.vdj_recombinome_bam_cb(read1, read2, bam, strand)

        umi = header.get_tag(cr_constants.RAW_UMI_TAG)
        corrected_umi = corrected_umis[umi]

        genes = list(set(filter(lambda x: x is not None, [gene1, gene2])))
        if len(genes) == 1:  # Unique mapping
            gene_umi_counts[genes[0]][corrected_umi] += 1
        else:  # Unmapped or ambiguously mapped read pairs go to "None" bucket
            gene_umi_counts["None"][corrected_umi] += 1

        gene_umi_counts[cr_constants.MULTI_REFS_PREFIX][corrected_umi] += 1

        header.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
        read1.qname = header.to_string()

        if read2 is not None:
            header2 = cr_fastq.AugmentedFastqHeader(read2.qname)
            assert (header2.get_tag(cr_constants.RAW_UMI_TAG) == umi)
            header2.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read2.qname = header2.to_string()

        reporter._get_metric_attr('vdj_corrected_umi_frac').add(
            1, filter=corrected_umis[umi] != umi)
        read_pairs_written += 1

        # Write whether this pair was filtered or not.
        out_bam.write(read1)
        if read2 is not None:
            out_bam.write(read2)

    # Report read-pairs/umi
    for gene in reporter.vdj_genes:
        tot_readpairs = 0

        for reads_per_umi in gene_umi_counts[gene].itervalues():
            reporter._get_metric_attr(
                'vdj_recombinome_readpairs_per_umi_distribution',
                gene).add(reads_per_umi)
            tot_readpairs += reads_per_umi

    gene_umi_counts_per_bc[bc] = gene_umi_counts
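
The `default_dict_int` factory used above has to be a named, module-level function because, as the comment notes, cPickle cannot serialize lambdas (presumably the per-barcode counts get pickled downstream). A minimal sketch of the assumed helper:

def default_dict_int():
    # Picklable replacement for `lambda: defaultdict(int)`.
    return defaultdict(int)
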
Example #2
def get_bc_grouped_pair_iter(bam):
    """ Yields (bc, pair_iter)
        where pair_iter yields (AugmentedFastqHeader, (read1, read2)) for the barcode """
    wrap_header = lambda pair: (cr_fastq.AugmentedFastqHeader(pair[0].qname), pair)
    get_barcode = lambda hdr_pair: hdr_pair[0].get_tag(cr_constants.PROCESSED_BARCODE_TAG)

    return itertools.groupby(itertools.imap(wrap_header, get_pair_iter(bam)),
                             key=get_barcode)
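
Note that `itertools.groupby` only groups consecutive items, so this assumes the BAM's read pairs are already ordered by the processed barcode tag. A hypothetical consumption sketch:

for bc, pair_iter in get_bc_grouped_pair_iter(in_bam):
    for header, (read1, read2) in pair_iter:
        pass  # every pair in this inner loop shares the barcode `bc`
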
Example #3
def write_barcode_fastq(bam, pair_iter, bc, corrected_umis, reporter,
                        gene_umi_counts_per_bc, strand, out_bam, out_fastq1,
                        out_fastq2):
    """ Process all readpairs from pair_iter, all having the same bc """

    # Note: "gene" in this function is actually "chain"
    # Readpair counts per UMI (per-gene); {gene: {UMI: count}}
    gene_umi_counts = defaultdict(default_dict_int)

    read_pairs_written = 0

    for header, (read1, read2) in pair_iter:
        gene1, gene2 = reporter.vdj_recombinome_bam_cb(read1, read2, bam, strand)

        if is_mapped(read1, read2):
            umi = header.get_tag(cr_constants.RAW_UMI_TAG)
            corrected_umi = corrected_umis[umi]

            # Count readpairs per UMI
            if gene1 is not None or gene2 is not None:
                genes = {g for g in (gene1, gene2, cr_constants.MULTI_REFS_PREFIX)
                         if g is not None}
                for gene in genes:
                    gene_umi_counts[gene][corrected_umi] += 1

            header.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read1.qname = header.to_string()

            header2 = cr_fastq.AugmentedFastqHeader(read2.qname)
            assert (header2.get_tag(cr_constants.RAW_UMI_TAG) == umi)
            header2.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read2.qname = header2.to_string()

            reporter._get_metric_attr('vdj_corrected_umi_frac').add(
                1, filter=corrected_umis[umi] != umi)
            read_pairs_written += 1

        if out_bam is not None:
            # Write whether this pair was filtered or not.
            out_bam.write(read1)
            out_bam.write(read2)
        elif is_mapped(read1, read2):
            write_bam_read_fastq(out_fastq1, read1)
            write_bam_read_fastq(out_fastq2, read2)

    # Report read-pairs/umi
    for gene in reporter.vdj_genes:
        for reads_per_umi in gene_umi_counts[gene].itervalues():
            reporter._get_metric_attr(
                'vdj_recombinome_readpairs_per_umi_distribution',
                gene).add(reads_per_umi)

    gene_umi_counts_per_bc[bc] = gene_umi_counts
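
The `is_mapped` helper called above is not shown in this example. A minimal sketch of its presumed semantics, assuming pysam-style reads where both mates must align:

def is_mapped(read1, read2):
    # Assumed: a pair counts as mapped only if neither mate is unmapped.
    return not (read1.is_unmapped or read2.is_unmapped)
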
Example #4
def get_bc_grouped_pair_iter(bam, paired_end):
    """ Yields (bc, pair_iter)
        where pair_iter yields (AugmentedFastqHeader, (read1, read2|None)) for the barcode """
    wrap_header = lambda pair: (cr_fastq.AugmentedFastqHeader(pair[0].qname), pair)
    get_barcode = lambda hdr_pair: hdr_pair[0].get_tag(cr_constants.PROCESSED_BARCODE_TAG)

    if paired_end:
        bam_iter = vdj_filt.get_pair_iter(bam)
    else:
        bam_iter = itertools.imap(lambda r1: (r1, None), bam)

    return itertools.groupby(itertools.imap(wrap_header, bam_iter),
                             key=get_barcode)
Example #5
def process_bam_barcode(bam, pair_iter, bc, corrected_umis, reporter,
                        gene_umi_counts_per_bc, strand, out_bam,
                        asm_min_readpairs_per_umi, paired_end):
    """ Process all readpairs from pair_iter, all having the same bc """

    # Note: "gene" in this function is actually "chain"
    # Readpair counts per UMI (per-gene); {gene: {UMI: count}}

    # Note: Using a lambda here breaks cPickle for some reason
    gene_umi_counts = defaultdict(default_dict_int)

    read_pairs_written = 0

    for header, (read1, read2) in pair_iter:
        gene1, gene2 = reporter.vdj_recombinome_bam_cb(read1, read2, bam, strand)

        umi = header.get_tag(cr_constants.RAW_UMI_TAG)
        corrected_umi = corrected_umis[umi]

        # Count readpairs per UMI
        if gene1 is not None:
            gene_umi_counts[gene1][corrected_umi] += 1
        if gene2 is not None:
            gene_umi_counts[gene2][corrected_umi] += 1
        if gene1 is None and gene2 is None:
            # Allow unmapped UMIs
            gene_umi_counts["None"][corrected_umi] += 1
        gene_umi_counts[cr_constants.MULTI_REFS_PREFIX][corrected_umi] += 1

        header.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
        read1.qname = header.to_string()

        if read2 is not None:
            header2 = cr_fastq.AugmentedFastqHeader(read2.qname)
            assert (header2.get_tag(cr_constants.RAW_UMI_TAG) == umi)
            header2.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read2.qname = header2.to_string()

        reporter._get_metric_attr('vdj_corrected_umi_frac').add(
            1, filter=corrected_umis[umi] != umi)
        read_pairs_written += 1

        # Write whether this pair was filtered or not.
        out_bam.write(read1)
        if read2 is not None:
            out_bam.write(read2)

    # Report read-pairs/umi
    for gene in reporter.vdj_genes:
        tot_readpairs = 0
        asm_bad_readpairs = 0

        for reads_per_umi in gene_umi_counts[gene].itervalues():
            reporter._get_metric_attr(
                'vdj_recombinome_readpairs_per_umi_distribution',
                gene).add(reads_per_umi)

            if reads_per_umi < asm_min_readpairs_per_umi:
                asm_bad_readpairs += reads_per_umi
            tot_readpairs += reads_per_umi

        reporter._get_metric_attr('vdj_recombinome_low_support_reads_frac',
                                  gene).set_value(asm_bad_readpairs,
                                                  tot_readpairs)

    gene_umi_counts_per_bc[bc] = gene_umi_counts
Example #6
def fastq_barcode_sort_key(fastq_read):
    """ Return barcode, qname """
    fastq_header = cr_fastq.AugmentedFastqHeader(fastq_read[0])
    bc = fastq_header.get_tag(cr_constants.PROCESSED_BARCODE_TAG)
    return bc, fastq_header.fastq_header
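
Each `fastq_read` is a (header, seq, quals) tuple, so this key sorts reads by barcode first and read name second; Example #11 below relies on it to keep each barcode's reads contiguous:

bucket.sort(key=fastq_barcode_sort_key)
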
Example #7
def get_fastq_read_barcode(fastq_read):
    return cr_fastq.AugmentedFastqHeader(fastq_read[0]).get_tag(
        cr_constants.PROCESSED_BARCODE_TAG)
Example #8
def get_consensus_quals(in_bam, clonotype_name, in_fasta, sel_contigs,
                        contig_umis, out_dir):
    """Compute base quality scores of a sequence.

    Args:
    - in_bam: bam file to get the list of reads assigned to UMIs on the selected contigs
    - clonotype_name: Used for naming output files.
    - sel_contigs: Contigs that led to the consensus sequence above
    - contig_umis: from contig name to list of umis assigned to that contig

    Return value:
    String with base qualities (in FASTQ format).
    """

    pref = re.sub('.fasta', '', os.path.basename(in_fasta))
    fastq1 = re.sub('.fasta', '_1.fastq', in_fasta)
    fastq2 = re.sub('.fasta', '_2.fastq', in_fasta)

    sel_reads = {}

    for contig in sel_contigs:
        umi_read_count = Counter()
        barcode = contig.split('_')[0]
        contig_read_count = 0

        # Wrap contig w/ str() because pysam crashes on unicode input
        for read in in_bam.fetch(str(contig)):
            # NOTE: Assembler assumes that any tags are part of the read name
            # BUT the bam that we feed to this stage has the tags stripped out
            # of the name.
            umi = read.get_tag(PROCESSED_UMI_TAG)
            if umi in contig_umis[contig] and not read.is_secondary:
                umi_read_count[umi] += 1
                if umi_read_count[umi] >= MAX_READS_PER_UMI:
                    continue

                contig_read_count += 1
                if contig_read_count >= MAX_READS_PER_CONTIG:
                    continue

                if read.qname not in sel_reads:
                    sel_reads[read.qname] = [None, None]
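                # read.is_read2 is a bool; False/True index pair slots 0/1.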
                sel_reads[read.qname][read.is_read2] = read

    with open(fastq1, 'w') as f1, open(fastq2, 'w') as f2:
        for read_name, pair in sel_reads.iteritems():
            read1, read2 = pair[0], pair[1]

            if read1 is None:
                umi = read2.get_tag(PROCESSED_UMI_TAG)
            else:
                umi = read1.get_tag(PROCESSED_UMI_TAG)

            header = cr_fastq.AugmentedFastqHeader(read_name)
            # Replace the UMI with <BC>_<UMI>.
            header.set_tag(PROCESSED_UMI_TAG, barcode + '_' + umi)
            header.set_tag(PROCESSED_BARCODE_TAG, barcode)

            if read1 is None:
                out_seq1 = ""
                out_quals1 = ""
            else:
                out_seq1 = tk_seq.get_rev_comp(read1.seq) if read1.is_reverse else read1.seq
                out_quals1 = read1.qual[::-1] if read1.is_reverse else read1.qual
            tk_fasta.write_read_fastq(f1, header.to_string(), out_seq1,
                                      out_quals1)

            if read2 is None:
                out_seq2 = ""
                out_quals2 = ""
            else:
                out_seq2 = tk_seq.get_rev_comp(read2.seq) if read2.is_reverse else read2.seq
                out_quals2 = read2.qual[::-1] if read2.is_reverse else read2.qual
            tk_fasta.write_read_fastq(f2, header.to_string(), out_seq2,
                                      out_quals2)

    assert (len(sel_reads) > 0)

    cmd = ['vdj_asm', 'base-quals', re.sub('.fasta', '', in_fasta), out_dir]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(os.path.join(out_dir, pref + '.fastq'), 'r') as f:
        lines = f.readlines()

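    # A FASTQ record spans 4 lines (name, seq, '+', quals); the assembler is
    # expected to emit exactly one record here, so line index 3 holds the
    # consensus base qualities.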
    return lines[3].strip()
Example #9
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's sequence
        and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name, out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence, or None if no consensus could be built.
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with the consensus sequence, or with the best
        contig's sequence if there weren't enough reads to build a consensus.
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name,
                                                  'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)
        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()

                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
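                # Presumably 0: in SAM flag terms a mapped, unpaired record
                # sets no flag bits (the constant's definition isn't shown).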
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq,
                                          quals)

    out_bam.close()
    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'),
                  'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir, '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'),
               out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
Example #10
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
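    # Assumed umi_summary.tsv columns (0-based): 2 = UMI sequence,
    # 5 = good-UMI flag ('true'/'false'), 6 = comma-separated contig ids.
    # The header row is skipped via the `umi == 'umi'` check below.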
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if (best_contig is None
                        or (not best_contig.productive and contig.productive)
                        or (best_contig.productive == contig.productive
                            and best_contig.umi_count < contig.umi_count)):
                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = [c.contig_name
                              for c in sorted(sel_contigs,
                                              key=lambda c: c.umi_count,
                                              reverse=True)]
                contig_ids = contig_ids[:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = ''.join(ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features={a.feature.feature_id for a in ref_annos},
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([
                consensus_id + '_contigs.fasta',
                consensus_id + '_contigs.fastq'
            ])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == len(
                    outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
Example #11
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    # Lazy load R1
    r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
    read1s = tk_fasta.read_generator_fastq(r1_file)

    # Lazy load R2
    if paired_end:
        r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
        read2s = tk_fasta.read_generator_fastq(r2_file)
    else:
        read2s = []

    # Lazy load corrected BCs
    bc_file = cr_io.open_maybe_gzip(args.bcs)
    bcs = (line.strip() for line in bc_file)

    buckets = {}

    bucket_filenames = {}

    for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        bucket_filenames[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
        # Exclude unbarcoded reads
        if barcode == '':
            continue

        # Exclude short reads
        if len(read1[1]) < MIN_READ_LENGTH or (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
            continue

        # Attach processed barcode to reads
        r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
        r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
        r1_new_qname = r1_hdr.to_string()

        if paired_end:
            r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
            r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r2_new_qname = r2_hdr.to_string()

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
        if paired_end:
            buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = bucket_filenames[bucket_name]
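
The bucketing helpers `enumerate_bucket_names` and `get_bucket_name` are not shown. A hypothetical sketch consistent with how they are called above, assuming barcodes are hashed into a fixed number of chunks per gem group:

def get_bucket_name(gem_group, barcode_seq, num_chunks):
    # Hypothetical: route a barcode to one of num_chunks buckets.
    return '%d-%d' % (gem_group, hash(barcode_seq) % num_chunks)

def enumerate_bucket_names(chunks_per_gem_group):
    # Hypothetical: yield (gem_group, bucket_name) for every bucket.
    for gem_group, num_chunks in sorted(chunks_per_gem_group.iteritems()):
        for chunk in xrange(num_chunks):
            yield gem_group, '%d-%d' % (gem_group, chunk)
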
Example #12
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    with open(args.annotations) as f:
        contigs = cPickle.load(f)
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)
    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5] == 'True'
            contig_names = fields[6].split(',')
            if good_umi:
                for c in contig_names:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])
            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in contigs:
                if contig.contig_name in sel_contigs:

                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that had more UMIs.
                    if (best_contig is None
                            or (not best_contig.productive and contig.productive)
                            or (best_contig.productive == contig.productive
                                and len(contig_umis[best_contig.contig_name]) <
                                len(contig_umis[contig.contig_name]))):
                        best_contig = contig

            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support
            ordered_contigs = sorted(sel_contigs,
                                     key=lambda x: len(contig_umis[x]),
                                     reverse=True)[:MAX_CELLS_FOR_BASE_QUALS]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta,
                                                      ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = np.sum(
                [contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum(
                [contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = ''.join(ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features={a.feature.feature_id for a in ref_annos},
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm',
                    'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                    '--global'  # use global alignment if a good seed isn't found - everything must get aligned
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
Example #13
def main(args, outs):
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        # Avoid a NameError below when no whitelist is given
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        out_file.write('%s\n' %
                       (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
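
`cr_stats.correct_bc_error` is not shown here. Conceptually it attempts single-substitution barcode correction: every one-base variant of the raw barcode is scored against the whitelist count distribution, weighted by the probability that the observed base was a sequencing error, and the best candidate is returned only if its posterior clears the confidence threshold. A simplified sketch of that idea (not the actual implementation):

def correct_bc_error_sketch(confidence_threshold, seq, qual, wl_dist):
    # wl_dist is assumed to map whitelist barcodes to observed counts.
    likelihoods = {}
    for i, (base, q) in enumerate(zip(seq, qual)):
        p_err = 10.0 ** -((ord(q) - 33) / 10.0)  # Phred+33 error probability
        for alt in 'ACGT':
            if alt == base:
                continue
            candidate = seq[:i] + alt + seq[i + 1:]
            if candidate in wl_dist:
                likelihoods[candidate] = \
                    likelihoods.get(candidate, 0.0) + wl_dist[candidate] * p_err
    total = sum(likelihoods.values())
    if total > 0:
        best = max(likelihoods, key=likelihoods.get)
        if likelihoods[best] / total >= confidence_threshold:
            return best
    return None  # no confident correction found
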
Example #14
def main(args, outs):
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        # Avoid a NameError below when no whitelist is given
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)