Пример #1
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk pickled contig annotations and write combined outputs.

    Every chunk that produced a `chunked_annotations` pickle is loaded and the
    resulting contig lists are concatenated in chunk order. The merged list is
    then written as:

      * JSON of all contigs           -> outs.raw_annotations
      * JSON of `filtered` contigs    -> outs.annotations
      * BED lines of `filtered` ones  -> outs.annotations_bed
      * CSV of all contigs            -> outs.annotations_csv

    Args:
        args: Stage arguments (not used by this function).
        outs: Stage outputs; supplies the output paths listed above. Its
            `chunked_annotations` slot is reset to None.
        chunk_defs: Chunk definitions (not used by this function).
        chunk_outs: Per-chunk outputs; each has a `chunked_annotations`
            attribute that is either a pickle path or None.
    """
    # Merge pickles and save JSON
    all_contigs = []

    for chunk in chunk_outs:
        if chunk.chunked_annotations is not None:
            # Use a context manager so each pickle is closed right after it is
            # read instead of leaking one open handle per chunk.
            with open(chunk.chunked_annotations, 'rb') as pickle_file:
                all_contigs.extend(cPickle.load(pickle_file))

    # Clear this temporary, chunk-specific out
    outs.chunked_annotations = None

    # Write all contigs
    with open(outs.raw_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    # The filtered subset feeds both the filtered JSON and the BED output.
    filtered_contigs = [c for c in all_contigs if c.filtered]

    # Write filtered contigs
    with open(outs.annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, filtered_contigs)

    # Save a BED formatted file of a subset of annotations
    with open(outs.annotations_bed, 'w') as output_file:
        bed_lines = cr_utils.flatten_list(
            [c.get_annotations_bed() for c in filtered_contigs])

        for bed_line in bed_lines:
            output_file.write(bed_line + '\n')

    # Write annotations CSV
    with open(outs.annotations_csv, 'w') as csv:
        vdj_annot.save_contig_list_csv(csv, all_contigs)
Пример #2
0
def main(args, outs):
    """Flag low-confidence contigs and write the filtered contig FASTA/FASTQ.

    Contigs are loaded from args.contig_annotations, ordered per
    (barcode, chain) with productive / high-UMI / high-read / long contigs
    first, and each one gets a `high_confidence` flag. A contig is marked low
    confidence when either:

      1) its CDR3 differs from the CDR3 of the leading contig of its
         (barcode, chain) group and it has a single UMI or a UMI count below
         EXTRA_CONTIG_MIN_UMI_RATIO times that of the leading contig, or
      2) its CDR3 was already seen earlier in the same group.

    The annotated contigs are written to outs.contig_annotations; sequences
    that are both high confidence and in cells are copied to
    outs.filtered_contig_fasta / outs.filtered_contig_fastq; the metric
    summary goes to outs.summary.
    """
    def barcode_and_chain(c):
        # Grouping key: one group per chain per barcode.
        return (c.barcode, c.get_single_chain())

    def ranking(c):
        # Within a group: productive first, then most UMIs, reads, length.
        return barcode_and_chain(c) + (
            not c.productive, -c.umi_count, -c.read_count, -len(c))

    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as annotations_file:
        contigs = vdj_annot.load_contig_list_from_json(
            annotations_file, args.vdj_reference_path)

    contigs.sort(key=ranking)

    rejected_names = set()
    cell_names = set()
    metric_name = 'vdj_high_conf_prod_contig_frac'

    for (_, chain), members in itertools.groupby(contigs,
                                                 key=barcode_and_chain):
        lead_cdr3 = None
        lead_umis = None
        observed_cdr3s = set()

        for contig in members:
            contig.high_confidence = True

            if contig.is_cell:
                cell_names.add(contig.contig_name)

            # The first contig of the group that has a CDR3 is the yardstick.
            if lead_cdr3 is None:
                lead_cdr3 = contig.cdr3_seq
                lead_umis = contig.umi_count

            # Rule 1: an additional CDR3 with weak UMI support.
            is_extraneous = False
            if lead_cdr3 is not None and contig.cdr3_seq != lead_cdr3:
                is_extraneous = (
                    contig.umi_count == 1
                    or (float(contig.umi_count) / lead_umis) <
                    EXTRA_CONTIG_MIN_UMI_RATIO)

            # Rule 2: a CDR3 that already appeared in this group.
            is_repeat = contig.cdr3_seq in observed_cdr3s

            if is_extraneous or is_repeat:
                contig.high_confidence = False
                rejected_names.add(contig.contig_name)

            observed_cdr3s.add(contig.cdr3_seq)

            # Report per chain (when it is a known VDJ gene) and overall.
            prefixes = [cr_constants.MULTI_REFS_PREFIX]
            if chain in vdj_constants.VDJ_GENES:
                prefixes.insert(0, chain)
            for prefix in prefixes:
                reporter._get_metric_attr(metric_name, prefix).add(
                    1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as annotations_out:
        vdj_annot.save_annotation_list_json(annotations_out, contigs)

    def is_kept(name):
        # Keep contigs that are high confidence & in cells
        return name not in rejected_names and name in cell_names

    # Write filtered fasta
    with open(args.contig_fasta) as fasta_in:
        with open(outs.filtered_contig_fasta, 'w') as fasta_out:
            for hdr, seq in cr_utils.get_fasta_iter(fasta_in):
                if is_kept(hdr):
                    tk_fasta.write_read_fasta(fasta_out, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as fastq_in:
        with open(outs.filtered_contig_fastq, 'w') as fastq_out:
            for name, seq, qual in tk_fasta.read_generator_fastq(fastq_in):
                if is_kept(name):
                    tk_fasta.write_read_fastq(fastq_out, name, seq, qual)

    reporter.report_summary_json(outs.summary)
Пример #3
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk consensus results into the final stage outputs.

    With no chunks, every output slot is set to None and nothing is written.
    Otherwise the chunk reporters are merged into outs.summary, the consensus
    and concat-ref annotations of every chunk that wrote a consensus
    annotation JSON are collected, and (when any consensus contig exists) the
    per-chunk FASTQ/FASTA/BAM files are concatenated and indexed. Contigs are
    ordered by descending clonotype frequency before the annotation JSON/CSV
    and clonotype CSV files are written.

    Args:
        args: Stage arguments; `vdj_reference_path` and
            `clonotype_assignments` are read.
        outs: Stage outputs; receives all output paths and index paths.
        chunk_defs: Chunk definitions (not used by this function).
        chunk_outs: Per-chunk outputs produced by the chunk `main`.
    """
    if not chunk_outs:
        # Set all outputs to null
        for slot in outs.slots:
            setattr(outs, slot, None)
        return

    reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    final_report = cr_report.merge_reporters(reporters)
    final_report.report_summary_json(outs.summary)

    consensus_contigs = []
    ref_contigs = []
    all_bams = []
    all_ref_bams = []

    for chunk in chunk_outs:
        if chunk.consensus_annotations_json and os.path.isfile(
                chunk.consensus_annotations_json):
            # Collect consensus annotations
            new_contigs = vdj_annot.load_cell_contigs_from_json(
                chunk.consensus_annotations_json,
                args.vdj_reference_path,
                group_key='clonotype')
            for cl in new_contigs:
                consensus_contigs.extend(cl.chains)

            # Collect concat_ref annotations
            new_ref_contigs = vdj_annot.load_cell_contigs_from_json(
                chunk.concat_ref_annotations_json,
                args.vdj_reference_path,
                group_key='clonotype')
            for cl in new_ref_contigs:
                ref_contigs.extend(cl.chains)

            all_bams.extend(chunk.chunked_consensus_bams)
            all_ref_bams.extend(chunk.chunked_concat_ref_bams)

    if consensus_contigs:
        all_fastqs = [chunk_out.consensus_fastq for chunk_out in chunk_outs]
        cr_io.concatenate_files(outs.consensus_fastq, all_fastqs)

        all_fastas = [chunk_out.consensus_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.consensus_fasta, all_fastas)
        outs.consensus_fasta_fai = outs.consensus_fasta + '.fai'

        all_fastas = [chunk_out.concat_ref_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.concat_ref_fasta, all_fastas)
        outs.concat_ref_fasta_fai = outs.concat_ref_fasta + '.fai'

        concatenate_sort_and_index_bams(outs.consensus_bam, all_bams)
        outs.consensus_bam_bai = outs.consensus_bam + '.bai'
        concatenate_sort_and_index_bams(outs.concat_ref_bam, all_ref_bams)
        outs.concat_ref_bam_bai = outs.concat_ref_bam + '.bai'

    # Sort contigs (and clonotypes) by frequency.
    # The frequency table is loaded whenever either list has something to
    # sort, so the sort key can never reference an unbound name (previously it
    # was only bound when consensus_contigs was non-empty).
    if consensus_contigs or ref_contigs:
        with open(args.clonotype_assignments) as f:
            clonotypes = json.load(f)
        clonotype_freqs = {cid: c['freq'] for cid, c in clonotypes.iteritems()}

        def clonotype_frequency(contig):
            return clonotype_freqs[contig.clonotype]

        consensus_contigs.sort(key=clonotype_frequency, reverse=True)
        ref_contigs.sort(key=clonotype_frequency, reverse=True)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)

    with open(outs.consensus_annotations_csv, 'w') as out_file:
        vdj_annot.save_consensus_list_csv(out_file, consensus_contigs)

    with open(outs.clonotypes, 'w') as f:
        vdj_annot.save_clonotype_info_csv(f, consensus_contigs)

    # The per-chunk BAM lists are intermediate; clear them on the joined outs.
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []
Пример #4
0
def main(args, outs):
    """Build and annotate a consensus sequence for each consensus in this chunk.

    For every clonotype listed in args.chunk_clonotypes and every consensus of
    that clonotype, the member contigs are handed to get_consensus_seq, the
    resulting sequence is annotated, and the "best" member contig is used
    instead when the annotated CDR3 is missing or differs from the expected
    one. When the annotated consensus yields a concatenated reference
    sequence, that reference is annotated too and the external
    `vdj_asm base-quals` command is run to produce a reference BAM.

    Writes outs.consensus_fastq, outs.consensus_fasta, outs.concat_ref_fasta,
    outs.chunked_reporter, outs.consensus_annotations_json and
    outs.concat_ref_annotations_json, and fills the outs.chunked_consensus_bams
    / outs.chunked_concat_ref_bams lists.

    Returns early, saving only an empty reporter, when there are no clonotype
    assignments or vdj_utils.bam_has_seqs(args.contig_bam) is false.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    # Keys under which each contig's info_dict stores its clonotype and
    # consensus ids for this metric prefix.
    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        # Every contig in the chunk annotations must belong to one of this
        # chunk's clonotypes and carry a consensus id.
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    # Counter used to give each intermediate merged BAM a unique file name.
    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # NOTE(review): fields[2] is read before the length check below,
            # so a row with fewer than three fields raises IndexError.
            umi = fields[2]
            # Skip the header row and rows without a contig column.
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            # Once a good UMI touches any relevant contig, it is recorded for
            # every contig listed on that row.
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    # NOTE(review): these handles are closed only at the end of the function,
    # so they stay open if anything in the loop below raises.
    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            # CDR3 expected for this consensus; the annotated consensus must
            # reproduce it (checked further down).
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and \
                    best_contig.umi_count < contig.umi_count):

                    best_contig = contig

            assert best_contig is not None

            # Record whether the member contigs disagree on the V gene, i.e.
            # more than one distinct gene name for some V feature type.
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Same check for the J feature types.
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            # Per-consensus scratch directory; removed again further down.
            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = map(
                    lambda c: c.contig_name,
                    sorted(sel_contigs,
                           key=lambda c: c.umi_count,
                           reverse=True))
                # Only the MAX_CELLS_FOR_BASE_QUALS contigs with the most UMIs
                # are used.
                contig_ids = contig_ids[0:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                # No consensus was built: fall back to the best contig.
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            # The consensus carries the summed support of its member contigs.
            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            # Count consensuses whose annotated CDR3 is missing or differs
            # from the expected one.
            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            # After the fallback the CDR3 has to match; anything else aborts
            # the stage.
            assert (not contig.cdr3_seq is None and contig.cdr3_seq == cdr)

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].
                                                      annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                # The reference has no real base qualities, so a constant 'I'
                # quality string of matching length is supplied.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            # NOTE(review): these are bare file names, while the files above
            # were created through martian.make_path(...); presumably both
            # resolve to the same location - confirm.
            rm_files([
                consensus_id + '_contigs.fasta',
                consensus_id + '_contigs.fastq'
            ])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                # NOTE(review): a reference BAM is only appended when
                # ref_seq_parts is not None, so this equality appears to
                # require that every consensus so far produced one - confirm.
                assert len(outs.chunked_consensus_bams) == len(
                    outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
Пример #5
0
def main(args, outs):
    """Assign a raw clonotype to every barcode and write labelled annotations.

    A clonotype is identified by the sorted tuple of numeric ids of the CDR3
    sequences observed for a barcode. A first pass over the per-barcode contig
    groups numbers the CDR3 sequences found in full-length, high-confidence
    (and, unless args.use_non_productive is set, productive) contigs. A second
    pass builds each barcode's clonotype; with args.use_non_full_len it accepts
    any contig whose clonotype sequence was numbered in the first pass.

    Writes the clonotype assignments JSON, the contig annotations labelled via
    vdj_annot.label_contigs_with_consensus as JSON, CSV and pickle, a CSV
    restricted to high-confidence contigs in cells, and the metric summary.
    """
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    def strict_clonotype_tuple(contig_list):
        # Tuple of sequences like "TRA_<cdr seq>" from full-length,
        # high-confidence contigs. A falsy result (the first pass already
        # guarded against one) is normalised to an empty tuple so that both
        # passes can iterate it safely.
        return contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True) or ()

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}

    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:
        # Give unique numerical ids to the CDR3 sequences
        for cdr_seq in strict_clonotype_tuple(contig_list):
            sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3, then add this
                # to the clonotype tuple).
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = strict_clonotype_tuple(contig_list)

        # Deduplicate and sort the ids so the key is order-independent.
        barcode_clonotype = tuple(
            sorted({sequences[s] for s in barcode_clonotype_tuple}))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.iteritems()
    }

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw',
                                                 cell_barcodes, clonotype_ids,
                                                 sequence_ids, barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file,
                                pretty=True)

    # Add clonotype assignments to contig annotations
    # Drop the grouped view before loading the flat contig list again.
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file,
                                       all_contigs,
                                       write_inferred=False)

    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode; text mode can corrupt it on platforms that translate
    # newlines.
    with open(outs.contig_annotations_pickle, 'wb') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = [
            c for c in all_contigs if c.high_confidence and c.is_cell
        ]
        vdj_annot.save_contig_list_csv(out_file,
                                       filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)
Пример #6
0
def _load_good_contig_umis(umi_summary_tsv):
    """Map each contig name to the set of good UMIs assigned to it.

    The file is streamed line by line rather than read into a pandas object
    because it can be huge.

    Args:
        umi_summary_tsv: path to a tab-separated file in which (0-based)
            column 2 holds the UMI, column 5 is the string 'True' for good
            UMIs and column 6 is a comma-separated list of contig names.

    Returns:
        A defaultdict(set) keyed by contig name. Contigs without any good
        UMI have no entry until they are first looked up.
    """
    contig_umis = defaultdict(set)
    with open(umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # Check the field count *before* indexing so that blank or
            # truncated lines are skipped instead of raising IndexError.
            # The header row is recognised by the literal 'umi' column name.
            if len(fields) < 7 or fields[2] == 'umi':
                continue
            if fields[5] != 'True':
                continue
            umi = fields[2]
            for contig_name in fields[6].split(','):
                contig_umis[contig_name].add(umi)
    return contig_umis


def _select_best_contig(contigs, sel_contigs, contig_umis):
    """Pick the fallback contig and collect the gene calls of a consensus.

    Args:
        contigs: all contig annotation objects; they are scanned in order, so
            ties are resolved in favour of the earliest contig.
        sel_contigs: set of names of the contigs that belong to the consensus.
        contig_umis: mapping of contig name -> set of good UMIs.

    Returns:
        A tuple (best_contig, feature_annotations). best_contig is None when
        no contig in `contigs` is named in `sel_contigs`. feature_annotations
        maps a region type to the set of distinct gene names annotated on the
        selected contigs; it is used to report the rate of discrepancies.
    """
    best_contig = None
    feature_annotations = defaultdict(set)

    for candidate in contigs:
        if candidate.contig_name not in sel_contigs:
            continue

        for anno in candidate.annotations:
            feature_annotations[anno.feature.region_type].add(
                anno.feature.gene_name)

        # Always choose a productive over a non-productive. Between contigs
        # with the same productivity, choose the one that had more UMIs.
        if best_contig is None:
            best_contig = candidate
        elif not best_contig.productive and candidate.productive:
            best_contig = candidate
        elif (best_contig.productive == candidate.productive and
              len(contig_umis[best_contig.contig_name]) <
              len(contig_umis[candidate.contig_name])):
            best_contig = candidate

    return best_contig, feature_annotations


def _assemble_chunk_consensuses(args, outs, reporter, contigs, clonotypes,
                                chunk_clonotypes, in_bam, contig_umis,
                                consensus_fastq, consensus_fasta, ref_fasta):
    """Build, annotate and write the consensus of every clonotype in a chunk.

    The caller owns `in_bam` and the three open output handles; this helper
    only reads from / writes to them.

    Args:
        args: stage arguments (reference path, annotation thresholds,
            primers, metric prefix).
        outs: stage outputs; BAM paths are appended to
            outs.chunked_consensus_bams and outs.chunked_concat_ref_bams.
        reporter: reporter whose clonotype metrics are updated.
        contigs: list of contig annotation objects.
        clonotypes: dict of clonotype id -> clonotype record loaded from the
            clonotype assignments JSON.
        chunk_clonotypes: set of clonotype ids handled by this chunk.
        in_bam: open contig BAM used to compute consensus base qualities.
        contig_umis: mapping of contig name -> set of good UMIs.
        consensus_fastq, consensus_fasta, ref_fasta: writable file handles.

    Returns:
        A tuple (consensus_contigs, ref_contigs) of annotated contigs.

    Raises:
        AssertionError: if one of the internal invariants does not hold.
        subprocess.CalledProcessError: if `vdj_asm base-quals` fails.
    """
    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])

            # The "best" contig is used in case the merging fails.
            best_contig, feature_annotations = _select_best_contig(
                contigs, sel_contigs, contig_umis)
            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support and keep at most
            # MAX_CELLS_FOR_BASE_QUALS of them.
            ordered_contigs = sorted(
                sel_contigs,
                key=lambda x: len(contig_umis[x]),
                reverse=True)[:MAX_CELLS_FOR_BASE_QUALS]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence
            # with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            try:
                assert list(tmp_bam.references) == [consensus_id]
            finally:
                tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we
                # have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta,
                                                      ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = np.sum(
                [contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum(
                [contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            # Keyword arguments shared by the annotation of the merged
            # consensus and of the fallback "best" contig.
            annotate_kwargs = {
                'read_count': total_read_count,
                'umi_count': total_umi_count,
                'info_dict': contig_info_dict,
                'primers': args.primers,
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               **annotate_kwargs)

            cdr_mismatch = contig.cdr3_seq is None or contig.cdr3_seq != cdr
            wrong_cdr_metric.add(1, filter=cdr_mismatch)

            if cdr_mismatch:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   **annotate_kwargs)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated
            # reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                last_end = ref_annos[-1].annotation_match_end
                ref_seq_parts[-1] = ref_seq_parts[-1][0:last_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the
                # contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't
                # need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as single_ref_fasta:
                    tk_fasta.write_read_fasta(single_ref_fasta, ref_name,
                                              ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm',
                    'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                    # use global alignment if a good seed isn't found -
                    # everything must get aligned
                    '--global'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    return consensus_contigs, ref_contigs


def main(args, outs):
    """Assemble and annotate the consensus sequences of one chunk.

    For every consensus of every clonotype listed in args.chunk_clonotypes
    the member contigs are merged, base qualities are computed, the result is
    annotated and, when a concatenated reference can be built, contigs and
    consensus are aligned against it with `vdj_asm base-quals`.

    Outputs written: outs.consensus_fastq, outs.consensus_fasta,
    outs.concat_ref_fasta, outs.chunked_reporter,
    outs.consensus_annotations_json and outs.concat_ref_annotations_json.
    outs.chunked_consensus_bams and outs.chunked_concat_ref_bams are set to
    lists of BAM paths.

    If there are no clonotype assignments, or the contig BAM has no
    sequences, only an empty reporter is saved and the BAM lists stay empty.

    Args:
        args: stage arguments.
        outs: stage outputs.

    Raises:
        AssertionError: if an internal invariant does not hold.
        subprocess.CalledProcessError: if `vdj_asm base-quals` fails.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Pickles are binary data, so read them through a binary-mode handle.
    # NOTE(review): unpickling executes arbitrary code; this is only safe as
    # long as the file is produced by this pipeline (presumably an upstream
    # stage) and never comes from untrusted input.
    with open(args.annotations, 'rb') as f:
        contigs = cPickle.load(f)
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    in_bam = tk_bam.create_bam_infile(args.contig_bam)
    try:
        contig_umis = _load_good_contig_umis(args.umi_summary_tsv)

        # The context managers guarantee the outputs are flushed and closed
        # even when an assertion or the external aligner fails.
        with open(outs.consensus_fastq, 'w') as consensus_fastq, \
                open(outs.consensus_fasta, 'w') as consensus_fasta, \
                open(outs.concat_ref_fasta, 'w') as ref_fasta:
            consensus_contigs, ref_contigs = _assemble_chunk_consensuses(
                args, outs, reporter, contigs, clonotypes, chunk_clonotypes,
                in_bam, contig_umis, consensus_fastq, consensus_fasta,
                ref_fasta)
    finally:
        in_bam.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)