Example #1
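# Chunk stage: computes mapping-fraction statistics over the reads of each
# assigned contig in the indexed contig BAM.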
def main(args, outs):
    # Handle the dummy chunk
    if args.contigs is None:
        outs.chunked_reporter = None
        return

    # Calculate metrics on assigned contigs
    reporter = vdj_report.VdjReporter()

    bam = tk_bam.create_bam_infile(args.contig_bam)
    contig_names = set(args.contigs)
    contigs = [
        contig for contig in bam.header['SQ']
        if contig['SN'] in contig_names
    ]

    for contig in contigs:
        contig_name = contig['SN']

        # Fetch indexed portion of BAM.
        read_iter = bam.fetch(str(contig_name))

        reporter.contig_mapping_frac_statistics_cb(
            read_iter, contig['LN'],
            cr_chem.get_strandedness(args.chemistry_def))

    reporter.save(outs.chunked_reporter)
Example #2
def main(args, outs):
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Write compressed files
    outs.read1s += cr_constants.LZ4_SUFFIX
    outs.read2s += cr_constants.LZ4_SUFFIX

    cutadapt_out = os.path.join(os.path.dirname(outs.chunked_reporter), 'cutadapt_stdout')
    with open(cutadapt_out, 'w') as cut_stdout:
        status = run_cutadapt(args,
                              outs.read1s, outs.read2s,
                              args.chemistry_def,
                              cut_stdout)

    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
    else:
        reporter = vdj_report.VdjReporter(primers=cr_utils.get_primers_from_dicts(args.primers))
        get_vdj_trim_metrics(reporter,
                             cutadapt_out,
                             paired_end)
        reporter.save(outs.chunked_reporter)
Example #3
def get_constants_for_pipeline(pipeline):
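    """ Get the appropriate metrics/alarms/charts for a pipeline """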
    if pipeline == shared_constants.PIPELINE_VDJ:
        metrics, alarms, charts = ws_vdj_constants.METRICS, ws_vdj_constants.METRIC_ALARMS, ws_vdj_constants.CHARTS
        metric_prefixes = vdj_report.VdjReporter().get_all_prefixes()
    else:
        metrics, alarms, charts = ws_gex_constants.METRICS, ws_gex_constants.METRIC_ALARMS, ws_gex_constants.CHARTS
        metric_prefixes = cr_report.Reporter().get_all_prefixes()

    return metrics, alarms, charts, metric_prefixes
Example #4
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()

    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']
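    # Gem group (as a string) for each barcode; used below to look up the
    # per-gem-group min read-pairs-per-UMI threshold.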
    bc_gg = [str(cr_utils.split_barcode_seq(bc)[1]) for bc in barcodes]
    # Compute N50 read pairs per UMI for this gem group
    umi_read_pairs = []
    total_read_pairs = {}
    chain_bad_read_pairs = {}
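    # Iterate over umi_info rows grouped by barcode index; each row is a
    # (barcode_idx, umi_idx, chain_idx, read-pair count) tuple.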
    for bc_idx, data_iter in itertools.groupby(
            itertools.izip(umi_info['barcode_idx'], umi_info['umi_idx'],
                           umi_info['chain_idx'], umi_info['reads']),
            key=lambda x: x[0]):

        bc_umi_read_pairs = {}
        for _, umi, chain_idx, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads
            chain = chains[chain_idx]
            multi = cr_constants.MULTI_REFS_PREFIX
            total_read_pairs[chain] = total_read_pairs.get(chain, 0) + reads
            total_read_pairs[multi] = total_read_pairs.get(multi, 0) + reads
            if reads < args.min_readpairs_per_umi[bc_gg[bc_idx]]:
                chain_bad_read_pairs[chain] = \
                    chain_bad_read_pairs.get(chain, 0) + reads
                chain_bad_read_pairs[multi] = \
                    chain_bad_read_pairs.get(multi, 0) + reads

        umi_read_pairs.extend(bc_umi_read_pairs.itervalues())

    rppu_n50 = tk_stats.NX(umi_read_pairs, 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    # Report bad read-pairs/umi
    for chain in reporter.vdj_genes:
        bad_count = chain_bad_read_pairs.get(chain, 0)
        total_count = total_read_pairs.get(chain, 0)
        reporter._get_metric_attr('vdj_recombinome_low_support_reads_frac',
                                  chain).set_value(bad_count, total_count)

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
Example #5
def main(args, outs):
    status = run_cutadapt(args, outs.read1s, outs.read2s)

    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
    else:
        reporter = vdj_report.VdjReporter(
            primers=cr_utils.get_primers_from_dicts(args.primers))
        get_vdj_trim_metrics(
            reporter,
            os.path.join(os.path.dirname(outs.chunked_reporter), '..',
                         '_stdout'))
        reporter.save(outs.chunked_reporter)
Example #6
def get_constants_for_pipeline(pipeline, sample_properties):
    """ Get the appropriate metrics/alarms/charts for a pipeline """
    if pipeline == shared_constants.PIPELINE_VDJ:
        metrics, alarms, charts = ws_vdj_constants.METRICS, ws_vdj_constants.METRIC_ALARMS, ws_vdj_constants.CHARTS

        metric_prefixes = filter_vdj_prefixes(
            vdj_report.VdjReporter().get_all_prefixes(), sample_properties)

        alarms = filter_vdj_alarms(alarms, sample_properties)

    else:
        metrics, alarms, charts = ws_gex_constants.METRICS, ws_gex_constants.METRIC_ALARMS, ws_gex_constants.CHARTS

        metric_prefixes = cr_report.Reporter().get_all_prefixes()

    return metrics, alarms, charts, metric_prefixes
Example #7
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()

    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)

    # Compute N50 read pairs per UMI for this gem group
    rppu_n50 = tk_stats.NX(umi_info['reads'], 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
Example #8
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, rpu_threshold, umi_threshold, confidence = call_cell_barcodes(
            args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        reporter._get_metric_attr('vdj_filter_bcs_rpu_threshold',
                                  gem_group).set_value(rpu_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_umi_threshold',
                                  gem_group).set_value(umi_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_confidence',
                                  gem_group).set_value(confidence)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                sorted_bcs = [
                    bc for bc, _ in sorted(gg_bc_support.items(),
                                           key=lambda kv: kv[1],
                                           reverse=True)
                ]
                gg_cell_bcs = sorted_bcs[:args.force_cells]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, count in gg_bc_support.iteritems():
                bc_support[bc] += count

        # Load the extract_reads summary to get the total raw reads
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

        reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                        total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
Example #9
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    # Sort so that, within each (barcode, chain) group, productive contigs
    # with the most UMIs, reads, and length come first; the head of each
    # group supplies the "first CDR3" used below.
    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count,
                                -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(
            contigs, key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length) contig's CDR3
            #    with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
               and contig.cdr3_seq != first_cdr3 \
               and (contig.umi_count == 1 or \
                    (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          chain).add(
                                              1, filter=contig.high_confidence)
            reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                      cr_constants.MULTI_REFS_PREFIX).add(
                                          1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
Example #10
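# Chunk stage: builds a consensus sequence for each (clonotype, consensus id)
# pair, annotates it, and aligns the member contigs to the consensus and to a
# concatenated V(D)J reference.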
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0
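    # Number of merge rounds completed so far. Every MERGE_BAMS_EVERY
    # consensus BAMs, the accumulated BAMs are concatenated into one file
    # (see the merge step at the bottom of the loop) to limit the number
    # of files on disk.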

    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
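        # Column layout inferred from the usage below (0-based):
        # fields[2] = UMI sequence, fields[5] = good-UMI flag,
        # fields[6] = comma-separated contig ids; the header row repeats
        # the literal 'umi' in the UMI column.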
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and \
                    best_contig.umi_count < contig.umi_count):

                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = [
                    c.contig_name
                    for c in sorted(sel_contigs,
                                    key=lambda c: c.umi_count,
                                    reverse=True)
                ]
                contig_ids = contig_ids[:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = \
                    ref_seq_parts[-1][:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([
                consensus_id + '_contigs.fasta',
                consensus_id + '_contigs.fastq'
            ])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == len(
                    outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
Example #11
def main(args, outs):
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold',
            gem_group).set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                sorted_bcs = [
                    bc for bc, _ in sorted(gg_bc_support.items(),
                                           key=lambda kv: kv[1],
                                           reverse=True)
                ]
                gg_cell_bcs = sorted_bcs[:args.force_cells]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

        # Load the extract_reads summary to get the total raw reads
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

        # Load the assembly metrics summary to get the total assemblable reads
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

        reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                        total_read_pairs,
                                        assemblable_read_pairs,
                                        recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
Example #12
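# Chunk stage: matches reads against the V(D)J reference, corrects UMIs one
# barcode at a time in two passes over the filtered BAM, and writes
# barcode-chunked FASTQs or a barcode-chunked BAM.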
def main(args, outs):
    reporter = vdj_report.VdjReporter(
        vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)
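    # Strip the read-1 suffix to recover the FASTQ prefix shared by both
    # mates of the pair.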
    fq_prefix = re.sub(r'_1\.fastq', '', args.read1_chunk)
    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    run_read_match(fq_prefix, ref_fasta, filter_bam, args.chemistry_def,
                   args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = tk_bam.create_bam_infile(filter_bam)
    bam2 = tk_bam.create_bam_infile(filter_bam)
    bc_iter1 = get_bc_grouped_pair_iter(bam1)
    bc_iter2 = get_bc_grouped_pair_iter(bam2)

    reads_per_bc = open(outs.reads_per_bc, 'w')
    if args.output_fastqs:
        out_fastq1 = open(outs.barcode_chunked_read1, 'w')
        out_fastq2 = open(outs.barcode_chunked_read2, 'w')
        out_bam = None
    else:
        out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                               None,
                                               None,
                                               template=bam1)
        out_fastq1 = None
        out_fastq2 = None

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(
            bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            if is_mapped(read1, read2):
                umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        write_barcode_fastq(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam,
                            out_fastq1, out_fastq2)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    if args.output_fastqs:
        out_fastq1.close()
        out_fastq2.close()
    else:
        out_bam.close()

    # Write bc-gene-umi counts
    with open(outs.chunked_gene_umi_counts, 'w') as f:
        cPickle.dump(gene_umi_counts_per_bc, f)

    reporter.save(outs.chunked_reporter)
Example #13
def main(args, outs):
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        # Define explicitly to avoid a NameError in the whitelist-free case
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(tk_fasta.read_generator_fastq(in_read1_fastq), \
                                    tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
Example #14
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = collections.defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    with open(args.annotations) as f:
        for annotation in json.load(f):
            contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t',
                                     dtype={'component': int, 'num_reads': int,
                                            'num_pairs': int, 'num_umis': int,
                                            'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None, sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]
            if filter_summary is not None and not vdj_utils.is_contig_filtered(
                    filter_summary, contig_name):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # No cell-barcode JSON in bulk mode; use a single empty barcode
        barcodes = {''}


    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference)

    reporter.report_summary_json(outs.summary)
Example #15
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    with open(args.annotations) as f:
        for annotation in json.load(f):
            contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]
            if filter_summary is not None and not vdj_utils.is_contig_filtered(
                    filter_summary, contig_name):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # No cell-barcode JSON in bulk mode; use a single empty barcode
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, 'total_read_pairs')

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.

    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(
            anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    print chain_count

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [
            ct for ct, count in chain_count.iteritems()
            if tk_stats.robust_divide(count,
                                      n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)
Example #16
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}

    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:

        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either from the contig itself or from other
                # full-length contigs that had this CDR3), then add it to the
                # clonotype tuple.
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)
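        # Canonical clonotype key: the sorted, de-duplicated set of CDR3
        # sequence ids observed for this barcode.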
        barcode_clonotype = tuple(
            sorted(set(sequences[s] for s in barcode_clonotype_tuple)))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.iteritems()
    }

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw',
                                                 cell_barcodes, clonotype_ids,
                                                 sequence_ids, barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file,
                                pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file,
                                       all_contigs,
                                       write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell,
                                  all_contigs)
        vdj_annot.save_contig_list_csv(out_file,
                                       filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)
Example #17
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    with open(args.annotations) as f:
        contigs = cPickle.load(f)
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)
    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
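        # Column layout inferred from the usage below (0-based):
        # fields[2] = UMI sequence, fields[5] = good-UMI flag ('True'),
        # fields[6] = comma-separated contig names; the header row repeats
        # the literal 'umi' in the UMI column.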
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5] == 'True'
            contig_names = fields[6].split(',')
            if good_umi:
                for c in contig_names:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])
            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in contigs:
                if contig.contig_name in sel_contigs:

                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that had more UMIs.
                    if best_contig is None or (not best_contig.productive and contig.productive) or \
                       (best_contig.productive == contig.productive and \
                        len(contig_umis[best_contig.contig_name]) < len(contig_umis[contig.contig_name])):

                        best_contig = contig

            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support and cap the number used
            # for computing base qualities
            ordered_contigs = sorted(sel_contigs,
                                     key=lambda x: len(contig_umis[x]),
                                     reverse=True)
            ordered_contigs = ordered_contigs[:MAX_CELLS_FOR_BASE_QUALS]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta,
                                                      ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = np.sum(
                [contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum(
                [contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                match_end = ref_annos[-1].annotation_match_end
                ref_seq_parts[-1] = ref_seq_parts[-1][0:match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse the per-clonotype contigs FASTA path (the assembly
                # output it held is no longer needed) to hold the concat ref
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm',
                    'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                    '--global'  # use global alignment if a good seed isn't found - everything must get aligned
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
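
The concat_ref built above is just the called V(D)J(C) reference segments
glued together, with the last (constant) segment truncated at its annotated
match end. A minimal, self-contained sketch of that construction (segment
contents and the match end are invented for illustration):

# Toy stand-ins for the parts returned by get_concat_reference_sequence()
ref_seq_parts = ['GTCA' * 5, 'TTGC' * 3, 'A' * 500]  # V, J, C segments
c_match_end = 20  # hypothetical annotation_match_end of the C segment
ref_seq_parts[-1] = ref_seq_parts[-1][:c_match_end]  # trim the C-region
ref_seq = ''.join(ref_seq_parts)  # the concatenated V-J-C reference
assert len(ref_seq) == 20 + 12 + 20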
Example #18
def main(args, outs):
    # Load barcode whitelist (None when no whitelist was provided, so the
    # load_barcode_dist call below never sees an undefined name)
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    # An empty list stands in for a missing read2 file: it yields no reads
    # and is falsy for the close() check below
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Either there is no whitelist, or the barcode is already on
                # it. Disallow Ns in the no-whitelist case.
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)
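                # e.g. 'ACGTACGT' in gem group 1 becomes 'ACGTACGT-1'
                # (the usual 10x '<barcode>-<gem_group>' suffix convention)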

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        out_file.write('%s\n' %
                       (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
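
When a raw barcode misses the whitelist, cr_stats.correct_bc_error above
attempts a rescue; its actual model also weighs base qualities and the
observed barcode distribution. The toy function below (a sketch, not the
real implementation) shows only the core idea: accept a correction when
exactly one whitelist barcode lies within Hamming distance 1.

def correct_barcode_naive(raw_bc, whitelist_set):
    # Collect whitelist barcodes within Hamming distance 1 of raw_bc
    candidates = set()
    for i in range(len(raw_bc)):
        for base in 'ACGT':
            candidate = raw_bc[:i] + base + raw_bc[i + 1:]
            if candidate in whitelist_set:
                candidates.add(candidate)
    # Only accept a unique, unambiguous correction
    return candidates.pop() if len(candidates) == 1 else None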
Example #19
def main(args, outs):
    reporter = vdj_report.VdjReporter(
        vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    assert paired_end != (args.read2_chunk is None), \
        'read2_chunk must be provided exactly when the chemistry is paired-end'

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    vdj_filt.run_read_match(args.read1_chunk, args.read2_chunk, ref_fasta,
                            filter_bam, strand, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time:
    # pass 1 tallies reads per UMI so the UMIs can be corrected, and pass 2
    # rewrites the records with the corrected UMIs
    bam1 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bam2 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bc_iter1 = get_bc_grouped_pair_iter(bam1, paired_end)
    bc_iter2 = get_bc_grouped_pair_iter(bam2, paired_end)

    reads_per_bc = open(outs.reads_per_bc, 'w')
    out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                           None,
                                           None,
                                           template=bam1)

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(
            bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        process_bam_barcode(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam,
                            paired_end)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    out_bam.close()

    # Write bc-gene-umi counts, closing the file handle when done
    with open(outs.chunked_gene_umi_counts, 'wb') as pickle_file:
        cPickle.dump(gene_umi_counts_per_bc, pickle_file)

    # Copy the input barcodes
    if args.barcodes_chunk is not None:
        cr_utils.copy(args.barcodes_chunk, outs.barcodes_in_chunks)
    else:
        outs.barcodes_in_chunks = None

    reporter.save(outs.chunked_reporter)
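
correct_umis is not defined in this chunk; a common count-based scheme,
sketched below under that assumption, folds each UMI into the highest-count
UMI within Hamming distance 1 of it.

def correct_umis_naive(umi_counts):
    # Map each UMI to the most-supported UMI within Hamming distance 1
    corrected = {}
    for umi, count in umi_counts.iteritems():
        best, best_count = umi, count
        for i in range(len(umi)):
            for base in 'ACGT':
                neighbor = umi[:i] + base + umi[i + 1:]
                neighbor_count = umi_counts.get(neighbor, 0)
                if neighbor_count > best_count:
                    best, best_count = neighbor, neighbor_count
        corrected[umi] = best
    return corrected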
Example #20
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()

    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)

    # Compute initial within-barcode thresholds
    # Assumes the fraction of noise-UMI reads is < some fraction (NX)
    barcode_nx = np.zeros(len(umi_info['barcodes']), dtype=int)

    # Assume grouped by barcode
    for bc, bc_reads in itertools.groupby(itertools.izip(
            umi_info['barcode_idx'], umi_info['reads']),
                                          key=lambda x: x[0]):
        bc_reads_arr = np.fromiter((reads for _, reads in bc_reads),
                                   umi_info['reads'].dtype)
        barcode_nx[bc] = tk_stats.NX(bc_reads_arr, args.intra_barcode_nx)
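        # e.g. reads of [8, 8, 2, 2] with intra_barcode_nx = 0.5 give an N50
        # of 8, so this barcode's 2-read UMIs fall below its threshold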

    # Filter out UMIs below the within-BC threshold (in-place)
    top_in_bc = umi_info['reads'] >= barcode_nx[umi_info['barcode_idx']]

    for col in vdj_umi_info.UMI_INFO_COLS.iterkeys():
        umi_info[col] = np.compress(top_in_bc, umi_info[col])

    # Compute N50 read pairs per UMI for this gem group
    # and use it to subsample to the target N50.
    rppu_n50 = tk_stats.NX(umi_info['reads'], 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)
    if rppu_n50 == 0:
        subsample_rate = 1.0
    else:
        subsample_rate = min(1.0,
                             tk_stats.robust_divide(args.target_n50, rppu_n50))
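        # e.g. target_n50 = 8 with rppu_n50 = 32 gives subsample_rate = 0.25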

    reporter._get_metric_attr('vdj_assembly_subsample_rate',
                              args.gem_group).set_value(subsample_rate, 1.0)

    # Weighted average of subsample rates, where the weight is each gem
    # group's total read pairs on UMIs
    total_umi_reads = sum(umi_info['reads'])
    reporter._get_metric_attr('vdj_assembly_overall_subsample_rate').set_value(
        subsample_rate * total_umi_reads, total_umi_reads)

    # Find the global (per-chain) thresholds
    thresholds = {}
    chain_totals = {}

    # Sort the chains alphabetically for determinism in e.g. multi-library
    # vs single-library runs
    chain_tuples = list(enumerate(umi_info['chains']))
    sorted_chain_tuples = sorted(chain_tuples, key=lambda x: x[1])

    for chain_idx, chain in sorted_chain_tuples:
        chain_reads = umi_info['reads'][umi_info['chain_idx'] == chain_idx]
        chain_totals[chain] = chain_reads.sum()

        # Record the per-chain N50 read pairs per UMI (but don't use it)
        chain_n50 = tk_stats.NX(chain_reads, 0.5)
        if chain_n50 is None:
            chain_n50 = float('NaN')
        reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                                  chain, args.gem_group).set_value(chain_n50)

        print "Computing per-chain threshold for %s" % chain

        thresholds[chain] = vdj_stats.compute_readpairs_per_umi_threshold(
            chain_reads, subsample_rate)

        print "  %d" % thresholds[chain]

        reporter._get_metric_attr(
            'vdj_recombinome_readpairs_per_umi_threshold', chain,
            args.gem_group).set_value(thresholds[chain])

    # Take the min threshold among the chains that make up N90 of all reads
    chain_n90 = tk_stats.NX(chain_totals.values(), 0.9)

    use_chains = [
        chain for chain in thresholds.iterkeys()
        if chain_totals[chain] >= chain_n90
    ]
    use_thresholds = [thresholds[c] for c in use_chains]

    print "Using thresholds from " + str(use_chains) + ": " + str(
        use_thresholds)

    # Handle case where no chains were detected
    if len(use_chains) == 0:
        threshold = 1
    else:
        threshold = min(use_thresholds)

    outs.min_readpairs_per_umi = {args.gem_group: int(threshold)}
    outs.subsample_rate = {args.gem_group: float(subsample_rate)}

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_threshold',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(threshold)

    reporter.save(outs.chunked_reporter)
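
The thresholds above all derive from tk_stats.NX. Assuming the usual
N50-style semantics (an assumption; the implementation is not shown here),
a minimal NumPy sketch would be:

import numpy as np

def nx(values, frac):
    # Value at which the running sum over values sorted in descending
    # order first reaches frac of the total; nx(values, 0.5) is the N50
    values = np.sort(np.asarray(values))[::-1]
    if values.size == 0 or values.sum() == 0:
        return None
    csum = np.cumsum(values)
    idx = np.searchsorted(csum, frac * csum[-1])
    return values[idx]

print nx([8, 8, 2, 2], 0.5)  # -> 8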