def process_bam_barcode(bam, pair_iter, bc, corrected_umis, reporter,
                        gene_umi_counts_per_bc, strand, out_bam, paired_end):
    """Process all read pairs from pair_iter, all having the same barcode.

    Every pair is classified by ``reporter.vdj_recombinome_bam_cb``, counted
    per (chain, corrected UMI), re-tagged with its corrected UMI and written
    to ``out_bam``.

    Args:
        bam: input BAM handle; only forwarded to the reporter callback.
        pair_iter: iterable of (AugmentedFastqHeader, (read1, read2));
            read2 may be None.
        bc: barcode shared by all pairs; key used in gene_umi_counts_per_bc.
        corrected_umis: mapping from raw UMI to corrected UMI.
        reporter: provides vdj_recombinome_bam_cb, _get_metric_attr and
            vdj_genes.
        gene_umi_counts_per_bc: dict updated in place; receives
            {gene: {UMI: count}} under key ``bc``.
        strand: forwarded to the reporter callback.
        out_bam: output BAM handle; every read is written, mapped or not.
        paired_end: not used by this function; kept so callers need not change.
    """
    # Note: "gene" in this function is actually "chain"

    # Readpair counts per UMI (per-gene); {gene: {UMI: count}}
    # Note: Using a lambda here breaks cPickle for some reason
    gene_umi_counts = defaultdict(default_dict_int)

    for header, (read1, read2) in pair_iter:
        (gene1, gene2) = reporter.vdj_recombinome_bam_cb(read1, read2, bam,
                                                         strand)

        umi = header.get_tag(cr_constants.RAW_UMI_TAG)
        corrected_umi = corrected_umis[umi]

        # Distinct chains hit by the two mates (None means "no hit").
        genes = set(g for g in (gene1, gene2) if g is not None)
        if len(genes) == 1:
            # Unique mapping
            gene_umi_counts[next(iter(genes))][corrected_umi] += 1
        else:
            # Unmapped or ambiguously mapped read pairs go to "None" bucket
            gene_umi_counts["None"][corrected_umi] += 1
        # Every pair is also counted in the aggregate bucket.
        gene_umi_counts[cr_constants.MULTI_REFS_PREFIX][corrected_umi] += 1

        # Stamp the corrected UMI into the read names.
        header.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
        read1.qname = header.to_string()

        if read2 is not None:
            header2 = cr_fastq.AugmentedFastqHeader(read2.qname)
            # Both mates must carry the same raw UMI.
            assert (header2.get_tag(cr_constants.RAW_UMI_TAG) == umi)
            header2.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read2.qname = header2.to_string()

        reporter._get_metric_attr('vdj_corrected_umi_frac').add(
            1, filter=corrected_umi != umi)

        # Write whether this pair was filtered or not.
        out_bam.write(read1)
        if read2 is not None:
            out_bam.write(read2)

    # Report read-pairs/umi
    for gene in reporter.vdj_genes:
        # The metric is looked up only when a count exists, as before.
        for reads_per_umi in gene_umi_counts[gene].itervalues():
            reporter._get_metric_attr(
                'vdj_recombinome_readpairs_per_umi_distribution',
                gene).add(reads_per_umi)

    gene_umi_counts_per_bc[bc] = gene_umi_counts
def get_bc_grouped_pair_iter(bam):
    """Group the read pairs of ``bam`` by processed barcode.

    Returns an ``itertools.groupby`` iterator producing (bc, pair_iter), where
    pair_iter yields (AugmentedFastqHeader, (read1, read2)) for that barcode.
    Only consecutive pairs with equal barcodes end up in the same group.
    """

    def _attach_header(read_pair):
        # The header is parsed from the first read's name.
        return (cr_fastq.AugmentedFastqHeader(read_pair[0].qname), read_pair)

    def _barcode_of(header_and_pair):
        return header_and_pair[0].get_tag(cr_constants.PROCESSED_BARCODE_TAG)

    tagged_pairs = (_attach_header(read_pair)
                    for read_pair in get_pair_iter(bam))
    return itertools.groupby(tagged_pairs, key=_barcode_of)
def write_barcode_fastq(bam, pair_iter, bc, corrected_umis, reporter,
                        gene_umi_counts_per_bc, strand, out_bam, out_fastq1,
                        out_fastq2):
    """Process all read pairs from pair_iter, all having the same barcode.

    Mapped pairs (per ``is_mapped``) are counted per (chain, corrected UMI)
    and re-tagged with the corrected UMI. Output goes to ``out_bam`` when one
    is given (all pairs), otherwise mapped pairs go to the two FASTQ handles.

    Args:
        bam: input BAM handle; only forwarded to the reporter callback.
        pair_iter: iterable of (AugmentedFastqHeader, (read1, read2)).
            read2 is dereferenced unconditionally, so it must not be None.
        bc: barcode shared by all pairs; key used in gene_umi_counts_per_bc.
        corrected_umis: mapping from raw UMI to corrected UMI.
        reporter: provides vdj_recombinome_bam_cb, _get_metric_attr and
            vdj_genes.
        gene_umi_counts_per_bc: dict updated in place; receives
            {gene: {UMI: count}} under key ``bc``.
        strand: forwarded to the reporter callback.
        out_bam: output BAM handle, or None to write FASTQ instead.
        out_fastq1: FASTQ handle for read1 (used only if out_bam is None).
        out_fastq2: FASTQ handle for read2 (used only if out_bam is None).
    """
    # Note: "gene" in this function is actually "chain"

    # Readpair counts per UMI (per-gene); {gene: {UMI: count}}
    gene_umi_counts = defaultdict(default_dict_int)

    for header, (read1, read2) in pair_iter:
        (gene1, gene2) = reporter.vdj_recombinome_bam_cb(read1, read2, bam,
                                                         strand)

        if is_mapped(read1, read2):
            umi = header.get_tag(cr_constants.RAW_UMI_TAG)
            corrected_umi = corrected_umis[umi]

            # Count readpairs per UMI
            if gene1 is not None or gene2 is not None:
                # Each distinct chain hit, plus the aggregate bucket, gets
                # one count for this pair.
                counted = set(
                    g for g in (gene1, gene2, cr_constants.MULTI_REFS_PREFIX)
                    if g is not None)
                for gene in counted:
                    gene_umi_counts[gene][corrected_umi] += 1

            # Stamp the corrected UMI into both read names.
            header.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read1.qname = header.to_string()

            header2 = cr_fastq.AugmentedFastqHeader(read2.qname)
            # Both mates must carry the same raw UMI.
            assert (header2.get_tag(cr_constants.RAW_UMI_TAG) == umi)
            header2.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read2.qname = header2.to_string()

            reporter._get_metric_attr('vdj_corrected_umi_frac').add(
                1, filter=corrected_umi != umi)

        if out_bam is not None:
            # Write whether this pair was filtered or not.
            out_bam.write(read1)
            out_bam.write(read2)
        elif is_mapped(read1, read2):
            write_bam_read_fastq(out_fastq1, read1)
            write_bam_read_fastq(out_fastq2, read2)

    # Report read-pairs/umi
    for gene in reporter.vdj_genes:
        for reads_per_umi in gene_umi_counts[gene].itervalues():
            reporter._get_metric_attr(
                'vdj_recombinome_readpairs_per_umi_distribution',
                gene).add(reads_per_umi)

    gene_umi_counts_per_bc[bc] = gene_umi_counts
def get_bc_grouped_pair_iter(bam, paired_end):
    """Group the reads of ``bam`` by processed barcode.

    Returns an ``itertools.groupby`` iterator producing (bc, pair_iter), where
    pair_iter yields (AugmentedFastqHeader, (read1, read2|None)) for that
    barcode. For single-end input each record is paired with None. Only
    consecutive entries with equal barcodes end up in the same group.
    """

    def _attach_header(read_pair):
        # The header is parsed from the first read's name.
        return (cr_fastq.AugmentedFastqHeader(read_pair[0].qname), read_pair)

    def _barcode_of(header_and_pair):
        return header_and_pair[0].get_tag(cr_constants.PROCESSED_BARCODE_TAG)

    if not paired_end:
        read_pairs = ((single_read, None) for single_read in bam)
    else:
        read_pairs = vdj_filt.get_pair_iter(bam)

    tagged_pairs = (_attach_header(read_pair) for read_pair in read_pairs)
    return itertools.groupby(tagged_pairs, key=_barcode_of)
def process_bam_barcode(bam, pair_iter, bc, corrected_umis, reporter,
                        gene_umi_counts_per_bc, strand, out_bam,
                        asm_min_readpairs_per_umi, paired_end):
    """Process all read pairs from pair_iter, all having the same barcode.

    Every pair is classified by ``reporter.vdj_recombinome_bam_cb``, counted
    per (chain, corrected UMI), re-tagged with its corrected UMI and written
    to ``out_bam``. Per-chain support metrics are reported afterwards.

    Args:
        bam: input BAM handle; only forwarded to the reporter callback.
        pair_iter: iterable of (AugmentedFastqHeader, (read1, read2));
            read2 may be None.
        bc: barcode shared by all pairs; key used in gene_umi_counts_per_bc.
        corrected_umis: mapping from raw UMI to corrected UMI.
        reporter: provides vdj_recombinome_bam_cb, _get_metric_attr and
            vdj_genes.
        gene_umi_counts_per_bc: dict updated in place; receives
            {gene: {UMI: count}} under key ``bc``.
        strand: forwarded to the reporter callback.
        out_bam: output BAM handle; every read is written, mapped or not.
        asm_min_readpairs_per_umi: UMIs with fewer read pairs than this count
            towards the low-support fraction.
        paired_end: not used by this function; kept so callers need not change.
    """
    # Note: "gene" in this function is actually "chain"

    # Readpair counts per UMI (per-gene); {gene: {UMI: count}}
    # Note: Using a lambda here breaks cPickle for some reason
    gene_umi_counts = defaultdict(default_dict_int)

    for header, (read1, read2) in pair_iter:
        (gene1, gene2) = reporter.vdj_recombinome_bam_cb(read1, read2, bam,
                                                         strand)

        umi = header.get_tag(cr_constants.RAW_UMI_TAG)
        corrected_umi = corrected_umis[umi]

        # Count readpairs per UMI. Each mate's hit is counted separately, so
        # a pair whose mates hit the same chain adds two to that chain.
        for mate_gene in (gene1, gene2):
            if mate_gene is not None:
                gene_umi_counts[mate_gene][corrected_umi] += 1

        if gene1 is None and gene2 is None:
            # Allow unmapped UMIs
            gene_umi_counts["None"][corrected_umi] += 1

        # Every pair is also counted in the aggregate bucket.
        gene_umi_counts[cr_constants.MULTI_REFS_PREFIX][corrected_umi] += 1

        # Stamp the corrected UMI into the read names.
        header.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
        read1.qname = header.to_string()

        if read2 is not None:
            header2 = cr_fastq.AugmentedFastqHeader(read2.qname)
            # Both mates must carry the same raw UMI.
            assert (header2.get_tag(cr_constants.RAW_UMI_TAG) == umi)
            header2.set_tag(cr_constants.PROCESSED_UMI_TAG, corrected_umi)
            read2.qname = header2.to_string()

        reporter._get_metric_attr('vdj_corrected_umi_frac').add(
            1, filter=corrected_umi != umi)

        # Write whether this pair was filtered or not.
        out_bam.write(read1)
        if read2 is not None:
            out_bam.write(read2)

    # Report read-pairs/umi
    for gene in reporter.vdj_genes:
        tot_readpairs = 0
        asm_bad_readpairs = 0
        for reads_per_umi in gene_umi_counts[gene].itervalues():
            reporter._get_metric_attr(
                'vdj_recombinome_readpairs_per_umi_distribution',
                gene).add(reads_per_umi)

            # Read pairs on under-supported UMIs count as "bad".
            if reads_per_umi < asm_min_readpairs_per_umi:
                asm_bad_readpairs += reads_per_umi
            tot_readpairs += reads_per_umi

        reporter._get_metric_attr('vdj_recombinome_low_support_reads_frac',
                                  gene).set_value(asm_bad_readpairs,
                                                  tot_readpairs)

    gene_umi_counts_per_bc[bc] = gene_umi_counts
def fastq_barcode_sort_key(fastq_read):
    """Sort key for a FASTQ record: (processed barcode, header name).

    ``fastq_read[0]`` is the record's header line; the second element of the
    key is the parsed header's ``fastq_header`` attribute.
    """
    parsed = cr_fastq.AugmentedFastqHeader(fastq_read[0])
    return (parsed.get_tag(cr_constants.PROCESSED_BARCODE_TAG),
            parsed.fastq_header)
def get_fastq_read_barcode(fastq_read):
    """Return the processed-barcode tag from a FASTQ record's header.

    ``fastq_read[0]`` is the record's header line.
    """
    parsed = cr_fastq.AugmentedFastqHeader(fastq_read[0])
    barcode = parsed.get_tag(cr_constants.PROCESSED_BARCODE_TAG)
    return barcode
def get_consensus_quals(in_bam, clonotype_name, in_fasta, sel_contigs,
                        contig_umis, out_dir):
    """Compute base quality scores of a sequence.

    Reads aligned to the selected contigs are dumped to
    ``<in_fasta base>_1.fastq`` / ``_2.fastq``, ``vdj_asm base-quals`` is run
    on that prefix, and the quality line of the first record of
    ``<out_dir>/<basename>.fastq`` is returned.

    Args:
        - in_bam: bam file to get the list of reads assigned to UMIs on the
            selected contigs
        - clonotype_name: Not used by this function.
        - in_fasta: Path of the consensus FASTA; its name without the
            ``.fasta`` suffix is the prefix for the FASTQ files and the
            ``vdj_asm`` call.
        - sel_contigs: Contigs that led to the consensus sequence above
        - contig_umis: from contig name to list of umis assigned to that contig
        - out_dir: Directory passed to ``vdj_asm`` and read back from.

    Return value:
        String with base qualities (in FASTQ format).
    """
    # Strip only a literal, trailing ".fasta". The previous pattern '.fasta'
    # was unescaped and unanchored, so it also rewrote e.g. a "myfasta_run"
    # directory component of the path.
    fasta_base = re.sub(r'\.fasta$', '', in_fasta)
    pref = os.path.basename(fasta_base)
    fastq1 = fasta_base + '_1.fastq'
    fastq2 = fasta_base + '_2.fastq'

    # qname -> [read1, read2]; a slot stays None if that mate was not seen.
    sel_reads = {}

    for contig in sel_contigs:
        umi_read_count = Counter()
        barcode = contig.split('_')[0]
        contig_read_count = 0

        # Wrap contig w/ str() because pysam crashes on unicode input
        for read in in_bam.fetch(str(contig)):
            # NOTE: Assembler assumes that any tags are part of the read name
            # BUT the bam that we feed to this stage has the tags stripped out
            # of the name.
            umi = read.get_tag(PROCESSED_UMI_TAG)
            if umi not in contig_umis[contig] or read.is_secondary:
                continue

            # NOTE(review): both caps are checked after incrementing, so at
            # most MAX - 1 reads pass each one - confirm that is intended.
            umi_read_count[umi] += 1
            if umi_read_count[umi] >= MAX_READS_PER_UMI:
                continue
            contig_read_count += 1
            if contig_read_count >= MAX_READS_PER_CONTIG:
                continue

            if read.qname not in sel_reads:
                sel_reads[read.qname] = [None, None]
            # is_read2 (False/True) indexes slot 0/1.
            sel_reads[read.qname][read.is_read2] = read

    with open(fastq1, 'w') as f1, open(fastq2, 'w') as f2:
        for read_name, pair in sel_reads.iteritems():
            read1, read2 = pair[0], pair[1]

            tagged_read = read2 if read1 is None else read1
            umi = tagged_read.get_tag(PROCESSED_UMI_TAG)

            # Replace the UMI with <BC>_<UMI>.
            # NOTE(review): `barcode` is whatever the last contig in
            # sel_contigs produced, not the barcode of this read's contig -
            # confirm that is intended.
            header = cr_fastq.AugmentedFastqHeader(read_name)
            header.set_tag(PROCESSED_UMI_TAG, barcode + '_' + umi)
            header.set_tag(PROCESSED_BARCODE_TAG, barcode)

            # Emit each mate in its original orientation; a missing mate
            # becomes an empty record so the two files stay in step.
            for out_file, mate in ((f1, read1), (f2, read2)):
                if mate is None:
                    out_seq, out_quals = "", ""
                elif mate.is_reverse:
                    out_seq = tk_seq.get_rev_comp(mate.seq)
                    out_quals = mate.qual[::-1]
                else:
                    out_seq, out_quals = mate.seq, mate.qual
                tk_fasta.write_read_fastq(out_file, header.to_string(),
                                          out_seq, out_quals)

    assert (len(sel_reads) > 0)

    cmd = ['vdj_asm', 'base-quals', fasta_base, out_dir]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(os.path.join(out_dir, pref + '.fastq'), 'r') as f:
        lines = f.readlines()

    # Line 4 of the first FASTQ record is its quality string.
    return lines[3].strip()
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's sequence
        and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args; only args.contigs_fastq is read here.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name,
    out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence or None if no consensus could be
        built (fewer than MIN_CONTIGS_FOR_CONSENSUS contigs, or the assembler
        produced an empty output file).
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with consensus sequence (or the best contig's
        sequence when no consensus was built).
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name, 'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)
        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                # Remember the best contig; it is the fallback consensus.
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()

                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq, quals)

    out_bam.close()
    # The best contig must have been among the records of args.contigs_fastq.
    assert (not best_contig_seq is None)

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm', 'asm', out_bam_name, out_dir, '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0', '--min-qual=0', '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'), 'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                # Second line of the FASTA is the sequence of the first record.
                # NOTE(review): a file holding only a header line would raise
                # IndexError here - confirm the assembler never writes that.
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name, out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir,
        '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'), out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
def main(args, outs):
    """Build and annotate a consensus sequence for each consensus in this chunk.

    For every consensus of every clonotype listed in args.chunk_clonotypes,
    this calls get_consensus_seq / get_consensus_quals, annotates the result,
    falls back to the "best" contig if the CDR3 does not match, and aligns
    contigs and consensus to a concatenated reference. Writes the consensus
    FASTA/FASTQ, the concat-ref FASTA, both annotation JSONs and the reporter,
    and fills outs.chunked_consensus_bams / outs.chunked_concat_ref_bams.

    Args:
        args: stage arguments (paths, reference, thresholds).
        outs: stage outputs; mutated in place.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        # Every contig in the chunk annotations must belong to this chunk.
        assert clo_id in chunk_clonotypes and cons_id is not None
        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # NOTE(review): fields[2] is read before the length check, so a
            # line with fewer than three fields raises IndexError.
            umi = fields[2]
            # Skip the header row and short rows.
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            # Keep good UMIs that touch at least one contig of this chunk.
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    # Closed explicitly at the end of the function.
    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and \
                    best_contig.umi_count < contig.umi_count):
                    best_contig = contig

            assert best_contig is not None

            # Largest number of distinct gene names seen for any V feature type.
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Same for the J feature types.
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
            assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = map(
                    lambda c: c.contig_name,
                    sorted(sel_contigs,
                           key=lambda c: c.umi_count,
                           reverse=True))
                contig_ids = contig_ids[0:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                # No consensus was built: use the best contig as is.
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            # After the fallback the CDR3 must match the assignment.
            assert (not contig.cdr3_seq is None and contig.cdr3_seq == cdr)

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].
                                                      annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([
                consensus_id + '_contigs.fasta',
                consensus_id + '_contigs.fastq'
            ])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                # NOTE(review): this fails if any consensus so far had no
                # concatenated reference (ref_seq_parts was None), because
                # only the consensus list grows in that case.
                assert len(outs.chunked_consensus_bams) == len(
                    outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
def main(args, outs):
    """Partition barcoded reads into per-bucket FASTQ files sorted by barcode.

    Reads R1 (and R2, if args.read2s_chunk is set) in lockstep with the
    corrected barcodes in args.bcs, drops unbarcoded and short reads, stamps
    the processed barcode into each read name and appends the read to the
    bucket chosen by get_bucket_name. Each non-empty bucket is then sorted
    with vdj_utils.fastq_barcode_sort_key and written out; outs.buckets maps
    bucket name to file path for the buckets that were written.

    Args:
        args: stage arguments (read1s_chunk, read2s_chunk, bcs,
            chunks_per_gem_group). chunks_per_gem_group is replaced in place
            by a copy with int keys.
        outs: stage outputs; outs.buckets is set.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v
        for k, v in args.chunks_per_gem_group.iteritems()
    }

    paired_end = args.read2s_chunk is not None

    buckets = {}
    bucket_filenames = {}

    for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        bucket_filenames[bucket_name] = martian.make_path("%s.fastq" %
                                                          bucket_name)
        buckets[bucket_name] = []

    # The inputs are consumed lazily, so they must stay open for the whole
    # loop; the finally clause closes them (previously they were never closed).
    r1_file = None
    r2_file = None
    bc_file = None
    try:
        # Lazy load R1
        r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
        read1s = tk_fasta.read_generator_fastq(r1_file)

        # Lazy load R2
        if paired_end:
            r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
            read2s = tk_fasta.read_generator_fastq(r2_file)
        else:
            # izip_longest pads with None, so read2 is None for every record.
            read2s = []

        # Lazy load corrected BCs
        bc_file = cr_io.open_maybe_gzip(args.bcs)
        bcs = (line.strip() for line in bc_file)

        for read1, read2, barcode in itertools.izip_longest(
                read1s, read2s, bcs):
            # Exclude unbarcoded reads
            if barcode == '':
                continue

            # Exclude short reads
            if len(read1[1]) < MIN_READ_LENGTH or (
                    read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
                continue

            # Attach processed barcode to reads
            r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
            r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r1_new_qname = r1_hdr.to_string()

            if paired_end:
                r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
                r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
                r2_new_qname = r2_hdr.to_string()

            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            bucket_name = get_bucket_name(
                gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

            # R1 and R2 go into the same bucket as consecutive records.
            buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
            if paired_end:
                buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))
    finally:
        for handle in (r1_file, r2_file, bc_file):
            if handle is not None:
                handle.close()

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if not bucket:
            continue

        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = filename
def main(args, outs):
    """Build and annotate a consensus sequence for each consensus in this chunk.

    For every consensus of every clonotype listed in args.chunk_clonotypes,
    this calls get_consensus_seq / get_consensus_quals, annotates the result,
    falls back to the "best" contig if the CDR3 does not match, and aligns
    contigs and consensus to a concatenated reference. Writes the consensus
    FASTA/FASTQ, the concat-ref FASTA, both annotation JSONs and the reporter,
    and fills outs.chunked_consensus_bams / outs.chunked_concat_ref_bams.

    Args:
        args: stage arguments (paths, reference, thresholds).
        outs: stage outputs; mutated in place.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # NOTE(review): unpickling runs arbitrary code if the file is untrusted -
    # confirm args.annotations is always produced inside the pipeline.
    with open(args.annotations) as f:
        contigs = cPickle.load(f)

    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    # Per-contig lookups by name, used for the consensus totals below.
    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # NOTE(review): fields[2] is read before the length check, so a
            # line with fewer than three fields raises IndexError.
            umi = fields[2]
            # Skip the header row and short rows.
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5] == 'True'
            contig_names = fields[6].split(',')
            if good_umi:
                for c in contig_names:
                    contig_umis[c].add(umi)

    # Closed explicitly at the end of the function.
    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            sel_contigs = set(consensus['cell_contigs']
                              )  # Get the contigs that should be merged
            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            # Scans every loaded contig for each consensus.
            for contig in contigs:
                if contig.contig_name in sel_contigs:

                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that had more UMIs.
                    if best_contig is None or (not best_contig.productive and contig.productive) or \
                       (best_contig.productive == contig.productive and \
                        len(contig_umis[best_contig.contig_name]) < len(contig_umis[contig.contig_name])):
                        best_contig = contig

            assert not best_contig is None

            # Largest number of distinct gene names seen for any V feature type.
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Same for the J feature types.
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support
            ordered_contigs = list(
                sorted(sel_contigs,
                       key=lambda x: len(contig_umis[x]),
                       reverse=True))
            ordered_contigs = ordered_contigs[
                0:min(MAX_CELLS_FOR_BASE_QUALS, len(sel_contigs))]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta,
                                                      ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                # No consensus was built: use the best contig as is.
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = np.sum(
                [contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum(
                [contig_umi_counts[c] for c in sel_contigs])

            # NOTE(review): 'cell_contigs' holds a set here - confirm the
            # annotation writer can serialise it.
            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            # After the fallback the CDR3 must match the assignment.
            assert (not contig.cdr3_seq is None and contig.cdr3_seq == cdr)

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].
                                                      annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end',
                    '--global'  # use global alignment if a good seed isn't found - everything must get aligned
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
def main(args, outs):
    """Correct the barcode of every read pair in one chunk.

    For each read pair, the raw barcode and its quality string are read from
    the read1 FASTQ header.  A barcode that is not in the whitelist is passed
    to ``cr_stats.correct_bc_error``; otherwise it is accepted as-is unless it
    contains an 'N'.  One line per read pair is written to
    ``outs.corrected_bcs``: the processed barcode (with gem group), or an
    empty line when there is no usable barcode.

    Args:
        args: stage arguments. Reads ``barcode_whitelist``, ``barcode_counts``,
            ``gem_group``, ``library_type``, ``read1_chunk``, ``read2_chunk``,
            ``barcode_confidence_threshold`` and ``initial_reads``.
        outs: stage outputs. ``corrected_bcs`` gets ``h5_constants.LZ4_SUFFIX``
            appended and is then written; ``corrected_barcode_counts`` is
            handed to the barcode counter; the reporter is saved to
            ``chunked_reporter``.
    """
    # Load barcode whitelist.
    # Bind the name unconditionally: it is passed to load_barcode_dist below
    # even when no whitelist is configured, which used to raise a NameError.
    # NOTE(review): load_barcode_dist now receives None in the no-whitelist
    # case -- confirm it accepts that.
    barcode_whitelist = None
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    in_read2_fastq = None
    out_file = None
    try:
        # An empty list stands in for a missing read2 chunk.
        in_read2_fastq = cr_io.open_maybe_gzip(
            args.read2_chunk) if args.read2_chunk else []

        outs.corrected_bcs += h5_constants.LZ4_SUFFIX
        out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip_longest(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))

        # read2 is only consumed to keep the two inputs in lockstep; the
        # barcode tags are taken from the read1 header.
        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(
                cr_constants.RAW_BARCODE_QUAL_TAG)

            processed_bc = None

            if raw_bc:
                if (barcode_whitelist_set is not None
                        and raw_bc not in barcode_whitelist_set):
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                else:
                    # Disallow Ns in no-whitelist case
                    processed_bc = None if 'N' in raw_bc else raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # One line per read pair; empty when no barcode survived.
            out_file.write(
                '%s\n' % (processed_bc if processed_bc is not None else ''))
    finally:
        # Release the handles even if correction fails part-way through.
        in_read1_fastq.close()
        if in_read2_fastq:
            in_read2_fastq.close()
        if out_file is not None:
            out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Correct barcodes for one chunk and write re-tagged read1/read2 FASTQs.

    For each read pair, the raw barcode and its quality string are read from
    the read1 FASTQ header.  A barcode that is not in the whitelist is passed
    to ``cr_stats.correct_bc_error``; otherwise it is accepted as-is unless it
    contains an 'N'.  When a processed barcode is obtained it is counted,
    formatted with the gem group, and set as the processed-barcode tag on
    both read headers.  Every read pair is written out, tagged or not.

    Args:
        args: stage arguments. Reads ``barcode_whitelist``, ``barcode_counts``,
            ``gem_group``, ``read1_chunk``, ``read2_chunk``,
            ``barcode_confidence_threshold`` and ``initial_reads``.
        outs: stage outputs. Writes ``corrected_read1s`` and
            ``corrected_read2s``; ``corrected_barcode_counts`` is handed to
            the barcode counter; the reporter is saved to
            ``chunked_reporter``.
    """
    # Load barcode whitelist.
    # Bind the name unconditionally: it is passed to load_barcode_dist below
    # even when no whitelist is configured, which used to raise a NameError.
    # NOTE(review): load_barcode_dist now receives None in the no-whitelist
    # case -- confirm it accepts that.
    barcode_whitelist = None
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    # The with-statement closes all four files even if correction fails
    # part-way through.
    with open(args.read1_chunk) as in_read1_fastq, \
            open(args.read2_chunk) as in_read2_fastq, \
            open(outs.corrected_read1s, 'w') as out_read1_fastq, \
            open(outs.corrected_read2s, 'w') as out_read2_fastq:

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))

        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
            read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(
                cr_constants.RAW_BARCODE_QUAL_TAG)

            # Reset per read so a value from an earlier pair can never be
            # picked up by accident.
            processed_bc = None

            if raw_bc:
                if (barcode_whitelist_set is not None
                        and raw_bc not in barcode_whitelist_set):
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                else:
                    # Disallow Ns in no-whitelist case
                    processed_bc = None if 'N' in raw_bc else raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)
                    read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)
                    read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # Reads are written whether or not a processed barcode was set.
            tk_fasta.write_read_fastq(out_read1_fastq,
                                      read1_header.to_string(),
                                      read1[1], read1[2])
            tk_fasta.write_read_fastq(out_read2_fastq,
                                      read2_header.to_string(),
                                      read2[1], read2[2])

    bc_counter.close()

    reporter.save(outs.chunked_reporter)