def join(args, outs, chunk_defs, chunk_outs):
    # Merge pickles and save JSON
    all_contigs = []
    for chunk in chunk_outs:
        if chunk.chunked_annotations is not None:
            all_contigs.extend(cPickle.load(open(chunk.chunked_annotations, 'rb')))

    # Clear this temporary, chunk-specific out
    outs.chunked_annotations = None

    # Write all contigs
    with open(outs.raw_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    # Write filtered contigs
    with open(outs.annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file,
                                            [c for c in all_contigs if c.filtered])

    # Save a BED-formatted file of a subset of annotations
    with open(outs.annotations_bed, 'w') as output_file:
        bed_lines = cr_utils.flatten_list([c.get_annotations_bed()
                                           for c in all_contigs if c.filtered])
        for bed_line in bed_lines:
            output_file.write(bed_line + '\n')

    # Write annotations CSV
    with open(outs.annotations_csv, 'w') as csv:
        vdj_annot.save_contig_list_csv(csv, all_contigs)
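# --- Illustrative sketch (not part of the stage) ---
# The BED step above relies on cr_utils.flatten_list to collapse the
# per-contig lists of BED lines into one list. A minimal stand-in with the
# assumed one-level-flattening behavior:
def _flatten_list_sketch(lists):
    # [['chr1\t0\t10'], ['chr2\t5\t20', 'chr2\t30\t40']]
    #   -> ['chr1\t0\t10', 'chr2\t5\t20', 'chr2\t30\t40']
    return [item for sublist in lists for item in sublist]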
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count,
                                -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(
            contigs, key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive, UMI, read, length)
            #    contig's CDR3, with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
                and contig.cdr3_seq != first_cdr3 \
                and (contig.umi_count == 1 or
                     (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          chain).add(1, filter=contig.high_confidence)
            reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                      cr_constants.MULTI_REFS_PREFIX).add(
                                          1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
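# --- Illustrative sketch (not part of the stage) ---
# main() above depends on sorting before itertools.groupby: groupby only
# merges *adjacent* items with equal keys, so the contigs must be pre-sorted
# by the same (barcode, chain) key. The tuples below are hypothetical
# stand-ins for contig objects; only the grouping pattern is shown.
def _groupby_pattern_sketch():
    import itertools
    records = [('bc1', 'TRB', 'CASRG'),   # (barcode, chain, cdr3_seq)
               ('bc1', 'TRA', 'CASSL'),
               ('bc1', 'TRA', 'CASSL')]
    records.sort(key=lambda r: (r[0], r[1]))
    repeats = []
    for (bc, chain), group in itertools.groupby(records, key=lambda r: (r[0], r[1])):
        seen_cdr3s = set()
        for _, _, cdr3 in group:
            if cdr3 in seen_cdr3s:
                repeats.append((bc, chain, cdr3))  # would be marked low confidence
            seen_cdr3s.add(cdr3)
    return repeats  # [('bc1', 'TRA', 'CASSL')]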
def join(args, outs, chunk_defs, chunk_outs):
    if len(chunk_outs) == 0:
        # Set all outputs to null
        for slot in outs.slots:
            setattr(outs, slot, None)
        return

    reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    final_report = cr_report.merge_reporters(reporters)
    final_report.report_summary_json(outs.summary)

    consensus_contigs = []
    ref_contigs = []
    all_bams = []
    all_ref_bams = []

    for chunk in chunk_outs:
        if chunk.consensus_annotations_json and os.path.isfile(chunk.consensus_annotations_json):
            # Collect consensus annotations
            new_contigs = vdj_annot.load_cell_contigs_from_json(
                chunk.consensus_annotations_json, args.vdj_reference_path,
                group_key='clonotype')
            for cl in new_contigs:
                consensus_contigs.extend(cl.chains)

            # Collect concat_ref annotations
            new_ref_contigs = vdj_annot.load_cell_contigs_from_json(
                chunk.concat_ref_annotations_json, args.vdj_reference_path,
                group_key='clonotype')
            for cl in new_ref_contigs:
                ref_contigs.extend(cl.chains)

            all_bams.extend(chunk.chunked_consensus_bams)
            all_ref_bams.extend(chunk.chunked_concat_ref_bams)

    if consensus_contigs:
        all_fastqs = [chunk_out.consensus_fastq for chunk_out in chunk_outs]
        cr_io.concatenate_files(outs.consensus_fastq, all_fastqs)

        all_fastas = [chunk_out.consensus_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.consensus_fasta, all_fastas)
        outs.consensus_fasta_fai = outs.consensus_fasta + '.fai'

        all_fastas = [chunk_out.concat_ref_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.concat_ref_fasta, all_fastas)
        outs.concat_ref_fasta_fai = outs.concat_ref_fasta + '.fai'

        concatenate_sort_and_index_bams(outs.consensus_bam, all_bams)
        outs.consensus_bam_bai = outs.consensus_bam + '.bai'
        concatenate_sort_and_index_bams(outs.concat_ref_bam, all_ref_bams)
        outs.concat_ref_bam_bai = outs.concat_ref_bam + '.bai'

        # Sort contigs (and clonotypes) by frequency.
        with open(args.clonotype_assignments) as f:
            clonotypes = json.load(f)
        clonotype_freqs = {cid: c['freq'] for cid, c in clonotypes.iteritems()}
        consensus_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype], reverse=True)
        ref_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype], reverse=True)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)

    with open(outs.consensus_annotations_csv, 'w') as out_file:
        vdj_annot.save_consensus_list_csv(out_file, consensus_contigs)

    with open(outs.clonotypes, 'w') as f:
        vdj_annot.save_clonotype_info_csv(f, consensus_contigs)

    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []
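# --- Illustrative sketch (not part of the stage) ---
# The frequency sort in join() above keys each annotation on the 'freq'
# field of its clonotype in the clonotype_assignments JSON. A minimal model
# of that lookup-and-sort, with hypothetical data:
def _freq_sort_sketch():
    clonotype_freqs = {'clonotype0': 3, 'clonotype1': 41}
    # Dicts stand in for annotation objects carrying a .clonotype attribute
    contigs = [{'clonotype': 'clonotype0'}, {'clonotype': 'clonotype1'}]
    contigs.sort(key=lambda x: clonotype_freqs[x['clonotype']], reverse=True)
    return contigs  # clonotype1 (freq 41) now precedes clonotype0 (freq 3)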
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()
    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)
        clo_key = '%s_clonotype_id' % args.metric_prefix
        cons_key = '%s_consensus_id' % args.metric_prefix
        for contig in contigs:
            clo_id = contig.info_dict.get(clo_key)
            cons_id = contig.info_dict.get(cons_key)
            assert clo_id in chunk_clonotypes and cons_id is not None

            consensus_to_contigs[cons_id].append(contig)
            relevant_contig_ids.add(contig.contig_name)
    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    # get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with
            # the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and
                    best_contig.umi_count < contig.umi_count):
                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr('vdj_clonotype_gt1_v_annotations_contig_frac',
                                               args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr('vdj_clonotype_gt1_j_annotations_contig_frac',
                                               args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq,
             contig_to_cons_bam, contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # Make sure the bam file has the right header
            # (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert list(tmp_bam.references) == [consensus_id]
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus,
                # so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = map(lambda c: c.contig_name,
                                 sorted(sel_contigs, key=lambda c: c.umi_count, reverse=True))
                contig_ids = contig_ids[0:MAX_CELLS_FOR_BASE_QUALS]
                consensus_quals = get_consensus_quals(in_bam, consensus_id, contig_fasta,
                                                      contig_ids, contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id, clonotype_id,
                                               consensus_seq, consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1, filter=contig.cdr3_seq is None or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id, clonotype_id,
                                                   consensus_seq, consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                # to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                # reduce the risk of discordance between the consensus and
                # concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name, clonotype_id,
                    ref_seq, 'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output
                # but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'), rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([consensus_id + '_contigs.fasta',
                      consensus_id + '_contigs.fastq'])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == len(outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' % n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' % n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()
    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
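# --- Illustrative sketch (not part of the stage) ---
# The MERGE_BAMS_EVERY logic in main() above keeps the list of per-consensus
# BAMs bounded: once it grows to the threshold, the parts are concatenated
# into one rolling merge file and the parts are deleted. merge_fn/remove_fn
# are hypothetical stand-ins for concatenate_bams/rm_files.
def _rolling_merge_sketch(bam_paths, merged_path, merge_fn, remove_fn, merge_every=10):
    if len(bam_paths) >= merge_every:
        merge_fn(merged_path, bam_paths)  # concatenate all current parts
        remove_fn(bam_paths)              # free disk space / file handles ASAP
        return [merged_path]              # continue with the single merged file
    return bam_paths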
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(args.annotations,
                                                            args.vdj_reference_path,
                                                            group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}
    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:
        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full-length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []
            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3), then add this
                # to the clonotype tuple.
                if cl_seq in sequences:
                    # This will rescue contigs that have a chain and CDR3
                    # assigned but aren't full length.
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)

        barcode_clonotype = tuple(
            sorted(list(set([sequences[s] for s in barcode_clonotype_tuple]))))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype, len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {clonotype_id: clonotype_tuple
                     for clonotype_tuple, clonotype_id in clonotypes.iteritems()}

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw', cell_barcodes,
                                                 clonotype_ids, sequence_ids,
                                                 barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file, pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file, all_contigs, write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell,
                                  all_contigs)
        vdj_annot.save_contig_list_csv(out_file, filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)
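# --- Illustrative sketch (not part of the stage) ---
# Both passes in main() above intern strings/tuples into dense integer ids
# via dict.setdefault(key, len(dict)): each distinct key gets the next id
# exactly once, and the mapping is then inverted for reverse lookup.
def _interning_sketch():
    sequences = {}
    for cdr_seq in ['TRA_CASSL', 'TRB_CASRG', 'TRA_CASSL']:
        sequences.setdefault(cdr_seq, len(sequences))
    # sequences == {'TRA_CASSL': 0, 'TRB_CASRG': 1}
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.items()}
    return sequence_ids  # {0: 'TRA_CASSL', 1: 'TRB_CASRG'}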
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    with open(args.annotations) as f:
        contigs = cPickle.load(f)

    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5] == 'True'
            contig_names = fields[6].split(',')
            if good_umi:
                for c in contig_names:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in contigs:
                if contig.contig_name in sel_contigs:
                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that had more UMIs.
                    if best_contig is None or (not best_contig.productive and contig.productive) or \
                       (best_contig.productive == contig.productive and
                        len(contig_umis[best_contig.contig_name]) < len(contig_umis[contig.contig_name])):
                        best_contig = contig

            assert best_contig is not None

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr('vdj_clonotype_gt1_v_annotations_contig_frac',
                                               args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr('vdj_clonotype_gt1_j_annotations_contig_frac',
                                               args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support
            ordered_contigs = list(sorted(sel_contigs,
                                          key=lambda x: len(contig_umis[x]),
                                          reverse=True))
            ordered_contigs = ordered_contigs[0:min(MAX_CELLS_FOR_BASE_QUALS, len(sel_contigs))]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq,
             contig_to_cons_bam, contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # Make sure the bam file has the right header
            # (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert list(tmp_bam.references) == [consensus_id]
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus,
                # so we have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id, contig_fasta,
                                                      ordered_contigs, contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = np.sum([contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum([contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id, clonotype_id,
                                               consensus_seq, consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1, filter=contig.cdr3_seq is None or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id, clonotype_id,
                                                   consensus_seq, consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                # to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                # reduce the risk of discordance between the consensus and
                # concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name, clonotype_id,
                    ref_seq, 'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output
                # but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                    '--global',  # use global alignment if a good seed isn't found - everything must get aligned
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'), rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()
    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
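# --- Illustrative sketch (not part of the stage) ---
# The umi_summary_tsv parsing above assumes this column layout: field 2 is
# the UMI (the header row has the literal 'umi' there), field 5 is a
# 'True'/'False' goodness flag, and field 6 is a comma-separated contig
# list. A self-contained version of that parse (length-checked before
# indexing, which the in-stage loop skips on well-formed input):
def _parse_umi_summary_sketch(lines):
    from collections import defaultdict
    contig_umis = defaultdict(set)
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) < 7 or fields[2] == 'umi':  # skip header / short rows
            continue
        if fields[5] == 'True':                    # keep only good UMIs
            for contig in fields[6].split(','):
                contig_umis[contig].add(fields[2])
    return contig_umis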