def insert_adapter_at_random(out_writer, adapter, num_seqs, err_rate, allow_indel):
    """Write num_seqs random reads, roughly half of them carrying an adapter.

    Each read starts as random_seq(SEQ_LEN). With probability ~0.5 a mutated
    copy of `adapter` (see mutate(adapter, err_rate, allow_indel)) is spliced
    in at a random offset. The read is cut back to at most SEQ_LEN characters
    and written to `out_writer` as a FASTA record named after its index.
    """
    for index in xrange(num_seqs):
        read = random_seq(SEQ_LEN)

        if random.random() > 0.5:
            # Offset is drawn before the adapter is mutated, so the random
            # stream is consumed in a fixed order.
            cut = random.randint(0, SEQ_LEN - 1)
            noisy_adapter = mutate(adapter, err_rate, allow_indel)
            read = read[:cut] + noisy_adapter + read[cut:]

        # Splicing lengthens the read; keep only the first SEQ_LEN bases.
        read = read[:SEQ_LEN]

        tk_fasta.write_read_fasta(out_writer, str(index), read)
def main(args, outs):
    """Flag low-confidence contigs and write the filtered contig outputs.

    Reads the contig annotations from args.contig_annotations, sets
    `high_confidence` on every contig, and writes:
      - outs.contig_annotations: the annotations, including the new flag
      - outs.filtered_contig_fasta / outs.filtered_contig_fastq: only the
        records whose name is a cell contig and was not flagged low confidence
      - outs.summary: the reporter's summary JSON
    """
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as annot_file:
        contigs = vdj_annot.load_contig_list_from_json(annot_file, args.vdj_reference_path)

    def group_key(c):
        return (c.barcode, c.get_single_chain())

    def rank_key(c):
        # Within a (barcode, chain) group: productive first, then most UMIs,
        # most reads, longest.
        return group_key(c) + (not c.productive, -c.umi_count, -c.read_count, -len(c))

    contigs.sort(key=rank_key)

    rejected_names = set()
    cell_names = set()

    for (_, chain), members in itertools.groupby(contigs, key=group_key):
        lead_cdr3 = None
        lead_umis = None
        cdr3s_so_far = set()

        for contig in members:
            if contig.is_cell:
                cell_names.add(contig.contig_name)

            if lead_cdr3 is None:
                lead_cdr3, lead_umis = contig.cdr3_seq, contig.umi_count

            # Low confidence, case 1: a CDR3 other than the leading contig's
            # that has a single UMI or few UMIs relative to the leading contig.
            differs_from_lead = lead_cdr3 is not None and contig.cdr3_seq != lead_cdr3
            weakly_supported = differs_from_lead and (
                contig.umi_count == 1
                or (float(contig.umi_count) / lead_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # Low confidence, case 2: a CDR3 already seen in this group.
            already_seen = contig.cdr3_seq in cdr3s_so_far

            contig.high_confidence = not (weakly_supported or already_seen)
            if not contig.high_confidence:
                rejected_names.add(contig.contig_name)

            cdr3s_so_far.add(contig.cdr3_seq)

            # Per-chain metric (known chains only) followed by the multi metric.
            prefixes = [cr_constants.MULTI_REFS_PREFIX]
            if chain in vdj_constants.VDJ_GENES:
                prefixes.insert(0, chain)
            for prefix in prefixes:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac', prefix).add(
                    1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as annot_out:
        vdj_annot.save_annotation_list_json(annot_out, contigs)

    def keep(record_name):
        # High confidence and in a cell
        return record_name not in rejected_names and record_name in cell_names

    # Write filtered fasta
    with open(args.contig_fasta) as fasta_in, \
            open(outs.filtered_contig_fasta, 'w') as fasta_out:
        for record_name, bases in cr_utils.get_fasta_iter(fasta_in):
            if keep(record_name):
                tk_fasta.write_read_fasta(fasta_out, record_name, bases)

    # Write filtered fastq
    with open(args.contig_fastq) as fastq_in, \
            open(outs.filtered_contig_fastq, 'w') as fastq_out:
        for record_name, bases, quals in tk_fasta.read_generator_fastq(fastq_in):
            if keep(record_name):
                tk_fasta.write_read_fastq(fastq_out, record_name, bases, quals)

    reporter.report_summary_json(outs.summary)
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
        clonotype_name: Used to prefix output files.
        sel_contigs: Names of contigs to use for consensus building.
        best_contig: Name of the "best" contig. Its sequence and base
            qualities are looked up in args.contigs_fastq.
        out_dir: Directory used for temporary results.
        args: Stage args; only args.contigs_fastq is read here.

    Returns:
        A tuple (best_contig_seq, best_contig_quals, consensus_seq,
        out_bam_name, out_fastq_name, out_fasta_name):
          - best_contig_seq / best_contig_quals: sequence and quals of the
            best contig.
          - consensus_seq: the consensus sequence, or None if no consensus
            could be built (too few contigs, or the assembler produced no
            sequence).
          - out_bam_name: path of the BAM with alignments of the contigs to
            the consensus (or to the best contig when there is no consensus).
          - out_fastq_name: FASTQ with the selected contig sequences.
          - out_fasta_name: FASTA with the consensus (or best contig) sequence.

    Raises:
        AssertionError: if best_contig was not found among the selected
            contigs in args.contigs_fastq.
    """
    # Membership is tested once per FASTQ record, so use a set. The consensus
    # threshold below still uses the size of what the caller passed in.
    num_sel_contigs = len(sel_contigs)
    sel_contig_names = set(sel_contigs)

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    try:
        with open(args.contigs_fastq, 'r') as f, open(out_fastq_name, 'w') as out_fq:
            for (name, seq, quals) in tk_fasta.read_generator_fastq(f):
                if name not in sel_contig_names:
                    continue

                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()
                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG
                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq, quals)
    finally:
        # Close the BAM even if reading or writing a record failed.
        out_bam.close()

    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    out_seq = None
    if num_sel_contigs >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'), 'r') as contig_f:
            lines = contig_f.readlines()

        # In some rare cases (eg. input contigs have 0 quality), assembly might
        # fail. Treat an empty file, a header-only file and an empty sequence
        # line alike: no consensus.
        if len(lines) > 1:
            out_seq = lines[1].strip() or None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name, out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm',
        'base-quals',
        martian.make_path(clonotype_name + '_contigs'),
        out_dir,
        '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'), out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq,
            out_bam_name, out_fastq_name, out_fasta_name)
def main(args, outs):
    """Build and annotate one consensus sequence per consensus id in this chunk.

    For every clonotype in args.chunk_clonotypes and every consensus listed
    for it in args.clonotype_assignments, this:
      - picks a "best" contig (productive over non-productive, then more UMIs),
      - calls get_consensus_seq() and, when a consensus was built,
        get_consensus_quals() to recompute its base qualities,
      - annotates the consensus and falls back to the best contig when the
        annotated CDR3 does not match the expected one,
      - when a concatenated reference is available, annotates it and aligns
        the contigs plus consensus against it.

    Writes outs.consensus_fastq, outs.consensus_fasta, outs.concat_ref_fasta,
    outs.consensus_annotations_json, outs.concat_ref_annotations_json and
    outs.chunked_reporter, and fills outs.chunked_consensus_bams and
    outs.chunked_concat_ref_bams with BAM paths.

    If there are no clonotype assignments, or the contig BAM has no
    sequences, only the (empty) reporter is saved.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None
        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    consensus_contigs = []
    ref_contigs = []
    n_merged_bams = 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)
    try:
        # For all contigs relevant to this chunk,
        # get the assembler umi data required for base qual recalculation.
        contig_umis = _load_chunk_contig_umis(args.umi_summary_tsv, relevant_contig_ids)

        with open(outs.consensus_fastq, 'w') as consensus_fastq, \
                open(outs.consensus_fasta, 'w') as consensus_fasta, \
                open(outs.concat_ref_fasta, 'w') as ref_fasta:

            assert args.metric_prefix in reporter.vdj_clonotype_types

            # Iterate over clonotype assignments
            for clonotype_id, clonotype in clonotypes.iteritems():
                if clonotype_id not in chunk_clonotypes:
                    continue

                for consensus_id, consensus in clonotype['consensuses'].iteritems():
                    cdr = consensus['cdr3_seq']

                    # Verify that the contig annotation data are consistent with
                    # the clonotype assignment data
                    assert set(consensus['cell_contigs']) == \
                        set(c.contig_name for c in consensus_to_contigs[consensus_id])
                    sel_contigs = consensus_to_contigs[consensus_id]
                    sel_contig_ids = [c.contig_name for c in sel_contigs]

                    # Keep track of the "best" contig. This will be used in case the
                    # merging fails.
                    best_contig = None

                    # Keep track of the set of distinct annotations of the contigs to merge.
                    # Will use to report rate of discrepancies.
                    feature_annotations = defaultdict(set)

                    for contig in sel_contigs:
                        for anno in contig.annotations:
                            feature_annotations[anno.feature.region_type].add(
                                anno.feature.gene_name)

                        # Always choose a productive over a non-productive. Between
                        # contigs with the same productivity, choose the one that had more UMIs.
                        if best_contig is None or \
                                (not best_contig.productive and contig.productive) or \
                                (best_contig.productive == contig.productive and
                                 best_contig.umi_count < contig.umi_count):
                            best_contig = contig

                    assert best_contig is not None

                    anno_count = np.max(
                        [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
                    metric = reporter._get_metric_attr(
                        'vdj_clonotype_gt1_v_annotations_contig_frac', args.metric_prefix)
                    metric.add(1, filter=anno_count > 1)

                    anno_count = np.max(
                        [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
                    metric = reporter._get_metric_attr(
                        'vdj_clonotype_gt1_j_annotations_contig_frac', args.metric_prefix)
                    metric.add(1, filter=anno_count > 1)

                    wrong_cdr_metric = reporter._get_metric_attr(
                        'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

                    tmp_dir = martian.make_path(consensus_id + '_outs')
                    cr_io.mkdir(tmp_dir, allow_existing=True)

                    res = get_consensus_seq(consensus_id, sel_contig_ids,
                                            best_contig.contig_name, tmp_dir, args)
                    (best_seq, best_quals, consensus_seq,
                     contig_to_cons_bam, contig_fastq, contig_fasta) = res

                    outs.chunked_consensus_bams.append(contig_to_cons_bam)

                    # make sure the bam file has the right header (single sequence
                    # with this consensus name)
                    tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
                    try:
                        bam_refs = list(tmp_bam.references)
                    finally:
                        tmp_bam.close()
                    if bam_refs != [consensus_id]:
                        # Log some info to help us debug
                        sys.stderr.write('Unexpected BAM references %s for %s\n'
                                         % (bam_refs, consensus_id))
                    assert bam_refs == [consensus_id]

                    if consensus_seq:
                        # If this is not None, we actually built a consensus, so we
                        # have to compute the quals from scratch.
                        # Use a subset of the contigs for computing quals.
                        by_umis = sorted(sel_contigs, key=lambda c: c.umi_count, reverse=True)
                        contig_ids = [c.contig_name
                                      for c in by_umis[0:MAX_CELLS_FOR_BASE_QUALS]]

                        consensus_quals = get_consensus_quals(
                            in_bam, consensus_id, contig_fasta,
                            contig_ids, contig_umis, tmp_dir)
                    else:
                        consensus_seq = best_seq
                        consensus_quals = best_quals

                    assert len(consensus_seq) == len(consensus_quals)

                    total_read_count = sum([c.read_count for c in sel_contigs])
                    total_umi_count = sum([c.umi_count for c in sel_contigs])

                    contig_info_dict = {
                        'cells': clonotype['barcodes'],
                        'cell_contigs': sel_contig_ids,
                        'clonotype_freq': clonotype['freq'],
                        'clonotype_prop': clonotype['prop'],
                    }

                    contig = annotate_consensus_contig(
                        args.vdj_reference_path,
                        args.min_score_ratios,
                        args.min_word_sizes,
                        consensus_id, clonotype_id,
                        consensus_seq, consensus_quals,
                        read_count=total_read_count,
                        umi_count=total_umi_count,
                        info_dict=contig_info_dict,
                        primers=args.primers)

                    cdr_mismatch = contig.cdr3_seq is None or contig.cdr3_seq != cdr
                    wrong_cdr_metric.add(1, filter=cdr_mismatch)

                    if cdr_mismatch:
                        # Something went wrong. Use "best" contig as the consensus.
                        consensus_seq = best_seq
                        consensus_quals = best_quals
                        contig = annotate_consensus_contig(
                            args.vdj_reference_path,
                            args.min_score_ratios,
                            args.min_word_sizes,
                            consensus_id, clonotype_id,
                            consensus_seq, consensus_quals,
                            read_count=total_read_count,
                            umi_count=total_umi_count,
                            info_dict=contig_info_dict,
                            primers=args.primers)

                    assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

                    consensus_contigs.append(contig)

                    tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
                    tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                              consensus_seq, consensus_quals)
                    assert len(consensus_seq) == len(consensus_quals)

                    ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

                    # Align the contigs and consensus to a synthetic concatenated reference
                    if ref_seq_parts is not None:
                        # Trim the last segment down to the annotated length
                        # to avoid including the entire (500nt) C-region
                        ref_seq_parts[-1] = \
                            ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                        # Concatenate the reference VDJC segments
                        ref_seq = ''.join(ref_seq_parts)
                        ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                        # Reannotate the reference sequence.
                        # Restrict the annotation to the already-called segments to
                        # reduce the risk of discordance between the consensus and
                        # concat_ref annotations.
                        ref_contig = annotate_consensus_contig(
                            args.vdj_reference_path,
                            args.min_score_ratios,
                            args.min_word_sizes,
                            ref_name, clonotype_id,
                            ref_seq, 'I' * len(ref_seq),
                            use_features=set([a.feature.feature_id for a in ref_annos]),
                        )
                        ref_contigs.append(ref_contig)

                        # Add the consensus sequence to the input FASTQ (next to the contigs)
                        with open(contig_fastq, 'a') as contig_fq:
                            # Create a fake UMI and barcode
                            header = cr_fastq.AugmentedFastqHeader(consensus_id)
                            header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                            header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                            tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                                      consensus_seq, consensus_quals)

                        # Reuse this file (this had the assembly output but we
                        # don't need it anymore)
                        ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                        with open(ref_fasta_name, 'w') as f:
                            tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                        # Also append to the final output
                        tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                        cmd = [
                            'vdj_asm', 'base-quals',
                            martian.make_path(consensus_id + '_contigs'),
                            tmp_dir,
                            '--single-end'
                        ]
                        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                        tk_subproc.check_call(cmd, cwd=os.getcwd())

                        # Move out of tmp dir
                        rec_bam = martian.make_path(consensus_id + '_reference.bam')
                        cr_io.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                                   rec_bam)
                        outs.chunked_concat_ref_bams.append(rec_bam)

                    if os.path.isdir(tmp_dir):
                        shutil.rmtree(tmp_dir)

                    # Clean up unneeded files ASAP
                    rm_files([consensus_id + '_contigs.fasta',
                              consensus_id + '_contigs.fastq'])

                    # Merge N most recent BAM files to avoid filesystem overload
                    if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                        assert len(outs.chunked_consensus_bams) == \
                            len(outs.chunked_concat_ref_bams)

                        new_cons_bam = martian.make_path(
                            'merged-consensus-%03d.bam' % n_merged_bams)
                        concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                        rm_files(outs.chunked_consensus_bams)
                        outs.chunked_consensus_bams = [new_cons_bam]

                        new_ref_bam = martian.make_path(
                            'merged-ref-%03d.bam' % n_merged_bams)
                        concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                        rm_files(outs.chunked_concat_ref_bams)
                        outs.chunked_concat_ref_bams = [new_ref_bam]

                        n_merged_bams += 1
    finally:
        # Release the input BAM whether or not the chunk succeeded; the output
        # FASTA/FASTQ handles are closed by the with-block above.
        in_bam.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)


def _load_chunk_contig_umis(umi_summary_tsv, relevant_contig_ids):
    """Map contig id -> set of good UMIs from rows that touch a relevant contig.

    Rows are tab-separated; column 2 is the UMI, column 5 the good-UMI flag
    and column 6 a comma-separated list of contig ids. The file is streamed
    line by line (not loaded into pandas) because it can be huge.
    """
    contig_umis = defaultdict(set)
    with open(umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # Check the width before indexing so blank or truncated rows are
            # skipped instead of raising IndexError; also skip the header row.
            if len(fields) < 7 or fields[2] == 'umi':
                continue
            if fields[5].lower() != 'true':
                continue
            contig_ids = set(fields[6].split(','))
            if contig_ids & relevant_contig_ids:
                for c in contig_ids:
                    contig_umis[c].add(fields[2])
    return contig_umis
def main(args, outs):
    """Build and annotate one consensus sequence per consensus in this chunk.

    For every clonotype in args.chunk_clonotypes and every consensus listed
    for it in args.clonotype_assignments, this:
      - picks a "best" contig among the consensus' cell contigs (productive
        over non-productive, then more good UMIs),
      - calls get_consensus_seq() and, when a consensus was built,
        get_consensus_quals() to recompute its base qualities,
      - annotates the consensus and falls back to the best contig when the
        annotated CDR3 does not match the expected one,
      - when a concatenated reference is available, annotates it and aligns
        the contigs plus consensus against it.

    Writes outs.consensus_fastq, outs.consensus_fasta, outs.concat_ref_fasta,
    outs.consensus_annotations_json, outs.concat_ref_annotations_json and
    outs.chunked_reporter, and fills outs.chunked_consensus_bams and
    outs.chunked_concat_ref_bams with BAM paths.

    If there are no clonotype assignments, or the contig BAM has no
    sequences, only the (empty) reporter is saved.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # NOTE(review): unpickling executes arbitrary code; args.annotations must
    # only ever be a file produced by this pipeline.
    # Pickles are binary data, so read them in binary mode.
    with open(args.annotations, 'rb') as f:
        contigs = cPickle.load(f)

    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    consensus_contigs = []
    ref_contigs = []

    in_bam = tk_bam.create_bam_infile(args.contig_bam)
    try:
        contig_umis = _load_good_contig_umis(args.umi_summary_tsv)

        with open(outs.consensus_fastq, 'w') as consensus_fastq, \
                open(outs.consensus_fasta, 'w') as consensus_fasta, \
                open(outs.concat_ref_fasta, 'w') as ref_fasta:

            assert args.metric_prefix in reporter.vdj_clonotype_types

            # Iterate over clonotype assignments
            for clonotype_id, clonotype in clonotypes.iteritems():
                if clonotype_id not in chunk_clonotypes:
                    continue

                for consensus_id, consensus in clonotype['consensuses'].iteritems():
                    cdr = consensus['cdr3_seq']

                    # Get the contigs that should be merged
                    sel_contigs = set(consensus['cell_contigs'])

                    # Keep track of the "best" contig. This will be used in case the
                    # merging fails.
                    best_contig = None

                    # Keep track of the set of distinct annotations of the contigs to merge.
                    # Will use to report rate of discrepancies.
                    feature_annotations = defaultdict(set)

                    for contig in contigs:
                        if contig.contig_name not in sel_contigs:
                            continue

                        for anno in contig.annotations:
                            feature_annotations[anno.feature.region_type].add(
                                anno.feature.gene_name)

                        # Always choose a productive over a non-productive. Between
                        # contigs with the same productivity, choose the one that had more UMIs.
                        if best_contig is None or \
                                (not best_contig.productive and contig.productive) or \
                                (best_contig.productive == contig.productive and
                                 len(contig_umis[best_contig.contig_name]) <
                                 len(contig_umis[contig.contig_name])):
                            best_contig = contig

                    assert best_contig is not None

                    anno_count = np.max(
                        [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
                    metric = reporter._get_metric_attr(
                        'vdj_clonotype_gt1_v_annotations_contig_frac', args.metric_prefix)
                    metric.add(1, filter=anno_count > 1)

                    anno_count = np.max(
                        [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
                    metric = reporter._get_metric_attr(
                        'vdj_clonotype_gt1_j_annotations_contig_frac', args.metric_prefix)
                    metric.add(1, filter=anno_count > 1)

                    # Order contigs by decreasing UMI support and keep the top ones
                    ordered_contigs = sorted(
                        sel_contigs, key=lambda x: len(contig_umis[x]),
                        reverse=True)[0:MAX_CELLS_FOR_BASE_QUALS]

                    wrong_cdr_metric = reporter._get_metric_attr(
                        'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

                    tmp_dir = martian.make_path(consensus_id + '_outs')
                    cr_utils.mkdir(tmp_dir, allow_existing=True)

                    res = get_consensus_seq(consensus_id, sel_contigs,
                                            best_contig.contig_name, tmp_dir, args)
                    (best_seq, best_quals, consensus_seq,
                     contig_to_cons_bam, contig_fastq, contig_fasta) = res

                    outs.chunked_consensus_bams.append(contig_to_cons_bam)

                    # make sure the bam file has the right header (single sequence
                    # with this consensus name)
                    tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
                    try:
                        bam_refs = list(tmp_bam.references)
                    finally:
                        tmp_bam.close()
                    assert bam_refs == [consensus_id]

                    if consensus_seq:
                        # If this is not None, we actually built a consensus, so we
                        # have to compute the quals from scratch.
                        consensus_quals = get_consensus_quals(
                            in_bam, consensus_id, contig_fasta,
                            ordered_contigs, contig_umis, tmp_dir)
                    else:
                        consensus_seq = best_seq
                        consensus_quals = best_quals

                    assert len(consensus_seq) == len(consensus_quals)

                    total_read_count = np.sum(
                        [contig_read_counts[c] for c in sel_contigs])
                    total_umi_count = np.sum(
                        [contig_umi_counts[c] for c in sel_contigs])

                    contig_info_dict = {
                        'cells': clonotype['barcodes'],
                        'cell_contigs': sel_contigs,
                        'clonotype_freq': clonotype['freq'],
                        'clonotype_prop': clonotype['prop'],
                    }

                    contig = annotate_consensus_contig(
                        args.vdj_reference_path,
                        args.min_score_ratios,
                        args.min_word_sizes,
                        consensus_id, clonotype_id,
                        consensus_seq, consensus_quals,
                        read_count=total_read_count,
                        umi_count=total_umi_count,
                        info_dict=contig_info_dict,
                        primers=args.primers)

                    cdr_mismatch = contig.cdr3_seq is None or contig.cdr3_seq != cdr
                    wrong_cdr_metric.add(1, filter=cdr_mismatch)

                    if cdr_mismatch:
                        # Something went wrong. Use "best" contig as the consensus.
                        consensus_seq = best_seq
                        consensus_quals = best_quals
                        contig = annotate_consensus_contig(
                            args.vdj_reference_path,
                            args.min_score_ratios,
                            args.min_word_sizes,
                            consensus_id, clonotype_id,
                            consensus_seq, consensus_quals,
                            read_count=total_read_count,
                            umi_count=total_umi_count,
                            info_dict=contig_info_dict,
                            primers=args.primers)

                    assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

                    consensus_contigs.append(contig)

                    tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
                    tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                              consensus_seq, consensus_quals)
                    assert len(consensus_seq) == len(consensus_quals)

                    ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

                    # Align the contigs and consensus to a synthetic concatenated reference
                    if ref_seq_parts is not None:
                        # Trim the last segment down to the annotated length
                        # to avoid including the entire (500nt) C-region
                        ref_seq_parts[-1] = \
                            ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                        # Concatenate the reference VDJC segments
                        ref_seq = ''.join(ref_seq_parts)
                        ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                        # Reannotate the reference sequence.
                        # Restrict the annotation to the already-called segments to
                        # reduce the risk of discordance between the consensus and
                        # concat_ref annotations.
                        ref_contig = annotate_consensus_contig(
                            args.vdj_reference_path,
                            args.min_score_ratios,
                            args.min_word_sizes,
                            ref_name, clonotype_id,
                            ref_seq, 'I' * len(ref_seq),
                            use_features=set([a.feature.feature_id for a in ref_annos]),
                        )
                        ref_contigs.append(ref_contig)

                        # Add the consensus sequence to the input FASTQ (next to the contigs)
                        with open(contig_fastq, 'a') as contig_fq:
                            # Create a fake UMI and barcode
                            header = cr_fastq.AugmentedFastqHeader(consensus_id)
                            header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                            header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                            tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                                      consensus_seq, consensus_quals)

                        # Reuse this file (this had the assembly output but we
                        # don't need it anymore)
                        ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                        with open(ref_fasta_name, 'w') as f:
                            tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                        # Also append to the final output
                        tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                        cmd = [
                            'vdj_asm', 'base-quals',
                            martian.make_path(consensus_id + '_contigs'),
                            tmp_dir,
                            '--single-end',
                            # use global alignment if a good seed isn't found -
                            # everything must get aligned
                            '--global'
                        ]
                        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                        subprocess.check_call(cmd, cwd=os.getcwd())

                        # Move out of tmp dir
                        rec_bam = martian.make_path(consensus_id + '_reference.bam')
                        cr_utils.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                                      rec_bam)
                        outs.chunked_concat_ref_bams.append(rec_bam)

                    if os.path.isdir(tmp_dir):
                        shutil.rmtree(tmp_dir)
    finally:
        # Release the input BAM whether or not the chunk succeeded; the output
        # FASTA/FASTQ handles are closed by the with-block above.
        in_bam.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)


def _load_good_contig_umis(umi_summary_tsv):
    """Map contig name -> set of good UMIs listed in the UMI summary TSV.

    Rows are tab-separated; column 2 is the UMI, column 5 the good-UMI flag
    and column 6 a comma-separated list of contig names. The file is streamed
    line by line (not loaded into pandas) because it can be huge.
    """
    contig_umis = defaultdict(set)
    with open(umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # Check the width before indexing so blank or truncated rows are
            # skipped instead of raising IndexError; also skip the header row.
            if len(fields) < 7 or fields[2] == 'umi':
                continue
            # Accept 'True' as well as 'true' for the flag.
            if fields[5].lower() != 'true':
                continue
            for c in fields[6].split(','):
                contig_umis[c].add(fields[2])
    return contig_umis