def evaluate(self, true_line, inf_line, simglfo=None):
    """
    Tally the differences between the true annotation <true_line> and the inferred annotation <inf_line>.

    Only single-sequence lines are handled: an exception is raised if <inf_line> has more than one unique id.
    If either line has shm indels, the difference in net indel length is recorded, and if the two net lengths
    differ we print a message and skip everything else.
    <simglfo> is not used by the live code (it only appears in the commented-out block below).
    """
    if len(inf_line['unique_ids']) > 1:
        raise Exception('mutli-seq lines not yet handled')
    iseq = 0  # single-sequence lines only, so it's always the first sequence

    def addval(col, simval, infval):
        """ Increment the counter in <self.values[col]> for the difference (inferred - true). """
        if col[2:] == '_insertion':  # stored as the actual inserted bases
            simval, infval = len(simval), len(infval)
        diff = infval - simval
        counts = self.values[col]
        counts[diff] = counts.get(diff, 0) + 1

    true_indelfo = true_line['indelfos'][iseq]
    inf_indelfo = inf_line['indelfos'][iseq]
    if indelutils.has_indels(true_indelfo) or indelutils.has_indels(inf_indelfo):
        simlen = indelutils.net_length(true_indelfo)
        inflen = indelutils.net_length(inf_indelfo)
        addval('shm_indel_length', simlen, inflen)
        if simlen != inflen:  # this is probably because the simulated shm indel was within the cdr3, so we attempt to fix it by switching the sim line to non-reversed
            # note that you can't really evaluate the rest of the performance vars in a particularly meaningful when the indel info is different (like I tried to do below) since you have to decide how to assign the indel'd bases (like, is it correct to assign the indel'd bases to a deletion? or to an insertion? or to the j?)
            print(' %s true and inferred shm net indel lengths different, so skipping rest of performance evaluation' % ' '.join(inf_line['unique_ids']))
            return
            # true_line = copy.deepcopy(true_line)
            # utils.remove_all_implicit_info(true_line)
            # true_line['indelfos'][iseq] = indelutils.get_empty_indel()
            # true_line['seqs'][iseq] = true_line['input_seqs'][iseq]
            # utils.add_implicit_info(simglfo, true_line)

    # mutation frequency and total number of mutations, for both lines and for each region restriction string
    mutfo = {ltype : {mtype : {} for mtype in ['freq', 'total']} for ltype in ['sim', 'inf']}
    for rstr in plotconfig.rstrings:
        for ltype, tmpline in (('sim', true_line), ('inf', inf_line)):
            if rstr == '':  # these are already in the <line>s, so may as well not recalculate
                freq, total = tmpline['mut_freqs'][iseq], tmpline['n_mutations'][iseq]
            else:
                freq, total = utils.get_mutation_rate_and_n_muted(tmpline, iseq=iseq, restrict_to_region=rstr.rstrip('_'))
            mutfo[ltype]['freq'][rstr] = freq
            mutfo[ltype]['total'][rstr] = total

    for col in plotconfig.gene_usage_columns:
        self.set_bool_column(true_line, inf_line, col, mutfo['sim']['freq'][''])  # this also sets the fraction-correct-vs-mute-freq hists

    for column in plotconfig.int_columns:
        addval(column, true_line[column], inf_line[column])

    for rstr in plotconfig.rstrings:
        region_name = rstr.rstrip('_')
        addval(rstr + 'hamming_to_true_naive', 0, self.hamming_to_true_naive(true_line, inf_line, restrict_to_region=region_name))
        addval(rstr + 'muted_bases', mutfo['sim']['total'][rstr], mutfo['inf']['total'][rstr])

    for region in utils.regions:
        if region + '_per_gene_support' in inf_line:
            self.set_per_gene_support(true_line, inf_line, region)

    self.hists['mute_freqs'].fill(mutfo['inf']['freq'][''] - mutfo['sim']['freq'][''])  # when we're evaluating on multi-seq hmm output, we synthesize single-sequence lines for each sequence
def check_seed_for_indels(cluster_line, seed_id, partition_file):
    """
    Raise an exception if the seed sequence <seed_id> in <cluster_line> has an indel.

    The indel's debug string is printed before raising. <partition_file> is only used in the error message.
    A ValueError from list.index() propagates if <seed_id> is not among the cluster's unique ids.
    """
    seed_index = cluster_line["unique_ids"].index(seed_id)
    seed_indelfo = cluster_line["indelfos"][seed_index]
    if not indelutils.has_indels(seed_indelfo):
        return  # nothing to complain about

    print(indelutils.get_dbg_str(cluster_line["indelfos"][seed_index]))
    message = (
        "indel in seed sequence {}. Options are 1. Look at the annotation for this cluster and find the indel in the seed. "
        "Rerun process_partis.py with --match-indel-in-uid <uid-of-seq-containing-indel-of-interest> to process only sequences "
        "containing that specific indel for further analysis of the indel 2. Run with --ignore-seed-indels. PS check out {}"
    )
    raise Exception(message.format(seed_id, partition_file))
def setline(self, irandom=None):  # don't access <self.line> directly
    """
    Build the annotation dict for this event (once), cache it in <self.line>, and return it.

    If <self.line> is already set it is returned unchanged. Otherwise the line is assembled from <self.genes>,
    <self.insertions>, <self.erosions>, <self.effective_erosions>, <self.final_seqs>, <self.indelfos> and <self.tree>,
    ids are set with self.set_ids() (which is passed <irandom>), and implicit info is added using <self.glfo>.

    Returns the line dict on every call (previously the first call fell off the end and returned None, while later
    calls returned the cached line).
    """
    if self.line is not None:
        return self.line

    line = {}
    for region in utils.regions:
        line[region + '_gene'] = self.genes[region]
    for boundary in utils.boundaries:
        line[boundary + '_insertion'] = self.insertions[boundary]
    for boundary in utils.effective_boundaries:
        line[boundary + '_insertion'] = ''  # NOTE 'fv' and 'jf' insertions are hereby hardcoded to zero (I'm just writing this here to make it easily searchable -- I don't remember why it's set up that way)
    for erosion in utils.real_erosions:
        line[erosion + '_del'] = self.erosions[erosion]
    for erosion in utils.effective_erosions:
        line[erosion + '_del'] = self.effective_erosions[erosion]

    line['input_seqs'] = self.final_seqs
    line['indelfos'] = self.indelfos
    # use the indel-reversed sequence wherever a sequence has indels, otherwise just the input sequence
    # (indexing <self.indelfos> rather than zipping, so a length mismatch still raises instead of silently truncating)
    line['seqs'] = [
        self.indelfos[iseq]['reversed_seq'] if indelutils.has_indels(self.indelfos[iseq]) else input_seq
        for iseq, input_seq in enumerate(line['input_seqs'])
    ]

    self.set_ids(line, irandom=irandom)  # the code below reads line['unique_ids'], so this presumably fills it in

    # materialize the pairs so the translation list can be iterated more than once regardless of python version
    treeutils.translate_labels(self.tree, list(zip(self.leaf_names, line['unique_ids'])))  # ordering in <self.leaf_names> is set in recombinator.add_mutants()
    line['affinities'] = [None for _ in line['unique_ids']]
    line['tree'] = self.tree.as_string(schema='newick')

    utils.add_implicit_info(self.glfo, line)

    self.line = line
    return self.line
def setline(self, irandom=None):  # don't access <self.line> directly
    """
    Build the annotation dict for this event (once), cache it in <self.line>, and return it.

    If <self.line> is already set it is returned unchanged. Otherwise the line is assembled from <self.genes>,
    <self.insertions>, <self.erosions>, <self.effective_erosions>, <self.final_seqs> and <self.indelfos>, ids are
    set with self.set_ids() (which is passed <irandom>), and implicit info is added using <self.glfo>.

    Returns the line dict on every call (previously the first call fell off the end and returned None, while later
    calls returned the cached line).
    """
    if self.line is not None:
        return self.line

    line = {}
    for region in utils.regions:
        line[region + '_gene'] = self.genes[region]
    for boundary in utils.boundaries:
        line[boundary + '_insertion'] = self.insertions[boundary]
    for boundary in utils.effective_boundaries:
        line[boundary + '_insertion'] = ''  # effective insertions are always written as empty strings here
    for erosion in utils.real_erosions:
        line[erosion + '_del'] = self.erosions[erosion]
    for erosion in utils.effective_erosions:
        line[erosion + '_del'] = self.effective_erosions[erosion]

    line['input_seqs'] = self.final_seqs
    line['indelfos'] = self.indelfos
    # use the indel-reversed sequence wherever a sequence has indels, otherwise just the input sequence
    # (indexing the indelfos rather than zipping, so a length mismatch still raises instead of silently truncating)
    line['seqs'] = [
        line['indelfos'][iseq]['reversed_seq'] if indelutils.has_indels(line['indelfos'][iseq]) else input_seq
        for iseq, input_seq in enumerate(line['input_seqs'])
    ]

    self.set_ids(line, irandom)
    utils.add_implicit_info(self.glfo, line)

    self.line = line
    return self.line
def get_cluster_meta_dict(cluster_line, seed_id, args):
    """
    Return a dict of summary metadata for the cluster annotation <cluster_line>.

    Warns (via the warnings module) if --indel-reversed-seqs was not passed and no sequence in the cluster has
    indels. The per-sequence info comes from get_cluster_seqs_dict().
    """
    per_seq_indel_flags = [
        indelutils.has_indels(cluster_line["indelfos"][iseq])
        for iseq, _ in enumerate(cluster_line["input_seqs"])
    ]
    has_indels = any(per_seq_indel_flags)

    if not (args.indel_reversed_seqs or has_indels):
        message = (
            "{}: --indel-reversed-seqs was not passed and there are no indels. "
            "If running this script from CFT, this is probably because CFT was run with "
            "--preserve-indels and there are no indels in this cluster. "
            "It will get aligned anyway."
        ).format(utils.color("red", "warning"))
        warnings.warn(message)

    # filled in the same order as the keys are listed, so any failure surfaces at the same point
    meta = {}
    meta["sequences"] = get_cluster_seqs_dict(cluster_line, seed_id, args)
    meta["cdr3_start"] = cluster_line["codon_positions"]["v"]
    meta["has_seed"] = seed_id in cluster_line["unique_ids"]
    meta["mean_mut_freq"] = numpy.mean(cluster_line["mut_freqs"])
    meta["seed_id"] = seed_id
    meta["match_indel_in_uid"] = args.match_indel_in_uid is not None
    meta["has_indels"] = has_indels
    meta["indels_reversed"] = has_indels and args.indel_reversed_seqs
    return meta
def print_seq_in_reco_event(original_line, iseq, extra_str='', label='', one_line=False, seed_uid=None, check_line_integrity=False):
    """
    Print ascii summary of recombination event and mutation.
    If <one_line>, then skip the germline lines, and only print the final_seq line.

    Four strings are built (insertions, d germline, v + j germline, and the query sequence), padded so they line
    up, colored with add_colors(), given a descriptive suffix, and printed.

    Arguments:
      original_line: annotation dict. Keys read here include '<erosion>_del', '<boundary>_insertion',
        '<region>_gl_seq', '<region>_gene', 'lengths', 'seqs', 'indelfos' and 'mut_freqs'.
      iseq: index (into line['seqs'], line['indelfos'] and line['mut_freqs']) of the sequence to print.
      extra_str: string prepended to every output line.
      label: if not empty, written over the start of the first output line, beginning two characters before
        the end of <extra_str> (or at position 0 if <extra_str> is shorter than that).
      one_line: if set, only the last (query sequence) line is printed.
      seed_uid: passed through to get_uid_str().
      check_line_integrity: if set, work on a deep copy of <original_line> and afterwards verify that the copy
        still has the same keys and values as the original.

    Raises:
      Exception: if <check_line_integrity> is set and the copy's keys or values differ from <original_line>.
    """
    line = original_line
    if check_line_integrity:  # it's very important not to modify <line> -- this lets you verify that you aren't
        line = copy.deepcopy(original_line)  # copy that we can modify without changing <line>

    # one dot per deleted base for each erosion (some of these get replaced below by a compact '.<n>.' form)
    delstrs = {d: '.' * line[d + '_del'] for d in utils.all_erosions}  # NOTE len(delstrs[<del>]) is not in general the same as len(line[<del>_del])
    if len(delstrs['v_5p']) > 50:  # don't print a million dots if left-side v deletion is really big
        delstrs['v_5p'] = '.%d.' % len(delstrs['v_5p'])

    # if there isn't enough space for dots in the vj line, we add some dashes to everybody so things fit (rare in heavy chain rearrangements, but pretty common in light chain)
    d_plus_inserts_length = len(line['vd_insertion'] + line['d_gl_seq'] + line['dj_insertion'])
    if line['v_3p_del'] + line['j_5p_del'] > d_plus_inserts_length:  # if dots for v and j interior deletions will be longer than <d_plus_inserts_length>
        delstrs['v_3p'] = '.%d.' % line['v_3p_del']
        delstrs['j_5p'] = '.%d.' % line['j_5p_del']
        gapstr = '-' * (len(delstrs['v_3p'] + delstrs['j_5p']) - d_plus_inserts_length)  # empty if the shortened delstrs already fit (multiplying a string by a non-positive number gives '')
        gap_insert_point = len(line['fv_insertion'] + delstrs['v_5p'] + line['v_gl_seq'])  # it doesn't really matter exactly where we put the blue dashes, as long as it's the same place in all four lines, but this is a good spot
        extra_space_because_of_fixed_nospace = max(0, d_plus_inserts_length - len(delstrs['v_3p'] + delstrs['j_5p']))  # if shortening the <delstrs> already over-compensated for the lack of space (i.e., if the number of dashes necessary is zero), then we need to add some dots to the vj line below
    else:
        gapstr = ''
        gap_insert_point = None  # signals below that no gap needs to be inserted
        extra_space_because_of_fixed_nospace = 0

    # germline sequence for each region, flanked by the strings for its 5' and 3' deletions
    eroded_seqs_dots = {r: delstrs[r + '_5p'] + line[r + '_gl_seq'] + delstrs[r + '_3p'] for r in utils.regions}

    # build the three germline lines
    insert_line = ' ' * (len(line['fv_insertion']) + line['lengths']['v'] + len(delstrs['v_5p'])) \
        + line['vd_insertion'] + ' ' * line['lengths']['d'] + line['dj_insertion'] \
        + ' ' * (line['lengths']['j'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_d_start = len(line['fv_insertion']) + line['lengths']['v'] + len(line['vd_insertion']) - line['d_5p_del']
    germline_d_end = germline_d_start + line['d_5p_del'] + line['lengths']['d'] + line['d_3p_del']
    d_line = ' ' * (germline_d_start + len(delstrs['v_5p'])) \
        + eroded_seqs_dots['d'] \
        + ' ' * (len(line['j_gl_seq']) + len(line['dj_insertion']) - line['d_3p_del'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_v_end = len(line['fv_insertion']) + len(line['v_gl_seq']) + line['v_3p_del'] - 1  # position in the query sequence at which we find the last base of the v match. NOTE we subtract off the v_5p_del because we're *not* adding dots for that deletion (it's just too long)
    germline_j_start = germline_d_end + 1 - line['d_3p_del'] + len(line['dj_insertion']) - line['j_5p_del']
    vj_line = ' ' * len(line['fv_insertion']) + eroded_seqs_dots['v'] + '.' * extra_space_because_of_fixed_nospace \
        + ' ' * (germline_j_start - germline_v_end - 2) + eroded_seqs_dots['j'] + ' ' * len(line['jf_insertion'])
    # and the query line
    qrseq_line = ' ' * len(delstrs['v_5p']) + line['seqs'][iseq] + ' ' * line['j_3p_del']

    outstrs = [insert_line, d_line, vj_line, qrseq_line]
    check_outsr_lengths(line, outstrs, fix=True)  # I think the only way they can be different is if the d right side erosion is so long that it hangs over the right side of the j

    if gap_insert_point is not None:
        for istr in [0, 1, 3]:  # everybody except the vj line, which already has the modified interior delstrs above
            outstrs[istr] = outstrs[istr][:gap_insert_point] + gapstr + outstrs[istr][gap_insert_point:]

    check_outsr_lengths(line, outstrs, fix=True)  # check again now that the gap may have been inserted

    # one (initially empty) list of colors per character of each output string
    colors = [[[] for _ in range(len(ostr))] for ostr in outstrs]
    if indelutils.has_indels(line['indelfos'][iseq]):
        # outstrs, colors = old_indel_shenanigans(line, iseq, outstrs, colors)
        outstrs, colors = indel_shenanigans(line, iseq, outstrs, colors)
    outstrs = add_colors(outstrs, colors, line)

    # text that goes to the right of each of the four lines (same order as <outstrs>)
    suffixes = ['insert%s\n' % ('s' if utils.has_d_gene(utils.get_locus(line['v_gene'])) else ''),
                '%s\n' % (utils.color_gene(line['d_gene'])),
                '%s %s\n' % (utils.color_gene(line['v_gene']), utils.color_gene(line['j_gene'])),
                '%s %4.2f mut %s\n' % (get_uid_str(line, iseq, seed_uid), line['mut_freqs'][iseq], utils.color('red', utils.is_functional_dbg_str(line, iseq)))]
    outstrs = ['%s%s %s' % (extra_str, ostr, suf) for ostr, suf in zip(outstrs, suffixes)]

    if label != '':  # this doesn't really work if the edge of the removed string is the middle of a color code... but oh well, it doesn't really happen any more since I shortened the kbound label from waterer.py
        offset = max(0, len(extra_str) - 2)  # skootch <label> this many positions leftward into <extra_str>
        removed_str = outstrs[0][offset:offset + utils.len_excluding_colors(label)]
        outstrs[0] = outstrs[0][:offset] + label + outstrs[0][utils.len_excluding_colors(label) + offset:]  # NOTE this *replaces* the bases in <extra_str> with <label>, which is only fine if they're spaces
        if removed_str.strip() != '':  # the label covered something other than whitespace, so show what got overwritten
            print '%s%s (covered by label \'%s\')' % (' ' * offset, utils.color('red', removed_str), label)

    if one_line:
        outstrs = outstrs[-1:]  # remove all except the query seq line
    elif not utils.has_d_gene(utils.get_locus(line['v_gene'])):
        outstrs.pop(1)  # remove the d germline line

    print ''.join(outstrs),  # each string already ends in a newline (from its suffix), hence the trailing comma

    if check_line_integrity:  # make sure nothing above modified the copy that we were working on
        if set(line.keys()) != set(original_line.keys()):
            raise Exception('ack 1')
        for k in line:
            if line[k] != original_line[k]:
                print 'key %s differs:\n %s\n %s ' % (k, line[k], original_line[k])
                raise Exception('')
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """
    Read the bcr-phylo output in <outdir> and return a multi-sequence annotation for the mutated sequences.

    Each sequence in the bcr-phylo fasta gets a copy of <naive_line> with its sequences swapped in, and these are
    combined into one line. For 'selection' runs, per-node kd info and the tree are also read and added to the
    line. <ievent> is passed to set_reco_id() as <irandom>.
    """
    fasta_fname = bcr_phylo_fasta_fname(outdir)
    seqfos = utils.read_fastx(fasta_fname)  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # one single-sequence line per mutated sequence, each starting from the naive line
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        uid, mutated_seq = sfo['name'], sfo['seq']
        mline['unique_ids'] = [uid]
        # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['seqs'] = [mutated_seq]
        mline['input_seqs'] = [mutated_seq]
        mline['duplicates'] = [[]]
        reco_info[uid] = mline
        utils.add_implicit_info(glfo, mline)

    all_names = [sfo['name'] for sfo in seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_names, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)

        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            for row in csv.DictReader(kdfile):
                nodefo[row['uid']] = {
                    'kd' : float(row['kd']),
                    'relative_kd' : float(row['relative_kd']),
                    'lambda' : row.get('lambda', None),
                    'target_index' : int(row['target_index']),
                }

        only_in_kdfile = set(nodefo) - set(final_line['unique_ids'])
        if only_in_kdfile:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (only_in_kdfile))
        only_in_line = set(final_line['unique_ids']) - set(nodefo)
        if only_in_line:
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        uids = final_line['unique_ids']
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in uids]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in uids]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in uids]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in uids]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        mean_seq_len = numpy.mean([len(s) for s in final_line['seqs']])
        tree.scale_edges(1. / mean_seq_len)  # divide every branch length by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """
    Read the bcr-phylo output in <outdir> and return a multi-sequence annotation for the mutated sequences.

    Each sequence in the bcr-phylo fasta gets a copy of <naive_line> with its sequences swapped in, and these are
    combined into one line. For 'selection' runs, kd values and the tree are also read and added to the line.
    Finally the target sequences are read, and for each mutated sequence we record the index of the target with
    the smallest amino acid hamming distance. <ievent> is passed to set_reco_id() as <irandom>.
    """
    fasta_fname = '%s/%s.fasta' % (outdir, args.extrastr)
    seqfos = utils.read_fastx(fasta_fname)  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # one single-sequence line per mutated sequence, each starting from the naive line
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        uid, mutated_seq = sfo['name'], sfo['seq']
        mline['unique_ids'] = [uid]
        # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['seqs'] = [mutated_seq]
        mline['input_seqs'] = [mutated_seq]
        reco_info[uid] = mline
        utils.add_implicit_info(glfo, mline)

    all_names = [sfo['name'] for sfo in seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_names, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)

        with open('%s/kd-vals.csv' % outdir) as kdfile:
            kdvals = {row['uid'] : float(row['kd']) for row in csv.DictReader(kdfile)}

        only_in_kdfile = set(kdvals) - set(final_line['unique_ids'])
        if only_in_kdfile:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (only_in_kdfile))
        only_in_line = set(final_line['unique_ids']) - set(kdvals)
        if only_in_line:
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / kdvals[u] for u in final_line['unique_ids']]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    # for each mutated sequence, find the target that's closest in amino acid hamming distance
    from Bio.Seq import Seq
    nearest_indices = final_line['nearest_target_indices'] = []
    aa_targets = [Seq(tseq).translate() for tseq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [utils.hamming_distance(aa_target, aa_mseq, amino_acid=True) for aa_target in aa_targets]
        smallest = min(aa_hdists)
        nearest_indices.append(aa_hdists.index(smallest))  # NOTE doesn't do anything differently if there's more than one min

    return final_line
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """
    Read the bcr-phylo output in <outdir> and return a multi-sequence annotation for the mutated sequences.

    Each sequence in the bcr-phylo fasta gets a copy of <naive_line> with its sequences swapped in, and these are
    combined into one line. For 'selection' runs, per-node kd info and the tree are also read and added to the
    line (running the kd/newick conversion script first if its output doesn't already exist).

    Arguments:
      glfo: germline info, passed to utils.add_implicit_info() and RecombinationEvent().
      naive_line: single-sequence naive annotation (asserted to have one unique id and no indels).
      outdir: directory holding the bcr-phylo output files.
      ievent: event index, used in the failure message and passed to set_reco_id() as <irandom>.

    Returns the synthesized multi-sequence line, with 'target_seqs' added (and, for selection runs,
    'affinities', 'relative_affinities', 'lambdas', 'nearest_target_indices' and 'tree').
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # deliberately best-effort, but a bare except would also swallow KeyboardInterrupt and SystemExit
            # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
            lines = traceback.format_exception(*sys.exc_info())
            print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),  # the only column that's allowed to be missing
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:  # only a warning: the lookups below will then raise a KeyError for these uids
            print(' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide every branch length by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None):
    """
    Return a multi-sequence annotation for the mutated sequences <sfos>, each built from <naive_line>.

    Arguments:
      sfos: list of dicts with 'name' and 'seq' keys, one per mutated sequence.
      naive_line: single-sequence naive annotation (asserted to have one unique id and no indels).
      glfo: germline info, passed to utils.add_implicit_info() and RecombinationEvent().
      nodefo: dict keyed by uid, whose values have 'kd', 'relative_kd', 'lambda' and 'target_index' keys.
      dtree: tree, which is deep copied (the copy is relabeled, rescaled, and written to the line as newick).
      target_sfos: list of dicts with a 'seq' key, used to set 'target_seqs'.
      locus: if not None, '-<locus>' is appended to every uid (in the line, in <nodefo>'s keys, and in the
        tree copy), and the line's 'paired-uids' are rewritten to match.

    NOTE(review): <ievent> and <outdir> are not parameters, so they presumably come from an enclosing scope that
    isn't visible here -- confirm before moving this function.
    """
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in sfos:
        mline = utils.get_non_implicit_copy(naive_line)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # deliberately best-effort, but a bare except would also swallow KeyboardInterrupt and SystemExit
            # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
            lines = traceback.format_exception(*sys.exc_info())
            print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)

    ftree = copy.deepcopy(dtree)  # so the relabeling and rescaling below don't touch the caller's tree
    if locus is not None:
        def ltr(u):
            return u + '-' + locus
        # rebind the local name to a new dict with translated keys (the caller's <nodefo> isn't modified)
        new_nodefo = {}
        for u_old in nodefo:
            new_nodefo[ltr(u_old)] = nodefo[u_old]
        nodefo = new_nodefo
        treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
        final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
        assert len(sfos) == len(final_line['unique_ids'])
        for iseq, sfo in enumerate(sfos):
            naive_id = naive_line['unique_ids'][0]
            assert naive_id.count('-') == 1
            bstr = naive_id.replace('-' + locus, '')  # the naive uid with the locus suffix removed
            pids = final_line['paired-uids'][iseq]
            assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
            final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]

    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
        print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
    if len(set(final_line['unique_ids']) - set(nodefo)) > 0:  # only a warning: the lookups below will then raise a KeyError for these uids
        print(' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
    final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
    final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
    final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
    final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
    ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide every branch length by the mean sequence length
    if args.debug:
        print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
    final_line['tree'] = ftree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]

    return final_line