def infer_tree_from_leaves(self, region, in_tree, leafseqs, naive_seq):
    """Infer a tree from simulated sequences with FastTree and print it beside the input tree.

    The naive sequence (labelled 'naive') and the leaf sequences (labelled
    't1', 't2', ...) are written as FASTA-style records to a temporary file,
    './bin/FastTree -gtr -nt' is run on that file, and the newick tree it
    prints is rerooted at the node labelled 'naive'.  Ascii renderings of the
    input and inferred trees are then printed, the inferred one with its width
    scaled by the ratio of the two mean heights.  When self.args.debug is set,
    the two heights and three dendropy tree-comparison distances are printed
    as well.

    Args:
        region: label used in the printout; 'all' is displayed as 'full sequence'.
        in_tree: newick string for the input tree.
        leafseqs: leaf sequences; the i-th one (counting from 1) is named 't<i>'.
        naive_seq: sequence written under the name 'naive' and used for rerooting.

    Raises:
        subprocess.CalledProcessError: if the FastTree command exits non-zero.
    """
    # dendropy is imported lazily: only pull it in if nothing else has loaded it yet
    if 'dendropy' not in sys.modules:
        import dendropy
    dendropy = sys.modules['dendropy']

    # one namespace shared by both trees so that they can be compared below
    shared_taxa = dendropy.TaxonNamespace()

    with tempfile.NamedTemporaryFile() as fasta_file:
        fasta_file.write('>%s\n%s\n' % ('naive', naive_seq))
        # NOTE the order of the leaves/names is checked when reading bppseqgen output
        for ileaf, leafseq in enumerate(leafseqs, 1):
            fasta_file.write('>t%s\n%s\n' % (ileaf, leafseq))
        # FastTree runs as a separate process, so the buffered records have to be on disk first
        fasta_file.flush()
        fasttree_cmd = './bin/FastTree -gtr -nt ' + fasta_file.name
        with open(os.devnull, 'w') as devnull:
            out_tree = subprocess.check_output(fasttree_cmd, shell=True, stderr=devnull)

    # root the inferred tree on the naive sequence, then go back to a newick string
    out_dtree = dendropy.Tree.get_from_string(out_tree, 'newick', taxon_namespace=shared_taxa)
    naive_node = out_dtree.find_node_with_taxon_label('naive')
    out_dtree.reroot_at_node(naive_node, update_bipartitions=True)
    out_tree = out_dtree.as_string(schema='newick', suppress_rooting=True)

    in_height = treegenerator.get_mean_height(in_tree)
    out_height = treegenerator.get_mean_height(out_tree)
    base_width = 100

    region_label = 'full sequence' if region == 'all' else region
    print(' %s trees:' % region_label)
    print(' %s' % utils.color('blue', 'input:'))
    print(treegenerator.get_ascii_tree(in_tree, extra_str=' ', width=base_width))
    print(' %s' % utils.color('blue', 'output:'))
    # draw the inferred tree wider or narrower in proportion to its height relative to the input tree
    out_width = int(base_width * out_height / in_height)
    print(treegenerator.get_ascii_tree(out_tree, extra_str=' ', width=out_width))

    in_dtree = dendropy.Tree.get_from_string(in_tree, 'newick', taxon_namespace=shared_taxa)

    if self.args.debug:
        treecompare = dendropy.calculate.treecompare
        print(' heights: %.3f %.3f' % (in_height, out_height))
        print(' symmetric difference: %d' % treecompare.symmetric_difference(in_dtree, out_dtree))
        print(' euclidean distance: %f' % treecompare.euclidean_distance(in_dtree, out_dtree))
        print(' r-f distance: %f' % treecompare.robinson_foulds_distance(in_dtree, out_dtree))
def infer_tree_from_leaves(self, region, in_tree, leafseqs):
    """Infer a tree from the leaf sequences with FastTree and print it beside the input tree.

    The leaf sequences (labelled 't1', 't2', ...) are written as FASTA-style
    records to a temporary file and './bin/FastTree -gtr -nt' is run on it.
    Ascii renderings of the input tree and of FastTree's output are printed,
    the latter with its width scaled by the ratio of the two mean heights.
    Both trees are then parsed with dendropy, and when self.args.debug is set
    the heights and three tree distances are printed.

    Args:
        region: label used in the printout.
        in_tree: newick string for the input tree.
        leafseqs: leaf sequences; the i-th one (counting from 1) is named 't<i>'.

    Raises:
        subprocess.CalledProcessError: if the FastTree command exits non-zero.
    """
    with tempfile.NamedTemporaryFile() as fasta_file:
        # NOTE the order of the leaves/names is checked when reading bppseqgen output
        for ileaf, leafseq in enumerate(leafseqs, 1):
            fasta_file.write('>t%s\n%s\n' % (ileaf, leafseq))
        # FastTree runs as a separate process, so the buffered records have to be on disk first
        fasta_file.flush()
        fasttree_cmd = './bin/FastTree -gtr -nt ' + fasta_file.name
        with open(os.devnull, 'w') as devnull:
            out_tree = subprocess.check_output(fasttree_cmd, shell=True, stderr=devnull)

    in_height = treegenerator.get_mean_height(in_tree)
    out_height = treegenerator.get_mean_height(out_tree)
    base_width = 100

    print(' %s trees: input/output' % region)
    print(treegenerator.get_ascii_tree(in_tree, extra_str=' ', width=base_width))
    # draw the inferred tree wider or narrower in proportion to its height relative to the input tree
    out_width = int(base_width * out_height / in_height)
    print(treegenerator.get_ascii_tree(out_tree, extra_str=' ', width=out_width))

    # dendropy is imported lazily: only pull it in if nothing else has loaded it yet
    if 'dendropy' not in sys.modules:
        import dendropy
    dendropy_module = sys.modules['dendropy']
    in_dtree = dendropy_module.Tree.get_from_string(in_tree, 'newick')
    out_dtree = dendropy_module.Tree.get_from_string(out_tree, 'newick')

    if self.args.debug:
        print(' heights: %.3f %.3f' % (in_height, out_height))
        print(' symmetric difference: %d' % in_dtree.symmetric_difference(out_dtree))
        print(' euclidean distance: %f' % in_dtree.euclidean_distance(out_dtree))
        print(' r-f distance: %f' % in_dtree.robinson_foulds_distance(out_dtree))
def add_mutants(self, reco_event, irandom):
    """Simulate mutated leaf sequences for <reco_event> and append them to its final_seqs.

    A tree line is drawn at random from self.treeinfo.  Each line is a newick
    tree followed by per-region ratios, e.g.

        (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87

    where the newick branch lengths correspond to the whole sequence (i.e. the
    weighted mean of v, d, and j).  When the trees are generated, each one's
    number of leaves and total depth are drawn from the specified
    distributions, and the depth corresponds to the sequence-wide mutation
    frequency.  To account for varying mutation rates in v, d, and j, the
    regions are simulated separately: the tree is rescaled for each region to
    (mean tree height) * (that region's ratio), further multiplied by
    self.args.mutation_multiplier if that is set.

    NOTE a.t.m. (and probably permanently) the per-region ratios are the same
    for every tree in the file; they are tacked onto each line only because
    there is no better place to pass them from TreeGenerator to here.

    Args:
        reco_event: recombination event; its final_seqs, final_codon_positions
            (and, in the zero-mutation case, indelfos) are filled in here.
        irandom: passed as the seed to prepare_bppseqgen and to reco_event.setline.
    """
    mute_mult = self.args.mutation_multiplier

    # some of the stuff below fails if mut mult is actually 0., so just pass the unmutated sequence through
    if mute_mult is not None and mute_mult == 0.:
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
        reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]
        return

    # per-region mutation info is tacked on after the tree's semi-colon (kind of hackey, but works ok)
    treefostr = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
    assert treefostr.count(';') == 1
    tree_end = treefostr.find(';') + 1
    chosen_tree = treefostr[:tree_end]  # includes the semi-colon
    ratio_strs = treefostr[tree_end:].split(',')

    mean_total_height = treegenerator.get_mean_height(chosen_tree)

    # per-region height, including <self.args.mutation_multiplier>
    regional_heights = {}
    for ratio_str in ratio_strs:
        region, ratio = ratio_str.split(':')
        assert region in utils.regions
        ratio = float(ratio)
        if mute_mult is not None:  # multiply the branch lengths by some factor
            ratio *= mute_mult
        regional_heights[region] = mean_total_height * ratio

    scaled_trees = {}
    for region in utils.regions:
        scaled_trees[region] = treegenerator.rescale_tree(chosen_tree, regional_heights[region])

    if self.args.debug:
        print(' chose tree with total height %f' % treegenerator.get_mean_height(chosen_tree))
        height_strs = []
        for region in utils.regions:
            rescaled_height = treegenerator.get_mean_height(scaled_trees[region])
            height_strs.append('%s %.3f (expected %.3f)' % (region, rescaled_height, regional_heights[region]))
        print(' regional trees rescaled to heights: %s' % ' '.join(height_strs))
        print(treegenerator.get_ascii_tree(chosen_tree, extra_str=' '))

    n_leaves = treegenerator.get_n_leaves(chosen_tree)

    # one command per region, in <utils.regions> order; the d region is simulated together with its flanking insertions
    cmdfos = []
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfo = self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom)
        cmdfos.append(cmdfo)

    # a None entry in <cmdfos> stands for a zero-length regional seq: nothing to run for it
    runnable_cmdfos = [cmdfo for cmdfo in cmdfos if cmdfo is not None]
    utils.run_cmds(runnable_cmdfos, sleep=False)

    # <cmdfos> was filled in <utils.regions> order just above, so the two can be walked in lockstep
    mseqs = {}
    for region, cmdfo in zip(utils.regions, cmdfos):
        if cmdfo is None:
            mseqs[region] = [''] * n_leaves  # an empty string for each leaf node
        else:
            mseqs[region] = self.read_bppseqgen_output(cmdfo, n_leaves)

    assert len(reco_event.final_seqs) == 0

    for ileaf in range(n_leaves):
        leaf_seq = mseqs['v'][ileaf] + mseqs['d'][ileaf] + mseqs['j'][ileaf]
        # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        leaf_seq = reco_event.revert_conserved_codons(leaf_seq, debug=self.args.debug)
        reco_event.final_seqs.append(leaf_seq)  # set final sequence in reco_event
        # separate codon positions for each sequence, because of shm indels
        codon_positions = copy.deepcopy(reco_event.post_erosion_codon_positions)
        reco_event.final_codon_positions.append(codon_positions)

    self.add_shm_indels(reco_event)

    # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow
    reco_event.setline(irandom)

    self.check_tree_simulation(mean_total_height, regional_heights, scaled_trees, mseqs, reco_event)

    if self.args.debug:
        utils.print_reco_event(reco_event.line, extra_str=' ')