def read_input_tree_file(self, outfname): if self.args.debug: print ' reading trees from %s' % self.args.input_simulation_treefname utils.simplerun('cp %s %s' % (self.args.input_simulation_treefname, outfname), debug=False) ages, treestrs = [], [] with open(outfname) as treefile: for line in treefile: tstr = line.strip() if tstr == '': # skip empty lines continue dtree = treeutils.get_dendro_tree( treestr=tstr, suppress_internal_node_taxa=True) if dtree.seed_node.edge_length is None: # make sure root edge length is set (otherwise bppseqgen barfs) dtree.seed_node.edge_length = 0. old_new_label_pairs = [ (l.taxon.label, 't%d' % (i + 1)) for i, l in enumerate(dtree.leaf_node_iter()) ] treeutils.translate_labels( dtree, old_new_label_pairs ) # rename the leaves to t1, t2, etc. (it would be nice to not have to do this, but a bunch of stuff in recombinator uses this to check that e.g. bppseqgen didn't screw up the ordering) age = self.choose_full_sequence_branch_length() if self.args.debug > 1: # it's easier to keep this debug line separate up here than make a tmp variable to keep track of the old height print ' input tree %d (rescaled depth %.3f --> %.3f):' % ( len(ages), treeutils.get_mean_leaf_height(tree=dtree), age) treeutils.rescale_tree( age, dtree=dtree ) # I think this gets rescaled again for each event, so we could probably in principle avoid this rescaling, but if the input depth is greater than one stuff starts breaking, so may as well do it now ages.append(age) treestrs.append(dtree.as_string(schema='newick').strip()) if self.args.debug > 1: print utils.pad_lines(treeutils.get_ascii_tree(dtree)) if any(a > 1. 
for a in ages): raise Exception( 'tree depths must be less than 1., but trees read from %s don\'t satisfy this: %s' % (self.args.input_simulation_treefname, ages)) if len(ages) != self.args.n_trees: print ' resetting --n-trees from %d to %d to match trees read from %s' % ( self.args.n_trees, len(ages), self.args.input_simulation_treefname) self.args.n_trees = len(ages) return ages, treestrs
def add_mutants(self, reco_event, irandom):
    """Fill <reco_event> with mutated leaf sequences simulated along a randomly chosen tree.

    A tree line is drawn at random from <self.treeinfo>, split into the newick tree and the per-region
    ratios that follow the semicolon, and the tree is rescaled separately for each region in
    <utils.regions>. One command per region is built by self.prepare_bppseqgen() and they are run with
    utils.run_cmds(); the per-region outputs are then concatenated (v + d + j) for each leaf, passed
    through reco_event.revert_conserved_codons(), and appended to reco_event.final_seqs. Finally shm
    indels are added and reco_event.setline(irandom) is called.

    If <self.args.mutation_multiplier> is exactly 0., none of that happens: the unmutated
    reco_event.recombined_seq is appended as the only final sequence and the function returns early.

    Args:
        reco_event: event object that is modified in place (tree, leaf_names, final_seqs,
            final_codon_positions, indelfos in the zero-multiplier case, ...).
        irandom: passed as <seed> to self.prepare_bppseqgen() and as the argument to reco_event.setline().

    Returns:
        None
    """
    if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
        reco_event.indelfos = [indelutils.get_empty_indel() for _ in range(len(reco_event.final_seqs))]
        return

    # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
    # This chosen depth corresponds to the sequence-wide mutation frequency.
    # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
    # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
    # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
    treefostr = self.treeinfo[random.randint(0, len(self.treeinfo)-1)]  # per-region mutation info is tacked on after the tree... sigh. kind of hackey but works ok.
    assert treefostr.count(';') == 1  # exactly one semicolon, which separates the tree from the per-region info
    isplit = treefostr.find(';') + 1
    chosen_tree = treefostr[:isplit]  # includes semi-colon
    reco_event.set_tree(chosen_tree)  # leaf names are still just like t<n>
    mutefo = [rstr for rstr in treefostr[isplit:].split(',')]  # list of '<region>:<ratio>' strings
    mean_total_height = treeutils.get_mean_leaf_height(treestr=chosen_tree)
    regional_heights = {}  # per-region height, including <self.args.mutation_multiplier>
    for tmpstr in mutefo:
        region, ratio = tmpstr.split(':')
        assert region in utils.regions
        ratio = float(ratio)
        if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
            ratio *= self.args.mutation_multiplier
        regional_heights[region] = mean_total_height * ratio

    # one copy of the chosen tree per region, each rescaled to that region's height
    scaled_trees = {r : treeutils.rescale_tree(regional_heights[r], treestr=chosen_tree) for r in utils.regions}

    if self.args.debug:
        print ' chose tree with total height %f' % treeutils.get_mean_leaf_height(treestr=chosen_tree)
        print ' regional trees rescaled to heights: %s' % (' '.join(['%s %.3f (expected %.3f)' % (region, treeutils.get_mean_leaf_height(treestr=scaled_trees[region]), regional_heights[region]) for region in utils.regions]))

    n_leaves = treeutils.get_n_leaves(treeutils.get_dendro_tree(treestr=chosen_tree, schema='newick'))
    cmdfos = []  # one entry per region, in the same order as <utils.regions> (the loop further down relies on this)
    regional_naive_seqs = {}  # only used for tree checking
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':  # the vd and dj insertions are simulated together with the d segment
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom))
        regional_naive_seqs[region] = simstr

    utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)  # shenanigan is to handle zero-length regional seqs

    mseqs = {}  # region : list of mutated sequences, one per leaf
    for ireg in range(len(utils.regions)):  # NOTE kind of sketchy just using index in <utils.regions> (although it just depends on the loop immediately above a.t.m.)
        if cmdfos[ireg] is None:
            mseqs[utils.regions[ireg]] = ['' for _ in range(n_leaves)]  # return an empty string for each leaf node
        else:
            tmp_names, tmp_seqs = self.read_bppseqgen_output(cmdfos[ireg], n_leaves)
            if reco_event.leaf_names is None:  # the first region that was actually simulated sets the leaf names
                reco_event.leaf_names = tmp_names
            assert reco_event.leaf_names == tmp_names  # enforce different regions having same name + ordering (although this is already enforced when reading bppseqgen output)
            mseqs[utils.regions[ireg]] = tmp_seqs

    assert len(reco_event.final_seqs) == 0
    for iseq in range(n_leaves):
        seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
        seq = reco_event.revert_conserved_codons(seq, debug=self.args.debug)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(seq)  # set final sequence in reco_event
        reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))  # separate codon positions for each sequence, because of shm indels

    self.add_shm_indels(reco_event)
    reco_event.setline(irandom)  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow
    # NOTE(review): the two validation calls below are commented out in the source; presumably disabled on purpose -- confirm before re-enabling
    # self.check_tree_simulation(mean_total_height, regional_heights, chosen_tree, scaled_trees, regional_naive_seqs, mseqs, reco_event)
    # self.print_validation_values()

    if self.args.debug:
        print ' tree passed to bppseqgen:'
        print treeutils.get_ascii_tree(dendro_tree=reco_event.tree, extra_str=' ')
        utils.print_reco_event(reco_event.line, extra_str=' ')
def run_treesim(self, seed, outfname, workdir): if self.args.debug or utils.getsuffix(outfname) == '.nwk': print ' generating %d tree%s,' % ( self.args.n_trees, utils.plural(self.args.n_trees)), if self.args.constant_number_of_leaves: print 'all with %s leaves' % str(self.args.n_leaves) else: print 'n-leaves from %s' % ( 'hist in parameter dir' if self.final_nldist == 'hist' else '%s distribution with parameter %s' % (self.final_nldist, str(self.args.n_leaves))) if self.args.debug: print ' mean branch lengths from %s' % ( self.parameter_dir if self.parameter_dir is not None else 'scratch') for mtype in [ 'all', ] + utils.regions: print ' %4s %7.3f (ratio %7.3f)' % ( mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean']) ages, treestrs = [], [] cmd_lines = [] pkgname = 'TreeSim' # TreeSimGM when root_mrca_weibull_parameter is set, otherwise TreeSim if self.args.root_mrca_weibull_parameter is not None: pkgname += 'GM' cmd_lines += ['require(%s, quietly=TRUE)' % pkgname] cmd_lines += ['set.seed(' + str(seed) + ')'] for itree in range(self.args.n_trees): n_leaves = self.choose_n_leaves() age = self.choose_full_sequence_branch_length() ages.append(age) if n_leaves == 1: # add singleton trees by hand treestrs.append('t1:%f;' % age) continue treestrs.append(None) # NOTE these simulation functions seem to assume that we want all the extant leaves to have the same height. Which is kind of weird. Maybe makes more sense at some point to change this. params = {'n': n_leaves, 'numbsim': self.n_trees_each_run} if self.args.root_mrca_weibull_parameter is None: fcn = 'sim.bd.taxa.age' params['lambda'] = 1 # speciation_rate params['mu'] = 0.5 # extinction_rate params['age'] = age else: fcn = 'sim.taxa' params['distributionspname'] = '"rweibull"' params[ 'distributionspparameters'] = 'c(%f, 1)' % self.args.root_mrca_weibull_parameter params[ 'labellivingsp'] = '"t"' # TreeSim doesn't let you do this, but a.t.m. 
this is their default cmd_lines += [ 'trees <- %s(%s)' % (fcn, ', '.join( ['%s=%s' % (k, str(v)) for k, v in params.items()])) ] cmd_lines += [ 'write.tree(trees[[1]], \"' + outfname + '\", append=TRUE)' ] if None not in treestrs: # if every tree has one leaf, we don't need to run R open(outfname, 'w').close() else: if os.path.exists(outfname): os.remove(outfname) utils.run_r( cmd_lines, workdir, print_time='tree generation' if self.args.debug else None) with open(outfname) as treefile: for itree, tstr in enumerate(treestrs): if tstr is None: treestrs[itree] = treefile.readline().strip() if None in treestrs: raise Exception( 'didn\'t read enough trees from %s: still %d empty places in treestrs' % (outfname, treestrs.count(None))) # rescale branch lengths (TreeSim lets you specify the number of leaves and the height at the same time, but TreeSimGM doesn't, and TreeSim's numbers are usually a little off anyway... so we rescale everybody) for itree in range(len(ages)): treestrs[itree] = '(%s):0.0;' % treestrs[itree].rstrip( ';' ) # the trees it spits out have non-zero branch length above root (or at least that's what the newick strings turn into when dendropy reads them), which is f****d up and annoying, so here we add a new/real root at the top of the original root's branch treestrs[itree] = treeutils.rescale_tree(ages[itree], treestr=treestrs[itree]) return ages, treestrs