def sample_dlcoal_no_ifix(stree, n, freq, duprate, lossrate, freqdup, freqloss,
                          forcetime, namefunc=lambda x: x,
                          remove_single=True, name_internal="n", minsize=0):
    """
    Draw a coalescent gene tree from the DLCoal model using the new simulator.

    A locus tree is simulated with sim_DLILS_gene_tree and re-drawn until it
    has at least `minsize` leaves.  If the locus tree has more than one node,
    it is expanded with locus_to_logged_tree and a coalescent tree is sampled
    inside it; otherwise (total extinction) a coalescent tree consisting of a
    single root is returned.

    Returns (coal_tree, extra), where extra is a dict with the keys
    "locus_tree", "locus_recon", "locus_events", "coal_tree", "coal_recon"
    and "daughters".

    NOTE: `namefunc` is accepted but is not used anywhere in this function.
    """

    # keep drawing locus trees until one is large enough
    big_enough = False
    while not big_enough:
        locus_tree, locus_extras = sim_DLILS_gene_tree(
            stree, n, freq, duprate, lossrate, freqdup, freqloss, forcetime)
        big_enough = len(locus_tree.leaves()) >= minsize

    if len(locus_tree.nodes) > 1:  # TODO: check 1 value
        # simulate coalescence inside the expanded ("logged") locus tree
        logged_locus_tree, logged_extras = locus_to_logged_tree(
            locus_tree, popsize=n)
        daughters = logged_extras[0]
        pops = logged_extras[1]
        log_recon = logged_extras[2]

        def coal_name(logname):
            # <reconciled name>_<logged node name>
            return log_recon[logname] + '_' + str(logname)

        coal_tree, coal_recon = dlcoal.sample_locus_coal_tree(
            logged_locus_tree, n=pops, daughters=daughters,
            namefunc=coal_name)
        treelib.assert_tree(coal_tree)

        # clean up coal tree
        if remove_single:
            treelib.remove_single_children(coal_tree)
            phylo.subset_recon(coal_tree, coal_recon)
    else:
        # total extinction: the coalescent tree is just a root
        coal_tree = treelib.Tree()
        coal_tree.make_root()
        coal_recon = {coal_tree.root: locus_tree.root}
        daughters = set()

    if name_internal:
        for tree in (coal_tree, locus_tree):
            dlcoal.rename_nodes(tree, name_internal)

    # store extra information
    ### TODO: update this now that we're using logged locus tree,
    ###       new sample function
    extra = dict(locus_tree=locus_tree,
                 locus_recon=locus_extras['recon'],
                 locus_events=locus_extras['events'],
                 coal_tree=coal_tree,
                 coal_recon=coal_recon,
                 daughters=daughters)
    return coal_tree, extra
def debug_test3(): stree = treelib.read_tree('examples/nbin.stree') # run from ../ of this directory for node in stree: node.dist *= 1e7 # gen per myr popsize = 2e7 freq = 1e0 dr = .0000012 / 1e7 #.0012/1e7 lr = .0000011 / 1e7 #.0006/1e7 freqdup = freqloss = .05 forcetime = 1e7 for node in stree: print node.name, node.dist, len(node.children) print locus_tree, locus_extras = sim_DLILS_gene_tree(stree, popsize, freq, \ dr, lr, \ freqdup, freqloss, \ forcetime) for node in locus_tree: print node.name, node.dist, len(node.children) print logged_locus_tree, logged_extras = locus_to_logged_tree(locus_tree, popsize) daughters = logged_extras[0] pops = logged_extras[1] coal_tree, coal_recon = dlcoal.sample_locus_coal_tree(logged_locus_tree, n=pops, daughters=daughters, namefunc=lambda x: logged_extras[2][x] + '_' + str(x)) #begin debug print coal_tree.leaf_names() try: # print set(coal_tree) - set(coal_tree.postorder()) treelib.assert_tree(coal_tree) except AssertionError: print 'assertion error thrown on coal_tree being a proper tree' from rasmus import util hd= util.hist_dict(x.name for x in coal_tree.postorder()) for key in hd.keys(): print key if hd[key]>1 else '', print print len(coal_tree.nodes) - len(list(coal_tree.postorder()))
def test(self):
    """
    Test a tree search.

    Applies one random SPR move to a six-leaf tree and checks the tree is
    still valid, then repeatedly checks that TreeSearchSpr.next() changes
    the topology hash and that revert() restores it.
    """
    tree = parse_newick("((a,b),((c,d),(e,f)))")

    # a single proposed-and-performed SPR must leave a well-formed tree
    node_a, node_b = phylo.propose_random_spr(tree)
    phylo.perform_spr(tree, node_a, node_b)
    treelib.assert_tree(tree)

    for _ in xrange(100):
        hash_before = phylo.hash_tree(tree)

        search = phylo.TreeSearchSpr(tree)
        search.next()
        hash_after = phylo.hash_tree(tree)
        self.assertNotEqual(hash_before, hash_after)

        # undoing the proposal must give back the starting topology
        search.revert()
        self.assertEqual(phylo.hash_tree(tree), hash_before)
def sim_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime):
    """
    Runs a relaxed fixation assumption simulation on a species tree.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.

    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    stree is the initial species tree; it is modified in place by the
      simulator and is also the returned object
    popsize is the population size (assmpt. 1)
    freq is the allele frequency (assmpt. 7)
    dr is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lr is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freqdup is the duplication effect (assmpt. 4)
    freqloss is the loss effect (assmpt. 5)
    forcetime is the maximum time between frequency changes (assmpt. 6)

    Returns stree (unchanged if dr + lr <= 0).
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq and freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup and freqdup <= 1.0
    assert 0.0 <= freqloss and freqloss <= 1.0
    assert forcetime >= 0.0

    if dr + lr <= 0.0:
        return stree # no duplications or losses => stree is final tree

    # note: the use of < instead of <= is intentional
    #  if lr==0, duprate/fullrate==1, and random() returns from [0.0,1.0)
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate

    # Remove `node` if it has no children, then repeat on its parent.
    def remove_duds(node):
        if len(node.children) == 0:
            parent = node.parent
            stree.remove(node)
            return remove_duds(parent) if parent else None
            # if parent:
            #     remove_duds(parent)

    # Walk along the branch above `node` starting at `walk_time`, with allele
    # frequency `p`; recurses for every event on the branch and, at the end
    # of the branch, into the node's children.
    def sim_walk(node, p, walk_time=0.0, time_until_force=forcetime):
        debugprint(" Sim on branch" + str(node.name) + " with frequency " + str(p) + " and walk time " + str(walk_time))
        if p <= 0.0:
            debugprint(" Extinction on branch " + str(node.name))
            parent = node.parent
            stree.remove_tree(node) # extinction event
            # NOTE(review): if node is the root, parent is presumably None and
            # remove_duds(parent) would fail on `.children` -- confirm.
            remove_duds(parent)
            return
        elif p >= 1.0: # sanity check
            p = 1.0

        # event rates are scaled by the current frequency
        eff_dr = dr * p # * popsize #??
        eff_lr = lr * p # * popsize #??
        eff_bothr = eff_dr + eff_lr
        event_time = stats.exponentialvariate(eff_bothr)

        if event_time >= min(time_until_force, node.dist - walk_time): # >= ok?
            # do not process D/L event; determine whether at force or new node
            if time_until_force < node.dist - walk_time:
                # force new frequency
                newp = coal.sample_freq_CDF(p, popsize, forcetime * 1e6)
                  # scale forcetime to years (in myr)
                debugprint(" Forced new frequency: " + str(newp))
                ## TODO: may wish to log newp in node.data
                new_walk_time = walk_time + time_until_force
                return sim_walk(node, newp, walk_time=new_walk_time)
                  # continue walk with new frequency
                  # increase walk_time accordingly
                  # reset time_until_force to forcetime
            else:
                # finish node, determine whether to continue walking on children
                newp = coal.sample_freq_CDF(p, popsize, \
                                            (node.dist - walk_time) * 1e6)
                  # scale remaining time into years (from myr)
                node.data['freq'] = newp
                  # stores frequency of allele at the speciation event
                debugprint(" Completed branch; new frequency: " + str(newp))
                return node.recurse(sim_walk, newp)
        else:
            # process D/L event
            # no WF updates for these events (modelling decision)
            new_walk_time = walk_time + event_time
            new_time_until_force = time_until_force - event_time
            if event_is_dup(eff_dr, eff_bothr):
                # perform duplication event
                new_node = treelib.TreeNode(stree.new_name())
                  # create a node new_node for the duplication event
                stree.add_child(node.parent, new_node) # add the dup node
                subtree_copy = treelib.subtree(stree, node)
                  # make a copy of node's subtree (dup tree)
                stree.remove(node) # pull node off of parent node
                stree.add_child(new_node, node) # attach node to dup node
                stree.add_tree(new_node, subtree_copy) # attach dup copy
                new_node.dist = new_walk_time # set dist to dup
                node.dist = node.dist - new_walk_time # correct for dup dist
                subtree_copy.root.dist = node.dist # also correct for dup dist
                new_node.data['freq'] = p # set frequency at dup event
                debugprint(" Duplication occurred at walk time " + str(new_walk_time))
                # both recursive walks restart at walk_time 0.0 because the
                # branch lengths were shortened above
                sim_walk(node, p, time_until_force = new_time_until_force)
                  # recurse on remainder of original branch
                sim_walk(subtree_copy.root, freqdup, time_until_force = new_time_until_force)
                  # recurse on dup tree with correct starting frequency
                return
            else:
                # perform loss event
                newp = p - freqloss
                debugprint(" Loss occurred at walk time " + str(new_walk_time) + " yielding new frequency " + str(newp))
                return sim_walk(node, newp, walk_time=new_walk_time, \
                                time_until_force=new_time_until_force)

    # Splice out one non-root node that has exactly one child (adding its
    # branch length to the child), then restart the scan until none remain.
    def remove_single_child_nodes():
        callagain = False
        for node in stree.preorder():
            if node.parent and len(node.children) == 1:
                parent = node.parent
                child = node.children[0]
                newdist = node.dist + child.dist
                stree.remove(node)
                stree.add_child(parent, child)
                child.dist = newdist
                callagain = True
                break
        if callagain:
            remove_single_child_nodes()

    # main code
    sim_walk(stree.root, freq)
    remove_single_child_nodes()
    return stree # poor nomenclature; this will be fixed in v2.1
def sim_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime):
    """
    Runs a relaxed fixation assumption simulation on a species tree.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.

    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    stree is the initial species tree; it may be mutated by the simulator
    popsize is the population size (assmpt. 1)
    freq is the allele frequency (assmpt. 7)
    dr is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lr is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freqdup is the duplication effect (assmpt. 4)
    freqloss is the loss effect (assmpt. 5)
    forcetime is the maximum time between frequency changes (assmpt. 6)

    Returns a newly built gene tree restricted to leaves whose stored
    frequency is positive, or a copy of stree if dr + lr <= 0.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq and freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup and freqdup <= 1.0
    assert 0.0 <= freqloss and freqloss <= 1.0
    assert forcetime >= 0.0

    if dr + lr <= 0.0:
        return stree.copy() # no duplications or losses => stree is final tree

    # note: the use of < instead of <= is intentional
    #  if lr==0, duprate/fullrate==1, and random() returns from [0.0,1.0)
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate

    # Walk along species branch `snode`, growing gene tree `gtree` below
    # `gnode`.  s_walk_time is the time already walked on the species branch,
    # g_walk_time the time accumulated since the last gene-tree node was
    # created, and p the current allele frequency.
    def sim_walk(gtree, snode, gnode, p, s_walk_time=0.0, g_walk_time=0.0, \
                 time_until_force=forcetime):
        # debugprint(" Sim on branch" + str(node.name) + " with frequency " + str(p) + " and walk time " + str(walk_time))
        # debugprint(" walking on " + str(gnode.name))
        if p <= 0.0:
            # gnode is 'parent' of extinct node
            # create new_gnode, set data['freq'] = 0.0
            # prune at the end
            new_gnode = treelib.TreeNode(gtree.new_name())
            new_gnode.dist = g_walk_time
            new_gnode.data['freq'] = 0.0
            gtree.add_child(gnode, new_gnode)
            # debugprint(" extinction on " + str(gnode.name))
        else: # put everything else in this block to avoid using returns
            p = min(p, 1.0) # sanity check
            # event rates are scaled by the current frequency
            eff_dr = dr * p # * popsize #??
            eff_lr = lr * p # * popsize #??
            eff_bothr = eff_dr + eff_lr
            event_time = stats.exponentialvariate(eff_bothr)
            remaining_s_dist = snode.dist - s_walk_time
            if event_time >= min(time_until_force, remaining_s_dist):
                # do not process D/L event; determine whether at force or speciation
                if time_until_force < remaining_s_dist:
                    # force new frequency
                    newp = coal.sample_freq_CDF(p, popsize, forcetime * 1e6)
                      # scale forcetime to years (in myr)
                    # debugprint(" Forced new frequency: " + str(newp))
                    ## TODO: may wish to log newp in node.data
                    new_s_walk_time = s_walk_time + time_until_force
                    new_g_walk_time = g_walk_time + time_until_force
                    sim_walk(gtree, snode, gnode, newp, \
                             s_walk_time=new_s_walk_time, \
                             g_walk_time=new_g_walk_time)
                      # continue walk with new frequency
                      # increase walk_times accordingly
                      # reset time_until_force to forcetime
                else:
                    # speciation event
                    newp = coal.sample_freq_CDF(p, popsize, remaining_s_dist * 1e6)
                      # scale remaining time into years (from myr)
                    new_gnode = treelib.TreeNode(gtree.new_name())
                    new_gnode.dist = g_walk_time + remaining_s_dist
                    new_gnode.data['freq'] = newp
                      # stores frequency of allele at the speciation event
                    gtree.add_child(gnode, new_gnode)
                    # debugprint(" Completed branch; new frequency: " + str(newp))
                    # start a fresh walk (all times at defaults) on each child
                    for schild in snode.children:
                        sim_walk(gtree, schild, new_gnode, newp)
                    # return # shouldn't be necessary
            else:
                # process D/L event
                # no WF updates for these events (modelling decision)
                new_s_walk_time = s_walk_time + event_time
                new_g_walk_time = g_walk_time + event_time
                new_time_until_force = time_until_force - event_time
                if event_is_dup(eff_dr, eff_bothr):
                    # perform duplication event
                    new_gnode = treelib.TreeNode(gtree.new_name())
                      # create a node new_gnode for the duplication event
                    new_gnode.dist = new_g_walk_time # set dist to dup
                    new_gnode.data['freq'] = p # set frequency at dup event
                    # debugprint(" Duplication occurred at walk time " + str(new_walk_time))
                    gtree.add_child(gnode, new_gnode)
                    # g_walk_time is left at its default (0.0) in both calls
                    # below because a new gene-tree node was just created
                    # debugprint(" starting on orig of " + str(new_gnode.name))
                    sim_walk(gtree, snode, new_gnode, p, \
                             s_walk_time=new_s_walk_time, \
                             time_until_force = new_time_until_force)
                      # recurse on remainder of original branch
                    # debugprint(" starting on dup of " + str(new_gnode.name))
                    sim_walk(gtree, snode, new_gnode, freqdup, \
                             s_walk_time=new_s_walk_time, \
                             time_until_force = new_time_until_force)
                      # recurse on dup tree with correct starting frequency
                    # return
                else:
                    # perform loss event
                    newp = max(p - freqloss, 0.0) # sanity check
                    # debugprint(" Loss occurred at walk time " + str(new_walk_time) + " yielding new frequency " + str(newp))
                    sim_walk(gtree, snode, gnode, newp, \
                             s_walk_time=new_s_walk_time, \
                             g_walk_time=new_g_walk_time, \
                             time_until_force=new_time_until_force)

    # main code

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    sim_walk(gtree, stree.root, gtree.root, freq) # should mutate gtree

    # remove dead branches and single children (inside last method)
    # note that the simplifyRoot argument was added to the treelib methods
    #  so that gtree.root.dist is always equal to 0.0 (and this allows the
    #  root to have a single child)
    # if this behavior is undesired later, we can simply remove the argument
    #  and the root will be collapsed (and have >0 dist)
    extant_leaves = []
    for leaf in gtree.leaves():
        if leaf.data['freq'] > 0.0:
            extant_leaves.append(leaf.name)
    gtree = treelib.subtree_by_leaf_names(gtree, extant_leaves, \
                                          simplifyRoot=False)

    return gtree
def sim_DLILS_gene_tree(stree, popsize, freq, dr, lr,
                        freqdup, freqloss, forcetime):
    """
    Runs a relaxed fixation assumption simulation on a species tree.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.

    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    stree is the initial species tree; it may be mutated by the simulator
    popsize is the population size (assmpt. 1)
    freq is the allele frequency (assmpt. 7)
    dr is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lr is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freqdup is the duplication effect (assmpt. 4)
    freqloss is the loss effect (assmpt. 5)
    forcetime is the maximum time between frequency changes (assmpt. 6)

    Update: 30 July 2010
     Returns the pair (gene/locus tree, extras).  In the general case extras
     is whatever generate_extras(stree, gtree) returns; callers read its
     'recon' and 'events' entries.  When dr + lr <= 0 the locus tree is a
     copy of stree and extras is built directly from phylo.reconcile and
     phylo.label_events, so the return value is a pair in every case.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq and freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup and freqdup <= 1.0
    assert 0.0 <= freqloss and freqloss <= 1.0
    assert forcetime >= 0.0

    if dr + lr <= 0.0:
        # no duplications or losses => stree is the final tree.
        # Previously a bare tree was returned here, which broke callers that
        # unpack (tree, extras); this mirrors the no-event special case of
        # sample_locus_tree_hem.
        locus_tree = stree.copy()
        recon = phylo.reconcile(locus_tree, stree, lambda x: x)
        events = phylo.label_events(locus_tree, recon)
        return locus_tree, {"recon": recon,
                            "events": events,
                            "daughters": set()}

    # note: the use of < instead of <= is intentional
    #  if lr==0, duprate/fullrate==1, and random() returns from [0.0,1.0)
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate

    def sim_walk(gtree, snode, gnode, p, s_walk_time=0.0, g_walk_time=0.0,
                 time_until_force=forcetime, eventlog=None):
        ### Most of the variables are obvious from descriptions in sim_tree
        ###  or similar.
        ### eventlog is a log of events along the gtree branch; each entry
        ###  has the form
        ###    (time_on_branch, event_type, frequency, species_node),
        ###  where 0.0 <= time_on_branch <= branch_node.dist
        ###  event_type is one of {'extinction', 'frequency', 'speciation',
        ###    'duplication', 'loss', 'root', 'gene', 'daughter'}, where
        ###    'root' is a unique event not added during the sim_walk process
        ###  frequency is the branch frequency at the event time
        ###  species_node is the name of the node of the species tree branch
        ###    in which the event occurs
        ### eventlog defaults to None rather than [] because the list is
        ###  appended to and stored in node.data['log']; a shared default
        ###  list would leak events between calls and between trees.
        if eventlog is None:
            eventlog = []

        if p <= 0.0:
            ## EXTINCTION EVENT
            # gnode is 'parent' of extinct node
            new_gnode = treelib.TreeNode(gtree.new_name())
            new_gnode.dist = g_walk_time
            new_gnode.data['freq'] = 0.0
            gtree.add_child(gnode, new_gnode)
            # close the branch log with the extinction event
            eventlog.append((g_walk_time, 'extinction', 0.0, snode.name))
            new_gnode.data['log'] = eventlog
        else: # put everything else in this block to avoid using returns
            p = min(p, 1.0) # sanity check
            # event rates are scaled by the current frequency
            eff_dr = dr * p # * popsize #??
            eff_lr = lr * p # * popsize #??
            eff_bothr = eff_dr + eff_lr
            event_time = stats.exponentialvariate(eff_bothr)
            remaining_s_dist = snode.dist - s_walk_time

            if event_time >= min(time_until_force, remaining_s_dist):
                # do not process D/L event; determine whether at force or
                #  speciation
                if time_until_force < remaining_s_dist:
                    ## FREQUENCY UPDATE EVENT
                    # sample a new frequency (no longer scaled to years)
                    newp = coal.sample_freq_CDF(p, popsize, forcetime)
                    # TODO: if we decide not to reset time_until_force at
                    #  speciation events, the newp generation will need to be
                    #  altered in some form (probably using a new variable)
                    new_s_walk_time = s_walk_time + time_until_force
                    new_g_walk_time = g_walk_time + time_until_force
                    eventlog.append(
                        (new_g_walk_time, 'frequency', newp, snode.name))
                    # continue the walk on the same branch (same log) with
                    #  a reset forcetime
                    sim_walk(gtree, snode, gnode, newp,
                             s_walk_time=new_s_walk_time,
                             g_walk_time=new_g_walk_time,
                             eventlog=eventlog)
                elif gnode.data['log'][-1][1] != 'root':
                    ## SPECIATION EVENT (gnode is not a fresh root)
                    # sample a new frequency (no longer scaled to years)
                    newp = coal.sample_freq_CDF(p, popsize, remaining_s_dist)
                    # create new_gnode for this event
                    new_gnode = treelib.TreeNode(gtree.new_name())
                    new_g_walk_time = g_walk_time + remaining_s_dist
                    new_gnode.dist = new_g_walk_time
                    new_gnode.data['freq'] = newp
                    gtree.add_child(gnode, new_gnode)
                    # close the branch log and attach it to the new node
                    if snode.is_leaf():
                        # end of walk on species branch
                        eventlog.append(
                            (new_g_walk_time, 'gene', newp, snode.name))
                        new_gnode.data['log'] = eventlog
                    else:
                        eventlog.append(
                            (new_g_walk_time, 'speciation', newp, snode.name))
                        new_gnode.data['log'] = eventlog
                        # every child branch starts with its own empty log
                        for schild in snode.children:
                            sim_walk(gtree, schild, new_gnode, newp,
                                     eventlog=[])
                        # TODO: if we decide not to reset time_until_force at
                        #  speciation events, this sim_walk call will need
                        #  updating
                else:
                    ## SPECIATION EVENT at the root
                    # the root keeps its own log; whatever was passed in as
                    #  eventlog is not used here
                    gnode.data['log'].append(
                        (0.0, 'speciation', p, snode.name))
                    for schild in snode.children:
                        sim_walk(gtree, schild, gnode, p, eventlog=[])
            else:
                # process D/L event
                # no WF updates for these events (modelling decision)
                new_s_walk_time = s_walk_time + event_time
                new_g_walk_time = g_walk_time + event_time
                new_time_until_force = time_until_force - event_time
                if event_is_dup(eff_dr, eff_bothr):
                    ## DUPLICATION EVENT
                    new_gnode = treelib.TreeNode(gtree.new_name())
                    new_gnode.dist = new_g_walk_time
                    new_gnode.data['freq'] = p
                    gtree.add_child(gnode, new_gnode)
                    # close the branch log and attach it to the new node
                    eventlog.append(
                        (new_g_walk_time, 'duplication', p, snode.name))
                    new_gnode.data['log'] = eventlog
                    # recurse on remainder of original branch
                    sim_walk(gtree, snode, new_gnode, p,
                             s_walk_time=new_s_walk_time,
                             time_until_force=new_time_until_force,
                             eventlog=[])
                    # recurse on dup tree with correct starting frequency;
                    #  the leading 'daughter' entry is used for daughter
                    #  detection
                    sim_walk(gtree, snode, new_gnode, freqdup,
                             s_walk_time=new_s_walk_time,
                             time_until_force=new_time_until_force,
                             eventlog=[(0.0, 'daughter', freqdup, snode.name)])
                else:
                    ## LOSS EVENT
                    newp = max(p - freqloss, 0.0) # sanity check
                    eventlog.append(
                        (new_g_walk_time, 'loss', newp, snode.name))
                    sim_walk(gtree, snode, gnode, newp,
                             s_walk_time=new_s_walk_time,
                             g_walk_time=new_g_walk_time,
                             time_until_force=new_time_until_force,
                             eventlog=eventlog)

    # main code

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    gtree.root.data['log'] = [(0.0, 'root', freq, stree.root.name)]
    sim_walk(gtree, stree.root, gtree.root, freq) # should mutate gtree

    # keep only leaves whose allele is still present (freq > 0), retaining
    #  single-child nodes for now so that their logs are still available
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]
    gtree = treelib.subtree_by_leaf_names(gtree, extant_leaves,
                                          keep_single=True)
    remove_single_children(gtree) # allows for correct logging of events

    extras = generate_extras(stree, gtree)

    return gtree, extras
def test_sample_censored_coal(self):
    """Sample a capped censored coalescent tree and assert it is a valid tree."""
    sampled = coal.sample_censored_coal_tree(10, 1000, 300, capped=True)
    # the sampler returns a (tree, lineages) pair; only the tree is checked
    tree, lineages = sampled
    treelib.assert_tree(tree)
def test_sample_coal_tree(self):
    """Sample a coalescent tree and assert it is a valid tree."""
    treelib.assert_tree(coal.sample_coal_tree(10, 1000))
def sample_dlcoal_hem(stree, n, duprate, lossrate,
                      freq, freqdup, freqloss, steptime,
                      namefunc=lambda x: x,
                      keep_extinct=False,
                      remove_single=True, name_internal="n", minsize=0):
    """
    Sample a gene tree from the DLCoal model with hemiplasy.

    A locus tree is drawn with sample_locus_tree_hem and re-drawn until it
    has at least `minsize` leaves.  If the locus tree has more than one node,
    it is expanded with locus_to_logged_tree and a multilocus coalescent tree
    is sampled inside it; otherwise (total extinction) the coalescent tree is
    a single root.

    Returns (coal_tree, extra), where extra is a dict with the keys
    "locus_tree", "locus_recon", "locus_events", "coal_tree", "coal_recon",
    "daughters" and, when keep_extinct is true, "full_locus_tree".

    NOTE: `namefunc` is accepted but is not used anywhere in this function.
    """

    # keep drawing locus trees until one is large enough
    big_enough = False
    while not big_enough:
        locus_tree, locus_extras = sample_locus_tree_hem(
            stree, n, duprate, lossrate,
            freq, freqdup, freqloss, steptime,
            keep_extinct=keep_extinct)
        big_enough = len(locus_tree.leaves()) >= minsize

    if len(locus_tree.nodes) > 1:  # TODO: check 1 value
        # simulate coalescence inside the expanded ("logged") locus tree
        logged_locus_tree, logged_extras = locus_to_logged_tree(
            locus_tree, popsize=n)
        daughters = logged_extras[0]
        pops = logged_extras[1]
        log_recon = logged_extras[2]

        def coal_name(logname):
            # <reconciled name>_<logged node name>
            return log_recon[logname] + '_' + str(logname)

        coal_tree, coal_recon = dlcoal.sim.sample_multilocus_tree(
            logged_locus_tree, n=pops, daughters=daughters,
            namefunc=coal_name)
        treelib.assert_tree(coal_tree)

        # clean up coal tree
        if remove_single:
            treelib.remove_single_children(coal_tree)
            phylo.subset_recon(coal_tree, coal_recon)
    else:
        # total extinction: the coalescent tree is just a root
        coal_tree = treelib.Tree()
        coal_tree.make_root()
        coal_recon = {coal_tree.root: locus_tree.root}
        daughters = set()

    if name_internal:
        for tree in (coal_tree, locus_tree):
            dlcoal.rename_nodes(tree, name_internal)

    # store extra information
    extra = dict(locus_tree=locus_tree,
                 locus_recon=locus_extras['recon'],
                 locus_events=locus_extras['events'],
                 coal_tree=coal_tree,
                 coal_recon=coal_recon,
                 daughters=daughters)
    if keep_extinct:
        extra["full_locus_tree"] = locus_extras["full_locus_tree"]

    return coal_tree, extra
def sample_locus_tree_hem(stree, popsize, duprate, lossrate,
                          freq=1.0, freqdup=.05, freqloss=.05,
                          steptime=1e6, keep_extinct=False):
    """
    Sample a locus tree with birth-death and hemiplasy

    Runs a relaxed fixation assumption simulation on a species tree.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.

    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    parameters:
    stree is the initial species tree; it may be mutated by the simulator
    popsize is the population size (assmpt. 1)
    duprate is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lossrate is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freq is the allele frequency (assmpt. 7)
    freqdup is the duplication effect (assmpt. 4)
    freqloss is the loss effect (assmpt. 5)
    steptime is the maximum time between frequency changes (assmpt. 6);
      it must be > 0
    keep_extinct, if true, adds a copy of the tree taken before pruning
      (including extinct lineages) to the extras as "full_locus_tree"

    Returns the locus tree, as well as extra information including a
    reconciliation dictionary and an events dictionary.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq and freq <= 1.0
    assert duprate >= 0.0
    assert lossrate >= 0.0
    assert 0.0 <= freqdup and freqdup <= 1.0
    assert 0.0 <= freqloss and freqloss <= 1.0
    assert steptime > 0.0

    # special case: no duplications or losses
    if duprate == 0.0 and lossrate == 0.0:
        locus_tree = stree.copy()
        recon = phylo.reconcile(locus_tree, stree, lambda x: x)
        events = phylo.label_events(locus_tree, recon)

        return locus_tree, {"recon": recon,
                            "events": events,
                            "daughters": set()}

    # note: strict < is intentional (and matches the other simulators):
    #  if lossrate == 0, duprate/fullrate == 1 and random() is in [0.0, 1.0),
    #  so a dup is always chosen; if duprate == 0 the ratio is 0 and a dup
    #  can never be chosen, which <= did not guarantee.
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate

    def sim_walk(gtree, snode, gparent, p,
                 s_walk_time=0.0, remaining_steptime=steptime,
                 daughter=False):
        """
        Grow one gene-tree branch below gparent along species branch snode.

        eventlog is a log of events along the gtree branch.
        Each entry has the form
          (time_on_branch, event_type, frequency, species_node),
        where
           0.0 <= time_on_branch <= branch_node.dist

        event_type is one of
           {'extinction', 'frequency', 'speciation', 'duplication',
            'loss', 'root', 'gene', 'daughter'},

           where 'root' is a unique event not added during the sim_walk
           process

        frequency is the branch frequency at the event time
        species_node is the name of the node of the species tree branch in
        which the event occurs
        """

        # create new node
        gnode = treelib.TreeNode(gtree.new_name())
        gtree.add_child(gparent, gnode)
        gnode.data = {"freq": p, "log": []}
        eventlog = gnode.data["log"]
        g_walk_time = 0.0
        if daughter:
            eventlog.append((0.0, 'daughter', freqdup, snode.name))

        # grow this branch, determine next event
        while True:
            if p <= 0.0:
                event = "extinct"
                break

            # Forget the previous iteration's event.  Without this reset, a
            # plain time step that follows a loss would be processed as a
            # second loss (and so on until extinction).
            event = None

            # determine remaining time
            remaining_s_dist = snode.dist - s_walk_time
            remaining_time = min(remaining_steptime, remaining_s_dist)

            # sample next dup/loss event
            # NOTE(review): these divide by freqdup and freqloss, while the
            #  asserts above permit 0.0 -- confirm callers never pass zero.
            eff_duprate = duprate * p / freqdup
            eff_lossrate = lossrate * p / freqloss
            eff_bothrate = eff_duprate + eff_lossrate
            event_time = stats.exponentialvariate(eff_bothrate)

            # advance times
            time_delta = min(event_time, remaining_time)
            s_walk_time += time_delta
            g_walk_time += time_delta

            # sample new frequency
            p = coal.sample_freq_CDF(p, popsize, time_delta)

            # determine event
            if event_time < remaining_time:
                # dup/loss occurs
                if event_is_dup(eff_duprate, eff_bothrate):
                    # dup, stop growing
                    event = "dup"
                    break
                else:
                    # loss, continue growing
                    event = "loss"
            elif remaining_s_dist < remaining_steptime:
                # we are at a speciation, stop growing
                event = "spec"
                break

            # process step
            if event == "loss":
                # LOSS EVENT
                p = max(p - freqloss, 0.0)
                remaining_steptime -= time_delta
                eventlog.append((g_walk_time, 'loss', p, snode.name))
            else:
                # NEXT TIME STEP
                remaining_steptime = steptime
                eventlog.append((g_walk_time, 'frequency', p, snode.name))

        # process event
        if event == "extinct":
            # EXTINCTION EVENT (p <= 0)
            gnode.dist = g_walk_time
            gnode.data['freq'] = 0.0
            eventlog.append((g_walk_time, 'extinction', 0.0, snode.name))

        elif event == "spec":
            # SPECIATION EVENT
            gnode.dist = g_walk_time
            gnode.data['freq'] = p

            # add speciation (or terminal 'gene') event to event log
            if snode.is_leaf():
                eventlog.append((g_walk_time, 'gene', p, snode.name))
            else:
                eventlog.append((g_walk_time, 'speciation', p, snode.name))
                for schild in snode.children:
                    sim_walk(gtree, schild, gnode, p)

        elif event == "dup":
            # DUPLICATION EVENT
            gnode.dist = g_walk_time
            gnode.data['freq'] = p
            eventlog.append((g_walk_time, 'duplication', p, snode.name))

            # recurse on mother
            sim_walk(gtree, snode, gnode, p,
                     s_walk_time=s_walk_time,
                     remaining_steptime=remaining_steptime)

            # recurse on daughter
            sim_walk(gtree, snode, gnode, freqdup,
                     s_walk_time=s_walk_time,
                     remaining_steptime=remaining_steptime,
                     daughter=True)

        else:
            raise Exception("unknown event '%s'" % event)

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    gtree.root.data['log'] = [(0.0, 'speciation', freq, stree.root.name)]

    # simulate locus tree (walks the first two children of the species root)
    sim_walk(gtree, stree.root.children[0], gtree.root, freq)
    sim_walk(gtree, stree.root.children[1], gtree.root, freq)

    # remove dead branches and single children
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]

    if keep_extinct:
        # snapshot the unpruned tree before extinct lineages are removed
        full_gtree = gtree.copy()
        # do deep copy of data
        for node in full_gtree:
            node2 = gtree.nodes[node.name]
            for key, val in node2.data.items():
                node.data[key] = copy.copy(val)

    # NOTE(review): the return value is ignored here, so this presumably
    #  prunes gtree in place -- confirm against treelib.
    treelib.subtree_by_leaf_names(gtree, extant_leaves, keep_single=True)
    remove_single_children(gtree)

    # determine extra information (recon, events, daughters)
    extras = generate_extras(stree, gtree)

    if keep_extinct:
        extras["full_locus_tree"] = full_gtree

    return gtree, extras