def prob_locus_coal_recon_topology(tree, recon, locus_tree, n, daughters):
    """
    Log probability of a reconciled gene tree ('tree', 'recon') under the
    coalescent model, given a locus tree.

    tree, recon -- gene tree and its reconciliation within 'locus_tree'
    locus_tree  -- locus tree
    n           -- population sizes (handed to coal.init_popsizes)
    daughters   -- set of daughter nodes of 'locus_tree'

    Starts from the value of coal.pmrt() and, for every daughter, subtracts
    the value of coal.cdf_mrca_bounded_multicoal() for the subtree hanging
    below that daughter, evaluated at the timestamp of the daughter's parent.
    Returns -util.INF as soon as one of those terms equals -util.INF.
    """
    branch_popsizes = coal.init_popsizes(locus_tree, n)
    branch_lineages = coal.count_lineages_per_branch(tree, recon, locus_tree)
    timestamps = treelib.get_tree_timestamps(locus_tree)

    log_prob = coal.pmrt(
        tree, recon, locus_tree, branch_popsizes, lineages=branch_lineages)

    def collect(node, counts, tips):
        """Gather the tips (true leaves or nested daughters) below 'node'."""
        if node.is_leaf():
            counts[node.name] = branch_lineages[node][0]
            tips.add(node)
            return
        for kid in node.children:
            if kid not in daughters:
                collect(kid, counts, tips)
                continue
            # a nested daughter closes the subtree and contributes one lineage
            counts[kid.name] = 1
            tips.add(kid)

    for dnode in daughters:
        counts = {}
        tips = set()
        collect(dnode, counts, tips)

        bound_time = timestamps[dnode.parent]
        term = coal.cdf_mrca_bounded_multicoal(
            counts, bound_time, locus_tree, branch_popsizes,
            sroot=dnode, sleaves=tips, stimes=timestamps)

        if term == -util.INF:
            return -util.INF
        log_prob -= term

    return log_prob
def show_coal_track2(tree_track):
    """
    Draw a track of trees in a new summon window and return the window.

    tree_track -- iterable of ((start, end), tree) items

    For each item, every node with more than one child contributes a
    horizontal line from 'start' to 'end' at the node's timestamp, colored
    by the fraction of the tree's leaves found below that node.  A filled
    background box spanning the item is drawn up to its oldest such node.
    A "click" hotspot covering the whole track prints the mouse position.
    """
    win = summon.Window()

    bgcolor = (1, 1, 1, .1)
    cmap = util.rainbow_color_map(low=0.0, high=1.0)

    maxage = 0
    # Right edge of the last region; stays 0 for an empty track so that the
    # hotspot below can still be created (the loop variable would otherwise
    # be unbound).
    last_end = 0
    for (start, end), tree in tree_track:
        last_end = end
        # single-argument call form prints identically on Python 2 and 3
        print(start)

        prims = []
        times = treelib.get_tree_timestamps(tree)
        nleaves = len(tree.leaves())
        region_maxage = 0
        for node in tree:
            if len(node.children) > 1:
                age = times[node]
                freq = len(node.leaves()) / float(nleaves)
                prims.extend([color(*cmap.get(freq)), start, age, end, age])
                if age > region_maxage:
                    region_maxage = age

        win.add_group(group(lines(*prims), color(*bgcolor),
                            box(start, 0, end, region_maxage, fill=True)))
        if region_maxage > maxage:
            maxage = region_maxage

    def func():
        x, y = win.get_mouse_pos()
        print("pos=%s age=%f" % (util.int2pretty(int(x)), y))

    win.add_group(hotspot("click", 0, 0, last_end, maxage, func))

    win.home("exact")

    return win
def show_coal_track2(tree_track):
    """
    Draw a track of trees in a new summon window and return the window.

    tree_track -- iterable of ((start, end), tree) items

    For each item, every node with more than one child contributes a
    horizontal line from 'start' to 'end' at the node's timestamp, colored
    by the fraction of the tree's leaves found below that node.  A filled
    background box spanning the item is drawn up to its oldest such node.
    A "click" hotspot covering the whole track prints the mouse position.
    """
    win = summon.Window()

    bgcolor = (1, 1, 1, .1)
    cmap = util.rainbow_color_map(low=0.0, high=1.0)

    maxage = 0
    # Right edge of the last region; stays 0 for an empty track so that the
    # hotspot below can still be created (the loop variable would otherwise
    # be unbound).
    last_end = 0
    for (start, end), tree in tree_track:
        last_end = end
        print(start)

        prims = []
        times = treelib.get_tree_timestamps(tree)
        nleaves = len(tree.leaves())
        region_maxage = 0
        for node in tree:
            if len(node.children) > 1:
                age = times[node]
                freq = len(node.leaves()) / float(nleaves)
                prims.extend([color(*cmap.get(freq)), start, age, end, age])
                if age > region_maxage:
                    region_maxage = age

        win.add_group(group(lines(*prims), color(*bgcolor),
                            box(start, 0, end, region_maxage, fill=True)))
        if region_maxage > maxage:
            maxage = region_maxage

    def func():
        x, y = win.get_mouse_pos()
        print("pos=%s age=%f" % (util.int2pretty(int(x)), y))

    win.add_group(hotspot("click", 0, 0, last_end, maxage, func))

    win.home("exact")

    return win
def test_cdf_bmc(self):
    """
    Check coal.cdf_mrca_bounded_multicoal against simulation.

    The exponentiated return value is compared (tolerance .05) with the
    fraction of sampled multicoalescent trees whose root timestamp is
    below the bound T.
    """
    stree = treelib.parse_newick(
        "((A:1000, B:1000):500, (C:700, D:700):800);")
    n = 1000
    T = 2000
    nsamples = 5000
    gene_counts = dict.fromkeys(stree.leaf_names(), 1)

    # analytic value
    p = exp(coal.cdf_mrca_bounded_multicoal(gene_counts, T, stree, n))

    def sampled_root_age():
        """Timestamp of the root of one freshly sampled tree."""
        tree, _recon = coal.sample_multicoal_tree(stree, n)
        return treelib.get_tree_timestamps(tree)[tree.root]

    # empirical value
    hits = sum(1 for _ in xrange(nsamples) if sampled_root_age() < T)
    p2 = hits / float(nsamples)

    fequal(p, p2, .05)
def prob_locus_coal_recon_topology(tree, recon, locus_tree, n, daughters):
    """
    Evaluate dlcoal.dlcoalc.prob_locus_coal_recon_topology for a reconciled
    gene tree.

    The gene tree, locus tree, reconciliation, population sizes, locus
    timestamps and daughters are converted to flat lists (ordered like the
    node lists returned by dlcoal.make_ptree), wrapped with c_list and
    handed to the C routine.  Its return value is returned unchanged.
    """
    # array forms of the two trees
    gene_parents, gene_nodes, _gene_lookup = dlcoal.make_ptree(tree)
    locus_parents, locus_nodes, locus_lookup = dlcoal.make_ptree(locus_tree)

    recon_array = dlcoal.make_recon_array(
        tree, recon, gene_nodes, locus_lookup)

    # per-locus-node values, in locus_nodes order
    popsizes_by_name = compbio.coal.init_popsizes(locus_tree, n)
    popsize_list = [popsizes_by_name[lnode.name] for lnode in locus_nodes]

    locus_times = treelib.get_tree_timestamps(locus_tree)
    time_list = [locus_times[lnode] for lnode in locus_nodes]

    daughter_ids = [locus_lookup[lnode] for lnode in daughters]

    c_gene_parents = c_list(c_int, gene_parents)
    c_recon = c_list(c_int, recon_array)
    c_locus_parents = c_list(c_int, locus_parents)
    c_popsizes = c_list(c_double, popsize_list)
    c_times = c_list(c_double, time_list)
    c_daughters = c_list(c_int, daughter_ids)

    return dlcoal.dlcoalc.prob_locus_coal_recon_topology(
        c_gene_parents, len(gene_nodes), c_recon,
        c_locus_parents, 0, len(locus_nodes), c_popsizes, c_times,
        c_daughters, len(daughters))
def dlcoal_sims(outdir, nsims, stree, n, duprate, lossrate,
                start=0,
                freq=1.0, freqdup=.05, freqloss=.05,
                steptime=None, nsteps=100, full_log=False,
                **options):
    """
    Run simulations 'start' .. 'nsims' - 1 and write each one under 'outdir'.

    outdir   -- output directory (file names come from phylo.phylofile)
    nsims    -- exclusive upper bound of the simulation index
    stree, n, duprate, lossrate, freq, freqdup, freqloss
             -- forwarded unchanged to sample_dlcoal_hem
    start    -- first simulation index
    steptime -- step time forwarded to sample_dlcoal_hem; when None it is
                the timestamp of the species tree root divided by 'nsteps'
    nsteps   -- only used to derive 'steptime' when that is None
    full_log -- when true, extinct lineages are kept (keep_extinct=True)
                and a ".locus.info" file recording the hemiplasy flag of
                the full locus tree is written for every simulation
    options  -- extra keyword arguments forwarded to sample_dlcoal_hem
    """
    if steptime is None:
        stimes = treelib.get_tree_timestamps(stree)
        steptime = stimes[stree.root] / float(nsteps)

    for i in xrange(start, nsims):
        outfile = phylo.phylofile(outdir, str(i), "")
        util.makedirs(os.path.dirname(outfile))
        # single-argument call form prints identically on Python 2 and 3
        print("simulating %s" % (outfile,))

        # sample a new tree from DLCoal model
        coal_tree, ex = sample_dlcoal_hem(
            stree, n, duprate, lossrate,
            freq, freqdup, freqloss, steptime,
            keep_extinct=full_log,
            **options)

        # write datastructures
        dlcoal.write_dlcoal_recon(outfile, coal_tree, ex)

        if full_log:
            full_logfile = phylo.phylofile(outdir, str(i), ".locus.info")
            full_locus_tree = ex["full_locus_tree"]
            ex2 = generate_extras(stree, full_locus_tree)
            daughters = ex2["daughters"]

            # Compute the flag before opening the file so that a failure
            # does not leave an empty log behind, and let 'with' close the
            # handle even if the write raises.
            hem = int(is_locus_tree_hemiplasy(full_locus_tree, daughters))
            with open(full_logfile, "w") as out:
                out.write("hem\t%d\n" % hem)
def sample_multilocus_tree(stree, n, leaf_counts=None,
                           daughters=None,
                           namefunc=None):
    """
    Returns a gene tree from a multilocus coalescent process

    stree       -- tree the genes coalesce within
    n           -- population size (int or dict)
                   If n is a dict it must map from species name to
                   population size
    leaf_counts -- dict mapping leaf name to number of genes sampled there
                   (default: one gene per leaf)
    daughters   -- collection of daughter nodes of 'stree'
                   (default: no daughters)
    namefunc    -- function mapping a leaf name to a new gene name
                   (default: "<leaf>_<counter>", counter starting at 1)

    Returns the pair (tree, recon) produced by
    coal.coal_cond_lineage_counts.
    """
    # A fresh empty set per call instead of a shared mutable default.
    if daughters is None:
        daughters = set()

    # initialize vector for how many genes per extant species
    if leaf_counts is None:
        leaf_counts = dict((l, 1) for l in stree.leaf_names())

    # initialize function for generating new gene names
    if namefunc is None:
        spcounts = dict((l, 1) for l in stree.leaf_names())

        def namefunc(sp):
            name = sp + "_" + str(spcounts[sp])
            spcounts[sp] += 1
            return name

    stimes = treelib.get_tree_timestamps(stree)

    # initialize population sizes
    popsizes = coal.init_popsizes(stree, n)

    # init lineage counts: one two-slot list per node; leaves get their
    # gene count in slot 0, daughters get 1 in slot 1
    lineages = {stree.root: [None, None]}
    for node in stree.leaves():
        lineages[node] = [leaf_counts[node.name], None]
    for node in daughters:
        if node not in lineages:
            lineages[node] = [None, 1]
        else:
            lineages[node][1] = 1

    def get_subtree(node, leaves, leaf_counts2):
        """collects info of subtree rooted at node"""
        if node.is_leaf():
            leaves.add(node)
            leaf_counts2[node.name] = leaf_counts[node.name]
        else:
            for child in node.children:
                if child in daughters:
                    # a nested daughter terminates this subtree and
                    # contributes a single lineage
                    leaves.add(child)
                    leaf_counts2[child.name] = 1
                else:
                    get_subtree(child, leaves, leaf_counts2)

    # loop through subtrees
    for snode in chain(daughters, [stree.root]):
        # determine leaves of the coal subtree
        leaves = set()
        leaf_counts2 = {}
        get_subtree(snode, leaves, leaf_counts2)

        # the subtree is bounded by the time of its parent, if any
        if snode.parent:
            T = stimes[snode.parent]
        else:
            T = None

        # calc table
        prob_counts = coal.calc_prob_counts_table(
            leaf_counts2, T, stree, popsizes,
            sroot=snode, sleaves=leaves, stimes=stimes)

        # sample lineage counts
        try:
            coal.sample_lineage_counts(snode, leaves, popsizes, stimes, T,
                                       lineages, prob_counts)
        except Exception:
            # dump debugging context, then let the error propagate
            print(snode.name)
            treelib.draw_tree_names(stree, maxlen=8)
            util.print_dict(lineages, key=lambda x: x.name)
            raise

    # sample coal times
    tree, recon = coal.coal_cond_lineage_counts(
        lineages, stree.root, set(stree.leaves()),
        popsizes, stimes, None, namefunc)

    return tree, recon
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               nsamples=100,
                               add_spec=True, info=None):
    """
    Probability of a reconciled gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughter nodes of the locus tree
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    nsamples     -- number of samples used for the duplication times
    add_spec     -- not used by this function
    info         -- optional dict; receives the keys "duploss_prob",
                    "daughters_prob", "coal_prob" and "prob"

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    Returns dl_prob + d_prob + coal_prob - log(nsamples).
    """
    # every locus branch takes the population size of its species branch
    species_popsizes = coal.init_popsizes(stree, n)
    popsizes = dict(
        (lnode.name, species_popsizes[locus_recon[lnode].name])
        for lnode in locus_tree)

    # duploss probability
    dl_prob = duploss.prob_dup_loss(
        locus_tree, stree, locus_recon, locus_events,
        duprate, lossrate)

    # daughters probability: log(.5) per duplication
    d_prob = phylo.count_dup(locus_tree, locus_events) * log(.5)

    # integrate over duplication times using sampling
    stimes = treelib.get_tree_timestamps(stree)
    prob = prob_locus_coal_recon_topology_samples(
        coal_tree, coal_recon,
        locus_tree, locus_recon, locus_events, popsizes,
        stree, stimes,
        daughters, duprate, lossrate, nsamples,
        pretime, premean)

    # logging info
    wants_info = info is not None
    if wants_info:
        info["duploss_prob"] = dl_prob
        info["daughters_prob"] = d_prob
        info["coal_prob"] = prob

    total = dl_prob + d_prob + prob - log(nsamples)
    if wants_info:
        info["prob"] = total

    return total
def sample_dup_times(tree, stree, recon, birth, death,
                     pretime=None, premean=None, events=None):
    """
    Sample duplication times for a gene tree in the dup-loss model

    tree    -- gene tree
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- forwarded to sample_dup_times_subtree
    death   -- forwarded to sample_dup_times_subtree
    pretime -- time above the species tree root used when the gene tree
               root is a pre-speciation duplication; sampled when None
    premean -- mean of the exponential distribution 'pretime' is sampled
               from when it is None
    events  -- dict mapping gene tree nodes to "spec", "dup" or "gene";
               computed with phylo.label_events when None

    Returns the 'times' dict keyed by gene tree node.  Speciation and gene
    nodes are set here; duplication nodes are presumably filled in by
    sample_dup_times_subtree.

    Raises Exception when the root is a pre-speciation duplication and
    neither 'pretime' nor 'premean' is given.
    """
    if events is None:
        events = phylo.label_events(tree, recon)

    # get species tree timestamps
    stimes = treelib.get_tree_timestamps(stree)

    # init timestamps for gene tree
    times = {}

    # set pretimes
    if events[tree.root] != "spec":
        if recon[tree.root] != stree.root:
            # tree root is a dup within species tree
            snode = recon[tree.root]
            start_time = stimes[snode.parent]
            time_span = start_time - stimes[snode]
        else:
            # tree root is a pre-spec dup
            if pretime is None:
                if premean is None:
                    raise Exception("must set pre-mean")

                pretime = 0.0
                while pretime == 0.0:
                    # 1.0 forces true division: with an integer premean,
                    # Python 2 would otherwise compute a rate of 0
                    pretime = random.expovariate(1.0 / premean)
            start_time = stimes[stree.root] + pretime
            time_span = pretime

        sample_dup_times_subtree(times, start_time, time_span,
                                 tree.root, recon, events,
                                 stree, birth, death)

    # set times
    for node in tree.preorder():
        if events[node] == "spec":
            # set speciation time
            times[node] = stimes[recon[node]]

        elif (events[node] == "dup" and
              node.parent is not None and
              recon[node] != recon[node.parent]):
            # set duplication times within duplication subtree
            # node is duproot
            snode = recon[node]
            start_time = stimes[snode.parent]
            time_span = start_time - stimes[snode]
            sample_dup_times_subtree(times, start_time, time_span,
                                     node, recon, events,
                                     stree, birth, death)

        elif events[node] == "gene":
            times[node] = 0.0

    return times
def sample_dup_times(tree, stree, recon, birth, death,
                     pretime=None, premean=None, events=None):
    """
    Sample duplication times for a gene tree in the dup-loss model

    NOTE: Implied speciation nodes must be present

    tree    -- gene tree
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- forwarded to sample_dup_times_subtree
    death   -- forwarded to sample_dup_times_subtree
    pretime -- time above the species tree root used when the gene tree
               root is a pre-speciation duplication; sampled when None
    premean -- mean of the exponential distribution 'pretime' is sampled
               from when it is None
    events  -- dict mapping gene tree nodes to "spec", "dup" or "gene";
               computed with phylo.label_events when None

    Returns the 'times' dict keyed by gene tree node.  Speciation and gene
    nodes are set here; duplication nodes are presumably filled in by
    sample_dup_times_subtree.

    Raises Exception when the root is a pre-speciation duplication and
    neither 'pretime' nor 'premean' is given.
    """
    if events is None:
        events = phylo.label_events(tree, recon)

    # get species tree timestamps
    stimes = treelib.get_tree_timestamps(stree)

    # init timestamps for gene tree
    times = {}

    # set pretimes
    if events[tree.root] != "spec":
        snode = recon[tree.root]
        if snode != stree.root:
            # tree root is a dup within species tree
            start_time = stimes[snode.parent]
            time_span = snode.dist
        else:
            # tree root is a pre-spec dup
            if pretime is None:
                if premean is None:
                    raise Exception("must set pre-mean")

                pretime = 0.0
                while pretime == 0.0:
                    # 1.0 forces true division: with an integer premean,
                    # Python 2 would otherwise compute a rate of 0
                    pretime = random.expovariate(1.0 / premean)
            start_time = stimes[stree.root] + pretime
            time_span = pretime

        sample_dup_times_subtree(times, start_time, time_span,
                                 tree.root, recon, events,
                                 stree, birth, death)

    # set times
    for node in tree.preorder():
        if events[node] == "spec":
            # set speciation time
            start_time = times[node] = stimes[recon[node]]
            if node.parent:
                if times[node] > times[node.parent]:
                    # a node dated above its parent is only reported,
                    # not treated as an error
                    print("bad %s" % (node.name,))

            # set duplication times within duplication subtree
            for duproot in node.children:
                if events[duproot] == "dup":
                    snode = recon[duproot]
                    time_span = snode.dist
                    sample_dup_times_subtree(times, start_time, time_span,
                                             duproot, recon, events,
                                             stree, birth, death)
        elif events[node] == "gene":
            times[node] = 0.0

    return times
def sample_dup_times(tree, stree, recon, birth, death,
                     pretime=None, premean=None, events=None):
    """
    Sample duplication times for a gene tree in the dup-loss model

    NOTE: Implied speciation nodes must be present

    tree    -- gene tree
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- forwarded to sample_dup_times_subtree
    death   -- forwarded to sample_dup_times_subtree
    pretime -- time above the species tree root used when the gene tree
               root is a pre-speciation duplication; sampled when None
    premean -- mean of the exponential distribution 'pretime' is sampled
               from when it is None
    events  -- dict mapping gene tree nodes to "spec", "dup" or "gene";
               computed with phylo.label_events when None

    Returns the 'times' dict keyed by gene tree node.  Speciation and gene
    nodes are set here; duplication nodes are presumably filled in by
    sample_dup_times_subtree.

    Raises Exception when the root is a pre-speciation duplication and
    neither 'pretime' nor 'premean' is given.
    """
    if events is None:
        events = phylo.label_events(tree, recon)

    # get species tree timestamps
    stimes = treelib.get_tree_timestamps(stree)

    # init timestamps for gene tree
    times = {}

    # set pretimes
    if events[tree.root] != "spec":
        snode = recon[tree.root]
        if snode != stree.root:
            # tree root is a dup within species tree
            start_time = stimes[snode.parent]
            time_span = snode.dist
        else:
            # tree root is a pre-spec dup
            if pretime is None:
                if premean is None:
                    raise Exception("must set pre-mean")

                pretime = 0.0
                while pretime == 0.0:
                    # 1.0 forces true division: with an integer premean,
                    # Python 2 would otherwise compute a rate of 0
                    pretime = random.expovariate(1.0 / premean)
            start_time = stimes[stree.root] + pretime
            time_span = pretime

        sample_dup_times_subtree(times, start_time, time_span,
                                 tree.root, recon, events,
                                 stree, birth, death)

    # set times
    for node in tree.preorder():
        if events[node] == "spec":
            # set speciation time
            start_time = times[node] = stimes[recon[node]]
            if node.parent:
                if times[node] > times[node.parent]:
                    # a node dated above its parent is only reported,
                    # not treated as an error
                    print("bad %s" % (node.name,))

            # set duplication times within duplication subtree
            for duproot in node.children:
                if events[duproot] == "dup":
                    snode = recon[duproot]
                    time_span = snode.dist
                    sample_dup_times_subtree(times, start_time, time_span,
                                             duproot, recon, events,
                                             stree, birth, death)
        elif events[node] == "gene":
            times[node] = 0.0

    return times
lossrate = 0.12 # events/lineages/myr gentime = 0.1 # yr / gen popsizes = 2 * 1e6 * gentime / 1e6 # "normalized popsize" = 2 (diploid) * Ne * yr/gen * myr/yr (first 1e6 is the pop size) subrate = 5e-9 / gentime * 1e6 # sub/site/myr = sub/site/gen * gen/yr * yr/myr\ rates, freqs, alphas = pllprob.optimize_parameters(alnfile, partfile, coal_tree_treefix, threads=1, seed=ALIGNMENT_SEED, eps=1) nsamples_coal = 1 nsamples_locus = 1 times = treelib.get_tree_timestamps(stree) pretime = None premean = 0.5 * times[stree.root] p_raxml = prob_locus_gene_species_alignment_recon(alnfile, partfile, stree, popsizes, duprate, lossrate, subrate, pretime, premean, coal_tree_raxml, coal_recon_raxml, nsamples_coal,