def prob_locus_coal_recon_topology_samples(
        coal_tree, coal_recon, locus_tree, locus_recon, locus_events,
        popsizes, stree, stimes, daughters, duprate, lossrate, nsamples,
        pretime=None, premean=None):
    """
    Estimate the coalescent topology term by sampling locus tree times.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree; its branch lengths are overwritten by every
                    sample drawn here
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    popsizes     -- population sizes, forwarded unchanged
    stree        -- species tree
    stimes       -- forwarded to the C implementation only
    daughters    -- daughter nodes
    duprate      -- duplication rate
    lossrate     -- loss rate
    nsamples     -- number of duplication-time samples
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    When the module-level `dlcoalc` is truthy the computation is delegated
    to coal.prob_locus_coal_recon_topology_samples.  Otherwise the value of
    prob_locus_coal_recon_topology is exponentiated for each sample,
    averaged over `nsamples`, and passed through util.safelog.
    """

    def resample_locus_dists():
        # draw duplication times and write them onto locus_tree as dists
        sampled_times = duploss.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate,
            pretime, premean,
            events=locus_events)
        treelib.set_dists_from_timestamps(locus_tree, sampled_times)

    if dlcoalc:
        # one draw of reasonable branch lengths, kept only for logging
        resample_locus_dists()

        # the C code does the actual integration
        return coal.prob_locus_coal_recon_topology_samples(
            coal_tree, coal_recon,
            locus_tree, locus_recon, locus_events, popsizes,
            stree, stimes,
            daughters, duprate, lossrate, nsamples, pretime, premean)

    # pure python fallback: plain Monte Carlo average
    total = 0.0
    for _ in xrange(nsamples):
        resample_locus_dists()
        log_p = prob_locus_coal_recon_topology(
            coal_tree, coal_recon, locus_tree, popsizes, daughters)
        total += exp(log_p)

    return util.safelog(total / nsamples)
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               maxdoom=20, nsamples=100,
                               add_spec=True):
    """
    Probability of a reconcile gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughter nodes
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    maxdoom      -- forwarded to spidir.calc_birth_death_prior
    nsamples     -- number of duplication-time samples to average over
    add_spec     -- if true, add implied speciation nodes to the locus tree
                    before computing anything else

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    Returns the sum of three log terms: the birth-death prior of the locus
    tree, dups * log(.5) for the daughters, and util.safelog of the
    sample-averaged exponentiated coalescent topology term.

    Note: the locus tree is modified in place.  Implied speciation nodes
          are added when add_spec is true (otherwise they must already be
          present), and its branch lengths are overwritten by each sample.
    """

    dups = phylo.count_dup(locus_tree, locus_events)

    # ensure implicit speciations are present
    if add_spec:
        phylo.add_implied_spec_nodes(locus_tree, stree,
                                     locus_recon, locus_events)

    # init popsizes for locus tree: each locus node takes the population
    # size of the species node it reconciles to
    stree_popsizes = coal.init_popsizes(stree, n)
    popsizes = {}
    for node in locus_tree:
        popsizes[node.name] = stree_popsizes[locus_recon[node].name]

    # duploss probability
    util.tic("top")
    dl_prob = spidir.calc_birth_death_prior(locus_tree, stree, locus_recon,
                                            duprate, lossrate,
                                            maxdoom=maxdoom)
    util.toc()

    # daughters probability
    d_prob = dups * log(.5)

    # integrate over duplication times using sampling
    prob = 0.0
    for _ in xrange(nsamples):
        # sample duplication times
        locus_times = spidir.topology_prior.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate,
            pretime, premean,
            events=locus_events)
        # every locus node must have received a time
        assert len(locus_times) == len(locus_tree.nodes), (
            len(locus_times), len(locus_tree.nodes))
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # coal topology probability (a log value, hence the exp)
        coal_prob = prob_coal_recon_topology(coal_tree, coal_recon,
                                             locus_tree, popsizes, daughters)
        prob += exp(coal_prob)

    return dl_prob + d_prob + util.safelog(prob / nsamples)
def prob_gene_species_alignment_recon(alnfile, partfile, stree, popsizes,
                                      duprate, lossrate, subrate, beta,
                                      pretime, premean,
                                      coal_tree, coal_recon, nsamples_coal,
                                      locus_tree, locus_recon, nsamples_locus,
                                      daughters, rates, freqs, alphas,
                                      threads=1, seed=ALIGNMENT_SEED, eps=0.1,
                                      info=None):
    """
    Evaluate terms that depend on T^G and R^G.

    That is, fix T^L, R^L, and daughters and evaluate the double integral:

        int int P(t^L | T^L, R^L, S, theta)
              * P(T^G, R^G, t^G | t^L, T^L, daughters, R^L, theta)
              * P(A | T^G, t^G) dt^L dt^G

    This is the probability we used in the searching process.

    alnfile        -- alignment file
    partfile       -- partition file
    stree          -- species tree
    popsizes       -- population sizes in species tree
    duprate        -- duplication rate
    lossrate       -- loss rate
    subrate        -- substitution rate (sampled coal tree branch lengths
                      are multiplied by it)
    beta           -- regularization parameter (multiplies the log
                      alignment probability)
    pretime        -- starting time before species tree
    premean        -- mean starting time before species tree
    coal_tree      -- coalescent tree
    coal_recon     -- reconciliation of coalescent tree to locus tree
    nsamples_coal  -- number of times to sample coal times t^G
    locus_tree     -- locus tree (has dup-loss)
    locus_recon    -- reconciliation of locus tree to species tree
    nsamples_locus -- number of times to sample the locus tree times t^L
    daughters      -- daughter nodes
    rates, freqs, alphas -- optimization parameters
    threads, seed, eps   -- forwarded to prob_alignment
    info           -- optional dict; if given, the keys "topology_prob" and
                      "alignment_prob" (both from the last t^L sample) and
                      "coal_prob" (the returned value) are set on it

    Returns the log of the Monte Carlo estimate of the double integral.

    Note: the branch lengths of locus_tree and coal_tree are overwritten by
          the samples drawn here.
    """

    locus_events = phylo.label_events(locus_tree, locus_recon)

    # one log probability per sample of t^L
    double_integral_list = []

    util.tic("recon prob")
    for _ in xrange(nsamples_locus):
        # sample t^L, the unit should be in myr
        locus_times = duploss.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate,
            pretime, premean,
            events=locus_events)
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # calculate P(T^G, R^G | T^L, t^L, daughters, theta)
        topology_prob = prob_locus_coal_recon_topology(
            coal_tree, coal_recon, locus_tree, popsizes, daughters)

        # reported through `info` as-is when no alignment term is evaluated
        alignment_prob_MonteCarlo = 0.0
        alignment_prob_list = []

        # check probability of lineage counts for this locus tree; the
        # counts are computed once because nothing in the scan below
        # changes the inputs of count_lineages_per_branch
        lineages = coal.count_lineages_per_branch(
            coal_tree, coal_recon, locus_tree)
        zero_lineage_prob = False
        for lnode in locus_tree:
            bottom_num, top_num = lineages[lnode]
            # the root branch is treated as infinitely long
            if lnode.parent:
                T = lnode.dist
            else:
                T = util.INF
            lineage_prob = prob_coal_counts(bottom_num, top_num, T, popsizes)
            if lineage_prob == 0.0:
                # a single impossible branch is enough
                zero_lineage_prob = True
                break

        if zero_lineage_prob:
            # if lineage_prob is zero, coal_prob is zero
            coal_prob = -float("inf")
        else:
            # for a fixed t^L, sample t^G and compute the probability of
            # observing the alignment using Monte Carlo integration
            for _ in xrange(nsamples_coal):
                # sample coal times and set the coal_tree accordingly
                # locus tree branch lengths are in myr
                # make sure the input popsizes are scaled to fit the time
                # unit (typically myr)
                try:
                    sample_coal_times_topology(
                        coal_tree, coal_recon, locus_tree, popsizes)
                except (ZeroDivisionError, ValueError):
                    # bad sample: contributes nothing to the sum but still
                    # counts in the nsamples_coal denominator below
                    util.log("bad sample")
                    continue

                # convert branch lengths from myr to sub/site
                for node in coal_tree:
                    node.dist *= subrate

                # (log) probability of observing the alignment,
                # scaled by the regularization parameter beta
                alignment_prob = beta * prob_alignment(
                    alnfile, partfile, coal_tree, rates, freqs, alphas,
                    threads=threads, seed=seed, eps=eps)
                alignment_prob_list.append(alignment_prob)

            # log_sum_exp exponentiates the log probabilities of observing
            # the alignment, adds them up, and takes the log again
            if not alignment_prob_list:
                # all bad samples
                alignment_prob_MonteCarlo = -util.INF
            else:
                alignment_prob_MonteCarlo = (
                    log_sum_exp(alignment_prob_list) - log(nsamples_coal))

            # P(T^G, R^G | T^L, t^L, daughters, theta)
            #   * int P(t^G | ~) * P(A | T^G, t^G) dt^G
            # coal_prob is a log probability
            coal_prob = topology_prob + alignment_prob_MonteCarlo

        # add coal probability to a list for further processing
        double_integral_list.append(coal_prob)

    # average over the t^L samples in log space
    double_integral = (
        log_sum_exp(double_integral_list) - log(nsamples_locus))

    # logging info
    if info is not None:
        info["topology_prob"] = topology_prob  # one sample of t^L
        # one sample of t^L, averaged over t^G
        info["alignment_prob"] = alignment_prob_MonteCarlo
        info["coal_prob"] = double_integral
    util.toc()

    return double_integral