def addVisited(conf, visited, tree, gene2species, thash=None):
    """Record one visit of `tree` in `visited` and emit optional diagnostics.

    `visited` maps a topology hash to a three-item list
    ``[logl, tree copy, visit count]``.  A hash seen for the first time gets
    a new entry built from ``tree.data["logl"]`` and ``tree.copy()``; a known
    hash only has its visit count incremented.

    conf         -- settings dict; the optional keys read here are
                    "correcthash", "searchtest", "debugtab" and
                    "debugtab_file"
    visited      -- dict updated in place
    tree         -- tree being recorded
    gene2species -- passed to phylo.hash_tree for the "species_hash" column
    thash        -- precomputed hash of `tree`; computed with
                    phylo.hash_tree(tree) when None
    """
    if thash is None:
        thash = phylo.hash_tree(tree)

    if thash not in visited:
        visited[thash] = [tree.data["logl"], tree.copy(), 1]
    else:
        visited[thash][2] += 1

    # the known-correct topology was just proposed
    if "correcthash" in conf and thash == conf["correcthash"]:
        debug("PROPOSED CORRECT TREE: visisted = ", len(visited))
        if conf["searchtest"]:
            # search-test mode ends the whole process once the target is hit
            drawTreeLogl(tree)
            sys.exit(0)

    if "debugtab_file" not in conf:
        return

    # append one row describing this tree to the debug table
    shash = phylo.hash_tree(tree, gene2species)
    if "correcthash" in conf:
        correct = (conf["correcthash"] == thash)
    else:
        correct = False

    table = conf["debugtab"]
    tabfile = conf["debugtab_file"]
    row = {"correct": correct,
           "logl": tree.data["logl"],
           "treelen": sum(x.dist for x in tree),
           "baserate": tree.data["baserate"],
           "error": tree.data["error"],
           "errorlogl": tree.data["errorlogl"],
           "eventlogl": tree.data["eventlogl"],
           "tree": tree.getOnelineNewick(),
           "topology": thash,
           "species_hash": shash}
    table.writeRow(tabfile, row)
def test_search(self):
    """Test all terms.

    For every id listed under test/data/flies, read the reference tree and
    alignment, run spidir.search_climb, and write the reference tree, the
    constructed tree and whether their topology hashes agree to
    test/output/all_terms_search/flies.txt.
    """

    prep_dir("test/output/all_terms_search")

    out = open("test/output/all_terms_search/flies.txt", "w")
    #out = sys.stderr

    # try/finally so the report file is closed even when a read or the
    # search itself raises part-way through the loop
    try:
        # NOTE(review): ids are listed from test/data/flies while the files
        # are read from test/data/flies.nt -- confirm this is intended
        treeids = os.listdir("test/data/flies")
        #treeids = ["3"]

        for treeid in treeids:
            tree_correct = read_tree("test/data/flies.nt/%s/%s.tree" %
                                     (treeid, treeid))
            align = read_fasta("test/data/flies.nt/%s/%s.align" %
                               (treeid, treeid))

            # canonical child order before drawing
            phylo.hash_order_tree(tree_correct)

            print >>out, treeid
            print >>out, "correct"
            drawTree(tree_correct, out=out)

            stree = read_tree("test/data/flies.norm.stree")
            gene2species = phylo.read_gene2species("test/data/flies.smap")
            params = spidir.read_params("test/data/flies.nt.param")
            birth = .4
            death = .39
            pretime = 1.0
            maxdoom = 20
            bgfreq = [.258,.267,.266,.209]
            kappa = 1.59

            genes = align.keys()
            seqs = align.values()

            tree = spidir.search_climb(genes, seqs,
                                       stree, gene2species,
                                       params, birth, death, pretime,
                                       bgfreq, kappa,
                                       maxdoom=maxdoom,
                                       niter=50, quickiter=100,
                                       nsamples=100, branch_approx=True)

            phylo.hash_order_tree(tree)

            print >>out, "constructed"
            drawTree(tree, out=out)

            print >>out, "is_correct:", (phylo.hash_tree(tree) ==
                                         phylo.hash_tree(tree_correct))
    finally:
        out.close()
def buildTree(conf, stree, gene2species):
    """Build one gene tree per distance matrix listed in conf["dist"].

    Each tree is produced by Spidir.spidir and written to
    Spidir.outTreeFile(conf).  When conf holds a "correcttree", its hash is
    stored in conf["correcthash"] and every built tree is drawn next to it
    and reported as correct or wrong (with the Robinson-Foulds error).

    Nothing is built when conf has no "dist" key.
    """
    params = Spidir.readParams(conf["param"])

    if "correcttree" in conf:
        conf["correcthash"] = phylo.hash_tree(conf["correcttree"])

    if "dist" not in conf:
        return

    for idx, distfile in enumerate(conf["dist"]):
        labels, distmat = phylip.read_dist_matrix(distfile)

        # read in different labels if needed
        if "labels" in conf:
            labelfile = conf["labels"][idx]
            labels = Spidir.readLabels(labelfile)
            conf["aln"] = fasta.read_fasta(conf["labels"][idx])

        tree, logl = Spidir.spidir(conf, distmat, labels, stree,
                                   gene2species, params)
        tree.write(Spidir.outTreeFile(conf))

        # test for correctness
        if "correcttree" not in conf:
            continue

        correctTree = conf["correcttree"]
        phylo.hash_order_tree(correctTree)
        phylo.hash_order_tree(tree)

        built_hash = phylo.hash_tree(tree)
        correct_hash = phylo.hash_tree(correctTree)

        # single-argument print(...) and print("") emit the same text as
        # the Python 2 print statement and bare print
        print("spidir: ")
        treelib.draw_tree(tree, maxlen=5, minlen=5)
        print("")

        print("correct:")
        treelib.draw_tree(correctTree, maxlen=5, minlen=5)
        print("")

        # the RF error is only computed for trees with more than 3 leaves
        rferror = 0.0
        if len(tree.leaves()) > 3:
            rferror = Spidir.robinson_foulds_error(correctTree, tree)

        if built_hash == correct_hash:
            print("CORRECT TREE FOUND")
        else:
            print("WRONG TREE FOUND (RF: %f)" % rferror)
def propose(chain, tree): tree2 = proposeFunc(conf, tree, distmat, labels, stree, gene2species, params, visited) # check visited dict thash = phylo.hash_tree(tree2) if thash in visited: logl, tree2, count = visited[thash] #this.nold += 1 else: Spidir.setTreeDistances(conf, tree2, distmat, labels) logl = Spidir.treeLogLikelihood(conf, tree2, stree, gene2species, params) this.nold = 0 addVisited(conf, visited, tree2, gene2species, thash) # best yet tree if logl > this.toplogl: printMCMC(conf, "%d:%d" % (chain.name, this.iter), tree2, stree, gene2species, visited) this.toplogl = logl this.toptree = tree2.copy() # move some other chains to best state #chains2 = sorted(chains, key=lambda x: x.logl) #for chain in chains2[:1]: # chain.state = this.toptree.copy() # chain.logl = this.toplogl # alter logl to influence search only #chain.relax = conf["speedup"] * this.nold return tree2, logl
def evalUserTree(tree):
    """Score a user-supplied tree and store it in `visited`.

    Branch lengths and log-likelihood are computed for `tree`, then its
    entry in `visited` is (re)written as [logl, copy of tree, count + 1],
    where count is the previous visit count or 0 for a new topology.
    At DEBUG_LOW the tree is also drawn with its reconciliation events.

    conf, distmat, labels, stree, gene2species, params and visited are free
    variables of this function, not parameters.
    """
    setTreeDistances(conf, tree, distmat, labels)
    logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

    thash = phylo.hash_tree(tree)

    # previous number of visits, 0 if the topology is new
    count = 0
    if thash in visited:
        _logl, _tree, count = visited[thash]
    visited[thash] = [logl, tree.copy(), count+1]

    if not isDebug(DEBUG_LOW):
        return

    debug("\nuser given tree:")
    recon = phylo.reconcile(tree, stree, gene2species)
    events = phylo.label_events(tree, recon)
    drawTreeLogl(tree, events=events)
def getProposals(conf, tree, distmat, labels, stree, gene2species, params,
                 visited, stuck=False):
    """Score every single NNI move on `tree` and return them best first.

    Each candidate edge is tried with change 0 and 1: the swap is applied
    to `tree` in place, a re-rooted copy is hashed and scored (or looked up
    in `visited`), and the swap is applied again to switch the branch back.

    Returns a list of [logl, edge, change] sorted by descending logl.
    Topologies already in `visited` are left out when `stuck` is true;
    topologies seen for the first time are always included.
    """
    # TODO: handle root edges

    # try all NNI
    # find edges for NNI: internal nodes that are neither the root nor a
    # child of the root, each paired with its parent ...
    edges = [(node, node.parent)
             for node in tree.nodes.values()
             if not node.isLeaf()
             and node != tree.root
             and node not in tree.root.children]
    # ... plus one edge made of the root's children
    edges.append(tuple(tree.root.children))

    treelib.drawTreeNames(tree, minlen=5, maxlen=5, out=sys.stderr)
    util.printcols(util.map2(lambda x: x.name, edges), out=sys.stderr)

    proposals = []
    for edge in edges:
        upper, lower = edge[0], edge[1]
        for change in (0,1):
            proposeNni(tree, upper, lower, change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)

            thash = phylo.hash_tree(tree2)
            seen_before = thash in visited
            if seen_before:
                visited[thash][2] += 1
                logl = visited[thash][0]
            else:
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                gene2species, params)
                visited[thash] = [logl, tree2, 1]

            # new topologies are always offered; known ones only while the
            # search is not stuck
            if not (seen_before and stuck):
                proposals.append([logl, edge, change])

            # switch branch back
            proposeNni(tree, upper, lower, change)

    proposals.sort(key=lambda x: x[0], reverse=True)
    return proposals
def searchExhaustive(conf, distmat, labels, tree, stree, gene2species, params,
                     depth=2, visited=None, visited2=None, topDepth=True,
                     toplogl=None, short=False):
    """Recursively explore NNI neighbours of `tree` up to `depth` moves away.

    depth    -- remaining recursion depth; below 1 the (copied) tree and its
                logl are returned immediately
    visited  -- dict: topology hash -> [logl, tree copy, count]; created
                when None and shared with the recursive calls
    visited2 -- dict: topology hash -> depth at which that topology was last
                expanded; a topology is expanded again only from a greater
                depth
    topDepth -- True for the outermost call only (recursive calls pass False)
    toplogl  -- one-element list holding the best logl seen so far, shared
                across the recursion so inner calls can update it
    short    -- when true, return as soon as a tree beats toplogl[0]

    Neighbours whose logl falls more than conf["eprune"] below the current
    best are not expanded further.

    Returns (tree, logl).  The outermost call returns the entry of `visited`
    picked by util.argmaxfunc on logl (presumably the maximum -- confirm);
    inner calls return (None, None) unless they exit early.
    """
    if visited == None:
        visited = {}
    if visited2 == None:
        visited2 = {}

    # work on a private copy; NNI moves below mutate it in place
    tree = tree.copy()

    # find initial logl
    thash = phylo.hash_tree(tree)
    if thash not in visited:
        Spidir.setTreeDistances(conf, tree, distmat, labels)
        logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                        gene2species, params)
        visited[thash] = [logl, tree.copy(), 1]

        drawTreeLogl(tree)
    else:
        logl = visited[thash][0]

    if toplogl == None:
        toplogl = [logl]

    # progress trace: indentation reflects depth, number is len(visited)
    debug(" " * (depth*2), "(%d)" % len(visited))
    sys.stdout.flush()

    if depth < 1:
        return tree, logl

    # try all NNI
    # find edges for NNI
    nodes = tree.nodes.values()
    nodes = filter(lambda x: not x.isLeaf() and
                             x != tree.root and \
                             x.parent != tree.root, nodes)
    edges = [(node, node.parent) for node in nodes]

    for edge in edges:
        for change in (0,1):
            proposeNni(tree, edge[0], edge[1], change)

            thash = phylo.hash_tree(tree)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                                gene2species, params)
                visited[thash] = [logl, tree.copy(), 1]
            else:
                logl = visited[thash][0]

            if logl > toplogl[0]:
                toplogl[0] = logl

                # NOTE(review): the early return leaves the NNI applied on
                # the local copy; confirm the else pairs with `if short`
                if short:
                    return tree, logl
                else:
                    printMCMC(conf, "N/A", tree, stree, gene2species, visited)

            # expand only topologies not yet expanded from this depth or
            # deeper, and only if they are within eprune of the best logl
            if (thash not in visited2 or \
                depth > visited2[thash]) and \
               logl - toplogl[0] >= conf["eprune"]:
                visited2[thash] = depth

                # dig deeper
                if depth > 1:
                    tree2, logl2 = searchExhaustive(conf, distmat, labels,
                                                    tree, stree, gene2species,
                                                    params,
                                                    depth=depth-1,
                                                    visited=visited,
                                                    visited2=visited2,
                                                    topDepth=False,
                                                    toplogl=toplogl,
                                                    short=short)

                    if short and tree2 != None:
                        return tree2, logl2

            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)

    # debug
    if topDepth:
        # indexing items relies on dict.items() returning a list (Python 2)
        items = visited.items()
        i = util.argmaxfunc(lambda x: x[1][0], items)
        thash, (logl, tree, count) = items[i]
        return tree, logl
    else:
        return None, None
def searchHillClimb(conf, distmat, labels, stree, gene2species, params,
                    initTree=None, visited=None):
    """Hill-climb over tree topologies using batches of NNI moves.

    Starts from `initTree`, or from a neighbor-joining tree that is
    re-rooted and given branch lengths when `initTree` is None.  Each of
    the conf["hilliters"] iterations ranks the NNI moves with getProposals,
    keeps a subset whose edges do not collide, and applies a batch of them;
    the batch size is halved via `heat` until a batch improves the logl or
    only one move is left.

    visited -- dict: topology hash -> [logl, tree, count]; created when None

    Returns (tree, logl) for the entry of `visited` picked by
    util.argmaxfunc on logl (presumably the maximum -- confirm).
    """

    if visited == None:
        visited = {}

    # init with NJ
    if initTree != None:
        tree = initTree
    else:
        #tree = bionj.bionj(labels=labels, distmat=distmat, verbose=False)
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score
    logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)

    stuck = False

    for i in range(conf["hilliters"]):
        printMCMC(conf, i, tree, stree, gene2species, visited)

        proposals = getProposals(conf, tree, distmat, labels,
                                 stree, gene2species, params, visited, stuck)

        # print each proposal as [logl, node name, parent name, change]
        util.printcols(map(lambda (a,(b,c),d): [a, b.name, c.name, d], proposals))
        print

        # determine which proposals to use
        edgeset = set()
        proposals2 = []
        for logl2, edge, change in proposals:
            if edge in edgeset:
                continue
            proposals2.append([logl2, edge, change])

            # block this edge and the two edges the accepted move touches
            edgeset.add((getNniUncle(tree, edge[0], edge[1]), edge[1]))
            edgeset.add((edge[0].children[change], edge[0]))
            edgeset.add((edge[0], edge[1]))

        util.printcols(map(lambda (a,(b,c),d): [a, b.name, c.name, d], proposals2))
        print

        heat = 1.0
        start = 0
        # NOTE(review): `start` is never advanced in this function; the loop
        # is left only through the `break` statements below
        while start < len(proposals2):
            nproposals = int(math.ceil(len(proposals2) * heat))

            # apply proposals
            for logl3, edge, change in proposals2[start:start+nproposals]:
                proposeNni(tree, edge[0], edge[1], change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)

            # calc likelihood
            thash = phylo.hash_tree(tree2)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl2 = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                 gene2species, params)
                stuck = False
            else:
                # NOTE(review): the cached value is overwritten by the
                # recomputation just below
                logl2 = visited[thash][0]

                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl2 = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                 gene2species, params)

                # a single move leading to a known topology: mark as stuck
                if nproposals == 1:
                    stuck = True

            addVisited(conf, visited, tree2, gene2species, 
                       thash)

            debug("logl2", logl2)

            # accept an improving batch
            if logl2 > logl:
                logl = logl2
                tree = tree2
                break

            # down to one move: accept it even without improvement
            if nproposals == 1:
                logl = logl2
                tree = tree2
                break

            heat *= .5

            # undo reversals
            for logl3, edge, change in util.reverse(proposals2[start:start+nproposals]):
                proposeNni(tree, edge[0], edge[1], change)

        # NOTE(review): `nproposals` is only bound inside the while loop, so
        # this looks unbound if proposals2 is empty on the first iteration
        debug("start:", start)
        debug("swaps:", nproposals)
        debug("heat:", heat)
        debug("stuck:", stuck)

    # indexing items relies on dict.items() returning a list (Python 2)
    items = visited.items()
    i = util.argmaxfunc(lambda x: x[1][0], items)
    thash, (logl, tree, count) = items[i]
    return tree, logl
def proposeTree3(conf, tree, distmat, labels, stree, gene2species, params,
                 visited):
    """Propose a tree by pruning one random subtree and regrafting it.

    A non-root node (`badgene`) is drawn with uniform weights, the tree is
    split at the edge above it, and the pruned part is regrafted at up to
    conf["regraftloop"] randomly ordered positions of the remaining tree.
    Every regrafted tree is recorded in `visited`.

    Returns the best-scoring tree found, or a copy of the input tree when no
    regraft beats tree.data["logl"].
    """
    toplogl = tree.data["logl"]
    toptree = tree.copy()

    # work on a copy; the input tree is left untouched
    tree = tree.copy()

    # nodes.remove needs a list, i.e. Python 2 dict.values()
    nodes = tree.nodes.values()
    nodes.remove(tree.root)
    weights = [1 for x in nodes] #[x.data["error"] for x in nodes]
    badgene = nodes[stats.sample(weights)]

    # determine distance from badgene to everyone else
    # NOTE(review): `dists` is only read by the commented-out sort further
    # down, so this computation currently has no effect on the result
    dists = util.Dict(default=-util.INF)
    def walk(node, dist):
        dists[node.name] = dist
        for child in node.children:
            walk(child, dist + child.dist)
    walk(badgene, 0)
    seen = set([badgene])
    node = badgene.parent
    dist = badgene.dist
    # climb towards the root, walking every sibling subtree not yet seen
    while node != None:
        for child in node.children:
            if child not in seen:
                walk(child, dist)
        seen.add(node)
        dist += node.dist
        node = node.parent

    tree1, tree2 = splitTree(tree, badgene, badgene.parent)

    # candidate regraft points: every node of tree1 except its root
    names = tree1.nodes.keys()
    names.remove(tree1.root.name)
    #names.sort(key=lambda x: dists[x])
    random.shuffle(names)

    for name in names[:min(len(names), conf["regraftloop"])]:
        tree = tree1.copy()
        node = tree.nodes[name]

        #print "p3>>", node.name, node.parent.name
        regraftTree(tree, tree2.copy(), node, node.parent)

        thash = phylo.hash_tree(tree)

        if thash not in visited:
            Spidir.setTreeDistances(conf, tree, distmat, labels)
            logl = Spidir.treeLogLikelihood(conf, tree, stree,
                                            gene2species, params)
        # NOTE(review): placed outside the `if` so repeat visits are counted
        # too; confirm the nesting against the original file
        addVisited(conf, visited, tree, gene2species, thash)
        # continue with the stored score and stored tree object
        logl, tree, count = visited[thash]

        if logl > toplogl:
            toplogl = logl
            toptree = tree

            # try returning immediately
            #return toptree

    assert toptree != None

    return toptree
def test_birth_death_single_sim(self):
    """test the single branch prior

    Simulates `ntrees` birth-death trees on one branch of length T, then
    passes the observed histograms of survivor counts and of topologies,
    together with the matching computed priors, to self.calc_fit.
    """

    duprate = 2.0
    lossrate = .5
    ntrees = 1000
    tabsize = 100
    T = 1.0

    # define species tree
    stree = treelib.parse_newick("(A:1);")
    def gene2species(gene):
        return gene[:1].upper()

    tops = []        # topology hash per simulated tree ("()" when doomed)
    survivors = []   # number of surviving leaves per simulated tree
    lookup = {}      # topology hash -> one tree with that topology

    # simulate gene trees
    util.tic("simulating %d trees" % ntrees)
    for _ in xrange(ntrees):
        tree, doom = birthdeath.sample_birth_death_tree(
            T, duprate, lossrate)

        if tree.root in doom:
            top = "()"
            nsurvive = 0
        else:
            rename_leaves(tree, stree, lambda x: "A")
            top = phylo.hash_tree(tree, gene2species)
            nsurvive = len(tree.leaves())

        tops.append(top)
        survivors.append(nsurvive)
        lookup[top] = tree
    util.toc()

    # setup test output
    outdir = "test/output/birthdeath_sim_simple"
    prep_dir(outdir)

    # histogram of topologies and survivors (# leaves)
    hist_tops = histtab(tops)
    hist_num = histtab(survivors)

    # compute survivor prior
    probs = [birthDeathCount(row["item"], T, duprate, lossrate)
             for row in hist_num]

    # compute topology priors
    probs_tops = []
    for row in hist_tops:
        tree = lookup[row["item"]]

        if tree.root.is_leaf():
            p = log(birthdeath.prob_birth_death1(
                0, T, duprate, lossrate))
        else:
            nhist = numTopologyHistories(tree.root)
            s = len(tree.leaves())
            # the arithmetic is kept exactly as written: under Python 2,
            # `/` truncates when both operands are ints
            thist = factorial(s) * factorial(s-1) / 2**(s-1)
            r = numRedunantTopology(tree.root, gene2species,
                                    all_leaves=True)
            p = log(r * nhist / thist * birthdeath.prob_birth_death1(
                s, T, duprate, lossrate))

        probs_tops.append(exp(p))

    self.calc_fit(outdir + "/sim_prior_ngenes", hist_num, probs)
    self.calc_fit(outdir + "/sim_prior_top", hist_tops, probs_tops)
def do_test_birth_death_gene_sim(self, stree, gene2species,
                                 duprate, lossrate,
                                 ntrees=10000, tabsize=30):
    """Perform a birth death gene tree simulation test

    Simulates `ntrees` gene trees inside `stree`, tabulates their topology
    hashes and computes the prior of each observed topology.  The C and
    Python prior implementations are compared with fequal along the way.

    Returns (hist, probs): the histogram table of topologies and the list
    of priors in the same row order.
    """

    doomtable = calcDoomTable(stree, duprate, lossrate)

    def rename_tree(tree, gene2species):
        """Rename leaves to "<species>.<k>" with k shuffled per species."""
        if len(tree.nodes) == 0:
            return

        species = [gene2species(name) for name in tree.leaf_names()]
        spcounts = util.hist_dict(species)

        # per species, a shuffled pool of the indices 1..count
        names = {}
        for sp, c in spcounts.items():
            pool = list(range(1, c+1))
            random.shuffle(pool)
            names[sp] = pool

        for node in tree.leaves():
            sp = gene2species(node.name)
            tree.rename(node.name, sp + "." + str(names[sp].pop()))

    tops = []     # topology hash per simulated tree
    lookup = {}   # topology hash -> (tree, recon, events)

    util.tic("simulating %d trees" % ntrees)
    for _ in xrange(ntrees):
        tree, recon, events = birthdeath.sample_birth_death_gene_tree(
            stree, duprate, lossrate,
            removeloss=True)
        phylo.add_implied_spec_nodes(tree, stree, recon, events)

        # a single node reconciled to the species root is recorded as "()"
        is_empty = (len(tree.nodes) == 1 and
                    recon[tree.root] == stree.root)
        if is_empty:
            top = "()"
            entry = (None, None, None)
        else:
            rename_tree(tree, gene2species)
            top = phylo.hash_tree(tree)
            entry = (tree, recon, events)

        tops.append(top)
        lookup[top] = entry
    util.toc()

    hist = histtab(tops)

    probs = []
    for row in hist:
        tree, recon, events = lookup[row["item"]]

        if tree is None:
            probs.append(exp(doomtable[-1]))
            continue

        p = c_calcBirthDeathPrior(tree, stree, recon,
                                  duprate, lossrate,
                                  events=events)
        p2 = calcBirthDeathPrior(tree, stree, recon,
                                 duprate, lossrate,
                                 events=events)
        # both implementations must agree
        fequal(p, p2)
        probs.append(exp(p))

    return hist, probs
def test_search(self):
    """Test all terms.

    For every id listed under test/data/flies, read the reference tree and
    alignment, run spidir.search_climb, and write the reference tree, the
    constructed tree and whether their topology hashes agree to
    test/output/all_terms_search/flies.txt.
    """

    prep_dir("test/output/all_terms_search")

    out = open("test/output/all_terms_search/flies.txt", "w")
    #out = sys.stderr

    # try/finally so the report file is closed even when a read or the
    # search itself raises part-way through the loop
    try:
        # NOTE(review): ids are listed from test/data/flies while the files
        # are read from test/data/flies.nt -- confirm this is intended
        treeids = os.listdir("test/data/flies")
        #treeids = ["3"]

        for treeid in treeids:
            tree_correct = read_tree("test/data/flies.nt/%s/%s.tree" %
                                     (treeid, treeid))
            align = read_fasta("test/data/flies.nt/%s/%s.align" %
                               (treeid, treeid))

            # canonical child order before drawing
            phylo.hash_order_tree(tree_correct)

            print >> out, treeid
            print >> out, "correct"
            drawTree(tree_correct, out=out)

            stree = read_tree("test/data/flies.norm.stree")
            gene2species = phylo.read_gene2species("test/data/flies.smap")
            params = spidir.read_params("test/data/flies.nt.param")
            birth = .4
            death = .39
            pretime = 1.0
            maxdoom = 20
            bgfreq = [.258, .267, .266, .209]
            kappa = 1.59

            genes = align.keys()
            seqs = align.values()

            tree = spidir.search_climb(genes, seqs,
                                       stree, gene2species,
                                       params, birth, death, pretime,
                                       bgfreq, kappa,
                                       maxdoom=maxdoom,
                                       niter=50, quickiter=100,
                                       nsamples=100, branch_approx=True)

            phylo.hash_order_tree(tree)

            print >> out, "constructed"
            drawTree(tree, out=out)

            print >> out, "is_correct:", (
                phylo.hash_tree(tree) == phylo.hash_tree(tree_correct))
    finally:
        out.close()