def test_ml_large(self):
    """Check HKY likelihood and ML branch fitting on the 19520 test tree.

    Reads the tree and nucleotide alignment from ``test/data/verts/19520``
    and verifies that neither the sequence likelihood nor a single
    iteration of ML branch-length fitting (``maxiter=1``, ``parsinit=False``)
    comes back as ``-util.INF``.
    """
    # params
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    # data
    tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
    align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")

    # likelihood of the tree exactly as read from disk
    loglk = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
    print(loglk)
    self.assertNotEqual(loglk, -util.INF)

    # one fitting iteration, fed the sequences in tree.leafNames() order
    seqs = util.mget(align, tree.leafNames())
    loglk = spidir.find_ml_branch_lengths_hky(
        tree, seqs, bgfreq, kappa, parsinit=False, maxiter=1)
    print(loglk)
    self.assertNotEqual(loglk, -util.INF)
def test_ml_large(self):
    """Check HKY likelihood and ML branch fitting on the 19520 test tree.

    Reads the tree and nucleotide alignment from ``test/data/verts/19520``
    and verifies that neither the sequence likelihood nor a single
    iteration of ML branch-length fitting (``maxiter=1``, ``parsinit=False``)
    comes back as ``-util.INF``.
    """
    # params
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    # data
    tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
    align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")

    # likelihood of the tree exactly as read from disk
    loglk = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
    print(loglk)
    self.assertNotEqual(loglk, -util.INF)

    # one fitting iteration, fed the sequences in tree.leafNames() order
    seqs = util.mget(align, tree.leafNames())
    loglk = spidir.find_ml_branch_lengths_hky(
        tree, seqs, bgfreq, kappa, parsinit=False, maxiter=1)
    print(loglk)
    self.assertNotEqual(loglk, -util.INF)
def _test_ml_speed(self):
    """Time ten ML branch-length fits on the flies test alignment.

    Makes no assertions; it only brackets the fits with ``util.tic`` /
    ``util.toc``.  The leading underscore presumably keeps the default
    unittest loader (which collects ``test*`` methods) from running it.
    """
    # params
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # data
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")
    align = fasta.readFasta("test/data/flies.nt/0/0.align")

    # fix the node order once so recorded branch lengths line up
    nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)
    dists = []
    likes = []

    util.tic("find ML")
    for _ in xrange(10):
        # the sequence lookup stays inside the timed region, as before
        seqs = util.mget(align, tree.leafNames())
        loglk = spidir.find_ml_branch_lengths_hky(
            tree, seqs, bgfreq, kappa, maxiter=10)
    # the single tic() above is closed once, after all ten fits
    util.toc()

    # keep the final branch lengths and the last likelihood
    dists.append([node.dist for node in nodes])
    likes.append(loglk)
def _test_ml_speed(self):
    """Time ten ML branch-length fits on the flies test alignment.

    Makes no assertions; it only brackets the fits with ``util.tic`` /
    ``util.toc``.  The leading underscore presumably keeps the default
    unittest loader (which collects ``test*`` methods) from running it.
    """
    # params
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # data
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")
    align = fasta.readFasta("test/data/flies.nt/0/0.align")

    # fix the node order once so recorded branch lengths line up
    nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)
    dists = []
    likes = []

    util.tic("find ML")
    for _ in xrange(10):
        # the sequence lookup stays inside the timed region, as before
        seqs = util.mget(align, tree.leafNames())
        loglk = spidir.find_ml_branch_lengths_hky(
            tree, seqs, bgfreq, kappa, maxiter=10)
    # the single tic() above is closed once, after all ten fits
    util.toc()

    # keep the final branch lengths and the last likelihood
    dists.append([node.dist for node in nodes])
    likes.append(loglk)
def test_calc_hky_seq_likelihood(self):
    """Check that ML branch fitting raises the HKY likelihood on a toy tree.

    Builds a five-leaf tree (A-E) from a Newick string together with a
    hard-coded alignment, computes the sequence likelihood for the tree as
    parsed, runs ``spidir.find_ml_branch_lengths_hky`` on it, and asserts
    that the value returned by the fit is strictly greater than the
    starting likelihood.
    """
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    tree = treelib.parse_newick(
        "((A:.1,B:.1):.1,((C:.1,D:.1):.2,E:.3):.1);")
    align = {
        "A": "CGCAGACAACTCCCCCGACCACACATAGTACGAAATCCTCAGCCGCTGCCGACTCCGACGCGCGGACTGTCCGGGTTCAGCGAGGCTTAAGAGAACGGCC",
        "B": "CCCAAACAACTCCCCCGACCAGACATAGTACGAGATCCTCAGCCACTGGCGACTCGGACGCGCAGAGTGTCCCGCTTAAGCGAGGCTGCAGAGAACGGCC",
        "C": "GGCCAGCAATTCCTCCGACCACGCATAGTACGAGATCGTCTGCCTCCTGCGAATCGGACGCGCAGAGTGTTCCGGTTAAGGGAGACTTCAGAGACCTGGC",
        "D": "CGCTAACAATTCCCCCGACCACACTGAGTACGAGATACTCGGACTCCGGCGATCTCTACTCGCAGAGAGTCCCACTTAAGCGAGACTGACGAGCACGGGC",
        "E": "ATTCTTCCACACCTGCGTGTTCGTCACGTATCAAATGCGGAGCCCACGTCCAATGGCACACGAACAGTCGGCCACGGAATCGCAGACTCGTTGACCAACG",
    }

    # the tree is drawn before and after the fit, presumably so any change
    # in branch lengths can be seen in the test output
    draw_tree(tree)
    initial_lk = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
    fitted_lk = spidir.find_ml_branch_lengths_hky(tree, align, bgfreq, kappa)
    draw_tree(tree)

    # %-formatting prints the same text under Python 2 and Python 3
    print("log lk %s %s" % (initial_lk, fitted_lk))
    self.assertTrue(
        fitted_lk > initial_lk,
        "fitted log lk %s is not greater than initial log lk %s"
        % (fitted_lk, initial_lk))
def compute_cost(self, gtree):
    """Return the cost of ``gtree``: the negated sum of its two priors.

    The topology prior (``spidir.calc_birth_death_prior``) and the branch
    prior (``spidir.branch_prior``) are added and the sum is negated, so a
    lower cost corresponds to a larger combined prior.  Both terms are
    presumably log-probabilities (making this a negative log of their
    product) -- confirm against spidir.

    The branch lengths of ``gtree`` are first re-fitted by
    ``spidir.find_ml_branch_lengths_hky``; the value that call returns is
    not used here.
    """
    reconciliation = phylo.reconcile(gtree, self.stree, self.gene2species)
    node_events = phylo.label_events(gtree, reconciliation)

    # optimize branch lengths
    spidir.find_ml_branch_lengths_hky(
        gtree, self.align, self.bgfreq, self.kappa,
        maxiter=10, parsinit=False)

    branch_term = spidir.branch_prior(
        gtree, self.stree, reconciliation, node_events,
        self.params, self.duprate, self.lossrate, self.pretime)
    topology_term = spidir.calc_birth_death_prior(
        gtree, self.stree, reconciliation,
        self.duprate, self.lossrate, node_events)

    combined = topology_term + branch_term
    return -combined
def _test_ml(self):
    """Record and plot the convergence of single-step ML branch fitting.

    Runs forty one-iteration fits on the flies test tree, recording the
    branch lengths and the returned likelihood after each, then writes two
    PDF plots under ``test/output/ml/``.  Makes no assertions.
    """
    # params
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # data
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")
    align = fasta.readFasta("test/data/flies.nt/0/0.align")

    # fix the node order once so every recorded row lines up
    nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)
    dists = []
    likes = []

    util.tic("find ML")
    for _ in range(40):
        seqs = util.mget(align, tree.leafNames())
        loglk = spidir.find_ml_branch_lengths_hky(
            tree, seqs, bgfreq, kappa, parsinit=False, maxiter=1)
        # one row of branch lengths and one likelihood per iteration
        dists.append([node.dist for node in nodes])
        likes.append(loglk)
    util.toc()

    print(likes)

    prep_dir("test/output/ml/")

    # distances plot
    util.rplot_start("test/output/ml/ml_branches.pdf")
    first_series = util.cget(dists, 0)
    y_upper = max(dists[0])
    util.rplot("plot", first_series,
               ylim=[0, y_upper], t="l",
               main="branch length convergence",
               xlab="iterations", ylab="branch lengths (sub/site)")
    # zip(*dists) transposes the rows: one series per node across iterations
    for series in zip(*dists):
        util.rplot("lines", series)
    util.rplot_end(True)

    print(util.cget(dists, 4))

    # likelihood plot
    util.rplot_start("test/output/ml/ml_likelihood.pdf")
    util.rplot("plot", likes, t="l",
               xlab="iterations", ylab="log likelihood",
               main="likelihood convergence")
    util.rplot_end(True)
def _test_ml(self):
    """Record and plot the convergence of single-step ML branch fitting.

    Runs forty one-iteration fits on the flies test tree, recording the
    branch lengths and the returned likelihood after each, then writes two
    PDF plots under ``test/output/ml/``.  Makes no assertions.
    """
    # params
    kappa = 1.59
    bgfreq = [.258, .267, .266, .209]

    # data
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")
    align = fasta.readFasta("test/data/flies.nt/0/0.align")

    # fix the node order once so every recorded row lines up
    nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)
    dists = []
    likes = []

    util.tic("find ML")
    for _ in range(40):
        seqs = util.mget(align, tree.leafNames())
        loglk = spidir.find_ml_branch_lengths_hky(
            tree, seqs, bgfreq, kappa, parsinit=False, maxiter=1)
        # one row of branch lengths and one likelihood per iteration
        dists.append([node.dist for node in nodes])
        likes.append(loglk)
    util.toc()

    print(likes)

    prep_dir("test/output/ml/")

    # distances plot
    util.rplot_start("test/output/ml/ml_branches.pdf")
    first_series = util.cget(dists, 0)
    y_upper = max(dists[0])
    util.rplot("plot", first_series,
               ylim=[0, y_upper], t="l",
               main="branch length convergence",
               xlab="iterations", ylab="branch lengths (sub/site)")
    # zip(*dists) transposes the rows: one series per node across iterations
    for series in zip(*dists):
        util.rplot("lines", series)
    util.rplot_end(True)

    print(util.cget(dists, 4))

    # likelihood plot
    util.rplot_start("test/output/ml/ml_likelihood.pdf")
    util.rplot("plot", likes, t="l",
               xlab="iterations", ylab="log likelihood",
               main="likelihood convergence")
    util.rplot_end(True)
def optimize_model(self, gtree, stree, gene2species):
    """Load the model inputs and settle ``self.bgfreq`` and ``self.kappa``.

    After delegating to ``CostModel.optimize_model``, the option values
    stored on ``self`` are replaced in place:

    * ``self.align``  -- the ``--align`` value becomes the result of
      ``fasta.read_fasta``.
    * ``self.params`` -- the ``--param`` value becomes the result of
      ``spidir.read_params``.
    * ``self.bgfreq`` -- a comma-separated string becomes a list of four
      floats; when it is empty, ``alignlib.compute_bgfreq(self.align)``
      is used instead.
    * ``self.kappa``  -- kept when ``>= 0``; otherwise replaced by the
      grid value with the highest one-iteration ML likelihood.

    A missing ``--align`` / ``--param`` or a malformed ``--bgfreq`` is
    reported through ``self.parser.error``.

    NOTE(review): the kappa search calls
    ``spidir.find_ml_branch_lengths_hky`` on ``gtree``, which presumably
    rewrites its branch lengths -- confirm this is intended.
    """
    CostModel.optimize_model(self, gtree, stree, gene2species)

    #=============================
    # read sequences
    if not self.align:
        self.parser.error("--align must be specified")
    self.align = fasta.read_fasta(self.align)

    #=============================
    # read SPIDIR parameters
    if not self.params:
        self.parser.error("--param must be specified")
    self.params = spidir.read_params(self.params)

    #=============================
    # determine background base frequency
    if self.bgfreq:
        # use supplied frequency
        try:
            # a real list, so len() also works where map() is lazy
            vals = [float(x) for x in self.bgfreq.split(",")]
        except ValueError:
            # non-numeric field: take the same error path as a wrong count
            vals = []
        if len(vals) != 4:
            self.parser.error("invalid --bgfreq: %s" % self.bgfreq)
        self.bgfreq = vals
    else:
        # compute frequency from alignment
        self.bgfreq = alignlib.compute_bgfreq(self.align)

    #=============================
    # kappa
    if self.kappa >= 0:
        # use supplied kappa unchanged
        return

    # compute kappa from alignment
    # grid search taken from spidir.find_ml_kapp_hky (name as given in the
    # original note)
    minkappa = 0.4
    maxkappa = 5.0
    stepkappa = 0.1

    maxlk = -util.INF
    maxk = minkappa

    for k in util.frange(minkappa, maxkappa, stepkappa):
        # parsinit is requested only for the first grid value
        lk = spidir.find_ml_branch_lengths_hky(
            gtree, self.align, self.bgfreq, k,
            maxiter=1, parsinit=(k == minkappa))
        if lk > maxlk:
            maxlk = lk
            maxk = k

    self.kappa = maxk
def test_calc_hky_seq_likelihood(self):
    """Check that ML branch fitting raises the HKY likelihood on a toy tree.

    Builds a five-leaf tree (A-E) from a Newick string together with a
    hard-coded alignment, computes the sequence likelihood for the tree as
    parsed, runs ``spidir.find_ml_branch_lengths_hky`` on it, and asserts
    that the value returned by the fit is strictly greater than the
    starting likelihood.
    """
    bgfreq = [0.258, 0.267, 0.266, 0.209]
    kappa = 1.59

    tree = treelib.parse_newick(
        "((A:.1,B:.1):.1,((C:.1,D:.1):.2,E:.3):.1);")
    align = {
        "A": "CGCAGACAACTCCCCCGACCACACATAGTACGAAATCCTCAGCCGCTGCCGACTCCGACGCGCGGACTGTCCGGGTTCAGCGAGGCTTAAGAGAACGGCC",
        "B": "CCCAAACAACTCCCCCGACCAGACATAGTACGAGATCCTCAGCCACTGGCGACTCGGACGCGCAGAGTGTCCCGCTTAAGCGAGGCTGCAGAGAACGGCC",
        "C": "GGCCAGCAATTCCTCCGACCACGCATAGTACGAGATCGTCTGCCTCCTGCGAATCGGACGCGCAGAGTGTTCCGGTTAAGGGAGACTTCAGAGACCTGGC",
        "D": "CGCTAACAATTCCCCCGACCACACTGAGTACGAGATACTCGGACTCCGGCGATCTCTACTCGCAGAGAGTCCCACTTAAGCGAGACTGACGAGCACGGGC",
        "E": "ATTCTTCCACACCTGCGTGTTCGTCACGTATCAAATGCGGAGCCCACGTCCAATGGCACACGAACAGTCGGCCACGGAATCGCAGACTCGTTGACCAACG",
    }

    # the tree is drawn before and after the fit, presumably so any change
    # in branch lengths can be seen in the test output
    draw_tree(tree)
    initial_lk = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
    fitted_lk = spidir.find_ml_branch_lengths_hky(tree, align, bgfreq, kappa)
    draw_tree(tree)

    # %-formatting prints the same text under Python 2 and Python 3
    print("log lk %s %s" % (initial_lk, fitted_lk))
    self.assertTrue(
        fitted_lk > initial_lk,
        "fitted log lk %s is not greater than initial log lk %s"
        % (fitted_lk, initial_lk))