Exemplo n.º 1
0
    def test_ml_large(self):
        """Test ML code"""

        # params
        bgfreq = [.258,.267,.266,.209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
        align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")


        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        l = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
        print l
        self.assert_(l != -util.INF)


        l = spidir.find_ml_branch_lengths_hky(
            tree,
            util.mget(align, tree.leafNames()),
            bgfreq, kappa,
            parsinit=False,
            maxiter=1)
        print l
        self.assert_(l != -util.INF)
Exemplo n.º 2
0
    def test_ml_large(self):
        """Test ML code"""

        # params
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
        align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")

        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        l = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
        print l
        self.assert_(l != -util.INF)

        l = spidir.find_ml_branch_lengths_hky(tree,
                                              util.mget(
                                                  align, tree.leafNames()),
                                              bgfreq,
                                              kappa,
                                              parsinit=False,
                                              maxiter=1)
        print l
        self.assert_(l != -util.INF)
Exemplo n.º 3
0
    def _test_ml_speed(self):
        
        # params
        bgfreq = [.258,.267,.266,.209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")


        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in xrange(10):
            l = spidir.find_ml_branch_lengths_hky(
                tree,
                util.mget(align, tree.leafNames()),
                bgfreq, kappa,
                maxiter=10)            
        util.toc()

        dists.append([n.dist for n in nodes])
        likes.append(l)
Exemplo n.º 4
0
    def _test_ml_speed(self):

        # params
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in xrange(10):
            l = spidir.find_ml_branch_lengths_hky(tree,
                                                  util.mget(
                                                      align, tree.leafNames()),
                                                  bgfreq,
                                                  kappa,
                                                  maxiter=10)
        util.toc()

        dists.append([n.dist for n in nodes])
        likes.append(l)
Exemplo n.º 5
0
    def test_calc_hky_seq_likelihood(self):

        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59
        tree = treelib.parse_newick(
            "((A:.1,B:.1):.1,((C:.1,D:.1):.2,E:.3):.1);")
        align = {
            "A":
            "CGCAGACAACTCCCCCGACCACACATAGTACGAAATCCTCAGCCGCTGCCGACTCCGACGCGCGGACTGTCCGGGTTCAGCGAGGCTTAAGAGAACGGCC",
            "B":
            "CCCAAACAACTCCCCCGACCAGACATAGTACGAGATCCTCAGCCACTGGCGACTCGGACGCGCAGAGTGTCCCGCTTAAGCGAGGCTGCAGAGAACGGCC",
            "C":
            "GGCCAGCAATTCCTCCGACCACGCATAGTACGAGATCGTCTGCCTCCTGCGAATCGGACGCGCAGAGTGTTCCGGTTAAGGGAGACTTCAGAGACCTGGC",
            "D":
            "CGCTAACAATTCCCCCGACCACACTGAGTACGAGATACTCGGACTCCGGCGATCTCTACTCGCAGAGAGTCCCACTTAAGCGAGACTGACGAGCACGGGC",
            "E":
            "ATTCTTCCACACCTGCGTGTTCGTCACGTATCAAATGCGGAGCCCACGTCCAATGGCACACGAACAGTCGGCCACGGAATCGCAGACTCGTTGACCAACG"
        }

        draw_tree(tree)

        l = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
        l2 = spidir.find_ml_branch_lengths_hky(tree, align, bgfreq, kappa)

        draw_tree(tree)

        print "log lk", l, l2
        self.assert_(l2 > l)
Exemplo n.º 6
0
    def compute_cost(self, gtree):
        """
        Returns -log [P(topology) + P(branch)],
        min cost = min neg log prob = max log prob = max prob
        """
        recon = phylo.reconcile(gtree, self.stree, self.gene2species)
        events = phylo.label_events(gtree, recon)

        # optimize branch lengths
        spidir.find_ml_branch_lengths_hky(gtree, self.align, self.bgfreq, self.kappa,
                                          maxiter=10, parsinit=False)

        branchp = spidir.branch_prior(gtree, self.stree, recon, events,
                                      self.params, self.duprate, self.lossrate, self.pretime)
        topp = spidir.calc_birth_death_prior(gtree, self.stree, recon,
                                             self.duprate, self.lossrate, events)
        return -(topp + branchp)
Exemplo n.º 7
0
    def _test_ml(self):
        """Test ML code"""

        # params
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in range(40):
            l = spidir.find_ml_branch_lengths_hky(tree,
                                                  util.mget(
                                                      align, tree.leafNames()),
                                                  bgfreq,
                                                  kappa,
                                                  parsinit=False,
                                                  maxiter=1)

            dists.append([n.dist for n in nodes])
            likes.append(l)
        util.toc()

        print likes

        prep_dir("test/output/ml/")

        # distances plot
        util.rplot_start("test/output/ml/ml_branches.pdf")
        util.rplot("plot",
                   util.cget(dists, 0),
                   ylim=[0, max(dists[0])],
                   t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for d in zip(*dists):
            util.rplot("lines", d)
        util.rplot_end(True)

        print util.cget(dists, 4)

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot",
                   likes,
                   t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Exemplo n.º 8
0
    def _test_ml(self):
        """Test ML code"""

        # params
        bgfreq = [.258,.267,.266,.209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")


        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in range(40):
            l = spidir.find_ml_branch_lengths_hky(
                    tree,
                    util.mget(align, tree.leafNames()),
                    bgfreq, kappa,
                    parsinit=False,
                    maxiter=1)
            
            dists.append([n.dist for n in nodes])
            likes.append(l)
        util.toc()

        print likes

        prep_dir("test/output/ml/")

        # distances plot
        util.rplot_start("test/output/ml/ml_branches.pdf")
        util.rplot("plot", util.cget(dists, 0),
                   ylim=[0, max(dists[0])], t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for d in zip(* dists):
            util.rplot("lines", d)
        util.rplot_end(True)

        print util.cget(dists, 4)

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot", likes, t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Exemplo n.º 9
0
    def optimize_model(self, gtree, stree, gene2species):
        """Optimizes the model"""
        CostModel.optimize_model(self, gtree, stree, gene2species)

        #=============================
        # read sequences
        if not self.align:
            self.parser.error("--align must be specified")
        self.align = fasta.read_fasta(self.align)

        #=============================
        # read SPIDIR parameters
        if not self.params:
            self.parser.error("--param must be specified")
        self.params = spidir.read_params(self.params)

        #=============================
        # determine background base frequency
        if self.bgfreq:
            # use supplied frequency
            vals = map(float, self.bgfreq.split(","))
            if len(vals) != 4:
                self.parser.error("invalid --bgfreq: %s" % self.bgfreq)
            self.bgfreq = vals
        else:
            # compute frequency from alignment
            self.bgfreq = alignlib.compute_bgfreq(self.align)

        #=============================
        # branch lengths
        if self.kappa >= 0:
            # use supplied kappa
            self.kappa = self.kappa
        else:
            # compute kappa from alignment
            # from spidir.find_ml_kapp_hky
            minkappa = 0.4; maxkappa = 5.0; stepkappa = 0.1
            maxlk = -util.INF
            maxk = minkappa

            for k in util.frange(minkappa, maxkappa, stepkappa):
                l = spidir.find_ml_branch_lengths_hky(gtree, self.align, self.bgfreq, k, maxiter=1,
                                                      parsinit=(k == minkappa))
                if l > maxlk:
                    maxlk = l
                    maxk = k

            self.kappa = maxk
Exemplo n.º 10
0
    def test_calc_hky_seq_likelihood(self):

        bgfreq = [0.258, 0.267, 0.266, 0.209]
        kappa = 1.59
        tree = treelib.parse_newick("((A:.1,B:.1):.1,((C:.1,D:.1):.2,E:.3):.1);")
        align = {
            "A": "CGCAGACAACTCCCCCGACCACACATAGTACGAAATCCTCAGCCGCTGCCGACTCCGACGCGCGGACTGTCCGGGTTCAGCGAGGCTTAAGAGAACGGCC",
            "B": "CCCAAACAACTCCCCCGACCAGACATAGTACGAGATCCTCAGCCACTGGCGACTCGGACGCGCAGAGTGTCCCGCTTAAGCGAGGCTGCAGAGAACGGCC",
            "C": "GGCCAGCAATTCCTCCGACCACGCATAGTACGAGATCGTCTGCCTCCTGCGAATCGGACGCGCAGAGTGTTCCGGTTAAGGGAGACTTCAGAGACCTGGC",
            "D": "CGCTAACAATTCCCCCGACCACACTGAGTACGAGATACTCGGACTCCGGCGATCTCTACTCGCAGAGAGTCCCACTTAAGCGAGACTGACGAGCACGGGC",
            "E": "ATTCTTCCACACCTGCGTGTTCGTCACGTATCAAATGCGGAGCCCACGTCCAATGGCACACGAACAGTCGGCCACGGAATCGCAGACTCGTTGACCAACG",
        }

        draw_tree(tree)

        l = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
        l2 = spidir.find_ml_branch_lengths_hky(tree, align, bgfreq, kappa)

        draw_tree(tree)

        print "log lk", l, l2
        self.assert_(l2 > l)