Пример #1
0
    def setUp(self):
        """Prepare the tree, simulated alignment, and model for each test.

        Seeds both random number generators, parses a fixed Newick tree,
        draws random amino-acid preferences for 50 sites, simulates a codon
        alignment along the tree with pyvolve, and finally builds the model
        selected by the ``MODEL`` / ``DISTRIBUTIONMODEL`` attributes.

        Raises:
            ValueError: if ``MODEL`` or ``DISTRIBUTIONMODEL`` is unsupported.
        """
        random.seed(1)
        numpy.random.seed(1)

        self.underflowfreq = 1

        # Bio.Phylo reads from a file, so round-trip the Newick string
        # through a scratch file that is deleted straight away.
        self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
        treepath = '_temp.tree'
        with open(treepath, 'w') as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treepath, 'newick')
        os.remove(treepath)

        # Per-site amino-acid preferences: Dirichlet draws with every entry
        # raised to at least `floor`, then renormalized to sum to one.
        self.nsites = 50
        floor = 0.02
        aminoacids = sorted(AA_TO_INDEX.keys())
        prefs = []
        for _site in range(self.nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            prefs.append(dict(zip(aminoacids, draw)))

        # Simulate an alignment under an ExpCM built from those preferences.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nseqs = self.tree.count_terminals()
        partitions = phydmslib.simulate.pyvolvePartitions(
            phydmslib.models.ExpCM(prefs))
        seqfile = '_temp_simulatedalignment.fasta'
        infofile = '_temp_info.txt'
        ratefile = '_temp_ratefile.txt'
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        evolver(seqfile=seqfile, infofile=infofile, ratefile=ratefile)
        self.alignment = []
        for record in Bio.SeqIO.parse(seqfile, 'fasta'):
            self.alignment.append((record.description, str(record.seq)))
        for path in (seqfile, infofile, ratefile):
            os.remove(path)
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # Build the model under test; only ExpCM is accepted here.
        if self.MODEL != phydmslib.models.ExpCM:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
        self.model = phydmslib.models.ExpCM(prefs)

        # Optionally wrap the model in a gamma-distributed-omega model.
        if self.DISTRIBUTIONMODEL is not None:
            if (self.DISTRIBUTIONMODEL !=
                    phydmslib.models.GammaDistributedOmegaModel):
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
Пример #2
0
    def test_ExpCM(self):
        """Build an `ExpCM`, verify it, then re-verify after parameter updates.

        The model is first checked against the values passed to its
        constructor; it is then updated twice with random parameter values
        and the attribute, derivative, and matrix-exponential checks are run
        after each update.
        """
        random.seed(1)
        numpy.random.seed(1)

        # random preferences, floored at `floor` and renormalized per site
        self.nsites = 2
        floor = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        self.prefs = []
        for _site in range(self.nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            self.prefs.append(dict(zip(aminoacids, draw)))

        # initial model: constructor arguments must be echoed as attributes
        initial = {
            "phi": numpy.random.dirichlet([2] * N_NT),
            "omega": 0.7,
            "kappa": 2.5,
            "beta": 1.9,
        }
        self.expcm = phydmslib.models.ExpCM(self.prefs, **initial)
        for name, expected in initial.items():
            self.assertTrue(
                numpy.allclose(expected, getattr(self.expcm, name)))

        # each site's stationary state must sum to one
        rowsums = self.expcm.stationarystate.sum(axis=1)
        self.assertTrue(
            numpy.allclose(numpy.repeat(1.0, self.nsites), rowsums))

        # update with random values (drawn in a fixed order) and re-check
        for _update in range(2):
            limits = self.expcm.PARAMLIMITS
            omega = random.uniform(*limits["omega"])
            kappa = random.uniform(*limits["kappa"])
            beta = random.uniform(0.5, 2.5)
            eta = numpy.array(
                [random.uniform(*limits["eta"]) for _ in range(N_NT - 1)])
            mu = random.uniform(0.05, 3.0)
            self.params = {
                "omega": omega,
                "kappa": kappa,
                "beta": beta,
                "eta": eta,
                "mu": mu,
            }
            self.expcm.updateParams(self.params)
            self.check_ExpCM_attributes()
            self.check_ExpCM_derivatives()
            self.check_ExpCM_matrix_exponentials()
Пример #3
0
    def test_ExpCM_empirical_phi_divpressure(self):
        """Build `ExpCM_empirical_phi_divpressure`, then check after updates.

        The model is created from random preferences and random 0/1
        diversifying-pressure values, updated twice with random parameters,
        and the full set of ``check_*`` helpers is run after each update.
        """
        random.seed(1)
        numpy.random.seed(1)

        # random preferences, floored at `floor` and renormalized per site
        self.nsites = 6
        floor = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        self.prefs = []
        for _site in range(self.nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            self.prefs.append(dict(zip(aminoacids, draw)))
        self.divpressure = numpy.random.randint(2, size=self.nsites)

        # initial model
        g = numpy.random.dirichlet([3] * N_NT)
        self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
            self.prefs,
            g=g,
            divPressureValues=self.divpressure,
            omega=0.7,
            kappa=2.5,
            beta=1.2,
            omega2=0.2)

        # checks to run after every parameter update, in this order
        checks = (
            self.check_empirical_phi,
            self.check_dQxy_dbeta,
            self.check_dprx_dbeta,
            self.check_ExpCM_attributes,
            self.check_ExpCM_derivatives,
            self.check_ExpCM_matrix_exponentials,
        )
        for _update in range(2):
            newomega = random.uniform(0.1, 2)
            newkappa = random.uniform(0.5, 10)
            newbeta = random.uniform(0.5, 3)
            newmu = random.uniform(0.05, 5.0)
            newomega2 = random.uniform(0.1, 0.3)
            self.params = {
                "omega": newomega,
                "kappa": newkappa,
                "beta": newbeta,
                "mu": newmu,
                "omega2": newomega2,
            }
            self.model.updateParams(self.params)
            # `g` is not among the updated parameters and must be unchanged
            self.assertTrue(numpy.allclose(g, self.model.g))
            for check in checks:
                check()
Пример #4
0
    def testExpCM_spielmanwr(self):
        """Compare `spielman_wr` against a direct codon-pair summation.

        For every site the expected value is the ratio of
        ``sum(prx * Prxy)`` to ``sum(prx * Qxy)`` taken over all codon pairs
        that are single-nucleotide, nonsynonymous changes.

        Raises:
            ValueError: if ``self.MODEL`` is not a supported model class.
        """
        random.seed(1)
        numpy.random.seed(1)
        nsites = 10
        g = numpy.random.dirichlet([5] * N_NT)
        floor = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        prefs = []
        for _site in range(nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            prefs.append(dict(zip(aminoacids, draw)))

        if self.MODEL == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
            self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # codon pairs (x, y) that are single-mutation nonsynonymous changes,
        # collected once in the same order a nested x/y loop would visit them
        pairs = [(x, y)
                 for x in range(N_CODON)
                 for y in range(N_CODON)
                 if CODON_SINGLEMUT[x][y] and CODON_NONSYN[x][y]]

        expected = []
        for site in range(self.model.nsites):
            numerator = 0
            denominator = 0
            for (x, y) in pairs:
                prx = self.model.stationarystate[site][x]
                numerator += prx * self.model.Prxy[site][x][y]
                denominator += prx * self.model.Qxy[x][y]
            expected.append(numerator / denominator)
        expected = numpy.array(expected)

        # un-normalized value, then the default value divided by omega
        self.assertTrue(
            numpy.allclose(expected,
                           self.model.spielman_wr(norm=False),
                           rtol=0.01))
        self.assertTrue(
            numpy.allclose(expected / self.model.omega,
                           self.model.spielman_wr(),
                           rtol=0.01))
Пример #5
0
    def test_ExpCM_empirical_phi(self):
        """Build `ExpCM_empirical_phi`, then check it after random updates.

        The model's ``g`` must equal the value given to the constructor both
        initially and after each of two random parameter updates; the
        ``check_*`` helpers are run after each update.
        """
        random.seed(1)
        numpy.random.seed(1)

        # random preferences, floored at `floor` and renormalized per site
        self.nsites = 7
        floor = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        self.prefs = []
        for _site in range(self.nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            self.prefs.append(dict(zip(aminoacids, draw)))

        # initial model
        g = numpy.random.dirichlet([3] * N_NT)
        self.expcm = phydmslib.models.ExpCM_empirical_phi(
            self.prefs, g=g, omega=0.7, kappa=2.5, beta=1.2)
        self.assertTrue(numpy.allclose(g, self.expcm.g))

        # checks to run after every parameter update, in this order
        checks = (
            self.check_empirical_phi,
            self.check_dQxy_dbeta,
            self.check_dprx_dbeta,
            self.check_ExpCM_attributes,
            self.check_ExpCM_derivatives,
            self.check_ExpCM_matrix_exponentials,
        )
        for _update in range(2):
            newomega = random.uniform(0.1, 2)
            newkappa = random.uniform(0.5, 10)
            newbeta = random.uniform(0.5, 5)
            newmu = random.uniform(0.05, 5.0)
            self.params = {
                'omega': newomega,
                'kappa': newkappa,
                'beta': newbeta,
                'mu': newmu,
            }
            self.expcm.updateParams(self.params)
            self.assertTrue(numpy.allclose(g, self.expcm.g))
            for check in checks:
                check()
Пример #6
0
 def setUp(self):
     """Create a single-site model of class ``self.MODEL`` for the tests.

     The site's preferences are a floored Dirichlet draw in which the first
     entry is deliberately set almost equal to the second.
     """
     numpy.random.seed(1)
     random.seed(1)
     nsites = 1
     floor = 0.001
     aminoacids = sorted(AA_TO_INDEX.keys())
     self.prefs = []
     for _site in range(nsites):
         draw = numpy.random.dirichlet([0.7] * N_AA)
         draw[draw < floor] = floor
         # make two preferences nearly equal to exercise that case
         draw[0] = draw[1] + 1.0e-8
         draw /= draw.sum()
         self.prefs.append(dict(zip(aminoacids, draw)))
     phi = numpy.random.dirichlet([5] * N_NT)
     self.expcm_fitprefs = self.MODEL(
         self.prefs, prior=None, kappa=3.0, omega=0.3, phi=phi)
     nzeta = len(self.expcm_fitprefs.zeta.flatten())
     assert nzeta == nsites * (N_AA - 1)
     assert self.expcm_fitprefs.nsites == nsites
Пример #7
0
    def test_simulateAlignmentRandomSeed(self):
        """Check that `simulateAlignment` is reproducible for a given seed.

        Two alignments simulated with the same seed must be identical, and
        an alignment simulated with a different seed must differ from them
        for every sequence. All scratch files are removed even when the
        simulation or an assertion fails.

        Raises:
            ValueError: if ``self.MODEL`` is not a supported model class.
        """
        numpy.random.seed(1)
        random.seed(1)

        # define model
        nsites = 200
        prefs = []
        minpref = 0.01
        for _r in range(nsites):
            rprefs = numpy.random.dirichlet([1] * N_AA)
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
        kappa = 4.2
        omega = 0.4
        beta = 1.5
        mu = 0.3
        if self.MODEL == phydmslib.models.ExpCM:
            phi = numpy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM(prefs,
                                           kappa=kappa,
                                           omega=omega,
                                           beta=beta,
                                           mu=mu,
                                           phi=phi,
                                           freeparams=['mu'])
        elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
            g = numpy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM_empirical_phi(prefs,
                                                         g,
                                                         kappa=kappa,
                                                         omega=omega,
                                                         beta=beta,
                                                         mu=mu,
                                                         freeparams=['mu'])
        elif self.MODEL == phydmslib.models.YNGKP_M0:
            e_pw = numpy.asarray(
                [numpy.random.dirichlet([7] * N_NT) for _ in range(3)])
            model = phydmslib.models.YNGKP_M0(e_pw, nsites)
        else:
            # report the offending model itself, not its metaclass
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # make a test tree
        # tree is two sequences separated by a single branch
        t = 0.04 / model.branchScale
        newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
        temptree = '_temp.tree'
        with open(temptree, 'w') as f:
            f.write(newicktree)

        def simulate(counter, seed):
            """Simulate one alignment and return it as ``{id: sequence}``."""
            prefix = "test_counter{0}_seed{1}".format(counter, seed)
            phydmslib.simulate.simulateAlignment(model, temptree, prefix,
                                                 seed)
            fasta = "{0}_simulatedalignment.fasta".format(prefix)
            return {s.id: str(s.seq) for s in Bio.SeqIO.parse(fasta, "fasta")}

        try:
            seed = 1
            # alignments with the same seed number should be the same
            first = simulate(0, seed)
            repeat = simulate(1, seed)
            for key in repeat:
                self.assertEqual(repeat[key], first[key])

            # alignments with different seed numbers should be different
            different = simulate(2, seed + 1)
            for key in different:
                self.assertNotEqual(different[key], repeat[key])
        finally:
            # general clean-up, run even if something above failed
            os.remove(temptree)
            for fasta in glob.glob("test*simulatedalignment.fasta"):
                if os.path.isfile(fasta):
                    os.remove(fasta)
    def test_GammaDistributedOmega(self):
        """Build a `GammaDistributedOmegaModel`, check it, update, re-check.

        A base model of class ``self.BASEMODEL`` is wrapped in a four-category
        gamma-distributed model. The test verifies that the per-category
        omega values match `DiscreteGamma`, that parameter values propagate
        to the wrapper and its sub-models, that ``M`` / ``dM`` agree with the
        sub-models, and that ``d_distributionparams`` matches a numerical
        gradient.

        Raises:
            ValueError: if ``self.BASEMODEL`` is not a supported model class.
        """
        random.seed(1)
        numpy.random.seed(1)
        nsites = 10

        if self.BASEMODEL == phydmslib.models.ExpCM:
            prefs = []
            minpref = 0.01
            for _r in range(nsites):
                rprefs = numpy.random.dirichlet([0.5] * N_AA)
                rprefs[rprefs < minpref] = minpref
                rprefs /= rprefs.sum()
                prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
            paramvalues = {
                "eta": numpy.random.dirichlet([5] * (N_NT - 1)),
                "omega": 0.7,
                "kappa": 2.5,
                "beta": 1.2,
                "mu": 0.5
            }
            basemodel = self.BASEMODEL(prefs)
            assert set(paramvalues.keys()) == set(
                basemodel.freeparams), "{0} vs {1}".format(
                    set(paramvalues.keys()), set(basemodel.freeparams))
            basemodel.updateParams(paramvalues)
        elif self.BASEMODEL == phydmslib.models.YNGKP_M0:
            e_pw = numpy.random.uniform(0.4, 0.6, size=(3, N_NT))
            e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
            basemodel = self.BASEMODEL(e_pw, nsites)
            paramvalues = {"kappa": 2.5, "omega": 0.7, "mu": 0.5}
            assert set(paramvalues.keys()) == set(basemodel.freeparams)
            basemodel.updateParams(paramvalues)
        else:
            raise ValueError("Invalid BASEMODEL: {0}".format(self.BASEMODEL))

        ncats = 4
        gammamodel = phydmslib.models.GammaDistributedOmegaModel(
            basemodel, ncats)
        self.assertTrue(
            numpy.allclose(
                numpy.array([m.omega for m in gammamodel._models]),
                phydmslib.models.DiscreteGamma(gammamodel.alpha_lambda,
                                               gammamodel.beta_lambda,
                                               gammamodel.ncats)))
        for (param, pvalue) in paramvalues.items():
            if param != gammamodel.distributedparam:
                self.assertTrue(
                    numpy.allclose(getattr(gammamodel, param), pvalue))

        # try some updates and make sure everything remains OK
        for _i in range(3):
            newvalues = {}
            for param in gammamodel.freeparams:
                (low, high) = gammamodel.PARAMLIMITS[param]
                if gammamodel.PARAMTYPES[param] == float:
                    newvalues[param] = random.uniform(low, high)
                else:
                    # non-float entries carry the parameter length at index 1
                    paramlength = gammamodel.PARAMTYPES[param][1]
                    newvalues[param] = numpy.random.uniform(
                        low, high, paramlength)
            gammamodel.updateParams(newvalues)
            self.assertTrue(
                numpy.allclose(
                    numpy.array([m.omega for m in gammamodel._models]),
                    phydmslib.models.DiscreteGamma(gammamodel.alpha_lambda,
                                                   gammamodel.beta_lambda,
                                                   gammamodel.ncats)))
            for (param, pvalue) in newvalues.items():
                if param != gammamodel.distributedparam:
                    self.assertTrue(
                        numpy.allclose(pvalue, getattr(gammamodel, param)))
                    if param not in gammamodel.distributionparams:
                        self.assertTrue(
                            all((numpy.allclose(pvalue, getattr(m, param))
                                 for m in gammamodel._models)))

            # the wrapper's branch scale must lie strictly between those of
            # its first and last sub-models
            self.assertTrue(gammamodel._models[0].branchScale < gammamodel.
                            branchScale < gammamodel._models[-1].branchScale)

            t = 0.15
            for k in range(gammamodel.ncats):
                M = gammamodel.M(k, t)
                self.assertTrue(numpy.allclose(gammamodel._models[k].M(t), M))
                for param in gammamodel.freeparams:
                    if param not in gammamodel.distributionparams:
                        dM = gammamodel.dM(k, t, param, M)
                        self.assertTrue(
                            numpy.allclose(
                                dM, gammamodel._models[k].dM(t, param,
                                                             Mt=None)))

            # Check derivatives with respect to distribution params
            d_distparams = gammamodel.d_distributionparams
            self.assertTrue((d_distparams["alpha_lambda"] > 0).all())
            self.assertTrue((d_distparams["beta_lambda"] < 0).all())
            for param in gammamodel.distributionparams:
                diffs = []
                for k in range(gammamodel.ncats):
                    pvalue = getattr(gammamodel, param)

                    # bind the loop variables as defaults so each closure
                    # refers to this iteration's `param` and `k`
                    def func(x, param=param, k=k):
                        gammamodel.updateParams({param: x[0]})
                        return getattr(gammamodel._models[k],
                                       gammamodel.distributedparam)

                    def dfunc(x, param=param, k=k):
                        gammamodel.updateParams({param: x[0]})
                        return gammamodel.d_distributionparams[param][k]

                    diff = scipy.optimize.check_grad(func, dfunc,
                                                     numpy.array([pvalue]))
                    # restore the value changed by the gradient check
                    gammamodel.updateParams({param: pvalue})
                    diffs.append(diff)
                diffs = numpy.array(diffs)
                self.assertTrue(
                    (diffs < 1e-5).all(),
                    ("Excessive diff for d_distributionparams[{0}] when "
                     "distributionparams = {1}:\n{2}".format(
                         param, gammamodel.distributionparams, diffs)))
Пример #9
0
    def test_compare(self):
        """Check that zero `divpressure` reproduces `ExpCM_empirical_phi`.

        Builds an `ExpCM_empirical_phi` and an
        `ExpCM_empirical_phi_divpressure` with all-zero diversifying pressure
        from the same inputs, then compares their matrices, parameter values,
        and derivatives.
        """
        random.seed(1)
        numpy.random.seed(1)

        nsites = 6
        floor = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        prefs = []
        for _site in range(nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            prefs.append(dict(zip(aminoacids, draw)))
        g = numpy.random.dirichlet([3] * N_NT)
        shared = {"omega": 0.7, "kappa": 2.5, "beta": 1.2}

        plain = phydmslib.models.ExpCM_empirical_phi(prefs, g, **shared)
        withdiv = phydmslib.models.ExpCM_empirical_phi_divpressure(
            prefs,
            g,
            divPressureValues=numpy.zeros(nsites),
            omega2=0.2,
            **shared)

        # array-valued attributes that must coincide, with failure messages
        comparisons = (
            ("stationarystate", "stationarystate differs."),
            ("Qxy", "Qxy differs"),
            ("Frxy", "Frxy differs"),
            ("Prxy", "Prxy differs"),
        )
        for (attr, message) in comparisons:
            same = numpy.allclose(getattr(plain, attr),
                                  getattr(withdiv, attr))
            self.assertTrue(same, message)

        t = 0.02
        self.assertTrue(numpy.allclose(plain.M(t), withdiv.M(t)),
                        "M({0}) differs".format(t))

        for param in ["kappa", "omega", "beta"]:
            values_match = numpy.allclose(getattr(plain, param),
                                          getattr(withdiv, param))
            self.assertTrue(values_match,
                            "param values differ for {0}".format(param))

            dstate_match = numpy.allclose(plain.dstationarystate(param),
                                          withdiv.dstationarystate(param))
            self.assertTrue(dstate_match,
                            "dstationarystate differs for {0}".format(param))

            dM_plain = plain.dM(t, param, plain.M(t))
            dM_withdiv = withdiv.dM(t, param, withdiv.M(t))
            self.assertTrue(numpy.allclose(dM_plain, dM_withdiv),
                            "dM({0}) differs for {1}".format(t, param))
Пример #10
0
    def setUp(self):
        """Prepare the tree, simulated alignment, and model for each test.

        Seeds both random number generators, parses a fixed Newick tree,
        simulates a 50-codon alignment under a `YNGKP_M0` model with uniform
        nucleotide frequencies, and then builds the model selected by the
        ``MODEL`` / ``DISTRIBUTIONMODEL`` attributes.

        Raises:
            ValueError: if ``MODEL`` or ``DISTRIBUTIONMODEL`` is unsupported.
        """
        random.seed(1)
        numpy.random.seed(1)

        self.underflowfreq = 1

        # Bio.Phylo reads from a file, so round-trip the Newick string
        # through a scratch file that is deleted straight away.
        self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
        treepath = '_temp.tree'
        with open(treepath, 'w') as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treepath, 'newick')
        os.remove(treepath)

        # Simulate the alignment with pyvolve under a uniform YNGKP_M0.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nsites = 50
        self.nseqs = self.tree.count_terminals()
        uniform_e_pw = numpy.full((3, N_NT), 0.25, dtype='float')
        simulator = phydmslib.models.YNGKP_M0(uniform_e_pw, self.nsites)
        partitions = phydmslib.simulate.pyvolvePartitions(simulator)
        seqfile = '_temp_simulatedalignment.fasta'
        infofile = '_temp_info.txt'
        ratefile = '_temp_ratefile.txt'
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        evolver(seqfile=seqfile, infofile=infofile, ratefile=ratefile)
        self.alignment = []
        for record in Bio.SeqIO.parse(seqfile, 'fasta'):
            self.alignment.append((record.description, str(record.seq)))
        for path in (seqfile, infofile, ratefile):
            os.remove(path)
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # Inputs for the model under test: `g` is drawn before the
        # preferences so the random stream is consumed in a fixed order.
        floor = 0.02
        aminoacids = sorted(AA_TO_INDEX.keys())
        g = numpy.random.dirichlet([10] * N_NT)
        prefs = []
        for _site in range(self.nsites):
            draw = numpy.random.dirichlet([0.5] * N_AA)
            draw[draw < floor] = floor
            draw /= draw.sum()
            prefs.append(dict(zip(aminoacids, draw)))

        models = phydmslib.models
        if self.MODEL == models.ExpCM:
            self.model = models.ExpCM(prefs)
        elif self.MODEL == models.ExpCM_empirical_phi:
            self.model = models.ExpCM_empirical_phi(prefs, g)
        elif self.MODEL == models.ExpCM_empirical_phi_divpressure:
            # scale the pressures so the largest magnitude is one
            divpressure = numpy.random.uniform(-1, 5, self.nsites)
            divpressure /= max(abs(divpressure))
            self.model = models.ExpCM_empirical_phi_divpressure(
                prefs, g, divpressure)
        elif self.MODEL == models.YNGKP_M0:
            # random nucleotide frequencies, each row normalized to one
            e_pw = numpy.random.uniform(0.2, 0.8, size=(3, N_NT))
            e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
            self.model = models.YNGKP_M0(e_pw, self.nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # Optionally wrap the model in a gamma-distributed-omega model.
        if self.DISTRIBUTIONMODEL is not None:
            if self.DISTRIBUTIONMODEL != models.GammaDistributedOmegaModel:
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
Пример #11
0
    def test_branchScale(self):
        """Simulate evolution, ensure scaled branches match number of subs.

        Two sequences are simulated many times along a single short branch.
        The average number of nucleotide differences per codon site must be
        close to ``model.branchScale * t``, and the branch length inferred by
        `TreeLikelihood` must be close to that observed number.

        Raises:
            ValueError: if ``self.MODEL`` is not a supported model class.
        """
        numpy.random.seed(1)
        random.seed(1)

        # define model, only free parameter is mu for testing simulations
        nsites = 50
        prefs = []
        minpref = 0.01
        for _r in range(nsites):
            rprefs = numpy.random.dirichlet([1] * N_AA)
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
        kappa = 4.2
        omega = 0.4
        beta = 1.5
        mu = 0.3
        if self.MODEL == phydmslib.models.ExpCM:
            phi = numpy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM(prefs,
                                           kappa=kappa,
                                           omega=omega,
                                           beta=beta,
                                           mu=mu,
                                           phi=phi,
                                           freeparams=['mu'])
        elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
            g = numpy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM_empirical_phi(prefs,
                                                         g,
                                                         kappa=kappa,
                                                         omega=omega,
                                                         beta=beta,
                                                         mu=mu,
                                                         freeparams=['mu'])
        elif self.MODEL == phydmslib.models.YNGKP_M0:
            e_pw = numpy.asarray(
                [numpy.random.dirichlet([7] * N_NT) for _ in range(3)])
            model = phydmslib.models.YNGKP_M0(e_pw, nsites)
        else:
            # report the offending model itself, not its metaclass
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
        # every supported model is converted to partitions the same way
        partitions = phydmslib.simulate.pyvolvePartitions(model)

        # tree is two sequences separated by a single branch
        t = 0.04 / model.branchScale
        newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
        pyvolvetree = pyvolve.read_tree(tree=newicktree)
        temptree = '_temp.tree'
        with open(temptree, 'w') as f:
            f.write(newicktree)
        biotree = Bio.Phylo.read(temptree, 'newick')
        os.remove(temptree)

        # Simulate evolution of two sequences separated by a long branch.
        # Then estimate subs per site in a heuristic way that will be
        # roughly correct for short branches. Do this all several times
        # and average results to get better accuracy.
        alignment = '_temp_branchScale_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        nsubs = 0  # subs in simulated seqs (estimate from Hamming distance)
        treedist = 0.0  # distance inferred by `TreeLikelihood`
        nreplicates = 100
        for _i in range(nreplicates):
            evolver(seqfile=alignment, infofile=info, ratefile=rates)
            a = [(s.description, str(s.seq))
                 for s in Bio.SeqIO.parse(alignment, 'fasta')]
            assert len(a[0][1]) == len(a[1][1]) == nsites * 3
            for f in [alignment, info, rates]:
                if os.path.isfile(f):
                    os.remove(f)
            # count differing nucleotides codon by codon
            for r in range(nsites):
                codon1 = a[0][1][3 * r:3 * r + 3]
                codon2 = a[1][1][3 * r:3 * r + 3]
                nsubs += sum(nt1 != nt2 for nt1, nt2 in zip(codon1, codon2))
            tl = phydmslib.treelikelihood.TreeLikelihood(biotree, a, model)
            tl.maximizeLikelihood()
            treedist += sum((n.branch_length for n in tl.tree.get_terminals()))
        nsubs /= float(nsites * nreplicates)
        treedist /= float(nreplicates)

        # We expect nsubs = branchScale * t, but build in some tolerance
        # with rtol since we simulated finite number of sites.
        self.assertTrue(
            numpy.allclose(nsubs, model.branchScale * t, rtol=0.2),
            ("Simulated subs per site of {0} is not close to "
             "expected value of {1} (branchScale = {2}, t = {3})").format(
                 nsubs, t * model.branchScale, model.branchScale, t))
        self.assertTrue(
            numpy.allclose(treedist, nsubs, rtol=0.2),
            ("Simulated subs per site of {0} is not close to inferred "
             "branch length of {1}").format(nsubs, treedist))
    def setUp(self):
        """Simulate an alignment under known preferences and build a model.

        ``self.prefs`` holds a floored Dirichlet draw per site and
        ``self.realprefs`` holds the same values in shuffled order. An
        alignment is simulated along the NP tree under an `ExpCM` that uses
        ``self.realprefs``; a model of class ``self.MODEL`` that uses
        ``self.prefs`` is then paired with that alignment in a
        `TreeLikelihood`. Codon and amino-acid counts per site are tallied
        from the simulated alignment.
        """
        numpy.random.seed(1)
        random.seed(1)

        nsites = 1
        minpref = 0.001
        self.prefs = []
        self.realprefs = []
        for _r in range(nsites):
            rprefs = numpy.random.dirichlet([0.5] * N_AA)
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            self.prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
            # shuffle in place: the dict above already captured the values
            numpy.random.shuffle(rprefs)
            self.realprefs.append(dict(zip(sorted(AA_TO_INDEX.keys()),
                                           rprefs)))
        self.kappa = 3.0
        self.omega = 3.0
        self.phi = numpy.random.dirichlet([5] * N_NT)
        self.model = self.MODEL(self.prefs,
                                prior=None,
                                kappa=self.kappa,
                                omega=self.omega,
                                phi=self.phi)
        self.realmodel = phydmslib.models.ExpCM(self.realprefs,
                                                kappa=self.kappa,
                                                omega=self.omega,
                                                mu=10.0,
                                                phi=self.phi)

        treefile = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         "./NP_data/NP_tree.newick"))
        self.tree = Bio.Phylo.read(treefile, "newick")
        self.tree.root_at_midpoint()

        # simulate alignment using realmodel
        evolver = pyvolve.Evolver(
            partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel),
            tree=pyvolve.read_tree(file=treefile))
        alignmentfile = "_temp_fitprefs_simulatedalignment.fasta"
        info = "_temp_info.txt"
        rates = "_temp_ratefile.txt"
        evolver(seqfile=alignmentfile, infofile=info, ratefile=rates)
        self.alignment = phydmslib.file_io.ReadCodonAlignment(
            alignmentfile, True)
        # remove scratch files before asserting so a failure leaves none
        for f in [alignmentfile, info, rates]:
            os.remove(f)
        assert len(self.alignment[0][1]) == nsites * 3

        self.codoncounts = {
            r: {INDEX_TO_CODON[c]: 0
                for c in range(N_CODON)}
            for r in range(nsites)
        }
        self.aacounts = {r: {a: 0 for a in range(N_AA)} for r in range(nsites)}
        for (_head, seq) in self.alignment:
            # site r occupies nucleotides 3r .. 3r+2 of the sequence
            for r in range(nsites):
                codon = seq[3 * r:3 * r + 3]
                self.codoncounts[r][codon] += 1
                self.aacounts[r][CODON_TO_AA[CODON_TO_INDEX[codon]]] += 1

        self.tl = phydmslib.treelikelihood.TreeLikelihood(
            self.tree, self.alignment, self.model)
Пример #13
0
    def setUp(self):
        """Prepare the tree, a simulated alignment, and the model to test.

        Populates ``self.tree``, ``self.brlen`` (branch length by node
        number, root excluded), ``self.alignment``, ``self.codons`` (codon
        index by tip number and site) and ``self.model`` according to the
        class attributes ``MODEL`` and ``DISTRIBUTIONMODEL``.
        """
        random.seed(1)
        numpy.random.seed(1)

        # Tree: write the newick string to disk so Bio.Phylo can read it.
        self.newick = "((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;"
        treepath = "_temp.tree"
        with open(treepath, "w") as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treepath, "newick")
        os.remove(treepath)

        # Branch lengths parsed straight from the newick text; the key is
        # the trailing digit of the node name.
        self.brlen = {}
        matches = re.findall(r"(?P<name>node\d):(?P<brlen>\d+\.\d+)",
                             self.newick)
        for nodename, length in matches:
            if nodename == self.tree.root.name:
                continue
            self.brlen[int(nodename[-1])] = float(length)

        # Simulate an alignment with pyvolve under a YNGKP_M0 model with
        # uniform nucleotide frequencies at every codon position.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nsites = 60
        self.nseqs = self.tree.count_terminals()
        uniform_e_pw = numpy.full((3, N_NT), 0.25, dtype="float")
        simmodel = phydmslib.models.YNGKP_M0(uniform_e_pw, self.nsites)
        simpartitions = phydmslib.simulate.pyvolvePartitions(simmodel)
        fastafile = "_temp_simulatedalignment.fasta"
        infofile = "_temp_info.txt"
        ratefile = "_temp_ratefile.txt"
        evolver = pyvolve.Evolver(partitions=simpartitions, tree=pyvolvetree)
        evolver(seqfile=fastafile, infofile=infofile, ratefile=ratefile)
        records = []
        for record in Bio.SeqIO.parse(fastafile, "fasta"):
            records.append((record.description, str(record.seq)))
        self.alignment = records
        for path in (fastafile, infofile, ratefile):
            os.remove(path)
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # Codon index for every tip (keyed by tip number) and site.
        self.codons = {}
        for tip in self.tree.get_terminals():
            tipname = tip.name
            tipnumber = int(tipname[-1])
            self.codons[tipnumber] = {}
            tipseq = [s for (h, s) in self.alignment if tipname == h][0]
            self.codons[tipnumber] = {
                r: CODON_TO_INDEX[tipseq[3 * r:3 * r + 3]]
                for r in range(self.nsites)
            }

        # Random inputs for the model. The draw order (g first, then the
        # per-site preferences) is kept so seeded results are reproducible.
        prefs = []
        minpref = 0.02
        g = numpy.random.dirichlet([5] * N_NT)
        g[g < 0.1] = 0.1
        g /= g.sum()
        for _r in range(self.nsites):
            siteprefs = numpy.random.dirichlet([0.5] * N_AA)
            siteprefs[siteprefs < minpref] = minpref
            siteprefs /= siteprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), siteprefs)))

        # Instantiate the substitution model requested by the test class.
        modelclass = self.MODEL
        if modelclass == phydmslib.models.ExpCM:
            model = phydmslib.models.ExpCM(prefs)
        elif modelclass == phydmslib.models.ExpCM_empirical_phi:
            model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
        elif modelclass == phydmslib.models.ExpCM_empirical_phi_divpressure:
            divpressure = numpy.random.uniform(-1, 5, self.nsites)
            # Scale so the largest absolute value is one.
            divpressure /= max(abs(divpressure))
            model = phydmslib.models.ExpCM_empirical_phi_divpressure(
                prefs, g, divpressure)
        elif modelclass == phydmslib.models.YNGKP_M0:
            e_pw = numpy.random.uniform(0.2, 0.8, size=(3, N_NT))
            e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
            model = phydmslib.models.YNGKP_M0(e_pw, self.nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
        self.model = model

        # Optionally wrap the model in a distribution model.
        distmodel = self.DISTRIBUTIONMODEL
        if distmodel is not None:
            is_gamma_omega = (
                distmodel == phydmslib.models.GammaDistributedOmegaModel)
            if not (is_gamma_omega or distmodel ==
                    phydmslib.models.GammaDistributedBetaModel):
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))
            self.model = distmodel(self.model, ncats=4)
Пример #14
0
    def test_simulateAlignment(self):
        """Simulate evolution, ensure scaled branches match number of subs.

        Simulates a two-tip alignment with
        ``phydmslib.simulate.simulateAlignment`` and checks that (1) the
        observed substitutions per site are close to the tree length ``t``
        and (2) the branch length inferred by ``TreeLikelihood`` is close
        to the observed substitutions per site.

        Raises ``ValueError`` if ``self.MODEL`` is not a supported model.
        """
        numpy.random.seed(1)
        random.seed(1)

        alignmentPrefix = "test"

        # define model
        nsites = 1000
        prefs = []
        minpref = 0.01
        for _r in range(nsites):
            rprefs = numpy.random.dirichlet([1] * N_AA)
            # Floor tiny preferences, then renormalize so they sum to one.
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
        kappa = 4.2
        omega = 0.4
        beta = 1.5
        mu = 0.3
        omega2 = 1.2
        # NOTE(review): `random.sample` is re-drawn for every site, so the
        # number of ones is only ~20 on average rather than exactly 20.
        # Drawing once would change how much of the seeded `random` stream
        # is consumed before the simulation, which may shift the seeded
        # results this test relies on -- left unchanged; TODO confirm and
        # fix together with re-validating the tolerances below.
        deltar = numpy.array([1 if x in random.sample(range(nsites), 20)
                              else 0 for x in range(nsites)])
        if self.MODEL == phydmslib.models.ExpCM_empirical_phi_divpressure:
            g = numpy.random.dirichlet([7] * N_NT)
            model = (phydmslib.models
                     .ExpCM_empirical_phi_divpressure(prefs, g, deltar,
                                                      kappa=kappa, omega=omega,
                                                      beta=beta, mu=mu,
                                                      freeparams=['mu'],
                                                      omega2=omega2))
        else:
            # Report the offending model itself; `type()` of a class is
            # just `type` and says nothing useful.
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # make a test tree
        # tree is two sequences separated by a single branch
        # the units are in sub/site
        t = 0.04
        newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
        temptree = '_temp.tree'
        alignment = '{0}_simulatedalignment.fasta'.format(alignmentPrefix)
        try:
            with open(temptree, 'w') as f:
                f.write(newicktree)

            # simulate the alignment
            phydmslib.simulate.simulateAlignment(model, temptree,
                                                 alignmentPrefix)

            # read the test tree and the simulated sequences back in
            biotree = Bio.Phylo.read(temptree, 'newick')
            a = [(s.description, str(s.seq)) for s in Bio.SeqIO.parse(
                    alignment, 'fasta')]
        finally:
            # Remove the temporary files even when simulation or parsing
            # fails, so a broken run does not pollute the working directory.
            for f in (temptree, alignment):
                if os.path.isfile(f):
                    os.remove(f)

        # re-scale the branch lengths by the model's branch scale
        for node in biotree.get_terminals() + biotree.get_nonterminals():
            if node.branch_length:
                node.branch_length /= model.branchScale

        # check that the simulated alignment has the expected number of
        # substitutions per site (estimated from the Hamming distance)
        assert len(a[0][1]) == len(a[1][1]) == nsites * 3
        nsubs = sum(nt1 != nt2 for nt1, nt2 in zip(a[0][1], a[1][1]))
        nsubs /= float(nsites)

        # distance inferred by `TreeLikelihood`
        tl = phydmslib.treelikelihood.TreeLikelihood(biotree, a, model)
        tl.maximizeLikelihood()
        treedist = sum((n.branch_length for n in tl.tree.get_terminals()))

        # We expect nsubs = t, but build in some tolerance
        # with rtol since we simulated finite number of sites.
        self.assertTrue(numpy.allclose(nsubs, t, rtol=0.2),
                        ("Simulated subs per site of {0} is not close "
                         "to expected value of {1} (branchScale = {2}, "
                         "t = {3})").format(nsubs, t, model.branchScale, t))
        self.assertTrue(numpy.allclose(treedist, nsubs, rtol=0.2), (
                "Simulated subs per site of {0} is not close to inferred "
                "branch length of {1}").format(nsubs, treedist))