示例#1
0
    def bootstrap(self):
        """Return a new Data object holding a non-parametric bootstrap of self.

        Data partitions are respected: if your data has a charpartition,
        the bootstrap has the same charpartition, and the sites of each
        subset are resampled only from that same subset.

        Random numbers come from the GSL random number generator.  Its
        state is kept in ``var.gsl_rng``, which is None by default.  This
        method uses ``var.gsl_rng`` if it already exists; otherwise it
        makes it and seeds it from the current time, which should give
        you lots of variation between runs.

        To repeat an earlier series of bootstraps exactly, reseed the
        generator with the same seed before you start, like this::

            if not var.gsl_rng:
                var.gsl_rng = pf.gsl_rng_get()
            # unusually, set the seed
            mySeed = 23    # your chosen int seed
            pf.gsl_rng_set(var.gsl_rng, mySeed)

        Returns:
            Data: a new object made from deep copies of the alignments of
            self, with its sequences reset from the bootstrapped parts.
        """

        import copy

        # Work on deep copies, so the alignments of self are left alone.
        copiedAlignments = copy.deepcopy(self.alignments)
        for copiedAlig in copiedAlignments:
            # The copies should not carry the cPart values of the
            # originals, but the originals must not be freed either, so
            # cPart is cleared on every copied part before the parts list
            # is dropped and replaced with an empty one.
            for copiedPart in copiedAlig.parts:
                copiedPart.cPart = None
            del copiedAlig.parts
            copiedAlig.parts = []

        bootData = Data([])
        bootData._fill(copiedAlignments)

        # self.cData and bootData.cData are both handed to
        # pf.bootstrapData() below, so make sure each has been set up.
        if not self.cData:
            self._setCStuff()
        bootData._setCStuff()

        # Debugging aid, normally switched off: dump both objects and stop.
        if 0:
            print("\nSELF\n====")
            self.dump()
            print("\n\nNEW DATA\n========")
            bootData.dump()
            raise P4Error

        # Only a generator made here gets seeded (from the clock); one
        # that already exists is used in whatever state it is in.
        if not var.gsl_rng:
            var.gsl_rng = pf.gsl_rng_get()
            pf.gsl_rng_set(var.gsl_rng, int(time.time()))

        pf.bootstrapData(self.cData, bootData.cData, var.gsl_rng)

        # Data.resetSequencesFromParts() uses
        # Alignment.resetSequencesFromParts(), which uses
        # partSeq = pf.symbolSequences(self.parts[i].cPart)
        # which uses thePart->sequences
        bootData.resetSequencesFromParts()
        return bootData
示例#2
0
    def bootstrap(self, seed=None):
        """Returns a new data object, filled with bootstrapped data.

        It is a non-parametric bootstrap.  Data partitions are handled
        properly, that is if your data has a charpartition, the
        bootstrap has the same charpartition, and sites are sampled
        only from the appropriate charpartition subset.

        Random numbers come from the GSL random number generator held
        in ``var.gsl_rng``.  The generator is only seeded by the call
        that creates it.  If ``var.gsl_rng`` already exists when this
        method is called, it is used in its current state and *seed*
        is ignored.

        Args:
            seed: None by default.  Anything that ``int()`` can convert.
                Used to seed a newly made ``var.gsl_rng``.  If it is
                None the process id is used as the seed.  If it cannot
                be converted to an int, a message is printed and the
                process id is used instead.

        Returns:
            Data: a new object made from deep copies of the alignments of
            self, with its sequences reset from the bootstrapped parts.
        """

        gm = ['Data.bootstrap()']

        import copy
        aligListCopy = copy.deepcopy(self.alignments)
        for alig in aligListCopy:
            # We do not want the cPart's, but neither do we want to free the
            # originals.
            for p in alig.parts:
                p.cPart = None
            del alig.parts
            alig.parts = []

        d = Data([])
        d._fill(aligListCopy)

        if not self.cData:
            self._setCStuff()
        d._setCStuff()

        # Debugging aid, normally switched off: dump both objects and stop.
        if 0:
            print("\nSELF\n====")
            self.dump()
            print("\n\nNEW DATA\n========")
            d.dump()
            raise P4Error

        # Set the GSL random number generator seed only if the generator
        # is made here; an existing generator keeps its state.
        if not var.gsl_rng:
            var.gsl_rng = pf.get_gsl_rng()

            newSeed = None
            if seed is not None:
                # Only the conversion is guarded.  int() raises TypeError
                # for things like lists, ValueError for bad strings and
                # NaN, and OverflowError for infinities; all of those
                # mean the seed is unusable, so fall back to the pid.
                try:
                    newSeed = int(seed)
                except (TypeError, ValueError, OverflowError):
                    print(gm[0])
                    print("    The seed should be convertable to an integer")
                    print("    Using the process id instead.")
            if newSeed is None:
                newSeed = os.getpid()
            pf.gsl_rng_set(var.gsl_rng, newSeed)

        pf.bootstrapData(self.cData, d.cData, var.gsl_rng)

        # Data.resetSequencesFromParts() uses
        # Alignment.resetSequencesFromParts(), which uses
        # partSeq = pf.symbolSequences(self.parts[i].cPart)
        # which uses thePart->sequences

        d.resetSequencesFromParts()
        return d
示例#3
0
    def simulate(self,
                 calculatePatterns=True,
                 resetSequences=True,
                 resetNexusSetsConstantMask=True,
                 refTree=None):
        """Simulate into the attached data.

        The tree self needs to have a data and model attached.

        Generation of random numbers uses the GSL random number
        generator.  The state is held in ``var.gsl_rng``, which is None
        by default.  If you do a simulation using this method, it will
        use ``var.gsl_rng`` if it exists, or make it if it does not exist
        yet.  When it makes it, it seeds the state based on the
        current time.  That should give you lots of variation in the
        simulations.

        If on the other hand you want to make simulations that are the
        same you can reseed the randomizer with the same seed whenever
        you do it, like this::

            if not var.gsl_rng:
                var.gsl_rng = pf.gsl_rng_get()
            # unusually, set the seed with each simulation
            mySeed = 23    # your chosen int seed
            pf.gsl_rng_set(var.gsl_rng, mySeed)

        The usual way to simulate does not use reference data.  An unusual way to
        simulate comes from (inspired by?) PhyloBayes, where the simulation is
        conditional on the original data.  It uses conditional likelihoods of
        that reference data at the root.  To turn that on, set refTree to the
        tree+model+data that you would like to use.  Calculate a likelihood with
        that refTree before using it, so that conditional likelihoods are set.
        The tree and model for refTree should be identical to the tree and model
        for self.

        Args:

            calculatePatterns (bool): True by default. Whether to "compress" the
                newly simulated data to facilitate a faster likelihood
                calculation.

            resetSequences (bool): True by default. Whether to bring the
                simulated sequences in C back into Python.

            resetNexusSetsConstantMask (bool): True by default.  When
                simulations are made, the constant mask in any associated nexus
                sets will get out of sync.  Setting this to True makes a new
                mask and sets it.  It needs resetSequences to be on as well,
                so if you turn resetSequences off you need to turn this off
                too.

            refTree (Tree): None by default.  If supplied, a tree+model+data
                which has had its likelihood calculated, where the tree+model is
                identical to self.  If it has no cTree yet, its likelihood is
                calculated here.

        Raises:
            P4Error: if resetNexusSetsConstantMask is set but resetSequences
                is not.  This is checked first, before anything is simulated.
            AssertionError: if refTree fails one of the sanity checks.

        """

        # Refuse the unusable flag combination up front, before the
        # simulation has had a chance to overwrite the attached data.
        if resetNexusSetsConstantMask and not resetSequences:
            gm = ['Tree.simulate().']
            gm.append(
                "resetSequences is not set, but resetNexusSetsConstantMask is set,"
            )
            gm.append("which is probably not going to work as you want.")
            raise P4Error(gm)

        if refTree:
            from p4.tree import Tree
            assert isinstance(refTree, Tree)
            assert refTree.model
            assert refTree.data
            if not refTree.cTree:
                refTree.calcLogLike(verbose=False)
            assert refTree.model.cModel
            assert refTree.data.cData

        # Only a generator made here gets seeded (from the clock); one
        # that already exists is used in whatever state it is in.
        if not var.gsl_rng:
            var.gsl_rng = pf.gsl_rng_get()
            pf.gsl_rng_set(var.gsl_rng, int(time.time()))

        self._commonCStuff()
        if refTree:
            # These checks come after _commonCStuff() and compare the
            # reference data with the data attached to self.
            assert refTree.data.cData != self.data.cData
            assert refTree.data.nParts == self.data.nParts
            assert refTree.data.nTax == self.data.nTax
            for i in range(self.data.nTax):
                assert refTree.data.taxNames[i] == self.data.taxNames[i]
            assert len(refTree.data.alignments) == len(self.data.alignments)
            assert refTree.logLike, "Do a likelihood calculation with the refTree before using it here."
            # could have some more checks ...

        # If there is a NexusSets object attached to any of the alignments
        # in the Data, the constant sites mask at least will become out of
        # sync.  We can't just delete the whole nexusSets object, as they
        # define what the parts are, so the mask is remade at the end.

        if refTree:
            pf.p4_simulate(self.cTree, refTree.cTree, var.gsl_rng)
        else:
            pf.p4_simulate(self.cTree, 0, var.gsl_rng)

        if calculatePatterns:
            for p in self.data.parts:
                pf.makePatterns(p.cPart)
                pf.setGlobalInvarSitesVec(p.cPart)

        if resetSequences:
            self.data.resetSequencesFromParts()
            if resetNexusSetsConstantMask:
                for a in self.data.alignments:
                    if a.nexusSets:
                        a.nexusSets.constant.mask = a.constantMask()