def simulate(self, calculatePatterns=True, resetSequences=True, resetNexusSetsConstantMask=True, refTree=None):
    """Simulate into the attached data.

    The tree self needs to have a data and model attached.

    This week, generation of random numbers uses the C language random
    function, which is in stdlib on Linux.  It will use the same series
    of random numbers over and over, unless you tell it otherwise.  That
    means that (unless you tell it otherwise) it will generate the same
    simulated data if you run it twice.  To reset the randomizer, you
    can use p4.func.reseedCRandomizer(), eg

        func.reseedCRandomizer(os.getpid())

    The usual way to simulate does not use reference data.  An unusual
    way to simulate comes from (inspired by?) PhyloBayes, where the
    simulation is conditional on the original data.  It uses conditional
    likelihoods of that reference data at the root.  To turn that on,
    set refTree to the tree+model+data that you would like to use.
    Calculate a likelihood with that refTree before using it, so that
    conditional likelihoods are set.  The tree and model for refTree
    should be identical to the tree and model for self.

    Args:
        calculatePatterns (bool): True by default. Whether to "compress"
            the newly simulated data to facilitate a faster likelihood
            calculation.

        resetSequences (bool): True by default. Whether to bring the
            simulated sequences in C back into Python.

        resetNexusSetsConstantMask (bool): True by default. When
            simulations are made, the constant mask in any associated
            nexus sets will get out of sync.  Setting this to True makes
            a new mask and sets it.

        refTree (Tree): None by default.  If supplied, a tree+model+data
            which has had its likelihood calculated, where the
            tree+model is identical to self.
    """

    if refTree:
        from tree import Tree
        assert isinstance(refTree, Tree)
        assert refTree.model
        assert refTree.data
        if not refTree.cTree:
            refTree.calcLogLike(verbose=False)
        assert refTree.model.cModel
        assert refTree.data.cData

    self._commonCStuff()

    if refTree:
        assert refTree.data.cData != self.data.cData
        assert refTree.data.nParts == self.data.nParts
        assert refTree.data.nTax == self.data.nTax
        for i in range(self.data.nTax):
            assert refTree.data.taxNames[i] == self.data.taxNames[i]
        assert len(refTree.data.alignments) == len(self.data.alignments)
        assert refTree.logLike, "Do a likelihood calculation with the refTree before using it here."
        # could have some more checks ...

    # If there is a NexusSets object attached to any of the alignments in
    # the Data, the constant sites mask at least will become out of sync,
    # but we can't just delete the whole nexusSets object, as they define
    # what the parts are.
    # for a in self.data.alignments:
    #     if a.nexusSets:
    #         a.nexusSets = None
    # Probably better to do something like this
    #     a.nexusSets.constant.mask = self.constantMask()
    # at the end.

    # print "About to pf.p4_simulate(self.cTree)"
    if refTree:
        pf.p4_simulate(self.cTree, refTree.cTree)
    else:
        pf.p4_simulate(self.cTree, 0)

    if calculatePatterns:
        for p in self.data.parts:
            pf.makePatterns(p.cPart)
            pf.setGlobalInvarSitesVec(p.cPart)

    if resetSequences:
        self.data.resetSequencesFromParts()
        if resetNexusSetsConstantMask:
            for a in self.data.alignments:
                if a.nexusSets:
                    a.nexusSets.constant.mask = a.constantMask()
    else:
        if resetNexusSetsConstantMask:
            gm = ['Tree.simulate().']
            gm.append("resetSequences is not set, but resetNexusSetsConstantMask is set,")
            gm.append("which is probably not going to work as you want.")
            raise P4Error(gm)
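# A minimal usage sketch for the method above (not from the original source;
# the read()/var.trees/Data() calls are assumed from the rest of p4 and the
# file names are hypothetical).  Without the reseed call the C randomizer
# repeats the same series, so repeated runs give identical simulated data.
#
#     read('myTree.nex')
#     t = var.trees[0]
#     read('myTemplate.nex')       # an alignment fixing nTax, nChar, dataType
#     d = Data()
#     t.data = d
#     # ... attach a model to t here (comps, rMatrix, gamma, pInvar) ...
#     func.reseedCRandomizer(os.getpid())   # otherwise the series repeats
#     t.simulate()
#     d.writeNexus('simData.nex')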
def _initParts(self):
    gm = ['Alignment._initParts()']

    if len(self.parts):
        for p in self.parts:
            del (p)
    self.parts = []

    if self.equates:
        eqSymb = list(self.equates.keys())
        eqSymb.sort()
        eqSymb = ''.join(eqSymb)
    else:
        eqSymb = ''

    if len(self.sequences) and self.length and self.symbols and self.dim:
        pass
    else:
        gm.append("Can't allocate part.")
        if not len(self.sequences):
            gm.append("-no sequences.")
        elif not self.length:
            gm.append("-the sequences have no length")
        elif not self.symbols:
            gm.append("-no symbols")
        elif not self.dim:
            gm.append("-dim not set")
        raise P4Error(gm)

    # its all one part
    if not self.nexusSets or not self.nexusSets.charPartition:
        aPart = Part()
        aPart.alignment = self
        aPart.name = 'all'
        aPart.lowName = 'all'
        aPart.dataType = self.dataType
        aPart.dim = self.dim
        aPart.symbols = self.symbols
        aPart.equates = self.equates
        aPart.nTax = len(self.sequences)
        aPart.nChar = self.length
        assert aPart.nChar
        if 0:
            print(gm[0])
            print(" symbols=%s" % self.symbols)
        aPart.cPart = pf.newPart(len(self.sequences), self.length, eqSymb, self.symbols)
        if not aPart or not aPart.cPart:
            gm.append("Failed to get memory for part.")
            raise P4Error(gm)

        # Make the equates table
        verbose = 0
        equatesTable = []
        if verbose:
            print("equates is %s" % self.equates)
            print("eqSymb is %s" % eqSymb)  # the keys
            print("symbols is %s" % self.symbols)
        for i in range(len(eqSymb)):
            if verbose:
                print("%3s: " % eqSymb[i], end=' ')
            e = self.equates[eqSymb[i]]
            if verbose:
                print("%8s : " % e, end=' ')
            for s in self.symbols:
                if s in e:
                    if verbose:
                        print("%1i" % 1, end=' ')
                    equatesTable.append('1')
                else:
                    if verbose:
                        print("%1i" % 0, end=' ')
                    equatesTable.append('0')
            if verbose:
                print('')
        equatesTable = ''.join(equatesTable)
        if verbose:
            print("\n\nequatesTable:")
            print(equatesTable)
        pf.pokeEquatesTable(aPart.cPart, equatesTable)

        sList = []
        for s in self.sequences:
            sList.append(s.sequence)
        if 0:
            print(gm[0])
            print("sList = %s" % sList)
            print("joined = %s" % ''.join(sList))
        pf.pokeSequences(aPart.cPart, ''.join(sList))
        # print "about to makePatterns ..."
        pf.makePatterns(aPart.cPart)
        # print "about to setInvar"
        pf.setGlobalInvarSitesVec(aPart.cPart)
        # pf.dumpPart(aPart.cPart)
        self.parts.append(aPart)

    elif self.nexusSets.charPartition:
        for cpp in self.nexusSets.charPartition.subsets:
            # print "Doing subset '%s', mask: %s" % (cpp.name, cpp.mask)
            # print "About to subsetUsingMask (self length is %i)" % self.length
            b = self.subsetUsingMask(cpp.mask)
            # This very method, but now there are no charPartitions in b.
            b._initParts()
            b.parts[0].name = cpp.name
            b.parts[0].lowName = cpp.name.lower()
            self.parts.append(b.parts[0])
            b.parts = []  # so we don't try free-ing it twice
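# A small standalone sketch of the equates table built above (hypothetical
# values, not from the original source): one row of 0/1 flags per equate
# symbol, in sorted order of the equate keys, concatenated into one string.
#
#     symbols = 'acgt'
#     equates = {'r': 'ag', 'y': 'ct', 'n': 'acgt'}   # hypothetical subset
#     eqSymb = ''.join(sorted(equates))               # 'nry'
#     table = ''.join('1' if s in equates[q] else '0'
#                     for q in eqSymb for s in symbols)
#     # table == '111110100101', the form poked in with pf.pokeEquatesTable()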
def resetPartsContentFromSequences(self):
    """Reset Part.cPart sequences from self.sequences.

    It then makes patterns, and sets the global invariant sites array.
    """

    gm = ['Alignment.resetPartsContentFromSequences()']

    if len(self.parts) == 1:
        # it's all one part
        aPart = self.parts[0]
        if not var.doDataPart:
            sList = []
            for s in self.sequences:
                sList.append(s.sequence)
            pf.pokeSequences(aPart.cPart, ''.join(sList))
            # are the following necessary?
            pf.makePatterns(aPart.cPart)
            pf.setGlobalInvarSitesVec(aPart.cPart)
        else:
            for sNum in range(len(self.sequences)):
                s = self.sequences[sNum]
                for cNum in range(self.length):
                    theChar = s.sequence[cNum]
                    if theChar == '-':
                        aPart.seq[sNum, cNum] = var.GAP_CODE
                    elif theChar == '?':
                        aPart.seq[sNum, cNum] = var.QMARK_CODE
                    elif aPart.equateSymbols and theChar in aPart.equateSymbols:
                        aPart.seq[sNum, cNum] = var.EQUATES_BASE + \
                            aPart.equateSymbols.index(theChar)
                    else:
                        aPart.seq[sNum, cNum] = aPart.symbols.index(theChar)

    elif len(self.parts) > 1:
        # the number of parts is also the length of the subsets list
        if self.nexusSets and self.nexusSets.charPartition and \
                self.nexusSets.charPartition.subsets and \
                len(self.nexusSets.charPartition.subsets) == len(self.parts):
            pass
        else:
            gm.append('Something is wrong with the nexusSets or its charPartition')
            raise P4Error(gm)
        for i in range(len(self.parts)):
            cpSubset = self.nexusSets.charPartition.subsets[i]
            aPart = self.parts[i]
            b = self.subsetUsingMask(cpSubset.mask)
            if not var.doDataPart:
                sList = []
                for s in b.sequences:
                    sList.append(s.sequence)
                pf.pokeSequences(aPart.cPart, ''.join(sList))
                # are the following necessary?
                pf.makePatterns(aPart.cPart)
                pf.setGlobalInvarSitesVec(aPart.cPart)
            else:
                for sNum in range(len(b.sequences)):
                    s = b.sequences[sNum]
                    for cNum in range(b.length):
                        theChar = s.sequence[cNum]
                        if theChar == '-':
                            aPart.seq[sNum, cNum] = var.GAP_CODE
                        elif theChar == '?':
                            aPart.seq[sNum, cNum] = var.QMARK_CODE
                        elif aPart.equateSymbols and theChar in aPart.equateSymbols:
                            aPart.seq[sNum, cNum] = var.EQUATES_BASE + \
                                aPart.equateSymbols.index(theChar)
                        else:
                            aPart.seq[sNum, cNum] = aPart.symbols.index(theChar)

    else:
        gm.append("No parts.")
        raise P4Error(gm)
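# A usage sketch for the method above (not from the original source; the
# alignment variable and the particular edit are hypothetical).  If the
# Python-side sequences are edited after the C parts have been allocated,
# this call pushes the new residues back into Part.cPart so that later
# likelihood calculations see the edited data.
#
#     a = var.alignments[0]
#     a.sequences[0].sequence = a.sequences[0].sequence.replace('a', 'n')
#     a.resetPartsContentFromSequences()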
def simulate(self, calculatePatterns=True, resetSequences=True, resetNexusSetsConstantMask=True, refTree=None):
    """Simulate into the attached data.

    The tree self needs to have a data and model attached.

    Generation of random numbers uses the GSL random number generator.
    The state is held in var.gsl_rng, which is None by default.  If you
    do a simulation using this method, it will use ``var.gsl_rng`` if it
    exists, or make it if it does not exist yet.  When it makes it, it
    seeds the state based on the current time.  That should give you
    lots of variation in the simulations.

    If on the other hand you want to make simulations that are the same
    you can reseed the randomizer with the same seed whenever you do it,
    like this::

        if not var.gsl_rng:
            var.gsl_rng = pf.gsl_rng_get()
        # unusually, set the seed with each simulation
        mySeed = 23   # your chosen int seed
        pf.gsl_rng_set(var.gsl_rng, mySeed)

    The usual way to simulate does not use reference data.  An unusual
    way to simulate comes from (inspired by?) PhyloBayes, where the
    simulation is conditional on the original data.  It uses conditional
    likelihoods of that reference data at the root.  To turn that on,
    set refTree to the tree+model+data that you would like to use.
    Calculate a likelihood with that refTree before using it, so that
    conditional likelihoods are set.  The tree and model for refTree
    should be identical to the tree and model for self.

    Args:
        calculatePatterns (bool): True by default. Whether to "compress"
            the newly simulated data to facilitate a faster likelihood
            calculation.

        resetSequences (bool): True by default. Whether to bring the
            simulated sequences in C back into Python.

        resetNexusSetsConstantMask (bool): True by default. When
            simulations are made, the constant mask in any associated
            nexus sets will get out of sync.  Setting this to True makes
            a new mask and sets it.

        refTree (Tree): None by default.  If supplied, a tree+model+data
            which has had its likelihood calculated, where the
            tree+model is identical to self.
    """

    if refTree:
        from p4.tree import Tree
        assert isinstance(refTree, Tree)
        assert refTree.model
        assert refTree.data
        if not refTree.cTree:
            refTree.calcLogLike(verbose=False)
        assert refTree.model.cModel
        assert refTree.data.cData

    if not var.gsl_rng:
        var.gsl_rng = pf.gsl_rng_get()
        pf.gsl_rng_set(var.gsl_rng, int(time.time()))

    self._commonCStuff()

    if refTree:
        assert refTree.data.cData != self.data.cData
        assert refTree.data.nParts == self.data.nParts
        assert refTree.data.nTax == self.data.nTax
        for i in range(self.data.nTax):
            assert refTree.data.taxNames[i] == self.data.taxNames[i]
        assert len(refTree.data.alignments) == len(self.data.alignments)
        assert refTree.logLike, "Do a likelihood calculation with the refTree before using it here."
        # could have some more checks ...

    # If there is a NexusSets object attached to any of the alignments in
    # the Data, the constant sites mask at least will become out of sync,
    # but we can't just delete the whole nexusSets object, as they define
    # what the parts are.
    # for a in self.data.alignments:
    #     if a.nexusSets:
    #         a.nexusSets = None
    # Probably better to do something like this
    #     a.nexusSets.constant.mask = self.constantMask()
    # at the end.

    # print "About to pf.p4_simulate(self.cTree)"
    if refTree:
        pf.p4_simulate(self.cTree, refTree.cTree, var.gsl_rng)
    else:
        pf.p4_simulate(self.cTree, 0, var.gsl_rng)

    if calculatePatterns:
        for p in self.data.parts:
            pf.makePatterns(p.cPart)
            pf.setGlobalInvarSitesVec(p.cPart)

    if resetSequences:
        self.data.resetSequencesFromParts()
        if resetNexusSetsConstantMask:
            for a in self.data.alignments:
                if a.nexusSets:
                    a.nexusSets.constant.mask = a.constantMask()
    else:
        if resetNexusSetsConstantMask:
            gm = ['Tree.simulate().']
            gm.append("resetSequences is not set, but resetNexusSetsConstantMask is set,")
            gm.append("which is probably not going to work as you want.")
            raise P4Error(gm)