Example #1
class Sample:
    """Stores a single sample drawn from the model - the topics, clusters and each document being sampled over. Stores counts and parameters required to make them into distributions, rather than final distributions. Has clonning capability."""
    def __init__(self, state, calcNLL=True, priorsOnly=False):
        """Given a state this draws a sample from it, as a specific parametrisation of the model. Also a copy constructor, with a slight modification - if the priorsOnly flag is set it will only copy across the priors, and initialise to an empty model."""
        if isinstance(state, Sample):  # Code for clonning.
            self.alpha = state.alpha
            self.beta = state.beta.copy()
            self.gamma = state.gamma
            self.rho = state.rho
            self.mu = state.mu
            self.phi = state.phi.copy()

            if not priorsOnly:
                self.topicWord = state.topicWord.copy()
                self.topicUse = state.topicUse.copy()
            else:
                self.topicWord = numpy.zeros((0, state.topicWord.shape[1]),
                                             dtype=numpy.int32)
                self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = state.topicConc

            self.abnormTopicWord = state.abnormTopicWord.copy()
            self.abnorms = dict(state.abnorms)
            self.fia = FlagIndexArray(state.fia)

            if not priorsOnly:
                self.cluster = map(
                    lambda t: (t[0].copy(), t[1], t[2].copy(), t[3].copy()),
                    state.cluster)
                self.clusterUse = state.clusterUse.copy()
            else:
                self.cluster = []
                self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = state.clusterConc

            if not priorsOnly:
                self.doc = map(lambda ds: DocSample(ds), state.doc)
            else:
                self.doc = []
        else:  # Normal initialisation code.
            self.alpha = state.alpha
            self.beta = state.beta.copy()
            self.gamma = state.gamma
            self.rho = state.rho
            self.mu = state.mu
            self.phi = state.phi.copy()

            # Topic stuff...
            self.topicWord = state.topicWord.copy()
            self.topicUse = state.topicUse.copy()
            self.topicConc = state.topicConc

            # Abnormality stuff...
            self.abnormTopicWord = state.abnormTopicWord.copy()
            self.abnorms = dict(state.abnorms)
            self.fia = FlagIndexArray(state.fia)

            # Cluster stuff...
            self.cluster = map(
                lambda t: (t[0].copy(), t[1], t[2].copy(), t[3].copy()),
                state.cluster)
            self.clusterUse = state.clusterUse.copy()
            self.clusterConc = state.clusterConc

            # The details for each document...
            self.doc = []
            for d in xrange(len(state.doc)):
                self.doc.append(DocSample(state.doc[d]))

            # Second pass through documents to fill in the negative log likelihoods - need some data structures for this...
            if calcNLL:
                for d in xrange(len(state.doc)):
                    self.doc[d].calcNLL(state.doc[d], state)

    def merge(self, other):
        """Given a sample this merges it into this sample. Works under the assumption that the new sample was learnt with this sample as its only prior, and ends up as though both the prior and the sample were drawn whilst simultaneously being modeled. Trashes the given sample - do not continue to use."""

        # Update the old documents - there are potentially more behaviours in the new sample, which means adjusting the behaviour flags...
        if self.fia.getLength() != other.fia.getLength():
            for doc in self.doc:
                newBehFlags = numpy.zeros(other.fia.getLength(),
                                          dtype=numpy.uint8)
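                # Behaviour 0 is the normal behaviour and always exists, so it carries over directly.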
                newBehFlags[0] = doc.behFlags[0]

                for abnorm, index in self.abnorms.iteritems():
                    newIndex = other.abnorms[abnorm]
                    newBehFlags[newIndex] = doc.behFlags[index]

                doc.behFlags = newBehFlags

        # Replace the basic parameters...
        self.alpha = other.alpha
        self.beta = other.beta
        self.gamma = other.gamma
        self.rho = other.rho
        self.mu = other.mu
        self.phi = other.phi

        self.topicWord = other.topicWord
        self.topicUse = other.topicUse
        self.topicConc = other.topicConc

        self.abnormTopicWord = other.abnormTopicWord
        self.abnorms = other.abnorms
        self.fia = other.fia

        self.cluster = other.cluster
        self.clusterUse = other.clusterUse
        self.clusterConc = other.clusterConc

        # Add in the (presumably) new documents...
        for doc in other.doc:
            self.doc.append(doc)

    def getAlphaPrior(self):
        """Returns the PriorConcDP that was used for the alpha parameter, which is the concentration parameter for the DP in each document."""
        return self.alpha

    def getBeta(self):
        """Returns the beta prior, which is a vector representing a Dirichlet distribution from which the multinomials for each topic are drawn, from which words are drawn."""
        return self.beta

    def getGammaPrior(self):
        """Returns the PriorConcDP that was used for the gamma parameter, which is the concentration parameter for the global DP from which topics are drawn."""
        return self.gamma

    def getRhoPrior(self):
        """Returns the PriorConcDP that was used for the rho parameter, which is the concentration parameter for each specific clusters DP."""
        return self.rho

    def getMuPrior(self):
        """Returns the PriorConcDP that was used for the mu parameter, which is the concentration parameter for the DP from which clusters are drawn."""
        return self.mu

    def getPhi(self):
        """Returns the phi Dirichlet distribution prior on the behavioural multinomial for each cluster."""
        return self.phi

    def getTopicCount(self):
        """Returns the number of topics in the sample."""
        return self.topicWord.shape[0]

    def getWordCount(self):
        """Returns the number of words in the topic multinomial."""
        return self.topicWord.shape[1]

    def getTopicUseWeight(self, t):
        """Returns how many times the given topic has been instanced in a cluster."""
        return self.topicUse[t]

    def getTopicUseWeights(self):
        """Returns an array, indexed by topic id, that contains how many times each topic has been instanciated in a cluster. Do not edit the return value - copy it first."""
        return self.topicUse

    def getTopicConc(self):
        """Returns the sampled concentration parameter for drawing topic instances from the global DP."""
        return self.topicConc

    def getTopicWordCount(self, t):
        """Returns the number of samples assigned to each word for the given topic, as an integer numpy array. Do not edit the return value - make a copy first."""
        return self.topicWord[t, :]

    def getTopicWordCounts(self):
        """Returns the number of samples assigned to each word for all topics, indexed [topic, word], as an integer numpy array. Do not edit the return value - make a copy first."""
        return self.topicWord

    def getTopicMultinomial(self, t):
        """Returns the calculated multinomial for a given topic ident."""
        ret = self.beta.copy()
        ret += self.topicWord[t, :]
        ret /= ret.sum()
        return ret

    def getTopicMultinomials(self):
        """Returns the multinomials for all topics, in a single array - indexed by [topic, word] to give P(word|topic)."""
        ret = numpy.vstack([self.beta] * self.topicWord.shape[0])
        ret += self.topicWord
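        # Normalise each row to sum to one - the transposes make the per-row sums broadcast correctly.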
        ret = (ret.T / ret.sum(axis=1)).T
        return ret

    def getBehCount(self):
        """Returns the number of behaviours, which is the number of abnormalities plus 1, and the entry count for the indexing variable for abnormals in the relevant methods."""
        return self.abnormTopicWord.shape[0]

    def getAbnormWordCount(self, b):
        """Returns the number of samples assigned to each word for the given abnormal topic. Note that entry 0 equates to normal behaviour and is a dummy that should be ignored."""
        return self.abnormTopicWord[b, :]

    def getAbnormWordCounts(self):
        """Returns the number of samples assigned to each word in each abnormal behaviour. An integer 2D array indexed with [behaviour, word], noting that behaviour 0 is a dummy for normal behaviour. Do not edit the return value - make a copy first."""
        return self.abnormTopicWord

    def getAbnormMultinomial(self, b):
        """Returns the calculated multinomial for a given abnormal behaviour."""
        ret = self.beta.copy()
        ret += self.abnormTopicWord[b, :]
        ret /= ret.sum()
        return ret

    def getAbnormMultinomials(self):
        """Returns the multinomials for all abnormalities, in a single array - indexed by [behaviour, word] to give P(word|topic associated with behaviour). Entry 0 is a dummy to fill in for normal behaviour, and should be ignored."""
        ret = numpy.vstack([self.beta] * self.abnormTopicWord.shape[0])
        ret += self.abnormTopicWord
        ret = (ret.T / ret.sum(axis=1)).T
        return ret

    def getAbnormDict(self):
        """Returns a dictionary that takes each abnormalities user provided token to the behaviour index used for it. Allows the use of the getAbnorm* methods, amung other things."""
        return self.abnorms

    def getClusterCount(self):
        """Returns how many clusters there are."""
        return len(self.cluster)

    def getClusterDrawWeight(self, c):
        """Returns how many times the given cluster has been instanced by a document."""
        return self.clusterUse[c]

    def getClusterDrawWeights(self):
        """Returns an array, indexed by cluster id, that contains how many times each cluster has been instanciated by a document. Do not edit the return value - copy it first."""
        return self.clusterUse

    def getClusterDrawConc(self):
        """Returns the sampled concentration parameter for drawing cluster instances for documents."""
        return self.clusterConc

    def getClusterInstCount(self, c):
        """Returns how many instances of topics exist in the given cluster."""
        return self.cluster[c][0].shape[0]

    def getClusterInstWeight(self, c, ti):
        """Returns how many times the given cluster topic instance has been instanced by a documents DP."""
        return self.cluster[c][0][ti, 1]

    def getClusterInstTopic(self, c, ti):
        """Returns which topic the given cluster topic instance is an instance of."""
        return self.cluster[c][0][ti, 0]

    def getClusterInstDual(self, c):
        """Returns a 2D array, where the first dimension is indexed by the topic instance, and the second contains two columns - the first the topic index, the second the weight. Do not edit return value - copy before use."""
        return self.cluster[c][0]

    def getClusterInstConc(self, c):
        """Returns the sampled concentration that goes with the DP from which the members of each documents DP are drawn."""
        return self.cluster[c][1]

    def getClusterInstBehMN(self, c):
        """Returns the multinomial on drawing behaviours for the given cluster."""
        return self.cluster[c][2]

    def getClusterInstPriorBehMN(self, c):
        """Returns the prior on the behaviour multinomial, as an array of integer counts aligned with the flag set."""
        return self.cluster[c][3]

    def docCount(self):
        """Returns the number of documents stored within. Should be the same as the corpus from which the sample was drawn."""
        return len(self.doc)

    def getDoc(self, d):
        """Given a document index this returns the appropriate DocSample object. These indices should align up with the document indices in the Corpus from which this Sample was drawn, assuming no documents have been deleted."""
        return self.doc[d]

    def delDoc(self, ident):
        """Given a document ident this finds the document with the ident and removes it from the model, completly - i.e. all the variables in the sample are also updated. Primarilly used to remove documents for resampling prior to using the model as a prior. Note that this can potentially leave entities with no users - they get culled when the model is loaded into the C++ data structure so as to not cause problems."""
        # Find and remove it from the document list...
        index = None
        for i in xrange(len(self.doc)):
            if self.doc[i].getIdent() == ident:
                index = i
                break
        if index is None: return

        victim = self.doc[index]
        self.doc = self.doc[:index] + self.doc[index + 1:]

        # Update all the variables left behind by subtracting the relevant terms...
        cluster = self.cluster[victim.cluster]
        self.clusterUse[victim.cluster] -= 1

        ## First pass through the dp and remove its influence; at the same time note the arrays that need to be updated by each user when looping through...
        dp_ext = []
        for i in xrange(victim.dp.shape[0]):
            beh = victim.dp[i, 0]
            #count = victim.dp[i,2]

            if beh == 0:  # Normal behaviour
                cluInst = victim.dp[i, 1]

                # Update the instance, and topic use counts if necessary...
                topic = cluster[0][cluInst, 0]
                cluster[0][cluInst, 1] -= 1
                if cluster[0][cluInst, 1] == 0:
                    self.topicUse[topic] -= 1

                # Store the entity that needs updating in correspondence with this dp instance in the next step...
                dp_ext.append((self.topicWord, topic))

            else:  # Abnormal behaviour.
                # Store the entity that needs updating in correspondence with the dp...
                dp_ext.append((self.abnormTopicWord, beh))

        ## Go through the samples array and remove their influence - the hard part was done by the preceding step...
        for si in xrange(victim.samples.shape[0]):
            inst = victim.samples[si, 0]
            word = victim.samples[si, 1]
            mat, topic = dp_ext[inst]
            mat[topic, word] -= 1

        # Clean up all zeroed items...
        self.cleanZeros()

    def cleanZeros(self):
        """Goes through and removes anything that has a zero reference count, adjusting all indices accordingly."""

        # Remove the zeros from this object, noting the changes...

        ## Topics...
        newTopicCount = 0
        topicMap = dict()
        for t in xrange(self.topicUse.shape[0]):
            if self.topicUse[t] != 0:
                topicMap[t] = newTopicCount
                newTopicCount += 1

        if newTopicCount != self.topicUse.shape[0]:
            newTopicWord = numpy.zeros(
                (newTopicCount, self.topicWord.shape[1]), dtype=numpy.int32)
            newTopicUse = numpy.zeros(newTopicCount, dtype=numpy.int32)

            for origin, dest in topicMap.iteritems():
                newTopicWord[dest, :] = self.topicWord[origin, :]
                newTopicUse[dest] = self.topicUse[origin]

            self.topicWord = newTopicWord
            self.topicUse = newTopicUse

        ## Clusters...
        newClusterCount = 0
        clusterMap = dict()
        for c in xrange(self.clusterUse.shape[0]):
            if self.clusterUse[c] != 0:
                clusterMap[c] = newClusterCount
                newClusterCount += 1

        if newClusterCount != self.clusterUse.shape[0]:
            newCluster = [None] * newClusterCount
            newClusterUse = numpy.zeros(newClusterCount, dtype=numpy.int32)

            for origin, dest in clusterMap.iteritems():
                newCluster[dest] = self.cluster[origin]
                newClusterUse[dest] = self.clusterUse[origin]

            self.cluster = newCluster
            self.clusterUse = newClusterUse

        ## Cluster instances...
        # (Change is noted by a 2-tuple of (new length, dict), where dict maps old instance indices to new instance indices.)
        cluInstAdj = []
        for ci in xrange(len(self.cluster)):
            newInstCount = 0
            instMap = dict()
            for i in xrange(self.cluster[ci][0].shape[0]):
                if self.cluster[ci][0][i, 1] != 0:
                    instMap[i] = newInstCount
                    newInstCount += 1

            cluInstAdj.append((newInstCount, instMap))

            if newInstCount != self.cluster[ci][0].shape[0]:
                newInst = numpy.zeros((newInstCount, 2), dtype=numpy.int32)

                for origin, dest in instMap.iteritems():
                    newInst[dest, :] = self.cluster[ci][0][origin, :]

                self.cluster[ci] = (newInst, self.cluster[ci][1],
                                    self.cluster[ci][2], self.cluster[ci][3])

        # Iterate and update the topic indices of the cluster instances...
        for ci in xrange(len(self.cluster)):
            for i in xrange(self.cluster[ci][0].shape[0]):
                self.cluster[ci][0][i, 0] = topicMap[self.cluster[ci][0][i, 0]]

        # Now iterate the documents and update their cluster and cluster instance indices...
        for doc in self.doc:
            doc.cluster = clusterMap[doc.cluster]
            _, instMap = cluInstAdj[doc.cluster]

            for di in xrange(doc.dp.shape[0]):
                if doc.dp[di, 0] == 0:
                    doc.dp[di, 1] = instMap[doc.dp[di, 1]]

    def nllAllDocs(self):
        """Returns the negative log likelihood of all the documents in the sample - a reasonable value to compare various samples with."""
        return sum(map(lambda d: d.getNLL(), self.doc))

    def logNegProbWordsGivenClusterAbnorm(self,
                                          doc,
                                          cluster,
                                          particles=16,
                                          cap=-1):
        """Uses wallach's 'left to right' method to calculate the negative log probability of the words in the document given the rest of the model. Both the cluster (provided as an index) and the documents abnormalities vector are fixed for this calculation. Returns the average of the results for each sample contained within model. particles is the number of particles to use in the left to right estimation algorithm. This is implimented using scipy.weave."""
        return solvers.leftRightNegLogProbWord(self, doc, cluster, particles,
                                               cap)

    def logNegProbWordsGivenAbnorm(self, doc, particles=16, cap=-1):
        """Uses logNegProbWordsGivenClusterAbnorm and simply sums out the cluster variable."""

        # Get the negative log probability of the words conditioned on each existing cluster...
        cluScores = map(
            lambda c: solvers.leftRightNegLogProbWord(self, doc, c, particles,
                                                      cap),
            xrange(self.getClusterCount()))

        # Multiply each by the probability of the cluster, so it can be summed out...
        cluNorm = float(self.clusterUse.sum()) + self.clusterConc
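        # Chinese restaurant process weights: P(existing cluster c) = clusterUse[c]/cluNorm, P(new cluster) = clusterConc/cluNorm.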
        cluScores = map(
            lambda c, s: s - math.log(float(self.clusterUse[c]) / cluNorm),
            xrange(len(cluScores)), cluScores)

        # Also need to include the probability of a new cluster, even though it is likely to be a negligible contribution...
        newVal = solvers.leftRightNegLogProbWord(self, doc, -1, particles, cap)
        newVal -= math.log(self.clusterConc / cluNorm)
        cluScores.append(newVal)

        # Sum out the cluster variable, in a numerically stable way given that we are dealing with negative log likelihood values that will map to extremely low probabilities...
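        # Using the log-sum-exp identity with m = min(s_c):
        #   -log(sum_c exp(-s_c)) = m - log(sum_c exp(m - s_c))
        # so the largest exponent is zero and nothing underflows.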
        minScore = min(cluScores)
        cluPropProb = map(lambda s: math.exp(minScore - s), cluScores)
        return minScore - math.log(sum(cluPropProb))
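
A minimal usage sketch follows, assuming Python 2 (consistent with the xrange/iteritems calls above) and that a suitable `state` object plus the `numpy`, `math`, `solvers`, `DocSample` and `FlagIndexArray` dependencies are available from the surrounding library; how `state` is constructed is not shown here, so the snippet is illustrative only.

# Hypothetical usage - 'state' is assumed to come from elsewhere in the library.
sample = Sample(state)                      # Draw a full sample from the current state.

topics = sample.getTopicMultinomials()      # Indexed [topic, word], giving P(word|topic).
print topics.sum(axis=1)                    # Each row should sum to (approximately) 1.0.

print sample.getTopicCount(), sample.getClusterCount(), sample.docCount()
print sample.nllAllDocs()                   # Negative log likelihood of all documents.

prior = Sample(sample, priorsOnly=True)     # Clone only the priors, as an empty model.
sample.delDoc(sample.getDoc(0).getIdent())  # Remove a document and all of its counts.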