Example #1
class Sample:
    """Stores a single sample drawn from the model - the topics, clusters and each document being sampled over. Stores counts and parameters required to make them into distributions, rather than final distributions. Has clonning capability."""
    def __init__(self, state, calcNLL=True, priorsOnly=False):
        """Given a state this draws a sample from it, as a specific parametrisation of the model. Also a copy constructor, with a slight modification - if the priorsOnly flag is set it will only copy across the priors, and initialise to an empty model."""
        if isinstance(state, Sample):  # Code for clonning.
            self.alpha = state.alpha
            self.beta = state.beta.copy()
            self.gamma = state.gamma
            self.rho = state.rho
            self.mu = state.mu
            self.phi = state.phi.copy()

            if not priorsOnly:
                self.topicWord = state.topicWord.copy()
                self.topicUse = state.topicUse.copy()
            else:
                self.topicWord = numpy.zeros((0, state.topicWord.shape[1]),
                                             dtype=numpy.int32)
                self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = state.topicConc

            self.abnormTopicWord = state.abnormTopicWord.copy()
            self.abnorms = dict(state.abnorms)
            self.fia = FlagIndexArray(state.fia)

            if not priorsOnly:
                self.cluster = map(
                    lambda t: (t[0].copy(), t[1], t[2].copy(), t[3].copy()),
                    state.cluster)
                self.clusterUse = state.clusterUse.copy()
            else:
                self.cluster = []
                self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = state.clusterConc

            if not priorsOnly:
                self.doc = map(lambda ds: DocSample(ds), state.doc)
            else:
                self.doc = []
        else:  # Normal initialisation code.
            self.alpha = state.alpha
            self.beta = state.beta.copy()
            self.gamma = state.gamma
            self.rho = state.rho
            self.mu = state.mu
            self.phi = state.phi.copy()

            # Topic stuff...
            self.topicWord = state.topicWord.copy()
            self.topicUse = state.topicUse.copy()
            self.topicConc = state.topicConc

            # Abnormality stuff...
            self.abnormTopicWord = state.abnormTopicWord.copy()
            self.abnorms = dict(state.abnorms)
            self.fia = FlagIndexArray(state.fia)

            # Cluster stuff...
            self.cluster = map(
                lambda t: (t[0].copy(), t[1], t[2].copy(), t[3].copy()),
                state.cluster)
            self.clusterUse = state.clusterUse.copy()
            self.clusterConc = state.clusterConc

            # The details for each document...
            self.doc = []
            for d in xrange(len(state.doc)):
                self.doc.append(DocSample(state.doc[d]))

            # Second pass through documents to fill in the negative log likelihoods - need some data structures for this...
            if calcNLL:
                for d in xrange(len(state.doc)):
                    self.doc[d].calcNLL(state.doc[d], state)

    def merge(self, other):
        """Given a sample this merges it into this sample. Works under the assumption that the new sample was learnt with this sample as its only prior, and ends up as though both the prior and the sample were drawn whilst simultaneously being modeled. Trashes the given sample - do not continue to use."""

        # Update the old documents - there are potentially more behaviours in the new sample, which means adjusting the behaviour flags...
        if self.fia.getLength() != other.fia.getLength():
            for doc in self.doc:
                newBehFlags = numpy.zeros(other.fia.getLength(),
                                          dtype=numpy.uint8)
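                # Behaviour 0 is the normal behaviour and always exists, so it carries over directly.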
                newBehFlags[0] = doc.behFlags[0]

                for abnorm, index in self.abnorms.iteritems():
                    newIndex = other.abnorms[abnorm]
                    newBehFlags[newIndex] = doc.behFlags[index]

                doc.behFlags = newBehFlags

        # Replace the basic parameters...
        self.alpha = other.alpha
        self.beta = other.beta
        self.gamma = other.gamma
        self.rho = other.rho
        self.mu = other.mu
        self.phi = other.phi

        self.topicWord = other.topicWord
        self.topicUse = other.topicUse
        self.topicConc = other.topicConc

        self.abnormTopicWord = other.abnormTopicWord
        self.abnorms = other.abnorms
        self.fia = other.fia

        self.cluster = other.cluster
        self.clusterUse = other.clusterUse
        self.clusterConc = other.clusterConc

        # Add in the (presumably) new documents...
        for doc in other.doc:
            self.doc.append(doc)

    def getAlphaPrior(self):
        """Returns the PriorConcDP that was used for the alpha parameter, which is the concentration parameter for the DP in each document."""
        return self.alpha

    def getBeta(self):
        """Returns the beta prior, which is a vector representing a Dirichlet distribution from which the multinomials for each topic are drawn, from which words are drawn."""
        return self.beta

    def getGammaPrior(self):
        """Returns the PriorConcDP that was used for the gamma parameter, which is the concentration parameter for the global DP from which topics are drawn."""
        return self.gamma

    def getRhoPrior(self):
        """Returns the PriorConcDP that was used for the rho parameter, which is the concentration parameter for each specific clusters DP."""
        return self.rho

    def getMuPrior(self):
        """Returns the PriorConcDP that was used for the mu parameter, which is the concentration parameter for the DP from which clusters are drawn."""
        return self.mu

    def getPhi(self):
        """Returns the phi Dirichlet distribution prior on the behavioural multinomial for each cluster."""
        return self.phi

    def getTopicCount(self):
        """Returns the number of topics in the sample."""
        return self.topicWord.shape[0]

    def getWordCount(self):
        """Returns the number of words in the topic multinomial."""
        return self.topicWord.shape[1]

    def getTopicUseWeight(self, t):
        """Returns how many times the given topic has been instanced in a cluster."""
        return self.topicUse[t]

    def getTopicUseWeights(self):
        """Returns an array, indexed by topic id, that contains how many times each topic has been instanciated in a cluster. Do not edit the return value - copy it first."""
        return self.topicUse

    def getTopicConc(self):
        """Returns the sampled concentration parameter for drawing topic instances from the global DP."""
        return self.topicConc

    def getTopicWordCount(self, t):
        """Returns the number of samples assigned to each word for the given topic, as an integer numpy array. Do not edit the return value - make a copy first."""
        return self.topicWord[t, :]

    def getTopicWordCounts(self):
        """Returns the number of samples assigned to each word for all topics, indexed [topic, word], as an integer numpy array. Do not edit the return value - make a copy first."""
        return self.topicWord

    def getTopicMultinomial(self, t):
        """Returns the calculated multinomial for a given topic ident."""
        ret = self.beta.copy()
        ret += self.topicWord[t, :]
        ret /= ret.sum()
        return ret

    def getTopicMultinomials(self):
        """Returns the multinomials for all topics, in a single array - indexed by [topic, word] to give P(word|topic)."""
        ret = numpy.vstack([self.beta] * self.topicWord.shape[0])
        ret += self.topicWord
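        # Normalise each row to sum to one - the transposes make the per-row sums broadcast correctly.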
        ret = (ret.T / ret.sum(axis=1)).T
        return ret

    def getBehCount(self):
        """Returns the number of behaviours, which is the number of abnormalities plus 1, and the entry count for the indexing variable for abnormals in the relevant methods."""
        return self.abnormTopicWord.shape[0]

    def getAbnormWordCount(self, b):
        """Returns the number of samples assigned to each word for the given abnormal topic. Note that entry 0 equates to normal behaviour and is a dummy that should be ignored."""
        return self.abnormTopicWord[b, :]

    def getAbnormWordCounts(self):
        """Returns the number of samples assigned to each word in each abnormal behaviour. An integer 2D array indexed with [behaviour, word], noting that behaviour 0 is a dummy for normal behaviour. Do not edit the return value - make a copy first."""
        return self.abnormTopicWord

    def getAbnormMultinomial(self, b):
        """Returns the calculated multinomial for a given abnormal behaviour."""
        ret = self.beta.copy()
        ret += self.abnormTopicWord[b, :]
        ret /= ret.sum()
        return ret

    def getAbnormMultinomials(self):
        """Returns the multinomials for all abnormalities, in a single array - indexed by [behaviour, word] to give P(word|topic associated with behaviour). Entry 0 is a dummy to fill in for normal behaviour, and should be ignored."""
        ret = numpy.vstack([self.beta] * self.abnormTopicWord.shape[0])
        ret += self.abnormTopicWord
        ret = (ret.T / ret.sum(axis=1)).T
        return ret

    def getAbnormDict(self):
        """Returns a dictionary that takes each abnormalities user provided token to the behaviour index used for it. Allows the use of the getAbnorm* methods, amung other things."""
        return self.abnorms

    def getClusterCount(self):
        """Returns how many clusters there are."""
        return len(self.cluster)

    def getClusterDrawWeight(self, c):
        """Returns how many times the given cluster has been instanced by a document."""
        return self.clusterUse[c]

    def getClusterDrawWeights(self):
        """Returns an array, indexed by cluster id, that contains how many times each cluster has been instanciated by a document. Do not edit the return value - copy it first."""
        return self.clusterUse

    def getClusterDrawConc(self):
        """Returns the sampled concentration parameter for drawing cluster instances for documents."""
        return self.clusterConc

    def getClusterInstCount(self, c):
        """Returns how many instances of topics exist in the given cluster."""
        return self.cluster[c][0].shape[0]

    def getClusterInstWeight(self, c, ti):
        """Returns how many times the given cluster topic instance has been instanced by a documents DP."""
        return self.cluster[c][0][ti, 1]

    def getClusterInstTopic(self, c, ti):
        """Returns which topic the given cluster topic instance is an instance of."""
        return self.cluster[c][0][ti, 0]

    def getClusterInstDual(self, c):
        """Returns a 2D array, where the first dimension is indexed by the topic instance, and the second contains two columns - the first the topic index, the second the weight. Do not edit return value - copy before use."""
        return self.cluster[c][0]

    def getClusterInstConc(self, c):
        """Returns the sampled concentration that goes with the DP from which the members of each documents DP are drawn."""
        return self.cluster[c][1]

    def getClusterInstBehMN(self, c):
        """Returns the multinomial on drawing behaviours for the given cluster."""
        return self.cluster[c][2]

    def getClusterInstPriorBehMN(self, c):
        """Returns the prior on the behaviour multinomial, as an array of integer counts aligned with the flag set."""
        return self.cluster[c][3]

    def docCount(self):
        """Returns the number of documents stored within. Should be the same as the corpus from which the sample was drawn."""
        return len(self.doc)

    def getDoc(self, d):
        """Given a document index this returns the appropriate DocSample object. These indices should align up with the document indices in the Corpus from which this Sample was drawn, assuming no documents have been deleted."""
        return self.doc[d]

    def delDoc(self, ident):
        """Given a document ident this finds the document with the ident and removes it from the model, completly - i.e. all the variables in the sample are also updated. Primarilly used to remove documents for resampling prior to using the model as a prior. Note that this can potentially leave entities with no users - they get culled when the model is loaded into the C++ data structure so as to not cause problems."""
        # Find and remove it from the document list...
        index = None
        for i in xrange(len(self.doc)):
            if self.doc[i].getIdent() == ident:
                index = i
                break
        if index is None: return

        victim = self.doc[index]
        self.doc = self.doc[:index] + self.doc[index + 1:]

        # Update all the variables left behind by subtracting the relevant terms...
        cluster = self.cluster[victim.cluster]
        self.clusterUse[victim.cluster] -= 1

        ## First pass through the dp and remove its influence; at the same time note the arrays that need to be updated by each user when looping through...
        dp_ext = []
        for i in xrange(victim.dp.shape[0]):
            beh = victim.dp[i, 0]
            #count = victim.dp[i,2]

            if beh == 0:  # Normal behaviour
                cluInst = victim.dp[i, 1]

                # Update the instance, and topic use counts if necessary...
                topic = cluster[0][cluInst, 0]
                cluster[0][cluInst, 1] -= 1
                if cluster[0][cluInst, 1] == 0:
                    self.topicUse[topic] -= 1

                # Store the entity that needs updating in correspondence with this dp instance in the next step...
                dp_ext.append((self.topicWord, topic))

            else:  # Abnormal behaviour.
                # Store the entity that needs updating in correspondence with the dp...
                dp_ext.append((self.abnormTopicWord, beh))

        ## Go through the samples array and remove their influence - the hard part was done by the preceding step...
        for si in xrange(victim.samples.shape[0]):
            inst = victim.samples[si, 0]
            word = victim.samples[si, 1]
            mat, topic = dp_ext[inst]
            mat[topic, word] -= 1

        # Clean up all zeroed items...
        self.cleanZeros()

    def cleanZeros(self):
        """Goes through and removes anything that has a zero reference count, adjusting all indices accordingly."""

        # Remove the zeros from this object, noting the changes...

        ## Topics...
        newTopicCount = 0
        topicMap = dict()
        for t in xrange(self.topicUse.shape[0]):
            if self.topicUse[t] != 0:
                topicMap[t] = newTopicCount
                newTopicCount += 1

        if newTopicCount != self.topicUse.shape[0]:
            newTopicWord = numpy.zeros(
                (newTopicCount, self.topicWord.shape[1]), dtype=numpy.int32)
            newTopicUse = numpy.zeros(newTopicCount, dtype=numpy.int32)

            for origin, dest in topicMap.iteritems():
                newTopicWord[dest, :] = self.topicWord[origin, :]
                newTopicUse[dest] = self.topicUse[origin]

            self.topicWord = newTopicWord
            self.topicUse = newTopicUse

        ## Clusters...
        newClusterCount = 0
        clusterMap = dict()
        for c in xrange(self.clusterUse.shape[0]):
            if self.clusterUse[c] != 0:
                clusterMap[c] = newClusterCount
                newClusterCount += 1

        if newClusterCount != self.clusterUse.shape[0]:
            newCluster = [None] * newClusterCount
            newClusterUse = numpy.zeros(newClusterCount, dtype=numpy.int32)

            for origin, dest in clusterMap.iteritems():
                newCluster[dest] = self.cluster[origin]
                newClusterUse[dest] = self.clusterUse[origin]

            self.cluster = newCluster
            self.clusterUse = newClusterUse

        ## Cluster instances...
        # (Change is noted by a 2-tuple of (new length, dict), where dict maps old instance indices to new instance indices.)
        cluInstAdj = []
        for ci in xrange(len(self.cluster)):
            newInstCount = 0
            instMap = dict()
            for i in xrange(self.cluster[ci][0].shape[0]):
                if self.cluster[ci][0][i, 1] != 0:
                    instMap[i] = newInstCount
                    newInstCount += 1

            cluInstAdj.append((newInstCount, instMap))

            if newInstCount != self.cluster[ci][0].shape[0]:
                newInst = numpy.zeros((newInstCount, 2), dtype=numpy.int32)

                for origin, dest in instMap.iteritems():
                    newInst[dest, :] = self.cluster[ci][0][origin, :]

                self.cluster[ci] = (newInst, self.cluster[ci][1],
                                    self.cluster[ci][2], self.cluster[ci][3])

        # Iterate and update the topic indices of the cluster instances...
        for ci in xrange(len(self.cluster)):
            for i in xrange(self.cluster[ci][0].shape[0]):
                self.cluster[ci][0][i, 0] = topicMap[self.cluster[ci][0][i, 0]]

        # Now iterate the documents and update their cluster and cluster instance indices...
        for doc in self.doc:
            doc.cluster = clusterMap[doc.cluster]
            _, instMap = cluInstAdj[doc.cluster]

            for di in xrange(doc.dp.shape[0]):
                if doc.dp[di, 0] == 0:
                    doc.dp[di, 1] = instMap[doc.dp[di, 1]]

    def nllAllDocs(self):
        """Returns the negative log likelihood of all the documents in the sample - a reasonable value to compare various samples with."""
        return sum(map(lambda d: d.getNLL(), self.doc))

    def logNegProbWordsGivenClusterAbnorm(self,
                                          doc,
                                          cluster,
                                          particles=16,
                                          cap=-1):
        """Uses wallach's 'left to right' method to calculate the negative log probability of the words in the document given the rest of the model. Both the cluster (provided as an index) and the documents abnormalities vector are fixed for this calculation. Returns the average of the results for each sample contained within model. particles is the number of particles to use in the left to right estimation algorithm. This is implimented using scipy.weave."""
        return solvers.leftRightNegLogProbWord(self, doc, cluster, particles,
                                               cap)

    def logNegProbWordsGivenAbnorm(self, doc, particles=16, cap=-1):
        """Uses logNegProbWordsGivenClusterAbnorm and simply sums out the cluster variable."""

        # Get the negative log probability of the words conditioned on each existing cluster...
        cluScores = map(
            lambda c: solvers.leftRightNegLogProbWord(self, doc, c, particles,
                                                      cap),
            xrange(self.getClusterCount()))

        # Multiply each by the probability of the cluster, so it can be summed out...
        cluNorm = float(self.clusterUse.sum()) + self.clusterConc
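        # Chinese restaurant process weights: P(existing cluster c) = clusterUse[c]/cluNorm, P(new cluster) = clusterConc/cluNorm.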
        cluScores = map(
            lambda c, s: s - math.log(float(self.clusterUse[c]) / cluNorm),
            xrange(len(cluScores)), cluScores)

        # Also need to include the probability of a new cluster, even though it is likely to be a negligible contribution...
        newVal = solvers.leftRightNegLogProbWord(self, doc, -1, particles, cap)
        newVal -= math.log(self.clusterConc / cluNorm)
        cluScores.append(newVal)

        # Sum out the cluster variable, in a numerically stable way given that we are dealing with negative log likelihood values that will map to extremely low probabilities...
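        # Using the log-sum-exp identity with m = min(s_c):
        #   -log(sum_c exp(-s_c)) = m - log(sum_c exp(m - s_c))
        # so the largest exponent is zero and nothing underflows.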
        minScore = min(cluScores)
        cluPropProb = map(lambda s: math.exp(minScore - s), cluScores)
        return minScore - math.log(sum(cluPropProb))
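
A minimal usage sketch follows, assuming Python 2 (consistent with the xrange/iteritems calls above) and that a suitable `state` object plus the `numpy`, `math`, `solvers`, `DocSample` and `FlagIndexArray` dependencies are available from the surrounding library; how `state` is constructed is not shown here, so the snippet is illustrative only.

# Hypothetical usage - 'state' is assumed to come from elsewhere in the library.
sample = Sample(state)                      # Draw a full sample from the current state.

topics = sample.getTopicMultinomials()      # Indexed [topic, word], giving P(word|topic).
print topics.sum(axis=1)                    # Each row should sum to (approximately) 1.0.

print sample.getTopicCount(), sample.getClusterCount(), sample.docCount()
print sample.nllAllDocs()                   # Negative log likelihood of all documents.

prior = Sample(sample, priorsOnly=True)     # Clone only the priors, as an empty model.
sample.delDoc(sample.getDoc(0).getIdent())  # Remove a document and all of its counts.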