class State:
  """State object, as manipulated by a Gibbs sampler to get samples of the unknown parameters of the model."""
  def __init__(self, obj, params=None):
    """Constructs a State object given either another State object (which it clones), or a Corpus plus an optional Params object (the default Params is used if it is omitted). Construction from a single Document is also supported; this uses lots of defaults but is essentially identical to a Corpus containing a single Document - a shortcut used when fitting a Document to an already learnt model."""
    if isinstance(obj, State):
      # Cloning time...
      self.dnrDocInsts = obj.dnrDocInsts
      self.dnrCluInsts = obj.dnrCluInsts
      self.seperateClusterConc = obj.seperateClusterConc
      self.seperateDocumentConc = obj.seperateDocumentConc
      self.oneCluster = obj.oneCluster
      self.calcBeta = obj.calcBeta
      self.calcCluBmn = obj.calcCluBmn
      self.calcPhi = obj.calcPhi
      self.resampleConcs = obj.resampleConcs
      self.behSamples = obj.behSamples
      
      self.alpha = PriorConcDP(obj.alpha)
      self.beta = obj.beta.copy()
      self.gamma = PriorConcDP(obj.gamma)
      self.rho = PriorConcDP(obj.rho)
      self.mu = PriorConcDP(obj.mu)
      self.phi = obj.phi.copy()

      self.topicWord = obj.topicWord.copy()
      self.topicUse = obj.topicUse.copy()
      self.topicConc = obj.topicConc

      self.abnormTopicWord = obj.abnormTopicWord.copy()

      # Clone every part of each (inst, conc, bmn, bmnPrior) cluster tuple - the original only copied the first three, silently dropping the bmnPrior array.
      self.cluster = [(inst.copy(), conc, bmn.copy(), bmnPrior.copy()) for inst, conc, bmn, bmnPrior in obj.cluster]
      self.clusterUse = obj.clusterUse.copy()
      self.clusterConc = obj.clusterConc

      self.doc = [DocState(d) for d in obj.doc]
      self.abnorms = dict(obj.abnorms)

      self.fia = FlagIndexArray(obj.fia)

      self.params = Params(obj.params)
      self.model = Model(obj.model)

    elif isinstance(obj, Document):
      # Construct from a single document...

      self.dnrDocInsts = False
      self.dnrCluInsts = False
      self.seperateClusterConc = False
      self.seperateDocumentConc = False
      self.oneCluster = False
      self.calcBeta = False
      self.calcCluBmn = False
      self.calcPhi = False
      self.resampleConcs = False
      self.behSamples = 1024

      wordCount = obj.getWord(obj.getWordCount()-1)[0] # Vocabulary size implied by the document's last word entry - assumes word identifiers are sorted ascending.

      self.alpha = PriorConcDP()
      self.beta = numpy.ones(wordCount, dtype=numpy.float32)
      self.gamma = PriorConcDP()
      self.rho = PriorConcDP()
      self.mu = PriorConcDP()
      self.phi = numpy.ones(1+len(obj.getAbnorms()), dtype=numpy.float32)
      self.phi[0] *= 10.0
      self.phi /= self.phi.sum()

      self.topicWord = numpy.zeros((0,wordCount), dtype=numpy.int32)
      self.topicUse = numpy.zeros(0,dtype=numpy.int32)
      self.topicConc = self.gamma.conc

      self.abnormTopicWord = numpy.zeros((1+len(obj.getAbnorms()), wordCount), dtype=numpy.int32)

      self.cluster = []
      self.clusterUse = numpy.zeros(0,dtype=numpy.int32)
      self.clusterConc = self.mu.conc

      # Map each abnormality to a behaviour index - 0 is reserved for normal behaviour.
      self.abnorms = dict()
      for num, abnorm in enumerate(obj.getAbnorms()):
        self.abnorms[abnorm] = num+1

      self.doc = [DocState(obj, self.alpha, self.abnorms)]

      self.fia = FlagIndexArray(len(self.abnorms)+1)
      self.fia.addSingles()

      for doc in self.doc:
        doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

      self.params = params if params is not None else Params()

      self.model = Model()
    else:
      # Construct from a corpus, as that is the only remaining option...

      # Behaviour flags...
      self.dnrDocInsts = obj.getDocInstsDNR()
      self.dnrCluInsts = obj.getCluInstsDNR()
      self.seperateClusterConc = obj.getSeperateClusterConc()
      self.seperateDocumentConc = obj.getSeperateDocumentConc()
      self.oneCluster = obj.getOneCluster()
      self.calcBeta = obj.getCalcBeta()
      self.calcCluBmn = obj.getCalcClusterBMN()
      self.calcPhi = obj.getCalcPhi()
      self.resampleConcs = obj.getResampleConcs()
      self.behSamples = obj.getBehSamples()

      # Concentration parameters - these are all constant...
      self.alpha = PriorConcDP(obj.getAlpha())
      self.beta = numpy.ones(obj.getWordCount(),dtype=numpy.float32)
      self.beta *= obj.getBeta()
      self.gamma = PriorConcDP(obj.getGamma())
      self.rho = PriorConcDP(obj.getRho())
      self.mu = PriorConcDP(obj.getMu())

      self.phi = numpy.ones(1+len(obj.getAbnormDict()), dtype=numpy.float32)
      self.phi[0] *= obj.getPhiRatio()
      self.phi *= obj.getPhiConc()*self.phi.shape[0] / self.phi.sum()

      # The topics in the model consist of three parts:
      # - topicWord, indexed by [topic, word], gives how many times each word has been drawn from the given topic; alongside beta this allows the relevant Dirichlet posterior to be determined.
      # - topicUse counts how many times each topic has been instanced in a cluster.
      # - topicConc is the sampled concentration; alongside topicUse it defines the DP from which topics are drawn for inclusion in clusters...
      self.topicWord = numpy.zeros((0,obj.getWordCount()),dtype=numpy.int32)
      self.topicUse = numpy.zeros(0,dtype=numpy.int32)
      self.topicConc = self.gamma.conc

      # A second topicWord-style matrix, indexed by behaviour and containing the abnormal topics. Entry 0, which is normal, is again an empty dummy...
      self.abnormTopicWord = numpy.zeros((1+len(obj.getAbnormDict()), obj.getWordCount()), dtype=numpy.int32)

      # Defines the clusters, as a list of (inst, conc, bmn, bmnPrior) tuples:
      # - inst is a 2D array containing all the topic instances that make up the cluster; the first dimension indexes each instance, whilst the second has exactly two entries - the index number of the topic and the number of document instances using it.
      # - conc is the sampled concentration that completes the definition of the DP defined for each cluster.
      # - bmn is the multinomial on behaviours associated with the cluster - a 1D array of floats.
      # - bmnPrior is the flagSet-aligned integer array that is the prior on bmn.
      # Additionally we have the DDP from which the specific clusters are drawn - this is defined by clusterUse and clusterConc, just as for the topics...
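      # For instance, inst = numpy.array([[3, 2], [7, 1]]) would describe a cluster instancing topics 3 and 7, used by 2 and 1 document instances respectively (illustrative values only).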
      self.cluster = []
      self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
      self.clusterConc = self.mu.conc

      # List of document objects, to contain the documents - whilst declared immediately below as an empty list, we then proceed to fill it in with the information from the given Corpus...
      self.doc = []

      for doc in obj.documentList():
        self.doc.append(DocState(doc, self.alpha, obj.getAbnormDict()))

      # The abnormality dictionary - we need a copy so we can convert from flags back to the user-provided codes after fitting the model...
      self.abnorms = dict(obj.getAbnormDict())

      # The flag index array - converts each flag combination to an index - required for learning the per-cluster behaviour multinomials...
      self.fia = FlagIndexArray(len(self.abnorms)+1)
      self.fia.addSingles()

      for doc in self.doc:
        doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

      # Store the parameters...
      self.params = params if params is not None else Params()

      # Create a model object, for storing samples into...
      self.model = Model()


  def setGlobalParams(self, sample):
    """Sets a number of parameters for the State after initialisation, taking them from the given Sample object. Designed for use with the addPrior method this allows you to extract all relevant parameters from a Sample. Must be called before any Gibbs sampling takes place."""
    self.alpha = PriorConcDP(sample.alpha)
    self.beta = sample.beta.copy()
    self.gamma = PriorConcDP(sample.gamma)
    self.rho = PriorConcDP(sample.rho)
    self.mu = PriorConcDP(sample.mu)

    # There is no correct way of combining these - the below seems reasonable enough however, and is exact if they have the same entries...
    for key,fromIndex in sample.abnorms.iteritems():
      if key in self.abnorms:
        toIndex = self.abnorms[key]
        self.phi[toIndex] = sample.phi[fromIndex]
    self.phi /= self.phi.sum()

    self.topicConc = sample.topicConc
    self.clusterConc = sample.clusterConc
    for doc in self.doc:
      doc.conc = self.alpha.conc
  
  def addPrior(self, sample):
    """Given a Sample object this uses it as a prior - this is primarilly used to sample a single or small number of documents using a model already trainned on another set of documents. It basically works by adding the topics, clusters and behaviours from the sample into this corpus, with the counts all intact so they have the relevant weight and can't be deleted. Note that you could in principle add multiple priors, though that would be quite a strange scenario. If only called once then the topic indices will line up. Note that all the prior parameters are not transfered, though often you would want to - setGlobalParams is provided to do this. Must be called before any Gibbs sampling takes place."""

    # Below code has evolved into spaghetti, via several other tasty culinary dishes, and needs a rewrite. Or to never be looked at or edited ever again. ###################
    
    # Do the topics...
    offset = self.topicWord.shape[0]
    if offset != 0:
      self.topicWord = numpy.vstack((self.topicWord, sample.topicWord))
    else:
      self.topicWord = sample.topicWord.copy()
    self.topicUse = numpy.hstack((self.topicUse, sample.topicUse))

    # Calculate the new abnormalities dictionary...
    newAbnorms = dict(sample.abnorms)
    for key in self.abnorms:
      if key not in newAbnorms:
        newAbnorms[key] = len(newAbnorms)+1
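    # e.g. self.abnorms = {'a':1, 'c':2} and sample.abnorms = {'a':1, 'b':2} gives newAbnorms = {'a':1, 'b':2, 'c':3} - the sample keeps its indices and entries unique to this State are appended (illustrative values only).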

    # Transfer over the abnormal word counts...
    newAbnormTopicWord = numpy.zeros((1+len(newAbnorms), max((self.abnormTopicWord.shape[1], sample.abnormTopicWord.shape[1]))), dtype=numpy.int32)

    for abnorm,origin in self.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      limit = self.abnormTopicWord.shape[1]
      newAbnormTopicWord[dest,:limit] += self.abnormTopicWord[origin,:limit]

    for abnorm,origin in sample.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      limit = sample.abnormTopicWord.shape[1]
      newAbnormTopicWord[dest,:limit] += sample.abnormTopicWord[origin,:limit]

    # Update the document flags/counts for behaviours...
    for doc in self.doc:
      newFlags = numpy.zeros(1+len(newAbnorms), dtype=numpy.uint8)
      newCounts = numpy.zeros(1+len(newAbnorms), dtype=numpy.int32)
      newFlags[0] = doc.behFlags[0]
      newCounts[0] = doc.behCounts[0]

      for abnorm,origin in self.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        newFlags[dest] = doc.behFlags[origin]
        newCounts[dest] = doc.behCounts[origin]
      
      doc.behFlags = newFlags
      doc.behCounts = newCounts

    # Update the old clusters behaviour arrays...
    def mapOldCluster(c):
      c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
      c2 /= c2.sum()
      
      c2[0] *= c[2][0]
      for abnorm,origin in self.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        c2[dest] *= c[2][origin]
      c2 /= c2.sum()
      
      return (c[0],c[1],c2,c[3])
      
    self.cluster = map(mapOldCluster, self.cluster)
    origCluCount = len(self.cluster)
    
    # Add the new clusters, updating their behaviour arrays and topic indices, plus getting their priors updated with their associated documents...
    def mapCluster(pair):
      ci, c = pair
      
      c0 = c[0].copy()
      c0[:,0] += offset

      c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
      c2 /= c2.sum()

      c2[0] *= c[2][0]
      for abnorm,origin in sample.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        c2[dest] *= c[2][origin]
      c2 /= c2.sum()

      c3 = c[3].copy()
      for doc in filter(lambda doc: doc.cluster==ci, sample.doc):
        fi = sample.fia.flagIndex(doc.behFlags, False)
        if fi>=len(doc.behFlags): # Only bother if the document has abnormalities, of which this is a valid test.
          total = 0
          for i in xrange(doc.dp.shape[0]):
            c3[doc.dp[i,0]] += doc.dp[i,2]
            total += doc.dp[i,2]
          c3[fi] -= total + 1
      
      return (c0,c[1],c2,c3)
      
    self.cluster += map(mapCluster, enumerate(sample.cluster))
    self.clusterUse = numpy.hstack((self.clusterUse, sample.clusterUse))
    
    # Update phi...
    newPhi = numpy.ones(len(newAbnorms)+1,dtype=numpy.float32)
    newPhi[0] = 0.5*(self.phi[0]+sample.phi[0])
    
    for abnorm,origin in self.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      newPhi[dest] = self.phi[origin]
    for abnorm,origin in sample.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      if abnorm not in self.abnorms:
        newPhi[dest] = sample.phi[origin]
      else:
        newPhi[dest] = 0.5*(newPhi[dest] + sample.phi[origin])
      
    self.phi = newPhi
    self.phi /= self.phi.sum()

    # Recreate the flag index array...
    remapOrig = dict() # Old flag positions to new flag positions.
    remapOrig[0] = 0
    for abnorm,origin in self.abnorms.iteritems():
      remapOrig[origin] = newAbnorms[abnorm]

    remapSam = dict() # sample flag positions to new flag positions.
    remapSam[0] = 0
    for abnorm,origin in sample.abnorms.iteritems():
      remapSam[origin] = newAbnorms[abnorm]
    
    newFia = FlagIndexArray(len(newAbnorms)+1)
    newFia.addSingles()
    behIndAdjOrig = newFia.addFlagIndexArray(self.fia,remapOrig)
    behIndAdjSam  = newFia.addFlagIndexArray(sample.fia,remapSam)

    for doc in self.doc:
      doc.behFlagsIndex = behIndAdjOrig[doc.behFlagsIndex]

    # Update cluster priors on bmn arrays...
    for c in xrange(len(self.cluster)):
      clu = self.cluster[c]
      newBmn = numpy.zeros(newFia.flagCount(),dtype=numpy.int32)
      oldBmn = clu[3].copy()

      # Translate from the old set...
      for b in xrange(oldBmn.shape[0]):
        index = behIndAdjOrig[b] if c<origCluCount else behIndAdjSam[b]
        newBmn[index] += oldBmn[b]

      self.cluster[c] = (clu[0], clu[1], clu[2], newBmn)

    # Replace the old abnormality and fia stuff...
    self.abnormTopicWord = newAbnormTopicWord
    self.abnorms = newAbnorms
    self.fia = newFia


  def sample(self):
    """Samples the current state, storing the current estimate of the model parameters."""
    self.model.sampleState(self)

  def absorbClone(self,clone):
    """Given a clone absorb all its samples - used for multiprocessing."""
    self.model.absorbModel(clone.model)


  def getParams(self):
    """Returns the parameters object."""
    return self.params
    
  def getModel(self):
    """Returns the model constructed from all the calls to sample()."""
    return self.model
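

# A minimal usage sketch (illustrative only - Corpus, Document, Params, Sample
# and the Gibbs sampling driver are assumed to come from the surrounding
# module, and gibbsPass below is a hypothetical stand-in for that driver):
#
#   state = State(corpus, Params())  # Construct from a Corpus.
#   for _ in xrange(iterations):
#     gibbsPass(state)               # Resample the unknowns of the state.
#     state.sample()                 # Record the current parameter estimate.
#   model = state.getModel()
#
# To fit a single new Document against an already learnt model, construct from
# the Document and pull in a Sample as a prior before any Gibbs sampling:
#
#   state = State(doc)               # doc is a Document.
#   state.addPrior(sample)           # sample is a Sample from the learnt model.
#   state.setGlobalParams(sample)    # Optionally copy its global parameters too.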