Example #1
0
  def addPrior(self, sample):
    """Use a trained Sample object as a prior for this corpus.

    Primarily intended for sampling a single document (or a small number of documents) against a model already trained on another set of documents. It works by adding the topics, clusters and behaviours from the sample into this corpus with their counts intact, so they carry the relevant weight and cannot be deleted. Multiple priors could in principle be added, though that would be an unusual scenario; if called only once the topic indices will line up. Note that the prior's parameters are not transferred, though often you would want to - setGlobalParams is provided for that. Must be called before any Gibbs sampling takes place."""

    # Below code has evolved into spagetti, via several other tasty culinary dishes, and needs a rewrite. Or to never be looked at or edited ever again. ###################
    
    # Do the topics...
    # The sample's topic rows are appended after this corpus's existing rows,
    # so every topic index originating from the sample must later be shifted
    # by this offset (done in mapCluster below).
    offset = self.topicWord.shape[0]
    if self.topicWord.shape[0]!=0:
      self.topicWord = numpy.vstack((self.topicWord,sample.topicWord))
    else:
      # No topics of our own yet - take the sample's counts wholesale.
      self.topicWord = sample.topicWord.copy()
    self.topicUse = numpy.hstack((self.topicUse,sample.topicUse))

    # Calculate the new abnormalities dictionary...
    # Start from the sample's mapping so its behaviour indices remain valid,
    # then append any of our own abnormalities it does not know about. The +1
    # is because behaviour index 0 is reserved for the normal behaviour - all
    # the behaviour arrays below are sized 1+len(newAbnorms).
    newAbnorms = dict(sample.abnorms)
    for key,_ in self.abnorms.iteritems():
      if key not in newAbnorms:
        val = len(newAbnorms)+1
        newAbnorms[key] = val

    # Transfer over the abnormal word counts...
    # The two sides may have seen different numbers of words - size the merged
    # count matrix to the wider of the two and sum the overlapping columns.
    newAbnormTopicWord = numpy.zeros((1+len(newAbnorms), max((self.abnormTopicWord.shape[1], sample.abnormTopicWord.shape[1]))), dtype=numpy.int32)

    for abnorm,origin in self.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      limit = self.abnormTopicWord.shape[1]
      newAbnormTopicWord[dest,:limit] += self.abnormTopicWord[origin,:limit]

    for abnorm,origin in sample.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      limit = sample.abnormTopicWord.shape[1]
      newAbnormTopicWord[dest,:limit] += sample.abnormTopicWord[origin,:limit]

    # Update the document flags/counts for behaviours...
    # Re-index each document's per-behaviour flag/count arrays into the merged
    # behaviour numbering; entry 0 (normal behaviour) is copied through as-is.
    # Behaviours only the sample knows about stay zero for our documents.
    for doc in self.doc:
      newFlags = numpy.zeros(1+len(newAbnorms), dtype=numpy.uint8)
      newCounts = numpy.zeros(1+len(newAbnorms), dtype=numpy.int32)
      newFlags[0] = doc.behFlags[0]
      newCounts[0] = doc.behCounts[0]

      for abnorm,origin in self.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        newFlags[dest] = doc.behFlags[origin]
        newCounts[dest] = doc.behCounts[origin]
      
      doc.behFlags = newFlags
      doc.behCounts = newCounts

    # Update the old clusters behaviour arrays...
    # Rebuild each existing cluster's behaviour multinomial (c[2]) at the new
    # size: start uniform, scale the entries we have old probabilities for,
    # and renormalise - behaviours only the sample knows keep the uniform
    # mass. Only c[2] changes here; c[3] (per-flag counts) is remapped later.
    def mapOldCluster(c):
      c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
      c2 /= c2.sum()
      
      c2[0] *= c[2][0]
      for abnorm,origin in self.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        c2[dest] *= c[2][origin]
      c2 /= c2.sum()
      
      return (c[0],c[1],c2,c[3])
      
    self.cluster = map(mapOldCluster ,self.cluster)
    origCluCount = len(self.cluster)
    
    # Add the new clusters, updating their behaviour arrays and topic indices, plus getting their priors updated with their associated documents...
    def mapCluster(pair):
      ci, c = pair
      
      # Shift the sample cluster's topic references (column 0 of c[0]) past
      # the topics this corpus already had, matching the vstack above.
      c0 = c[0].copy()
      c0[:,0] += offset

      # Same behaviour-multinomial rebuild as mapOldCluster, but driven by
      # the sample's behaviour numbering.
      c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
      c2 /= c2.sum()

      c2[0] *= c[2][0]
      for abnorm,origin in sample.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        c2[dest] *= c[2][origin]
      c2 /= c2.sum()

      # Fold the sample documents assigned to this cluster into its per-flag
      # prior counts (c[3]).
      c3 = c[3].copy()
      for doc in filter(lambda doc: doc.cluster==ci, sample.doc):
        fi = sample.fia.flagIndex(doc.behFlags, False)
        if fi>=len(doc.behFlags): # Only bother if the document has abnormalities, of which this is a valid test.
          total = 0
          for i in xrange(doc.dp.shape[0]):
            c3[doc.dp[i,0]] += doc.dp[i,2]
            total += doc.dp[i,2]
          # NOTE(review): presumably reverses an increment made against the
          # document's flag-combination entry during sampling - confirm
          # against the Gibbs sampling code before changing this.
          c3[fi] -= total + 1
      
      return (c0,c[1],c2,c3)
      
    self.cluster += map(mapCluster, enumerate(sample.cluster))
    self.clusterUse = numpy.hstack((self.clusterUse, sample.clusterUse))
    
    # Update phi...
    # Behaviour weight vector: entry 0 (normal) is averaged between the two
    # sides; behaviours known to both are averaged, those known to one side
    # only are copied; the vector is then renormalised to a distribution.
    newPhi = numpy.ones(len(newAbnorms)+1,dtype=numpy.float32)
    newPhi[0] = 0.5*(self.phi[0]+sample.phi[0])
    
    for abnorm,origin in self.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      newPhi[dest] = self.phi[origin]
    for abnorm,origin in sample.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      if abnorm not in self.abnorms:
        newPhi[dest] = sample.phi[origin]
      else:
        newPhi[dest] = 0.5*(newPhi[dest] + sample.phi[origin])
      
    self.phi = newPhi
    self.phi /= self.phi.sum()

    # Recreate the flag index array...
    # Build remaps from each side's old behaviour positions into the merged
    # numbering, then rebuild the FlagIndexArray. addFlagIndexArray returns a
    # lookup from each old flag-combination index to its new index.
    remapOrig = dict() # Old flag positions to new flag positions.
    remapOrig[0] = 0
    for abnorm,origin in self.abnorms.iteritems():
      remapOrig[origin] = newAbnorms[abnorm]

    remapSam = dict() # sample flag positions to new flag positions.
    remapSam[0] = 0
    for abnorm,origin in sample.abnorms.iteritems():
      remapSam[origin] = newAbnorms[abnorm]
    
    newFia = FlagIndexArray(len(newAbnorms)+1)
    newFia.addSingles()
    behIndAdjOrig = newFia.addFlagIndexArray(self.fia,remapOrig)
    behIndAdjSam  = newFia.addFlagIndexArray(sample.fia,remapSam)

    for doc in self.doc:
      doc.behFlagsIndex = behIndAdjOrig[doc.behFlagsIndex]

    # Update cluster priors on bmn arrays...
    # Remap each cluster's per-flag-combination counts (c[3]) into the new
    # fia indexing. Clusters before origCluCount came from this corpus, the
    # rest from the sample - hence the choice of adjustment table.
    for c in xrange(len(self.cluster)):
      clu = self.cluster[c]
      newBmn = numpy.zeros(newFia.flagCount(),dtype=numpy.int32)
      oldBmn = clu[3].copy()

      # Translate from the old flag indexing to the new one...
      for b in xrange(oldBmn.shape[0]):
        index = behIndAdjOrig[b] if c<origCluCount else behIndAdjSam[b]
        newBmn[index] += oldBmn[b]

      self.cluster[c] = (clu[0], clu[1], clu[2], newBmn)

    # Replace the old abnormality and fia stuff...
    self.abnormTopicWord = newAbnormTopicWord
    self.abnorms = newAbnorms
    self.fia = newFia
Example #2
0
    def addPrior(self, sample):
        """Use a trained Sample object as a prior for this corpus.

        Primarily intended for sampling a single document (or a small number of documents) against a model already trained on another set of documents. It works by adding the topics, clusters and behaviours from the sample into this corpus with their counts intact, so they carry the relevant weight and cannot be deleted. Multiple priors could in principle be added, though that would be an unusual scenario; if called only once the topic indices will line up. Note that the prior's parameters are not transferred, though often you would want to - setGlobalParams is provided for that. Must be called before any Gibbs sampling takes place."""

        # Below code has evolved into spagetti, via several other tasty culinary dishes, and needs a rewrite. Or to never be looked at or edited ever again. ###################

        # Do the topics...
        # The sample's topic rows are appended after this corpus's existing
        # rows, so every topic index originating from the sample must later
        # be shifted by this offset (done in mapCluster below).
        offset = self.topicWord.shape[0]
        if self.topicWord.shape[0] != 0:
            self.topicWord = numpy.vstack((self.topicWord, sample.topicWord))
        else:
            # No topics of our own yet - take the sample's counts wholesale.
            self.topicWord = sample.topicWord.copy()
        self.topicUse = numpy.hstack((self.topicUse, sample.topicUse))

        # Calculate the new abnormalities dictionary...
        # Start from the sample's mapping so its behaviour indices remain
        # valid, then append any of our own abnormalities it does not know
        # about. The +1 is because behaviour index 0 is reserved for the
        # normal behaviour - all behaviour arrays below are 1+len(newAbnorms).
        newAbnorms = dict(sample.abnorms)
        for key, _ in self.abnorms.iteritems():
            if key not in newAbnorms:
                val = len(newAbnorms) + 1
                newAbnorms[key] = val

        # Transfer over the abnormal word counts...
        # The two sides may have seen different numbers of words - size the
        # merged count matrix to the wider of the two and sum the overlapping
        # columns.
        newAbnormTopicWord = numpy.zeros(
            (1 + len(newAbnorms),
             max((self.abnormTopicWord.shape[1],
                  sample.abnormTopicWord.shape[1]))),
            dtype=numpy.int32)

        for abnorm, origin in self.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            limit = self.abnormTopicWord.shape[1]
            newAbnormTopicWord[dest, :limit] += self.abnormTopicWord[
                origin, :limit]

        for abnorm, origin in sample.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            limit = sample.abnormTopicWord.shape[1]
            newAbnormTopicWord[dest, :limit] += sample.abnormTopicWord[
                origin, :limit]

        # Update the document flags/counts for behaviours...
        # Re-index each document's per-behaviour flag/count arrays into the
        # merged behaviour numbering; entry 0 (normal behaviour) is copied
        # through as-is. Behaviours only the sample knows stay zero here.
        for doc in self.doc:
            newFlags = numpy.zeros(1 + len(newAbnorms), dtype=numpy.uint8)
            newCounts = numpy.zeros(1 + len(newAbnorms), dtype=numpy.int32)
            newFlags[0] = doc.behFlags[0]
            newCounts[0] = doc.behCounts[0]

            for abnorm, origin in self.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                newFlags[dest] = doc.behFlags[origin]
                newCounts[dest] = doc.behCounts[origin]

            doc.behFlags = newFlags
            doc.behCounts = newCounts

        # Update the old clusters behaviour arrays...
        # Rebuild each existing cluster's behaviour multinomial (c[2]) at the
        # new size: start uniform, scale the entries we have old
        # probabilities for, and renormalise - behaviours only the sample
        # knows keep the uniform mass. Only c[2] changes here; c[3] (per-flag
        # counts) is remapped later.
        def mapOldCluster(c):
            c2 = numpy.ones(1 + len(newAbnorms), dtype=numpy.float32)
            c2 /= c2.sum()

            c2[0] *= c[2][0]
            for abnorm, origin in self.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                c2[dest] *= c[2][origin]
            c2 /= c2.sum()

            return (c[0], c[1], c2, c[3])

        self.cluster = map(mapOldCluster, self.cluster)
        origCluCount = len(self.cluster)

        # Add the new clusters, updating their behaviour arrays and topic indices, plus getting their priors updated with their associated documents...
        def mapCluster(pair):
            ci, c = pair

            # Shift the sample cluster's topic references (column 0 of c[0])
            # past the topics this corpus already had, matching the vstack
            # above.
            c0 = c[0].copy()
            c0[:, 0] += offset

            # Same behaviour-multinomial rebuild as mapOldCluster, but driven
            # by the sample's behaviour numbering.
            c2 = numpy.ones(1 + len(newAbnorms), dtype=numpy.float32)
            c2 /= c2.sum()

            c2[0] *= c[2][0]
            for abnorm, origin in sample.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                c2[dest] *= c[2][origin]
            c2 /= c2.sum()

            # Fold the sample documents assigned to this cluster into its
            # per-flag prior counts (c[3]).
            c3 = c[3].copy()
            for doc in filter(lambda doc: doc.cluster == ci, sample.doc):
                fi = sample.fia.flagIndex(doc.behFlags, False)
                if fi >= len(
                        doc.behFlags
                ):  # Only bother if the document has abnormalities, of which this is a valid test.
                    total = 0
                    for i in xrange(doc.dp.shape[0]):
                        c3[doc.dp[i, 0]] += doc.dp[i, 2]
                        total += doc.dp[i, 2]
                    # NOTE(review): presumably reverses an increment made
                    # against the document's flag-combination entry during
                    # sampling - confirm against the Gibbs sampling code
                    # before changing this.
                    c3[fi] -= total + 1

            return (c0, c[1], c2, c3)

        self.cluster += map(mapCluster, enumerate(sample.cluster))
        self.clusterUse = numpy.hstack((self.clusterUse, sample.clusterUse))

        # Update phi...
        # Behaviour weight vector: entry 0 (normal) is averaged between the
        # two sides; behaviours known to both are averaged, those known to
        # one side only are copied; the vector is then renormalised to a
        # distribution.
        newPhi = numpy.ones(len(newAbnorms) + 1, dtype=numpy.float32)
        newPhi[0] = 0.5 * (self.phi[0] + sample.phi[0])

        for abnorm, origin in self.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            newPhi[dest] = self.phi[origin]
        for abnorm, origin in sample.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            if abnorm not in self.abnorms:
                newPhi[dest] = sample.phi[origin]
            else:
                newPhi[dest] = 0.5 * (newPhi[dest] + sample.phi[origin])

        self.phi = newPhi
        self.phi /= self.phi.sum()

        # Recreate the flag index array...
        # Build remaps from each side's old behaviour positions into the
        # merged numbering, then rebuild the FlagIndexArray.
        # addFlagIndexArray returns a lookup from each old flag-combination
        # index to its new index.
        remapOrig = dict()  # Old flag positions to new flag positions.
        remapOrig[0] = 0
        for abnorm, origin in self.abnorms.iteritems():
            remapOrig[origin] = newAbnorms[abnorm]

        remapSam = dict()  # sample flag positions to new flag positions.
        remapSam[0] = 0
        for abnorm, origin in sample.abnorms.iteritems():
            remapSam[origin] = newAbnorms[abnorm]

        newFia = FlagIndexArray(len(newAbnorms) + 1)
        newFia.addSingles()
        behIndAdjOrig = newFia.addFlagIndexArray(self.fia, remapOrig)
        behIndAdjSam = newFia.addFlagIndexArray(sample.fia, remapSam)

        for doc in self.doc:
            doc.behFlagsIndex = behIndAdjOrig[doc.behFlagsIndex]

        # Update cluster priors on bmn arrays...
        # Remap each cluster's per-flag-combination counts (c[3]) into the
        # new fia indexing. Clusters before origCluCount came from this
        # corpus, the rest from the sample - hence the choice of table.
        for c in xrange(len(self.cluster)):
            clu = self.cluster[c]
            newBmn = numpy.zeros(newFia.flagCount(), dtype=numpy.int32)
            oldBmn = clu[3].copy()

            # Translate from the old flag indexing to the new one...
            for b in xrange(oldBmn.shape[0]):
                index = behIndAdjOrig[b] if c < origCluCount else behIndAdjSam[
                    b]
                newBmn[index] += oldBmn[b]

            self.cluster[c] = (clu[0], clu[1], clu[2], newBmn)

        # Replace the old abnormality and fia stuff...
        self.abnormTopicWord = newAbnormTopicWord
        self.abnorms = newAbnorms
        self.fia = newFia