示例#1
0
 def _logScore(self, input, word, totalMass):
     """Render a '{...}' breakdown of counts and discounts for every
     feature-id combination that has a positive count for *word*,
     prefixed by the word's probability."""
     entries = []
     idRanges = [range(f.first(input), -1, -1) for f in self.features]
     for ids in itertools.product(*idRanges):
         feats = [f[fid] for f, fid in zip(self.features, ids)]
         ctxs = tuple(f.contextFunc(input) for f in feats)
         names = ' '.join(f.name for f in feats)
         count = self.counts[ids][(ctxs, word)]
         if count <= 0:
             continue
         marker = '*' if ids in self.startPoints else ' '
         backoffBits = []
         for axis in range(len(self.features)):
             bCount = self.backoffCounts[axis][ids][(ctxs, word)]
             bDisc = self.discount(ids, axis, bCount, names)
             backoffBits.append('({},{})'.format(bCount, numStr(bDisc)))
         disc = self.discount(ids, None, count, names)
         entries.append("{}{}:'{}' ({},{}) {}".format(
             marker, names, _contextsToStr(ctxs), count, numStr(disc),
             ' '.join(backoffBits)))
     output = '{' + ',\n'.join(entries) + '}'
     return '{} {}'.format(numStr(self.probability(input, word)), output)
示例#2
0
 def _logContext(self, input):
     """Render a '{...}' summary of context counts and pool masses for
     every feature-id combination that is active (the very first
     combination is always emitted)."""
     entries = []
     idRanges = [range(f.first(input), -1, -1) for f in self.features]
     emitted = False
     for ids in itertools.product(*idRanges):
         feats = [f[fid] for f, fid in zip(self.features, ids)]
         ctxs = tuple(f.contextFunc(input) for f in feats)
         # Skip zero-count cells, except always keep the first cell.
         if emitted and self.contextCounts[ids][ctxs] <= 0:
             continue
         emitted = True
         names = ' '.join(f.name for f in feats)
         marker = '*' if ids in self.startPoints else ' '
         backoffBits = ' '.join(
             '({},{})'.format(self.backoffContextCounts[axis][ids][ctxs],
                              numStr(self.backoffPoolCache[axis][(ids, ctxs)]))
             for axis in range(len(self.features)))
         entries.append("{}{}:'{}' ({},{}) {}".format(
             marker, names, _contextsToStr(ctxs),
             self.contextCounts[ids][ctxs],
             numStr(self.poolCache[(ids, ctxs)]), backoffBits))
     return '{' + ',\n'.join(entries) + '}'
示例#3
0
 def _logScore(self, input, word, totalMass):
    """Render a '{...}' breakdown of counts and discounts for every
    feature-id combination that has a positive count for *word*,
    prefixed by the word's probability."""
    entries = []
    idRanges = [range(f.first(input), -1, -1) for f in self.features]
    for ids in itertools.product(*idRanges):
       feats = [f[fid] for f, fid in zip(self.features, ids)]
       ctxs = tuple(f.contextFunc(input) for f in feats)
       names = ' '.join(f.name for f in feats)
       count = self.counts[ids][(ctxs, word)]
       if count <= 0:
          continue
       marker = '*' if ids in self.startPoints else ' '
       backoffBits = []
       for axis in range(len(self.features)):
          bCount = self.backoffCounts[axis][ids][(ctxs, word)]
          bDisc = self.discount(ids, axis, bCount, names)
          backoffBits.append('({},{})'.format(bCount, numStr(bDisc)))
       disc = self.discount(ids, None, count, names)
       entries.append("{}{}:'{}' ({},{}) {}".format(
             marker, names, _contextsToStr(ctxs), count, numStr(disc),
             ' '.join(backoffBits)))
    output = '{' + ',\n'.join(entries) + '}'
    return '{} {}'.format(numStr(self.probability(input, word)), output)
示例#4
0
    def _prob(self, input, word, featureIds, backoffOrigin, doLogging):
        """Return a (numerator, denominator) pair for the smoothed score of
        *word* at the feature-grid cell *featureIds*.

        backoffOrigin is None at the top level, or the index of the
        feature axis along which the caller backed off (which selects the
        backoff count tables).  When doLogging is set, a textual trace is
        accumulated in self.logStr / self.logSep as a side effect.  If the
        context is unseen at this cell (total == 0), the raw backoff
        totals are passed through unchanged.
        """
        features = [
            feature[featureId]
            for feature, featureId in zip(self.features, featureIds)
        ]

        contexts = tuple(feature.contextFunc(input) for feature in features)
        # FixMe: [correctness] Figure out how to handle nontrivial featureVal
        featureVal = word
        # Select the count table matching where we backed off from.
        contextCounts = self.backoffContextCounts[backoffOrigin][featureIds] \
                        if backoffOrigin is not None \
                        else self.contextCounts[featureIds]
        total = contextCounts[contexts]

        if doLogging:
            # We cache logStr here because it will be overwritten by the
            # recursive call to _prob
            baseStr = self.logStr + self.logSep
            self.logSep = ', '
        # Backoff mass: sum the contribution of the next cell along each
        # feature axis; an exhausted axis contributes the uniform
        # distribution over the vocabulary.
        # NOTE(review): the cache key is featureIds only, not (input,
        # word) -- presumably self.backoffTots is reset per query; verify
        # against the caller.
        if featureIds in self.backoffTots:
            numTot, denomTot = self.backoffTots[featureIds]
        else:
            numTot, denomTot = 0, 0
            for i, featureId in enumerate(featureIds):
                # print i,featureId
                nextFeatureId = self.features[i].succ(featureId)
                if nextFeatureId is None:
                    # Axis exhausted: contribute 1 / |vocabulary|.
                    if not self.uniformCache:
                        self.uniformCache = float(len(self.words))
                    numTot += 1
                    denomTot += self.uniformCache
                else:
                    nextFeatureIds = list(featureIds)
                    nextFeatureIds[i] = nextFeatureId
                    num, denom = self._prob(input, word, tuple(nextFeatureIds),
                                            i, doLogging)
                    numTot += num
                    denomTot += denom
            self.backoffTots[featureIds] = (numTot, denomTot)
        backoffProb = numTot / denomTot

        featureStr = ' '.join(feature.name for feature in features)

        # Unseen context at this cell: defer entirely to the backoff mass.
        if total == 0:
            return numTot, denomTot
        counts = self.backoffCounts[backoffOrigin][featureIds] \
                 if backoffOrigin is not None else self.counts[featureIds]
        count = counts[(contexts, featureVal)]
        discount = self.discount(featureIds, backoffOrigin, count, featureStr)
        pool = self.pool(featureIds, contexts, backoffOrigin)
        # Remove the discount from the raw count and add back the pooled
        # mass weighted by the backoff probability.
        modCount = count - discount + pool * backoffProb
        if doLogging:
            # Note that the self.logStr at the end is the one set by the
            # recursive call to _prob
            self.logStr = baseStr + \
               '{}:{} ({}-{}+{})'.format(featureStr, numStr(modCount / total),
                                         count, numStr(discount),
                                         numStr(pool*backoffProb)) + \
               self.logStr
        return modCount, total
示例#5
0
   def _prob(self, input, word, featureIds, backoffOrigin, doLogging):
      """Return a (numerator, denominator) pair for the smoothed score of
      *word* at the feature-grid cell *featureIds*.

      backoffOrigin is None at the top level, or the index of the feature
      axis along which the caller backed off (which selects the backoff
      count tables).  When doLogging is set, a textual trace is
      accumulated in self.logStr / self.logSep as a side effect.  If the
      context is unseen at this cell (total == 0), the raw backoff totals
      are passed through unchanged.
      """
      features = [feature[featureId] for feature,featureId in
                  zip(self.features, featureIds)]

      contexts = tuple(feature.contextFunc(input) for feature in features)
      # FixMe: [correctness] Figure out how to handle nontrivial featureVal
      featureVal = word
      # Select the count table matching where we backed off from.
      contextCounts = self.backoffContextCounts[backoffOrigin][featureIds] \
                      if backoffOrigin is not None \
                      else self.contextCounts[featureIds]
      total = contextCounts[contexts]

      if doLogging:
         # We cache logStr here because it will be overwritten by the
         # recursive call to _prob
         baseStr = self.logStr + self.logSep
         self.logSep = ', '
      # Backoff mass: sum the contribution of the next cell along each
      # feature axis; an exhausted axis contributes the uniform
      # distribution over the vocabulary.
      # NOTE(review): the cache key is featureIds only, not (input, word)
      # -- presumably self.backoffTots is reset per query; verify against
      # the caller.
      if featureIds in self.backoffTots:
         numTot, denomTot = self.backoffTots[featureIds]
      else:
         numTot, denomTot = 0, 0
         for i,featureId in enumerate(featureIds):
            # print i,featureId
            nextFeatureId = self.features[i].succ(featureId)
            if nextFeatureId is None:
               # Axis exhausted: contribute 1 / |vocabulary|.
               if not self.uniformCache:
                  self.uniformCache = float(len(self.words))
               numTot += 1
               denomTot += self.uniformCache
            else:
               nextFeatureIds = list(featureIds)
               nextFeatureIds[i] = nextFeatureId
               num, denom = self._prob(input, word, tuple(nextFeatureIds),
                                       i, doLogging)
               numTot += num
               denomTot += denom
         self.backoffTots[featureIds] = (numTot, denomTot)
      backoffProb = numTot / denomTot

      featureStr = ' '.join(feature.name for feature in features)

      # Unseen context at this cell: defer entirely to the backoff mass.
      if total == 0:
         return numTot, denomTot
      counts = self.backoffCounts[backoffOrigin][featureIds] \
               if backoffOrigin is not None else self.counts[featureIds]
      count = counts[(contexts, featureVal)]
      discount = self.discount(featureIds, backoffOrigin, count, featureStr)
      pool = self.pool(featureIds, contexts, backoffOrigin)
      # Remove the discount from the raw count and add back the pooled
      # mass weighted by the backoff probability.
      modCount = count - discount + pool*backoffProb
      if doLogging:
         # Note that the self.logStr at the end is the one set by the
         # recursive call to _prob
         self.logStr = baseStr + \
            '{}:{} ({}-{}+{})'.format(featureStr, numStr(modCount / total),
                                      count, numStr(discount),
                                      numStr(pool*backoffProb)) + \
            self.logStr
      return modCount, total
示例#6
0
 def logScore(self, input, word, totalMass):
    """Return the combined log-probability score of *word* across all
    feature counters, normalized by *totalMass*, together with a
    per-feature breakdown string.

    Fix: the non-KN branch previously called feature.probability(input,
    word) twice (once for display, once for the log-sum); it is now
    computed once per feature.
    """
    output = '{'
    sep = ''
    prob = 0
    for feature in self.classifier.featureCounters:
       if isinstance(feature, (KNFeatureChain, KNFeatureGrid)):
          # KN features log their own trace while computing.
          prob += math.log(feature.probability(input, word, True))
          output += sep + feature.logStr
          sep = '\n'
       else:
          # Compute once: the same probability feeds both the display
          # string and the accumulated log-probability.
          p = feature.probability(input, word)
          output += sep + '{}:{}({})'.format(feature.feature.name,
                numStr(p), feature.featureCount(input, word))
          prob += math.log(p)
          sep = ', '
    output += '}'
    return '{} {}'.format(numStr(math.exp(prob-totalMass)), output)
示例#7
0
 def update(self, input, word):
     """Perform one online weight update: compare the model's top guess
     with the actual *word* and nudge each feature's weight by the
     log-probability difference, with a decaying learning rate.

     Fix: each featureCounter.probability(...) was previously evaluated
     twice per feature (once for the log line, once for the gradient);
     each is now computed once.
     """
     prediction = self.classifier._predictAnnotated(input)
     self.classifier._logPrediction(input, prediction, word)
     topChoice = prediction[0][0]
     logFile = Logger.logFile('log')
     for featureCounter in self.classifier.featureCounters:
         # Evaluate each probability once; it feeds both the log line
         # and the gradient step below.
         pWord = featureCounter.probability(input, word)
         pTop = featureCounter.probability(input, topChoice)
         logFile.log('{}, {}, {}'.format(
             featureCounter.feature.name, numStr(pWord), numStr(pTop)))
         self.weights[featureCounter.feature.name] += \
            self.l_rate * (math.log(pWord) - math.log(pTop))
     # Decay the learning rate as 1/sqrt(t).
     self.t += 1.0
     self.l_rate = 1.0 / math.sqrt(self.t)
     weightString = ', '.join(
         '{}:{}'.format(name, numStr(w)) for name, w in self.weights.items())
     logFile.log('weights = {{{}}}'.format(weightString))
示例#8
0
 def logContext(self, input):
    """Return a comma-separated summary of each active feature's context,
    its context count, and its cached pool mass."""
    # FixMe: output context and pool
    pieces = []
    for featureId in range(self.features.first(input), -1, -1):
       feature = self.features[featureId]
       context = feature.contextFunc(input)
       pieces.append("{}:'{}' ({},{})".format(
             feature.name, _contextToStr(context),
             self.contextCounts[featureId][context],
             numStr(self.poolCache[(featureId, context)])))
    return ', '.join(pieces)
 def update(self, input, word):
     """Perform one online weight update: compare the model's top guess
     with the actual *word* and nudge each feature's weight by the
     log-probability difference, with a decaying learning rate.

     Fix: each featureCounter.probability(...) was previously evaluated
     twice per feature (once for the log line, once for the gradient);
     each is now computed once.
     """
     prediction = self.classifier._predictAnnotated(input)
     self.classifier._logPrediction(input, prediction, word)
     topChoice = prediction[0][0]
     logFile = Logger.logFile("log")
     for featureCounter in self.classifier.featureCounters:
         # Evaluate each probability once; it feeds both the log line
         # and the gradient step below.
         pWord = featureCounter.probability(input, word)
         pTop = featureCounter.probability(input, topChoice)
         logFile.log(
             "{}, {}, {}".format(
                 featureCounter.feature.name, numStr(pWord), numStr(pTop)
             )
         )
         self.weights[featureCounter.feature.name] += self.l_rate * (
             math.log(pWord) - math.log(pTop)
         )
     # Decay the learning rate as 1/sqrt(t).
     self.t += 1.0
     self.l_rate = 1.0 / math.sqrt(self.t)
     weightString = ", ".join(
         "{}:{}".format(name, numStr(w)) for name, w in self.weights.items()
     )
     logFile.log("weights = {{{}}}".format(weightString))
示例#10
0
 def logContext(self, input):
     """Return a comma-separated summary of each active feature's context,
     its context count, and its cached pool mass."""
     # FixMe: output context and pool
     pieces = []
     for featureId in range(self.features.first(input), -1, -1):
         feature = self.features[featureId]
         context = feature.contextFunc(input)
         pieces.append("{}:'{}' ({},{})".format(
             feature.name, _contextToStr(context),
             self.contextCounts[featureId][context],
             numStr(self.poolCache[(featureId, context)])))
     return ', '.join(pieces)
示例#11
0
 def _prob(self, input, word, featureId, useBackoff, doLogging):
     """Return the smoothed probability of *word* for the feature chain
     starting at *featureId*, recursively backing off to the successor
     feature and bottoming out at a uniform distribution.

     useBackoff selects the backoff count tables instead of the primary
     ones.  When doLogging is set, a trace of each interpolation step is
     accumulated in self.logStr / self.logSep as a side effect.

     Fixes: `featureId == None` replaced with the PEP 8 idiom
     `featureId is None`; the else-body is flattened into a guard clause.
     """
     # Base case: past the last feature -- uniform distribution over the
     # known vocabulary (cached after first use).
     if featureId is None:
         if not self.uniformCache:
             self.uniformCache = 1 / float(len(self.classifier.words))
         return self.uniformCache

     feature = self.features[featureId]
     context = feature.contextFunc(input)
     featureVal = feature.featureFunc(input, word)
     contextCounts = self.backoffContextCounts[featureId] if useBackoff \
                     else self.contextCounts[featureId]
     total = contextCounts[context]
     # Unseen context: this level contributes nothing; defer entirely to
     # the next (backoff) feature in the chain.
     if total == 0:
         return self._prob(input, word, self.features.succ(featureId),
                           True, doLogging)
     counts = self.backoffCounts[featureId] if useBackoff else \
              self.counts[featureId]
     count = counts[(context, featureVal)]
     discount = self.discount(featureId, useBackoff, count)
     pool = self.pool(featureId, context, useBackoff)
     if doLogging:
         # We cache logStr here because it will be overwritten by the
         # recursive call to _prob
         baseStr = self.logStr + self.logSep
         self.logSep = ', '
     backoffProb = self._prob(input, word,
                              self.features.succ(featureId), True,
                              doLogging)
     # Remove the discount from the raw count and add back the pooled
     # mass weighted by the backoff probability.
     prob = (count - discount + pool * backoffProb) / total
     if doLogging:
         # Note that the self.logStr at the end is the one set by the
         # recursive call to _prob
         self.logStr = baseStr + \
            '{}:{} ({}-{}+{})'.format(feature.name, numStr(prob), count,
                                     numStr(discount),
                                     numStr(pool*backoffProb)) + \
            self.logStr
     return prob
示例#12
0
 def _prob(self, input, word, featureId, useBackoff, doLogging):
    """Return the smoothed probability of *word* for the feature chain
    starting at *featureId*, recursively backing off to the successor
    feature and bottoming out at a uniform distribution.

    useBackoff selects the backoff count tables instead of the primary
    ones.  When doLogging is set, a trace of each interpolation step is
    accumulated in self.logStr / self.logSep as a side effect.

    Fixes: `featureId == None` replaced with the PEP 8 idiom
    `featureId is None`; the else-body is flattened into a guard clause.
    """
    # Base case: past the last feature -- uniform distribution over the
    # known vocabulary (cached after first use).
    if featureId is None:
       if not self.uniformCache:
          self.uniformCache = 1 / float(len(self.classifier.words))
       return self.uniformCache

    feature = self.features[featureId]
    context = feature.contextFunc(input)
    featureVal = feature.featureFunc(input, word)
    contextCounts = self.backoffContextCounts[featureId] if useBackoff \
                    else self.contextCounts[featureId]
    total = contextCounts[context]
    # Unseen context: this level contributes nothing; defer entirely to
    # the next (backoff) feature in the chain.
    if total == 0:
       return self._prob(input, word, self.features.succ(featureId),
                         True, doLogging)
    counts = self.backoffCounts[featureId] if useBackoff else \
             self.counts[featureId]
    count = counts[(context, featureVal)]
    discount = self.discount(featureId, useBackoff, count)
    pool = self.pool(featureId, context, useBackoff)
    if doLogging:
       # We cache logStr here because it will be overwritten by the
       # recursive call to _prob
       baseStr = self.logStr + self.logSep
       self.logSep = ', '
    backoffProb = self._prob(input, word, self.features.succ(featureId),
                             True, doLogging)
    # Remove the discount from the raw count and add back the pooled
    # mass weighted by the backoff probability.
    prob = (count - discount + pool*backoffProb) / total
    if doLogging:
       # Note that the self.logStr at the end is the one set by the
       # recursive call to _prob
       self.logStr = baseStr + \
          '{}:{} ({}-{}+{})'.format(feature.name, numStr(prob), count,
                                   numStr(discount),
                                   numStr(pool*backoffProb)) + \
          self.logStr
    return prob
示例#13
0
 def _logContext(self, input):
    """Render a '{...}' summary of context counts and pool masses for
    every feature-id combination that is active (the very first
    combination is always emitted)."""
    entries = []
    idRanges = [range(f.first(input), -1, -1) for f in self.features]
    emitted = False
    for ids in itertools.product(*idRanges):
       feats = [f[fid] for f, fid in zip(self.features, ids)]
       ctxs = tuple(f.contextFunc(input) for f in feats)
       # Skip zero-count cells, except always keep the first cell.
       if emitted and self.contextCounts[ids][ctxs] <= 0:
          continue
       emitted = True
       names = ' '.join(f.name for f in feats)
       marker = '*' if ids in self.startPoints else ' '
       backoffBits = ' '.join(
             '({},{})'.format(self.backoffContextCounts[axis][ids][ctxs],
                              numStr(self.backoffPoolCache[axis][(ids, ctxs)]))
             for axis in range(len(self.features)))
       entries.append("{}{}:'{}' ({},{}) {}".format(
             marker, names, _contextsToStr(ctxs),
             self.contextCounts[ids][ctxs],
             numStr(self.poolCache[(ids, ctxs)]), backoffBits))
    return '{' + ',\n'.join(entries) + '}'
示例#14
0
 def logScore(self, input, word, totalMass):
     """Return the combined weighted log-linear score of *word*,
     normalized by *totalMass*, with a per-feature breakdown appended."""
     breakdown = "{"
     sep = ""
     score = 0.0
     for feature in self.classifier.featureCounters:
         weight = self.weights[feature.feature.name]
         if isinstance(feature, KNFeatureChain):
             # Chain features accumulate their own trace while scoring.
             term = math.log(feature.probability(input, word, True)) * weight
             breakdown += sep + feature.logStr
             sep = "\n"
         else:
             term = math.log(feature.probability(input, word)) * weight
             breakdown += sep + "{}:{:.1f}({})".format(feature.feature.name, term, feature.featureCount(input, word))
             sep = ", "
         score += term
     breakdown += "}"
     return "{} {}".format(numStr(math.exp(score - totalMass)), breakdown)
示例#15
0
 def _logPrediction(self, input, prediction, word):
     """Log the full prediction record for one input position: context,
     total mass, the actual word's rank and score (when known), and the
     top five guesses with their score breakdowns."""
     logFile = Logger.logFile('log')
     predictionSummaries = Logger.logFile('predictionSummaries')
     lineNo = input.lineIndex + 1
     lastNewLine = input.lines[input.lineIndex][1]
     loc = input.input.location
     locString = '{}:{},{}'.format(input.input.path, lineNo,
                                   loc - lastNewLine + 1)
     words = input.words
     logFile.start(locString)
     logFile.log('context = ' + self._logContext(input))
     totalMass = self._totalMass(prediction)
     logFile.log('totalMass = {}'.format(numStr(math.exp(totalMass))))
     predictionWords = [w for w, score in prediction]
     index = input.index
     # Up to ten words of preceding context for the summary line.
     prevWords = ' '.join(w for w, l, n in words[max(0, index - 10):index])
     firstChar = words[index][0] if len(words) > index else ''
     predictionSummary = [
         locString, prevWords, firstChar, ' '.join(predictionWords[:5])
     ]
     if word:
         logFile.start('actual')
         if word in predictionWords:
             scoreStr = self._logScore(input, word, totalMass)
             rank = predictionWords.index(word) + 1
             predictionSummary.append(str(rank))
             logFile.log('{}/{}. {} {}'.format(rank, len(self.words),
                                               word, scoreStr))
         else:
             predictionSummary.append('-1')
             logFile.log('-1/{}. {}'.format(len(self.words), word))
         logFile.end()
     predictionSummaries.log('\t'.join(predictionSummary))
     logFile.start('guesses')
     for rank, guess in enumerate(predictionWords[:5], start=1):
         scoreStr = self._logScore(input, guess, totalMass)
         logFile.log(str(rank) + '. ' + guess + ' ' + scoreStr)
     logFile.end()
     logFile.end()
示例#16
0
 def logScore(self, input, word, totalMass):
     """Return the combined weighted log-linear score of *word*,
     normalized by *totalMass*, with a per-feature breakdown appended."""
     breakdown = '{'
     sep = ''
     score = 0.0
     for feature in self.classifier.featureCounters:
         weight = self.weights[feature.feature.name]
         if isinstance(feature, KNFeatureChain):
             # Chain features accumulate their own trace while scoring.
             term = math.log(feature.probability(input, word, True)) * weight
             breakdown += sep + feature.logStr
             sep = '\n'
         else:
             term = math.log(feature.probability(input, word)) * weight
             breakdown += sep + '{}:{:.1f}({})'.format(
                 feature.feature.name, term,
                 feature.featureCount(input, word))
             sep = ', '
         score += term
     breakdown += '}'
     return '{} {}'.format(numStr(math.exp(score - totalMass)), breakdown)
示例#17
0
 def _logPrediction(self, input, prediction, word):
    """Log the full prediction record for one input position: context,
    total mass, the actual word's rank and score (when known), and the
    top five guesses with their score breakdowns."""
    logFile = Logger.logFile('log')
    predictionSummaries = Logger.logFile('predictionSummaries')
    lineNo = input.lineIndex + 1
    lastNewLine = input.lines[input.lineIndex][1]
    loc = input.input.location
    locString = '{}:{},{}'.format(input.input.path, lineNo,
                                  loc - lastNewLine + 1)
    words = input.words
    logFile.start(locString)
    logFile.log('context = ' + self._logContext(input))
    totalMass = self._totalMass(prediction)
    logFile.log('totalMass = {}'.format(numStr(math.exp(totalMass))))
    predictionWords = [w for w, score in prediction]
    index = input.index
    # Up to ten words of preceding context for the summary line.
    prevWords = ' '.join(w for w, l, n in words[max(0, index - 10):index])
    firstChar = words[index][0] if len(words) > index else ''
    predictionSummary = [locString, prevWords, firstChar,
                         ' '.join(predictionWords[:5])]
    if word:
       logFile.start('actual')
       if word in predictionWords:
          scoreStr = self._logScore(input, word, totalMass)
          rank = predictionWords.index(word) + 1
          predictionSummary.append(str(rank))
          logFile.log('{}/{}. {} {}'.format(rank,
                                           len(self.words), word, scoreStr))
       else:
          predictionSummary.append('-1')
          logFile.log('-1/{}. {}'.format(len(self.words), word))
       logFile.end()
    predictionSummaries.log('\t'.join(predictionSummary))
    logFile.start('guesses')
    for rank, guess in enumerate(predictionWords[:5], start=1):
       scoreStr = self._logScore(input, guess, totalMass)
       logFile.log(str(rank) + '. ' + guess + ' ' + scoreStr)
    logFile.end()
    logFile.end()