def _logScore(self, input, word, totalMass):
    """Render a per-feature-combination breakdown of the score for `word`.

    Walks the cross product of every feature's backoff range
    (`first(input)..0`) and, for each combination with a positive raw
    count, emits one entry showing the count, its discount, and each
    per-feature backoff (count, discount) pair.  Returns the formatted
    probability of `word` followed by the '{...}' breakdown.
    `totalMass` is accepted for signature parity but not used here.
    """
    ranges = (range(feat.first(input), -1, -1) for feat in self.features)
    entries = []
    for featureIds in itertools.product(*ranges):
        activeFeatures = [feat[fid]
                          for feat, fid in zip(self.features, featureIds)]
        contexts = tuple(feat.contextFunc(input) for feat in activeFeatures)
        featureStr = ' '.join(feat.name for feat in activeFeatures)
        count = self.counts[featureIds][(contexts, word)]
        if count <= 0:
            continue
        marker = '*' if featureIds in self.startPoints else ' '
        backoffParts = []
        for i in range(len(self.features)):
            backoffCount = self.backoffCounts[i][featureIds][(contexts, word)]
            backoffDiscount = self.discount(featureIds, i, backoffCount,
                                            featureStr)
            backoffParts.append('({},{})'.format(backoffCount,
                                                 numStr(backoffDiscount)))
        topDiscount = self.discount(featureIds, None, count, featureStr)
        entries.append("{}{}:'{}' ({},{}) {}".format(
            marker, featureStr, _contextsToStr(contexts), count,
            numStr(topDiscount), ' '.join(backoffParts)))
    output = '{' + ',\n'.join(entries) + '}'
    return '{} {}'.format(numStr(self.probability(input, word)), output)
def _logContext(self, input):
    """Render the smoothing context for every feature-id combination.

    Iterates the cross product of each feature's backoff range.  The very
    first combination is always included; subsequent ones only when their
    context count is positive.  Each entry shows a start-point marker, the
    feature names, the contexts, the context count with its cached pool
    mass, and per-feature backoff (count, pool) pairs.
    """
    ranges = (range(feat.first(input), -1, -1) for feat in self.features)
    entries = []
    includeNext = True  # the first combination is reported unconditionally
    for featureIds in itertools.product(*ranges):
        activeFeatures = [feat[fid]
                          for feat, fid in zip(self.features, featureIds)]
        contexts = tuple(feat.contextFunc(input) for feat in activeFeatures)
        if not includeNext and self.contextCounts[featureIds][contexts] <= 0:
            continue
        includeNext = False
        featureStr = ' '.join(feat.name for feat in activeFeatures)
        marker = '*' if featureIds in self.startPoints else ' '
        backoffStr = ' '.join(
            '({},{})'.format(
                self.backoffContextCounts[i][featureIds][contexts],
                numStr(self.backoffPoolCache[i][(featureIds, contexts)]))
            for i in range(len(self.features)))
        entries.append("{}{}:'{}' ({},{}) {}".format(
            marker, featureStr, _contextsToStr(contexts),
            self.contextCounts[featureIds][contexts],
            numStr(self.poolCache[(featureIds, contexts)]), backoffStr))
    return '{' + ',\n'.join(entries) + '}'
def _logScore(self, input, word, totalMass):
    """Build a human-readable breakdown of the score assigned to `word`.

    For every combination of feature backoff levels (cross product of each
    feature's `first(input)..0` range) whose raw count is positive, appends
    an entry with the count, its discount, and each per-feature backoff
    (count, discount) pair.  Returns the formatted probability followed by
    the '{...}' breakdown.  `totalMass` is accepted but unused here —
    presumably kept for signature parity with other logScore variants;
    verify against callers.
    """
    output = '{'
    sep = ''
    idxs = (range(feature.first(input),-1,-1) for feature in self.features)
    for featureIds in itertools.product(*idxs):
        features = [feature[featureId] for feature,featureId in zip(self.features, featureIds)]
        contexts = tuple(feature.contextFunc(input) for feature in features)
        featureStr = ' '.join(feature.name for feature in features)
        countStr = ''
        subSep = ''
        count = self.counts[featureIds][(contexts, word)]
        if count > 0:
            # '*' marks combinations registered as interpolation start points.
            isStartPoint = '*' if featureIds in self.startPoints else ' '
            for i in range(len(self.features)):
                backoffCount = self.backoffCounts[i][featureIds][(contexts, word)]
                discount = self.discount(featureIds, i, backoffCount, featureStr)
                countStr += subSep + '({},{})'.format(backoffCount, numStr(discount))
                subSep = ' '
            # Top-level (non-backoff) discount for this combination.
            discount = self.discount(featureIds, None, count, featureStr)
            output += sep + "{}{}:'{}' ({},{}) {}".format(isStartPoint, featureStr,
                _contextsToStr(contexts), count, numStr(discount), countStr)
            sep = ',\n'
    output += '}'
    return '{} {}'.format(numStr(self.probability(input, word)), output)
def _prob(self, input, word, featureIds, backoffOrigin, doLogging):
    """Recursively compute an interpolated (count, total) pair for `word`.

    featureIds    -- tuple of per-feature backoff levels being evaluated.
    backoffOrigin -- index of the feature whose backoff chain we are on,
                     or None at the top level (selects which count tables
                     are consulted).
    doLogging     -- when True, builds a trace in self.logStr/self.logSep.

    Returns (modCount, total) so the caller can form modCount/total; when
    the context is unseen (total == 0) returns the raw backoff sums
    instead.
    """
    features = [feature[featureId]
                for feature, featureId in zip(self.features, featureIds)]
    contexts = tuple(feature.contextFunc(input) for feature in features)
    # FixMe: [correctness] Figure out how to handle nontrivial featureVal
    featureVal = word
    contextCounts = self.backoffContextCounts[backoffOrigin][featureIds] \
        if backoffOrigin is not None \
        else self.contextCounts[featureIds]
    total = contextCounts[contexts]
    if doLogging:
        # We cache logStr here because it will be overwritten by the
        # recursive call to _prob
        baseStr = self.logStr + self.logSep
        self.logSep = ', '
    # NOTE(review): backoffTots is keyed on featureIds alone, yet the sums
    # depend on input/word via the recursive calls — this assumes the cache
    # is cleared between predictions; confirm at the call site.
    if featureIds in self.backoffTots:
        numTot, denomTot = self.backoffTots[featureIds]
    else:
        numTot, denomTot = 0, 0
        for i, featureId in enumerate(featureIds):
            # print i,featureId
            nextFeatureId = self.features[i].succ(featureId)
            if nextFeatureId is None:
                # End of this feature's backoff chain: contribute the
                # uniform distribution (1 over vocabulary size).
                if not self.uniformCache:
                    self.uniformCache = float(len(self.words))
                numTot += 1
                denomTot += self.uniformCache
            else:
                nextFeatureIds = list(featureIds)
                nextFeatureIds[i] = nextFeatureId
                num, denom = self._prob(input, word, tuple(nextFeatureIds),
                                        i, doLogging)
                numTot += num
                denomTot += denom
        self.backoffTots[featureIds] = (numTot, denomTot)
    # NOTE(review): plain '/' — under Python 2 this could truncate if both
    # sums happen to be ints; verify the intended interpreter version.
    backoffProb = numTot / denomTot
    featureStr = ' '.join(feature.name for feature in features)
    if total == 0:
        # Unseen context: propagate the backoff sums unchanged.
        return numTot, denomTot
    counts = self.backoffCounts[backoffOrigin][featureIds] \
        if backoffOrigin is not None else self.counts[featureIds]
    count = counts[(contexts, featureVal)]
    discount = self.discount(featureIds, backoffOrigin, count, featureStr)
    pool = self.pool(featureIds, contexts, backoffOrigin)
    # Discounted count plus the redistributed (pooled) backoff mass.
    modCount = count - discount + pool * backoffProb
    if doLogging:
        # Note that the self.logStr at the end is the one set by the
        # recursive call to _prob
        self.logStr = baseStr + \
            '{}:{} ({}-{}+{})'.format(featureStr, numStr(modCount / total),
                                      count, numStr(discount),
                                      numStr(pool*backoffProb)) + \
            self.logStr
    return modCount, total
def _prob(self, input, word, featureIds, backoffOrigin, doLogging):
    """Recursive interpolated (count, total) computation for `word`.

    featureIds    -- tuple of per-feature backoff levels.
    backoffOrigin -- feature index whose backoff tables apply, or None for
                     the top-level tables.
    doLogging     -- when True, accumulates a trace in self.logStr.

    Returns (modCount, total); for an unseen context (total == 0) returns
    the summed backoff (numerator, denominator) instead.
    """
    features = [feature[featureId] for feature,featureId in zip(self.features, featureIds)]
    contexts = tuple(feature.contextFunc(input) for feature in features)
    # FixMe: [correctness] Figure out how to handle nontrivial featureVal
    featureVal = word
    contextCounts = self.backoffContextCounts[backoffOrigin][featureIds] \
        if backoffOrigin is not None \
        else self.contextCounts[featureIds]
    total = contextCounts[contexts]
    if doLogging:
        # We cache logStr here because it will be overwritten by the
        # recursive call to _prob
        baseStr = self.logStr + self.logSep
        self.logSep = ', '
    # NOTE(review): cache key is featureIds only, but the sums depend on
    # input/word through the recursion — assumes backoffTots is reset per
    # prediction; confirm with the caller.
    if featureIds in self.backoffTots:
        numTot, denomTot = self.backoffTots[featureIds]
    else:
        numTot, denomTot = 0, 0
        for i,featureId in enumerate(featureIds):
            # print i,featureId
            nextFeatureId = self.features[i].succ(featureId)
            if nextFeatureId is None:
                # Chain exhausted: fall back to the uniform distribution.
                if not self.uniformCache:
                    self.uniformCache = float(len(self.words))
                numTot += 1
                denomTot += self.uniformCache
            else:
                nextFeatureIds = list(featureIds)
                nextFeatureIds[i] = nextFeatureId
                num, denom = self._prob(input, word, tuple(nextFeatureIds), i, doLogging)
                numTot += num
                denomTot += denom
        self.backoffTots[featureIds] = (numTot, denomTot)
    backoffProb = numTot / denomTot
    featureStr = ' '.join(feature.name for feature in features)
    if total == 0:
        # Unseen context: propagate the raw backoff sums.
        return numTot, denomTot
    counts = self.backoffCounts[backoffOrigin][featureIds] \
        if backoffOrigin is not None else self.counts[featureIds]
    count = counts[(contexts, featureVal)]
    discount = self.discount(featureIds, backoffOrigin, count, featureStr)
    pool = self.pool(featureIds, contexts, backoffOrigin)
    # Discounted count plus pooled backoff mass.
    modCount = count - discount + pool*backoffProb
    if doLogging:
        # Note that the self.logStr at the end is the one set by the
        # recursive call to _prob
        self.logStr = baseStr + \
            '{}:{} ({}-{}+{})'.format(featureStr, numStr(modCount / total),
                                      count, numStr(discount),
                                      numStr(pool*backoffProb)) + \
            self.logStr
    return modCount, total
def logScore(self, input, word, totalMass):
    """Combine every feature counter's log-probability for `word`.

    KN chain/grid counters contribute their own multi-line trace
    (`logStr`); other counters are rendered inline as name:prob(count).
    Returns the normalized probability (exp of the summed log-probability
    minus `totalMass`) followed by the '{...}' trace.
    """
    pieces = ''
    joiner = ''
    logProb = 0
    for counter in self.classifier.featureCounters:
        if isinstance(counter, (KNFeatureChain, KNFeatureGrid)):
            # probability(..., True) also populates counter.logStr.
            logProb += math.log(counter.probability(input, word, True))
            pieces += joiner + counter.logStr
            joiner = '\n'
        else:
            pieces += joiner + '{}:{}({})'.format(
                counter.feature.name,
                numStr(counter.probability(input, word)),
                counter.featureCount(input, word))
            logProb += math.log(counter.probability(input, word))
            joiner = ', '
    return '{} {}'.format(numStr(math.exp(logProb - totalMass)),
                          '{' + pieces + '}')
def update(self, input, word):
    """Perceptron-style weight update after observing the actual `word`.

    Re-runs the annotated prediction, logs it, then shifts each feature
    counter's weight by l_rate times the log-probability gap between the
    actual word and the classifier's current top choice.  Finally decays
    the learning rate as 1/sqrt(t) and logs the resulting weights.
    """
    ranked = self.classifier._predictAnnotated(input)
    self.classifier._logPrediction(input, ranked, word)
    best = ranked[0][0]
    log = Logger.logFile('log')
    for counter in self.classifier.featureCounters:
        name = counter.feature.name
        log.log('{}, {}, {}'.format(
            name,
            numStr(counter.probability(input, word)),
            numStr(counter.probability(input, best))))
        # Positive gap -> this counter preferred the true word; its weight
        # is increased (and vice versa).
        gap = (math.log(counter.probability(input, word)) -
               math.log(counter.probability(input, best)))
        self.weights[name] += self.l_rate * gap
    self.t += 1.0
    self.l_rate = 1.0 / math.sqrt(self.t)
    formatted = ', '.join('{}:{}'.format(name, numStr(value))
                          for name, value in self.weights.items())
    log.log('weights = {{{}}}'.format(formatted))
def logContext(self, input):
    """Summarize each backoff level's context on a single line.

    For every feature id from features.first(input) down to 0, reports the
    feature name, its context string, the context count, and the cached
    pool mass, joined with ', '.
    """
    # FixMe: output context and pool
    entries = []
    for fid in range(self.features.first(input), -1, -1):
        feat = self.features[fid]
        ctx = feat.contextFunc(input)
        entries.append("{}:'{}' ({},{})".format(
            feat.name, _contextToStr(ctx),
            self.contextCounts[fid][ctx],
            numStr(self.poolCache[(fid, ctx)])))
    return ', '.join(entries)
def update(self, input, word):
    """Perceptron-style weight update after observing the actual `word`.

    Re-runs the annotated prediction, logs it, then for every feature
    counter moves its weight by l_rate times the log-probability gap
    between the actual word and the classifier's top choice.  Finally
    decays the learning rate as 1/sqrt(t) and logs the new weights.
    """
    prediction = self.classifier._predictAnnotated(input)
    self.classifier._logPrediction(input, prediction, word)
    topChoice = prediction[0][0]
    logFile = Logger.logFile("log")
    for featureCounter in self.classifier.featureCounters:
        logFile.log(
            "{}, {}, {}".format(
                featureCounter.feature.name,
                numStr(featureCounter.probability(input, word)),
                numStr(featureCounter.probability(input, topChoice)),
            )
        )
        # Positive when the actual word was more probable than the top
        # choice under this counter; its weight grows accordingly.
        self.weights[featureCounter.feature.name] += self.l_rate * (
            math.log(featureCounter.probability(input, word))
            - math.log(featureCounter.probability(input, topChoice))
        )
    self.t += 1.0
    self.l_rate = 1.0 / math.sqrt(self.t)
    weightString = ", ".join(["{}:{}".format(it[0], numStr(it[1])) for it in self.weights.items()])
    logFile.log("weights = {{{}}}".format(weightString))
def logContext(self, input):
    """Return a one-line summary of each backoff level's context.

    For each feature id from features.first(input) down to 0, shows the
    feature name, its context, the context count, and the cached pool
    mass, joined with ', '.
    """
    # FixMe: output context and pool
    output = ''
    sep = ''
    for featureId in range(self.features.first(input), -1, -1):
        feature = self.features[featureId]
        context = feature.contextFunc(input)
        output += sep + "{}:'{}' ({},{})".format(
            feature.name, _contextToStr(context),
            self.contextCounts[featureId][context],
            numStr(self.poolCache[(featureId, context)]))
        sep = ', '
    return output
def _prob(self, input, word, featureId, useBackoff, doLogging):
    """Interpolated backoff probability of `word` for one feature chain.

    featureId  -- current backoff level, or None once the chain is
                  exhausted (triggers the uniform base case).
    useBackoff -- when True, consult the backoff count tables instead of
                  the primary ones.
    doLogging  -- when True, accumulate a trace in self.logStr/self.logSep.

    Returns the probability as a float.
    """
    # Fix: compare to the None singleton with `is`, not `==` (PEP 8);
    # `==` would invoke arbitrary __eq__ on custom featureId types.
    if featureId is None:
        # Base case: uniform distribution over the vocabulary (cached).
        if not self.uniformCache:
            self.uniformCache = 1 / float(len(self.classifier.words))
        return self.uniformCache
    feature = self.features[featureId]
    context = feature.contextFunc(input)
    featureVal = feature.featureFunc(input, word)
    contextCounts = self.backoffContextCounts[featureId] if useBackoff \
        else self.contextCounts[featureId]
    total = contextCounts[context]
    if total == 0:
        # Unseen context: skip straight to the next backoff level.
        return self._prob(input, word, self.features.succ(featureId),
                          True, doLogging)
    counts = self.backoffCounts[featureId] if useBackoff else \
        self.counts[featureId]
    count = counts[(context, featureVal)]
    discount = self.discount(featureId, useBackoff, count)
    pool = self.pool(featureId, context, useBackoff)
    if doLogging:
        # We cache logStr here because it will be overwritten by the
        # recursive call to _prob
        baseStr = self.logStr + self.logSep
        self.logSep = ', '
    backoffProb = self._prob(input, word, self.features.succ(featureId),
                             True, doLogging)
    # Discounted count plus the pooled backoff mass, normalized.
    prob = (count - discount + pool * backoffProb) / total
    if doLogging:
        # Note that the self.logStr at the end is the one set by the
        # recursive call to _prob
        self.logStr = baseStr + \
            '{}:{} ({}-{}+{})'.format(feature.name, numStr(prob), count,
                                      numStr(discount),
                                      numStr(pool*backoffProb)) + \
            self.logStr
    return prob
def _prob(self, input, word, featureId, useBackoff, doLogging):
    """Interpolated backoff probability of `word` for one feature chain.

    featureId  -- current backoff level, or None once the chain is
                  exhausted (uniform base case).
    useBackoff -- when True, read the backoff count tables.
    doLogging  -- when True, accumulate a trace in self.logStr/self.logSep.

    Returns the probability as a float.
    """
    # Fix: singleton comparison uses `is`, not `==` (PEP 8).
    if featureId is None:
        # Base case: cached uniform distribution over the vocabulary.
        if not self.uniformCache:
            self.uniformCache = 1 / float(len(self.classifier.words))
        return self.uniformCache
    feature = self.features[featureId]
    context = feature.contextFunc(input)
    featureVal = feature.featureFunc(input, word)
    contextCounts = self.backoffContextCounts[featureId] if useBackoff \
        else self.contextCounts[featureId]
    total = contextCounts[context]
    if total == 0:
        # Unseen context: recurse directly to the next backoff level.
        return self._prob(input, word, self.features.succ(featureId),
                          True, doLogging)
    counts = self.backoffCounts[featureId] if useBackoff else \
        self.counts[featureId]
    count = counts[(context, featureVal)]
    discount = self.discount(featureId, useBackoff, count)
    pool = self.pool(featureId, context, useBackoff)
    if doLogging:
        # We cache logStr here because it will be overwritten by the
        # recursive call to _prob
        baseStr = self.logStr + self.logSep
        self.logSep = ', '
    backoffProb = self._prob(input, word, self.features.succ(featureId),
                             True, doLogging)
    # Discounted count plus the pooled backoff mass, normalized.
    prob = (count - discount + pool*backoffProb) / total
    if doLogging:
        # Note that the self.logStr at the end is the one set by the
        # recursive call to _prob
        self.logStr = baseStr + \
            '{}:{} ({}-{}+{})'.format(feature.name, numStr(prob), count,
                                      numStr(discount),
                                      numStr(pool*backoffProb)) + \
            self.logStr
    return prob
def _logContext(self, input):
    """Render the smoothing context for every feature-id combination.

    The first combination in the cross product is always reported;
    subsequent ones only when their context count is positive.  Each entry
    shows a start-point marker, the feature names, the contexts, the
    context count with its cached pool mass, and per-feature backoff
    (count, pool) pairs.
    """
    output = '{'
    sep = ''
    idxs = (range(feature.first(input),-1,-1) for feature in self.features)
    first = True
    for featureIds in itertools.product(*idxs):
        features = [feature[featureId] for feature,featureId in zip(self.features, featureIds)]
        contexts = tuple(feature.contextFunc(input) for feature in features)
        # `first` forces the initial combination through regardless of count.
        if first or self.contextCounts[featureIds][contexts] > 0:
            first = False
            featureStr = ' '.join(feature.name for feature in features)
            # '*' marks combinations registered as interpolation start points.
            isStartPoint = '*' if featureIds in self.startPoints else ' '
            countStr = ' '.join('({},{})'.format(
                self.backoffContextCounts[i][featureIds][contexts],
                numStr(self.backoffPoolCache[i][(featureIds, contexts)]))
                for i in range(len(self.features)))
            output += sep + "{}{}:'{}' ({},{}) {}".format(isStartPoint, featureStr,
                _contextsToStr(contexts),
                self.contextCounts[featureIds][contexts],
                numStr(self.poolCache[(featureIds, contexts)]), countStr)
            sep = ',\n'
    output += '}'
    return output
def logScore(self, input, word, totalMass):
    """Weighted combination of every feature counter's log-probability.

    Each counter's log-probability is scaled by its learned weight.  KN
    chain counters contribute their own multi-line trace (`logStr`);
    others are rendered inline as name:weightedLogProb(count).  Returns
    the normalized score (exp of the weighted sum minus `totalMass`)
    followed by the '{...}' trace.
    """
    body = ''
    joiner = ''
    weightedSum = 0.0
    for counter in self.classifier.featureCounters:
        name = counter.feature.name
        if isinstance(counter, KNFeatureChain):
            # probability(..., True) also populates counter.logStr.
            contribution = (math.log(counter.probability(input, word, True))
                            * self.weights[name])
            body += joiner + counter.logStr
            joiner = '\n'
        else:
            contribution = (math.log(counter.probability(input, word))
                            * self.weights[name])
            body += joiner + '{}:{:.1f}({})'.format(
                name, contribution, counter.featureCount(input, word))
            joiner = ', '
        weightedSum += contribution
    return '{} {}'.format(numStr(math.exp(weightedSum - totalMass)),
                          '{' + body + '}')
def _logPrediction(self, input, prediction, word):
    """Write a detailed log entry for one prediction.

    Logs the source location, the smoothing context, the total probability
    mass, the rank of the actual `word` within `prediction` (or -1 when it
    was not predicted), and a scored breakdown of the top five guesses.
    Also appends a tab-separated one-line summary row to the
    'predictionSummaries' log.

    prediction -- list of (word, score) pairs, best first.
    word       -- the actual next word, or falsy when unknown.
    """
    logFile = Logger.logFile('log')
    predictionSummaries = Logger.logFile('predictionSummaries')
    # 1-based line number plus column derived from the last newline offset.
    line = input.lineIndex + 1
    lastNewLine = input.lines[input.lineIndex][1]
    loc = input.input.location
    locString = '{}:{},{}'.format(input.input.path, line,
                                  loc - lastNewLine + 1)
    words = input.words
    logFile.start(locString)
    logFile.log('context = ' + self._logContext(input))
    totalMass = self._totalMass(prediction)
    logFile.log('totalMass = {}'.format(numStr(math.exp(totalMass))))
    predictionWords = [w for w, score in prediction]
    index = input.index
    # Up to ten preceding words; entries unpack as (w, l, n) triples —
    # presumably (word, line, newlineOffset); TODO confirm.
    prevWords = ' '.join(
        [w for w, l, n in words[max(0, index - 10):index]])
    firstChar = ''
    if len(words) > index:
        firstChar = words[index][0]
    predictionSummary = [
        locString, prevWords, firstChar, ' '.join(predictionWords[:5])
    ]
    if word:
        logFile.start('actual')
        if word in predictionWords:
            output = self._logScore(input, word, totalMass)
            actualIndex = predictionWords.index(word) + 1  # 1-based rank
            predictionSummary += [str(actualIndex)]
            logFile.log('{}/{}. {} {}'.format(actualIndex, len(self.words),
                                              word, output))
        else:
            # -1 marks "actual word not among the predictions".
            predictionSummary += ['-1']
            logFile.log('-1/{}. {}'.format(len(self.words), word))
        logFile.end()
        # NOTE(review): summary row emitted only when `word` is truthy;
        # this placement was reconstructed from the mangled source — confirm.
        predictionSummaries.log('\t'.join(predictionSummary))
    logFile.start('guesses')
    # `index` is reused as the loop counter here, shadowing the word index.
    for index, guess in enumerate(predictionWords[:5]):
        output = self._logScore(input, guess, totalMass)
        logFile.log(str(index + 1) + '. ' + guess + ' ' + output)
    logFile.end()
    logFile.end()
def logScore(self, input, word, totalMass):
    """Weighted combination of every feature counter's log-probability.

    Each counter's log-probability is scaled by its learned weight; KN
    chain counters contribute their own multi-line trace (`logStr`),
    others an inline name:weightedLogProb(count) entry.  Returns the
    normalized score (exp of the weighted sum minus `totalMass`) followed
    by the '{...}' trace.
    """
    output = '{'
    sep = ''
    score = 0.0
    for feature in self.classifier.featureCounters:
        if isinstance(feature, KNFeatureChain):
            # probability(..., True) also populates feature.logStr.
            val = math.log(feature.probability(
                input, word, True)) * self.weights[feature.feature.name]
            output += sep + feature.logStr
            sep = '\n'
            score += val
        else:
            val = math.log(feature.probability(
                input, word)) * self.weights[feature.feature.name]
            output += sep + '{}:{:.1f}({})'.format(
                feature.feature.name, val, feature.featureCount(
                    input, word))
            sep = ', '
            score += val
    output += '}'
    return '{} {}'.format(numStr(math.exp(score - totalMass)), output)
def _logPrediction(self, input, prediction, word):
    """Write a detailed log entry for one prediction.

    Logs the source location, the smoothing context, the total probability
    mass, the rank of the actual `word` within `prediction` (-1 when it is
    absent), and a scored breakdown of the top five guesses.  Also appends
    a tab-separated summary row to the 'predictionSummaries' log.

    prediction -- list of (word, score) pairs, best first.
    word       -- the actual next word, or falsy when unknown.
    """
    logFile = Logger.logFile('log')
    predictionSummaries = Logger.logFile('predictionSummaries')
    # 1-based line number plus column derived from the last newline offset.
    line = input.lineIndex + 1
    lastNewLine = input.lines[input.lineIndex][1]
    loc = input.input.location
    locString = '{}:{},{}'.format(input.input.path, line,
                                  loc - lastNewLine + 1)
    words = input.words
    logFile.start(locString)
    logFile.log('context = ' + self._logContext(input))
    totalMass = self._totalMass(prediction)
    logFile.log('totalMass = {}'.format(numStr(math.exp(totalMass))))
    predictionWords = [w for w,score in prediction]
    index = input.index
    # Up to ten preceding words; entries unpack as (w, l, n) triples —
    # presumably (word, line, newlineOffset); TODO confirm.
    prevWords = ' '.join([w for w,l,n in words[max(0, index-10):index]])
    firstChar = ''
    if len(words) > index:
        firstChar = words[index][0]
    predictionSummary = [locString, prevWords, firstChar,
                         ' '.join(predictionWords[:5])]
    if word:
        logFile.start('actual')
        if word in predictionWords:
            output = self._logScore(input, word, totalMass)
            actualIndex = predictionWords.index(word) + 1  # 1-based rank
            predictionSummary += [str(actualIndex)]
            logFile.log('{}/{}. {} {}'.format(actualIndex, len(self.words),
                                              word, output))
        else:
            # -1 marks "actual word not among the predictions".
            predictionSummary += ['-1']
            logFile.log('-1/{}. {}'.format(len(self.words), word))
        logFile.end()
        # NOTE(review): summary row emitted only when `word` is truthy;
        # this placement was reconstructed from the mangled source — confirm.
        predictionSummaries.log('\t'.join(predictionSummary))
    logFile.start('guesses')
    # `index` is reused as the loop counter here, shadowing the word index.
    for index,guess in enumerate(predictionWords[:5]):
        output = self._logScore(input, guess, totalMass)
        logFile.log(str(index+1) + '. ' + guess + ' ' + output)
    logFile.end()
    logFile.end()