def __distance(self):
    """
    Kullback-Leibler Distance between the model and an observation.
    We expect a model, an observation, epsilon, alpha and beta
    already estimated properly.
    """
    dist = 0.

    # Estimate the distance using each of the observed ngrams.
    # Iterate over unique ngrams so that duplicates in the observation
    # do not contribute the same term more than once.
    for x in set(self.ngrams):
        probamodel = self.epsilon
        if x in self.model:
            probamodel = self.alpha * self.model[x]
        probangram = self.beta * (float(self.ngrams.count(x)) / float(len(self.ngrams)))
        dist += (probamodel - probangram) * log2(probamodel / probangram)

    # Estimate the distance using ngrams that occur only in the model
    for x in self.model:
        if x not in self.ngrams:
            probamodel = self.alpha * self.model[x]
            dist += (probamodel - self.epsilon) * log2(probamodel / self.epsilon)

    return dist
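
# A minimal standalone sketch of the arithmetic performed by __distance,
# applied to hand-built toy data. The names (toy_model, toy_ngrams) and the
# smoothing constants below are illustrative assumptions, not part of the
# class above; alpha and beta deduct from each side the epsilon mass given
# to the symbols it has never seen.
def _example_kl_distance():
    from math import log2
    toy_model = {"a": 0.7, "b": 0.3}    # model probabilities, already normalized
    toy_ngrams = ["a", "a", "b", "c"]   # observed sequence; "c" is unseen in the model
    epsilon = 0.01                      # back-off probability for unseen symbols
    alpha = 1. - 1 * epsilon            # one observed symbol ("c") is unseen in the model
    beta = 1. - 0 * epsilon             # every model symbol occurs in the observation
    dist = 0.
    for x in set(toy_ngrams) | set(toy_model):
        probamodel = alpha * toy_model[x] if x in toy_model else epsilon
        probangram = beta * toy_ngrams.count(x) / len(toy_ngrams) if x in toy_ngrams else epsilon
        dist += (probamodel - probangram) * log2(probamodel / probangram)
    return dist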

def __distance_emptymodel(self):
    """
    Distance between an empty model and an observation.
    We expect an observation, epsilon and beta already estimated properly.
    """
    # With an empty model, every observed ngram backs off to epsilon
    # on the model side.
    dist = 0.
    for x in set(self.ngrams):
        probangram = self.beta * (float(self.ngrams.count(x)) / float(len(self.ngrams)))
        dist += (self.epsilon - probangram) * log2(self.epsilon / probangram)
    return dist

def __distance_emptyobservation(self):
    """
    Distance between a model and an empty observation.
    We expect a model, epsilon and alpha already estimated properly.
    """
    # With an empty observation, every model ngram backs off to epsilon
    # on the observation side.
    dist = 0.
    for x in self.model:
        probamodel = self.alpha * self.model[x]
        dist += (probamodel - self.epsilon) * log2(probamodel / self.epsilon)
    return dist
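
# A hedged sketch of the two degenerate cases above: when one side is empty,
# every remaining symbol is compared against the epsilon back-off, so each
# sum collapses to a single loop. All names and constants are illustrative.
def _example_empty_cases():
    from math import log2
    epsilon = 0.01
    # Empty model: each observed ngram meets epsilon on the model side.
    ngrams = ["a", "a", "b"]
    beta = 1.
    d_empty_model = sum(
        (epsilon - q) * log2(epsilon / q)
        for q in (beta * ngrams.count(x) / len(ngrams) for x in set(ngrams))
    )
    # Empty observation: each model ngram meets epsilon on the observation side.
    model = {"a": 0.7, "b": 0.3}
    alpha = 1.
    d_empty_observation = sum(
        (alpha * p - epsilon) * log2(alpha * p / epsilon)
        for p in model.values()
    )
    return d_empty_model, d_empty_observation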

def get(self, symbols):
    """
    Estimates the perplexity of a vector of symbols.

    @return float value
    """
    exr = symbols_to_items(symbols, self.ngram)
    entropy = 0.

    for symbol, occurrences in exr.items():
        realsymbol = " ".join(symbol).strip()
        probability = self.model.get(realsymbol, self.epsilon)
        self_information = log2(1.0 / probability)
        entropy += (probability * self_information) * occurrences

    return pow(2, entropy)
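
# A hedged usage sketch for the perplexity estimator. It assumes that
# symbols_to_items maps a symbol sequence to a dict {ngram_tuple: occurrences};
# that mapping is inlined here for unigrams so the sketch runs standalone.
def _example_perplexity():
    from math import log2
    model = {"a": 0.5, "b": 0.25, "c": 0.25}   # unigram model probabilities
    epsilon = 0.001                            # probability of out-of-model symbols
    symbols = ["a", "b", "a", "d"]             # "d" falls back to epsilon
    counts = {}
    for s in symbols:                          # inlined unigram symbols_to_items
        counts[(s,)] = counts.get((s,), 0) + 1
    entropy = 0.
    for symbol, occurrences in counts.items():
        probability = model.get(" ".join(symbol).strip(), epsilon)
        entropy += (probability * log2(1.0 / probability)) * occurrences
    return pow(2, entropy)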

def get(self):
    """
    Estimates the Shannon entropy of a vector of symbols.

    Shannon's entropy measures the information contained in a message,
    as opposed to the portion of the message that is determined
    (or predictable).

    @return float value
    """
    exr = symbols_to_items(self.symbols, self.ngram)
    total = len(self.symbols) - self.ngram + 1
    entropy = 0.

    for symbol, occurrences in exr.items():
        probability = 1.0 * occurrences / total
        self_information = log2(1.0 / probability)
        entropy += probability * self_information

    return entropy
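
# A hedged worked sketch of the entropy estimator on unigrams: a message made
# of two equiprobable symbols carries exactly 1 bit per symbol. The inlined
# counting mirrors what symbols_to_items is assumed to return.
def _example_entropy():
    from math import log2
    symbols = ["a", "b", "a", "b"]
    total = len(symbols)                  # with ngram = 1, total = len(symbols) - 1 + 1
    counts = {}
    for s in symbols:
        counts[s] = counts.get(s, 0) + 1
    entropy = 0.
    for occurrences in counts.values():
        probability = 1.0 * occurrences / total
        entropy += probability * log2(1.0 / probability)
    return entropy                        # 1.0 for this message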