Example #1
    def __distance(self):
        """
        Kullback-Leibler distance between the model and an observation.
        We expect a model, an observation, epsilon, alpha and beta
        already estimated properly.

        """
        # log2 comes from the math module (from math import log2).
        dist = 0.

        # Terms for the ngrams seen in the observation.
        # Iterate over distinct ngrams only: probangram already carries the
        # occurrence count, so visiting duplicates would over-weight them.
        for x in set(self.ngrams):
            probamodel = self.epsilon
            if x in self.model:
                probamodel = self.alpha * self.model[x]
            probangram = self.beta * (float(self.ngrams.count(x)) / float(len(self.ngrams)))
            dist += (probamodel - probangram) * log2(probamodel / probangram)

        # Terms for the model ngrams that are absent from the observation:
        # their observed probability backs off to epsilon.
        for x in self.model:
            if x not in self.ngrams:
                probamodel = self.alpha * self.model[x]
                dist += (probamodel - self.epsilon) * log2(probamodel / self.epsilon)

        return dist
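
A minimal standalone sketch of the same computation, for trying it outside the class. Everything below is hypothetical: kl_distance is not a function from the source, model is assumed to map ngrams to probabilities, and epsilon, alpha and beta play the same roles as the attributes above (an epsilon floor plus discount coefficients that keep both distributions summing to one).

from math import log2

def kl_distance(model, ngrams, epsilon, alpha, beta):
    # Hypothetical standalone form of __distance above.
    total = float(len(ngrams))
    dist = 0.
    # Terms for distinct ngrams seen in the observation.
    for x in set(ngrams):
        probamodel = alpha * model[x] if x in model else epsilon
        probangram = beta * (ngrams.count(x) / total)
        dist += (probamodel - probangram) * log2(probamodel / probangram)
    # Terms for model ngrams absent from the observation.
    for x in model:
        if x not in ngrams:
            probamodel = alpha * model[x]
            dist += (probamodel - epsilon) * log2(probamodel / epsilon)
    return dist

# A uniform model against a skewed observation gives a small positive distance.
print(kl_distance({"a": 0.5, "b": 0.5}, ["a", "a", "a", "b"],
                  epsilon=1e-6, alpha=0.999, beta=0.999))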
Example #2
    def __distance_emptymodel(self):
        """
        Distance between an empty model and an observation.
        We expect an observation, epsilon and beta already estimated properly.

        """
        dist = 0.
        # The model is empty, so every model probability backs off to epsilon
        # and only the observed distribution contributes.
        for x in set(self.ngrams):
            probangram = self.beta * (float(self.ngrams.count(x)) / float(len(self.ngrams)))
            dist += (self.epsilon - probangram) * log2(self.epsilon / probangram)
        return dist
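
A quick sanity check on the empty-model term, in a hypothetical standalone form (the names below are illustrative, not from the source): since (epsilon - p) and log2(epsilon / p) always share a sign, every term is non-negative, so an empty model can only drift away from a real observation.

from math import log2

eps, beta = 1e-6, 0.999
ngrams = ["a", "a", "b"]
total = float(len(ngrams))
dist = sum((eps - beta * ngrams.count(x) / total)
           * log2(eps / (beta * ngrams.count(x) / total))
           for x in set(ngrams))
print(dist > 0)  # True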
Example #3
    def __distance_emptyobservation(self):
        """
        Distance between the model and an empty observation.
        We expect a model, epsilon and alpha already estimated properly.

        """
        dist = 0.
        # The observation is empty, so every observed probability backs off
        # to epsilon and only the model mass contributes.
        for x in self.model:
            probamodel = self.alpha * self.model[x]
            dist += (probamodel - self.epsilon) * log2(probamodel / self.epsilon)
        return dist
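
The counterpart check for the empty-observation case, again as a hypothetical standalone snippet: each model probability is compared against the epsilon floor, so the distance is strictly positive for any non-empty model.

from math import log2

eps, alpha = 1e-6, 0.999
model = {"a": 0.5, "b": 0.5}
dist = sum((alpha * p - eps) * log2(alpha * p / eps) for p in model.values())
print(dist)  # strictly positive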
Example #4
    def get(self, symbols):
        """
        Estimates the perplexity of a vector of symbols.

        @return float value

        """
        exr = symbols_to_items(symbols, self.ngram)
        entropy = 0

        for symbol, occurrences in exr.items():

            realsymbol = " ".join(symbol).strip()
            probability = self.model.get(realsymbol, self.epsilon)
            self_information = log2(1.0 / probability)
            entropy += (probability * self_information) * occurrences

        return pow(2, entropy)
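
To run the perplexity snippet on its own, a stand-in for symbols_to_items is needed. The version below is hypothetical, inferred only from the call sites above: it maps each ngram tuple of length n to its occurrence count.

from math import log2

def symbols_to_items(symbols, n):
    # Hypothetical stand-in: count occurrences of each ngram tuple.
    items = {}
    for i in range(len(symbols) - n + 1):
        key = tuple(symbols[i:i + n])
        items[key] = items.get(key, 0) + 1
    return items

# Unigram model with an epsilon floor for unseen symbols.
model = {"a": 0.5, "b": 0.25, "c": 0.25}
epsilon = 1e-6
entropy = 0.
for symbol, occurrences in symbols_to_items(list("aabac"), 1).items():
    probability = model.get(" ".join(symbol).strip(), epsilon)
    entropy += probability * log2(1.0 / probability) * occurrences
print(pow(2, entropy))  # ~5.66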
Example #5
    def get(self):
        """
        Estimates the Shannon entropy of a vector of symbols.
        Shannon's entropy measures the information contained in a message,
        as opposed to the portion of the message that is determined
        (or predictable).

        @return float value

        """
        exr = symbols_to_items(self.symbols, self.ngram)
        # Number of ngrams in the sequence of symbols.
        total = len(self.symbols) - self.ngram + 1
        entropy = 0.

        for symbol, occurrences in exr.items():

            probability = 1.0 * occurrences / total
            self_information = log2(1.0 / probability)
            entropy += (probability * self_information)

        return entropy
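
A quick check of the entropy snippet, reusing the same hypothetical symbols_to_items stand-in sketched after the perplexity example: a fair two-symbol sequence measured at the unigram level should come out at exactly 1 bit.

from math import log2

def symbols_to_items(symbols, n):
    # Same hypothetical stand-in as in the perplexity example.
    items = {}
    for i in range(len(symbols) - n + 1):
        key = tuple(symbols[i:i + n])
        items[key] = items.get(key, 0) + 1
    return items

symbols = ["a", "b", "a", "b"]
ngram = 1
exr = symbols_to_items(symbols, ngram)   # {('a',): 2, ('b',): 2}
total = len(symbols) - ngram + 1         # 4 unigrams
entropy = sum((occ / total) * log2(total / occ) for occ in exr.values())
print(entropy)  # 1.0 bit for a uniform two-symbol distribution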