Example No. 1
    def prob_classify(self, featureset):
        """Calculate the probabilities the given featureset classifications
        and return a DictionaryProbDist instance.

        Works in O(nm) with n = # of labels, m = # of featureset elements.
        """

        # Work on a copy of the feature set, because we mutate it.
        fset = featureset.copy()
        for fname in featureset:
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # Discard this feature name from the input set; the
                # classifier was never trained on it.
                del fset[fname]

        # Now we're working with a feature set that only includes known
        # features.

        # Instead of working with the product of the separate probabilities,
        # we use the sum of the logarithms to prevent underflows and make the
        # result more stable.

        #: The probability of each label, given the features. Starting with
        #: the probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Add the logarithmic probability of the features given the labels.
        for label in self._labels:
            for (fname, fval) in fset.items():
                feature_probs = self._feature_probdist.get((label, fname))

                if feature_probs is not None:
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # This should not occur if the classifier was created with
                    # the train() method.
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)
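
This method appears to be adapted from NLTK's NaiveBayesClassifier.prob_classify (the twentiment.thirdparty.probability module vendors NLTK's probability utilities such as DictionaryProbDist and sum_logs). A minimal usage sketch against NLTK's own classifier, with toy training data invented for illustration:

    from nltk.classify import NaiveBayesClassifier

    # Toy labeled featuresets, invented for illustration.
    train = [
        ({"contains(great)": True, "contains(awful)": False}, "pos"),
        ({"contains(great)": False, "contains(awful)": True}, "neg"),
    ]
    classifier = NaiveBayesClassifier.train(train)

    # prob_classify returns a DictionaryProbDist over the labels.
    dist = classifier.prob_classify({"contains(great)": True})
    print(dist.max())           # most likely label
    print(dist.prob("pos"))     # its normalized probability
    print(dist.logprob("neg"))  # the log-probability accumulated as above

Summing logarithms instead of multiplying raw probabilities matters because a product of many values below 1.0 underflows to 0.0 long before the equivalent log-sum leaves the representable float range, while the ranking of the labels stays the same.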
Example No. 2
    def test_sum_logs_ninf(self):
        from twentiment.thirdparty.probability import _NINF, sum_logs

        self.assertEqual(sum_logs([]), _NINF)
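
This test pins down the empty-sum edge case relied on in prob_classify above: sum_logs([]) must behave as log(0), i.e. a negative-infinity sentinel, so that a label missing a trained (label, fname) pair collapses to probability zero. A minimal sketch of such a helper under the usual log-sum-exp formulation (the real twentiment/NLTK code may differ; NLTK historically uses a very large negative constant rather than IEEE -inf for _NINF):

    import math

    _NINF = float("-inf")  # log(0): the identity element of a log-space sum

    def sum_logs(logs):
        # log(sum(exp(x) for x in logs)), computed stably by factoring
        # out the maximum term. A sum over zero terms is probability
        # zero, hence -inf.
        if not logs:
            return _NINF
        m = max(logs)
        if m == _NINF:
            return _NINF
        return m + math.log(sum(math.exp(x - m) for x in logs))

    assert sum_logs([]) == _NINF
    assert abs(sum_logs([math.log(0.25)] * 2) - math.log(0.5)) < 1e-12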