Example #1
    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                #print 'Ignoring unseen feature %s' % fname
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([]) # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)
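
Example #1 matches the stock implementation in NLTK's nltk/classify/naivebayes.py. A minimal usage sketch, with made-up labels, feature names and training data (nothing below comes from the projects on this page):

from nltk.classify import NaiveBayesClassifier

# Toy training data: bag-of-words style featuresets with two labels.
train = [
    ({'contains(cheap)': True, 'contains(buy)': True}, 'spam'),
    ({'contains(meeting)': True, 'contains(agenda)': True}, 'ham'),
]
clf = NaiveBayesClassifier.train(train)

# prob_classify returns a DictionaryProbDist over the labels; a feature name
# never seen during training ('contains(lottery)' here) is silently discarded,
# exactly as in the filtering loop at the top of the method.
dist = clf.prob_classify({'contains(cheap)': True, 'contains(lottery)': True})
print(dist.max())                                  # most probable label
print({l: dist.prob(l) for l in dist.samples()})   # normalized per-label probabilities

classify() simply returns prob_classify(featureset).max(), so this method carries all of the probability computation.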
Example #2
File: main.py  Project: stpwin/tweet-nlp
def prob_classify(_classifier, featureset):
    # Discard any feature names that we've never seen before.
    # Otherwise, we'll just assign a probability of 0 to
    # everything.
    featureset = featureset.copy()
    for fname in list(featureset.keys()):
        for label in _classifier._labels:
            if (label, fname) in _classifier._feature_probdist:
                break
        else:
            # print 'Ignoring unseen feature %s' % fname
            del featureset[fname]

    # Find the log probability of each label, given the features.
    # Start with the log probability of the label itself.
    logprob = {}
    prob_features = []  # list of dicts: {'word': <word>, <label>: <logprob>}

    for label in _classifier._labels:
        logprob[label] = _classifier._label_probdist.logprob(label)

    # Then add in the log probability of features given labels.
    for label in _classifier._labels:
        for (fname, fval) in featureset.items():
            if (label, fname) in _classifier._feature_probdist:
                feature_probs = _classifier._feature_probdist[label, fname]
                # print(f"{fname} Label: {label}, Prop: {feature_probs.logprob(fval)}")
                logprob[label] += feature_probs.logprob(fval)
                if fval:
                    data = {
                        'word': fname[9:-1],
                        label: feature_probs.logprob(fval)
                    }
                    prob_features.append(data)

            else:
                # nb: This case will never come up if the
                # classifier was created by
                # NaiveBayesClassifier.train().
                # print(f"{fname} Label[else]: {label}, Prop: {sum_logs([])}")
                logprob[label] += sum_logs([])  # = -INF.

        # print(f"Label: {label}, Prop: {logprob[label]}")

    words_prob = {}
    for item in prob_features:
        if item['word'] in words_prob:
            words_prob[item['word']].update(item)
        else:
            words_prob[item['word']] = item
    words_prob = [val for (_, val) in words_prob.items()]

    # print(f"prob_features: {words_prob}")
    return logprob, words_prob
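
Example #2 reworks the same method as a module-level helper: it takes an already-trained classifier, returns the raw (unnormalized) logprob dict, and additionally collects per-word log probabilities. The fname[9:-1] slice assumes feature names of the form contains(word), i.e. it strips the nine-character 'contains(' prefix and the trailing ')'. A usage sketch under that assumption; the word_features helper and the training data below are invented for illustration and are not part of the stpwin/tweet-nlp project:

from nltk.classify import NaiveBayesClassifier

def word_features(text):
    # Hypothetical feature extractor producing the 'contains(word)' names
    # that the fname[9:-1] slice expects.
    return {'contains(%s)' % w: True for w in text.lower().split()}

train = [(word_features('good great fun'), 'pos'),
         (word_features('bad boring awful'), 'neg')]
clf = NaiveBayesClassifier.train(train)

logprob, words_prob = prob_classify(clf, word_features('good but boring'))
best = max(logprob, key=logprob.get)   # label with the highest (unnormalized) log score
# words_prob is a list of {'word': ..., '<label>': <logprob>} dicts, one per word
# that survives the unseen-feature filter, with entries merged across labels.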
Example #3
 def log_renormalise(self, r, nr):
   '''
   Calculates the renormalisation factor for observed sample types.
   '''
   log_prob_cov = sum_logs([log(nr_, 2) + self.log_prob_measure(r_) for r_, nr_ in zip(r, nr)])
   if self._prob_measure(0) < 1:
     self.log_renormal = log(1 - self._prob_measure(0), 2) + log_prob_cov
     self._renormal = 2 ** self.log_renormal  # log_renormal uses base-2 logs (log(x, 2), sum_logs), so invert with 2**x rather than exp()
   else:  # If this happens, Good-Turing smoothing is probably a bad idea...
     self.log_renormal = float('-inf')
     self._renormal = 0.0
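
log_prob_cov above is log2 of the sum over observed frequencies r of nr * P(r), assembled with NLTK's base-2 sum_logs, and the renormalisation factor multiplies that coverage term by (1 - P(0)). A self-contained toy check of the arithmetic; the frequencies r, counts-of-counts nr and probability measure P below are made-up numbers, not values from the project:

from math import log2
from nltk.probability import sum_logs

r = [1, 2, 3]                              # observed frequencies
nr = [10, 4, 2]                            # nr[i] sample types were seen r[i] times
P = {0: 0.2, 1: 0.05, 2: 0.1, 3: 0.15}     # made-up probability measure over frequencies

log_prob_cov = sum_logs([log2(nr_) + log2(P[r_]) for r_, nr_ in zip(r, nr)])
log_renormal = log2(1 - P[0]) + log_prob_cov
renormal = 2 ** log_renormal               # invert the base-2 log

# The same quantity computed directly, outside log space:
assert abs(renormal - (1 - P[0]) * sum(n * P[x] for x, n in zip(r, nr))) < 1e-6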
Example #4
    def _prob_classify(self, input):

        # Make a featureset of the input after tokenizing it
        input_tokenized_featureset = self._tokenizeInputToFeatures(
            input).copy()

        # Ensuring that all the feature names are valid and can be used
        for input_feature_name in list(input_tokenized_featureset.keys()):  # list() so we can delete while iterating
            for label in self._labels:
                if (label, input_feature_name
                    ) in self._featureProbabilityDistribution:
                    break
            else:
                #print 'Ignoring unseen feature %s' % input_feature_name
                del input_tokenized_featureset[input_feature_name]

        # Start with a log probability of 0 to avoid skewing towards larger data sets
        logprob = {}
        for label in self._labels:
            #print "in here adding labels"
            logprob[label] = 0

        # Add in the log probability of features given labels.
        # Iterate through the assigned labels, e.g. location, time, noise
        for label in self._labels:

            # Iterate through the input feature set one entry at a time, e.g. {'turkey': True, 'bacon': True}
            for (input_feature_name,
                 input_feature_val) in input_tokenized_featureset.items():

                # If the combination, i.e. (location, turkey), appears in the training set, add its log probability
                if (label, input_feature_name
                    ) in self._featureProbabilityDistribution:
                    # Assign its probability
                    feature_probs = self._featureProbabilityDistribution[
                        label, input_feature_name]
                    logprob[label] += feature_probs.logprob(input_feature_val)
                else:
                    # nb: This case will never come up if the classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        # print out the log prob for each label before normalizing
        #for key,value in  self._featureProbabilityDistribution.items():
        #    print "key value of featureProbabilityDistribution " + str(key) + "," + str(value.freqdist() )

        #print "log prob with features is " + str(logprob)
        dictprobDist = DictionaryProbDist(logprob, normalize=True, log=True)

        ## print out the probability for each label
        #for label in dictprobDist.samples():
        #    print label + " is probability " + str(dictprobDist.prob(label))

        return dictprobDist
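
The main departure from Example #1 is the prior: logprob starts at 0 for every label instead of at self._label_probdist.logprob(label), which amounts to assuming a uniform prior over labels (the "avoid skewing towards larger data sets" comment). Because DictionaryProbDist(..., normalize=True, log=True) renormalizes at the end, adding the same constant log prior to every label would not change the resulting distribution; a toy check with made-up numbers:

from math import log2
from nltk.probability import DictionaryProbDist

# Made-up per-label feature log-likelihoods (base 2), standing in for the sums
# accumulated in the loops above.
feature_ll = {'location': log2(0.02), 'time': log2(0.01), 'noise': log2(0.005)}

no_prior = DictionaryProbDist(dict(feature_ll), normalize=True, log=True)

# Shifting every label by the same uniform log prior gives the same distribution.
k = log2(1.0 / 3)
with_prior = DictionaryProbDist({l: v + k for l, v in feature_ll.items()},
                                normalize=True, log=True)
assert abs(no_prior.prob('time') - with_prior.prob('time')) < 1e-9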
Example #5
 def log_renormalise(self, r, nr):
     '''
     Calculates the renormalisation factor for observed sample types.
     '''
     log_prob_cov = sum_logs([
         log(nr_, 2) + self.log_prob_measure(r_) for r_, nr_ in zip(r, nr)
     ])
     if self._prob_measure(0) < 1:
         self.log_renormal = log(1 - self._prob_measure(0),
                                 2) + log_prob_cov
         self._renormal = 2 ** self.log_renormal  # log_renormal uses base-2 logs (log(x, 2), sum_logs), so invert with 2**x rather than exp()
     else:  # If this happens, Good-Turing smoothing is probably a bad idea...
         self.log_renormal = float('-inf')
         self._renormal = 0.0
Example #6
    def _prob_classify(self, input):

        # Make a featureset of the input after tokenizing it
        input_tokenized_featureset = self._tokenizeInputToFeatures(input).copy()

        # Ensuring that all the feature names are valid and can be used
        for input_feature_name in list(input_tokenized_featureset.keys()):  # list() so we can delete while iterating
            for label in self._labels: 
                if (label, input_feature_name) in self._featureProbabilityDistribution: 
                    break 
            else: 
                #print 'Ignoring unseen feature %s' % input_feature_name 
                del input_tokenized_featureset[input_feature_name] 

        # Start with a log probability of 0 to avoid skewing towards larger data sets
        logprob = {} 
        for label in self._labels: 
            #print "in here adding labels"
            logprob[label] = 0                 

        # Add in the log probability of features given labels. 
        # Iterate through the assigned labels, e.g. location, time, noise
        for label in self._labels: 

            # Iterate through the input feature set one entry at a time, e.g. {'turkey': True, 'bacon': True}
            for (input_feature_name, input_feature_val) in input_tokenized_featureset.items(): 
                
                # If the combination, i.e. (location, turkey), appears in the training set, add its log probability
                if (label, input_feature_name) in self._featureProbabilityDistribution:
                    # Assign its probability
                    feature_probs = self._featureProbabilityDistribution[label, input_feature_name]
                    logprob[label] += feature_probs.logprob(input_feature_val)
                else:
                    # nb: This case will never come up if the classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        # print out the log prob for each label before normalizing
        #for key,value in  self._featureProbabilityDistribution.items():
        #    print "key value of featureProbabilityDistribution " + str(key) + "," + str(value.freqdist() )

        #print "log prob with features is " + str(logprob)    
        dictprobDist = DictionaryProbDist(logprob, normalize=True, log=True)

        ## print out the probability for each label
        #for label in dictprobDist.samples():
        #    print label + " is probability " + str(dictprobDist.prob(label))

        return dictprobDist
Example #7
    def prob_classify(self, featureset):
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                del featureset[fname]

        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    logprob[label] += sum_logs([])

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def prob_classify(self, featureset):
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                del featureset[fname]

        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label,fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    logprob[label] += sum_logs([]) # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)