# Example #1
# 0
def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method to allow
    inspection of what the method is actually doing and how long it takes.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    @param estimator: factory mapping a FreqDist (and optional ``bins``)
        to a ProbDist; defaults to nltk.ELEProbDist.
    @return: a trained nltk.NaiveBayesClassifier.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print('There are ' + str(len(labeled_featuresets)) + ' labeled featuresets')
    # Count up how many times each feature value occurred, given
    # the label and feature name.
    print('Counting feature value occurence')
    for i, (featureset, label) in enumerate(labeled_featuresets):
        # FreqDist is a Counter subclass; the old .inc() API was removed
        # in NLTK 3.0, so increment with += instead.
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname][fval] += 1
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
        print('At featureset...' + str(i))

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname][None] += num_samples - count
            feature_values[fname].add(None)

    # Create the P(label) distribution
    print('Making the P(label) distribution...')
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    print('Making the P(fval|label, fname) distribution from '
          + str(len(feature_freqdist)) + ' feature freqs...')
    feature_probdist = {}
    for (label, fname), freqdist in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
    def __init__(self, rebuild=False):
        """Load the sentiment model from pickled files when they all
        exist; otherwise (or when ``rebuild`` is True) train it from the
        NLTK subjectivity corpus and pickle the pieces for next time.

        @param rebuild: force regeneration of the model even when the
            cached pickle files are present.
        """
        # declare variables for sentiment searcher
        self.relevant_documents = {}

        # create sentiment model for objectivity
        self.word_features = []
        self.classifier = None

        # pickled model pieces; all three must exist to skip training
        label_path = 'models/sentiment/label_probdist.p'
        feature_path = 'models/sentiment/feature_probdist.p'
        words_path = 'models/sentiment/word_feature_list.p'

        if os.path.exists(label_path) and \
           os.path.exists(feature_path) and \
           os.path.exists(words_path) and not rebuild:
            print('loading sentiment model')

            # load in model files
            with open(label_path, 'rb') as label_probdist_file:
                label_probdist = pickle.load(label_probdist_file)
            with open(feature_path, 'rb') as feature_probdist_file:
                feature_probdist = pickle.load(feature_probdist_file)
            with open(words_path, 'rb') as word_feature_list_file:
                self.word_features = pickle.load(word_feature_list_file)

            # instantiate classifier directly from the pickled probdists
            self.classifier = nltk.NaiveBayesClassifier(
                label_probdist, feature_probdist)
        else:
            print('generating sentiment model')

            # get training data from the NLTK subjectivity corpus
            subjective_sents = nltk.corpus.subjectivity.sents(
                categories='subj')
            objective_sents = nltk.corpus.subjectivity.sents(categories='obj')

            subjective_docs = [(sent, 'subj') for sent in subjective_sents]
            objective_docs = [(sent, 'obj') for sent in objective_sents]

            # train model
            sentiment_training_data = subjective_docs + objective_docs
            self.create_word_features(
                self.extract_words(sentiment_training_data))
            self.classifier = self.train_sentiment_classifier(
                sentiment_training_data)

            # save out model so it will not need to be regenerated
            with open(label_path, 'wb') as label_probdist_file:
                pickle.dump(self.classifier._label_probdist,
                            label_probdist_file)
            with open(feature_path, 'wb') as feature_probdist_file:
                pickle.dump(self.classifier._feature_probdist,
                            feature_probdist_file)
            with open(words_path, 'wb') as word_feature_list_file:
                pickle.dump(self.word_features, word_feature_list_file)
# Example #3
# 0
    def get_classifier(self):
        """Build an nltk.NaiveBayesClassifier from the frequency data
        accumulated on this instance.

        Reads self.estimator, self.label_freqdist, self.feature_freqdist
        and self.feature_values.

        @return: the constructed nltk.NaiveBayesClassifier.
        """
        label_probdist = self.estimator(self.label_freqdist)
        feature_probdist = {}
        # One P(fval | label, fname) estimator per (label, fname) pair.
        # Use .items() rather than the Python-2-only .iteritems() so the
        # method also runs under Python 3.
        for (label, fname), freqdist in self.feature_freqdist.items():
            probdist = self.estimator(freqdist,
                                      bins=len(self.feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
# Example #4
# 0
 def testSetAccuracy(self):
     """Estimate classifier accuracy over 5 random splits of
     self.training_set: hold out the first 20 shuffled items as the
     test set, train on the remainder, and return the mean accuracy.

     @return: average accuracy over the 5 iterations.
     """
     accuracy = 0
     for i in range(5):
         print('iteration %d' % i)
         random.shuffle(self.training_set)
         train_set = self.training_set[20:]
         test_set = self.training_set[:20]
         # Train on the held-in portion.  The original passed the raw
         # featureset list straight to the constructor (which expects
         # probability distributions) and never used train_set; the
         # classmethod .train() is the intended entry point.
         tempClassifier = nltk.NaiveBayesClassifier.train(train_set)
         accuracy += nltk.classify.accuracy(tempClassifier, test_set)
     return accuracy / 5
# Example #5
# 0
    def read_probdist(self):
        """Load the pickled label/feature distributions and word list
        from the current working directory and build self.classifier.

        Reads label_probdist.dat, feature_probdist.dat and all_words.dat;
        sets self.all_words and self.classifier as side effects.
        """
        # Open in binary mode and let pickle read the stream directly.
        # The original read whole files in text mode and used
        # pickle.loads, leaking file handles and corrupting binary
        # (protocol >= 1) pickles on platforms with newline translation.
        with open("label_probdist.dat", "rb") as f:
            label_probdist = pickle.load(f)

        with open("feature_probdist.dat", "rb") as f:
            feature_probdist = pickle.load(f)

        with open("all_words.dat", "rb") as f:
            self.all_words = pickle.load(f)

        self.classifier = nltk.NaiveBayesClassifier(label_probdist,
                                                    feature_probdist)
# Example #6
# 0
def getNaiveBayesTrainedClassifier(dataset):
    """Train a Naive Bayes classifier on the data produced by ``dataset``
    and print its accuracy on the held-out test set.

    @param dataset: callable returning (train_set, test_set, prob_dist);
        prob_dist is not used here.
    @return: the trained nltk.NaiveBayesClassifier.
    """
    train_set, test_set, prob_dist = dataset()

    # NaiveBayesClassifier.train is a classmethod that returns a brand
    # new classifier, so the instance the original hand-built from a
    # DictionaryProbDist was immediately discarded; that dead code
    # (dict_probs / label_probdist / first constructor call) is removed.
    classifier = nltk.NaiveBayesClassifier.train(train_set,
                                                 estimator=LaplaceProbDist)
    print("Classifier accuracy percent: ",
          (nltk.classify.accuracy(classifier, test_set)) * 100)
    return classifier
# Example #7
# 0
	def CreatNaiveBayes(self, data):
		"""Build a Naive Bayes classifier that maps a name to one of the
		five ethnicities in self._ethicity, using character x-lets
		(substrings from self.get3_let) of the name as features.

		@param data: iterable of (name, total, ethList) triples, where
			ethList holds five per-ethnicity counts aligned with
			self._ethicity.  NOTE(review): the meaning of ``total`` is
			not used here — confirm against the caller.

		Side effect: stores the trained model in self.classifier.
		"""
		

		# Label prior: weight each ethnicity by its summed counts.
		label_freqdist = FreqDist() 
   
		for (name, total, ethList) in data:
			for i in range(5):
				label_freqdist[self._ethicity[i]] += ethList[i]

		label_probdist = ELEProbDist(label_freqdist) 
		feature_freqdist = defaultdict(FreqDist)
		feature_values = defaultdict(set)
		#for (name, total, ethList) in data:

		#	x-lets
		# Count feature presence: for every x-let of a name, credit the
		# (ethnicity, x-let) pair with that name's per-ethnicity counts.
		for (name, total, ethList) in data:
			x_lets = self.get3_let(name)
			for i in range(5):
				for x_let in x_lets:
					feature_freqdist[(self._ethicity[i], x_let)][True] += ethList[i]
					feature_values[x_let].add(True)

		# For each (label, x-let) pair, count how often the x-let is
		# ABSENT from a name (credited to the implicit value None), so
		# the per-feature distributions cover both presence and absence.
		for ((label, x_let), freqdist) in feature_freqdist.items():
			num = 0
			# Map the label back to its index in self._ethicity.
			for i in range(5):
				if label == self._ethicity[i]:
					num = i
					break
			tot = 0
			# NOTE(review): substring membership (x_let not in name) is
			# assumed to match how get3_let derives x-lets — confirm.
			for (name, total, ethList) in data:
				if x_let not in name:
					tot += ethList[num]
					feature_values[x_let].add(None)
			if tot > 0:
				feature_freqdist[(label, x_let)][None] += tot;
				

		# Smooth each (label, feature) freqdist into a probability
		# distribution with expected-likelihood estimation.
		feature_probdist = {}
		for ((label, fname), freqdist) in feature_freqdist.items():
			probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
			feature_probdist[label, fname] = probdist
		
		self.classifier = nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
# Example #8
# 0
def load_bayes_from_file(filename):
    """Rebuild a NaiveBayesClassifier from a pickled dict of its
    internal distributions.

    @param filename: path to a pickle containing a dict with keys
        "_label_probdist" and "_feature_probdist".
    @return: the reconstructed nltk.NaiveBayesClassifier.
    """
    # Binary mode and a context manager: the original leaked the file
    # handle and text mode can corrupt binary pickles on Windows.
    # NOTE: pickle.load must only be used on trusted files.
    with open(filename, 'rb') as f:
        d = pickle.load(f)
    return nltk.NaiveBayesClassifier(d["_label_probdist"], d["_feature_probdist"])
# Example #9
# 0
 def train(labeled_featuresets, estimator=nltk.ELEProbDist):
     """Stub trainer: estimates only the P(label) prior from the
     training data and returns a classifier with no feature
     distributions.

     The original called estimator(nltk.label_freqdist); the nltk
     module has no such attribute, so it raised AttributeError on
     every call.  The label FreqDist is now built from the argument.

     @param labeled_featuresets: list of (featureset, label) tuples.
     @param estimator: FreqDist -> ProbDist factory.
     @return: an nltk.NaiveBayesClassifier (features intentionally empty).
     """
     label_freqdist = nltk.FreqDist(label for (_, label) in labeled_featuresets)
     label_probdist = estimator(label_freqdist)
     feature_probdist = {}
     return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
# Example #10
# 0
def train():
    """Smoke-test stub that constructs a NaiveBayesClassifier and
    prints it (converted from a Python-2 print statement)."""
    # NOTE(review): nltk.NaiveBayesClassifier's constructor requires
    # label and feature probdists; calling it with no arguments raises
    # TypeError.  This looks like an unfinished stub — left as-is
    # pending the author's intent (likely .train(some_data)).
    classifier = nltk.NaiveBayesClassifier()
    print(classifier)