示例#1
0
def createClassifier(ignoreTweets=False):
    """Train a Naive Bayes polarity classifier and print its held-out accuracy.

    Labelled examples are concatenated per label from three sources: one
    example per ``movie_reviews`` file (featurised with ``extractWords``),
    the two lists returned by ``getTweets(ignoreTweets)`` and the two lists
    returned by ``getSentPolarities()``.  The first 75% of each label's list
    is used for training and the remainder for the accuracy check.

    :param ignoreTweets: passed straight through to ``getTweets``.
    :return: the trained ``NaiveBayesClassifier``.
    """
    neg_ids = movie_reviews.fileids('neg')
    pos_ids = movie_reviews.fileids('pos')
    neg_sents = [(extractWords(movie_reviews.words(fileids=[f])), 'neg')
                 for f in neg_ids]
    pos_sents = [(extractWords(movie_reviews.words(fileids=[f])), 'pos')
                 for f in pos_ids]

    # Extra labelled data.  Both helpers return a (negative, positive) pair;
    # presumably lists of (features, label) tuples like the ones built
    # above -- TODO confirm against the helpers.
    (neg_pols, pos_pols) = getSentPolarities()
    (neg_tweets, pos_tweets) = getTweets(ignoreTweets)
    neg_sents = neg_sents + neg_tweets + neg_pols
    pos_sents = pos_sents + pos_tweets + pos_pols

    # 75/25 split taken separately per label so both labels appear in the
    # training and the test portion.
    trainsizeneg = int(0.75 * len(neg_sents))
    trainsizepos = int(0.75 * len(pos_sents))
    all_train = neg_sents[:trainsizeneg] + pos_sents[:trainsizepos]
    all_test = neg_sents[trainsizeneg:] + pos_sents[trainsizepos:]

    classifier = NaiveBayesClassifier.train(all_train)
    print(accuracy(classifier, all_test))

    return classifier
示例#2
0
def classify(inputdir):
    """Train three classifiers on the files in ``inputdir`` and print accuracies.

    Every file in ``inputdir`` contributes one labelled example per line that
    has more than two whitespace-separated tokens.  The label ("sense") is the
    file name after the last backslash and before the first dot.  The examples
    are shuffled, split with ``train_feats`` / ``test_feats``, and used to
    train a Naive Bayes, a decision-tree and a MaxEnt classifier, plus a
    ``MaxVoteClassifier`` built from the three.  Accuracies (as percentages)
    are printed; nothing is returned.

    :param inputdir: directory whose files are read as UTF-8 text.
    """
    feat_set = []
    for name in os.listdir(inputdir):
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training %s' % sense)

        # The context manager closes the handle even if decoding or feature
        # extraction raises part-way through the file.
        with codecs.open(path, 'r', 'utf-8') as handle:
            for line in handle:
                if len(line.split()) > 2:
                    # NOTE(review): the whole stripped line (a string) is
                    # passed to bag_of_words, not a token list -- confirm
                    # that is what the helper expects.
                    feat_set.append((bag_of_words(line.strip()), sense))

    # Shuffled twice, as before, so runs with a seeded RNG stay reproducible.
    random.shuffle(feat_set)
    random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)

    nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(
        train_data, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
    entropy_classifier = MaxentClassifier.train(
        train_data, algorithm='iis', trace=0, max_iter=1, min_lldelta=0.5)

    print("nb accuracy " + str(accuracy(nb_classifier, test_data) * 100))
    print("dt accuracy " + str(accuracy(dt_classifier, test_data) * 100))
    print("entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100))

    mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier)
    print("max vote accuracy " + str(accuracy(mv_classifier, test_data) * 100))
def train(cleanedDataCollection, tagPool):
	"""Fit a multinomial Naive Bayes model and print its evaluation scores.

	Each ``(document, label)`` pair is featurised with ``extractFeatures``.
	Samples labelled "trash" form the negative group, everything else the
	positive group.  The first 10 samples of each group are held out as the
	test set; the remainder is the training set.

	:param cleanedDataCollection: iterable of ``(document, label)`` pairs.
	:param tagPool: forwarded to ``extractFeatures``.
	:return: the trained ``SklearnClassifier``.
	"""
	held_out = 10

	featuresets = [(extractFeatures(doc, tagPool), label)
	               for (doc, label) in cleanedDataCollection]
	negSamples = [s for s in featuresets if s[1] == "trash"]
	posSamples = [s for s in featuresets if not s[1] == "trash"]

	train_set = negSamples[held_out:] + posSamples[held_out:]
	test_set = negSamples[:held_out] + posSamples[:held_out]

	sk_classifier = SklearnClassifier(MultinomialNB())
	sk_classifier.train(train_set)
	print("accuracy is: %s" % (accuracy(sk_classifier, test_set)))

	# Scores are computed with "useful" as the reference label.
	p, r, f = precision_recall_fmeasure(sk_classifier, test_set, "useful")

	print("precision is: %s" % (p))
	print("recall is: %s" % (r))
	print("F-measure is: %s" % (f))
	return sk_classifier
示例#4
0
def main():
    """Train a Naive Bayes topic classifier and print per-topic scores.

    The train/test sets come from ``get_train_test_sets('data/content')``.
    After training, precision, recall and F-measure are computed for every
    entry of the module-level ``topics`` and collected, as percentages
    rounded to one decimal, in a dict of parallel lists that is printed.
    A score that is undefined for a topic is recorded as ``None``.
    """
    def as_percent(score):
        # precision/recall/f_measure are presumably the nltk.metrics
        # functions, which return None (not a number) when a score is
        # undefined, e.g. for a topic the classifier never predicts.
        # Multiplying None would raise TypeError.
        return None if score is None else round(score * 100, 1)

    results = {'Topic': [], 'Precision': [], 'Recall': [], 'F-measure': []}
    print('\nPreparing data...')
    (train_set, test_set) = get_train_test_sets('data/content')
    print('\nNB classifier training...')
    classifier = NaiveBayesClassifier.train(train_set)
    print('NB classifier is trained with {}% accuracy'.format(
        round(accuracy(classifier, test_set) * 100, 1)))

    # Index sets per label: refsets holds the gold labels, testsets the
    # labels the classifier actually produced.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    for topic in topics:
        results['Topic'].append(topic)
        results['Precision'].append(
            as_percent(precision(refsets[topic], testsets[topic])))
        results['Recall'].append(
            as_percent(recall(refsets[topic], testsets[topic])))
        results['F-measure'].append(
            as_percent(f_measure(refsets[topic], testsets[topic])))

    # Drop the large objects before printing, as the original did.
    del classifier, train_set, test_set, refsets, testsets
    gc.collect()

    print(results)
def rte_classifier(trainer, features=rte_features):
    """Train and evaluate a classifier on the NLTK RTE pairs.

    The RTE 1-3 dev files are used for training and the RTE 1-3 test files
    for evaluation.  Accuracy is printed twice (NLTK's ``accuracy`` and
    ``metrics.accuracy_score``), followed by ``metrics.classification_report``.

    :param trainer: callable taking a list of ``(featureset, label)`` pairs
        and returning a trained classifier.
    :param features: callable mapping an RTE pair to a featureset.
    :return: the trained classifier.
    """
    train = [(pair, pair.value) for pair in nltk.corpus.rte.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])]
    test = [(pair, pair.value) for pair in nltk.corpus.rte.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(pair), label) for (pair, label) in train])

    # Run the classifier on the test data.  The test pairs are featurised
    # once and reused for both the NLTK accuracy and the report below.
    print('Testing classifier...')
    test_featuresets = [(features(pair), label) for (pair, label) in test]
    acc = accuracy(classifier, test_featuresets)
    print('Accuracy: %6.4f' % acc)

    test_label = [label for (_, label) in test_featuresets]
    result = [classifier.classify(fs) for (fs, _) in test_featuresets]

    print(metrics.accuracy_score(test_label, result))
    print(metrics.classification_report(test_label, result))

    # Return the classifier
    return classifier
    def train(self):
        """Train ``self.classifier`` on all stored features and print scores.

        The training set is every entry of ``self.positiveFeatures`` followed
        by every entry of ``self.negativeFeatures``; the test set is whatever
        ``self.test()`` returns.  Accuracy and the precision / recall /
        F-measure for the 'pos' and 'neg' labels are printed.
        """
        print('Classifier Training in progress....')
        poscutoff = len(self.positiveFeatures)
        negcutoff = len(self.negativeFeatures)
        print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))
        trainfeats = self.positiveFeatures[:poscutoff] + self.negativeFeatures[:negcutoff]

        testfeats = self.test()
        print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))
        self.classifier = NaiveBayesClassifier.train(trainfeats)
        print('%s %s' % ('accuracy:', accuracy(self.classifier, testfeats)))

        # Index sets per label: gold labels vs. labels the classifier produced.
        gold = collections.defaultdict(set)
        predicted = collections.defaultdict(set)
        for idx, (feats, label) in enumerate(testfeats):
            gold[label].add(idx)
            predicted[self.classifier.classify(feats)].add(idx)

        report = (
            ('pos precision:', nltk.metrics.precision, 'pos'),
            ('pos recall:', nltk.metrics.recall, 'pos'),
            ('pos F-measure:', nltk.metrics.f_measure, 'pos'),
            ('neg precision:', nltk.metrics.precision, 'neg'),
            ('neg recall:', nltk.metrics.recall, 'neg'),
            ('neg F-measure:', nltk.metrics.f_measure, 'neg'),
        )
        for caption, metric, label in report:
            print('%s %s' % (caption, metric(gold[label], predicted[label])))
示例#7
0
def model_test(classifier, test_features):
    """Print the accuracy and the four evaluation values for ``classifier``.

    The four values (precisions, recalls, F-measure, confusion matrix) are
    taken, in that order, from
    ``get_precision_recall_fmeasure_conf_matrix(classifier, test_features)``.
    """
    print('Model Accuracy: {0}'.format(accuracy(classifier, test_features)))

    scores = get_precision_recall_fmeasure_conf_matrix(classifier, test_features)
    precisions, recalls, f_measure, conf_matrix = scores

    lines = (
        ('Precisions: {0}', precisions),
        ('Recalls: {0}', recalls),
        ('F-Measure: {0}', f_measure),
        ('Confusion Matrix: {0}', conf_matrix),
    )
    for template, value in lines:
        print(template.format(value))
示例#8
0
    def train_with_movie_db(self):
        """
        Train ``self.classifier`` on the NLTK movie-review corpus.

        Training possible with movie reviews
        - this does not yield particularly good results

        Sets ``self.use_movie_reviews`` to True, featurises every review
        with ``self.feature_extraction_movie_reviews``, trains on the first
        three quarters of each label's reviews and logs the accuracy on the
        remaining quarter through ``DLOG``.
        """
        self.use_movie_reviews = True

        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "negative") for f in negids]
        posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "positive") for f in posids]

        # Floor division keeps the cut-offs integers on Python 3 as well;
        # with "/" they become floats there and the slices below raise
        # TypeError.  On Python 2 the value is unchanged.
        negcutoff = len(negfeats) * 3 // 4
        poscutoff = len(posfeats) * 3 // 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

        self.classifier = NaiveBayesClassifier.train(trainfeats)

        DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
        # NOTE(review): NLTK's show_most_informative_features() prints its
        # table itself and returns None, so DLOG most likely receives None
        # here -- confirm whether that extra log entry is wanted.
        DLOG(self.classifier.show_most_informative_features())
示例#9
0
def rte_classifier():
    """Train a linear SVM on the RTE dev sets and print its test accuracy.

    Training uses the RTE 1-3 dev files; evaluation uses ``rte1_test.xml``
    only.  Both sets are featurised with ``rte_featurize``.  The accuracy
    and a separator line are printed; nothing is returned.
    """
    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml'])
    featurized_train_set = rte_featurize(train_set, True)
    featurized_test_set = rte_featurize(test_set, False, test_id=0)

    print('Training classifier...')
    # Other models were tried here at some point (NLTK Naive Bayes, MaxEnt
    # with GIS/IIS, decision tree, random forest); only the SVM is active.
    svm = SklearnClassifier(LinearSVC())
    clf_svm = svm.train(featurized_train_set)

    print('Testing classifier...')
    acc_svm = accuracy(clf_svm, featurized_test_set)
    print('svm Accuracy: %8.4f' % acc_svm)
    # Written as a call so the block parses on Python 3 like the prints
    # above; on Python 2 the output is the same.
    print('===================================')
def run_classifier_tests(classifier):
    """Print the accuracy of ``classifier`` on the labelled test files.

    Each entry of ``testfiles`` maps one label ("sense") to a text file.
    Every line of every file is converted with
    ``create_training_dict(line, sense)`` and the resulting examples are
    scored with ``accuracy``.

    :param classifier: trained classifier accepted by ``accuracy``.
    """
    testfiles = [{
        'F': 'C:/research/HIT_QIWEI/data/clean/label/F_r.txt'
    }, {
        'H': 'C:/research/HIT_QIWEI/data/clean/label/H_r.txt'
    }, {
        'O': 'C:/research/HIT_QIWEI/data/clean/label/O_r.txt'
    }, {
        'P': 'C:/research/HIT_QIWEI/data/clean/label/P_r.txt'
    }, {
        'S': 'C:/research/HIT_QIWEI/data/clean/label/S_r.txt'
    }]

    testfeats = []
    for entry in testfiles:
        for sense, loc in entry.items():
            # Iterate the file line by line.  The previous
            # ``for line in open(loc).read()`` walked the text one
            # *character* at a time and never closed the handle.
            # NOTE(review): assumes create_training_dict expects one line
            # of text per call, as its argument name suggests -- confirm.
            with open(loc) as handle:
                for line in handle:
                    testfeats.extend(create_training_dict(line, sense))

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)
def create_and_evaluate_classifier_10_fold(features):
    """
    Run 10-fold cross-validation of a Naive Bayes classifier.

    For every fold a new classifier is trained on the training indices and
    scored on the test indices; the mean of the fold accuracies is printed
    to the console.
    :param features: List of features
    """
    splitter = KFold(n_splits=10, shuffle=False)
    fold_scores = []

    for train_index, test_index in splitter.split(features):
        training_features = [features[i] for i in train_index]
        test_features = [features[i] for i in test_index]

        fold_classifier = NaiveBayesClassifier.train(training_features)
        fold_scores.append(util.accuracy(fold_classifier, test_features))

    mean_score = sum(fold_scores) / float(len(fold_scores))
    print(' '.join(['Accuracy:', str(mean_score), '\n']))
def rte_classifier(algorithm, sample_N=None):
    """Train a MaxEnt classifier on the RTE corpus and print its accuracy.

    :param algorithm: one of 'megam', 'GIS' or 'IIS'; passed on to
        ``MaxentClassifier.train``.
    :param sample_N: when not None, both the train and the test set are
        truncated to their first ``sample_N`` pairs.
    :return: the trained classifier.
    :raises Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    dev_files = ["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"]
    test_files = ["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"]
    train_set = rte_corpus.pairs(dev_files)
    test_set = rte_corpus.pairs(test_files)

    if sample_N is not None:
        train_set = train_set[:sample_N]
        test_set = test_set[:sample_N]

    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print("Training classifier...")
    # MEGAM and the built-in GIS/IIS trainers go through the same call.
    if algorithm not in ("megam", "GIS", "IIS"):
        raise Exception("RTEClassifier only supports these algorithms:\n "
                        "'megam', 'GIS', 'IIS'.\n")
    clf = MaxentClassifier.train(featurized_train_set, algorithm)

    print("Testing classifier...")
    acc = accuracy(clf, featurized_test_set)
    print("Accuracy: %6.4f" % acc)
    return clf
示例#13
0
def train(cleanedDataCollection, tagPool):
    """Fit a multinomial Naive Bayes model and print its evaluation scores.

    Each ``(document, label)`` pair is featurised with ``extractFeatures``.
    Samples labelled "trash" go to the negative group, all others to the
    positive group.  The first 10 samples of each group are the test set;
    the rest is the training set.

    :param cleanedDataCollection: iterable of ``(document, label)`` pairs.
    :param tagPool: forwarded to ``extractFeatures``.
    :return: the trained ``SklearnClassifier``.
    """
    negSamples, posSamples = [], []
    for doc, label in cleanedDataCollection:
        sample = (extractFeatures(doc, tagPool), label)
        bucket = negSamples if sample[1] == "trash" else posSamples
        bucket.append(sample)

    split_at = 10
    test_set = negSamples[:split_at] + posSamples[:split_at]
    train_set = negSamples[split_at:] + posSamples[split_at:]

    sk_classifier = SklearnClassifier(MultinomialNB())
    sk_classifier.train(train_set)
    print("accuracy is: %s" % (accuracy(sk_classifier, test_set)))

    # Scores are computed with "useful" as the reference label.
    scores = precision_recall_fmeasure(sk_classifier, test_set, "useful")
    prec, rec, f_score = scores

    print("precision is: %s" % (prec))
    print("recall is: %s" % (rec))
    print("F-measure is: %s" % (f_score))
    return sk_classifier
示例#14
0
def rte_classifier(algorithm):
    """Train a MaxEnt classifier on the RTE corpus and print its accuracy.

    :param algorithm: 'megam' or 'BFGS' (MEGAM is checked with
        ``check_megam_config`` first), or 'GIS' / 'IIS'.
    :return: the trained classifier.
    :raises Exception: if ``algorithm`` is none of the names above.
    """
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Train right away.  This branch used to bind ``clf`` to a lambda
        # wrapping the training call, so a function rather than a trained
        # classifier reached accuracy() below and was returned to callers.
        # NOTE(review): 'BFGS' is accepted here but may not be an algorithm
        # name MaxentClassifier.train understands -- verify.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
        )
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
示例#15
0
def main():
    '''
    Score a pickled classifier and classify the text given on the command line.

    According to the original note, this is the entry point of the
    'chitragoopt' executable script (defined in setup.py).

    The movie-review features are split 75/25 and the test part is used to
    print the accuracy of the classifier loaded from ``my_classifier.pickle``.
    ``sys.argv[1]`` is split on whitespace, turned into a bag of words and
    classified; the predicted label is printed once and then 50 more times.
    '''

    lfeats = label_feats_from_corpus(movie_reviews)
    # Only the test part is needed; the model is loaded, not trained.
    _, test_feats = split_label_feats(lfeats, split=0.75)
    print(sys.argv[1].split())
    negfeat = bag_of_words(sys.argv[1].split())

    # Pickles are binary data: text mode fails on Python 3.  The context
    # manager also closes the file if unpickling raises.
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # my_classifier.pickle that comes from a trusted source.
    with open('my_classifier.pickle', 'rb') as f:
        nb_classifier = pickle.load(f)
    print(accuracy(nb_classifier, test_feats))
    print(nb_classifier.classify(negfeat))

    for _ in range(0, 50):
        print(nb_classifier.classify(negfeat))
def rte_classifier(algorithm):
    """Train a MaxEnt classifier on the RTE corpus and print its accuracy.

    :param algorithm: 'megam' or 'BFGS' (MEGAM is checked with
        ``check_megam_config`` first), or 'GIS' / 'IIS'.
    :return: the trained classifier.
    :raises Exception: if ``algorithm`` is none of the names above.
    """
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Train right away.  Binding ``clf`` to a lambda that wraps the
        # training call (as before) handed a plain function to accuracy()
        # and to the caller instead of a trained classifier.
        # NOTE(review): 'BFGS' is accepted here but may not be an algorithm
        # name MaxentClassifier.train understands -- verify.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'BFGS', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
    def train(self):
        """Train a new Naive Bayes sentiment model and store it.

        Rows from ``self._get_train_data()`` are read as ``(label, text)``:
        element 0 is compared with ``self.NEGATIVE`` / ``self.POSITIVE`` and
        element 1 is tokenised and featurised.  90% of each label's rows
        (rounded) are used for training and the rest for the printed
        accuracy.  The model is stored in ``self._classifier``.

        :return: None
        """
        def labelled_feats(rows, label):
            # One (featureset, label) pair per row carrying this label.
            return [(word_feats(tokenize_clean_text(row[1])), label)
                    for row in rows if row[0] == label]

        percentage_to_train = 0.9

        data = self._get_train_data()
        neg_feats = labelled_feats(data, self.NEGATIVE)
        pos_feats = labelled_feats(data, self.POSITIVE)
        del data

        neg_cutoff = round(len(neg_feats) * percentage_to_train)
        pos_cutoff = round(len(pos_feats) * percentage_to_train)

        train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
        test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

        # Train Classifier.
        sizes = (len(train_feats), len(test_feats))
        print('train on %d instances, test on %d instances' % sizes)
        self._classifier = NaiveBayesClassifier.train(train_feats)
        print('accuracy: ', accuracy(self._classifier, test_feats))

        self._classifier.show_most_informative_features()
示例#18
0
    def test(self):
        """Return the accuracy of every classifier on its own test set.

        ``self._split_features()`` and ``self.train()`` are called first when
        the ``test_sets`` / ``classifiers`` attributes do not exist yet.

        :return: dict mapping each tag in ``self.classifiers`` to the
            accuracy of its classifier on ``self.test_sets[tag]``.
        """
        if not hasattr(self, 'test_sets'):
            self._split_features()
        if not hasattr(self, 'classifiers'):
            self.train()

        result = {}
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        for tag, classifier in self.classifiers.items():
            result[tag] = accuracy(classifier, self.test_sets[tag])
        return result
def results(train, query_data, query_no_label, query_labels):
    """Train a MEGAM MaxEnt model and evaluate it on the query data.

    :param train: labelled featuresets used for training.
    :param query_data: labelled featuresets used for the accuracy.
    :param query_no_label: featuresets passed to ``classify_many``.
    :param query_labels: gold labels compared with the predictions.
    :return: ``(accuracy, confusion_matrix)``.
    """
    print('\nCalculating final results...')

    # Build and train the maxent classifier, then score it.
    model = MaxentClassifier.train(train, 'megam')
    accu = accuracy(model, query_data)

    # Predicted labels against the gold labels.
    predicted_labels = model.classify_many(query_no_label)
    matrix = confusion_matrix(query_labels, predicted_labels)

    return accu, matrix
    def implementMethods(self,sents,labelsData,clsent):
        """Train a Naive Bayes classifier and print its accuracy in percent.

        The labelled features come from ``self.featureList(sents, labelsData)``
        and are divided into a training and a test part by ``self.setSplit``.
        ``clsent`` is accepted but not used here.

        :return: the trained ``NaiveBayesClassifier``.
        """
        labelled = self.featureList(sents, labelsData)
        train_part, test_part = self.setSplit(labelled)

        nb_classifier = NaiveBayesClassifier.train(train_part)
        percent = accuracy(nb_classifier, test_part) * 100
        print('Accuracy = ' + str(percent) + '%')
        return nb_classifier
示例#21
0
    def evaluate_accuracy(self):
        '''Return the mean accuracy over nine classification runs.

        For ``train_number`` 1 through 9 the private helper returns a
        ``(train_feats, test_feats, classifier)`` triple; each classifier is
        scored on its own test features and the scores are averaged.
        '''
        lfeats = self.label_feats_from_corpus()
        scores = []
        for train_number in range(1, 10):
            elements = self.__get_elements_for_classification(
                lfeats, train_number=train_number, classifying=False)
            _, test_feats, nb_classifier = elements
            scores.append(accuracy(nb_classifier, test_feats))
        return sum(scores)/len(scores)
示例#22
0
 def RunBayesNetwork(self, type_of_Feature_extractor):
     """Train a Naive Bayes classifier and return its test accuracy.

     ``type_of_Feature_extractor`` values 1-4 select
     ``BNFormat.Feature_extractor1`` .. ``Feature_extractor4`` as the third
     argument of ``BNFormat.format_data``; any other value calls
     ``format_data`` without an extractor argument.  The classifier's most
     informative features are printed before the accuracy is computed.
     """
     # Optional third positional argument for format_data.
     if type_of_Feature_extractor == 1:
         extractor_args = (BNFormat.Feature_extractor1,)
     elif type_of_Feature_extractor == 2:
         extractor_args = (BNFormat.Feature_extractor2,)
     elif type_of_Feature_extractor == 3:
         extractor_args = (BNFormat.Feature_extractor3,)
     elif type_of_Feature_extractor == 4:
         extractor_args = (BNFormat.Feature_extractor4,)
     else:
         extractor_args = ()

     # Positive and negative reviews are formatted separately, training
     # data first, then testing data.
     formatted_pos_training = BNFormat.format_data(self.pos_training_data, "pos", *extractor_args)
     formatted_neg_training = BNFormat.format_data(self.neg_training_data, "neg", *extractor_args)
     formatted_pos_testing = BNFormat.format_data(self.pos_testing_data, "pos", *extractor_args)
     formatted_neg_testing = BNFormat.format_data(self.neg_testing_data, "neg", *extractor_args)

     formatted_training_data = formatted_pos_training + formatted_neg_training
     formatted_testing_data = formatted_pos_testing + formatted_neg_testing

     # Train on the training reviews and show what the model relies on.
     nb_classifier = NaiveBayesClassifier.train(formatted_training_data)
     nb_classifier.show_most_informative_features()

     # Score on the held-out reviews.
     return accuracy(nb_classifier, formatted_testing_data)
示例#23
0
def evaluate_features(feature_extractor, N, only_acc=False):
    """Evaluate a feature extractor with Naive Bayes on the movie reviews.

    Every review's sentences are passed to ``feature_extractor``; the
    'neg' and 'pos' examples are each divided by ``stratifiedSamples(.., N)``
    into a training and a test part.

    :param feature_extractor: callable applied to ``movie_reviews.sents(...)``.
    :param N: forwarded to ``stratifiedSamples``.
    :param only_acc: when true, return the accuracy and print nothing.
    :return: the accuracy if ``only_acc`` is true, otherwise the classifier
        (after printing accuracy, precision, recall and F-measure).
    """
    from nltk.corpus import movie_reviews
    from nltk.classify import NaiveBayesClassifier as naive
    from nltk.classify.util import accuracy
    from nltk.metrics import precision, recall, f_measure
    from sys import stdout

    negative = movie_reviews.fileids('neg')
    positive = movie_reviews.fileids('pos')

    def featurise(fileids, label):
        return [(feature_extractor(movie_reviews.sents(fileids=[f])), label)
                for f in fileids]

    negtrain, negtest = stratifiedSamples(featurise(negative, 'neg'), N)
    postrain, postest = stratifiedSamples(featurise(positive, 'pos'), N)

    trainfeats = negtrain + postrain
    testfeats = negtest + postest
    classifier = naive.train(trainfeats)
    if only_acc:
        return accuracy(classifier, testfeats)
    print('accuracy: {}'.format(accuracy(classifier, testfeats)))

    # Precision, Recall, F-measure
    from collections import defaultdict
    gold = defaultdict(set)
    predicted = defaultdict(set)
    for idx, (feats, label) in enumerate(testfeats):
        gold[label].add(idx)
        predicted[classifier.classify(feats)].add(idx)

    report = (
        ('pos precision:', precision, 'pos'),
        ('pos recall:', recall, 'pos'),
        ('pos F-measure:', f_measure, 'pos'),
        ('neg precision:', precision, 'neg'),
        ('neg recall:', recall, 'neg'),
        ('neg F-measure:', f_measure, 'neg'),
    )
    for caption, metric, label in report:
        print('%s %s' % (caption, metric(gold[label], predicted[label])))
    stdout.flush()
    classifier.show_most_informative_features()
    return classifier
示例#24
0
文件: hw2.py 项目: lxmonk/nlg12_hw2
def evaluate_features(feature_extractor, N, only_acc=False):
    """Train and score a Naive Bayes model built from ``feature_extractor``.

    Features are extracted from the sentences of every 'neg' and 'pos'
    movie review.  ``stratifiedSamples(.., N)`` divides each label's
    examples into a training and a test part.

    :param feature_extractor: callable applied to ``movie_reviews.sents(...)``.
    :param N: forwarded to ``stratifiedSamples``.
    :param only_acc: when true, only the accuracy is computed and returned.
    :return: the accuracy if ``only_acc`` is true, otherwise the trained
        classifier (scores are printed first).
    """
    from nltk.corpus import movie_reviews
    from nltk.classify import NaiveBayesClassifier as naive
    from nltk.classify.util import accuracy
    from nltk.metrics import precision, recall, f_measure
    from sys import stdout

    negative = movie_reviews.fileids('neg')
    positive = movie_reviews.fileids('pos')

    negfeats = []
    for f in negative:
        negfeats.append((feature_extractor(movie_reviews.sents(fileids=[f])), 'neg'))
    posfeats = []
    for f in positive:
        posfeats.append((feature_extractor(movie_reviews.sents(fileids=[f])), 'pos'))

    negtrain, negtest = stratifiedSamples(negfeats, N)
    postrain, postest = stratifiedSamples(posfeats, N)

    testfeats = negtest + postest
    classifier = naive.train(negtrain + postrain)
    if only_acc:
        return accuracy(classifier, testfeats)
    print('accuracy: {}'.format(accuracy(classifier, testfeats)))

    # Precision, Recall, F-measure
    from collections import defaultdict
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for position, pair in enumerate(testfeats):
        feats, label = pair
        refsets[label].add(position)
        testsets[classifier.classify(feats)].add(position)

    captions = {
        'pos': ('pos precision:', 'pos recall:', 'pos F-measure:'),
        'neg': ('neg precision:', 'neg recall:', 'neg F-measure:'),
    }
    for label in ('pos', 'neg'):
        for caption, metric in zip(captions[label], (precision, recall, f_measure)):
            print('%s %s' % (caption, metric(refsets[label], testsets[label])))
    stdout.flush()
    classifier.show_most_informative_features()
    return classifier
示例#25
0
def dt_classify(filename):
    """Train a decision tree on the samples in ``filename`` and print accuracies.

    Samples are read with ``get_samples_stream`` and featurised with
    ``binary_bow_feature``; ``split_samples`` is called with a ratio of
    0.25 to divide them into a training and a test part.

    :return: the trained ``DecisionTreeClassifier``.
    """
    all_samples = list(binary_bow_feature(get_samples_stream(filename)))

    # A filter that dropped two sparsely populated categories
    # (common.AspectNothing, common.AspectBusiness) used to sit here and is
    # currently disabled; all samples are used.

    test_sample_ratio = 0.25
    train_samples, test_samples = split_samples(all_samples, test_sample_ratio)
    print("training set has {} samples, test set has {} samples".format(len(train_samples),len(test_samples)))

    classifier = DecisionTreeClassifier.train(
        train_samples, binary=True, depth_cutoff=15, verbose=True)
    print("training completes")

    for template, samples in (("training accuracy: {}", train_samples),
                              ("test accuracy: {}", test_samples)):
        print(template.format(accuracy(classifier, samples)))

    return classifier
示例#26
0
def find_scores():
    """Train a three-class Naive Bayes model and return its scores.

    ``./pos.txt``, ``./neg.txt`` and ``./neu.txt`` are read line by line
    (ISO-8859-1); each line becomes one example labelled 'positive',
    'negative' or 'neutre'.  The first 80% of each file is used for
    training and the rest for testing.

    :return: a 7-tuple: the test accuracy followed by six ``f_measure``
        values computed between index sets of *different* labels, in the
        order listed at the end of the function.
    """
    def format_text(text):
        # Bag-of-words featureset: every token maps to True.
        return ({word: True for word in nltk.word_tokenize(text)})

    def load_labelled(path, label):
        # One [featureset, label] entry per line of the file.
        examples = []
        with open(path, encoding='ISO-8859-1') as f:
            for raw in f:
                text = raw.encode("utf-8").decode("unicode-escape")
                examples.append([format_text(text), label])
        return examples

    pos = load_labelled("./pos.txt", 'positive')
    neg = load_labelled("./neg.txt", 'negative')
    neu = load_labelled("./neu.txt", 'neutre')

    # Split data into training (80%) and testing (20%) sets.
    pos_cut = int((.80) * len(pos))
    neg_cut = int((.80) * len(neg))
    neu_cut = int((.80) * len(neu))
    training_set = pos[:pos_cut] + neg[:neg_cut] + neu[:neu_cut]
    test_set = pos[pos_cut:] + neg[neg_cut:] + neu[neu_cut:]

    classifier = NaiveBayesClassifier.train(training_set)

    # Index sets per label: gold labels vs. labels the classifier produced.
    trueset = collections.defaultdict(set)
    testset = collections.defaultdict(set)
    for idx, (text, label) in enumerate(test_set):
        trueset[label].add(idx)
        testset[classifier.classify(text)].add(idx)

    return (
        accuracy(classifier, test_set),
        f_measure(trueset['positive'], testset['negative']),
        f_measure(testset['negative'], trueset['positive']),
        f_measure(testset['neutre'], trueset['positive']),
        f_measure(testset['positive'], trueset['neutre']),
        f_measure(testset['negative'], trueset['neutre']),
        f_measure(testset['neutre'], trueset['negative']),
    )
示例#27
0
def cross_validate():
    """Run 10-fold cross-validation of a Naive Bayes classifier.

    The data from ``load_training_set()`` is shuffled once, then for each
    fold a classifier is trained on the fold's training indices and scored
    on its evaluation indices.  The index range and accuracy of every fold
    and the average accuracy are printed.
    """
    n_folds = 10
    training_set = load_training_set()
    random.shuffle(training_set)
    total = 0
    cv = KFold(len(training_set), n_folds=n_folds, indices=True, shuffle=False, random_state=None)
    for traincv, evalcv in cv:
        # Select rows by index.  Slicing from the first to the last training
        # index (as before) swept the held-out fold into the training data
        # for every middle fold and dropped the final item of each slice.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]

        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)
        print(' '.join(['Range: ', str(evalcv[0]), 'to', str(evalcv[len(evalcv) - 1])]))
        print('Accuracy: %4.2f' % acc)
        total += acc
    print('Average accuracy: %4.2f' % (total / n_folds))
def classifyReviews():
    """Train and evaluate a Naive Bayes sentiment classifier on movie reviews.

    Reads ``data/movieReviews.csv``, keeps only the reviews rated 4 and 0
    (labelled 'pos' and 'neg' respectively), splits each group 80/20 into
    training and testing parts, trains on the combined training parts and
    prints overall, positive-only and negative-only accuracy followed by
    the most informative features.
    """
    reviews = pd.read_csv("data/movieReviews.csv")

    # Only the two extreme ratings are used for now.
    positive = getReviews(reviews, 4)
    negative = getReviews(reviews, 0)

    # 80% of each group trains the model, the rest is held out.
    posTrainText, posTestText = splitTrainTest(positive, 0.8)
    negTrainText, negTestText = splitTrainTest(negative, 0.8)

    # Convert raw texts into the (features, label) form the classifier needs.
    posTrain = formatForClassifier(posTrainText, 'pos')
    negTrain = formatForClassifier(negTrainText, 'neg')
    posTest = formatForClassifier(posTestText, 'pos')
    negTest = formatForClassifier(negTestText, 'neg')

    classifier = NaiveBayesClassifier.train(posTrain + negTrain)

    overall = accuracy(classifier, posTest + negTest)
    print("Accuracy of the classifier is: " + str(overall))
    on_positive = accuracy(classifier, posTest)
    print("Accuracy of the positive is: " + str(on_positive))
    on_negative = accuracy(classifier, negTest)
    print("Accuracy of the negative is: " + str(on_negative))

    classifier.show_most_informative_features()
示例#29
0
def naivebayes_classify(filename,filter_small_category):
    raw_sample_stream = get_samples_stream(filename)
    all_samples = list( binary_bow_feature(raw_sample_stream) )

    if filter_small_category:
    # filter out two classes of outliers
    # these two categories contain too few examples, so the word frequency in these two categories
    # cannot reflect the true probability
        all_samples = [(features,aspect) for features,aspect in all_samples if aspect != common.AspectNothing and aspect != common.AspectBusiness]

    test_sample_ratio = 0.25
    train_samples,test_samples = split_samples(all_samples,test_sample_ratio)
    print "training set has {} samples, test set has {} samples".format(len(train_samples),len(test_samples))

    classifier = NaiveBayesClassifier.train(train_samples)
    print "training completes"

    print "########## training accuracy: {}".format(accuracy(classifier,train_samples))
    print "########## test accuracy: {}".format(accuracy(classifier,test_samples))
    classifier.show_most_informative_features(n=10)

    return classifier
示例#30
0
def run_classifier_tests(classifier):
    """Measure *classifier* accuracy on the local sense files, then exit.

    Every line of each file is turned into labelled features by
    create_training_dict using the sense that file is mapped to. The
    accuracy is printed as a percentage and the process terminates via
    sys.exit().
    """
    testfiles = [{'traffic': 'traffic-training.txt'},
                 {'useless': 'useless-training.txt'}]
    testfeats = []
    for mapping in testfiles:
        for sense, loc in mapping.items():
            # The context manager closes the handle even if feature
            # extraction raises part-way through the file.
            with open(loc, 'r') as handle:
                for line in handle:
                    testfeats.extend(create_training_dict(line, sense))

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)

    sys.exit()
def run_classifier_tests(classifier):
   """Measure *classifier* accuracy on the remote sense files, then exit.

   Every line fetched from each URL is turned into labelled features by
   create_training_dict using the sense that URL is mapped to. The
   accuracy is printed as a percentage and the process terminates via
   sys.exit().
   """
   testfiles = [{'fruit': 'http://litfuel.net/plush/files/disambiguation/apple-fruit-training.txt'},
                {'company': 'http://litfuel.net/plush/files/disambiguation/apple-company-training.txt'}]
   testfeats = []
   for mapping in testfiles:
       for sense, loc in mapping.items():
           response = urllib2.urlopen(loc)
           try:
               for line in response:
                   testfeats.extend(create_training_dict(line, sense))
           finally:
               # Release the connection even when parsing a line fails.
               response.close()

   acc = accuracy(classifier, testfeats) * 100
   print('accuracy: %.2f%%' % acc)

   sys.exit()
def implementMethods(sents,labelsData,clsent):
    """Train a Naive Bayes classifier restricted to high-information words.

    sents -- sentences to learn from.
    labelsData -- label for each sentence, indexed in step with sents.
    clsent -- not used by this function.

    Prints the accuracy on the held-out split and returns the classifier.
    """
    # Pair every sentence's tokens with its label, by position.
    labelwords = [
        (labelsData[position], nltk.tokenize.word_tokenize(sentence))
        for position, sentence in enumerate(sents)
    ]
    informative = set(high_information_words(labelwords))

    def feat_det(words):
        # Keep only the words that were judged informative.
        return bag_of_words_in_set(words, informative)

    labelled = featureList(sents, labelsData, feature_detector=feat_det)
    train_part, test_part = setSplit(labelled)
    nb_classifier = NaiveBayesClassifier.train(train_part)
    print('Accuracy = '+str(accuracy(nb_classifier, test_part)*100)+'%')
    return nb_classifier
示例#33
0
def run_classifier_tests(classifier):
    """Measure *classifier* accuracy on the local sense files, then exit.

    Every line of each file is turned into labelled features by
    create_training_dict using the sense that file is mapped to. The
    accuracy is printed as a percentage and the process terminates via
    sys.exit().
    """
    testfiles = [{
        'traffic': 'traffic-training.txt'
    }, {
        'useless': 'useless-training.txt'
    }]
    testfeats = []
    for mapping in testfiles:
        for sense, loc in mapping.items():
            # The context manager closes the handle even if feature
            # extraction raises part-way through the file.
            with open(loc, 'r') as handle:
                for line in handle:
                    testfeats.extend(create_training_dict(line, sense))

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)

    sys.exit()
def random_test(iterations, haiku_labeled, short_labeled, corpus_length):
    """Score classifiers trained on deliberately mislabelled corpora.

    The first half of *haiku_labeled* is relabelled 'not-haiku' and the
    first half of *short_labeled* is relabelled 'haiku'. The entries are
    modified IN PLACE through ``entry[1][1]``. For each of *iterations*
    rounds, *corpus_length* entries are sampled from each mixed corpus,
    the first three quarters train a Naive Bayes classifier and the rest
    test it. Returns the list of accuracies, one per round.
    """
    import random
    from nltk.classify import NaiveBayesClassifier  # featuresets are taken from entry[1]
    from nltk.classify.util import accuracy

    scores = []

    # Swap the labels on the first half of each corpus.
    haiku_mid = int(len(haiku_labeled) / 2)
    haiku_a, haiku_b = haiku_labeled[0:haiku_mid], haiku_labeled[haiku_mid:]
    for entry in haiku_a:
        entry[1][1] = 'not-haiku'
    poetry_mid = int(len(short_labeled) / 2)
    poetry_a, poetry_b = short_labeled[0:poetry_mid], short_labeled[poetry_mid:]
    for entry in poetry_a:
        entry[1][1] = 'haiku'

    # Each pool mixes genuine and falsely labelled items.
    haiku_sample = haiku_b + poetry_a
    poetry_sample = haiku_a + poetry_b

    for _ in range(iterations):
        haiku_random = random.sample(haiku_sample, corpus_length)
        poetry_random = random.sample(poetry_sample, corpus_length)

        # Three folds of size cut_point train; everything after them tests.
        cut_point = int(corpus_length / 4)
        boundary = cut_point * 3
        train_set = haiku_random[0:boundary] + poetry_random[0:boundary]
        test_set = haiku_random[boundary:] + poetry_random[boundary:]

        nb_classifier = NaiveBayesClassifier.train([item[1] for item in train_set])
        nb_classifier.labels()
        scores.append(
            accuracy(nb_classifier, [item[1] for item in test_set]))
    return scores
示例#35
0
def trainDanger():
    """Train a 'danger' vs 'calm' Naive Bayes classifier from two text files.

    Each line of ./anger.txt is labelled 'danger' and each line of
    ./calm.txt is labelled 'calm'. The first 80% of each list trains the
    model and the remainder is used to print test accuracy. Returns the
    trained classifier.
    """

    def labelled_lines(path, label):
        # One [features, label] entry per line of the file.
        with open(path) as f:
            return [[format_sentence(line), label] for line in f]

    danger = labelled_lines("./anger.txt", 'danger')
    calm = labelled_lines("./calm.txt", 'calm')

    danger_cut = int((.8) * len(danger))
    calm_cut = int((.8) * len(calm))
    training = danger[:danger_cut] + calm[:calm_cut]
    test = danger[danger_cut:] + calm[calm_cut:]

    from nltk.classify import NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(training)
    from nltk.classify.util import accuracy
    print("Test data accuracy" + str(accuracy(classifier, test)))
    classifier.show_most_informative_features()
    return classifier
示例#36
0
    def cross_validate(self):
        """
        Performs cross validation by training the model on 90% of the
        corpus then checking the accuracy on the remaining 10%.

        Stores the result in ``self.accuracy`` and the elapsed wall-clock
        time in ``self.validtime``.
        """
        start = time.time()

        feats = self.featureset()
        random.shuffle(feats)

        # Integer division keeps the offset usable as a slice index.
        offset = len(feats) // 10

        # The first tenth is held out; the remaining 90% trains the model.
        # (Previously the two slices were swapped, training on only 10%.)
        test = feats[:offset]
        train = feats[offset:]

        classifier, _ = self.train(train)
        self.accuracy = accuracy(classifier, test)

        self.validtime = time.time() - start
示例#37
0
def rte_classifier(trainer, features=rte_features):
    """
    Train and evaluate a classifier on the RTE pair corpora.

    trainer -- callable that receives a list of (featureset, label) tuples
        and returns a classifier.
    features -- callable that maps an RTE pair to its featureset.

    Trains on the three dev files, prints the accuracy measured on the
    three test files and returns the classifier.
    """
    dev_files = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    test_files = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']
    dev_pairs = nltk.corpus.rte.pairs(dev_files)
    held_out_pairs = nltk.corpus.rte.pairs(test_files)

    print('Training classifier...')
    train_set = []
    for pair in dev_pairs:
        label = pair.value
        train_set.append((features(pair), label))
    classifier = trainer(train_set)

    print('Testing classifier...')
    test_set = []
    for pair in held_out_pairs:
        label = pair.value
        test_set.append((features(pair), label))
    acc = accuracy(classifier, test_set)
    print('Accuracy: %6.4f' % acc)

    return classifier
示例#38
0
    def cross_validate(self):
        """
        Performs cross validation by training the model on 90% of the
        corpus then checking the accuracy on the remaining 10%.

        Stores the result in ``self.accuracy`` and the elapsed wall-clock
        time in ``self.validtime``.
        """
        start = time.time()

        feats = self.featureset()
        random.shuffle(feats)

        # Integer division keeps the offset usable as a slice index.
        offset = len(feats) // 10

        # The first tenth is held out; the remaining 90% trains the model.
        # (Previously the two slices were swapped, training on only 10%.)
        test = feats[:offset]
        train = feats[offset:]

        classifier, _ = self.train(train)
        self.accuracy = accuracy(classifier, test)

        self.validtime = time.time() - start
示例#39
0
def rte_classifier(trainer, features=rte_features):
    """
    Train and evaluate a classifier on the RTE pair corpora.

    trainer -- callable that receives a list of (featureset, label) tuples
        and returns a classifier.
    features -- callable that maps an RTE pair to its featureset.

    Trains on the three dev files, prints the accuracy measured on the
    three test files and returns the classifier.
    """
    dev_files = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    test_files = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']
    dev_pairs = nltk.corpus.rte.pairs(dev_files)
    held_out_pairs = nltk.corpus.rte.pairs(test_files)

    print('Training classifier...')
    train_set = []
    for pair in dev_pairs:
        label = pair.value
        train_set.append((features(pair), label))
    classifier = trainer(train_set)

    print('Testing classifier...')
    test_set = []
    for pair in held_out_pairs:
        label = pair.value
        test_set.append((features(pair), label))
    acc = accuracy(classifier, test_set)
    print('Accuracy: %6.4f' % acc)

    return classifier
示例#40
0
def cross_validate():
    """Run k-fold cross-validation of a Naive Bayes classifier.

    Loads the training set, shuffles it in place, and for every fold
    trains on the fold's training indices and measures accuracy on its
    evaluation indices. Per-fold and average accuracies are printed.
    """
    n_folds = 10
    training_set = load_training_set()
    random.shuffle(training_set)
    total = 0.0
    cv = KFold(len(training_set),
               n_folds=n_folds,
               indices=True,
               shuffle=False,
               random_state=None)
    for traincv, evalcv in cv:
        # Select items by index. The training indices of a middle fold are
        # not contiguous, so slicing from the first to the last index would
        # pull the held-out fold into the training data (and drop the last
        # item of every fold).
        train_items = [training_set[i] for i in traincv]
        eval_items = [training_set[i] for i in evalcv]
        classifier = NaiveBayesClassifier.train(train_items)
        acc = accuracy(classifier, eval_items)
        print('Range:  %s to %s' % (evalcv[0], evalcv[-1]))
        print('Accuracy: %4.2f' % acc)
        total += acc
    print('Average accuracy: %4.2f' % (total / n_folds))
示例#41
0
def train(cleanedDataCollection, word_features, tagToBeTrained, high_info_wordSet):
	random.shuffle(cleanedDataCollection)
	featuresets = [(extractTfIdfFeatureOfADocument(d,word_features, high_info_wordSet), c) for (d,c) in cleanedDataCollection]
	train_set, test_set = featuresets[800:], featuresets[:800]

	# classifier = nltk.NaiveBayesClassifier.train(train_set)
	# print(nltk.classify.accuracy(classifier, test_set))
	# classifier.show_most_informative_features(5) 
	# return classifier

	sk_classifier = SklearnClassifier(MultinomialNB())
	sk_classifier.train(train_set)
	print "accuracy is: %s" % (accuracy(sk_classifier, test_set))

	precision, recall, fMeasure = precision_recall_fmeasure(sk_classifier,  test_set, tagToBeTrained)

	print "precision is: %s" % (precision)
	print "recall is: %s" % (recall)
	print "F-measure is: %s" % (fMeasure)
	return sk_classifier
def trainAndTest(trFeatureSets, trLabels, teFeatureSets, teLabels, types, toPrint):
    """Train a Naive Bayes classifier and print accuracy, precision and recall.

    trFeatureSets / trLabels -- training feature sets and their labels.
    teFeatureSets / teLabels -- test feature sets and their expected labels.
    types -- labels for which precision and recall are tabulated.
    toPrint -- when true, also print the expected and predicted label lists.

    A metric whose denominator is zero is reported as '-'.
    """
    classifier = NaiveBayesClassifier.train(mapFeaturesToLabels(trFeatureSets, trLabels))
    nuLabels = classifier.classify_many(teFeatureSets)
    if toPrint:
        print('Test labels: %s' % (teLabels,))
        print('New labels: %s' % (nuLabels,))
    print('Accuracy: %.2f' % accuracy(classifier, mapFeaturesToLabels(teFeatureSets, teLabels)))
    # classifier.show_most_informative_features()

    teIndices = getLabelIndices(teLabels, types)
    nuIndices = getLabelIndices(nuLabels, types)
    teCounts = Counter(teLabels)
    nuCounts = Counter(nuLabels)
    metrices = []
    for label in types:
        # Positions where the expected and predicted label both equal `label`.
        matches = len(teIndices[label] & nuIndices[label])
        precision = 1.0 * matches / nuCounts[label] if nuCounts[label] > 0 else '-'
        recall = 1.0 * matches / teCounts[label] if teCounts[label] > 0 else '-'
        metrices.append([label, precision, recall])
    print(tabulate(metrices, ['LABEL', 'PRECISION', 'RECALL'], tablefmt='fancy_grid', floatfmt='.2f'))
示例#43
0
def training():
	"""Train and pickle a sentiment classifier, then print held-out accuracy.

	Reads positive.txt, negative.txt and emoji.txt (latin-1). A classifier
	trained on every positive and negative sentence is written to
	classifier.pickle. A second classifier is trained on the first three
	quarters of the sentences plus the emoji features, and its accuracy on
	the remaining quarter is printed as a percentage.
	"""
	with open("positive.txt", 'r', encoding='latin-1') as handle:
		pos_sen = handle.read()
	with open("negative.txt", 'r', encoding='latin-1') as handle:
		neg_sen = handle.read()
	with open("emoji.txt", 'r', encoding='latin-1') as handle:
		emoji = handle.read()

	pos_emoji = []
	neg_emoji = []
	for entry in emoji.split('\n'):
		if not entry:
			# A blank line (e.g. produced by a trailing newline) has no
			# second-to-last character and used to raise IndexError.
			continue
		if entry[len(entry) - 2] == '-':
			# A '-' in the second-to-last position marks a negative emoji;
			# the last two characters are dropped from the feature name.
			neg_emoji.append(({entry[:-2]: True}, 'negative'))
		else:
			# Otherwise the entry is positive; only the last character is dropped.
			pos_emoji.append(({entry[:-1]: True}, 'positive'))

	prev = [(features(words), 'positive') for words in pos_sen.split('\n')]
	nrev = [(features(words), 'negative') for words in neg_sen.split('\n')]

	# NOTE(review): the saved classifier is trained without the emoji
	# features, exactly as before; confirm whether they were meant to be
	# included.
	real_classifier = NaiveBayesClassifier.train(prev+nrev)

	# Save to a file to avoid training on the data again.
	with open("classifier.pickle", 'wb') as save_doc:
		pickle.dump(real_classifier, save_doc)

	# Accuracy check on a 3/4 - 1/4 split (previously measured: 78.1695423855964).
	ncutoff = int(len(nrev)*3/4)
	pcutoff = int(len(prev)*3/4)
	train_set = nrev[:ncutoff] + prev[:pcutoff] + pos_emoji + neg_emoji
	test_set = nrev[ncutoff:] + prev[pcutoff:]
	test_classifier = NaiveBayesClassifier.train(train_set)

	print("Accuracy is : ", util.accuracy(test_classifier, test_set) * 100)
示例#44
0
def main():
    """Train a Naive Bayes classifier on movie_reviews and print its accuracy.

    Three quarters of each polarity train the model; the remaining quarter
    is used for the accuracy figure, printed as a percentage.
    """
    # Positive ids must feed the 'positive' label and negative ids the
    # 'negative' label; they were previously crossed, so the classifier
    # learned inverted labels.
    pos_ids = movie_reviews.fileids('pos')
    neg_ids = movie_reviews.fileids('neg')

    prev = [(features(movie_reviews.words(fileids=fid)), 'positive')
            for fid in pos_ids]
    nrev = [(features(movie_reviews.words(fileids=fid)), 'negative')
            for fid in neg_ids]

    ncutoff = int(len(nrev) * 3 / 4)
    pcutoff = int(len(prev) * 3 / 4)

    train_set = nrev[:ncutoff] + prev[:pcutoff]
    test_set = nrev[ncutoff:] + prev[pcutoff:]

    # NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(train_set)

    # Accuracy
    print("Accuracy is : ", util.accuracy(classifier, test_set) * 100)
示例#45
0
def makePrediction():
    """Train on high-information words from movie_reviews and print accuracy.

    Prints the corpus labels, a preview of the labelled words and of the
    high-information words, then trains a classifier through
    ClassifierTrainer and prints its accuracy on the held-out features.
    """
    labels = movie_reviews.categories()
    print("Labels for reviews are: {}\n".format(labels) )

    labeled_words = []
    for label in labels:
        labeled_words.append((label, movie_reviews.words(categories=[label])))
    print("Labeled words:{}\n".format(labeled_words[:10]))

    high_info_words = set(Toolbox.high_information_words(labeled_words))
    preview = list(high_info_words)[:10]
    print("High information  words:{}\n".format(preview))

    def feat_det(words):
        # Bag-of-words restricted to the high-information vocabulary.
        return Toolbox.bag_of_words_in_set(words, high_info_words)

    lfeats = Toolbox.label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
    train_feats, test_feats = Toolbox.split_label_feats(lfeats)
    mv_classifier = ClassifierTrainer.trainClassifier(train_feats)

    accuracyScore = accuracy(mv_classifier, test_feats)
    print("Accuracy is {}".format(accuracyScore))
示例#46
0
    def train(self):
        print 'Classifier Training in progress....'
        poscutoff = len(self.positiveFeatures)
        negcutoff = len(self.negativeFeatures)
        print "Train Pos Cutoff: " + str(
            poscutoff) + " Train Neg Cutoff: " + str(negcutoff)
        trainfeats = self.positiveFeatures[:
                                           poscutoff] + self.negativeFeatures[:
                                                                              negcutoff]

        testfeats = self.test()
        print 'Train on %d instances, test on %d instances' % (len(trainfeats),
                                                               len(testfeats))
        self.classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', accuracy(self.classifier, testfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = self.classifier.classify(feats)
            #print label, observed
            testsets[observed].add(i)

        print 'pos precision:', nltk.metrics.precision(refsets['pos'],
                                                       testsets['pos'])
        print 'pos recall:', nltk.metrics.recall(refsets['pos'],
                                                 testsets['pos'])
        print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'],
                                                       testsets['pos'])
        print 'neg precision:', nltk.metrics.precision(refsets['neg'],
                                                       testsets['neg'])
        print 'neg recall:', nltk.metrics.recall(refsets['neg'],
                                                 testsets['neg'])
        print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'],
                                                       testsets['neg'])
示例#47
0
def trainLove():
    """Train a 'pos' vs 'neg' Naive Bayes classifier from two text files.

    Each line of ./pos_rom.txt and ./neg_rom.txt is lower-cased, stripped
    of common punctuation and converted with format_sentence. The first
    80% of each list trains the model and the rest is used to print test
    accuracy. Returns the trained classifier.
    """
    # Delete common punctuation such as commas. A raw string keeps the
    # backslashes literal without relying on invalid escape sequences.
    regex = re.compile(r'[\[,\.!?—\]]')
    pos = []
    with open("./pos_rom.txt") as f:
        for i in f:
            pos.append([format_sentence(regex.sub('', i.lower())), 'pos'])
    neg = []
    with open("./neg_rom.txt") as f:
        for i in f:
            neg.append([format_sentence(regex.sub('', i.lower())), 'neg'])

    # Next, split the labelled data into training and test data.
    training = pos[:int((.8) * len(pos))] + neg[:int((.8) * len(neg))]
    test = pos[int((.8) * len(pos)):] + neg[int((.8) * len(neg)):]
    from nltk.classify import NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(training)
    from nltk.classify.util import accuracy
    print("Test data accuracy" + str(accuracy(classifier, test)))
    classifier.show_most_informative_features()
    return classifier
def main():
    """Train a Naive Bayes classifier on movie_reviews and print its accuracy.

    Three quarters of each polarity train the model; the remaining quarter
    is used for the accuracy figure, printed as a percentage.
    """
    # Positive ids must feed the 'positive' label and negative ids the
    # 'negative' label; they were previously crossed, so the classifier
    # learned inverted labels.
    pos_ids = movie_reviews.fileids('pos')
    neg_ids = movie_reviews.fileids('neg')

    prev = [(features(movie_reviews.words(fileids = fid)), 'positive') for fid in pos_ids]
    nrev = [(features(movie_reviews.words(fileids = fid)), 'negative') for fid in neg_ids]

    ncutoff = int(len(nrev)*3/4)
    pcutoff = int(len(prev)*3/4)

    # Training and testing sets
    train_set = nrev[:ncutoff] + prev[:pcutoff]
    test_set = nrev[ncutoff:] + prev[pcutoff:]

    # Train the text classification model
    classifier = NaiveBayesClassifier.train(train_set)

    # Accuracy
    print ("Accuracy is : ", util.accuracy(classifier, test_set) * 100)
示例#49
0
def split_label_feats(lfeats, split=0.90):
    """Split per-label feature lists into shuffled train and test sets.

    lfeats -- mapping from label to a list of feature sets. Each list is
        shuffled IN PLACE.
    split -- fraction of every label's items that goes to training.

    Returns (train_feats, test_feats), two lists of (feat, label) tuples.
    """
    train_feats = []
    test_feats = []
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for label, feats in lfeats.items():
        # The optional second argument of random.shuffle was removed in
        # Python 3.11; the default generator is equivalent here.
        random.shuffle(feats)
        cutoff = int(len(feats) * split)
        train_feats.extend((feat, label) for feat in feats[:cutoff])
        test_feats.extend((feat, label) for feat in feats[cutoff:])
    return train_feats, test_feats




# Interactive-style walkthrough: bare expressions such as the next line
# only display a value in a REPL and have no effect when run as a script.
reader.categories()

# Build the per-label feature lists from the corpus reader, then split
# them into training and test sets with the default split fraction.
lfeats = label_feats_from_corpus(reader)
lfeats.keys()
train_feats, test_feats = split_label_feats(lfeats)
len(train_feats)
len(test_feats)

# Train a Naive Bayes classifier on the training features.
from nltk.classify import NaiveBayesClassifier
nb_classifier = NaiveBayesClassifier.train(train_feats)
nb_classifier.labels()

# Accuracy of the trained classifier on the held-out features.
from nltk.classify.util import accuracy
accuracy(nb_classifier, test_feats)


示例#50
0
from nltk.classify.util import accuracy

def bag_of_words(words):
	"""Return a feature dict that maps every item of *words* to True."""
	return {word: True for word in words}

train_feats =[];
test_feats=[];
#print(movie_reviews.categories()); #list categories
#There are 1000 positive files and 1000 negative files
for label in movie_reviews.categories():
	#print [label]; #display neg,pos
	i=0;
	for fileid in movie_reviews.fileids([label]):
		i=i+1;
		words=movie_reviews.words(fileid);
		bag=bag_of_words(words);
		features=[bag,label];
		if i < 750:
			train_feats.append(features);
		else:
			test_feats.append(features);
#print len(train_feats);
classifier = NaiveBayesClassifier.train(train_feats); #Naive bayes classifier
#test the data
posi=bag_of_words(['kate','winslet','is','accessible']);
nega=bag_of_words(['the','plot','was','ludicrous']);
print classifier.classify(nega);
print classifier.classify(posi);
#display accuracy
print(accuracy(classifier,test_feats));
示例#51
0
# Train a Naive Bayes classifier on training_set (defined outside this excerpt).
classifier = nltk.NaiveBayesClassifier.train(training_set)

def train(labeled_featuresets, estimator=ELEProbDist):
    """Abridged sketch of building a NaiveBayesClassifier from two distributions.

    NOTE(review): labeled_featuresets is never read here, and
    label_freqdist is not defined inside this function, so it must exist
    at module level for the call below to work. This looks like a
    shortened excerpt rather than a working trainer; confirm before use.
    """
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    # (left empty in this excerpt)
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)

#print "Most informative features"
#print classifier.show_most_informative_features(32)

# Classify one sample tweet and print it between separator lines.
print "-"*50
tweet = 'Multiple users on one machine never seemed to behave like youd expect'
print tweet
print "___/^"
#print extract_features(tweet.split())
print classifier.classify(extract_features(tweet.split()))
print "-"*50

print listare[0]


# Classify the first element of every entry in listare; the predictions
# are computed but discarded.
for texte in listare:
        classifier.classify(extract_features(texte[0]))

# Accuracy is measured on the training set itself, not on held-out data.
acc = accuracy(classifier, training_set) * 100
print 'accuracy: %.2f%%' % acc

示例#52
0
        feature[word] = True
    pdata.append((feature, 'POSITIVE'))
# One ({word: True, ...}, 'NEGATIVE') sample per negative review file.
fileids = nc.movie_reviews.fileids('neg')
ndata = [
    ({word: True for word in nc.movie_reviews.words(fileid)}, 'NEGATIVE')
    for fileid in fileids
]

# The first 80% of each polarity trains the model; the rest tests it.
pnumb = int(0.8 * len(pdata))
nnumb = int(0.8 * len(ndata))
train_data = pdata[:pnumb] + ndata[:nnumb]
test_data = pdata[pnumb:] + ndata[nnumb:]
model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model, test_data)
print('%.2f%%' % round(ac * 100, 2))

reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommand it to anyone.',
    'The cinemagraphy is pretty great in this movie',
    'The direction was terrible and the story was all over the place'
]
sents, probs = [], []
for review in reviews:
    # Split on single spaces and mark every resulting token as present.
    words = review.split(' ')
    feature = dict.fromkeys(words, True)
    pcls = model.prob_classify(feature)
    sent = pcls.max()
示例#53
0
		print 'training a multi-binary %s classifier' % args.algorithm
	
	classifier = MultiBinaryClassifier.train(labels, train_feats, trainf, **train_kwargs)
else:
	if args.trace:
		print 'training a %s classifier' % args.algorithm
	
	classifier = trainf(train_feats, **train_kwargs)

################
## evaluation ##
################

if not args.no_eval:
	if not args.no_accuracy:
		print 'accuracy: %f' % accuracy(classifier, test_feats)
	
	if args.multi and args.binary and not args.no_masi_distance:
		print 'average masi distance: %f' % (scoring.avg_masi_distance(classifier, test_feats))
	
	if not args.no_precision or not args.no_recall or not args.no_fmeasure:
		if args.multi and args.binary:
			refsets, testsets = scoring.multi_ref_test_sets(classifier, test_feats)
		else:
			refsets, testsets = scoring.ref_test_sets(classifier, test_feats)
		
		for label in labels:
			ref = refsets[label]
			test = testsets[label]
			
			if not args.no_precision:
示例#54
0
def wordsInCorpus(corpus):
    """Return the normalized non-stop words of every line in *corpus*.

    A word is tested against stopWords in its raw form and passed through
    normalize() only when it is kept.
    """
    return [
        normalize(word)
        for line in corpus
        for word in line.split()
        if word not in stopWords
    ]

# Read both corpora, closing each file as soon as its words are extracted.
with open("finalPositiveCorpus.txt", "r") as posCorpus:
    poswords = wordsInCorpus(posCorpus)
with open("finalNegativeCorpus.txt", "r") as negCorpus:
    negwords = wordsInCorpus(negCorpus)

# One (featureset, label) pair for every non-stop word in each corpus.
# NaiveBayesClassifier.train requires every featureset to be a dict that
# maps feature names to values; passing the bare word string is what made
# the script crash, so each word is wrapped in a one-entry dict.
labeled_features = ([({'word': word}, 'pos') for word in poswords] +
                    [({'word': word}, 'neg') for word in negwords])

import random
random.shuffle(labeled_features)

# Floor division keeps the cutoff an integer so it can be used in a slice.
cutOff = len(labeled_features) * 3 // 4

# NOTE(review): this trains on the last quarter and tests on the first
# three quarters, as before; confirm the split was not meant to be reversed.
train_set, test_set = labeled_features[cutOff:], labeled_features[:cutOff]

classifier = NaiveBayesClassifier.train(train_set)

print('accuracy: %s' % accuracy(classifier, test_set))
arquivoClassificador.close()
# Load the already-labelled answers; the file is closed even if parsing fails.
with open('classificados.json') as arquivoClassificados:
	classificados = ujson.load(arquivoClassificados)
sentimentos = {}
featuresClassificados = []
comeco = datetime.now()
for resposta in classificados:
	texto = resposta['corpo']
	frases = tokenizerFrases.tokenize(texto)
	# Bag-of-words features over every sentence, skipping stop words.
	feature = {}
	for frase in frases:
		palavras = tokenizerPalavras.tokenize(frase)
		palavras = [palavra for palavra in palavras if palavra not in stopWords]
		for palavra in palavras:
			feature[palavra] = True
	# Keyed by the answer text, so answers with identical text overwrite each other.
	sentimentos[texto] = (resposta, classificador.classify(feature))
	featuresClassificados.append((feature, resposta['sentimento']))
tempo = datetime.now() - comeco
# Record the elapsed time and the accuracy against the existing labels.
with open('medicoes_analise_sequencial.txt', 'w') as arquivoMedicoes:
	arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(accuracy(classificador, featuresClassificados) * 100))
# One CSV row per distinct answer text: text, points, predicted sentiment, stored sentiment.
linhas = [['Resposta', 'Pontos', 'Sentimento - Naive Bayes', 'Sentimento - AlchemyAPI']]
for texto, tupla in sentimentos.items():
	resposta = tupla[0]
	linhas.append([texto, resposta['pontos'], tupla[1], resposta['sentimento']])
with open('resultados_sem_stopwords.csv', 'w', newline='') as arquivoResultados:
	w = writer(arquivoResultados, delimiter=',')
	w.writerows(linhas)
def word_features(words):
    """Return a feature dict that maps every item of *words* to True."""
    return {word: True for word in words}


# Get the file ids of the positive and the negative movie reviews.
posRev = movie_reviews.fileids('pos')
negRev = movie_reviews.fileids('neg')

# Build one (features, label) pair per review file.
posWords = [(word_features(movie_reviews.words(fileids=[f])), 'pos') for f in posRev]
negWords = [(word_features(movie_reviews.words(fileids=[f])), 'neg') for f in negRev]

# Cut each polarity in half. Floor division keeps the cutoffs integers so
# they stay valid slice indices under Python 3 as well.
posCutoff = len(posWords) * 50 // 100
negCutoff = len(negWords) * 50 // 100

# The second half of each polarity is the test data, the first half trains.
TestRev = posWords[posCutoff:] + negWords[negCutoff:]
Test_set = array(TestRev)
TrainRev = posWords[:posCutoff] + negWords[:negCutoff]
Train_set = array(TrainRev)
print('train on %d instances, test on %d instances' % (len(Train_set), len(Test_set)))

# Train a Maximum Entropy classifier with the first listed algorithm.
algo = MaxentClassifier.ALGORITHMS[0]
classifier = MaxentClassifier.train(Train_set, algorithm=algo, max_iter=3)
classifier.show_most_informative_features(10)

# Print the accuracy on the test data.
print('Accuracy is %s' % util.accuracy(classifier, Test_set))
				elif sentimento == 'negativo':
					negativos.append(resposta)
				else:
					neutros.append(resposta)
# Start one ThreadSentimento per labelled answer, then wait for all of
# them, timing the whole run.
threads = []
comeco = datetime.now()
for resposta in classificados:
	thread = ThreadSentimento(resposta)
	threads.append(thread)
	thread.start()
for thread in threads:
	thread.join()
tempo = datetime.now() - comeco
# The iteration counter is part of the measurement file name.
iteracao = iteracao + 1
arquivoMedicoes = open('medicoes_analise_threads_' + str(iteracao) + '.txt', 'w')
# Accuracy in percent against the existing labels.
precisao = accuracy(classificador, featuresClassificados) * 100
arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(precisao))
arquivoMedicoes.close()
# Concatenate the three pending results (created outside this excerpt);
# each .get() presumably blocks until that result is ready. TODO confirm.
features = resultadoPositivos.get() + resultadoNegativos.get() + resultadosNeutros.get()
# NOTE(review): terminate() is called before close() on each pool; verify
# this order is intended.
pool1.terminate()
pool1.close()
pool2.terminate()
pool2.close()
pool3.terminate()
pool3.close()
if precisao > 50:
	features.extend(featuresClassificados)
	shuffle(features)
	classificador = NaiveBayesClassifier.train(features)
	arquivoClassificador = open('classificador.pickle', 'wb')
	dump(classificador, arquivoClassificador, protocol=HIGHEST_PROTOCOL)