# NOTE(review): this fragment was collapsed onto a single physical line;
# reconstructed here into conventional layout.  It relies on names defined
# elsewhere in the file: `nltk`, `classifier`, `extract_features`,
# `test_tweets`, `training_set`, `Pipeline`, `TfidfTransformer`,
# `SelectKBest`, `chi2`, `MultinomialNB`, `SklearnClassifier`.
#
# Step 1: evaluate the already-trained NLTK classifier on the test tweets.
testing_set = nltk.classify.apply_features(extract_features, test_tweets)
for (tweet, sentiment) in test_tweets:
    # Per-tweet predicted label (gold `sentiment` is unused here).
    print(classifier.classify(extract_features(tweet)))
# NOTE(review): assuming the summary prints sit AFTER the loop — confirm
# against the original (pre-collapse) layout.
print(nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(5)

# Dead no-op string literal kept from the original (an earlier, identical
# pipeline definition left behind as a pseudo-comment).
"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k='all')),
                     ('nb', MultinomialNB())])
"""

# Step 2: train a scikit-learn pipeline (tf-idf -> chi2 feature selection
# -> multinomial Naive Bayes) wrapped in an NLTK SklearnClassifier, and
# evaluate it the same way.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k='all')),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)
classif.train(training_set)
print(classif.labels())
for (tweet, sentiment) in test_tweets:
    print(classif.classify(extract_features(tweet)))
print(nltk.classify.accuracy(classif, testing_set))
# NOTE(review): this fragment is truncated at BOTH ends — it opens inside a
# try/except whose `try` is not visible, and it ends mid-call to
# `prob_classify_many(` — so the code is left byte-identical.
# HACK: the bare `except:` clauses here swallow every exception (including
# KeyboardInterrupt/SystemExit) and print only a generic message; they should
# name the expected exception types and log the actual error.
# Visible behavior: close a pickled-classifier file handle, train an NLTK
# SklearnClassifier wrapping sklearn's LogisticRegression on `training_set`,
# print its accuracy/labels/type, then attempt classify_many and
# prob_classify_many on the feature dicts extracted from `testing_set`.
save_classifier.close() except: print("Pb dans le NaiveBayesClassifier") LogisticRegression_classifier = SklearnClassifier( LogisticRegression()) LogisticRegression_classifier.train(training_set) print("sklearn classifier créer en LogisticRegression : \n", LogisticRegression_classifier) #LogisticRegression_classifier.fit(training_set) #print(LogisticRegression_classifier) print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100) print("Labels :", LogisticRegression_classifier.labels()) print("Type : ", type(LogisticRegression_classifier)) dictum = [tupl[0] for tupl in testing_set] print("dictum : ", dictum) try: print("classify many:", LogisticRegression_classifier.classify_many(dictum)) except: print("classify many erreur \n", "Type testing_set: ", type(dictum), "\n testing_set :", dictum) try: print("prob_classify_many:", LogisticRegression_classifier.prob_classify_many(dictum)) for probdisti in LogisticRegression_classifier.prob_classify_many(
print "creating feature sets..." tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/testdata.csv') labeld_features = label_feats_from_tweets(tweetlist) #labeld_features = label_feats_from_corpus(movie_reviews) training_set, test_set = split_label_feats(labeld_features) # tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv') # training_set = label_feats_from_tweets(tweetlist) # training_set, garbage = split_label_feats(training_set, 1.0) # test_set, garbage = split_label_feats(labeld_features, 1.0) print "training set length: %i test set length: %i" % (len(training_set), len(test_set)) print prettifyFeatureSet(test_set) print "training classifier..." #classifier = NaiveBayesClassifier.train(training_set) #classifier = MaxentClassifier.train(training_set, algorithm='iis', max_iter=99, min_lldelta=0.01) #classifier = MaxentClassifier.train(training_set) classifier = SklearnClassifier(LogisticRegression()).train(training_set) print "calculating accuracy..." print 'accuracy:', nltk.classify.util.accuracy(classifier, test_set) #classifier.show_most_informative_features(30) negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous']) print classifier.classify(negfeat) probdist = classifier.prob_classify(negfeat) print "pos: ", probdist.prob('pos'), " neg: ", probdist.prob('neg') print classifier.labels() classify_tweet(classifier, "I love this movie!", True) classify_tweet(classifier, "!!!", True)
# NOTE(review): fragment reconstructed from a collapsed one-liner.  It relies
# on names defined outside this view: `corpus`, `category`, `features`,
# `labelize`, `MultinomialNB`, `SklearnClassifier`, `scores`, `accuracies` —
# presumably it sits inside a loop over categories; confirm against the
# original layout.
#
# Build a held-out split, train a multinomial Naive Bayes one-vs-rest
# classifier for `category` with class priors matched to the category's base
# rate in the corpus, then score it on the held-out split.
test_corpus, train_corpus = corpus[:150], corpus[150:]
train_set = [(features(words), labelize(category in categories))
             for (words, categories) in train_corpus]
test_set = [(features(words), labelize(category in categories))
            for (words, categories) in test_corpus]

model = MultinomialNB()
classifier = SklearnClassifier(model)
# Fit the wrapper's label encoder up front so the label ordering is known
# before training.  HACK: `_encoder` is a private attribute of
# SklearnClassifier — fragile across NLTK versions.
classifier._encoder.fit([category, "no"])
# Encoder sorts labels, so order is [category, "no"] unless "no" sorts first.
flip = classifier.labels()[0] == "no"
categorized_proportion = len([words for (words, categories) in corpus
                              if category in categories]) * 1.0 / len(corpus)
# Orient the priors to match the encoder's label order.
if flip:
    model.class_prior = [1 - categorized_proportion, categorized_proportion]
else:
    model.class_prior = [categorized_proportion, 1 - categorized_proportion]
classifier.train(train_set)

# Evaluate on the held-out split: raw predictions, the index sets of
# positive predictions/references, and overall accuracy.
test_results = classifier.classify_many([feat for (feat, label) in test_set])
pos_test_set = set(i for i, result in enumerate(test_results) if result == category)
reference_values = [label for (feat, label) in test_set]
pos_ref_set = set(i for i, (feat, label) in enumerate(test_set) if label == category)
accuracy = scores.accuracy(reference_values, test_results)
accuracies.append(accuracy)
# NOTE(review): this fragment is truncated at the start — it begins in the
# middle of a list literal (presumably `test_tweets = [...]`, whose opening
# is outside this view) — so the code is left byte-identical.  It duplicates
# the evaluation fragment seen earlier in this file: evaluate the trained
# NLTK classifier on `test_tweets`, then train and evaluate a scikit-learn
# tf-idf -> chi2 -> MultinomialNB pipeline wrapped in SklearnClassifier.
# The triple-quoted pipeline definition is a dead no-op string literal
# (commented-out code); the live definition immediately follows it.
(['not', 'like', 'that', 'man'], 'negative'), (['house', 'not', 'great'], 'negative'), (['your', 'song', 'annoying'], 'negative')] testing_set = nltk.classify.apply_features(extract_features, test_tweets) for (tweet, sentiment) in test_tweets: print(classifier.classify(extract_features(tweet))) print(nltk.classify.accuracy(classifier, testing_set)) classifier.show_most_informative_features(5) """ pipeline = Pipeline([('tfidf', TfidfTransformer()), ('chi2', SelectKBest(chi2, k='all')), ('nb', MultinomialNB())]) """ pipeline = Pipeline([('tfidf', TfidfTransformer()), ('chi2', SelectKBest(chi2, k='all')), ('nb', MultinomialNB())]) classif = SklearnClassifier(pipeline) classif.train(training_set) print(classif.labels()) for (tweet, sentiment) in test_tweets: print(classif.classify(extract_features(tweet))) print(nltk.classify.accuracy(classif, testing_set))
poscutoff = len(posfeats)*3/4 cls_set = ['pos', 'neg'] for i in range(0,2): print negfeats[i] print '\n------------------------------------------------\n' trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) classif = SklearnClassifier(LinearSVC()) classif.train(trainfeats) print classif.labels() test_skl = [] t_test_skl = [] for d in testfeats: test_skl.append(d[0]) t_test_skl.append(d[1]) print(set(t_test_skl)) result = [] for item in test_skl: p = classif.classify(item) result.append(p) print len(result) print len(t_test_skl)