def classify():
    #corpus = 'Cornell_text_polarity'
    #corpus = 'BingLiu_selected_sentences'
    corpus = 'Cornell_sentence_polarity'
    cases = load_corpus(corpus)
    features = get_word_features(cases)

    train_feats = []
    test_feats = []
    for polarity, feats in features.items():
        #cutoff = len(feats) * 2 / 4
        cutoff = 1000
        print polarity, 'number of train:', cutoff
        #train_feats += feats[:cutoff]
        #test_feats += feats[cutoff:]
        # shuffle a copy so the train/test split is random but the
        # original feature list stays untouched
        temp_feats = feats[:]
        random.shuffle(temp_feats)
        train_feats += temp_feats[:cutoff]
        test_feats += temp_feats[cutoff:]

    print 'train on %d instances, test on %d instances' % (len(train_feats), len(test_feats))

    classifier = SvmClassifier.train(train_feats)
    print 'Test classify:', classifier.classify(dict([('I', 0.0), ('love', 1.0), ('you', 0.0)]))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test_feats)
    classifier.show_most_informative_features()
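# A minimal sketch of the data shape classify() assumes. load_corpus and
# get_word_features are project helpers not shown in this section; the return
# shape below is an ASSUMPTION inferred from how classify() consumes it, not
# the actual implementation.
def get_word_features_sketch(cases):
    # Assumed input: {'pos': [sentence, ...], 'neg': [sentence, ...]}.
    # Assumed output: polarity -> list of (featdict, label) pairs, since
    # train_feats is passed straight to SvmClassifier.train(), and each
    # featdict maps a word to a weight, matching the dict fed to
    # classifier.classify() above, e.g. {'I': 0.0, 'love': 1.0, 'you': 0.0}.
    features = {}
    for polarity, sentences in cases.items():
        features[polarity] = [(dict((word, 1.0) for word in sentence.split()), polarity)
                              for sentence in sentences]
    return features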
# filter the feature vectors:
fvecs = [(tweet_features.get_tweet_features(t, worstfeaturesfilter), s) for (t, s) in tweets]
v_train = fvecs[0:num_train]
#v_train = fvecs
v_test = fvecs[num_train:len(tweets)]

# dump tweets for which our feature selector found nothing
#for i in range(0, len(tweets)):
#    if tweet_features.is_zero_dict(fvecs[i][0]):
#        print tweets[i][1] + ': ' + tweets[i][0]

# train classifier
#classifier = nltk.NaiveBayesClassifier.train(v_train)
#classifier = nltk.classify.maxent.train_maxent_classifier_with_gis(v_train)
classifier = SvmClassifier.train(v_train)
#classifier = nltk.classify.maxent.train_maxent_classifier_with_gis(v_train, count_cutoff=2)
#classifier = nltk.classify.maxent.train_maxent_classifier_with_iis(v_train, count_cutoff=4)
#classifier = nltk.classify.maxent.train_maxent_classifier_with_scipy(v_train, algorithm='BFGS')

pickle.dump(worstfeaturesfilter, open("worstfeaturesfilter.pickle", "w"))
print "WARNING: NOT PICKLING CLASSIFIER ANYMORE"
#pickle.dump(classifier, open('classifier.pickle', 'w'))

#print classifier.classify(tweet_features.get_tweet_features("Christmas Eve without my, with cold feet and nobody to.", worstfeaturesfilter))
#print classifier.classify(tweet_features.get_tweet_features("Nochebuena sin mi @ tdomhan, con los pies fríos y nadie a quien abrazar.", worstfeaturesfilter))
#print classifier.classify(tweet_features.get_tweet_features("Nochebuena sin mi, con los pies fríos y nadie a quien abrazar.", worstfeaturesfilter))
#print classifier.explain(tweet_features.get_tweet_features("Nochebuena sin mi @ tdomhan, con los pies fríos y nadie a quien abrazar.", worstfeaturesfilter))

# classify and dump results for interpretation
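# A hedged sketch of the held-out evaluation the comment above points at:
# nltk.classify.util.accuracy is the standard NLTK helper and v_test is the
# slice built earlier; the per-tweet dump is one plausible way to eyeball
# misclassifications, not the project's actual reporting code.
import nltk.classify.util

print 'accuracy on %d held-out tweets:' % len(v_test)
print nltk.classify.util.accuracy(classifier, v_test)
# v_test was sliced from fvecs[num_train:], so it stays aligned with
# tweets[num_train:] and we can print the raw text of each miss:
for (featdict, label), (text, _) in zip(v_test, tweets[num_train:]):
    guess = classifier.classify(featdict)
    if guess != label:
        print '%s (gold %s): %s' % (guess, label, text)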
    tweets.append(item)
    if sentiment == "positive":
        # was "pos_tweets += item", which would splice the tuple's
        # elements into the list instead of appending the tweet itself
        pos_tweets.append(item)
    else:
        neg_tweets.append(item)

# hold out the last fifth of each class for testing
negcutoff, poscutoff = len(neg_tweets) * 4 / 5, len(pos_tweets) * 4 / 5
pos_train, pos_test = pos_tweets[:poscutoff], pos_tweets[poscutoff:]
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:]

neg_feats_train = get_train_features_from_tweets(neg_train, 'neg')
pos_feats_train = get_train_features_from_tweets(pos_train, 'pos')
train_feats = neg_feats_train + pos_feats_train

classifier = SvmClassifier.train(train_feats)
#classifier = nltk.NaiveBayesClassifier.train(train_feats)

# Evaluation
correct, wrong = 0, 0
for tweet in neg_test:
    features = get_features_from_tweet(tweet)
    result = classifier.classify(features)
    if result == "neg":
        correct += 1
    else:
        wrong += 1
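# The loop above only scores the negative test set. A sketch of the symmetric
# positive-side pass and the combined accuracy, reusing the same counters; an
# ASSUMPTION about how the original continued, not recovered code:
for tweet in pos_test:
    features = get_features_from_tweet(tweet)
    result = classifier.classify(features)
    if result == "pos":
        correct += 1
    else:
        wrong += 1
# float() guards against Python 2 integer division
print 'accuracy: %.3f' % (float(correct) / (correct + wrong))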