def posVsRest(X, Y):
    # Y now contains 1 for tweets that were positive
    # and 0 for negative, neutral or irrelevant
    Y = u.tweak_labels(Y, ["positive"])
    classes = np.unique(Y)
    for c in classes:
        print("#%s: %i" % (c, sum(Y == c)))
    train_model(Models.create_ngram_model, X, Y, "posVsRest", True)
def sentimentAndNoSentiment(X, Y):
    # Y now contains 1 for tweets that were positive or negative
    # and 0 for neutral or irrelevant
    Y = u.tweak_labels(Y, ["positive", "negative"])
    classes = np.unique(Y)
    for c in classes:
        print("#%s: %i" % (c, sum(Y == c)))
    train_model(Models.get_best_union_model, X, Y, "sentimentVsNoSentiment", True)
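Both helpers lean on tweak_labels, which is not defined in these listings. A minimal sketch of what it needs to do, assuming Y is a NumPy array of string labels (the name and call signature come from the code above; the body is an assumption):

import numpy as np

def tweak_labels(Y, pos_sent_list):
    # Collapse the multi-class labels into a binary problem: 1 for every
    # label listed in pos_sent_list, 0 for everything else.
    pos = Y == pos_sent_list[0]
    for sent in pos_sent_list[1:]:
        pos |= (Y == sent)
    Y = np.zeros(Y.shape[0], dtype=int)
    Y[pos] = 1
    return Y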
    best_clf = create_ngram_model(best_params)
    return best_clf


if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print("#%s: %i" % (c, sum(Y_orig == c)))

    print("== Pos vs. neg ==")
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)

    print("== Pos/neg vs. irrelevant/neutral ==")
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])
    # best_clf = grid_search_model(create_ngram_model, X, Y,
    #                              name="sent vs rest", plot=True)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print("== Pos vs. rest ==")
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs rest", plot=True)
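load_sanders_data is defined elsewhere in the chapter. A minimal sketch, assuming the Sanders Twitter corpus has already been fetched and merged into a single CSV (the file name and the Sentiment/Text column names here are assumptions):

import csv
import numpy as np

def load_sanders_data(csv_path="corpus.csv"):
    # Returns the tweet texts and their sentiment labels
    # ("positive", "negative", "neutral", "irrelevant") as NumPy arrays.
    X, Y = [], []
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            X.append(row["Text"])
            Y.append(row["Sentiment"])
    return np.asarray(X), np.asarray(Y)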
        clf__alpha=0.01,
    )

    best_clf = create_ngram_model(best_params)
    return best_clf


if __name__ == "__main__":
    X_orig, Y_orig = load_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print("#%s: %i" % (c, sum(Y_orig == c)))

    print("== Pos vs. neg ==")
    pos_neg = np.logical_or(Y_orig == "Pos", Y_orig == "Neg")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["Pos"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)

    print("== Pos/neg vs. irrelevant/neutral ==")
    X = X_orig
    Y = tweak_labels(Y_orig, ["Pos", "Neg"])
    # best_clf = grid_search_model(create_ngram_model, X, Y,
    #                              name="sent vs rest", plot=True)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print("== Pos vs. rest ==")
    X = X_orig
    Y = tweak_labels(Y_orig, ["Pos"])
    train_model(get_best_model(), X, Y, name="pos vs rest", plot=True)
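The clf__alpha=0.01 key above is the tail of a best_params dict handed to create_ngram_model. In scikit-learn, double-underscore keys address the named steps of a Pipeline, so such a dict can be applied with set_params; a quick illustration using the pipeline factory from the next listing:

clf = create_ngram_model()
# 'clf' is the Pipeline step name of the MultinomialNB estimator,
# so clf__alpha routes to its alpha (smoothing) parameter.
clf.set_params(clf__alpha=0.01)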
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes proves to be quite robust to irrelevant features, learns fast,
# and doesn't need lots of storage. So why "naive"? Because it assumes the
# features are independent of one another.


def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline


if __name__ == "__main__":
    # load_sanders_data and tweak_labels are helpers from earlier
    # in the chapter.
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print("#%s: %i" % (c, sum(Y_orig == c)))

    print("== Pos vs. neg ==")
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])
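A quick smoke test of the pipeline on two made-up tweets (the texts and labels here are purely illustrative):

clf = create_ngram_model()
clf.fit(["I love this phone", "worst update ever"], [1, 0])
# "love" only occurs in the positive training example, so this
# should lean towards class 1.
print(clf.predict(["love it"]))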
    for idx in range(len(X_wrong)):
        print("clf.predict('%s')=%i instead of %i" %
              (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]))


if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print("#%s: %i" % (c, sum(Y_orig == c)))

    print("== pos vs. neg ==")
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    x = X_orig[pos_neg]
    y = Y_orig[pos_neg]
    y = tweak_labels(y, ["positive"])
    train_model(create_ngram_model, x, y, name="pos vs neg", plot=True)

    print("== pos/neg vs. irrelevant/neutral ==")
    x = X_orig
    y = tweak_labels(Y_orig, ["positive", "negative"])
    train_model(create_ngram_model, x, y, name="sent vs rest", plot=True)

    print("== pos vs. rest ==")
    x = X_orig
    y = tweak_labels(Y_orig, ["positive"])
    train_model(create_ngram_model, x, y, name="pos vs rest", plot=True)

    print("== neg vs. rest ==")
    x = X_orig
    y = tweak_labels(Y_orig, ["negative"])
    train_model(create_ngram_model, x, y, name="neg vs rest", plot=True)
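train_model itself is defined elsewhere in the chapter. A minimal sketch of the interface used in the last listing, assuming it receives a classifier factory and averages F1 over repeated random splits (the body, split counts, and metric choice are assumptions; the plotting path is omitted):

import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import f1_score

def train_model(clf_factory, X, Y, name="", plot=False):
    # Repeated random train/test splits; report the mean F1 score.
    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    scores = []
    for train, test in cv.split(X):
        clf = clf_factory()  # a fresh, untrained pipeline per split
        clf.fit(X[train], Y[train])
        scores.append(f1_score(Y[test], clf.predict(X[test])))
    print("%s: mean F1 = %.3f" % (name, np.mean(scores)))
    return np.mean(scores)

Note that the earlier listings pass an already-built classifier (get_best_model()) rather than a factory; a version supporting that would clone it per split with sklearn.base.clone instead of calling it.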