def __init__(self):
    """Load the Wikipedia training set and train a Multinomial NB model.

    On construction: pulls the training tweets/labels from the database
    named by ``self.DATABASE``, builds a TF-IDF vectorizer over them, and
    fits the classifier — so the instance is ready to predict immediately.
    """
    manager = DataManager(self.DATABASE)
    self.train_tweets, self.train_labels = manager.select_wikipedia_train()

    # Vectorize first, then fit: the classifier consumes the sparse matrix
    # produced by fit_transform.
    self.vectorizer = get_vectorizer("tfidf", min_df=1)
    self.train_data = self.vectorizer.fit_transform(self.train_tweets)

    self.nb = Classifier(classifier="nb")
    self.nb.fit(self.train_data, self.train_labels)
from features import * from classifiers import Classifier from db import DataManager N_TIMES = 1 for i in range(0,N_TIMES): print i+1, "times" DATABASE = "us_twitter.db" split = 0.8 db_mgr = DataManager(DATABASE) train_tweets, train_labels = db_mgr.select_wikipedia_train() test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=10, state_fips=True, table="us_tweets", label=state_fips) results = get("results.json") vectorizer = get_vectorizer("tfidf", min_df=1) classifiers = { "BernoulliNB": Classifier(classifier="bnb"), "MultinomialNB": Classifier(classifier="nb"), "KNN-1000": Classifier(classifier="knn", k=1000), "KNN-2000": Classifier(classifier="knn", k=2000), # "SVC": Classifier(classifier="svm", params={"C" : 1.0,"kernel" : 'linear','verbose':True}) "SVC": Classifier(load="classifier-SVC") }
import pickle from lib import * from labels import * from metrics import * from features import * from db import DataManager from classifiers import Classifier DATABASE = "us_twitter.db" db_mgr = DataManager(DATABASE) train_data, train_labels = db_mgr.select_wikipedia_train() vectorizers = { "count":get_vectorizer("tfidf", min_df=1), "tfidf":get_vectorizer("count", min_df=1) } print "Vectorizing Training Data..." count_data = vectorizers["count"].fit_transform(train_data) tf_idf_data = vectorizers["tfidf"].fit_transform(train_data) classifiers = { "BernoulliNB": { "count":Classifier(classifier="bnb"), "tfidf":Classifier(classifier="bnb") }, "MultinomialNB": { "count":Classifier(classifier="nb"), "tfidf":Classifier(classifier="nb")