from classifiers import Classifier
from db import DataManager

# Number of times the experiment below is repeated.
N_TIMES = 1

# NOTE(review): the original line breaks/indentation of this script were lost;
# the statements below are assumed to form the loop body -- confirm against
# the upstream file.
for i in range(N_TIMES):
    # Progress marker: 1-based index of the current repetition.
    # Single-argument parenthesised form prints the same text on Python 2
    # and also parses on Python 3.
    print("%d times" % (i + 1))

    DATABASE = "us_twitter.db"
    split = 0.8  # NOTE(review): not read anywhere in the visible code
    db_mgr = DataManager(DATABASE)

    # Training data comes from select_wikipedia_train(); test data is pulled
    # from the "us_tweets" table via select_tweets(). The last two values
    # returned by select_tweets() are not used here.
    train_tweets, train_labels = db_mgr.select_wikipedia_train()
    # NOTE(review): `state_fips` (passed as `label`) is not defined or
    # imported in the visible code -- confirm where it comes from.
    test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(
        limit=10,
        state_fips=True,
        table="us_tweets",
        label=state_fips,
    )

    # NOTE(review): `get` and `get_vectorizer` are likewise not imported in
    # the visible code -- confirm they are in scope.
    results = get("results.json")
    vectorizer = get_vectorizer("tfidf", min_df=1)

    # Classifiers under comparison, keyed by display name.
    classifiers = {
        "BernoulliNB": Classifier(classifier="bnb"),
        "MultinomialNB": Classifier(classifier="nb"),
        "KNN-1000": Classifier(classifier="knn", k=1000),
        "KNN-2000": Classifier(classifier="knn", k=2000),
        # Alternative: build the SVM from parameters instead of loading it.
        # "SVC": Classifier(classifier="svm", params={"C" : 1.0,"kernel" : 'linear','verbose':True})
        "SVC": Classifier(load="classifier-SVC"),
    }

    # Vectorizing Training Data
# ["tweets", "preprocess", "grid_5_label",grid_5_degree,True], # ["tweets", "preprocess", "grid_10_label",grid_10_degree,True] ] for p in range(0,len(params)): print params[p] TRAINING, PREPROCESSING, LABEL_FUNC, label_func, preprocess = params[p] for i in range(0,N_TIMES): print i+1, "times" DATABASE = "us_twitter.db" split = 0.8 db_mgr = DataManager(DATABASE) if TRAINING == "tweets": train_tweets, train_labels, test_tweets, test_labels = db_mgr.select_tweets(limit=SIZE, preprocess=preprocess, table="us_tweets", split=0.8, label=label_func) else: train_tweets, train_labels = db_mgr.select_wikipedia_train() test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=(SIZE * 0.2), state_fips=True, table="us_tweets", label=label_func) # print "Train Size:", len(train_tweets) # print "Test Size:", len(test_tweets) vectorizer = get_vectorizer(VECTORIZER, min_df=1) classifiers = { "BernoulliNB": Classifier(classifier="bnb"), "MultinomialNB": Classifier(classifier="nb"), # "KNN-50": Classifier(classifier="knn", k=50), # "KNN-100": Classifier(classifier="knn", k=100), # "KNN-1000": Classifier(classifier="knn", k=1000),