Exemplo n.º 1
0
from classifiers import Classifier

from db import DataManager

N_TIMES = 1

# Repeat the experiment setup N_TIMES times. Each pass rebuilds the data
# manager, the train/test data, the vectorizer and the classifier table from
# scratch. Python 2 syntax (print statement).
for i in range(0,N_TIMES):
  # Progress marker: 1-based repetition number followed by the word "times".
  print i+1, "times"
  DATABASE = "us_twitter.db"

  # Not referenced in the visible part of this loop.
  # NOTE(review): presumably a train/test split fraction used further down -- confirm.
  split = 0.8

  db_mgr = DataManager(DATABASE)

  # Training data: two values returned by select_wikipedia_train(), unpacked
  # as texts and labels.
  train_tweets, train_labels = db_mgr.select_wikipedia_train()
  # Test data: select_tweets() returns four values here; only the first two
  # are kept and the last two are discarded into dummy1/dummy2.
  # NOTE(review): `state_fips` (the value passed as `label=`) is not defined
  # or imported in the visible excerpt -- confirm it is in scope at runtime.
  test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=10, state_fips=True, table="us_tweets", label=state_fips)

  # NOTE(review): `get` is not imported in the visible excerpt; presumably it
  # loads previously stored results from "results.json" -- confirm.
  results = get("results.json")

  # NOTE(review): `get_vectorizer` is not imported in the visible excerpt.
  vectorizer = get_vectorizer("tfidf", min_df=1)

  # Classifiers under comparison, keyed by a display name.
  classifiers = {
    "BernoulliNB": Classifier(classifier="bnb"),
    "MultinomialNB": Classifier(classifier="nb"),
    "KNN-1000": Classifier(classifier="knn", k=1000),
    "KNN-2000": Classifier(classifier="knn", k=2000),
    # The "SVC" entry is created with `load=` instead of the fresh linear-SVM
    # construction kept (disabled) on the next line.
    # NOTE(review): presumably this loads a previously saved model named
    # "classifier-SVC" -- confirm against Classifier.
    # "SVC": Classifier(classifier="svm", params={"C" : 1.0,"kernel" : 'linear','verbose':True})
    "SVC": Classifier(load="classifier-SVC")
  }

  # Vectorizing Training Data
Exemplo n.º 2
0
          # ["tweets", "preprocess", "grid_5_label",grid_5_degree,True],
          # ["tweets", "preprocess", "grid_10_label",grid_10_degree,True]
         ]
# Sweep over every configuration in `params` (a list defined above this
# excerpt) and, for each one, repeat the experiment setup N_TIMES times.
# Python 2 syntax (print statement). SIZE, VECTORIZER, N_TIMES and
# get_vectorizer are defined outside the visible excerpt.
for p in range(0,len(params)):
  print params[p]
  # Each configuration is a 5-item sequence. TRAINING selects the training
  # source below, `label_func` is forwarded as `label=` and `preprocess` as
  # `preprocess=`. PREPROCESSING and LABEL_FUNC are not used in the visible
  # part of the loop.
  # NOTE(review): presumably they are descriptive names used for reporting -- confirm.
  TRAINING, PREPROCESSING, LABEL_FUNC, label_func, preprocess = params[p]
  for i in range(0,N_TIMES):
    # Progress marker: 1-based repetition number followed by the word "times".
    print i+1, "times"
    DATABASE = "us_twitter.db"

    # NOTE(review): this variable is not read in the visible code; the
    # select_tweets() call below passes the literal 0.8 instead of `split`.
    split = 0.8

    db_mgr = DataManager(DATABASE)

    if TRAINING == "tweets":
      # Train and test sets both come from one select_tweets() call with
      # split=0.8; the four return values are unpacked as
      # train texts, train labels, test texts, test labels.
      train_tweets, train_labels, test_tweets, test_labels = db_mgr.select_tweets(limit=SIZE, preprocess=preprocess, table="us_tweets", split=0.8, label=label_func)
    else:
      # Any other TRAINING value: train on select_wikipedia_train() and draw
      # the test set from the tweets table. The last two return values of
      # select_tweets() are discarded. `preprocess` is not forwarded here.
      train_tweets, train_labels = db_mgr.select_wikipedia_train()
      # NOTE(review): SIZE * 0.2 multiplies by a float literal, so `limit` is
      # a float -- confirm select_tweets() accepts a non-integer limit.
      test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=(SIZE * 0.2), state_fips=True, table="us_tweets", label=label_func)
    # Disabled debug output of the resulting set sizes.
    # print "Train Size:", len(train_tweets)
    # print "Test Size:", len(test_tweets)


    vectorizer = get_vectorizer(VECTORIZER, min_df=1)

    classifiers = {
      "BernoulliNB": Classifier(classifier="bnb"),
      "MultinomialNB": Classifier(classifier="nb"),
      # "KNN-50": Classifier(classifier="knn", k=50),
      # "KNN-100": Classifier(classifier="knn", k=100),
      # "KNN-1000": Classifier(classifier="knn", k=1000),