X_test = format("trainingandtestdata/testdata.manual.2009.06.14.csv")
X_test, y_test = zip(*X_test)
X_test, y_test = map(lambda x: list(x), [X_test, y_test])
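
# A minimal sketch of what a loader like format() might do, assuming the
# standard Sentiment140 CSV layout (polarity in column 0, tweet text in
# column 5); the project's real helper may differ, and the name below is
# hypothetical.
import csv

def load_sentiment140(path):
    # Each row: polarity, id, date, query, user, text
    with open(path, 'rb') as f:
        return [(row[5], int(row[0])) for row in csv.reader(f)]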

pr = Processor()

print '\nNo Twitter-specific features'
print '#'*40

# ~8 min processing phase.
# The Twitter-specific tokenizer makes parsing slow, but it
#   substantially improves accuracy.
X_train, train_feats = pr.process(X_train, verbose=True)
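
# The Twitter-aware tokenization is presumably similar in spirit to NLTK's
# TweetTokenizer (an assumption; Processor's internals aren't shown here):
#
#   from nltk.tokenize import TweetTokenizer
#   tok = TweetTokenizer(reduce_len=True)
#   tok.tokenize('@user that was sooooo good! http://t.co/abc')
#   # -> ['@user', 'that', 'was', 'sooo', 'good', '!', 'http://t.co/abc']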

# ~7 min vectorizing phase.
X_mat = pr.fit_transform(X_train, saveVectorizer=False, saveMatrix=False, verbose=True)
X_test, test_feats = pr.process(X_test, verbose=True)
X_test = pr.transform(X_test, saveMatrix=False, verbose=True)
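
# For reference, the vectorizing step presumably wraps something like
# scikit-learn's TfidfVectorizer with unigrams and bigrams (per the header
# printed below); the key point is that the vectorizer is fit on the training
# tweets only and then reused on the test tweets:
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vec = TfidfVectorizer(ngram_range=(1, 2))
#   X_mat  = vec.fit_transform(X_train)  # learn vocabulary + idf weights
#   X_test = vec.transform(X_test)       # apply the fitted vocabulary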

# Compare accuracy with and without the Twitter-specific features.
# The features matrix must be scaled before it is concatenated with the
#   n-grams matrix.
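
# A sketch of that scaling + concatenation, assuming train_feats/test_feats
# are dense arrays of hand-crafted features (the combined-matrix names are
# hypothetical, and the exact scaler may differ):
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
train_feats_scaled = scaler.fit_transform(train_feats)  # fit on train only
test_feats_scaled = scaler.transform(test_feats)
X_mat_full = hstack([X_mat, csr_matrix(train_feats_scaled)]).tocsr()
X_test_full = hstack([X_test, csr_matrix(test_feats_scaled)]).tocsr()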
print '\nTF-IDF Unigrams and Bigrams || Logistic Regression classifier'
print '-'*40

clf = LR()
# Training takes roughly 3 minutes.
t0 = time.time()
print 'Training on %d samples...' % (X_mat.shape[0])
clf.fit(X_mat, y_train)
print 'Training time: %.0fs' % (time.time() - t0)
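
# A minimal follow-up sketch: accuracy of the fitted classifier on the
# held-out test set (clf.score computes mean accuracy).
print 'Test accuracy: %.4f' % clf.score(X_test, y_test)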