# Load the manually-labelled test set as (text, label) pairs.
X_test = format("trainingandtestdata/testdata.manual.2009.06.14.csv")
X_test, y_test = zip(*X_test)
X_test, y_test = list(X_test), list(y_test)

pr = Processor()

print '\nNo twitter-specific features'
print '#'*40

# ~8min processing phase
# The twitter-specific tokenizer makes the parsing slow, however
# the accuracy is much improved with it.
X_train, train_feats = pr.process(X_train, verbose=True)

# ~7min vectorizing phase
X_mat = pr.fit_transform(X_train, saveVectorizer=False, saveMatrix=False, verbose=True)

X_test, test_feats = pr.process(X_test, verbose=True)
X_test = pr.transform(X_test, saveMatrix=False, verbose=True)

# Compare the accuracy with and without the twitter-specific features.
# The features matrix must be scaled before it is concatenated with the
# ngrams matrix (a sketch of that step follows the baseline training below).

print '\nTF-IDF Unigrams and Bigrams || Logistic Regression classifier'
print '-'*40

clf = LR()

# Roughly 3 minutes of training
t0 = time.time()
print 'Training on %d samples...' % (X_mat.shape[0])
clf.fit(X_mat, y_train)
print 'Training time: %.0fs' % (time.time() - t0)
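# The comment above notes that the twitter-specific features must be scaled before
# being concatenated with the ngrams matrix. The lines below are a minimal sketch of
# one way to do that, assuming `train_feats`/`test_feats` are dense 2-D arrays of
# shape (n_samples, n_features); MinMaxScaler is an assumed scaler choice here, not
# necessarily the one used elsewhere in this project.
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_feats_scaled = scaler.fit_transform(np.asarray(train_feats, dtype=float))
test_feats_scaled = scaler.transform(np.asarray(test_feats, dtype=float))

# A sparse hstack keeps the combined matrices sparse, which LogisticRegression accepts.
X_mat_with_feats = hstack([X_mat, csr_matrix(train_feats_scaled)]).tocsr()
X_test_with_feats = hstack([X_test, csr_matrix(test_feats_scaled)]).tocsr()

# Refitting the classifier on X_mat_with_feats would give the "with twitter-specific
# features" side of the accuracy comparison mentioned above.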