def kfold(k=10): print "Loading data." videos, users, reviews = load_data() print "Extracting features." orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews]) feats = create_features(orig_X, users) #y = np.array([1 if x['spam'] == 'true' else 0 for x in reviews]) y = np.array([1 if x['adult'] == 'true' else 0 for x in reviews]) print "Vectorizing features." v = DictVectorizer(sparse=False) feats = v.fit_transform(feats) print "Starting K-fold cross validation." cv = cross_validation.KFold(len(feats), k=k, indices=True, shuffle=True, random_state=1234) cls = LogisticRegression(penalty='l2', tol=0.00001, fit_intercept=False, dual=False, C=2.4105, class_weight=None) if PRINT_COEFS: cls.fit(feats, y) c = v.inverse_transform(cls.coef_) for key, val in sorted(c[0].iteritems(), key=lambda x: x[1]): # if isinstance(key, str) and key.startswith("_"): print key, val quit() f1sum = 0 for i, (train_idx, test_idx) in enumerate(cv): train_X, train_y, test_X, test_y = feats[train_idx], \ y[train_idx], feats[test_idx], y[test_idx] cls.fit(train_X, train_y) preds = cls.predict(test_X) if PRINT_ERRORS: # worst = np.argsort(np.abs(test_y - preds)) #for j in worst[-1:-10:-1]: orig_test = orig_X[test_idx] # for j in worst: for j in range(len(orig_test)): if test_y[j] != preds[j]: print j, orig_test[j][1], test_y[j], preds[j] #quit() f1 = metrics.f1_score(test_y, preds) print "Fold %d F1 score: %.5f" % (i, f1) f1sum += f1 avgf1 = (f1sum / k) print "Mean F1 score: %.5f" % (f1sum / k) # scores = cross_validation.cross_val_score(cls, feats, y, cv=10, score_func=metrics.f1_score) # for i, score in enumerate(scores): # print "Fold %d: %.5f" % (i, score) # print "Mean score: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() / 2) return avgf1
import numpy as np
import cPickle

from features import create_features, PROJECT
from parse import load_data
from dict_vectorizer import DictVectorizer

# Script: fit a DictVectorizer on the full review data set and pickle it,
# so that test-time code can apply the identical feature transformation
# via v.transform().
videos, users, reviews = load_data()
orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
# NOTE(review): users is passed as None here, whereas kfold() passes the
# real users mapping -- presumably user-based features are deliberately
# excluded when fitting the shared vectorizer; confirm with callers.
feats = create_features(orig_X, None)

v = DictVectorizer(sparse=False)
feats = v.fit_transform(feats)
# feats is now in vectorized format.
# v.transform() is the transformation that needs to be used on test data.

# Use a context manager so the pickle file is flushed and closed even if
# dump() raises; the original leaked the open file handle.
with open(PROJECT + "db/dictvectorizer.pickle", "wb") as out:
    cPickle.dump(v, out)