def train_model():
    """Train, evaluate and persist the TIL-vs-decoy classifier.

    Steps, in order:

    1. Build the sample list by chaining the full TIL corpus (labelled 1),
       the samples from ``feat.TIL_false_pos_iter`` and the decoys from
       ``query_skip_decoys`` (both labelled 0).
    2. Split the samples 80/20 into train and test sets.
    3. Turn every sample into a vector with ``getWordVecs``, fit a
       ``StandardScaler`` on the training vectors only, and dump the
       scaler to ``f_norm_scale``.
    4. Fit an ``ExtraTreesClassifier``, print its accuracy on the whole
       test set and on the positive / negative parts separately, and dump
       the classifier to ``f_clf``.
    5. Plot the ROC curve of the test set with matplotlib.

    Takes no arguments and returns nothing; the results are the two joblib
    files plus the console output.
    """
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n * _DECOY_PROPORTION
    FP_n = feat.count_TIL_false_pos()
    wiki_n = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same Order-of-Mag
    # NOTE(review): this is 0 whenever FP_n < TIL_n -- confirm that
    # feat.TIL_false_pos_iter copes with a skip of 0.
    skip_FP = FP_n // TIL_n
    print("Skipping every {} value in FP".format(skip_FP))

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print("Loading features")
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    # Generator: nothing is read from the decoy DB until it is consumed below.
    ITR_decoy = query_skip_decoys()

    print("Building training set")
    ITR_train = list(feat.TIL_full_corpus_iter())

    print("Building the false positive set")
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print("Building corpus iter")
    ITR = list(feat.chainer(ITR_train, ITR_FP, ITR_decoy))

    # Label by the number of positives that were actually loaded rather than
    # by the separately queried TIL_n, so that a mismatch between the two
    # cannot silently mislabel rows.
    Y = np.zeros(len(ITR))
    Y[:len(ITR_train)] = 1.0

    x_train, x_test, y_train, y_test = train_test_split(
        ITR, Y, test_size=0.2)

    print("Proportion of answers {}/{}".format(y_train.sum(), y_test.sum()))

    def _embed(samples):
        # One getWordVecs result per (text, weight) sample, stacked row-wise.
        return np.concatenate([
            getWordVecs(text, weight, features, dimension)
            for text, weight in samples
        ])

    print("Calculating the wordVecs for train")
    vec_train = _embed(x_train)

    print("Building the scalar")
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print("Saving the scaler")
    joblib.dump(scaler, f_norm_scale)

    print("Scaling train vectors")
    vec_train = scaler.transform(vec_train)

    print("Calculating the wordVecs for test")
    vec_test = _embed(x_test)

    print("Scaling test vectors")
    vec_test = scaler.transform(vec_test)

    # Single-argument print(...) gives the same output on Python 2 as the
    # comma-separated print statement did, and also parses on Python 3.
    print("Train size/TP in sample {} {}".format(
        vec_train.shape, (y_train == 1).sum()))
    print("Test size/TP in sample {} {}".format(
        vec_test.shape, (y_test == 1).sum()))

    print("Training classifer")

    # Other candidates previously listed here as commented-out imports:
    # SGDClassifier(loss='log', penalty='l1', verbose=2),
    # LogisticRegression(C=2500, verbose=2), BayesianRidge, BernoulliNB,
    # GaussianNB, RandomForestClassifier.
    from sklearn.ensemble import ExtraTreesClassifier as Classifier
    # This seems to be the best... but high FP rate

    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
    clf.fit(vec_train, y_train)

    print('Test Accuracy: %.3f' % clf.score(vec_test, y_test))

    # Accuracy restricted to the true positives, then to everything else.
    vec_test = np.asarray(vec_test)
    y_test = np.asarray(y_test)
    idx_TP = y_test > 0
    for tag, mask in (("TP", idx_TP), ("FP", ~idx_TP)):
        print('Test Accuracy on %s: %.3f'
              % (tag, clf.score(vec_test[mask], y_test[mask])))

    print("Saving the classifer")
    joblib.dump(clf, f_clf)

    # Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    # Column 1 of predict_proba is used as the score for the positive class.
    pred_probas = clf.predict_proba(vec_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')

    plt.show()
# Create the score table cmd_template = ''' DROP TABLE IF EXISTS score; CREATE TABLE IF NOT EXISTS score ( idx INTEGER PRIMARY KEY NOT NULL, score FLOAT DEFAULT NULL ); INSERT INTO score (idx) SELECT decoy_idx FROM decoy; ''' #print "Templating" #conn_decoy.executescript(cmd_template) total_samples = feat.count_WIKI_corpus() cmd_check_remaining = '''SELECT COUNT(*) FROM score WHERE score IS NULL''' score_remaining = conn_decoy.execute(cmd_check_remaining).next() print "There are {} left to score".format(score_remaining) def getWordVecs(text, weight, model, dimension): # The vector is the entropy-weighted average of each word vec = np.zeros(dimension).reshape((1, dimension)) tokens = text.split() count = 0.0 for single_entropy, word in zip(weight, tokens):
from sklearn import preprocessing import build_features as feat #FLAG_BUILD_DECOY_LIST = True FLAG_BUILD_DECOY_LIST = False # When training, how big of a decoy set to use _DECOY_PROPORTION = 5 conn_decoy = feat.conn_decoy conn_train = feat.conn_train f_clf = "db/clf.joblib" f_norm_scale = "db/scale.joblib" total_samples = feat.count_WIKI_corpus() def query_skip_decoys(): cmd_select = '''SELECT tokens,weights FROM decoy AS A JOIN skip_decoy_query AS B ON A.decoy_idx=B.decoy_idx ''' cursor = conn_decoy.execute(cmd_select) for text,weight_str in cursor: # Load the entropy weight w = np.fromstring(weight_str[1:-1],sep=',') yield text, w def build_skip_query(skip_n): cmd_template = '''
def train_model():
    """Train, evaluate and persist the TIL-vs-decoy classifier.

    Steps, in order:

    1. Build the sample list by chaining the full TIL corpus (labelled 1),
       the samples from ``feat.TIL_false_pos_iter`` and the decoys from
       ``query_skip_decoys`` (both labelled 0).
    2. Split the samples 80/20 into train and test sets.
    3. Turn every sample into a vector with ``getWordVecs``, fit a
       ``StandardScaler`` on the training vectors only, and dump the
       scaler to ``f_norm_scale``.
    4. Fit an ``ExtraTreesClassifier``, print its accuracy on the whole
       test set and on the positive / negative parts separately, and dump
       the classifier to ``f_clf``.
    5. Plot the ROC curve of the test set with matplotlib.

    Takes no arguments and returns nothing; the results are the two joblib
    files plus the console output.
    """
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n * _DECOY_PROPORTION
    FP_n = feat.count_TIL_false_pos()
    wiki_n = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same Order-of-Mag
    # NOTE(review): this is 0 whenever FP_n < TIL_n -- confirm that
    # feat.TIL_false_pos_iter copes with a skip of 0.
    skip_FP = FP_n // TIL_n
    print("Skipping every {} value in FP".format(skip_FP))

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print("Loading features")
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    # Generator: nothing is read from the decoy DB until it is consumed below.
    ITR_decoy = query_skip_decoys()

    print("Building training set")
    ITR_train = list(feat.TIL_full_corpus_iter())

    print("Building the false positive set")
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print("Building corpus iter")
    ITR = list(feat.chainer(ITR_train, ITR_FP, ITR_decoy))

    # Label by the number of positives that were actually loaded rather than
    # by the separately queried TIL_n, so that a mismatch between the two
    # cannot silently mislabel rows.
    Y = np.zeros(len(ITR))
    Y[:len(ITR_train)] = 1.0

    x_train, x_test, y_train, y_test = train_test_split(
        ITR, Y, test_size=0.2)

    print("Proportion of answers {}/{}".format(y_train.sum(), y_test.sum()))

    def _embed(samples):
        # One getWordVecs result per (text, weight) sample, stacked row-wise.
        return np.concatenate([
            getWordVecs(text, weight, features, dimension)
            for text, weight in samples
        ])

    print("Calculating the wordVecs for train")
    vec_train = _embed(x_train)

    print("Building the scalar")
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print("Saving the scaler")
    joblib.dump(scaler, f_norm_scale)

    print("Scaling train vectors")
    vec_train = scaler.transform(vec_train)

    print("Calculating the wordVecs for test")
    vec_test = _embed(x_test)

    print("Scaling test vectors")
    vec_test = scaler.transform(vec_test)

    # Single-argument print(...) gives the same output on Python 2 as the
    # comma-separated print statement did, and also parses on Python 3.
    print("Train size/TP in sample {} {}".format(
        vec_train.shape, (y_train == 1).sum()))
    print("Test size/TP in sample {} {}".format(
        vec_test.shape, (y_test == 1).sum()))

    print("Training classifer")

    # Other candidates previously listed here as commented-out imports:
    # SGDClassifier(loss='log', penalty='l1', verbose=2),
    # LogisticRegression(C=2500, verbose=2), BayesianRidge, BernoulliNB,
    # GaussianNB, RandomForestClassifier.
    from sklearn.ensemble import ExtraTreesClassifier as Classifier
    # This seems to be the best... but high FP rate

    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
    clf.fit(vec_train, y_train)

    print('Test Accuracy: %.3f' % clf.score(vec_test, y_test))

    # Accuracy restricted to the true positives, then to everything else.
    vec_test = np.asarray(vec_test)
    y_test = np.asarray(y_test)
    idx_TP = y_test > 0
    for tag, mask in (("TP", idx_TP), ("FP", ~idx_TP)):
        print('Test Accuracy on %s: %.3f'
              % (tag, clf.score(vec_test[mask], y_test[mask])))

    print("Saving the classifer")
    joblib.dump(clf, f_clf)

    # Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    # Column 1 of predict_proba is used as the score for the positive class.
    pred_probas = clf.predict_proba(vec_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')

    plt.show()