from itertools import compress

from sklearn.metrics import accuracy_score, precision_score, recall_score


def calc_fit(model, metric, train_x, train_y, test_x, test_y, p):
    """Score a model trained on only the feature columns selected by mask `p`."""
    # Apply the mask to every row; list comprehensions instead of map() so
    # the data is a real list and can be iterated more than once on Python 3.
    train_x = [list(compress(x, p)) for x in train_x]
    test_x = [list(compress(x, p)) for x in test_x]
    clf = model.fit(train_x, train_y)
    predictions = clf.predict(test_x)
    if metric == 'precision':
        return precision_score(test_y, predictions, labels=[0, 1])
    elif metric == 'recall':
        return recall_score(test_y, predictions, labels=[0, 1])
    elif metric == 'accuracy':
        # accuracy_score takes no `labels` argument; the original passed
        # [0, 1] into the `normalize` slot by mistake.
        return accuracy_score(test_y, predictions)
    # Default: sum of all three metrics.
    return (precision_score(test_y, predictions, labels=[0, 1])
            + recall_score(test_y, predictions, labels=[0, 1])
            + accuracy_score(test_y, predictions))
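# A minimal usage sketch for calc_fit, assuming it scores one candidate
# feature mask (e.g. an individual in a genetic-algorithm feature search).
# The dataset, classifier, and 4-feature mask below are illustrative
# assumptions, not part of the original code.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
train_x, test_x, train_y, test_y = train_test_split(X, y, random_state=0)

mask = [1, 0, 1, 1]  # keep columns 0, 2, 3; drop column 1
print(calc_fit(LogisticRegression(), 'accuracy',
               train_x, train_y, test_x, test_y, mask))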
def Predict(self, inp, labels, classifier, folds, name, paramdesc):
    X = inp
    y = labels
    X, y = X[y != 2], y[y != 2]  # drop the third class; keep it binary
    n_samples, n_features = X.shape

    ###########################################################################
    # Classification and ROC analysis

    # Run the classifier with stratified cross-validation and plot a ROC
    # curve per fold (uses the current StratifiedKFold API: construct with
    # n_splits, then iterate over cv.split(X, y)).
    cv = StratifiedKFold(n_splits=folds)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    _precision = 0.0
    _recall = 0.0
    _accuracy = 0.0
    _f1 = 0.0
    for i, (train, test) in enumerate(cv.split(X, y)):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        pred_ = classifier.predict(X[test])
        _precision += precision_score(y[test], pred_)
        _recall += recall_score(y[test], pred_)
        _accuracy += accuracy_score(y[test], pred_)
        _f1 += f1_score(y[test], pred_)
        # Compute the ROC curve and the area under it for this fold.
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    _precision /= folds
    _recall /= folds
    _accuracy /= folds
    _f1 /= folds

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= folds
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic - {0}'.format(name))
    plt.legend(loc="lower right")
    plt.savefig(self.configObject['outputdir'] + '/' + name + '.png')
    plt.close()

    result = self.OutputResult(name, paramdesc, len(inp),
                               floor(labels.size / folds),
                               _precision, _recall, _accuracy, _f1)
    Announce(result)
from sklearn.metrics import (confusion_matrix, precision_recall_fscore_support,
                             recall_score)


def run_grid_search(grid_search, show_evaluation=True):
    """Fit the grid search and report evaluation metrics on a held-out split."""
    X_train, X_test, y_train, y_test = split_dataset()
    grid_search.fit(X_train, y_train)
    predictions = grid_search.predict(X_test)
    if show_evaluation:
        logger.debug("macro_recall: %s",
                     recall_score(y_test, predictions, average="macro"))
        logger.debug(precision_recall_fscore_support(y_test, predictions))
        logger.debug(confusion_matrix(y_test, predictions))
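# A hedged sketch of how run_grid_search might be invoked. The estimator and
# parameter grid are assumptions for illustration; only GridSearchCV itself is
# standard scikit-learn. split_dataset() and logger come from the surrounding
# module.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='recall_macro')
run_grid_search(grid_search)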
def run():
    paras, sents = create_dataset()
    X = np.array(get_features(paras))
    Y = np.array(get_ys(paras))
    print(len(X[0]))
    sents = np.array(sents)

    skf = StratifiedKFold(n_splits=10)
    f = open('results/correct.txt', 'w')
    f2 = open('results/wrong.txt', 'w')
    accs = []
    precs = []
    recs = []
    f1s = []
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        print(clf.coef_)

        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        accs.append(acc)
        precs.append(prec)
        recs.append(rec)
        f1s.append(f1)

        print('Acc \t %s' % acc)
        print('Prec \t %s' % prec)
        print('Recall \t %s' % rec)
        print('F1 \t %s' % f1)

        # Log each test sentence to correct.txt or wrong.txt depending on
        # whether its prediction matched the label.
        for index, (y_t, y_p) in zip(test_index, zip(y_test, y_pred)):
            out = f if y_t == y_p else f2
            out.write('%s\n' % sents[index])
            out.write('%s\n' % y_t)

    f.close()
    f2.close()

    print('Avg Acc \t %s' % np.mean(accs))
    print('Avg Prec \t %s' % np.mean(precs))
    print('Avg Recall \t %s' % np.mean(recs))
    print('Avg F1 \t %s' % np.mean(f1s))
for i in train:
    y_train.append(features[i][6])   # column 6 is the label
    x_train.append(features[i][:6])  # columns 0-5 are the features
y_test = []
x_test = []
for i in test:
    y_test.append(features[i][6])
    x_test.append(features[i][:6])

lr.fit(x_train, y_train)
lrPredTest = lr.predict(x_test)
lrPrecisionTest = precision_score(y_test, lrPredTest)
lrRecallTest = recall_score(y_test, lrPredTest)
lrF1Test = f1_score(y_test, lrPredTest)
lrAvgPrecision += lrPrecisionTest
lrAvgRecall += lrRecallTest
lrAvgF1 += lrF1Test

print("log reg completed in", time.time() - start, "s")
print("lr:\n Precision {}\n Recall {}\n F1 {}\n".format(
    lrAvgPrecision / 5, lrAvgRecall / 5, lrAvgF1 / 5))

start = time.time()

"""RANDOM FOREST"""
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5)
rfAvgPrecision = 0.0
rfAvgRecall = 0.0
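# The snippet above cuts off inside the random-forest section. A plausible
# continuation, assuming it mirrors the logistic-regression fold loop (same
# x_train/x_test split and the same 5-fold averaging); this is a sketch, not
# the original code:
rfAvgF1 = 0.0
rf.fit(x_train, y_train)
rfPredTest = rf.predict(x_test)
rfAvgPrecision += precision_score(y_test, rfPredTest)
rfAvgRecall += recall_score(y_test, rfPredTest)
rfAvgF1 += f1_score(y_test, rfPredTest)
print("rf:\n Precision {}\n Recall {}\n F1 {}\n".format(
    rfAvgPrecision / 5, rfAvgRecall / 5, rfAvgF1 / 5))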