def train(infile_x="data/X-strat.npy", infile_y="data/y-strat.npy",
          outfile_model="data/model.pickle", verbose=4):
    """Fit the selected model on the training split and pickle it to disk.

    The model class is looked up in ``models`` under the ``<model_name>``
    entry of ``main.arguments`` and instantiated without arguments.

    Args:
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.
        outfile_model: path the fitted estimator is pickled to.
        verbose: accepted for interface compatibility; not used here.
    """
    data = Dataset(infile_x, infile_y)
    model_cls = models[main.arguments["<model_name>"]]
    estimator = model_cls()

    # Only the training half of the split is needed for fitting.
    features_train, _, labels_train, _ = data.train_test_split()
    estimator.fit(features_train, labels_train)

    with open(outfile_model, "wb") as handle:
        # Protocol -1 selects the highest pickle protocol available.
        cPickle.dump(estimator, handle, -1)

    # Single parenthesised argument: same output as the print statement.
    print("Saved file to {}".format(outfile_model))
def train(
    infile_x='data/X-strat.npy',
    infile_y='data/y-strat.npy',
    outfile_model='data/model.pickle',
    verbose=4,
):
    """Train the model named on the command line and save it as a pickle.

    ``main.arguments['<model_name>']`` selects a class from ``models``; an
    instance of it is fitted on the training portion returned by
    ``Dataset.train_test_split`` and written to ``outfile_model``.

    Args:
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.
        outfile_model: destination file for the pickled estimator.
        verbose: unused by this function.
    """
    dataset = Dataset(infile_x, infile_y)
    model_name = main.arguments['<model_name>']
    fitted = models[model_name]()

    split = dataset.train_test_split()
    train_inputs, _unused_x, train_targets, _unused_y = split
    fitted.fit(train_inputs, train_targets)

    with open(outfile_model, 'wb') as out:
        cPickle.dump(fitted, out, -1)  # -1: highest available protocol

    message = "Saved file to {}".format(outfile_model)
    print(message)
def report(infile_x='data/X-strat.npy', infile_y='data/y-strat.npy'):
    """Fit the selected model and print/plot its evaluation.

    Prints classification reports for the training and test splits, the
    test-set confusion matrix and the area under the precision-recall
    curve, then shows that curve in a plot window.

    Args:
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.
    """
    data = Dataset(infile_x, infile_y)
    estimator = models[main.arguments['<model_name>']]()
    X_train, X_test, y_train, y_test = data.train_test_split()
    estimator.fit(X_train, y_train)

    # Test labels come from the probability margin: class 1 is predicted
    # whenever its probability exceeds that of class 0.
    probas = estimator.predict_proba(X_test)
    margin = probas[:, 1] - probas[:, 0]
    test_pred = (margin > 0).astype(int)
    train_pred = estimator.predict(X_train)

    # Classification report
    # print("") emits the same single newline as a bare print statement.
    print("Classification report for training set:")
    print("")
    print(classification_report(y_train, train_pred))
    print("Classification report for test set:")
    print("")
    print(classification_report(y_test, test_pred))

    # Compute confusion matrix
    print("Confusion matrix:")
    matrix = confusion_matrix(y_test, test_pred)
    print(matrix)
    print("")

    # Precision-Recall curve
    precision, recall, _thresholds = precision_recall_curve(
        y_test, probas[:, 1])
    area = auc(recall, precision)
    print("Area Under Curve: %0.2f" % area)

    # Plot
    pl.clf()
    pl.plot(recall, precision, label='precision-recall curve')
    pl.xlabel('recall')
    pl.ylabel('precision')
    pl.ylim([0.0, 1.05])
    pl.xlim([0.0, 1.0])
    pl.title('Precision-Recall; AUC=%0.2f' % area)
    pl.legend(loc='lower left')
    pl.show()
def curve(infile_x='data/X-strat.npy', infile_y='data/y-strat.npy',
          learning_curve=learning_curve):
    """Plot training and test scores against the number of training cases.

    Args:
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.
        learning_curve: callable invoked as
            ``learning_curve(dataset, clf, verbose=1)`` and expected to
            return ``(scores_train, scores_test, sizes)``. Defaults to the
            module-level ``learning_curve`` bound at definition time.
    """
    data = Dataset(infile_x, infile_y)
    estimator = models[main.arguments['<model_name>']]()
    train_scores, test_scores, sizes = learning_curve(
        data, estimator, verbose=1)

    # (scores, line format, legend label), drawn in this order.
    series = (
        (train_scores, 'b', 'training set'),
        (test_scores, 'r', 'test set'),
    )
    for scores, fmt, label in series:
        pl.plot(sizes, scores, fmt, label=label)

    pl.xlabel('n training cases')
    pl.ylabel('score')
    pl.title('Learning curve')
    pl.legend(loc='lower right')
    pl.show()
def report(infile_x="data/X-strat.npy", infile_y="data/y-strat.npy"):
    """Evaluate the command-line selected model on a train/test split.

    The model is fitted on the training split. Output, in order:
    classification report for the training set, classification report for
    the test set, test-set confusion matrix, area under the
    precision-recall curve, and finally a plot of that curve.

    Args:
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.
    """
    dataset = Dataset(infile_x, infile_y)
    model_cls = models[main.arguments["<model_name>"]]
    clf = model_cls()
    X_train, X_test, y_train, y_test = dataset.train_test_split()
    clf.fit(X_train, y_train)

    y_pred_probas = clf.predict_proba(X_test)
    proba_neg = y_pred_probas[:, 0]
    proba_pos = y_pred_probas[:, 1]
    # Predict class 1 where its probability is strictly the larger one.
    y_pred = ((proba_pos - proba_neg) > 0).astype(int)
    y_pred_train = clf.predict(X_train)

    blank = ""  # printing this yields one empty line, like a bare print

    # Classification report
    print("Classification report for training set:")
    print(blank)
    print(classification_report(y_train, y_pred_train))
    print("Classification report for test set:")
    print(blank)
    print(classification_report(y_test, y_pred))

    # Compute confusion matrix
    print("Confusion matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(blank)

    # Precision-Recall curve
    pr_curve = precision_recall_curve(y_test, y_pred_probas[:, 1])
    precision, recall, _ = pr_curve
    area = auc(recall, precision)
    print("Area Under Curve: %0.2f" % area)

    # Plot
    pl.clf()
    pl.plot(recall, precision, label="precision-recall curve")
    pl.xlabel("recall")
    pl.ylabel("precision")
    pl.ylim([0.0, 1.05])
    pl.xlim([0.0, 1.0])
    pl.title("Precision-Recall; AUC=%0.2f" % area)
    pl.legend(loc="lower left")
    pl.show()
def _analyze(clf, infile_x="data/X-strat.npy", infile_y="data/y-strat.npy"):
    """Fit ``clf`` and rank test samples by their prediction margin.

    The margin of a test sample is ``predict_proba[:, 1] -
    predict_proba[:, 0]``. No threshold is applied: every returned tuple
    holds *all* test samples of one true class, only the ordering differs.

    Args:
        clf: estimator providing ``fit`` and ``predict_proba``.
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.

    Returns:
        ``(true_pos, true_neg, false_pos, false_neg)``, each a
        ``(samples, margins)`` pair:

        * ``true_pos``  -- samples with label 1, highest margin first
        * ``true_neg``  -- samples with label 0, lowest margin first
        * ``false_pos`` -- samples with label 0, highest margin first
        * ``false_neg`` -- samples with label 1, lowest margin first
    """
    def _ranked(samples, margins, order, descending=False):
        # Index afresh on every call so the results never share memory.
        picked_samples = samples[order]
        picked_margins = margins[order]
        if descending:
            return picked_samples[::-1], picked_margins[::-1]
        return picked_samples, picked_margins

    dataset = Dataset(infile_x, infile_y)
    X_train, X_test, y_train, y_test = dataset.train_test_split()
    clf.fit(X_train, y_train)

    probas = clf.predict_proba(X_test)
    margin = probas[:, 1] - probas[:, 0]

    is_pos = y_test == 1
    pos_samples, pos_margin = X_test[is_pos], margin[is_pos]
    pos_order = pos_margin.argsort()

    is_neg = y_test == 0
    neg_samples, neg_margin = X_test[is_neg], margin[is_neg]
    neg_order = neg_margin.argsort()

    true_pos = _ranked(pos_samples, pos_margin, pos_order, descending=True)
    true_neg = _ranked(neg_samples, neg_margin, neg_order)
    false_pos = _ranked(neg_samples, neg_margin, neg_order, descending=True)
    false_neg = _ranked(pos_samples, pos_margin, pos_order)
    return true_pos, true_neg, false_pos, false_neg
def search(infile_x='data/X-strat.npy', infile_y='data/y-strat.npy',
           verbose=4, n_jobs=1):
    """Run a grid search for the model selected on the command line.

    The parameter grid is the ``grid_search_params`` attribute of the model
    class, the folds come from ``dataset.split_indices`` and candidates are
    scored with ``f1_score``.

    Args:
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.
        verbose: forwarded to ``grid_search``.
        n_jobs: forwarded to ``grid_search``.

    Returns:
        Whatever ``grid_search`` returns.
    """
    model_cls = models[main.arguments['<model_name>']]
    data = Dataset(infile_x, infile_y)

    estimator = model_cls()
    param_grid = model_cls.grid_search_params
    options = {
        'verbose': verbose,
        'cv': data.split_indices,
        'score_func': f1_score,
        'n_jobs': n_jobs,
    }
    return grid_search(data, estimator, param_grid, **options)
def _analyze(clf, infile_x='data/X-strat.npy', infile_y='data/y-strat.npy'):
    """Train ``clf`` and return test samples ordered by class-1 margin.

    For each test sample the margin ``P(class 1) - P(class 0)`` is taken
    from ``clf.predict_proba``. Samples are grouped by their true label
    (1 or 0) and sorted by margin; nothing is filtered out by predicted
    class.

    Args:
        clf: estimator providing ``fit`` and ``predict_proba``.
        infile_x: first argument passed to ``Dataset``.
        infile_y: second argument passed to ``Dataset``.

    Returns:
        A 4-tuple ``(true_pos, true_neg, false_pos, false_neg)`` of
        ``(samples, margins)`` pairs. ``true_pos`` and ``false_neg`` both
        hold the label-1 samples (descending and ascending margin);
        ``false_pos`` and ``true_neg`` both hold the label-0 samples
        (descending and ascending margin).
    """
    dataset = Dataset(infile_x, infile_y)
    X_train, X_test, y_train, y_test = dataset.train_test_split()
    clf.fit(X_train, y_train)

    proba = clf.predict_proba(X_test)
    score = proba[:, 1] - proba[:, 0]

    # Label-1 test samples and the ascending order of their margins.
    mask_pos = y_test == 1
    X_pos = X_test[mask_pos]
    score_pos = score[mask_pos]
    asc_pos = score_pos.argsort()

    # Label-0 test samples and the ascending order of their margins.
    mask_neg = y_test == 0
    X_neg = X_test[mask_neg]
    score_neg = score[mask_neg]
    asc_neg = score_neg.argsort()

    # Most confidently positive label-1 samples first.
    true_pos = (X_pos[asc_pos][::-1], score_pos[asc_pos][::-1])
    # Most confidently negative label-0 samples first.
    true_neg = (X_neg[asc_neg], score_neg[asc_neg])
    # Label-0 samples that look most positive first.
    false_pos = (X_neg[asc_neg][::-1], score_neg[asc_neg][::-1])
    # Label-1 samples that look most negative first.
    false_neg = (X_pos[asc_pos], score_pos[asc_pos])

    return true_pos, true_neg, false_pos, false_neg