def gainsChart(self, clf, X_train, y_train):
    # plot_cumulative_gain expects class probabilities, not the raw feature matrix
    predictions = clf.predict_proba(X_train)
    plot_cumulative_gain(y_train, predictions, figsize=(12, 8),
                         title_fontsize=20, text_fontsize=18)
def test_ax(self):
    np.random.seed(0)
    clf = LogisticRegression()
    clf.fit(self.X, self.y)
    probas = clf.predict_proba(self.X)
    fig, ax = plt.subplots(1, 1)
    out_ax = plot_cumulative_gain(self.y, probas)
    assert ax is not out_ax
    out_ax = plot_cumulative_gain(self.y, probas, ax=ax)
    assert ax is out_ax
def log_cumulative_gain(y_true, y_pred, experiment=None, channel_name='metric_charts', prefix=''):
    """Creates cumulative gain chart and logs it to Neptune.

    Args:
        y_true (array-like, shape (n_samples)): Ground truth (correct) target values.
        y_pred (array-like, shape (n_samples, 2)): Predictions for classes 0 and 1 with values from 0 to 1.
        experiment (`neptune.experiments.Experiment`): Neptune experiment. Default is None.
        channel_name (str): Name of the Neptune channel. Default is 'metric_charts'.
        prefix (str): Prefix that will be added before the metric name when logged to Neptune.

    Examples:
        Train the model and make predictions on test::

            from sklearn.datasets import make_classification
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.model_selection import train_test_split
            from sklearn.metrics import classification_report

            X, y = make_classification(n_samples=2000)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            model = RandomForestClassifier()
            model.fit(X_train, y_train)

            y_test_pred = model.predict_proba(X_test)

        Create and log cumulative gain chart to Neptune::

            import neptune
            from neptunecontrib.monitoring.metrics import log_cumulative_gain

            neptune.init()
            with neptune.create_experiment():
                log_cumulative_gain(y_test, y_test_pred)

        Check out this experiment https://ui.neptune.ai/o/neptune-ai/org/binary-classification-metrics/e/BIN-101/logs.
    """
    assert len(y_pred.shape) == 2, 'y_pred needs to be (n_samples, 2), use expand_prediction helper to format it'

    _exp = experiment if experiment else neptune

    expect_not_a_run(_exp)

    fig, ax = plt.subplots()
    plt_metrics.plot_cumulative_gain(y_true, y_pred, ax=ax)
    send_figure(fig, channel_name=prefix + channel_name, experiment=_exp)
    plt.close()
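# The assert above rejects 1-D prediction vectors: probabilities must arrive as
# shape (n_samples, 2). If a model only returns the positive-class probabilities,
# they can be expanded with the expand_prediction helper referenced in the assert
# message, or with a plain numpy sketch like the one below (the variable
# y_test_pred_pos is illustrative, not part of the function above):
import numpy as np

y_test_pred_pos = np.array([0.1, 0.7, 0.4])           # 1-D positive-class probabilities
y_test_pred = np.column_stack([1 - y_test_pred_pos,   # P(class 0)
                               y_test_pred_pos])       # P(class 1) -> shape (n_samples, 2)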
def lift_gain_curves(self):
    y_pred_proba_both_classes = np.column_stack(
        [1 - self.y_pred_proba, self.y_pred_proba])
    gain = plot_cumulative_gain(self.y_test, y_pred_proba_both_classes,
                                title='Cumulative Gains Curve')
    plt.show()
    lift = plot_lift_curve(self.y_test, y_pred_proba_both_classes,
                           title='Lift curve')
    plt.show()
def k_NN(self):
    """Classify the patients using k-NN, selecting the best number of neighbours
    both via a 5-fold and a LOO cross-validation procedure. Plot the two estimated
    errors for each possible value of k. Add to the plot the corresponding test
    errors (i.e., the test error you would have obtained fitting k-NN with the
    same k) and comment on the results."""
    nnbs = KNeighborsClassifier(n_neighbors=10)
    modl = nnbs.fit(self.Xtrain, self.ytrain)
    score = modl.score(self.Xtest, self.ytest)
    ypred = np.array(modl.predict(self.Xtest).tolist())
    if self.plot:
        # hard 0/1 predictions stacked into a (n_samples, 2) pseudo-probability array
        yprobas = np.append((1 - ypred).reshape(-1, 1), ypred.reshape(-1, 1), axis=1)
        plot_cumulative_gain(self.ytest.values, yprobas)
        plt.title("Cumulative Gains Curve of kNN prediction with\n"
                  + f"binary accuracy of {100*score:.2f}%")
        plt.show()

    # Research which k in kNN is the best:
    num = 100
    klin = np.linspace(1, 100, num)
    scorearr = np.zeros(num)
    for counter, i in enumerate(klin):
        nnbs = KNeighborsClassifier(n_neighbors=int(i))
        modl = nnbs.fit(self.Xtrain, self.ytrain)
        score = modl.score(self.Xtest, self.ytest)
        scorearr[counter] = score
    if self.plot:
        plt.plot(klin, scorearr)
        plt.title("The kNN method analysis for multiple k values.")
        plt.grid()
        plt.xlabel("k")
        plt.ylabel("Prediction scores")
        plt.show()
    return 1
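# The docstring above asks for k to be selected via both 5-fold and leave-one-out
# cross-validation, which the body does not implement (it only scans test accuracy).
# A minimal sketch of that selection, assuming the same Xtrain/ytrain data and
# scikit-learn; the helper name select_k_by_cv and its variables are illustrative only.
import numpy as np
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

def select_k_by_cv(Xtrain, ytrain, k_values=range(1, 101)):
    cv_err_5fold, cv_err_loo = [], []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        # 5-fold CV error = 1 - mean cross-validated accuracy
        cv_err_5fold.append(1 - cross_val_score(knn, Xtrain, ytrain, cv=5).mean())
        # leave-one-out CV error
        cv_err_loo.append(1 - cross_val_score(knn, Xtrain, ytrain, cv=LeaveOneOut()).mean())
    best_k = list(k_values)[int(np.argmin(cv_err_5fold))]
    return best_k, cv_err_5fold, cv_err_loo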
def plot_analysis(combine, test_name, y_true, y_pred, y_proba, labels, verbose,
                  library, save=True, show=True, sessionid="testing", prefix=""):
    met_index = 0
    plt.rcParams.update({'font.size': 14})

    # TODO: Find a way to do this better
    pltmetrics.plot_confusion_matrix(y_true, y_pred)
    if not combine:
        #plt.gcf().set_size_inches(3.65, 3.65)
        save_show(plt, library + "/" + prefix, sessionid, "confusion_matrix",
                  show, save, False, True, True, False)
    else:
        plt.subplot(2, 4, met_index + 1)
        met_index += 1

    plt.rcParams.update({'font.size': 12})
    pltmetrics.plot_roc_curve(y_true, y_proba)
    for text in plt.gca().legend_.get_texts():
        text.set_text(text.get_text().replace("ROC curve of class", "class"))
        text.set_text(text.get_text().replace("area =", "AUC: "))
        text.set_text(text.get_text().replace("micro-average ROC curve", "micro-avg"))
        text.set_text(text.get_text().replace("macro-average ROC curve", "macro-avg"))
    if not combine:
        #plt.gcf().set_size_inches(3.65, 3.65)
        save_show(plt, library + "/" + prefix, sessionid, "roc_curves",
                  show, save, False, True, True, False)
    else:
        plt.subplot(2, 4, met_index + 1)
        met_index += 1

    if len(labels) < 3:
        pltmetrics.plot_ks_statistic(y_true, y_proba)
        if not combine:
            #plt.gcf().set_size_inches(3.65, 3.65)
            save_show(plt, library + "/" + prefix, sessionid, "ks_statistics",
                      show, save, False, True, True, False)
        else:
            plt.subplot(2, 4, met_index + 1)
            met_index += 1

    pltmetrics.plot_precision_recall_curve(y_true, y_proba)
    for text in plt.gca().legend_.get_texts():
        text.set_text(text.get_text().replace("Precision-recall curve of class", "class"))
        text.set_text(text.get_text().replace("area =", "AUC: "))
        text.set_text(text.get_text().replace("micro-average Precision-recall curve", "micro-avg"))
        text.set_text(text.get_text().replace("macro-average Precision-recall", "macro-avg"))
    if not combine:
        #plt.gcf().set_size_inches(3.65, 3.65)
        save_show(plt, library + "/" + prefix, sessionid, "precision_recall_curve",
                  show, save, False, True, True, False)
    else:
        plt.subplot(2, 4, met_index + 1)
        met_index += 1

    if len(labels) < 3:
        pltmetrics.plot_cumulative_gain(y_true, y_proba)
        if not combine:
            #plt.gcf().set_size_inches(3.65, 3.65)
            save_show(plt, library + "/" + prefix, sessionid, "cumulative_gain",
                      show, save, False, True, True, False)
        else:
            plt.subplot(2, 4, met_index + 1)
            met_index += 1

    if len(labels) < 3:
        pltmetrics.plot_lift_curve(y_true, y_proba)
        if not combine:
            #plt.gcf().set_size_inches(3.65, 3.65)
            save_show(plt, library + "/" + prefix, sessionid, "lift_curve",
                      show, save, False, True, True, False)
        else:
            plt.subplot(2, 4, met_index + 1)
            met_index += 1

    if combine:
        plt.suptitle(test_name)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        save_show(plt, library, sessionid, figname, show, save, True, analysis=True)
def test_array_like(self):
    plot_cumulative_gain([0, 1], [[0.8, 0.2], [0.2, 0.8]])
    plot_cumulative_gain([0, 'a'], [[0.8, 0.2], [0.2, 0.8]])
    plot_cumulative_gain(['b', 'a'], [[0.8, 0.2], [0.2, 0.8]])
def test_string_classes(self):
    np.random.seed(0)
    clf = LogisticRegression()
    clf.fit(self.X, convert_labels_into_string(self.y))
    probas = clf.predict_proba(self.X)
    plot_cumulative_gain(convert_labels_into_string(self.y), probas)
plt.plot(np.arange(n), Train_acc, label="train", color="b")
plt.plot([0, n], [train_acc, train_acc], label="avg-train", linestyle=":", color="b")
plt.plot(np.arange(n), Test_acc, label="test", color="y")
plt.plot([0, n], [test_acc, test_acc], label="avg-test", linestyle=":", color="y")
plt.xlabel("n"), plt.ylabel("accuracy %")
plt.legend(), plt.grid(), plt.show()

# False-negative and false-positive error rates
print("False-neg. rate: %.3f" % false_neg)
print("False-pos. rate: %.3f" % false_pos)
"""
output:
False-neg. rate: 0.305
False-pos. rate: 0.172
"""

# Cumulative gains curve of last train_test_split sample
p_test = np.ravel(p_test)
y_probas = np.zeros((len(p_test), 2))
y_probas[:, 0] = 1 - p_test
y_probas[:, 1] = p_test
plot_cumulative_gain(y_true=y_test, y_probas=y_probas)
plt.show()
def plot_lift(y_true, y_pred):
    plot_cumulative_gain(y_true, y_pred)
    plt.show()
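# Note that plot_lift above actually draws a cumulative gains chart despite its name.
# scikit-plot also provides plot_lift_curve (used in other snippets in this section);
# a minimal sketch showing both charts side by side -- the helper name
# plot_gain_and_lift is illustrative, not part of any library:
import matplotlib.pyplot as plt
from scikitplot.metrics import plot_cumulative_gain, plot_lift_curve

def plot_gain_and_lift(y_true, y_probas):
    # y_probas is expected as shape (n_samples, 2) with class probabilities
    fig, (ax_gain, ax_lift) = plt.subplots(1, 2, figsize=(12, 5))
    plot_cumulative_gain(y_true, y_probas, ax=ax_gain)
    plot_lift_curve(y_true, y_probas, ax=ax_lift)
    plt.show()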
def GridSearch(self, X_test, X_train, y_test, y_train, lmbd_vals, eta_vals, iterations):
    """
    Compares learning rates and regularization parameters over a grid of cases
    and calculates the AUC score and accuracy score for each combination.
    """
    train_accuracy = np.zeros((len(eta_vals), len(lmbd_vals)))
    test_accuracy = np.zeros((len(eta_vals), len(lmbd_vals)))
    roc_score_test = np.zeros((len(eta_vals), len(lmbd_vals)))
    roc_score_train = np.zeros((len(eta_vals), len(lmbd_vals)))
    NN_numpy = np.zeros((len(eta_vals), len(lmbd_vals)), dtype=object)

    # grid search
    for i in range(len(eta_vals)):
        for j in range(len(lmbd_vals)):
            self.create_biases_and_weights()
            #self.print_values()
            self.train(iterations)
            test_pred = self.predict(X_test)
            train_pred = self.predict(X_train)
            accuracy = accuracy_score(y_test, test_pred)
            #train_accuracy[i][j] = metrics.accuracy_score(y_train, train_pred.round(), normalize=False)
            #test_accuracy[i][j] = metrics.accuracy_score(y_test, test_pred)
            train_accuracy[i][j] = accuracy_score(y_train, train_pred)
            test_accuracy[i, j] = accuracy_score(y_test, test_pred)
            roc_score_test[i, j] = metrics.roc_auc_score(y_test, test_pred)
            roc_score_train[i, j] = metrics.roc_auc_score(y_train, train_pred)

            if accuracy > self.best_accuracy:
                self.best_accuracy = accuracy
                self.best_lmbd = lmbd_vals[j]
                self.best_eta = eta_vals[i]
            #print('Accuracy score on test data:', accuracy)

    print('best accuracy:', self.best_accuracy)
    print('lambda:', self.best_lmbd)
    print('Learning rate:', self.best_eta)
    #print('Train Area ratio:', np.mean(roc_score_train))
    #print('Test Area ratio:', np.mean(roc_score_test))

    sns.set()
    sns.heatmap(train_accuracy, annot=True, cmap="viridis", fmt='.4g')
    plt.title("Training Accuracy")
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    b, t = plt.ylim()
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)
    #plt.savefig('traing_accuracy_cc_nn.png')
    plt.show()

    sns.heatmap(test_accuracy, annot=True, cmap="viridis", fmt='.4g')
    plt.title("Test Accuracy")
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    b, t = plt.ylim()
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)
    #plt.savefig('test_accuracy_cc_nn.png')
    plt.show()

    sns.heatmap(roc_score_train, annot=True, cmap="viridis", fmt='.4g')
    plt.title("AUC Train")
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    b, t = plt.ylim()
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)
    #plt.savefig('traing_auc_cc_nn.png')
    plt.show()

    sns.heatmap(roc_score_test, annot=True, cmap="viridis", fmt='.4g')
    plt.title("AUC Test")
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    b, t = plt.ylim()
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)
    #plt.savefig('test_auc_cc_nn.png')
    plt.show()

    Confusion_Matrix(y_test, test_pred)

    #test_pred = np.argmax(test_pred, axis=1)
    diff = np.concatenate((1 - test_pred, test_pred), axis=1)
    plot_cumulative_gain(y_test, diff)
    plot_roc(y_test, diff, plot_micro=False, plot_macro=False)
    plt.show()
def LogisticRegression_self_test(X_train, X_test, y_train, y_test,
                                 learning_rates, epochs, iteration):
    """
    Logistic regression with stochastic gradient descent and gradient descent.
    """
    # scoping number of training samples
    n_inputs = X_train.shape[0]
    n_features = X_train.shape[1]

    eta_ = 1e-12
    beta_opt = np.random.randn(X_train.shape[1], 2)

    calc_beta_GD, norm = GradientDescent(X_train, beta_opt, y_train, iteration, eta_)
    prob_GD, predict_GD = Probability_GD(X_test, calc_beta_GD)  # defining values to be between 0 and 1
    #yPred_GD = (predict_GD >= 0.5).astype(int)  # converting to just 0 or 1

    # Define logistic regression
    clf = LogisticRegression(solver='lbfgs', max_iter=1e5)
    clf = clf.fit(X_train, np.ravel(y_train))
    pred_sklearn = clf.predict(X_test)
    prob_sklearn = clf.predict_proba(X_test)
    #print(prob_sklearn)

    #for eta in np.logspace(np.log10(1e-6), np.log10(1e0), 7):
    accuracy = np.zeros(len(learning_rates))
    auc_score = np.zeros(len(learning_rates))
    for i, eta in enumerate(learning_rates):
        beta_SGD = stochastic_gradient_descent(X_train, beta_opt, y_train, eta, epochs, iteration)
        prob_SGD, predict_SGD = Probability(X_test, beta_SGD)  # defining values to be between 0 and 1

        accuracy[i] = metrics.accuracy_score(y_test, predict_SGD)
        auc_score[i] = metrics.roc_auc_score(y_test, predict_SGD)
        difference = y_test - predict_SGD

        if i > 0 and auc_score[i] > auc_score[i - 1]:
            best_pred_SGD = predict_SGD
            best_prob_SGD = prob_SGD

        print('Accuracy {}, learning rate= {}, iterations = {}'.format(accuracy[i], eta, iteration))
        print('Auc score: {}'.format(auc_score[i]))

    """
    plt.plot(yPred, label='predict')
    plt.plot(optimal_beta, label='optimal beta')
    plt.plot(y_test, label='test')
    plt.show()
    """

    sns.set()
    sns.heatmap(pd.DataFrame(accuracy), annot=True, fmt='.4g')
    plt.title('Grid-search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)  # update the ylim(bottom, top) values
    #plt.savefig('accuracy_logreg.png')
    plt.show()

    sns.heatmap(pd.DataFrame(auc_score), annot=True, fmt='.4g')
    plt.title('Grid-search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)  # update the ylim(bottom, top) values
    #plt.savefig('auc_score_logreg.png')
    plt.show()

    # plot confusion matrix
    Confusion_Matrix(y_test, predict_GD)
    #Confusion_Matrix(y_test, best_pred_SGD)
    #Confusion_Matrix(y_test, pred_sklearn)

    #diff = np.concatenate((1 - predict, predict), axis=1)
    diff_sklearn = np.concatenate((1 - prob_sklearn, prob_sklearn), axis=1)
    diff_GD = np.concatenate((1 - prob_GD, prob_GD), axis=1)
    diff_SGD = np.concatenate((1 - best_prob_SGD, best_prob_SGD), axis=1)

    # plot roc curves
    plot_roc(y_test, prob_sklearn)
    plot_roc(y_test, diff_SGD)
    plot_roc(y_test, prob_GD)
    plt.show()

    # plot cumulative gain curves
    plot_cumulative_gain(y_test, prob_sklearn)
    ax = plot_cumulative_gain(y_test, diff_SGD)
    plot_cumulative_gain(y_test, prob_GD)
    #plt.show()

    """
    # plot roc curves
    plot_roc(y_test, diff_sklearn, plot_micro=False, plot_macro=False)
    plot_roc(y_test, diff_GD, plot_micro=False, plot_macro=False)
    plot_roc(y_test, diff_SGD, plot_micro=False, plot_macro=False)
    plt.show()

    # plot cumulative gain curves
    plot_cumulative_gain(y_test, diff_sklearn)
    plot_cumulative_gain(y_test, diff_GD)
    plot_cumulative_gain(y_test, diff_SGD)
    plt.show()
    """

    model_curve = auc_score
    area_baseline = 0.5
    area_ratio = (model_curve - area_baseline) / (area_baseline)
    print('Area Ratio:', area_ratio)

    return accuracy, learning_rates
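# Worked example of the area-ratio formula used above (a hedged illustration, not
# output from this run): with the 0.5 baseline of a random classifier, an AUC of
# 0.85 gives (0.85 - 0.5) / 0.5 = 0.7, while an AUC of 0.5 gives 0.0, i.e. no better
# than chance. Since auc_score is an array over learning rates, the ratio above is
# computed element-wise.
example_auc = 0.85
example_area_ratio = (example_auc - 0.5) / 0.5  # = 0.7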