Example #1
 def gainsChart(self, clf, X_train, y_train):
     # plot_cumulative_gain expects the (n_samples, 2) probability matrix,
     # not the feature matrix, as its second argument.
     predictions = clf.predict_proba(X_train)
     plot_cumulative_gain(y_train,
                          predictions,
                          figsize=(12, 8),
                          title_fontsize=20,
                          text_fontsize=18)
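The call above originally passed `X_train` where the probability matrix belongs; the `predictions` it computed were never used. For reference, a self-contained sketch of the corrected usage, assuming `plot_cumulative_gain` comes from scikit-plot's `scikitplot.metrics` module (as in the other examples on this page):

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_cumulative_gain
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# toy data and classifier just to exercise the call
X, y = make_classification(n_samples=500, random_state=0)
clf = LogisticRegression().fit(X, y)

# plot_cumulative_gain takes the true labels plus an (n_samples, 2)
# array of predicted probabilities for classes 0 and 1
plot_cumulative_gain(y, clf.predict_proba(X), figsize=(12, 8))
plt.show()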
Example #2
 def test_ax(self):
     np.random.seed(0)
     clf = LogisticRegression()
     clf.fit(self.X, self.y)
     probas = clf.predict_proba(self.X)
     fig, ax = plt.subplots(1, 1)
     out_ax = plot_cumulative_gain(self.y, probas)
     assert ax is not out_ax
     out_ax = plot_cumulative_gain(self.y, probas, ax=ax)
     assert ax is out_ax
Example #3
def log_cumulative_gain(y_true,
                        y_pred,
                        experiment=None,
                        channel_name='metric_charts',
                        prefix=''):
    """Creates cumulative gain chart and logs it to Neptune.

    Args:
        y_true (array-like, shape (n_samples)): Ground truth (correct) target values.
        y_pred (array-like, shape (n_samples, 2)): Predictions for classes 0 and 1 with values from 0 to 1.
        experiment (`neptune.experiments.Experiment`): Neptune experiment. Default is None.
        channel_name (str): Name of the Neptune channel. Default is 'metric_charts'.
        prefix (str): Prefix that will be added before the metric name when logged to Neptune.

    Examples:
        Train the model and make predictions on test::

            from sklearn.datasets import make_classification
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.model_selection import train_test_split
            from sklearn.metrics import classification_report

            X, y = make_classification(n_samples=2000)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            model = RandomForestClassifier()
            model.fit(X_train, y_train)

            y_test_pred = model.predict_proba(X_test)

        Create and log cumulative gain chart to Neptune::

            import neptune
            from neptunecontrib.monitoring.metrics import log_cumulative_gain

            neptune.init()
            with neptune.create_experiment():
                log_cumulative_gain(y_test, y_test_pred)

        Check out this experiment https://ui.neptune.ai/o/neptune-ai/org/binary-classification-metrics/e/BIN-101/logs.

    """
    assert len(
        y_pred.shape
    ) == 2, 'y_pred needs to be (n_samples, 2), use expand_prediction helper to format it'

    _exp = experiment if experiment else neptune

    expect_not_a_run(_exp)

    fig, ax = plt.subplots()
    plt_metrics.plot_cumulative_gain(y_true, y_pred, ax=ax)
    send_figure(fig, channel_name=prefix + channel_name, experiment=_exp)
    plt.close()
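The assert in `log_cumulative_gain` points to an `expand_prediction` helper for getting predictions into the required two-column shape. If that helper is not at hand, plain NumPy does the same job; a minimal sketch, where `p` stands in for a 1-D vector of positive-class probabilities:

import numpy as np

p = np.array([0.1, 0.7, 0.4])  # positive-class probabilities, shape (n_samples,)

# Stack the class-0 probabilities next to the class-1 ones to get the
# (n_samples, 2) layout that log_cumulative_gain asserts on.
y_pred = np.column_stack([1 - p, p])
assert y_pred.shape == (3, 2)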
Example #4
    def lift_gain_curves(self):

        y_pred_proba_both_classes = np.column_stack(
            [1 - self.y_pred_proba, self.y_pred_proba])
        gain = plot_cumulative_gain(self.y_test,
                                    y_pred_proba_both_classes,
                                    title='Cumulative Gains Curve')
        plt.show()

        lift = plot_lift_curve(self.y_test,
                               y_pred_proba_both_classes,
                               title='Lift curve')
        plt.show()
Example #5
    def k_NN(self):
        """Classify the patients using k-NN, selecting the best number of
        neighbours both via a 5-fold and a leave-one-out (LOO) cross-validation
        procedure. Plot the two estimated errors for each possible value of k.
        Add to the plot the corresponding test errors (i.e., the test error you
        would have obtained fitting k-NN with the same k) and comment on the
        results."""
        nnbs = KNeighborsClassifier(n_neighbors=10)
        modl = nnbs.fit(self.Xtrain, self.ytrain)
        score = modl.score(self.Xtest, self.ytest)
        ypred = np.array(modl.predict(self.Xtest).tolist())

        if self.plot:
            # ypred holds hard 0/1 predictions, so this two-column "probability"
            # matrix is binary; predict_proba would give a smoother gains curve.
            yprobas = \
                np.append((1-ypred).reshape(-1,1), ypred.reshape(-1,1), axis=1)
            plot_cumulative_gain(self.ytest.values, yprobas)
            plt.title("Cumulative Gains Curve of kNN prediction with\n"\
                + f"binary accuracy of {100*score:.2f}%")
            plt.show()

        # Search for the k in kNN that gives the best test score:
        num = 100
        klin = np.linspace(1, 100, num)
        scorearr = np.zeros(num)
        for counter, i in enumerate(klin):
            nnbs = KNeighborsClassifier(n_neighbors=int(i))
            modl = nnbs.fit(self.Xtrain, self.ytrain)
            score = modl.score(self.Xtest, self.ytest)
            scorearr[counter] = score

        if self.plot:
            plt.plot(klin, scorearr)
            plt.title("The kNN method analysis for multiple k values.")
            plt.grid()
            plt.xlabel("k")
            plt.ylabel("Prediction scores")
            plt.show()

        return 1
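The docstring of `k_NN` asks for k to be selected via 5-fold and leave-one-out cross-validation, but the method only sweeps k against the held-out test set. A minimal sketch of the cross-validated selection it describes, assuming the same `Xtrain`/`ytrain` arrays and scikit-learn's model-selection utilities:

import numpy as np
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

ks = range(1, 101)
cv5_err, loo_err = [], []
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    # 5-fold CV error estimate
    cv5_err.append(1 - cross_val_score(knn, Xtrain, ytrain, cv=5).mean())
    # leave-one-out CV error estimate (one fold per sample; slow on large data)
    loo_err.append(1 - cross_val_score(knn, Xtrain, ytrain, cv=LeaveOneOut()).mean())

best_k_cv5 = ks[int(np.argmin(cv5_err))]
best_k_loo = ks[int(np.argmin(loo_err))]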
Example #6
def plot_analysis(combine,
                  test_name,
                  y_true,
                  y_pred,
                  y_proba,
                  labels,
                  verbose,
                  library,
                  save=True,
                  show=True,
                  sessionid="testing",
                  prefix=""):

    met_index = 0
    plt.rcParams.update({'font.size': 14})
    # TODO: Find a way to do this better
    pltmetrics.plot_confusion_matrix(y_true, y_pred)
    if not combine:
        #plt.gcf().set_size_inches(3.65,3.65)
        save_show(plt, library + "/" + prefix, sessionid, "confusion_matrix",
                  show, save, False, True, True, False)
    else:
        plt.subplot(2, 4, met_index + 1)
    met_index += 1

    plt.rcParams.update({'font.size': 12})
    pltmetrics.plot_roc_curve(y_true, y_proba)
    for text in plt.gca().legend_.get_texts():
        text.set_text(text.get_text().replace("ROC curve of class", "class"))
        text.set_text(text.get_text().replace("area =", "AUC: "))
        text.set_text(text.get_text().replace("micro-average ROC curve",
                                              "micro-avg"))
        text.set_text(text.get_text().replace("macro-average ROC curve",
                                              "macro-avg"))
    if not combine:
        #plt.gcf().set_size_inches(3.65,3.65)
        save_show(plt, library + "/" + prefix, sessionid, "roc_curves", show,
                  save, False, True, True, False)
    else:
        plt.subplot(2, 4, met_index + 1)
    met_index += 1

    if len(labels) < 3:
        pltmetrics.plot_ks_statistic(y_true, y_proba)
        if not combine:
            #plt.gcf().set_size_inches(3.65,3.65)
            save_show(plt, library + "/" + prefix, sessionid, "ks_statistics",
                      show, save, False, True, True, False)
        else:
            plt.subplot(2, 4, met_index + 1)
        met_index += 1

    pltmetrics.plot_precision_recall_curve(y_true, y_proba)
    for text in plt.gca().legend_.get_texts():
        text.set_text(text.get_text().replace(
            "Precision-recall curve of class", "class"))
        text.set_text(text.get_text().replace("area =", "AUC: "))
        text.set_text(text.get_text().replace(
            "micro-average Precision-recall curve", "micro-avg"))
        text.set_text(text.get_text().replace("macro-average Precision-recall",
                                              "macro-avg"))
    if not combine:
        #plt.gcf().set_size_inches(3.65,3.65)
        save_show(plt, library + "/" + prefix, sessionid,
                  "precision_recall_curve", show, save, False, True, True,
                  False)
    else:
        plt.subplot(2, 4, met_index + 1)
    met_index += 1

    if len(labels) < 3:
        pltmetrics.plot_cumulative_gain(y_true, y_proba)
        if not combine:
            #plt.gcf().set_size_inches(3.65,3.65)
            save_show(plt, library + "/" + prefix, sessionid,
                      "cumulative_gain", show, save, False, True, True, False)
        else:
            plt.subplot(2, 4, met_index + 1)
        met_index += 1

    if len(labels) < 3:
        pltmetrics.plot_lift_curve(y_true, y_proba)
        if not combine:
            #plt.gcf().set_size_inches(3.65,3.65)
            save_show(plt, library + "/" + prefix, sessionid, "lift_curve",
                      show, save, False, True, True, False)
        else:
            plt.subplot(2, 4, met_index + 1)
        met_index += 1

    if combine:
        plt.suptitle(test_name)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        save_show(plt,
                  library,
                  sessionid,
                  test_name,  # figure name; the original referenced an undefined `figname`
                  show,
                  save,
                  True,
                  analysis=True)
Example #7
 def test_array_like(self):
     plot_cumulative_gain([0, 1], [[0.8, 0.2], [0.2, 0.8]])
     plot_cumulative_gain([0, 'a'], [[0.8, 0.2], [0.2, 0.8]])
     plot_cumulative_gain(['b', 'a'], [[0.8, 0.2], [0.2, 0.8]])
Example #8
 def test_string_classes(self):
     np.random.seed(0)
     clf = LogisticRegression()
     clf.fit(self.X, convert_labels_into_string(self.y))
     probas = clf.predict_proba(self.X)
     plot_cumulative_gain(convert_labels_into_string(self.y), probas)
Example #9
    plt.plot(np.arange(n), Train_acc, label="train", color="b")
    plt.plot([0, n], [train_acc, train_acc],
             label="avg-train",
             linestyle=":",
             color="b")
    plt.plot(np.arange(n), Test_acc, label="test", color="y")
    plt.plot([0, n], [test_acc, test_acc],
             label="avg-test",
             linestyle=":",
             color="y")
    plt.xlabel("n")
    plt.ylabel("accuracy %")
    plt.legend()
    plt.grid()
    plt.show()

    #False-negative and false-positive error rates
    print("False-neg. rate: %.3f" % false_neg)
    print("False-pos. rate: %.3f" % false_pos)
    """
    output:
    False-neg. rate: 0.305
    False-pos. rate: 0.172
    """

    #Cumulative gain curve of the last train_test_split sample
    p_test = np.ravel(p_test)
    y_probas = np.zeros((len(p_test), 2))
    y_probas[:, 0] = 1 - p_test
    y_probas[:, 1] = p_test

    plot_cumulative_gain(y_true=y_test, y_probas=y_probas)
    plt.show()
Example #10
def plot_lift(y_true, y_pred):
    # Despite its name, this plots the cumulative gain chart, not a lift curve.
    plot_cumulative_gain(y_true, y_pred)
    plt.show()
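If a lift curve was actually intended, scikit-plot ships a separate `plot_lift_curve` with the same `(y_true, y_probas)` signature; a minimal sketch (the `plot_lift_proper` name is just for illustration):

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_lift_curve

def plot_lift_proper(y_true, y_pred):
    # plot_lift_curve shares plot_cumulative_gain's calling convention:
    # true labels plus an (n_samples, 2) probability matrix.
    plot_lift_curve(y_true, y_pred)
    plt.show()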
Example #11
    def GridSearch(self, X_test, X_train, y_test, y_train, lmbd_vals, eta_vals,
                   iterations):
        """
		Compares the best learning rate and regularization parameter for multiple-cases.
		Calculate AUC-score and accuracy score.
		"""

        train_accuracy = np.zeros((len(eta_vals), len(lmbd_vals)))
        test_accuracy = np.zeros((len(eta_vals), len(lmbd_vals)))
        roc_score_test = np.zeros((len(eta_vals), len(lmbd_vals)))
        roc_score_train = np.zeros((len(eta_vals), len(lmbd_vals)))
        NN_numpy = np.zeros((len(eta_vals), len(lmbd_vals)), dtype=object)

        #grid search
        for i in range(len(eta_vals)):
            for j in range(len(lmbd_vals)):
                # Note: eta_vals[i] and lmbd_vals[j] are never passed to the
                # network here, so every grid cell trains with the instance's
                # current settings.
                self.create_biases_and_weights()
                #self.print_values()
                self.train(iterations)

                test_pred = self.predict(X_test)
                train_pred = self.predict(X_train)

                accuracy = accuracy_score(y_test, test_pred)

                #train_accuracy[i][j] = metrics.accuracy_score(y_train,train_pred.round(), normalize=False)
                #test_accuracy[i][j] = metrics.accuracy_score(y_test,test_pred)

                train_accuracy[i, j] = accuracy_score(y_train, train_pred)
                test_accuracy[i, j] = accuracy_score(y_test, test_pred)

                roc_score_test[i, j] = metrics.roc_auc_score(y_test, test_pred)
                roc_score_train[i, j] = metrics.roc_auc_score(
                    y_train, train_pred)

                if accuracy > self.best_accuracy:
                    self.best_accuracy = accuracy
                    self.best_lmbd = lmbd_vals[j]
                    self.best_eta = eta_vals[i]

                #print('Accuracy score on test data:', accuracy)
                print('best accuracy:', self.best_accuracy)
                print('lambda:', self.best_lmbd)
                print('Learning rate:', self.best_eta)
                #print('Train Area ratio:', np.mean(roc_score_train))
                #print('Test Area ratio:', np.mean(roc_score_test))

        sns.set()
        sns.heatmap(train_accuracy, annot=True, cmap="viridis", fmt='.4g')
        plt.title("Training Accuracy")
        plt.ylabel('Learning rate: $\\eta$')
        plt.xlabel('Regularization Term: $\\lambda$')
        # workaround for matplotlib 3.1.1 clipping the first and last heatmap rows
        b, t = plt.ylim()
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)
        #plt.savefig('traing_accuracy_cc_nn.png')
        plt.show()

        sns.heatmap(test_accuracy, annot=True, cmap="viridis", fmt='.4g')
        plt.title("Test Accuracy")
        plt.ylabel('Learning rate: $\\eta$')
        plt.xlabel('Regularization Term: $\\lambda$')
        b, t = plt.ylim()
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)
        #plt.savefig('test_accuracy_cc_nn.png')
        plt.show()

        sns.heatmap(roc_score_train, annot=True, cmap="viridis", fmt='.4g')
        plt.title("AUC Train")
        plt.ylabel('Learning rate: $\\eta$')
        plt.xlabel('Regularization Term: $\\lambda$')
        b, t = plt.ylim()
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)
        #plt.savefig('traing_auc_cc_nn.png')
        plt.show()

        sns.heatmap(roc_score_test, annot=True, cmap="viridis", fmt='.4g')
        plt.title("AUC Test")
        plt.ylabel('Learning rate: $\\eta$')
        plt.xlabel('Regularization Term: $\\lambda$')
        b, t = plt.ylim()
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)
        #plt.savefig('test_auc_cc_nn.png')

        plt.show()

        Confusion_Matrix(y_test, test_pred)

        #test_pred = np.argmax(test_pred, axis=1)

        diff = np.concatenate((1 - test_pred, test_pred), axis=1)

        plot_cumulative_gain(y_test, diff)

        plot_roc(y_test, diff, plot_micro=False, plot_macro=False)
        plt.show()
Example #12
def LogisticRegression_self_test(X_train, X_test, y_train, y_test,
                                 learning_rates, epochs, iteration):
    """
	Logistic regression with stochastic gradient descent and gradient descent.
	"""

    # number of training samples and features

    n_inputs = X_train.shape[0]
    n_features = X_train.shape[1]

    eta_ = 1e-12
    beta_opt = np.random.randn(X_train.shape[1], 2)
    calc_beta_GD, norm = GradientDescent(X_train, beta_opt, y_train, iteration,
                                         eta_)
    prob_GD, predict_GD = Probability_GD(
        X_test, calc_beta_GD)  #defining values to be between 0 and 1
    #yPred_GD = (predict_GD >= 0.5).astype(int) # converting to just 0 or 1

    #Define Logistic regression
    clf = LogisticRegression(solver='lbfgs', max_iter=100000)  # max_iter must be an int
    clf = clf.fit(X_train, np.ravel(y_train))
    pred_sklearn = clf.predict(X_test)
    prob_sklearn = clf.predict_proba(X_test)
    #print(prob_sklearn)

    #for eta in np.logspace(np.log10(1e-6), np.log10(1e0), 7):
    accuracy = np.zeros(len(learning_rates))
    auc_score = np.zeros(len(learning_rates))

    for i, eta in enumerate(learning_rates):
        beta_SGD = stochastic_gradient_descent(X_train, beta_opt, y_train, eta,
                                               epochs, iteration)
        prob_SGD, predict_SGD = Probability(
            X_test, beta_SGD)  #defining values to be between 0 and 1

        accuracy[i] = metrics.accuracy_score(y_test, predict_SGD)
        auc_score[i] = metrics.roc_auc_score(y_test, predict_SGD)
        difference = y_test - predict_SGD

        # track the best-scoring SGD run so far (also covers i == 0, so
        # best_prob_SGD is always defined before it is used below)
        if i == 0 or auc_score[i] > np.max(auc_score[:i]):
            best_pred_SGD = predict_SGD
            best_prob_SGD = prob_SGD

        print('Accuracy {}, learning rate= {}, iterations = {}'.format(
            accuracy[i], eta, iteration))

        print('Auc score: {}'.format(auc_score[i]))
        """
		plt.plot(yPred, label='predict')
		plt.plot(optimal_beta, label ='optimal beta')
		plt.plot(y_test, label='test')
		plt.show()
		"""

    sns.set()
    sns.heatmap(pd.DataFrame(accuracy), annot=True, fmt='.4g')
    plt.title('Grid-search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)  # update the ylim(bottom, top) values
    #plt.savefig('accuracy_logreg.png')
    plt.show()

    sns.heatmap(pd.DataFrame(auc_score), annot=True, fmt='.4g')
    plt.title('Grid-search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)  # update the ylim(bottom, top) values
    #plt.savefig('auc_score_logreg.png')
    plt.show()

    #plot confusion matrix
    Confusion_Matrix(y_test, predict_GD)
    #Confusion_Matrix(y_test, best_pred_SGD)
    #Confusion_Matrix(y_test, pred_sklearn)

    #diff = np.concatenate((1- predict, predict), axis=1)

    # note: prob_sklearn from predict_proba is already (n_samples, 2), so
    # diff_sklearn ends up with four columns; it is only referenced in the
    # commented-out block below, while prob_sklearn is plotted directly.
    diff_sklearn = np.concatenate((1 - prob_sklearn, prob_sklearn), axis=1)
    diff_GD = np.concatenate((1 - prob_GD, prob_GD), axis=1)
    diff_SGD = np.concatenate((1 - best_prob_SGD, best_prob_SGD), axis=1)

    #plot roc curves
    plot_roc(y_test, prob_sklearn)
    plot_roc(y_test, diff_SGD)
    plot_roc(y_test, prob_GD)
    plt.show()

    #plot cumulative gain curves
    plot_cumulative_gain(y_test, prob_sklearn)
    plot_cumulative_gain(y_test, diff_SGD)
    plot_cumulative_gain(y_test, prob_GD)
    plt.show()
    """
	#plot roc curves
	plot_roc(y_test, diff_sklearn, plot_micro=False, plot_macro= False)
	plot_roc(y_test, diff_GD, plot_micro=False, plot_macro= False)
	plot_roc(y_test, diff_SGD, plot_micro=False, plot_macro= False)
	plt.show()

	#plot cumulative gain curves
	plot_cumulative_gain(y_test, diff_sklearn)
	plot_cumulative_gain(y_test, diff_GD)
	plot_cumulative_gain(y_test, diff_SGD)
	plt.show()	

	"""

    # area ratio rescales AUC against the 0.5 random baseline,
    # e.g. an AUC of 0.8 gives (0.8 - 0.5) / 0.5 = 0.6
    model_curve = auc_score
    area_baseline = 0.5
    area_ratio = (model_curve - area_baseline) / area_baseline
    print('Area Ratio:', area_ratio)

    return accuracy, learning_rates