def plot_cum_gain_chart(y_true, prob_y, prob_ysk=None):
    """Plot cumulative gain curves on the current pyplot figure and show it.

    Draws, in this order: the model curve, optionally a second (dashed)
    model curve, the curve returned by ``tools.bestCurve`` and the diagonal
    baseline. The figure is displayed with ``plt.show()``.

    Args:
        y_true: Ground-truth labels; forwarded unchanged to
            ``cumulative_gain_curve`` and ``tools.bestCurve``.
        prob_y: 2-D array indexable as ``prob_y[:, 1]``. Column 1 is used as
            the score (presumably the positive-class probability -- confirm
            against the caller).
        prob_ysk: Optional second array with the same layout; when given,
            its column 1 is drawn as the 'model sklearn' curve.

    Returns:
        None.
    """
    lw = 2

    # The primary model curve is drawn in every case; only the comparison
    # curve is conditional.
    x_data, y_data = cumulative_gain_curve(y_true, prob_y[:, 1])
    plt.plot(x_data, y_data, linewidth=lw, label='model')

    if prob_ysk is not None:
        x_data_sk, y_data_sk = cumulative_gain_curve(y_true, prob_ysk[:, 1])
        plt.plot(x_data_sk, y_data_sk, '--', linewidth=lw,
                 label='model sklearn')

    x_best, y_best = tools.bestCurve(y_true)
    x_base, y_base = np.array([0, 1]), np.array([0, 1])
    plt.plot(x_best, y_best, linewidth=lw, label='best curve')
    plt.plot(x_base, y_base, ':', linewidth=lw, label='baseline')

    plt.xlabel(r'Fraction of data', size=14)
    plt.ylabel(r'Cumulative gain', size=14)
    plt.legend(prop={'size': 12})
    # A real boolean: any non-empty string (even 'False') is truthy.
    plt.grid(True, linestyle='dashed')
    plt.tick_params(axis='both', labelsize=12)
    plt.tight_layout()
    plt.show()
def plot_cumulative_gain(y_true, y_probas, title='Cumulative Gains Curve',
                         ax=None, figsize=None, title_fontsize="large",
                         text_fontsize="medium"):
    """Plot predicted and ideal cumulative gain curves for a binary problem.

    Refactored from scikit-plot's ``plot_cumulative_gain``. Three curves and
    a diagonal baseline are drawn: the predicted curve of each class and the
    "true" curve of the second class, obtained by scoring the samples with
    their own labels. No area is computed here.

    Args:
        y_true: Ground-truth labels; must contain exactly two distinct
            values.
        y_probas: 2-D array indexable as ``y_probas[:, 0]`` and
            ``y_probas[:, 1]``; column ``i`` is used as the score for
            ``classes[i]`` (classes sorted by ``np.unique``).
        title: Title of the plot.
        ax: Axes to draw on; a new figure/axes pair is created when None.
        figsize: Figure size used only when a new figure is created.
        title_fontsize: Font size of the title.
        text_fontsize: Font size of axis labels, ticks and legend.

    Returns:
        The axes the curves were drawn on.

    Raises:
        ValueError: If ``y_true`` does not contain exactly two classes.
    """
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    if len(classes) != 2:
        raise ValueError('Cannot calculate Cumulative Gains for data with '
                         '{} category/ies'.format(len(classes)))

    # Compute Cumulative Gain Curves.
    _, gains_first = cumulative_gain_curve(y_true, y_probas[:, 0],
                                           classes[0])
    _, gains_second = cumulative_gain_curve(y_true, y_probas[:, 1],
                                            classes[1])
    # Scoring with the labels themselves ranks every classes[1] sample first
    # (np.unique returns the classes sorted), giving the ideal curve.
    # The unused ideal-curve computation for classes[0] was dropped.
    percentages, gains_ideal = cumulative_gain_curve(y_true, y_true,
                                                     classes[1])

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)

    ax.plot(percentages, gains_first, lw=3,
            label='Class {} (pred)'.format(classes[0]))
    ax.plot(percentages, gains_second, lw=3,
            label='Class {} (pred)'.format(classes[1]))
    ax.plot(percentages, gains_ideal, lw=3,
            label='Class {} (true)'.format(classes[1]))

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.1])

    ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Baseline')

    ax.set_xlabel('Percentage of sample', fontsize=text_fontsize)
    ax.set_ylabel('Gain', fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)
    # A real boolean: any non-empty string (even 'off') is truthy.
    ax.grid(True)
    ax.legend(loc='lower right', fontsize=text_fontsize)
    plt.show()
    return ax
def main(config):
    """Fit a random forest on the configured dataset and save evaluation plots.

    The dataset is split 80/20, a default ``RandomForestClassifier`` is
    fitted, accuracy/precision/recall/F1 are printed for the held-out split
    and three figures are written to the working directory:
    ``RF_ROC.png``, ``RF_PRC.png`` and ``RF_LIFT.png`` (cumulative gains).

    Args:
        config: Project configuration object; only
            ``config.init_obj('data_loader', module_data)`` is used, and the
            result must expose ``dataset.datas`` and ``dataset.targets``.
    """
    data_loader = config.init_obj('data_loader', module_data)
    X_train, X_test, Y_train, Y_test = train_test_split(
        data_loader.dataset.datas,
        data_loader.dataset.targets,
        test_size=0.2,
    )

    RF = RandomForestClassifier()
    RF.fit(X_train, Y_train)
    preds = RF.predict(X_test)
    preds_prob = RF.predict_proba(X_test)

    print(f'Accuracy score: {accuracy_score(Y_test, preds)}')
    print(f'Precision: {precision_score(Y_test, preds)}')
    print(f'Recall: {recall_score(Y_test, preds)}')
    print(f'F1-score: {f1_score(Y_test, preds)}')

    # NOTE(review): if these helpers come from sklearn.metrics they were
    # removed in scikit-learn 1.2 -- confirm the pinned version.
    plot_roc_curve(RF, X_test, Y_test)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.legend(loc='lower right')
    plt.savefig('RF_ROC.png')
    plt.close()

    plot_precision_recall_curve(RF, X_test, Y_test)
    plt.legend(loc='lower right')
    plt.savefig('RF_PRC.png')
    plt.close()

    p, g = cumulative_gain_curve(Y_test, preds_prob[:, 1])
    plt.plot(p, g)
    # Ideal curve: a perfect ranking has collected every positive once the
    # fraction of *actual* positives has been seen, so the knee comes from
    # Y_test, not from the model's predictions. Assumes 0/1 labels.
    positive_rate = float(sum(Y_test)) / len(Y_test)
    plt.plot([0, positive_rate, 1], [0, 1, 1])
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.savefig('RF_LIFT.png')
    plt.close()
def plot_cumulative_gain(y_true, y_proba, title_fontsize=15, text_fontsize=10):
    """Draw a cumulative gains chart for the ``True`` class on a new figure.

    Args:
        y_true: Ground-truth labels, forwarded to ``cumulative_gain_curve``.
        y_proba: Scores, forwarded to ``cumulative_gain_curve``.
        title_fontsize: Font size of the chart title.
        text_fontsize: Font size of axis labels, tick labels and legend.

    Returns:
        The newly created axes holding the model curve and the baseline.
    """
    # ``True`` is handed over as the third positional argument
    # (presumably the positive label -- confirm against the helper).
    fractions, model_gain = cumulative_gain_curve(y_true, y_proba, True)

    _figure, axes = plt.subplots(1, 1)
    axes.set_title('Cumulative gains chart', fontsize=title_fontsize)

    # Model curve first, then the diagonal, so legend order is unchanged.
    axes.plot(fractions, model_gain, lw=3, label='Class {}'.format(True))
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.0])
    axes.plot([0, 1], [0, 1], 'k--', lw=2, label='Baseline')

    axis_labels = (
        (axes.set_xlabel, 'Percentage of sample'),
        (axes.set_ylabel, 'Gain'),
    )
    for set_label, text in axis_labels:
        set_label(text, fontsize=text_fontsize)

    axes.tick_params(labelsize=text_fontsize)
    axes.grid('on')
    axes.legend(loc='lower right', fontsize=text_fontsize)
    return axes
def lift_curve(output, target):
    """Render a cumulative gains chart for a binary classifier as an image.

    The chart contains the ideal curve, the model curve and the diagonal
    baseline. It is rasterised from the figure canvas and returned as a
    tensor, which is convenient for image loggers.

    Args:
        output: Tensor indexable as ``output[:, 1]`` with one row per
            sample; column 1 is used as the score of class ``1``.
        target: Tensor of labels with ``len(target) == output.shape[0]``.
            The ideal curve uses ``sum / len``, so 0/1 labels are assumed.

    Returns:
        A ``uint8`` tensor in channel-first layout (RGB, alpha dropped)
        built from the canvas RGBA buffer.

    Raises:
        RuntimeError: If scikitplot is not installed.
    """
    try:
        from scikitplot.helpers import cumulative_gain_curve
    except ImportError as err:
        raise RuntimeError(
            "This contrib module requires scikitplot to be installed") from err

    assert output.shape[0] == len(target)

    # detach() is required: Tensor.numpy() refuses tensors that require
    # grad, and torch.no_grad() does not clear that flag on existing tensors.
    scores = output.detach().cpu().numpy()[:, 1]
    labels = target.detach().cpu().numpy()
    percentages, gains = cumulative_gain_curve(labels, scores, 1)

    fig = plt.figure()
    try:
        plt.plot([0, labels.sum() / len(labels), 1], [0, 1, 1])
        plt.plot(percentages, gains)
        plt.plot([0, 1], [0, 1], linestyle='--')
        fig.canvas.draw()
        # NOTE(review): buffer_rgba() needs an Agg-based canvas -- confirm
        # the backend used where this metric runs.
        buf = np.asarray(fig.canvas.buffer_rgba(), dtype=np.uint8)[:, :, :3]
        image = torch.from_numpy(buf).permute(2, 0, 1)
    finally:
        # Always release the figure, even when plotting fails.
        plt.close(fig)
    return image
def auc_CGC(y_true, prob_y):
    """Calculate the area under the cumulative gain curve and its area ratio.

    Args:
        y_true: Ground-truth labels, forwarded to ``cumulative_gain_curve``
            and ``tools.bestCurve``.
        prob_y: Scores. A 2-D array with more than one column contributes
            its column 1; a 1-D array or a single-column 2-D array is used
            as-is (flattened).

    Returns:
        Tuple ``(auc_data, area_ratio)`` where ``auc_data`` is the
        trapezoidal area under the model curve and ``area_ratio`` is
        ``(auc_data - auc_base) / (auc_best - auc_base)``, with ``auc_base``
        the area under the diagonal and ``auc_best`` the area under the
        curve returned by ``tools.bestCurve``.
    """
    prob_y = np.asarray(prob_y)
    if prob_y.ndim > 1:
        # Previously a 1-D input raised IndexError on shape[1] and an
        # (n, 1) input was passed on as a 2-D array.
        prob_y = prob_y[:, 1] if prob_y.shape[1] > 1 else prob_y.ravel()

    x_data, y_data = cumulative_gain_curve(y_true, prob_y)
    x_best, y_best = tools.bestCurve(y_true)
    x_base, y_base = np.array([0, 1]), np.array([0, 1])

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    auc_data = trapezoid(y_data, x_data)
    auc_best = trapezoid(y_best, x_best)
    auc_base = trapezoid(y_base, x_base)

    area_ratio = (auc_data - auc_base) / (auc_best - auc_base)
    return auc_data, area_ratio
def plot_cumulative_gain(y_true, y_probas, title='Cumulative Gains Curve',
                         ax=None, figsize=None, title_fontsize="large",
                         text_fontsize="medium"):
    """Plot the cumulative gains curve of the second class only.

    Adapted from scikit-plot's ``plot_cumulative_gain``. Unlike the upstream
    function, this variant draws a single curve (for the larger of the two
    class labels) and adds neither a baseline nor a legend. Background on
    the chart: http://mlwiki.org/index.php/Cumulative_Gain_Chart.

    Works only for binary classification.

    Args:
        y_true (array-like, shape (n_samples)):
            Ground truth (correct) target values.

        y_probas (array-like, shape (n_samples, n_classes)):
            Prediction probabilities for each class returned by a
            classifier; only column 1 is read.

        title (string, optional): Title of the generated plot. Defaults to
            "Cumulative Gains Curve".

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the curve. If None, the plot is drawn on a new set of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot
            e.g. (6, 6). Used only when ``ax`` is None. Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Raises:
        ValueError: If ``y_true`` does not hold exactly two distinct values.

    Example:
        >>> lr = LogisticRegression().fit(X_train, y_train)
        >>> y_probas = lr.predict_proba(X_test)
        >>> ax = plot_cumulative_gain(y_test, y_probas)
        >>> plt.show()
    """
    labels = np.array(y_true)
    probabilities = np.array(y_probas)

    distinct = np.unique(labels)
    n_distinct = len(distinct)
    if n_distinct != 2:
        raise ValueError('Cannot calculate Cumulative Gains for data with '
                         '{} category/ies'.format(n_distinct))

    # np.unique sorts, so index 1 is the larger label; its scores live in
    # column 1 of the probability matrix.
    positive = distinct[1]
    fractions, gains = cumulative_gain_curve(labels, probabilities[:, 1],
                                             positive)

    if ax is None:
        _figure, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.plot(fractions, gains, lw=3, label='Class {}'.format(positive))

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    axis_labels = (
        (ax.set_xlabel, 'Percentage of sample'),
        (ax.set_ylabel, 'Gain'),
    )
    for set_label, text in axis_labels:
        set_label(text, fontsize=text_fontsize)

    ax.tick_params(labelsize=text_fontsize)
    ax.grid('on')
    return ax
def plot_lift_curve(y_true, y_probas, title='Lift Curve', ax=None,
                    figsize=None, title_fontsize="large",
                    text_fontsize="medium"):
    """Plot the lift curve of both classes of a binary classifier.

    Lift is the cumulative gain divided by the fraction of the sample seen
    so far; the first point of each gain curve is dropped before dividing.
    A horizontal baseline at lift 1 is added. Background on the chart:
    http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html.

    Works only for binary classification.

    Args:
        y_true (array-like, shape (n_samples)):
            Ground truth (correct) target values.

        y_probas (array-like, shape (n_samples, n_classes)):
            Prediction probabilities for each class returned by a
            classifier; columns 0 and 1 are read.

        title (string, optional): Title of the generated plot. Defaults to
            "Lift Curve".

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the curves. If None, the plot is drawn on a new set of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot
            e.g. (6, 6). Used only when ``ax`` is None. Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Raises:
        ValueError: If ``y_true`` does not hold exactly two distinct values.

    Example:
        >>> lr = LogisticRegression().fit(X_train, y_train)
        >>> y_probas = lr.predict_proba(X_test)
        >>> ax = plot_lift_curve(y_test, y_probas)
        >>> plt.show()
    """
    labels = np.array(y_true)
    probabilities = np.array(y_probas)

    distinct = np.unique(labels)
    n_distinct = len(distinct)
    if n_distinct != 2:
        raise ValueError('Cannot calculate Lift Curve for data with '
                         '{} category/ies'.format(n_distinct))

    # Gain curves of both classes, in class order.
    _, gains_first = cumulative_gain_curve(labels, probabilities[:, 0],
                                           distinct[0])
    fractions, gains_second = cumulative_gain_curve(labels,
                                                    probabilities[:, 1],
                                                    distinct[1])

    # Drop the leading point before turning gain into lift.
    fractions = fractions[1:]
    lift_first = gains_first[1:] / fractions
    lift_second = gains_second[1:] / fractions

    if ax is None:
        _figure, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)

    for cls, lift in zip(distinct, (lift_first, lift_second)):
        ax.plot(fractions, lift, lw=3, label='Class {}'.format(cls))

    ax.plot([0, 1], [1, 1], 'k--', lw=2, label='Baseline')

    axis_labels = (
        (ax.set_xlabel, 'Percentage of sample'),
        (ax.set_ylabel, 'Lift'),
    )
    for set_label, text in axis_labels:
        set_label(text, fontsize=text_fontsize)

    ax.tick_params(labelsize=text_fontsize)
    ax.grid('on')
    ax.legend(loc='lower right', fontsize=text_fontsize)

    return ax
def cumulative_gain_area_ratio(y_true, y_probas, onehot=False,
                               title='Cumulative Gains Curve', ax=None,
                               figsize=None, title_fontsize="large",
                               text_fontsize="large"):
    """Plot the cumulative gain curve and return its area ratio.

    Refactored from scikit-plot's ``plot_cumulative_gain``. The area ratio
    is the area between the model curve and the diagonal divided by the
    area between the ideal curve and the diagonal. The figure is displayed
    with ``plt.show()`` before returning.

    Args:
        y_true: Vector of targets; must contain exactly two distinct
            values. The larger one (``np.unique`` order) is the positive
            class. NOTE(review): used as a 1-D label vector here; confirm
            no caller passes a one-hot ``y_true``.
        y_probas: Classification scores. With ``onehot=True`` it is indexed
            as ``y_probas[:, 1]``; with ``onehot=False`` it must hold one
            score per sample.
        onehot: Whether ``y_probas`` has one column per class.
        title: Title of the plot.
        ax: Axes to draw on; a new figure/axes pair is created when None.
        figsize: Figure size used only when a new figure is created.
        title_fontsize: Font size of the title.
        text_fontsize: Font size of axis labels, ticks and legend.

    Returns:
        The area ratio as a float.

    Raises:
        ValueError: If ``y_true`` does not contain exactly two classes, or
            if ``y_probas`` cannot be reshaped to one score per sample when
            ``onehot`` is False.
    """
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    if len(classes) != 2:
        raise ValueError('Cannot calculate Cumulative Gains for data with '
                         '{} category/ies'.format(len(classes)))

    # Pick the positive-class scores directly, without padding a zero column.
    # reshape(len(...)) still rejects inputs that are not one-per-sample.
    if onehot:
        scores = y_probas[:, 1]
    else:
        scores = y_probas.reshape(len(y_probas))

    # Compute the cumulative gain curve of the positive class.
    percentages, gains = cumulative_gain_curve(y_true, scores, classes[1])

    # Ideal model: all positives are found after the positive fraction of
    # the data. Counting matches of classes[1] keeps this correct for label
    # encodings other than 0/1 (identical to sum/len for 0/1 labels).
    positive_rate = np.mean(y_true == classes[1])
    best_curve_x = [0, positive_rate, 1]
    best_curve_y = [0, 1, 1]

    # 0.5 is the area under the diagonal baseline.
    best_curve_area = auc(best_curve_x, best_curve_y) - 0.5
    model_curve_area = auc(percentages, gains) - 0.5
    area_ratio = model_curve_area / best_curve_area

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)

    ax.plot(percentages, gains, lw=2, label='Model')
    ax.plot(best_curve_x, best_curve_y, lw=2, label='Best curve')
    ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Baseline')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.2])

    ax.set_xlabel('Percentage of data', fontsize=text_fontsize)
    ax.set_ylabel('Cumulative percentage of target data',
                  fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)
    # A real boolean: any non-empty string (even 'off') is truthy.
    ax.grid(True)
    ax.legend(loc='lower right', fontsize=text_fontsize)
    plt.show()
    return area_ratio