import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
)


def binary_classifier_quality(model, X_test, Y_test):
    """Report quality metrics for a binary classifier.

    If `model` is a fitted GridSearchCV, the cross-validation results are
    printed first and predictions come from its best estimator.
    """
    if isinstance(model, GridSearchCV):
        result = pd.DataFrame(
            {k: model.cv_results_[k]
             for k in ['params', 'mean_test_score',
                       'std_test_score', 'rank_test_score']}
        )
        print(result)
        print()
        print(f"best params: {model.best_params_}")
        print("best score: {:f}".format(model.best_score_))
        print()
    Y_hat = model.predict(X_test)
    print("Confusion matrix (true x pred):")
    print(confusion_matrix(Y_test, Y_hat))
    # sensitivity (recall): TP / (TP + FN); assumes 0/1 labels
    print("Sensitivity: {:f}".format(sum(Y_hat[Y_test == 1]) / sum(Y_test)))
    # specificity: TN / (TN + FP)
    print("Specificity: {:f}".format(sum(1 - Y_hat[Y_test == 0]) / sum(Y_test == 0)))
    print("Accuracy score on test data: {:f}".format(accuracy_score(Y_test, Y_hat)))
    print("F1 score on test data: {:f}".format(f1_score(Y_test, Y_hat)))
    ConfusionMatrixDisplay.from_predictions(Y_test, Y_hat)
    RocCurveDisplay.from_estimator(model, X_test, Y_test)
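
# A minimal usage sketch (hypothetical setup: synthetic data and a small SVC
# grid search; none of these names come from the original code):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

grid = GridSearchCV(SVC(), param_grid={"C": [0.1, 1, 10]})
grid.fit(X_train, y_train)

# prints the CV results, confusion matrix, sensitivity/specificity, accuracy
# and F1, then draws the confusion-matrix and ROC displays
binary_classifier_quality(grid, X_test, y_test)
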
def sklearn_visualizations(rng=42):
    # rng: shared random seed for the split and both classifiers
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import RocCurveDisplay
    from sklearn import datasets

    # data: binarize the wine dataset (class 2 vs. the rest)
    X, y = datasets.load_wine(return_X_y=True)
    y = y == 2
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    # svm
    svc = SVC(random_state=rng)
    svc.fit(X_train, y_train)
    svc_disp = RocCurveDisplay.from_estimator(svc, X_test, y_test)
    plt.show()

    # random forest, drawn on the same axes as the stored SVC curve
    rfc = RandomForestClassifier(random_state=rng)
    rfc.fit(X_train, y_train)
    ax = plt.gca()
    rfc_disp = RocCurveDisplay.from_estimator(rfc, X_test, y_test, ax=ax, alpha=0.8)
    svc_disp.plot(ax=ax, alpha=0.8)
    plt.show()
def plot(self, data_original_test):
    """Plot ROC and precision-recall curves of both the original and
    synthetic models in a single figure."""
    X_test, y_test = self._split_xy(data_original_test)

    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    sns.despine()

    # roc curve
    RocCurveDisplay.from_estimator(self.stats_original_, X_test, y_test,
                                   name=self.labels[0], color=COLOR_PALETTE[0],
                                   ax=ax[0])
    RocCurveDisplay.from_estimator(self.stats_synthetic_, X_test, y_test,
                                   name=self.labels[1], color=COLOR_PALETTE[1],
                                   ax=ax[0])
    # chance-level diagonal
    ax[0].plot([0, 1], [0, 1], linestyle="--", lw=1, color="black", alpha=0.7)
    ax[0].set_title('ROC Curve')

    # pr curve
    PrecisionRecallDisplay.from_estimator(self.stats_original_, X_test, y_test,
                                          name=self.labels[0],
                                          color=COLOR_PALETTE[0], ax=ax[1])
    PrecisionRecallDisplay.from_estimator(self.stats_synthetic_, X_test, y_test,
                                          name=self.labels[1],
                                          color=COLOR_PALETTE[1], ax=ax[1])
    # no-skill baseline: precision equals the positive-class prevalence
    no_skill = len(y_test[y_test == 1]) / len(y_test)
    ax[1].plot([0, 1], [no_skill, no_skill], lw=1, linestyle='--',
               color='black', alpha=0.7)
    ax[1].set_title('Precision-Recall Curve')
def test_roc_curve_display_complex_pipeline(pyplot, data_binary, clf, constructor_name):
    """Check the behaviour with a complex pipeline."""
    X, y = data_binary

    if constructor_name == "from_estimator":
        with pytest.raises(NotFittedError):
            RocCurveDisplay.from_estimator(clf, X, y)

    clf.fit(X, y)

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(clf, X, y)
        name = clf.__class__.__name__
    else:
        display = RocCurveDisplay.from_predictions(y, y)
        name = "Classifier"

    assert name in display.line_.get_label()
    assert display.estimator_name == name
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

fig, ax = plt.subplots()

models = [
    ("RT embedding -> LR", rt_model),
    ("RF", random_forest),
    ("RF embedding -> LR", rf_model),
    ("GBDT", gradient_boosting),
    ("GBDT embedding -> LR", gbdt_model),
]

model_displays = {}
for name, pipeline in models:
    model_displays[name] = RocCurveDisplay.from_estimator(
        pipeline, X_test, y_test, ax=ax, name=name
    )
_ = ax.set_title("ROC curve")

# %%
fig, ax = plt.subplots()
for name, pipeline in models:
    model_displays[name].plot(ax=ax)

ax.set_xlim(0, 0.2)
ax.set_ylim(0.8, 1)
_ = ax.set_title("ROC curve (zoomed in at top left)")
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = load_wine(return_X_y=True)
y = y == 2

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

# %%
# Plotting the ROC Curve
# ----------------------
# Next, we plot the ROC curve with a single call to
# :func:`sklearn.metrics.RocCurveDisplay.from_estimator`. The returned
# `svc_disp` object allows us to continue using the already computed ROC curve
# for the SVC in future plots.
svc_disp = RocCurveDisplay.from_estimator(svc, X_test, y_test)
plt.show()

# %%
# Training a Random Forest and Plotting the ROC Curve
# ---------------------------------------------------
# We train a random forest classifier and create a plot comparing it to the SVC
# ROC curve. Notice how `svc_disp` uses
# :func:`~sklearn.metrics.RocCurveDisplay.plot` to plot the SVC ROC curve
# without recomputing the values of the ROC curve itself. Furthermore, we
# pass `alpha=0.8` to the plot functions to adjust the alpha values of the
# curves.
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(X_train, y_train)

ax = plt.gca()
rfc_disp = RocCurveDisplay.from_estimator(rfc, X_test, y_test, ax=ax, alpha=0.8)
svc_disp.plot(ax=ax, alpha=0.8)
plt.show()
def test_roc_curve_display_plotting(
    pyplot,
    response_method,
    data_binary,
    with_sample_weight,
    drop_intermediate,
    with_strings,
    constructor_name,
    default_name,
):
    """Check the overall plotting behaviour."""
    X, y = data_binary

    pos_label = None
    if with_strings:
        y = np.array(["c", "b"])[y]
        pos_label = "c"

    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None

    lr = LogisticRegression()
    lr.fit(X, y)

    y_pred = getattr(lr, response_method)(X)
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            lr,
            X,
            y,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
            alpha=0.8,
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y,
            y_pred,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
            alpha=0.8,
        )

    fpr, tpr, _ = roc_curve(
        y,
        y_pred,
        sample_weight=sample_weight,
        drop_intermediate=drop_intermediate,
        pos_label=pos_label,
    )

    assert_allclose(display.roc_auc, auc(fpr, tpr))
    assert_allclose(display.fpr, fpr)
    assert_allclose(display.tpr, tpr)

    assert display.estimator_name == default_name

    import matplotlib as mpl  # noqa

    assert isinstance(display.line_, mpl.lines.Line2D)
    assert display.line_.get_alpha() == 0.8
    assert isinstance(display.ax_, mpl.axes.Axes)
    assert isinstance(display.figure_, mpl.figure.Figure)

    expected_label = f"{default_name} (AUC = {display.roc_auc:.2f})"
    assert display.line_.get_label() == expected_label

    expected_pos_label = 1 if pos_label is None else pos_label
    expected_ylabel = f"True Positive Rate (Positive label: {expected_pos_label})"
    expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})"
    assert display.ax_.get_ylabel() == expected_ylabel
    assert display.ax_.get_xlabel() == expected_xlabel
def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
    # check that we can provide the positive label and display the proper
    # statistics
    X, y = load_breast_cancer(return_X_y=True)
    # create a highly imbalanced version of the dataset
    idx_positive = np.flatnonzero(y == 1)
    idx_negative = np.flatnonzero(y == 0)
    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
    X, y = X[idx_selected], y[idx_selected]
    X, y = shuffle(X, y, random_state=42)
    # only use 2 features to make the problem even harder
    X = X[:, :2]
    y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        random_state=0,
    )

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # sanity check to be sure the positive class is classes_[0] and that the
    # class imbalance would make the default positive label misleading
    assert classifier.classes_.tolist() == ["cancer", "not cancer"]

    y_pred = getattr(classifier, response_method)(X_test)
    # we select the corresponding probability columns or reverse the decision
    # function otherwise
    y_pred_cancer = -1 * y_pred if y_pred.ndim == 1 else y_pred[:, 0]
    y_pred_not_cancer = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            classifier,
            X_test,
            y_test,
            pos_label="cancer",
            response_method=response_method,
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y_test,
            y_pred_cancer,
            pos_label="cancer",
        )

    roc_auc_limit = 0.95679

    assert display.roc_auc == pytest.approx(roc_auc_limit)
    assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            classifier,
            X_test,
            y_test,
            response_method=response_method,
            pos_label="not cancer",
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y_test,
            y_pred_not_cancer,
            pos_label="not cancer",
        )

    assert display.roc_auc == pytest.approx(roc_auc_limit)
    assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # interpolate this fold's TPR onto the common FPR grid so the folds
    # can be averaged point-wise
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
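
# A sketch of the usual follow-up: aggregate the per-fold curves into a mean
# ROC with a one-standard-deviation band. Assumes `tprs`, `aucs`, `mean_fpr`
# and `ax` from the loop above, plus `auc` from sklearn.metrics.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the endpoint so the mean curve reaches (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color="b", lw=2,
        label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc))

std_tpr = np.std(tprs, axis=0)
ax.fill_between(mean_fpr,
                np.maximum(mean_tpr - std_tpr, 0),
                np.minimum(mean_tpr + std_tpr, 1),
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")
ax.legend(loc="lower right")
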
# The precision and recall metrics focus on the positive class. However, one
# might be interested in the trade-off between accurately discriminating the
# positive class and accurately discriminating the negative class. The
# statistics used for this are sensitivity and specificity. Sensitivity is
# just another name for recall, while specificity measures the proportion of
# correctly classified samples in the negative class, defined as
# TN / (TN + FP). Similar to the precision-recall curve, sensitivity and
# specificity are generally plotted as a curve called the receiver operating
# characteristic (ROC) curve. Below is such a curve:

# %%
from sklearn.metrics import RocCurveDisplay

disp = RocCurveDisplay.from_estimator(
    classifier, data_test, target_test, pos_label='donated', marker="+"
)
disp = RocCurveDisplay.from_estimator(
    dummy_classifier, data_test, target_test, pos_label='donated',
    color="tab:orange", linestyle="--", ax=disp.ax_
)
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = disp.ax_.set_title("ROC AUC curve")

# %% [markdown]
# This curve was built using the same principle as the precision-recall curve:
# we vary the probability threshold for determining "hard" predictions and
# compute the metrics at each threshold.
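
# %% [markdown]
# A small worked example of sensitivity and specificity as defined above (the
# arrays here are illustrative, not from this notebook):

# %%
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])  # 6 negatives, 4 positives
y_pred = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0])

# for 0/1 labels, confusion_matrix is ordered [[TN, FP], [FN, TP]]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

sensitivity = tp / (tp + fn)  # recall: 3 / (3 + 1) = 0.75
specificity = tn / (tn + fp)  # 4 / (4 + 2) ~= 0.667
print(f"sensitivity={sensitivity:.3f}, specificity={specificity:.3f}")
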
}

X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=1,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# prepare plots
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_roc, name=name)
    DetCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_det, name=name)

ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
ax_det.set_title("Detection Error Tradeoff (DET) curves")

ax_roc.grid(linestyle="--")
ax_det.grid(linestyle="--")

plt.legend()
plt.show()