def test_multioutput_regressor_error(pyplot):
    """Check that multioutput regressor raises correct error."""
    X = np.asarray([[0, 1], [1, 2]])
    y = np.asarray([[0, 1], [4, 1]])
    tree = DecisionTreeRegressor().fit(X, y)
    with pytest.raises(ValueError, match="Multi-output regressors are not supported"):
        DecisionBoundaryDisplay.from_estimator(tree, X)
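# These test excerpts are shown without the module header they normally sit
# under. A minimal sketch of the imports they rely on is given below; the exact
# import block of the original test module is assumed, not quoted.
import warnings

import numpy as np
import pytest
from numpy.testing import assert_allclose

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import (
    load_iris,
    make_classification,
    make_multilabel_classification,
)
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor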
def test_multiclass_error(pyplot, response_method):
    """Check multiclass errors."""
    X, y = make_classification(n_classes=3, n_informative=3, random_state=0)
    X = X[:, [0, 1]]
    lr = LogisticRegression().fit(X, y)

    msg = (
        "Multiclass classifiers are only supported when response_method is 'predict' or"
        " 'auto'"
    )
    with pytest.raises(ValueError, match=msg):
        DecisionBoundaryDisplay.from_estimator(lr, X, response_method=response_method)
def test_error_bad_response(pyplot, response_method, msg):
    """Check errors for bad response."""

    class MyClassifier(BaseEstimator, ClassifierMixin):
        def fit(self, X, y):
            self.fitted_ = True
            self.classes_ = [0, 1]
            return self

    clf = MyClassifier().fit(X, y)

    with pytest.raises(ValueError, match=msg):
        DecisionBoundaryDisplay.from_estimator(clf, X, response_method=response_method)
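# Several of these excerpted tests reference names defined outside the test
# functions themselves: module-level data ``X``/``y``, a ``fitted_clf`` fixture,
# and parametrized arguments such as ``response_method``, ``plot_method``,
# ``msg``, ``kwargs`` and ``error_msg`` (supplied by ``@pytest.mark.parametrize``
# decorators omitted from the excerpts). A plausible sketch of that shared setup
# is given below; the exact dataset parameters are an assumption.
X, y = make_classification(
    n_informative=1,
    n_redundant=1,
    n_clusters_per_class=1,
    n_features=2,
    random_state=42,
)


@pytest.fixture(scope="module")
def fitted_clf():
    return LogisticRegression().fit(X, y)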
def test_multilabel_classifier_error(pyplot, response_method):
    """Check that multilabel classifier raises correct error."""
    X, y = make_multilabel_classification(random_state=0)
    X = X[:, :2]
    tree = DecisionTreeClassifier().fit(X, y)

    msg = "Multi-label and multi-output multi-class classifiers are not supported"
    with pytest.raises(ValueError, match=msg):
        DecisionBoundaryDisplay.from_estimator(
            tree,
            X,
            response_method=response_method,
        )
def test_multi_output_multi_class_classifier_error(pyplot, response_method):
    """Check that multi-output multi-class classifier raises correct error."""
    X = np.asarray([[0, 1], [1, 2]])
    y = np.asarray([["tree", "cat"], ["cat", "tree"]])
    tree = DecisionTreeClassifier().fit(X, y)

    msg = "Multi-label and multi-output multi-class classifiers are not supported"
    with pytest.raises(ValueError, match=msg):
        DecisionBoundaryDisplay.from_estimator(
            tree,
            X,
            response_method=response_method,
        )
def test_dataframe_support(pyplot):
    """Check that passing a dataframe at fit and to the Display does not
    raise warnings.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/23311
    """
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(X, columns=["col_x", "col_y"])
    estimator = LogisticRegression().fit(df, y)

    with warnings.catch_warnings():
        # no warnings linked to feature names validation should be raised
        warnings.simplefilter("error", UserWarning)
        DecisionBoundaryDisplay.from_estimator(estimator, df, response_method="predict")
def test_string_target(pyplot):
    """Check that decision boundary works with classifiers trained on string labels."""
    iris = load_iris()
    X = iris.data[:, [0, 1]]

    # Use strings as target
    y = iris.target_names[iris.target]
    log_reg = LogisticRegression().fit(X, y)

    # Does not raise
    DecisionBoundaryDisplay.from_estimator(
        log_reg,
        X,
        grid_resolution=5,
        response_method="predict",
    )
def test_multiclass(pyplot, response_method):
    """Check multiclass gives expected results."""
    grid_resolution = 10
    eps = 1.0
    X, y = make_classification(n_classes=3, n_informative=3, random_state=0)
    X = X[:, [0, 1]]
    lr = LogisticRegression(random_state=0).fit(X, y)
    disp = DecisionBoundaryDisplay.from_estimator(
        lr, X, response_method=response_method, grid_resolution=grid_resolution, eps=eps
    )

    x0_min, x0_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    x1_min, x1_max = X[:, 1].min() - eps, X[:, 1].max() + eps
    xx0, xx1 = np.meshgrid(
        np.linspace(x0_min, x0_max, grid_resolution),
        np.linspace(x1_min, x1_max, grid_resolution),
    )
    response = lr.predict(np.c_[xx0.ravel(), xx1.ravel()])
    assert_allclose(disp.response, response.reshape(xx0.shape))
    assert_allclose(disp.xx0, xx0)
    assert_allclose(disp.xx1, xx1)
def test_decision_boundary_display(pyplot, fitted_clf, response_method, plot_method):
    """Check that decision boundary is correct."""
    fig, ax = pyplot.subplots()
    eps = 2.0
    disp = DecisionBoundaryDisplay.from_estimator(
        fitted_clf,
        X,
        grid_resolution=5,
        response_method=response_method,
        plot_method=plot_method,
        eps=eps,
        ax=ax,
    )
    assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet)
    assert disp.ax_ == ax
    assert disp.figure_ == fig

    x0, x1 = X[:, 0], X[:, 1]
    x0_min, x0_max = x0.min() - eps, x0.max() + eps
    x1_min, x1_max = x1.min() - eps, x1.max() + eps

    assert disp.xx0.min() == pytest.approx(x0_min)
    assert disp.xx0.max() == pytest.approx(x0_max)
    assert disp.xx1.min() == pytest.approx(x1_min)
    assert disp.xx1.max() == pytest.approx(x1_max)

    fig2, ax2 = pyplot.subplots()
    # change plotting method for second plot
    disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto")
    assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh)
    assert disp.ax_ == ax2
    assert disp.figure_ == fig2
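# For context, a minimal standalone sketch of the API exercised by these tests:
# fit a classifier on two features, draw its decision boundary, and overlay the
# training points. The dataset and estimator choices here are illustrative only,
# not taken from the test module.
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X_demo = iris.data[:, :2]  # keep two features so the boundary can be drawn in 2D
y_demo = iris.target

clf_demo = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)

fig, ax = plt.subplots()
disp = DecisionBoundaryDisplay.from_estimator(
    clf_demo,
    X_demo,
    response_method="predict",
    plot_method="contourf",
    alpha=0.5,
    ax=ax,
)
disp.ax_.scatter(X_demo[:, 0], X_demo[:, 1], c=y_demo, edgecolor="k")
plt.show()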
def test_display_plot_input_error(pyplot, fitted_clf):
    """Check input validation for `plot`."""
    disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, X, grid_resolution=5)

    with pytest.raises(ValueError, match="plot_method must be 'contourf'"):
        disp.plot(plot_method="hello_world")
def test_dataframe_labels_used(pyplot, fitted_clf):
    """Check that column names are used for pandas."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(X, columns=["col_x", "col_y"])

    # pandas column names are used by default
    _, ax = pyplot.subplots()
    disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, df, ax=ax)
    assert ax.get_xlabel() == "col_x"
    assert ax.get_ylabel() == "col_y"

    # second call to plot will have the names
    fig, ax = pyplot.subplots()
    disp.plot(ax=ax)
    assert ax.get_xlabel() == "col_x"
    assert ax.get_ylabel() == "col_y"

    # axes with a label will not get overridden
    fig, ax = pyplot.subplots()
    ax.set(xlabel="hello", ylabel="world")
    disp.plot(ax=ax)
    assert ax.get_xlabel() == "hello"
    assert ax.get_ylabel() == "world"

    # labels get overridden only if provided to the `plot` method
    disp.plot(ax=ax, xlabel="overwritten_x", ylabel="overwritten_y")
    assert ax.get_xlabel() == "overwritten_x"
    assert ax.get_ylabel() == "overwritten_y"

    # labels do not get inferred if provided to `from_estimator`
    _, ax = pyplot.subplots()
    disp = DecisionBoundaryDisplay.from_estimator(
        fitted_clf, df, ax=ax, xlabel="overwritten_x", ylabel="overwritten_y"
    )
    assert ax.get_xlabel() == "overwritten_x"
    assert ax.get_ylabel() == "overwritten_y"
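# The fragments that follow are excerpts from standalone plotting scripts rather
# than tests, and each omits its own header. A representative (assumed, not
# verbatim) set of imports covering the first of these fragments looks like this:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn import datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.neighbors import NearestCentroid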
iris = datasets.load_iris()

# we only take the first two features. We could avoid this ugly
# slicing by using a two-dim dataset
X = iris.data[:, :2]
y = iris.target

# Create color maps
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ListedColormap(["darkorange", "c", "darkblue"])

for shrinkage in [None, 0.2]:
    # we create an instance of the Nearest Centroid classifier and fit the data.
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(shrinkage, np.mean(y == y_pred))

    _, ax = plt.subplots()
    DecisionBoundaryDisplay.from_estimator(
        clf, X, cmap=cmap_light, ax=ax, response_method="predict"
    )

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
    plt.title("3-Class classification (shrink_threshold=%r)" % shrinkage)
    plt.axis("tight")

plt.show()
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(gamma=0.1, kernel="rbf", probability=True)
eclf = VotingClassifier(
    estimators=[("dt", clf1), ("knn", clf2), ("svc", clf3)],
    voting="soft",
    weights=[2, 1, 2],
)

clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# Plotting decision regions
f, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8))
for idx, clf, tt in zip(
    product([0, 1], [0, 1]),
    [clf1, clf2, clf3, eclf],
    ["Decision Tree (depth=4)", "KNN (k=7)", "Kernel SVM", "Soft Voting"],
):
    DecisionBoundaryDisplay.from_estimator(
        clf, X, alpha=0.4, ax=axarr[idx[0], idx[1]], response_method="predict"
    )
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    axarr[idx[0], idx[1]].set_title(tt)

plt.show()
titles = (
    "SVC with linear kernel",
    "LinearSVC (linear kernel)",
    "SVC with RBF kernel",
    "SVC with polynomial (degree 3) kernel",
)

# Set-up 2x2 grid for plotting.
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]

for clf, title, ax in zip(models, titles, sub.flatten()):
    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="predict",
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=ax,
        xlabel=iris.feature_names[0],
        ylabel=iris.feature_names[1],
    )
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()
# obtain the margin distances through the decision function
decision_function = clf.decision_function(X)
# the decision function can also be computed manually:
# decision_function = np.dot(X, clf.coef_[0]) + clf.intercept_[0]

# The support vectors are the samples that lie within the margin
# boundaries, whose size is conventionally constrained to 1
support_vector_indices = np.where(np.abs(decision_function) <= 1 + 1e-15)[0]
support_vectors = X[support_vector_indices]

plt.subplot(1, 2, i + 1)
plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired)
ax = plt.gca()
DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    ax=ax,
    grid_resolution=50,
    plot_method="contour",
    colors="k",
    levels=[-1, 0, 1],
    alpha=0.5,
    linestyles=["--", "-", "--"],
)
plt.scatter(
    support_vectors[:, 0],
    support_vectors[:, 1],
    s=100,
    linewidth=1,
    facecolors="none",
    edgecolors="k",
)
plt.title("C=" + str(C))
plt.tight_layout()
# Generate new samples and plot them along with the original dataset
X_new, y_new = make_blobs(
    n_samples=10, centers=[(-7, -1), (-2, 4), (3, 6)], random_state=RANDOM_STATE
)

plt.subplot(132)
plot_scatter(X, cluster_labels)
plot_scatter(X_new, "black", 1)
plt.title("Unknown instances")

# Declare the inductive learning model that will be used to
# predict cluster membership for unknown instances
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
inductive_learner = InductiveClusterer(clusterer, classifier).fit(X)

probable_clusters = inductive_learner.predict(X_new)

ax = plt.subplot(133)
plot_scatter(X, cluster_labels)
plot_scatter(X_new, probable_clusters)

# Plotting decision regions
DecisionBoundaryDisplay.from_estimator(
    inductive_learner, X, response_method="predict", alpha=0.4, ax=ax
)
plt.title("Classify unknown instances")

plt.show()
("nca", NeighborhoodComponentsAnalysis()), ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)), ]), ] for name, clf in zip(names, classifiers): clf.fit(X_train, y_train) score = clf.score(X_test, y_test) _, ax = plt.subplots() DecisionBoundaryDisplay.from_estimator( clf, X, cmap=cmap_light, alpha=0.8, ax=ax, response_method="predict", plot_method="pcolormesh", shading="auto", ) # Plot also the training and testing points plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor="k", s=20) plt.title("{} (k = {})".format(name, n_neighbors)) plt.text( 0.9, 0.1, "{:.2f}".format(score), size=15, ha="center", va="center",
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

# Create an instance of Logistic Regression Classifier and fit the data.
logreg = LogisticRegression(C=1e5)
logreg.fit(X, Y)

_, ax = plt.subplots(figsize=(4, 3))
DecisionBoundaryDisplay.from_estimator(
    logreg,
    X,
    cmap=plt.cm.Paired,
    ax=ax,
    response_method="predict",
    plot_method="pcolormesh",
    shading="auto",
    xlabel="Sepal length",
    ylabel="Sepal width",
    eps=0.5,
)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors="k", cmap=plt.cm.Paired)

plt.xticks(())
plt.yticks(())

plt.show()
# Create color maps
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ["darkorange", "c", "darkblue"]

for weights in ["uniform", "distance"]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    _, ax = plt.subplots()
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=cmap_light,
        ax=ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=iris.feature_names[0],
        ylabel=iris.feature_names[1],
        shading="auto",
    )

    # Plot also the training points
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=iris.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
    )
)
prevalence = y.mean()
populations["prevalence"].append(prevalence)
populations["X"].append(X)
populations["y"].append(y)

# down-sample for plotting
rng = np.random.RandomState(1)
plot_indices = rng.choice(np.arange(X.shape[0]), size=500, replace=True)
X_plot, y_plot = X[plot_indices], y[plot_indices]

# plot fixed decision boundary of base model with varying prevalence
disp = DecisionBoundaryDisplay.from_estimator(
    estimator,
    X_plot,
    response_method="predict",
    alpha=0.5,
    ax=ax,
)
scatter = disp.ax_.scatter(X_plot[:, 0], X_plot[:, 1], c=y_plot, edgecolor="k")
disp.ax_.set_title(f"prevalence = {y_plot.mean():.2f}")
disp.ax_.legend(*scatter.legend_elements())

# %%
# We define a function for bootstrapping.
def scoring_on_bootstrap(estimator, X, y, rng, n_bootstrap=100):
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# standardize
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std

clf = SGDClassifier(alpha=0.001, max_iter=100).fit(X, y)
ax = plt.gca()
DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    cmap=plt.cm.Paired,
    ax=ax,
    response_method="predict",
    xlabel=iris.feature_names[0],
    ylabel=iris.feature_names[1],
)
plt.axis("tight")

# Plot also the training points
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(
        X[idx, 0],
        X[idx, 1],
        c=color,
        label=iris.target_names[i],
        cmap=plt.cm.Paired,
for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            cmap=plt.cm.RdYlBu,
            edgecolor="black",
def my_kernel(X, Y):
    """
    We create a custom kernel:

                 (2  0)
    k(X, Y) = X  (    ) Y.T
                 (0  1)
    """
    M = np.array([[2, 0], [0, 1.0]])
    return np.dot(np.dot(X, M), Y.T)


h = 0.02  # step size in the mesh

# we create an instance of SVM and fit our data.
clf = svm.SVC(kernel=my_kernel)
clf.fit(X, Y)

ax = plt.gca()
DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    cmap=plt.cm.Paired,
    ax=ax,
    response_method="predict",
    plot_method="pcolormesh",
    shading="auto",
)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors="k")
plt.title("3-Class classification using Support Vector Machine with custom kernel")
plt.axis("tight")
plt.show()
def test_input_validation_errors(pyplot, kwargs, error_msg, fitted_clf):
    """Check input validation in `from_estimator`."""
    with pytest.raises(ValueError, match=error_msg):
        DecisionBoundaryDisplay.from_estimator(fitted_clf, X, **kwargs)
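# ``kwargs`` and ``error_msg`` above are injected by a ``@pytest.mark.parametrize``
# decorator that is not part of this excerpt. An illustrative (assumed, not
# verbatim) set of cases pairs invalid inputs with deliberately loose regexes:
input_validation_cases = [
    ({"plot_method": "hello_world"}, "plot_method"),  # unknown plotting function
    ({"grid_resolution": 1}, "grid_resolution"),  # mesh needs more than one point
]
# which would be applied as:
# @pytest.mark.parametrize("kwargs, error_msg", input_validation_cases)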
transformation = [[0.4, 0.2], [-0.4, 1.2]]
X = np.dot(X, transformation)

for multi_class in ("multinomial", "ovr"):
    clf = LogisticRegression(
        solver="sag", max_iter=100, random_state=42, multi_class=multi_class
    ).fit(X, y)

    # print the training scores
    print("training score : %.3f (%s)" % (clf.score(X, y), multi_class))

    _, ax = plt.subplots()
    DecisionBoundaryDisplay.from_estimator(
        clf, X, response_method="predict", cmap=plt.cm.Paired, ax=ax
    )
    plt.title("Decision surface of LogisticRegression (%s)" % multi_class)
    plt.axis("tight")

    # Plot also the training points
    colors = "bry"
    for i, color in zip(clf.classes_, colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired, edgecolor="black", s=20
        )
bdt.fit(X, y)

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

plt.figure(figsize=(10, 5))

# Plot the decision boundaries
ax = plt.subplot(121)
disp = DecisionBoundaryDisplay.from_estimator(
    bdt,
    X,
    cmap=plt.cm.Paired,
    response_method="predict",
    ax=ax,
    xlabel="x",
    ylabel="y",
)
x_min, x_max = disp.xx0.min(), disp.xx0.max()
y_min, y_max = disp.xx1.min(), disp.xx1.max()
plt.axis("tight")

# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(y == i)
    plt.scatter(
        X[idx, 0],
        X[idx, 1],
        c=c,
ax.scatter(
    X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_xticks(())
ax.set_yticks(())
i += 1

# iterate over classifiers
for name, clf in zip(names, classifiers):
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    DecisionBoundaryDisplay.from_estimator(clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5)

    # Plot the training points
    ax.scatter(
        X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
    )
    # Plot the testing points
    ax.scatter(
        X_test[:, 0],
        X_test[:, 1],
        c=y_test,
        cmap=cm_bright,
        edgecolors="k",
        alpha=0.6,
    )