def test_ovr_pipeline():
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
def _make_estimators(X_train, y_train, y_ml_train):
    # Make estimators that make sense to test various scoring methods
    sensible_regr = DecisionTreeRegressor(random_state=0)
    # some of the regression scorers require strictly positive input.
    sensible_regr.fit(X_train, y_train + 1)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    return dict([(name, sensible_regr) for name in REGRESSION_SCORERS] +
                [(name, sensible_clf) for name in CLF_SCORERS] +
                [(name, sensible_clf) for name in CLUSTER_SCORERS] +
                [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS])
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def test_set_random_state():
    lda = LinearDiscriminantAnalysis()
    tree = DecisionTreeClassifier()
    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert tree.random_state == 3
def test_plot_roc_curve_error_non_binary(pyplot, data):
    X, y = data
    clf = DecisionTreeClassifier()
    clf.fit(X, y)

    msg = "Estimator should solve a binary classification problem"
    with pytest.raises(ValueError, match=msg):
        plot_roc_curve(clf, X, y)
def test_ovr_coef_exceptions():
    # Not fitted exception!
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    # lambda is needed because we don't want coef_ to be evaluated right away
    assert_raises(ValueError, lambda x: ovr.coef_, None)

    # Doesn't have coef_ exception!
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_raises(AttributeError, lambda x: ovr.coef_, None)
def test_pickle_version_warning_is_not_raised_with_matching_version():
    iris = datasets.load_iris()
    tree = DecisionTreeClassifier().fit(iris.data, iris.target)
    tree_pickle = pickle.dumps(tree)
    assert b"version" in tree_pickle
    tree_restored = assert_no_warnings(pickle.loads, tree_pickle)

    # test that we can predict with the restored decision tree classifier
    score_of_original = tree.score(iris.data, iris.target)
    score_of_restored = tree_restored.score(iris.data, iris.target)
    assert score_of_original == score_of_restored
def test_thresholded_scorers_multilabel_indicator_data():
    # Test that the scorers work with multilabel-indicator format
    # for multilabel and multi-output multi-class classifiers
    X, y = make_multilabel_classification(allow_unlabeled=False,
                                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Multi-output multi-class predict_proba
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multi-output multi-class decision_function
    # TODO Is there any yet?
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf._predict_proba = clf.predict_proba
    clf.predict_proba = None
    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]

    y_proba = clf.decision_function(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multilabel predict_proba
    clf = OneVsRestClassifier(DecisionTreeClassifier())
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
    assert_almost_equal(score1, score2)

    # Multilabel decision function
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    assert_almost_equal(score1, score2)
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
def bench_scikit_tree_classifier(X, Y):
    """Benchmark with scikit-learn decision tree classifier"""
    from mrex.tree import DecisionTreeClassifier

    gc.collect()

    # start time
    tstart = datetime.now()
    clf = DecisionTreeClassifier()
    clf.fit(X, Y).predict(X)
    delta = (datetime.now() - tstart)
    # stop time

    scikit_classifier_results.append(
        delta.seconds + delta.microseconds / mu_second)
def test_export_text_errors():
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    # note: "bust" below matches the exact (misspelled) message raised by
    # export_text, so the test must keep it verbatim
    err_msg = "max_depth bust be >= 0, given -1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, max_depth=-1)
    err_msg = "feature_names must contain 2 elements, got 1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, feature_names=['a'])
    err_msg = "decimals must be >= 0, given -1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, decimals=-1)
    err_msg = "spacing must be > 0, given 0"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, spacing=0)
def test_plot_tree_gini(pyplot):
    # mostly smoke tests
    # Check correctness of plot_tree for criterion = gini
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    feature_names = ['first feat', 'sepal_width']
    nodes = plot_tree(clf, feature_names=feature_names)
    assert len(nodes) == 3
    assert nodes[0].get_text() == ("first feat <= 0.0\ngini = 0.5\n"
                                   "samples = 6\nvalue = [3, 3]")
    assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
    assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def test_precision():
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None,
                                       precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # are reported with exactly the number of digits requested via
            # the ``precision`` argument of export_graphviz, so we check them
            # with strict equality. The ``value`` entries may be reported
            # with fewer digits, so only a less-or-equal comparison is done
            # for them.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert (len(search(r"\.\d+", finding.group()).group()) <=
                        precision + 1)

            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"

            for finding in finditer(pattern, dot_data):
                assert (len(search(r"\.\d+", finding.group()).group()) ==
                        precision + 1)

            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert (len(search(r"\.\d+", finding.group()).group()) ==
                        precision + 1)
def test_score_sample_weight():
    rng = np.random.RandomState(0)

    # test both ClassifierMixin and RegressorMixin
    estimators = [DecisionTreeClassifier(max_depth=2),
                  DecisionTreeRegressor(max_depth=2)]
    sets = [datasets.load_iris(),
            datasets.load_boston()]

    for est, ds in zip(estimators, sets):
        est.fit(ds.data, ds.target)
        # generate random sample weights
        sample_weight = rng.randint(1, 10, size=len(ds.target))
        # check that the score with and without sample weights are different
        assert (est.score(ds.data, ds.target) !=
                est.score(ds.data, ds.target,
                          sample_weight=sample_weight)), (
            "Unweighted and weighted scores are unexpectedly equal")
def test_graphviz_errors():
    # Check for errors of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # Check not-fitted decision tree error
    out = StringIO()
    with pytest.raises(NotFittedError):
        export_graphviz(clf, out)

    clf.fit(X, y)

    # Check if it errors when length of feature_names
    # mismatches with number of features
    message = ("Length of feature_names, "
               "1 does not match number of features, 2")
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a"])

    message = ("Length of feature_names, "
               "3 does not match number of features, 2")
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a", "b", "c"])

    # Check error when argument is not an estimator
    message = "is not an estimator instance"
    with pytest.raises(TypeError, match=message):
        export_graphviz(clf.fit(X, y).tree_)

    # Check class_names error
    out = StringIO()
    with pytest.raises(IndexError):
        export_graphviz(clf, out, class_names=[])

    # Check precision error
    out = StringIO()
    with pytest.raises(ValueError, match="should be greater or equal"):
        export_graphviz(clf, out, precision=-1)
    with pytest.raises(ValueError, match="should be an integer"):
        export_graphviz(clf, out, precision="1")
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test


ESTIMATORS = {
    "dummy": DummyClassifier(),
    'CART': DecisionTreeClassifier(),
    'ExtraTrees': ExtraTreesClassifier(),
    'RandomForest': RandomForestClassifier(),
    'Nystroem-SVM': make_pipeline(
        Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
    'SampledRBF-SVM': make_pipeline(
        RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
    'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1,
                                                 C=1e4),
    'LogisticRegression-SAGA': LogisticRegression(solver='saga', tol=1e-1,
                                                  C=1e4),
    'MultilayerPerceptron': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver='sgd',
        learning_rate_init=0.2, momentum=0.9, verbose=1, tol=1e-4,
        random_state=1),
    'MLP-adam': MLPClassifier(
    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, X_test, y_train, y_test


ESTIMATORS = {
    'GBRT': GradientBoostingClassifier(n_estimators=250),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'CART': DecisionTreeClassifier(min_samples_split=5),
    'SGD': SGDClassifier(alpha=0.001),
    'GaussianNB': GaussianNB(),
    'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False,
                           tol=1e-3),
    'SAG': LogisticRegression(solver='sag', max_iter=2, C=1000)
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--classifiers', nargs="+", choices=ESTIMATORS,
# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02

# Load data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    plt.xlabel(iris.feature_names[pair[0]])
###############################################################################
# Total impurity of leaves vs effective alphas of pruned tree
# ---------------------------------------------------------------
# Minimal cost complexity pruning recursively finds the node with the "weakest
# link". The weakest link is characterized by an effective alpha, where the
# nodes with the smallest effective alpha are pruned first. To get an idea of
# what values of ``ccp_alpha`` could be appropriate, scikit-learn provides
# :func:`DecisionTreeClassifier.cost_complexity_pruning_path` that returns the
# effective alphas and the corresponding total leaf impurities at each step of
# the pruning process. As alpha increases, more of the tree is pruned, which
# increases the total impurity of its leaves.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

###############################################################################
# In the following plot, the maximum effective alpha value is removed, because
# it is the trivial tree with only one node.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

###############################################################################
# Next, we train a decision tree using the effective alphas. The last value
# in ``ccp_alphas`` is the alpha value that prunes the whole tree, leaving a
# trivial tree with only one node.
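###############################################################################
# A minimal sketch of that fit-per-alpha loop (assuming the same
# ``X_train``/``y_train`` split as above; ``ccp_alpha`` is the
# cost-complexity pruning parameter of ``DecisionTreeClassifier``):
clfs = []
for ccp_alpha in ccp_alphas:
    # one tree per candidate alpha, pruned more aggressively as alpha grows
    pruned_clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    pruned_clf.fit(X_train, y_train)
    clfs.append(pruned_clf)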
from mrex.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
import numpy as np
import matplotlib.pyplot as plt

from mrex import datasets
from mrex.tree import DecisionTreeClassifier
from mrex.neighbors import KNeighborsClassifier
from mrex.svm import SVC
from mrex.ensemble import VotingClassifier

# Loading some example data
iris = datasets.load_iris()
X = iris.data[:, [0, 2]]
y = iris.target

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(gamma=.1, kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
                                    ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2])

clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# Plotting decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# Running ``GridSearchCV`` using multiple evaluation metrics
# ----------------------------------------------------------
#

X, y = make_hastie_10_2(n_samples=8000, random_state=42)

# The scorers can be either one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# Setting ``refit='AUC'`` refits an estimator on the whole dataset with the
# parameter setting that has the best cross-validated AUC score.
# That estimator is made available at ``gs.best_estimator_`` along with
# parameters like ``gs.best_score_``, ``gs.best_params_`` and
# ``gs.best_index_``
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  param_grid={'min_samples_split': range(2, 403, 10)},
                  scoring=scoring, refit='AUC', return_train_score=True)
gs.fit(X, y)
results = gs.cv_results_

###############################################################################
# Plotting the result
# -------------------

plt.figure(figsize=(13, 13))
plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
          fontsize=16)
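###############################################################################
# The plot indexes ``results`` with per-metric keys: with a dict of scorers,
# every ``cv_results_`` entry is suffixed with the scorer name (e.g.
# ``mean_test_AUC``). A minimal sketch of reading those summaries back out,
# using the names from the ``scoring`` dict above:
for scorer in sorted(scoring):
    # best cross-validated mean test score for this metric
    print(scorer, results['mean_test_%s' % scorer].max())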
def test_set_params_updates_valid_params():
    # Check that set_params tries to set SVC().C, not
    # DecisionTreeClassifier().C
    gscv = GridSearchCV(DecisionTreeClassifier(), {})
    gscv.set_params(estimator=SVC(), estimator__C=42.0)
    assert gscv.estimator.C == 42.0
from mrex.tree import DecisionTreeClassifier

# Parameters
n_classes = 3
n_estimators = 30
cmap = plt.cm.RdYlBu
plot_step = 0.02  # fine step width for decision surface contours
plot_step_coarser = 0.5  # step widths for coarse classifier guesses
RANDOM_SEED = 13  # fix the seed on each iteration

# Load data
iris = load_iris()

plot_idx = 1

models = [DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(n_estimators=n_estimators),
          ExtraTreesClassifier(n_estimators=n_estimators),
          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=n_estimators)]

for pair in ([0, 1], [0, 2], [2, 3]):
    for model in models:
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
X1, y1 = make_gaussian_quantiles(cov=2.,
                                 n_samples=200, n_features=2,
                                 n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
                                 n_samples=300, n_features=2,
                                 n_classes=2, random_state=1)
X = np.concatenate((X1, X2))
y = np.concatenate((y1, -y2 + 1))

# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)

bdt.fit(X, y)

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

plt.figure(figsize=(10, 5))

# Plot the decision boundaries
plt.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
from mrex import datasets
from mrex.tree import DecisionTreeClassifier
from mrex.metrics import zero_one_loss
from mrex.ensemble import AdaBoostClassifier

n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

X_test, y_test = X[2000:], y[2000:]
X_train, y_train = X[:2000], y[:2000]

dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)
dt.fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)

ada_discrete = AdaBoostClassifier(base_estimator=dt_stump,
                                  learning_rate=learning_rate,
                                  n_estimators=n_estimators,
                                  algorithm="SAMME")
ada_discrete.fit(X_train, y_train)

ada_real = AdaBoostClassifier(base_estimator=dt_stump,
                              learning_rate=learning_rate,
          [('lr', LinearRegression()),
           ('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
    # check that an estimator can be set to None while passing sample weights
    # regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    voter.set_params(lr=drop)
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    y_pred = voter.predict(X)
    assert y_pred.shape == y.shape


@pytest.mark.parametrize(
    "estimator",
    [VotingRegressor(
        estimators=[('lr', LinearRegression()),
                    ('tree', DecisionTreeRegressor(random_state=0))]),
     VotingClassifier(
         estimators=[('lr', LogisticRegression(random_state=0)),
                     ('tree', DecisionTreeClassifier(random_state=0))])],
    ids=['VotingRegressor', 'VotingClassifier']
)
def test_check_estimators_voting_estimator(estimator):
    # FIXME: to be removed when meta-estimators can specify their own
    # testing parameters (for required parameters).
    check_estimator(estimator)
    check_no_attributes_set_in_init(estimator.__class__.__name__, estimator)
from mrex.datasets import make_gaussian_quantiles
from mrex.ensemble import AdaBoostClassifier
from mrex.metrics import accuracy_score
from mrex.tree import DecisionTreeClassifier

X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,
                               n_classes=3, random_state=1)

n_split = 3000

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]

bdt_real = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1)

bdt_discrete = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

for real_test_predict, discrete_train_predict in zip(
- the rules that were used to predict a sample;
- the decision path shared by a group of samples.

"""
import numpy as np

from mrex.model_selection import train_test_split
from mrex.datasets import load_iris
from mrex.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
estimator.fit(X_train, y_train)

# The decision estimator has an attribute called tree_ which stores the
# entire tree structure and allows access to low level attributes. The binary
# tree tree_ is represented as a number of parallel arrays. The i-th element
# of each array holds information about the node `i`. Node 0 is the tree's
# root. NOTE: Some of the arrays only apply to either leaves or split nodes,
# respectively. In this case the values of nodes of the other type are
# arbitrary!
#
# Among those arrays, we have:
#   - left_child, id of the left child of the node
#   - right_child, id of the right child of the node
#   - feature, feature used for splitting the node
#   - threshold, threshold value at the node
#
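# As a minimal sketch of walking those parallel arrays (using the actual
# ``tree_`` attribute names ``children_left``, ``children_right``,
# ``feature`` and ``threshold``; a node is a leaf when its two children ids
# are equal, i.e. both are the -1 sentinel), the split rules can be printed
# with a recursive traversal:


def print_rules(tree_, node=0, depth=0):
    indent = "  " * depth
    if tree_.children_left[node] == tree_.children_right[node]:
        # leaf node: both children ids are the -1 sentinel
        print("%sleaf node %d" % (indent, node))
        return
    # split node: report the test, then recurse into both subtrees
    print("%snode %d: go left if X[:, %d] <= %.3f"
          % (indent, node, tree_.feature[node], tree_.threshold[node]))
    print_rules(tree_, tree_.children_left[node], depth + 1)
    print_rules(tree_, tree_.children_right[node], depth + 1)


print_rules(estimator.tree_)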