def test_plot_roc_curve_error_non_binary(pyplot, data): X, y = data clf = DecisionTreeClassifier() clf.fit(X, y) msg = "Estimator should solve a binary classification problem" with pytest.raises(ValueError, match=msg): plot_roc_curve(clf, X, y)
def _make_estimators(X_train, y_train, y_ml_train): # Make estimators that make sense to test various scoring methods sensible_regr = DecisionTreeRegressor(random_state=0) # some of the regressions scorers require strictly positive input. sensible_regr.fit(X_train, y_train + 1) sensible_clf = DecisionTreeClassifier(random_state=0) sensible_clf.fit(X_train, y_train) sensible_ml_clf = DecisionTreeClassifier(random_state=0) sensible_ml_clf.fit(X_train, y_ml_train) return dict([(name, sensible_regr) for name in REGRESSION_SCORERS] + [(name, sensible_clf) for name in CLF_SCORERS] + [(name, sensible_clf) for name in CLUSTER_SCORERS] + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS])
def bench_scikit_tree_classifier(X, Y): """Benchmark with scikit-learn decision tree classifier""" from mrex.tree import DecisionTreeClassifier gc.collect() # start time tstart = datetime.now() clf = DecisionTreeClassifier() clf.fit(X, Y).predict(X) delta = (datetime.now() - tstart) # stop time scikit_classifier_results.append(delta.seconds + delta.microseconds / mu_second)
def test_export_text_errors(): clf = DecisionTreeClassifier(max_depth=2, random_state=0) clf.fit(X, y) err_msg = "max_depth bust be >= 0, given -1" with pytest.raises(ValueError, match=err_msg): export_text(clf, max_depth=-1) err_msg = "feature_names must contain 2 elements, got 1" with pytest.raises(ValueError, match=err_msg): export_text(clf, feature_names=['a']) err_msg = "decimals must be >= 0, given -1" with pytest.raises(ValueError, match=err_msg): export_text(clf, decimals=-1) err_msg = "spacing must be > 0, given 0" with pytest.raises(ValueError, match=err_msg): export_text(clf, spacing=0)
def test_plot_tree_gini(pyplot): # mostly smoke tests # Check correctness of export_graphviz for criterion = gini clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2, criterion="gini", random_state=2) clf.fit(X, y) # Test export code feature_names = ['first feat', 'sepal_width'] nodes = plot_tree(clf, feature_names=feature_names) assert len(nodes) == 3 assert nodes[0].get_text() == ("first feat <= 0.0\ngini = 0.5\n" "samples = 6\nvalue = [3, 3]") assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]" assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
def test_graphviz_errors(): # Check for errors of export_graphviz clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2) # Check not-fitted decision tree error out = StringIO() with pytest.raises(NotFittedError): export_graphviz(clf, out) clf.fit(X, y) # Check if it errors when length of feature_names # mismatches with number of features message = ("Length of feature_names, " "1 does not match number of features, 2") with pytest.raises(ValueError, match=message): export_graphviz(clf, None, feature_names=["a"]) message = ("Length of feature_names, " "3 does not match number of features, 2") with pytest.raises(ValueError, match=message): export_graphviz(clf, None, feature_names=["a", "b", "c"]) # Check error when argument is not an estimator message = "is not an estimator instance" with pytest.raises(TypeError, match=message): export_graphviz(clf.fit(X, y).tree_) # Check class_names error out = StringIO() with pytest.raises(IndexError): export_graphviz(clf, out, class_names=[]) # Check precision error out = StringIO() with pytest.raises(ValueError, match="should be greater or equal"): export_graphviz(clf, out, precision=-1) with pytest.raises(ValueError, match="should be an integer"): export_graphviz(clf, out, precision="1")
def test_thresholded_scorers_multilabel_indicator_data(): # Test that the scorer work with multilabel-indicator format # for multilabel and multi-output multi-class classifier X, y = make_multilabel_classification(allow_unlabeled=False, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # Multi-output multi-class predict_proba clf = DecisionTreeClassifier() clf.fit(X_train, y_train) y_proba = clf.predict_proba(X_test) score1 = get_scorer('roc_auc')(clf, X_test, y_test) score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T) assert_almost_equal(score1, score2) # Multi-output multi-class decision_function # TODO Is there any yet? clf = DecisionTreeClassifier() clf.fit(X_train, y_train) clf._predict_proba = clf.predict_proba clf.predict_proba = None clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)] y_proba = clf.decision_function(X_test) score1 = get_scorer('roc_auc')(clf, X_test, y_test) score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T) assert_almost_equal(score1, score2) # Multilabel predict_proba clf = OneVsRestClassifier(DecisionTreeClassifier()) clf.fit(X_train, y_train) score1 = get_scorer('roc_auc')(clf, X_test, y_test) score2 = roc_auc_score(y_test, clf.predict_proba(X_test)) assert_almost_equal(score1, score2) # Multilabel decision function clf = OneVsRestClassifier(LinearSVC(random_state=0)) clf.fit(X_train, y_train) score1 = get_scorer('roc_auc')(clf, X_test, y_test) score2 = roc_auc_score(y_test, clf.decision_function(X_test)) assert_almost_equal(score1, score2)
from mrex import datasets from mrex.tree import DecisionTreeClassifier from mrex.metrics import zero_one_loss from mrex.ensemble import AdaBoostClassifier n_estimators = 400 # A learning rate of 1. may not be optimal for both SAMME and SAMME.R learning_rate = 1. X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) X_test, y_test = X[2000:], y[2000:] X_train, y_train = X[:2000], y[:2000] dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1) dt_stump.fit(X_train, y_train) dt_stump_err = 1.0 - dt_stump.score(X_test, y_test) dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1) dt.fit(X_train, y_train) dt_err = 1.0 - dt.score(X_test, y_test) ada_discrete = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME") ada_discrete.fit(X_train, y_train) ada_real = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators,
# In the following plot, the maximum effective alpha value is removed, because # it is the trivial tree with only one node. fig, ax = plt.subplots() ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post") ax.set_xlabel("effective alpha") ax.set_ylabel("total impurity of leaves") ax.set_title("Total Impurity vs effective alpha for training set") ############################################################################### # Next, we train a decision tree using the effective alphas. The last value # in ``ccp_alphas`` is the alpha value that prunes the whole tree, # leaving the tree, ``clfs[-1]``, with one node. clfs = [] for ccp_alpha in ccp_alphas: clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) clf.fit(X_train, y_train) clfs.append(clf) print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format( clfs[-1].tree_.node_count, ccp_alphas[-1])) ############################################################################### # For the remainder of this example, we remove the last element in # ``clfs`` and ``ccp_alphas``, because it is the trivial tree with only one # node. Here we show that the number of nodes and tree depth decreases as alpha # increases. clfs = clfs[:-1] ccp_alphas = ccp_alphas[:-1] node_counts = [clf.tree_.node_count for clf in clfs] depth = [clf.tree_.max_depth for clf in clfs] fig, ax = plt.subplots(2, 1)
# Loading some example data iris = datasets.load_iris() X = iris.data[:, [0, 2]] y = iris.target # Training classifiers clf1 = DecisionTreeClassifier(max_depth=4) clf2 = KNeighborsClassifier(n_neighbors=7) clf3 = SVC(gamma=.1, kernel='rbf', probability=True) eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='soft', weights=[2, 1, 2]) clf1.fit(X, y) clf2.fit(X, y) clf3.fit(X, y) eclf.fit(X, y) # Plotting decision regions x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8)) for idx, clf, tt in zip( product([0, 1], [0, 1]), [clf1, clf2, clf3, eclf], ['Decision Tree (depth=4)', 'KNN (k=7)', 'Kernel SVM', 'Soft Voting']):
- the decision path shared by a group of samples. """ import numpy as np from mrex.model_selection import train_test_split from mrex.datasets import load_iris from mrex.tree import DecisionTreeClassifier iris = load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0) estimator.fit(X_train, y_train) # The decision estimator has an attribute called tree_ which stores the entire # tree structure and allows access to low level attributes. The binary tree # tree_ is represented as a number of parallel arrays. The i-th element of each # array holds information about the node `i`. Node 0 is the tree's root. NOTE: # Some of the arrays only apply to either leaves or split nodes, resp. In this # case the values of nodes of the other type are arbitrary! # # Among those arrays, we have: # - left_child, id of the left child of the node # - right_child, id of the right child of the node # - feature, feature used for splitting the node # - threshold, threshold value at the node #
def test_export_text(): clf = DecisionTreeClassifier(max_depth=2, random_state=0) clf.fit(X, y) expected_report = dedent(""" |--- feature_1 <= 0.00 | |--- class: -1 |--- feature_1 > 0.00 | |--- class: 1 """).lstrip() assert export_text(clf) == expected_report # testing that leaves at level 1 are not truncated assert export_text(clf, max_depth=0) == expected_report # testing that the rest of the tree is truncated assert export_text(clf, max_depth=10) == expected_report expected_report = dedent(""" |--- b <= 0.00 | |--- class: -1 |--- b > 0.00 | |--- class: 1 """).lstrip() assert export_text(clf, feature_names=['a', 'b']) == expected_report expected_report = dedent(""" |--- feature_1 <= 0.00 | |--- weights: [3.00, 0.00] class: -1 |--- feature_1 > 0.00 | |--- weights: [0.00, 3.00] class: 1 """).lstrip() assert export_text(clf, show_weights=True) == expected_report expected_report = dedent(""" |- feature_1 <= 0.00 | |- class: -1 |- feature_1 > 0.00 | |- class: 1 """).lstrip() assert export_text(clf, spacing=1) == expected_report X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]] y_l = [-1, -1, -1, 1, 1, 1, 2] clf = DecisionTreeClassifier(max_depth=4, random_state=0) clf.fit(X_l, y_l) expected_report = dedent(""" |--- feature_1 <= 0.00 | |--- class: -1 |--- feature_1 > 0.00 | |--- truncated branch of depth 2 """).lstrip() assert export_text(clf, max_depth=0) == expected_report X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]] reg = DecisionTreeRegressor(max_depth=2, random_state=0) reg.fit(X_mo, y_mo) expected_report = dedent(""" |--- feature_1 <= 0.0 | |--- value: [-1.0, -1.0] |--- feature_1 > 0.0 | |--- value: [1.0, 1.0] """).lstrip() assert export_text(reg, decimals=1) == expected_report assert export_text(reg, decimals=1, show_weights=True) == expected_report X_single = [[-2], [-1], [-1], [1], [1], [2]] reg = DecisionTreeRegressor(max_depth=2, random_state=0) reg.fit(X_single, y_mo) expected_report = dedent(""" |--- first <= 0.0 | |--- value: [-1.0, -1.0] |--- first > 0.0 | |--- value: [1.0, 1.0] """).lstrip() assert export_text(reg, decimals=1, feature_names=['first']) == expected_report assert export_text(reg, decimals=1, show_weights=True, feature_names=['first']) == expected_report
def test_graphviz_toy(): # Check correctness of export_graphviz clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2, criterion="gini", random_state=2) clf.fit(X, y) # Test export code contents1 = export_graphviz(clf, out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box] ;\n' \ '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \ 'value = [3, 3]"] ;\n' \ '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \ '0 -> 1 [labeldistance=2.5, labelangle=45, ' \ 'headlabel="True"] ;\n' \ '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \ '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \ 'headlabel="False"] ;\n' \ '}' assert contents1 == contents2 # Test with feature_names contents1 = export_graphviz(clf, feature_names=["feature0", "feature1"], out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box] ;\n' \ '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \ 'value = [3, 3]"] ;\n' \ '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \ '0 -> 1 [labeldistance=2.5, labelangle=45, ' \ 'headlabel="True"] ;\n' \ '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \ '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \ 'headlabel="False"] ;\n' \ '}' assert contents1 == contents2 # Test with class_names contents1 = export_graphviz(clf, class_names=["yes", "no"], out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box] ;\n' \ '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \ 'value = [3, 3]\\nclass = yes"] ;\n' \ '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n' \ 'class = yes"] ;\n' \ '0 -> 1 [labeldistance=2.5, labelangle=45, ' \ 'headlabel="True"] ;\n' \ '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n' \ 'class = no"] ;\n' \ '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \ 'headlabel="False"] ;\n' \ '}' assert contents1 == contents2 # Test plot_options contents1 = export_graphviz(clf, filled=True, impurity=False, proportion=True, special_characters=True, rounded=True, out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box, style="filled, rounded", color="black", ' \ 'fontname=helvetica] ;\n' \ 'edge [fontname=helvetica] ;\n' \ '0 [label=<X<SUB>0</SUB> ≤ 0.0<br/>samples = 100.0%<br/>' \ 'value = [0.5, 0.5]>, fillcolor="#ffffff"] ;\n' \ '1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, ' \ 'fillcolor="#e58139"] ;\n' \ '0 -> 1 [labeldistance=2.5, labelangle=45, ' \ 'headlabel="True"] ;\n' \ '2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, ' \ 'fillcolor="#399de5"] ;\n' \ '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \ 'headlabel="False"] ;\n' \ '}' assert contents1 == contents2 # Test max_depth contents1 = export_graphviz(clf, max_depth=0, class_names=True, out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box] ;\n' \ '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \ 'value = [3, 3]\\nclass = y[0]"] ;\n' \ '1 [label="(...)"] ;\n' \ '0 -> 1 ;\n' \ '2 [label="(...)"] ;\n' \ '0 -> 2 ;\n' \ '}' assert contents1 == contents2 # Test max_depth with plot_options contents1 = export_graphviz(clf, max_depth=0, filled=True, out_file=None, node_ids=True) contents2 = 'digraph Tree {\n' \ 'node [shape=box, style="filled", color="black"] ;\n' \ '0 [label="node #0\\nX[0] <= 0.0\\ngini = 0.5\\n' \ 'samples = 6\\nvalue = [3, 3]", fillcolor="#ffffff"] ;\n' \ '1 [label="(...)", fillcolor="#C0C0C0"] ;\n' \ '0 -> 1 ;\n' \ '2 [label="(...)", fillcolor="#C0C0C0"] ;\n' \ '0 -> 2 ;\n' \ '}' assert contents1 == contents2 # Test multi-output with weighted samples clf = DecisionTreeClassifier(max_depth=2, min_samples_split=2, criterion="gini", random_state=2) clf = clf.fit(X, y2, sample_weight=w) contents1 = export_graphviz(clf, filled=True, impurity=False, out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box, style="filled", color="black"] ;\n' \ '0 [label="X[0] <= 0.0\\nsamples = 6\\n' \ 'value = [[3.0, 1.5, 0.0]\\n' \ '[3.0, 1.0, 0.5]]", fillcolor="#ffffff"] ;\n' \ '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n' \ '[3, 0, 0]]", fillcolor="#e58139"] ;\n' \ '0 -> 1 [labeldistance=2.5, labelangle=45, ' \ 'headlabel="True"] ;\n' \ '2 [label="X[0] <= 1.5\\nsamples = 3\\n' \ 'value = [[0.0, 1.5, 0.0]\\n' \ '[0.0, 1.0, 0.5]]", fillcolor="#f1bd97"] ;\n' \ '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \ 'headlabel="False"] ;\n' \ '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n' \ '[0, 1, 0]]", fillcolor="#e58139"] ;\n' \ '2 -> 3 ;\n' \ '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n' \ '[0.0, 0.0, 0.5]]", fillcolor="#e58139"] ;\n' \ '2 -> 4 ;\n' \ '}' assert contents1 == contents2 # Test regression output with plot_options clf = DecisionTreeRegressor(max_depth=3, min_samples_split=2, criterion="mse", random_state=2) clf.fit(X, y) contents1 = export_graphviz(clf, filled=True, leaves_parallel=True, out_file=None, rotate=True, rounded=True) contents2 = 'digraph Tree {\n' \ 'node [shape=box, style="filled, rounded", color="black", ' \ 'fontname=helvetica] ;\n' \ 'graph [ranksep=equally, splines=polyline] ;\n' \ 'edge [fontname=helvetica] ;\n' \ 'rankdir=LR ;\n' \ '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \ 'value = 0.0", fillcolor="#f2c09c"] ;\n' \ '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \ 'fillcolor="#ffffff"] ;\n' \ '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \ 'headlabel="True"] ;\n' \ '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \ 'fillcolor="#e58139"] ;\n' \ '0 -> 2 [labeldistance=2.5, labelangle=45, ' \ 'headlabel="False"] ;\n' \ '{rank=same ; 0} ;\n' \ '{rank=same ; 1; 2} ;\n' \ '}' assert contents1 == contents2 # Test classifier with degraded learning set clf = DecisionTreeClassifier(max_depth=3) clf.fit(X, y_degraded) contents1 = export_graphviz(clf, filled=True, out_file=None) contents2 = 'digraph Tree {\n' \ 'node [shape=box, style="filled", color="black"] ;\n' \ '0 [label="gini = 0.0\\nsamples = 6\\nvalue = 6.0", ' \ 'fillcolor="#ffffff"] ;\n' \ '}'