def test_importances():
    """Check the feature importances exposed by the classifier trees."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    for name, Tree in CLF_TREES.items():
        failure = "Failed with {0}".format(name)

        estimator = Tree(random_state=0)
        estimator.fit(X, y)
        scores = estimator.feature_importances_
        n_above_bar = np.sum(scores > 0.1)

        # One importance per feature, and exactly three of them above 0.1
        # (the data set is generated with n_informative=3).
        assert_equal(scores.shape[0], 10, failure)
        assert_equal(n_above_bar, 3, failure)

        # The transformed data must keep at least one column, but fewer
        # columns than the original X.
        X_reduced = estimator.transform(X, threshold="mean")
        n_kept = X_reduced.shape[1]
        assert_less(0, n_kept, failure)
        assert_less(n_kept, X.shape[1], failure)

    # Check on iris that importances are the same for all builders:
    # the second tree only differs by being given max_leaf_nodes.
    default_tree = DecisionTreeClassifier(random_state=0)
    default_tree.fit(iris.data, iris.target)

    leaf_capped_tree = DecisionTreeClassifier(random_state=0,
                                              max_leaf_nodes=len(iris.data))
    leaf_capped_tree.fit(iris.data, iris.target)

    assert_array_equal(default_tree.feature_importances_,
                       leaf_capped_tree.feature_importances_)
def test_importances_variance_equal_mse():
    """Check that variance, gini and mse agree for a binary output variable.

    Four estimators are fitted on the same training split: classifiers
    using the "variance" and "gini" criteria, a regressor using "mse", and
    a "gini" classifier given an ``IdentityProjection`` output transformer.
    Their tree structure and feature importances must agree; the
    classifiers must additionally agree on the values stored in their
    leaves.
    """
    from sklearn.tree._tree import TREE_LEAF

    def assert_same_structure(left, right):
        """Assert two fitted estimators share splits and importances."""
        assert_array_equal(left.tree_.feature, right.tree_.feature)
        assert_almost_equal(left.feature_importances_,
                            right.feature_importances_)
        assert_array_equal(left.tree_.children_left,
                           right.tree_.children_left)
        assert_array_equal(left.tree_.children_right,
                           right.tree_.children_right)
        assert_array_equal(left.tree_.n_node_samples,
                           right.tree_.n_node_samples)

    def leaf_values(estimator):
        """Return the ``value`` entries of the estimator's leaf nodes."""
        fitted_tree = estimator.tree_
        # Leaves are the nodes whose left child is the TREE_LEAF marker.
        return fitted_tree.value[fitted_tree.children_left == TREE_LEAF]

    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # Only the training part of the split is used by this test.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=1)

    var = DecisionTreeClassifier(criterion="variance",
                                 random_state=0).fit(X_train, y_train)
    gini = DecisionTreeClassifier(criterion="gini",
                                  random_state=0).fit(X_train, y_train)
    reg = DecisionTreeRegressor(criterion="mse",
                                random_state=0).fit(X_train, y_train)

    assert_same_structure(var, reg)
    assert_same_structure(var, gini)
    assert_almost_equal(leaf_values(var), leaf_values(gini))

    # Same comparison for a gini classifier with an identity output
    # transformer.
    clf = DecisionTreeClassifier(criterion="gini",
                                 random_state=0,
                                 output_transformer=IdentityProjection(),
                                 ).fit(X_train, y_train)

    assert_same_structure(clf, reg)
    assert_same_structure(clf, gini)
    assert_almost_equal(leaf_values(clf), leaf_values(gini))
def test_sample_weight():
    """Check that sample weights are taken into account when fitting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    # Weight 0 for every sample of class 0, weight 1 for the others.
    sample_weight = np.where(y == 0, 0.0, 1.0)

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    is_class_two = y == 2
    sample_weight = np.ones(200)

    # (weight given to class '2', expected root threshold):
    #   .51 -> samples of class '2' are still weightier
    #   .5  -> samples of class '2' are no longer weightier, so the
    #          threshold should have moved
    for class_two_weight, expected_threshold in ((.51, 149.5), (.5, 49.5)):
        sample_weight[is_class_two] = class_two_weight
        stump = DecisionTreeClassifier(max_depth=1, random_state=0)
        stump.fit(X, y, sample_weight=sample_weight)
        assert_equal(stump.tree_.threshold[0], expected_threshold)

    # Test that sample weighting is the same as having duplicates
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    duplicates = rng.randint(0, n_samples, 200)

    resampled_tree = DecisionTreeClassifier(random_state=1)
    resampled_tree.fit(X[duplicates], y[duplicates])

    # Each sample's weight is the number of times it was drawn.
    draw_counts = np.bincount(duplicates, minlength=n_samples)
    weighted_tree = DecisionTreeClassifier(random_state=1)
    weighted_tree.fit(X, y, sample_weight=draw_counts)

    # Thresholds are compared on the internal (non-leaf) nodes only.
    is_split = resampled_tree.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_almost_equal(resampled_tree.tree_.threshold[is_split],
                              weighted_tree.tree_.threshold[is_split])
def test_big_input():
    """Test if the warning for too large inputs is appropriate.

    Fitting on values of 10**40 is expected to raise a ValueError whose
    message mentions "float32". The test fails if no error is raised at
    all, rather than passing without having checked anything.
    """
    X = np.repeat(10 ** 40., 4).astype(np.float64).reshape(-1, 1)
    clf = DecisionTreeClassifier()
    try:
        clf.fit(X, [0, 1, 0, 1])
    except ValueError as e:
        assert_in("float32", str(e))
    else:
        # Without this branch a silent fit would let the test pass
        # vacuously.
        raise AssertionError("ValueError not raised for too large input")
def test_arrays_persist():
    """Ensure property arrays' memory stays alive when the tree disappears.

    Non-regression test for #2726.
    """
    attribute_names = ('n_classes', 'value', 'children_left',
                       'children_right', 'threshold', 'impurity',
                       'feature', 'n_node_samples')

    for attr in attribute_names:
        # Deliberately a single expression: neither the estimator nor its
        # tree is bound to a name, so only the returned array keeps a
        # reference once this line has run.
        value = getattr(DecisionTreeClassifier().fit([[0]], [0]).tree_, attr)

        # if pointing to freed memory, contents may be arbitrary
        first_entry = value.flat[0]
        assert_true(-2 <= first_entry < 2,
                    'Array points to arbitrary memory')
def test_sample_weight_invalid():
    """Check that invalid sample weights make ``fit`` raise ValueError."""
    X = np.arange(100)[:, np.newaxis]
    y = np.concatenate([np.zeros(50), np.ones(50)])

    clf = DecisionTreeClassifier(random_state=0)

    invalid_weights = (
        np.random.rand(100, 1),  # 2-d array of shape (100, 1)
        np.array(0),             # 0-d array
        np.ones(101),            # one more weight than there are samples
        np.ones(99),             # one fewer weight than there are samples
    )
    for sample_weight in invalid_weights:
        assert_raises(ValueError, clf.fit, X, y,
                      sample_weight=sample_weight)
def test_importances_gini_equal_mse():
    """Check that gini is equivalent to mse for binary output variable"""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth.
    shared_params = dict(max_depth=5, random_state=0)
    clf = DecisionTreeClassifier(criterion="gini", **shared_params)
    clf.fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", **shared_params)
    reg.fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)

    # Both trees must have exactly the same structure.
    for attr in ("feature", "children_left", "children_right",
                 "n_node_samples"):
        assert_array_equal(getattr(clf.tree_, attr),
                           getattr(reg.tree_, attr))
def test_importances_raises():
    """Check if variable importance before fit raises ValueError.

    The attribute is read through ``getattr`` so that ``assert_raises``
    can perform the access itself and verify the exception, instead of
    letting it escape from the test.
    """
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, getattr, clf, "feature_importances_")