def test_incremental_variance_ddof(): # Test that degrees of freedom parameter for calculations are correct. rng = np.random.RandomState(1999) X = rng.randn(50, 10) n_samples, n_features = X.shape for batch_size in [11, 20, 37]: steps = np.arange(0, X.shape[0], batch_size) if steps[-1] != X.shape[0]: steps = np.hstack([steps, n_samples]) for i, j in zip(steps[:-1], steps[1:]): batch = X[i:j, :] if i == 0: incremental_means = batch.mean(axis=0) incremental_variances = batch.var(axis=0) # Assign this twice so that the test logic is consistent incremental_count = batch.shape[0] sample_count = batch.shape[0] else: result = _incremental_mean_and_var(batch, incremental_means, incremental_variances, sample_count) (incremental_means, incremental_variances, incremental_count) = result sample_count += batch.shape[0] calculated_means = np.mean(X[:j], axis=0) calculated_variances = np.var(X[:j], axis=0) assert_almost_equal(incremental_means, calculated_means, 6) assert_almost_equal(incremental_variances, calculated_variances, 6) assert_equal(incremental_count, sample_count)
def test_verbose_output(): # Check verbose=1 does not cause error. from sklearn.externals.six.moves import cStringIO as StringIO import sys old_stdout = sys.stdout sys.stdout = StringIO() clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=1, subsample=0.8) clf.fit(X, y) verbose_output = sys.stdout sys.stdout = old_stdout # check output verbose_output.seek(0) header = verbose_output.readline().rstrip() # with OOB true_header = ' '.join(['%10s'] + ['%16s'] * 3) % ( 'Iter', 'Train Loss', 'OOB Improve', 'Remaining Time') assert_equal(true_header, header) n_lines = sum(1 for l in verbose_output.readlines()) # one for 1-10 and then 9 for 20-100 assert_equal(10 + 9, n_lines)
def check_boston(presort, loss, subsample): # Check consistency on dataset boston house prices with least squares # and least absolute deviation. ones = np.ones(len(boston.target)) last_y_pred = None for sample_weight in None, ones, 2 * ones: clf = GradientBoostingRegressor(n_estimators=100, loss=loss, max_depth=4, subsample=subsample, min_samples_split=2, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, boston.data) clf.fit(boston.data, boston.target, sample_weight=sample_weight) leaves = clf.apply(boston.data) assert_equal(leaves.shape, (506, 100)) y_pred = clf.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_less(mse, 6.0) if last_y_pred is not None: assert_array_almost_equal(last_y_pred, y_pred) last_y_pred = y_pred
def check_warm_start(name, random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert_equal(len(clf_ws), n_estimators) clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) clf_no_ws.fit(X, y) assert_equal(set([tree.random_state for tree in clf_ws]), set([tree.random_state for tree in clf_no_ws])) assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), err_msg="Failed with {0}".format(name))
def test_warm_start_wo_nestimators_change(): # Test if warm_start does nothing if n_estimators is not changed. # Regression test for #3513. clf = GradientBoostingClassifier(n_estimators=10, warm_start=True) clf.fit([[0, 1], [2, 3]], [0, 1]) assert_equal(clf.estimators_.shape[0], 10) clf.fit([[0, 1], [2, 3]], [0, 1]) assert_equal(clf.estimators_.shape[0], 10)
def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2) S = set(to_tuple(A)) shuffle(A) # shouldn't raise a ValueError for dim = 3 assert_equal(set(to_tuple(A)), S)
def test_repr(): # Smoke test the repr of the base estimator. my_estimator = MyEstimator() repr(my_estimator) test = T(K(), K()) assert_equal(repr(test), "T(a=K(c=None, d=None), b=K(c=None, d=None))") some_est = T(a=["long_params"] * 1000) assert_equal(len(repr(some_est)), 415)
def test_symbol_labels(): # Test with non-integer class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) symbol_y = tosequence(map(str, y)) clf.fit(X, symbol_y) assert_array_equal(clf.predict(T), tosequence(map(str, true_result))) assert_equal(100, len(clf.estimators_))
def test_float_class_labels(): # Test with float class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) float_y = np.asarray(y, dtype=np.float32) clf.fit(X, float_y) assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32)) assert_equal(100, len(clf.estimators_))
def test_oob_multilcass_iris(): # Check OOB improvement on multi-class dataset. clf = GradientBoostingClassifier(n_estimators=100, loss='deviance', random_state=1, subsample=0.5) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) assert_greater(score, 0.9) assert_equal(clf.oob_improvement_.shape[0], clf.n_estimators)
def test_oob_improvement(): # Test if oob improvement has correct shape and regression test. clf = GradientBoostingClassifier(n_estimators=100, random_state=1, subsample=0.5) clf.fit(X, y) assert_equal(clf.oob_improvement_.shape[0], 100) # hard-coded regression test - change if modification in OOB computation assert_array_almost_equal(clf.oob_improvement_[:5], np.array([0.19, 0.15, 0.12, -0.12, -0.11]), decimal=2)
def test_safe_mask(): random_state = check_random_state(0) X = random_state.rand(5, 4) X_csr = sp.csr_matrix(X) mask = [False, False, True, True, True] mask = safe_mask(X, mask) assert_equal(X[mask].shape[0], 3) mask = safe_mask(X_csr, mask) assert_equal(X_csr[mask].shape[0], 3)
def test_random_trees_dense_type(): # Test that the `sparse_output` parameter of RandomTreesEmbedding # works by returning a dense array. # Create the RTE with sparse=False hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False) X, y = datasets.make_circles(factor=0.5) X_transformed = hasher.fit_transform(X) # Assert that type is ndarray, not scipy.sparse.csr.csr_matrix assert_equal(type(X_transformed), np.ndarray)
def test_density(): rng = np.random.RandomState(0) X = rng.randint(10, size=(10, 5)) X[1, 2] = 0 X[5, 3] = 0 X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) X_coo = sparse.coo_matrix(X) X_lil = sparse.lil_matrix(X) for X_ in (X_csr, X_csc, X_coo, X_lil): assert_equal(density(X_), density(X))
def check_pickle(name, X, y): # Check pickability. ForestEstimator = FOREST_ESTIMATORS[name] obj = ForestEstimator(random_state=0) obj.fit(X, y) score = obj.score(X, y) pickle_object = pickle.dumps(obj) obj2 = pickle.loads(pickle_object) assert_equal(type(obj2), obj.__class__) score2 = obj2.score(X, y) assert_equal(score, score2)
def check_parallel(name, X, y): """Check parallel computations in classification""" ForestEstimator = FOREST_ESTIMATORS[name] forest = ForestEstimator(n_estimators=10, n_jobs=3, random_state=0) forest.fit(X, y) assert_equal(len(forest), 10) forest.set_params(n_jobs=1) y1 = forest.predict(X) forest.set_params(n_jobs=2) y2 = forest.predict(X) assert_array_almost_equal(y1, y2, 3)
def test_shape_y(): # Test with float class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) y_ = np.asarray(y, dtype=np.int32) y_ = y_[:, np.newaxis] # This will raise a DataConversionWarning that we want to # "always" raise, elsewhere the warnings gets ignored in the # later tests, and the tests that check for this warning fail assert_warns(DataConversionWarning, clf.fit, X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def test_resample(): # Border case not worth mentioning in doctests assert_true(resample() is None) # Check that invalid arguments yield ValueError assert_raises(ValueError, resample, [0], [0, 1]) assert_raises(ValueError, resample, [0, 1], [0, 1], replace=False, n_samples=3) assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42) # Issue:6581, n_samples can be more when replace is True (default). assert_equal(len(resample([1, 2], n_samples=5)), 5)
def test_warm_start_max_depth(): # Test if possible to fit trees of different depth in ensemble. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: est = Cls(n_estimators=100, max_depth=1, warm_start=True) est.fit(X, y) est.set_params(n_estimators=110, max_depth=2) est.fit(X, y) # last 10 trees have different depth assert_equal(est.estimators_[0, 0].max_depth, 1) for i in range(1, 11): assert_equal(est.estimators_[-i, 0].max_depth, 2)
def check_iris(presort, subsample, sample_weight): # Check consistency on dataset iris. clf = GradientBoostingClassifier(n_estimators=100, loss='deviance', random_state=1, subsample=subsample, presort=presort) clf.fit(iris.data, iris.target, sample_weight=sample_weight) score = clf.score(iris.data, iris.target) assert_greater(score, 0.9) leaves = clf.apply(iris.data) assert_equal(leaves.shape, (150, 100, 3))
def test_complete_regression(): # Test greedy trees with max_depth + 1 leafs. from sklearn.tree._tree import TREE_LEAF k = 4 est = GradientBoostingRegressor(n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1) est.fit(boston.data, boston.target) tree = est.estimators_[-1, 0].tree_ assert_equal(tree.children_left[tree.children_left == TREE_LEAF].shape[0], k + 1)
def test_compute_class_weight_auto_negative(): # Test compute_class_weight when labels are negative # Test with balanced class labels. classes = np.array([-2, -1, 0]) y = np.asarray([-1, -1, 0, 0, -2, -2]) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([1., 1., 1.])) cw = compute_class_weight("balanced", classes, y) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([1., 1., 1.])) # Test with unbalanced class labels. y = np.asarray([-1, 0, 0, -2, -2, -2]) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([0.545, 1.636, 0.818]), decimal=3) cw = compute_class_weight("balanced", classes, y) assert_equal(len(cw), len(classes)) class_counts = np.bincount(y + 2) assert_almost_equal(np.dot(cw, class_counts), y.shape[0]) assert_array_almost_equal(cw, [2. / 3, 2., 1.])
def test_max_leaf_nodes_max_depth(): # Test precedence of max_leaf_nodes over max_depth. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier] k = 4 for GBEstimator in all_estimators: est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) tree = est.estimators_[0, 0].tree_ assert_greater(tree.max_depth, 1) est = GBEstimator(max_depth=1).fit(X, y) tree = est.estimators_[0, 0].tree_ assert_equal(tree.max_depth, 1)
def test_mem_layout(): # Test with different memory layouts of X and y X_ = np.asfortranarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) X_ = np.ascontiguousarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.ascontiguousarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.asfortranarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def test_compute_class_weight_auto_unordered(): # Test compute_class_weight when classes are unordered classes = np.array([1, 0, 3]) y = np.asarray([1, 0, 0, 3, 3, 3]) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([1.636, 0.818, 0.545]), decimal=3) cw = compute_class_weight("balanced", classes, y) class_counts = np.bincount(y)[classes] assert_almost_equal(np.dot(cw, class_counts), y.shape[0]) assert_array_almost_equal(cw, [2., 1., 2. / 3])
def check_max_leaf_nodes_max_depth(name): X, y = hastie_X, hastie_y # Test precedence of max_leaf_nodes over max_depth. ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(max_depth=1, max_leaf_nodes=4, n_estimators=1, random_state=0).fit(X, y) assert_greater(est.estimators_[0].tree_.max_depth, 1) est = ForestEstimator(max_depth=1, n_estimators=1, random_state=0).fit(X, y) assert_equal(est.estimators_[0].tree_.max_depth, 1)
def test_complete_classification(): # Test greedy trees with max_depth + 1 leafs. from sklearn.tree._tree import TREE_LEAF X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) k = 4 est = GradientBoostingClassifier(n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1) est.fit(X, y) tree = est.estimators_[0, 0].tree_ assert_equal(tree.max_depth, k) assert_equal(tree.children_left[tree.children_left == TREE_LEAF].shape[0], k + 1)
def test_clone(): # Tests that clone creates a correct deep copy. # We create an estimator, make a copy of its original state # (which, in this case, is the current state of the estimator), # and check that the obtained copy is a correct deep copy. from uplift.feature_selection import SelectFpr, f_classif selector = SelectFpr(f_classif, alpha=0.1) new_selector = clone(selector) assert_true(selector is not new_selector) assert_equal(selector.get_params(), new_selector.get_params()) selector = SelectFpr(f_classif, alpha=np.zeros((10, 2))) new_selector = clone(selector) assert_true(selector is not new_selector)
def check_classes_shape(name): # Test that n_classes_ and classes_ have proper shape. ForestClassifier = FOREST_CLASSIFIERS[name] # Classification, single output clf = ForestClassifier(random_state=0).fit(X, y) assert_equal(clf.n_classes_, 2) assert_array_equal(clf.classes_, [-1, 1]) # Classification, multi-output _y = np.vstack((y, np.array(y) * 2)).T clf = ForestClassifier(random_state=0).fit(X, _y) assert_array_equal(clf.n_classes_, [2, 2]) assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
def check_classification_toy(presort, loss): # Check classification on a toy dataset. clf = GradientBoostingClassifier(loss=loss, n_estimators=10, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(10, len(clf.estimators_)) deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:]) assert_true(np.any(deviance_decrease >= 0.0)) leaves = clf.apply(X) assert_equal(leaves.shape, (6, 10, 1))