def check_iris(presort, subsample, sample_weight): # Check consistency on dataset iris. clf = GradientBoostingClassifier(n_estimators=100, loss='deviance', random_state=1, subsample=subsample, presort=presort) clf.fit(iris.data, iris.target, sample_weight=sample_weight) score = clf.score(iris.data, iris.target) assert_greater(score, 0.9) leaves = clf.apply(iris.data) assert_equal(leaves.shape, (150, 100, 3))
def check_max_leaf_nodes_max_depth(name): X, y = hastie_X, hastie_y # Test precedence of max_leaf_nodes over max_depth. ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(max_depth=1, max_leaf_nodes=4, n_estimators=1, random_state=0).fit(X, y) assert_greater(est.estimators_[0].tree_.max_depth, 1) est = ForestEstimator(max_depth=1, n_estimators=1, random_state=0).fit(X, y) assert_equal(est.estimators_[0].tree_.max_depth, 1)
def test_max_leaf_nodes_max_depth(): # Test precedence of max_leaf_nodes over max_depth. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier] k = 4 for GBEstimator in all_estimators: est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) tree = est.estimators_[0, 0].tree_ assert_greater(tree.max_depth, 1) est = GBEstimator(max_depth=1).fit(X, y) tree = est.estimators_[0, 0].tree_ assert_equal(tree.max_depth, 1)
def check_boston_criterion(name, criterion): # Check consistency on dataset boston house prices. ForestRegressor = FOREST_REGRESSORS[name] clf = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) clf.fit(boston.data, boston.target) score = clf.score(boston.data, boston.target) assert_greater( score, 0.95, "Failed with max_features=None, criterion %s " "and score = %f" % (criterion, score)) clf = ForestRegressor(n_estimators=5, criterion=criterion, max_features=6, random_state=1) clf.fit(boston.data, boston.target) score = clf.score(boston.data, boston.target) assert_greater( score, 0.95, "Failed with max_features=6, criterion %s " "and score = %f" % (criterion, score))
def test_randomized_svd_power_iteration_normalizer(): # randomized_svd with power_iteration_normalized='none' diverges for # large number of power iterations on this dataset rng = np.random.RandomState(42) X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng) X += 3 * rng.randint(0, 2, size=X.shape) n_components = 50 # Check that it diverges with many (non-normalized) power iterations U, s, V = randomized_svd(X, n_components, n_iter=2, power_iteration_normalizer='none') A = X - U.dot(np.diag(s).dot(V)) error_2 = linalg.norm(A, ord='fro') U, s, V = randomized_svd(X, n_components, n_iter=20, power_iteration_normalizer='none') A = X - U.dot(np.diag(s).dot(V)) error_20 = linalg.norm(A, ord='fro') assert_greater(np.abs(error_2 - error_20), 100) for normalizer in ['LU', 'QR', 'auto']: U, s, V = randomized_svd(X, n_components, n_iter=2, power_iteration_normalizer=normalizer, random_state=0) A = X - U.dot(np.diag(s).dot(V)) error_2 = linalg.norm(A, ord='fro') for i in [5, 10, 50]: U, s, V = randomized_svd(X, n_components, n_iter=i, power_iteration_normalizer=normalizer, random_state=0) A = X - U.dot(np.diag(s).dot(V)) error = linalg.norm(A, ord='fro') assert_greater(15, np.abs(error_2 - error))
def test_randomized_svd_low_rank_with_noise(): # Check that extmath.randomized_svd can handle noisy matrices n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X wity structure approximate rank `rank` and an # important noisy component X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.1, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) for normalizer in ['auto', 'none', 'LU', 'QR']: # compute the singular values of X using the fast approximate # method without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.01) # compute the singular values of X using the fast approximate # method with iterated power method _, sap, _ = randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) # the iterated power method is helping getting rid of the noise: assert_almost_equal(s[:k], sap, decimal=3)
def check_iris_criterion(name, criterion): # Check consistency on dataset iris. ForestClassifier = FOREST_CLASSIFIERS[name] clf = ForestClassifier(n_estimators=10, criterion=criterion, random_state=1) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) assert_greater( score, 0.9, "Failed with criterion %s and score = %f" % (criterion, score)) clf = ForestClassifier(n_estimators=10, criterion=criterion, max_features=2, random_state=1) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) assert_greater( score, 0.5, "Failed with criterion %s and score = %f" % (criterion, score))
def test_randomized_svd_infinite_rank(): # Check that extmath.randomized_svd can handle noisy matrices n_samples = 100 n_features = 500 rank = 5 k = 10 # let us try again without 'low_rank component': just regularly but slowly # decreasing singular values: the rank of the data matrix is infinite X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=1.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) for normalizer in ['auto', 'none', 'LU', 'QR']: # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, power_iteration_normalizer=normalizer) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.1) # compute the singular values of X using the fast approximate method # with iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5, power_iteration_normalizer=normalizer) # the iterated power method is still managing to get most of the # structure at the requested rank assert_almost_equal(s[:k], sap, decimal=3)
def test_zero_estimator_clf(): # Test if ZeroEstimator works for classification. X = iris.data y = np.array(iris.target) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init=ZeroEstimator()) est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) # binary clf mask = y != 0 y[mask] = 1 y[~mask] = 0 est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='foobar') assert_raises(ValueError, est.fit, X, y)
def test_incremental_variance_numerical_stability(): # Test Youngs and Cramer incremental variance formulas. def np_var(A): return A.var(axis=0) # Naive one pass variance computation - not numerically stable # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance def one_pass_var(X): n = X.shape[0] exp_x2 = (X**2).sum(axis=0) / n expx_2 = (X.sum(axis=0) / n)**2 return exp_x2 - expx_2 # Two-pass algorithm, stable. # We use it as a benchmark. It is not an online algorithm # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm def two_pass_var(X): mean = X.mean(axis=0) Y = X.copy() return np.mean((Y - mean)**2, axis=0) # Naive online implementation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm # This works only for chunks for size 1 def naive_mean_variance_update(x, last_mean, last_variance, last_sample_count): updated_sample_count = (last_sample_count + 1) samples_ratio = last_sample_count / float(updated_sample_count) updated_mean = x / updated_sample_count + last_mean * samples_ratio updated_variance = last_variance * samples_ratio + \ (x - last_mean) * (x - updated_mean) / updated_sample_count return updated_mean, updated_variance, updated_sample_count # We want to show a case when one_pass_var has error > 1e-3 while # _batch_mean_variance_update has less. tol = 200 n_features = 2 n_samples = 10000 x1 = np.array(1e8, dtype=np.float64) x2 = np.log(1e-5, dtype=np.float64) A0 = x1 * np.ones((n_samples // 2, n_features), dtype=np.float64) A1 = x2 * np.ones((n_samples // 2, n_features), dtype=np.float64) A = np.vstack((A0, A1)) # Older versions of numpy have different precision # In some old version, np.var is not stable if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6: stable_var = np_var else: stable_var = two_pass_var # Naive one pass var: >tol (=1063) assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol) # Starting point for online algorithms: after A0 # Naive implementation: >tol (436) mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2 for i in range(A1.shape[0]): mean, var, n = \ naive_mean_variance_update(A1[i, :], mean, var, n) assert_equal(n, A.shape[0]) # the mean is also slightly unstable assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6) assert_greater(np.abs(stable_var(A) - var).max(), tol) # Robust implementation: <tol (177) mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2 for i in range(A1.shape[0]): mean, var, n = \ _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])), mean, var, n) assert_equal(n, A.shape[0]) assert_array_almost_equal(A.mean(axis=0), mean) assert_greater(tol, np.abs(stable_var(A) - var).max())