def test_deprecated():
    # Test whether the deprecated decorator issues appropriate warnings
    # Copied almost verbatim from http://docs.python.org/library/warnings.html

    # First a function...
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")

        @deprecated()
        def ham():
            return "spam"

        spam = ham()

        assert_equal(spam, "spam")  # function must remain usable

        assert_equal(len(w), 1)
        assert_true(issubclass(w[0].category, DeprecationWarning))
        assert_true("deprecated" in str(w[0].message).lower())

    # ... then a class.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")

        @deprecated("don't use this")
        class Ham(object):
            SPAM = 1

        ham = Ham()

        assert_true(hasattr(ham, "SPAM"))

        assert_equal(len(w), 1)
        assert_true(issubclass(w[0].category, DeprecationWarning))
        assert_true("deprecated" in str(w[0].message).lower())

def test_randomized_svd_sign_flip_with_transpose():
    # Check that the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        Returns a bool tuple indicating whether the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert_true(u_based)
    assert_false(v_based)

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose)
    assert_true(u_based)
    assert_false(v_based)
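
# A minimal sketch (not part of the test suite) of the u-based sign
# convention exercised above. This mirrors what flip_sign=True is expected
# to do, not the library's actual implementation: per component, pick the
# sign that makes the largest-magnitude entry of u positive and apply the
# same sign to the matching row of v.
def _flip_signs_u_based(u, v):
    max_abs_rows = np.argmax(np.abs(u), axis=0)
    signs = np.sign(u[max_abs_rows, np.arange(u.shape[1])])
    return u * signs, v * signs[:, np.newaxis]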

def test_uniform_weights():
    # with uniform weights, results should be identical to stats.mode
    rng = np.random.RandomState(0)
    x = rng.randint(10, size=(10, 5))
    weights = np.ones(x.shape)

    for axis in (None, 0, 1):
        mode, score = stats.mode(x, axis)
        mode2, score2 = weighted_mode(x, weights, axis)

        assert_true(np.all(mode == mode2))
        assert_true(np.all(score == score2))
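
# A quick illustration (not part of the test suite) of why non-uniform
# weights matter; the values come from the weighted_mode docstring: with
# unit weights the plain majority wins, while heavier weights can overturn
# it.
def _weighted_mode_example():
    x = [4, 1, 4, 2, 4, 2]
    mode, score = weighted_mode(x, [1, 1, 1, 1, 1, 1])
    assert_equal(int(mode), 4)  # 4 occurs three times
    mode, score = weighted_mode(x, [1, 3, 0.5, 1.5, 1, 2])
    assert_equal(int(mode), 2)  # the 2s carry weight 1.5 + 2 = 3.5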

def test_resample():
    # Border case not worth mentioning in doctests
    assert_true(resample() is None)

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue #6581: n_samples can exceed the input length when replace is
    # True (the default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5)
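
# A minimal usage sketch (not part of the test suite): resample shuffles
# paired arrays consistently, and with the default replace=True the number
# of drawn samples may exceed the input length, as asserted above.
def _resample_example():
    X = [[1.0], [2.0], [3.0]]
    y = [0, 1, 0]
    X_r, y_r = resample(X, y, n_samples=5, random_state=0)
    assert_equal(len(X_r), 5)
    assert_equal(len(y_r), 5)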

def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = assert_warns(DeprecationWarning,
                      compute_class_weight, "auto", classes, y)
    assert_almost_equal(cw.sum(), classes.shape)
    assert_true(cw[0] < cw[1] < cw[2])

    cw = compute_class_weight("balanced", classes, y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_true(cw[0] < cw[1] < cw[2])
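
# A minimal sketch (not part of the test suite) of the "balanced"
# heuristic itself: weights are n_samples / (n_classes * bincount(y)),
# which is exactly why the dot product with the class counts above
# recovers y.shape[0].
def _balanced_weights_by_hand():
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)
    counts = np.bincount(y)[classes]
    expected = y.shape[0] / (len(classes) * counts.astype(np.float64))
    assert_array_almost_equal(
        compute_class_weight("balanced", classes, y), expected)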

def test_make_rng():
    # Check the check_random_state utility function behavior
    assert_true(check_random_state(None) is np.random.mtrand._rand)
    assert_true(check_random_state(np.random) is np.random.mtrand._rand)

    rng_42 = np.random.RandomState(42)
    assert_true(check_random_state(42).randint(100) == rng_42.randint(100))

    rng_42 = np.random.RandomState(42)
    assert_true(check_random_state(rng_42) is rng_42)

    rng_42 = np.random.RandomState(42)
    assert_true(check_random_state(43).randint(100) != rng_42.randint(100))

    assert_raises(ValueError, check_random_state, "some invalid seed")
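
# A minimal sketch (not part of the test suite) of the usual consumer
# pattern: a function normalises whatever seed the caller provides (None,
# an int, or a RandomState) into a RandomState once, then draws from it.
def _check_random_state_pattern():
    def noisy_zeros(n, random_state=None):
        rng = check_random_state(random_state)
        return rng.normal(scale=1e-3, size=n)

    assert_equal(noisy_zeros(5, random_state=42).shape, (5,))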

def test_max_feature_regression():
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                      max_depth=2, learning_rate=.1,
                                      max_features=2, random_state=1)
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    assert_true(deviance < 0.5, "GB failed with deviance %.4f" % deviance)

def check_sample_int(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert_equal(len(s), n_samples)
        unique = np.unique(s)
        assert_equal(np.size(unique), n_samples)
        assert_true(np.all(unique < n_population))

    # test edge case n_population == n_samples == 0
    assert_equal(np.size(sample_without_replacement(0, 0)), 0)
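
# A minimal usage sketch (not part of the test suite), parameterised the
# same way as the check above: drawing 10 indices from a population of 100
# yields 10 distinct values, all below the population size.
def _sample_without_replacement_example(sample_without_replacement):
    s = sample_without_replacement(100, 10)
    assert_equal(np.size(np.unique(s)), 10)
    assert_true(np.all(s < 100))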

def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.
    from uplift.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
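
# A minimal sketch (not part of the test suite) of the clone contract the
# test above relies on: constructor parameters are copied, fitted state is
# not, so the clone is a fresh, unfitted estimator with identical settings.
def _clone_semantics_sketch():
    from uplift.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.05)
    twin = clone(selector)
    assert_true(twin is not selector)
    assert_equal(twin.get_params()['alpha'], 0.05)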

def test_probability_log():
    # Predict probabilities.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert_true(np.all(y_proba >= 0.0))
    assert_true(np.all(y_proba <= 1.0))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)

def check_classification_toy(presort, loss):
    # Check classification on a toy dataset.
    clf = GradientBoostingClassifier(loss=loss, n_estimators=10,
                                     random_state=1, presort=presort)

    assert_raises(ValueError, clf.predict, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf.estimators_))

    deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:])
    assert_true(np.any(deviance_decrease >= 0.0))

    leaves = clf.apply(X)
    assert_equal(leaves.shape, (6, 10, 1))

def test_staged_functions_defensive():
    # test that staged_functions make defensive copies
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(10, 3))
    y = (4 * X[:, 0]).astype(np.int) + 1  # don't predict zeros
    for estimator in [GradientBoostingRegressor(),
                      GradientBoostingClassifier()]:
        estimator.fit(X, y)
        for func in ['predict', 'decision_function', 'predict_proba']:
            staged_func = getattr(estimator, "staged_" + func, None)
            if staged_func is None:
                # regressor has no staged_predict_proba
                continue
            with warnings.catch_warnings(record=True):
                staged_result = list(staged_func(X))
            staged_result[1][:] = 0
            assert_true(np.all(staged_result[0] != 0))

def check_warm_start_oob(name):
    # Test that the warm start computes oob score when asked.
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning.
    clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False,
                          random_state=1, bootstrap=True, oob_score=True)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_2.fit(X, y)

    clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15)
    clf_2.fit(X, y)

    assert_true(hasattr(clf_2, 'oob_score_'))
    assert_equal(clf.oob_score_, clf_2.oob_score_)

    # Test that oob_score is computed even if we don't need to train
    # additional trees.
    clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_3.fit(X, y)
    assert_true(not hasattr(clf_3, 'oob_score_'))

    clf_3.set_params(oob_score=True)
    ignore_warnings(clf_3.fit)(X, y)

    assert_equal(clf.oob_score_, clf_3.oob_score_)
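
# A minimal sketch (not part of the test suite) of the warm-start pattern
# the check above exercises: with warm_start=True, raising n_estimators and
# calling fit again keeps the existing trees and only grows the new ones.
def _warm_start_growth_sketch(name):
    est = FOREST_ESTIMATORS[name](n_estimators=5, warm_start=True,
                                  random_state=1)
    est.fit(hastie_X, hastie_y)
    est.set_params(n_estimators=10)
    est.fit(hastie_X, hastie_y)  # trains 5 additional trees
    assert_equal(len(est.estimators_), 10)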

def test_random_choice_csc(n_samples=10000, random_state=24):
    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]

    got = random_choice_csc(n_samples, classes, class_probabilities,
                            random_state)
    assert_true(sp.issparse(got))

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Implicit class probabilities
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilities = [np.array([0.5, 0.5]), np.array([0, 0.5, 0.5])]

    got = random_choice_csc(n_samples=n_samples, classes=classes,
                            random_state=random_state)
    assert_true(sp.issparse(got))

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Edge case probabilities 1.0 and 0.0
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([1.0, 0.0]), np.array([0.0, 1.0, 0.0])]

    got = random_choice_csc(n_samples, classes, class_probabilities,
                            random_state)
    assert_true(sp.issparse(got))

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel(),
                        minlength=len(class_probabilities[k])) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]

    got = random_choice_csc(n_samples=n_samples, classes=classes,
                            random_state=random_state)
    assert_true(sp.issparse(got))

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

def test_probability_exponential():
    # Predict probabilities.
    clf = GradientBoostingClassifier(loss='exponential',
                                     n_estimators=100, random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert_true(np.all(y_proba >= 0.0))
    assert_true(np.all(y_proba <= 1.0))

    score = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1],
                              1.0 / (1.0 + np.exp(-2 * score)))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)

def check_importances(name, criterion, X, y):
    ForestEstimator = FOREST_ESTIMATORS[name]

    est = ForestEstimator(n_estimators=20, criterion=criterion,
                          random_state=0)
    est.fit(X, y)
    importances = est.feature_importances_
    n_important = np.sum(importances > 0.1)
    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # XXX: Remove this test in 0.19 after transform support to estimators
    # is removed.
    X_new = assert_warns(
        DeprecationWarning, est.transform, X, threshold="mean")
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])

    # Check with parallel
    importances = est.feature_importances_
    est.set_params(n_jobs=2)
    importances_parallel = est.feature_importances_
    assert_array_almost_equal(importances, importances_parallel)

    # Check with sample weights
    sample_weight = check_random_state(0).randint(1, 10, len(X))
    est = ForestEstimator(n_estimators=20, random_state=0,
                          criterion=criterion)
    est.fit(X, y, sample_weight=sample_weight)
    importances = est.feature_importances_
    assert_true(np.all(importances >= 0.0))

    for scale in [0.5, 10, 100]:
        est = ForestEstimator(n_estimators=20, random_state=0,
                              criterion=criterion)
        est.fit(X, y, sample_weight=scale * sample_weight)
        importances_bis = est.feature_importances_
        assert_less(np.abs(importances - importances_bis).mean(), 0.001)

def test_feature_importances():
    X = np.array(boston.data, dtype=np.float32)
    y = np.array(boston.target, dtype=np.float32)

    for presort in True, False:
        clf = GradientBoostingRegressor(n_estimators=100, max_depth=5,
                                        min_samples_split=2, random_state=1,
                                        presort=presort)
        clf.fit(X, y)
        assert_true(hasattr(clf, 'feature_importances_'))

        # XXX: Remove this test in 0.19 after transform support to estimators
        # is removed.
        X_new = assert_warns(
            DeprecationWarning, clf.transform, X, threshold="mean")
        assert_less(X_new.shape[1], X.shape[1])
        feature_mask = (clf.feature_importances_ >
                        clf.feature_importances_.mean())
        assert_array_almost_equal(X_new, X[:, feature_mask])
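
# A minimal sketch (not part of the test suite) of what the deprecated
# transform(threshold="mean") does, matching the feature_mask assertion
# above: keep only the columns whose importance exceeds the mean importance.
def _importance_mask_sketch():
    X = np.array(boston.data, dtype=np.float32)
    y = np.array(boston.target, dtype=np.float32)
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y)
    mask = clf.feature_importances_ > clf.feature_importances_.mean()
    assert_less(X[:, mask].shape[1], X.shape[1])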

def test_get_params():
    test = T(K(), K())

    assert_true('a__d' in test.get_params(deep=True))
    assert_true('a__d' not in test.get_params(deep=False))

    test.set_params(a__d=2)
    assert_true(test.a.d == 2)
    assert_raises(ValueError, test.set_params, a__a=2)
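
# A minimal sketch (not part of the test suite) of the double-underscore
# routing used above: "a__d" addresses parameter d of the sub-estimator
# stored as a, both when reading deep params and when setting them.
def _nested_params_sketch():
    test = T(K(), K())
    test.set_params(a__d=7)
    assert_equal(test.a.d, 7)
    assert_equal(test.get_params(deep=True)['a__d'], 7)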

def test_clone_nan():
    # Regression test for cloning estimators with default parameter as np.nan
    clf = MyEstimator(empty=np.nan)
    clf2 = clone(clf)

    assert_true(clf.empty is clf2.empty)

def test_get_params_deprecated():
    # deprecated attribute should not show up as params
    est = DeprecatedAttributeEstimator(a=1)

    assert_true('a' in est.get_params())
    assert_true('a' in est.get_params(deep=True))
    assert_true('a' in est.get_params(deep=False))

    assert_true('b' not in est.get_params())
    assert_true('b' not in est.get_params(deep=True))
    assert_true('b' not in est.get_params(deep=False))