def check_warm_start(name, random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert_equal(len(clf_ws), n_estimators) clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) clf_no_ws.fit(X, y) assert_equal(set([tree.random_state for tree in clf_ws]), set([tree.random_state for tree in clf_no_ws])) assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), err_msg="Failed with {0}".format(name))
def test_staged_predict_proba(): # Test whether staged predict proba eventually gives # the same prediction. X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise NotFittedError if not fitted assert_raises( NotFittedError, lambda X: np.fromiter(clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_equal(clf.predict_proba(X_test), staged_proba)
def test_safe_indexing(): X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] inds = np.array([1, 2]) X_inds = safe_indexing(X, inds) X_arrays = safe_indexing(np.array(X), inds) assert_array_equal(np.array(X_inds), X_arrays) assert_array_equal(np.array(X_inds), np.array(X)[inds])
def test_safe_indexing_mock_pandas(): X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) X_df = MockDataFrame(X) inds = np.array([1, 2]) X_df_indexed = safe_indexing(X_df, inds) X_indexed = safe_indexing(X_df, inds) assert_array_equal(np.array(X_df_indexed), X_indexed)
def test_symbol_labels(): # Test with non-integer class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) symbol_y = tosequence(map(str, y)) clf.fit(X, symbol_y) assert_array_equal(clf.predict(T), tosequence(map(str, true_result))) assert_equal(100, len(clf.estimators_))
def test_vector_sign_flip(): # Testing that sign flip is working & largest value has positive sign data = np.random.RandomState(36).randn(5, 5) max_abs_rows = np.argmax(np.abs(data), axis=1) data_flipped = _deterministic_vector_sign_flip(data) max_rows = np.argmax(data_flipped, axis=1) assert_array_equal(max_abs_rows, max_rows) signs = np.sign(data[range(data.shape[0]), max_abs_rows]) assert_array_equal(data, data_flipped * signs[:, np.newaxis])
def test_dtype_convert(n_classes=15): classifier = RandomForestClassifier(random_state=0, bootstrap=False) X = np.eye(n_classes) y = [ch for ch in 'ABCDEFGHIJKLMNOPQRSTU'[:n_classes]] result = classifier.fit(X, y).predict(X) assert_array_equal(classifier.classes_, y) assert_array_equal(result, y)
def test_non_uniform_weights_toy_edge_case_clf(): X = [[1, 0], [1, 0], [1, 0], [0, 1]] y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] for loss in ('deviance', 'exponential'): gb = GradientBoostingClassifier(n_estimators=5) gb.fit(X, y, sample_weight=sample_weight) assert_array_equal(gb.predict([[1, 0]]), [1])
def test_clone_empty_array(): # Regression test for cloning estimators with empty arrays clf = MyEstimator(empty=np.array([])) clf2 = clone(clf) assert_array_equal(clf.empty, clf2.empty) clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]]))) clf2 = clone(clf) assert_array_equal(clf.empty.data, clf2.empty.data)
def test_float_class_labels(): # Test with float class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) float_y = np.asarray(y, dtype=np.float32) clf.fit(X, float_y) assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32)) assert_equal(100, len(clf.estimators_))
def test_gen_even_slices(): # check that gen_even_slices contains all samples some_range = range(10) joined_range = list( chain(*[some_range[slice] for slice in gen_even_slices(10, 3)])) assert_array_equal(some_range, joined_range) # check that passing negative n_chunks raises an error slices = gen_even_slices(10, -1) assert_raises_regex(ValueError, "gen_even_slices got n_packs=-1, must be" " >=1", next, slices)
def test_degenerate_targets(): # Check if we can fit even though all targets are equal. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) # classifier should raise exception assert_raises(ValueError, clf.fit, X, np.ones(len(X))) clf = GradientBoostingRegressor(n_estimators=100, random_state=1) clf.fit(X, np.ones(len(X))) clf.predict([rng.rand(2)]) assert_array_equal(np.ones((1, ), dtype=np.float64), clf.predict([rng.rand(2)]))
def test_warm_start_oob_switch(): # Test if oob can be turned on during warm start. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: est = Cls(n_estimators=100, max_depth=1, warm_start=True) est.fit(X, y) est.set_params(n_estimators=110, subsample=0.5) est.fit(X, y) assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) # the last 10 are not zeros assert_array_equal(est.oob_improvement_[-10:] == 0.0, np.zeros(10, dtype=np.bool))
def test_shape_y(): # Test with float class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) y_ = np.asarray(y, dtype=np.int32) y_ = y_[:, np.newaxis] # This will raise a DataConversionWarning that we want to # "always" raise, elsewhere the warnings gets ignored in the # later tests, and the tests that check for this warning fail assert_warns(DataConversionWarning, clf.fit, X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def test_mem_layout(): # Test with different memory layouts of X and y X_ = np.asfortranarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) X_ = np.ascontiguousarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.ascontiguousarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.asfortranarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def test_cartesian(): # Check if cartesian product delivers the right results axes = (np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7])) true_out = np.array([[1, 4, 6], [1, 4, 7], [1, 5, 6], [1, 5, 7], [2, 4, 6], [2, 4, 7], [2, 5, 6], [2, 5, 7], [3, 4, 6], [3, 4, 7], [3, 5, 6], [3, 5, 7]]) out = cartesian(axes) assert_array_equal(true_out, out) # check single axis x = np.arange(3) assert_array_equal(x[:, np.newaxis], cartesian((x, )))
def test_random_weights(): # set this up so that each row should have a weighted mode of 6, # with a score that is easily reproduced mode_result = 6 rng = np.random.RandomState(0) x = rng.randint(mode_result, size=(100, 10)) w = rng.random_sample(x.shape) x[:, :5] = mode_result w[:, :5] += 1 mode, score = weighted_mode(x, w, axis=1) assert_array_equal(mode, mode_result) assert_array_almost_equal(score.ravel(), w[:, :5].sum(1))
def test_random_trees_dense_equal(): # Test that the `sparse_output` parameter of RandomTreesEmbedding # works by returning the same array for both argument values. # Create the RTEs hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False, random_state=0) hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True, random_state=0) X, y = datasets.make_circles(factor=0.5) X_transformed_dense = hasher_dense.fit_transform(X) X_transformed_sparse = hasher_sparse.fit_transform(X) # Assert that dense and sparse hashers have same array. assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] clf = ForestClassifier(n_estimators=10, random_state=1) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(10, len(clf)) clf = ForestClassifier(n_estimators=10, max_features=1, random_state=1) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(10, len(clf)) # also test apply leaf_indices = clf.apply(X) assert_equal(leaf_indices.shape, (len(X), clf.n_estimators))
def test_probability_log(): # Predict probabilities. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert_true(np.all(y_proba >= 0.0)) assert_true(np.all(y_proba <= 1.0)) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result)
def test_serialization(): # Check model serialization. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) try: import cPickle as pickle except ImportError: import pickle serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL) clf = None clf = pickle.loads(serialized_clf) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def check_classification_toy(presort, loss): # Check classification on a toy dataset. clf = GradientBoostingClassifier(loss=loss, n_estimators=10, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(10, len(clf.estimators_)) deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:]) assert_true(np.any(deviance_decrease >= 0.0)) leaves = clf.apply(X) assert_equal(leaves.shape, (6, 10, 1))
def test_safe_indexing_pandas(): try: import pandas as pd except ImportError: raise SkipTest("Pandas not found") X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) X_df = pd.DataFrame(X) inds = np.array([1, 2]) X_df_indexed = safe_indexing(X_df, inds) X_indexed = safe_indexing(X_df, inds) assert_array_equal(np.array(X_df_indexed), X_indexed) # fun with read-only data in dataframes # this happens in joblib memmapping X.setflags(write=False) X_df_readonly = pd.DataFrame(X) with warnings.catch_warnings(record=True): X_df_ro_indexed = safe_indexing(X_df_readonly, inds) assert_array_equal(np.array(X_df_ro_indexed), X_indexed)
def test_staged_predict(): # Test whether staged decision function eventually gives # the same prediction. X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0) X_train, y_train = X[:200], y[:200] X_test = X[200:] clf = GradientBoostingRegressor() # test raise ValueError if not fitted assert_raises( ValueError, lambda X: np.fromiter(clf.staged_predict(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # test if prediction for last stage equals ``predict`` for y in clf.staged_predict(X_test): assert_equal(y.shape, y_pred.shape) assert_array_equal(y_pred, y)
def test_probability_exponential(): # Predict probabilities. clf = GradientBoostingClassifier(loss='exponential', n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert_true(np.all(y_proba >= 0.0)) assert_true(np.all(y_proba <= 1.0)) score = clf.decision_function(T).ravel() assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score))) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result)
def test_column_or_1d(): EXAMPLES = [ ("binary", ["spam", "egg", "spam"]), ("binary", [0, 1, 0, 1]), ("continuous", np.arange(10) / 20.), ("multiclass", [1, 2, 3]), ("multiclass", [0, 1, 2, 2, 0]), ("multiclass", [[1], [2], [3]]), ("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]), ("multiclass-multioutput", [[1, 2, 3]]), ("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]), ("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]), ("multiclass-multioutput", [[1, 2, 3]]), ("continuous-multioutput", np.arange(30).reshape((-1, 3))), ] for y_type, y in EXAMPLES: if y_type in ["binary", 'multiclass', "continuous"]: assert_array_equal(column_or_1d(y), np.ravel(y)) else: assert_raises(ValueError, column_or_1d, y)
def check_warm_start_equal_n_estimators(name): # Test if warm start with equal n_estimators does nothing and returns the # same forest and raises a warning. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] clf = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) clf.fit(X, y) clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) clf_2.fit(X, y) # Now clf_2 equals clf. clf_2.set_params(random_state=2) assert_warns(UserWarning, clf_2.fit, X, y) # If we had fit the trees again we would have got a different forest as we # changed the random state. assert_array_equal(clf.apply(X), clf_2.apply(X))
def check_decision_path(name): X, y = hastie_X, hastie_y n_samples = X.shape[0] ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) est.fit(X, y) indicator, n_nodes_ptr = est.decision_path(X) assert_equal(indicator.shape[1], n_nodes_ptr[-1]) assert_equal(indicator.shape[0], n_samples) assert_array_equal(np.diff(n_nodes_ptr), [e.tree_.node_count for e in est.estimators_]) # Assert that leaves index are correct leaves = est.apply(X) for est_id in range(leaves.shape[1]): leave_indicator = [ indicator[i, n_nodes_ptr[est_id] + j] for i, j in enumerate(leaves[:, est_id]) ] assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))
def check_classes_shape(name): # Test that n_classes_ and classes_ have proper shape. ForestClassifier = FOREST_CLASSIFIERS[name] # Classification, single output clf = ForestClassifier(random_state=0).fit(X, y) assert_equal(clf.n_classes_, 2) assert_array_equal(clf.classes_, [-1, 1]) # Classification, multi-output _y = np.vstack((y, np.array(y) * 2)).T clf = ForestClassifier(random_state=0).fit(X, _y) assert_array_equal(clf.n_classes_, [2, 2]) assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
def test_shuffle_dont_convert_to_array(): # Check that shuffle does not try to convert to numpy arrays with float # dtypes can let any indexable datastructure pass-through. a = ['a', 'b', 'c'] b = np.array(['a', 'b', 'c'], dtype=object) c = [1, 2, 3] d = MockDataFrame(np.array([['a', 0], ['b', 1], ['c', 2]], dtype=object)) e = sp.csc_matrix(np.arange(6).reshape(3, 2)) a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0) assert_equal(a_s, ['c', 'b', 'a']) assert_equal(type(a_s), list) assert_array_equal(b_s, ['c', 'b', 'a']) assert_equal(b_s.dtype, object) assert_equal(c_s, [3, 2, 1]) assert_equal(type(c_s), list) assert_array_equal(d_s, np.array([['c', 2], ['b', 1], ['a', 0]], dtype=object)) assert_equal(type(d_s), MockDataFrame) assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))