def test_permutation_importance_correlated_feature_regression(n_jobs):
    """A feature that nearly duplicates the target must rank highest.

    The Boston features are extended with one extra column equal to the
    target plus a small amount of Gaussian noise. After fitting a random
    forest on the widened matrix, the mean permutation importance of that
    extra column has to exceed the mean importance of every other column.
    """
    repeat_count = 5
    random_gen = np.random.RandomState(42)

    features, target = load_boston(return_X_y=True)

    # Build the leaky column: target + noise, shaped as a single column.
    noise = random_gen.normal(scale=0.001, size=target.shape[0])
    leaky_column = (target + noise).reshape(-1, 1)
    features = np.hstack([features, leaky_column])

    forest = RandomForestRegressor(n_estimators=10, random_state=42)
    forest.fit(features, target)

    outcome = permutation_importance(
        forest, features, target,
        n_repeats=repeat_count, random_state=random_gen, n_jobs=n_jobs,
    )

    # One row per feature, one column per permutation repeat.
    expected_shape = (features.shape[1], repeat_count)
    assert outcome.importances.shape == expected_shape

    # The leaky column was appended last, so it sits at index -1.
    mean_scores = outcome.importances_mean
    assert np.all(mean_scores[-1] > mean_scores[:-1])
def test_forest_degenerate_feature_importances():
    """Importances of a forest of single-node trees are all zero.

    See #13636.
    """
    n_features = 10

    # Constant inputs and a constant target: the original comment states
    # that this builds a forest of single node trees.
    X = np.zeros((10, n_features))
    y = np.ones((10,))

    forest = RandomForestRegressor(n_estimators=10).fit(X, y)

    expected = np.zeros(n_features, dtype=np.float64)
    assert_array_equal(forest.feature_importances_, expected)
def test_base_estimator():
    """Fit AdaBoost with several different base estimators.

    Also checks that an empty discrete ensemble is reported during ``fit``
    rather than later in ``predict``.
    """
    from mrex.ensemble import RandomForestClassifier
    from mrex.ensemble import RandomForestRegressor

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    AdaBoostClassifier(RandomForestClassifier()).fit(X, y_regr)

    AdaBoostClassifier(SVC(), algorithm="SAMME").fit(X, y_class)

    AdaBoostRegressor(RandomForestRegressor(), random_state=0).fit(X, y_regr)

    AdaBoostRegressor(SVR(), random_state=0).fit(X, y_regr)

    # Four identical samples carrying four distinct labels: fit must raise.
    identical_samples = [[1, 1] for _ in range(4)]
    mixed_labels = ["foo", "bar", 1, 2]
    failing_booster = AdaBoostClassifier(SVC(), algorithm="SAMME")
    assert_raises_regexp(
        ValueError, "worse than random",
        failing_booster.fit, identical_samples, mixed_labels,
    )
def test_check_consistent_length():
    """check_consistent_length accepts matching inputs and rejects others."""
    # Argument groups that must be accepted without raising.
    accepted_groups = (
        ([1], [2], [3], [4], [5]),
        ([[1, 2], [[1, 2]]], [1, 2], ['a', 'b']),
        ([1], (2, ), np.array([3]), sp.csr_matrix((1, 2))),
    )
    for group in accepted_groups:
        check_consistent_length(*group)

    # (expected error, message pattern, second argument) checked against a
    # two-sample first argument.
    rejected_cases = (
        (ValueError, 'inconsistent numbers of samples', [1]),
        (TypeError, r"got <\w+ 'int'>", 1),
        (TypeError, r"got <\w+ 'object'>", object()),
    )
    for error_type, pattern, second_arg in rejected_cases:
        assert_raises_regex(error_type, pattern,
                            check_consistent_length, [1, 2], second_arg)

    # A 0-d array is rejected too; no message pattern is checked here.
    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))

    # Despite ensembles having __len__ they must raise TypeError
    unfitted_forest = RandomForestRegressor()
    assert_raises_regex(TypeError, 'Expected sequence or array-like',
                        check_consistent_length, [1, 2], unfitted_forest)
# NOTE(review): the two assertions below appear to be the tail of a test
# whose beginning is not visible here -- confirm their indentation against
# the full file.
assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
assert_array_almost_equal(
    eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
    eclf2.transform(X)
)


@pytest.mark.parametrize(
    "X, y, voter",
    [(X, y, VotingClassifier(
        [('lr', LogisticRegression()),
         ('rf', RandomForestClassifier(n_estimators=5))])),
     (X_r, y_r, VotingRegressor(
         [('lr', LinearRegression()),
          ('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
    # Check that the 'lr' estimator can be set to None or 'drop' while
    # sample weights are still passed to fit.
    # Regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777

    # First fit with every estimator present and uniform sample weights.
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    # Replace the 'lr' estimator with the parametrized value, then refit
    # with the same weights.
    voter.set_params(lr=drop)
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    # Predictions must still have the same shape as the target.
    y_pred = voter.predict(X)
    assert y_pred.shape == y.shape


# NOTE(review): the decorator below is cut off at the end of the visible
# source; the test it decorates is not shown.
@pytest.mark.parametrize(
    "estimator",
def test_calibration():
    """Test calibration objects with isotonic and sigmoid.

    A MultinomialNB classifier is fitted on the first half of a synthetic
    dataset and then wrapped in CalibratedClassifierCV. For dense and sparse
    inputs and for both calibration methods, the test checks that the Brier
    score on the second half improves and how the calibrated probabilities
    react to relabeling of the classes. It finishes with two failure cases:
    an unknown method name and a regressor used as base estimator.
    """
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test: first n_samples rows train, the rest test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes: uncalibrated baseline probabilities for column 1
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # Asking for more folds than there are samples must raise ValueError
    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration, on dense and on CSR sparse inputs
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert (brier_score_loss(y_test, prob_pos_clf) >
                    brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            # (the same pc_clf object is refitted for each relabeling)
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                # With swapped labels the sigmoid probabilities are expected
                # to be the complement of the original ones
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert (brier_score_loss(y_test, prob_pos_clf) >
                        brier_score_loss((y_test + 1) % 2,
                                         prob_pos_pc_clf_relabeled))

        # NOTE(review): the checks below do not use the loop variables;
        # confirm whether they are meant to run once per input format.
        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
print(__doc__)

import matplotlib.pyplot as plt

from mrex import datasets
from mrex.ensemble import GradientBoostingRegressor
from mrex.ensemble import RandomForestRegressor
from mrex.linear_model import LinearRegression
from mrex.ensemble import VotingRegressor

# Loading some example data
X, y = datasets.load_boston(return_X_y=True)

# Training regressors: three individual models plus a VotingRegressor that
# combines them; all four are fitted on the full dataset.
reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression()
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
reg1.fit(X, y)
reg2.fit(X, y)
reg3.fit(X, y)
ereg.fit(X, y)

# Only the first 20 samples are used for the comparison plot.
xt = X[:20]

# One marker style per model so their predictions can be told apart.
plt.figure()
plt.plot(reg1.predict(xt), 'gd', label='GradientBoostingRegressor')
plt.plot(reg2.predict(xt), 'b^', label='RandomForestRegressor')
plt.plot(reg3.predict(xt), 'ys', label='LinearRegression')
plt.plot(ereg.predict(xt), 'r*', label='VotingRegressor')
plt.tick_params(axis='x',
import numpy as np
import matplotlib.pyplot as plt

# To use the experimental IterativeImputer, we need to explicitly ask for it:
from mrex.experimental import enable_iterative_imputer  # noqa
from mrex.datasets import load_diabetes
from mrex.datasets import load_boston
from mrex.ensemble import RandomForestRegressor
from mrex.pipeline import make_pipeline, make_union
from mrex.impute import SimpleImputer, IterativeImputer, MissingIndicator
from mrex.model_selection import cross_val_score

rng = np.random.RandomState(0)

# Number of cross-validation folds and the regressor shared by every
# imputation strategy evaluated through get_scores_for_imputer.
N_SPLITS = 5
REGRESSOR = RandomForestRegressor(random_state=0)


def get_scores_for_imputer(imputer, X_missing, y_missing):
    """Cross-validate REGRESSOR on data preprocessed by ``imputer``.

    The pipeline's first step is the union of ``imputer`` and a
    ``MissingIndicator(missing_values=0)``; its second step is the
    module-level ``REGRESSOR``.

    Parameters
    ----------
    imputer : transformer
        Imputation step placed in the feature union.
    X_missing : array-like
        Feature matrix passed to ``cross_val_score``.
    y_missing : array-like
        Target passed to ``cross_val_score``.

    Returns
    -------
    impute_scores
        The value returned by ``cross_val_score`` with
        ``scoring='neg_mean_squared_error'`` and ``cv=N_SPLITS``.
    """
    estimator = make_pipeline(
        make_union(imputer, MissingIndicator(missing_values=0)),
        REGRESSOR)
    impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                    scoring='neg_mean_squared_error',
                                    cv=N_SPLITS)
    return impute_scores


def get_results(dataset):
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
# Create a random dataset rng = np.random.RandomState(1) X = np.sort(200 * rng.rand(600, 1) - 100, axis=0) y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T y += (0.5 - rng.rand(*y.shape)) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=400, test_size=200, random_state=4) max_depth = 30 regr_multirf = MultiOutputRegressor( RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)) regr_multirf.fit(X_train, y_train) regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=2) regr_rf.fit(X_train, y_train) # Predict on new data y_multirf = regr_multirf.predict(X_test) y_rf = regr_rf.predict(X_test) # Plot the results plt.figure() s = 50