def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    # Fitting an ensemble whose estimator names are invalid must raise a
    # ValueError with a message describing the problem.
    # Classifier ensembles are given LogisticRegression members, any other
    # ensemble is given LinearRegression members.
    if issubclass(Ensemble, ClassifierMixin):
        member_cls = LogisticRegression
    else:
        member_cls = LinearRegression

    invalid_cases = [
        # the name contains a dunder
        (['lr__'],
         r"Estimator names must not contain __: got \['lr__'\]"),
        # the names are not unique
        (['lr', 'lr'],
         r"Names provided are not unique: \['lr', 'lr'\]"),
        # the name conflicts with a constructor parameter
        (['estimators'],
         "Estimator names conflict with constructor arguments"),
    ]

    for names, err_msg in invalid_cases:
        estimators = [(name, member_cls()) for name in names]
        ensemble = Ensemble(estimators=estimators)
        with pytest.raises(ValueError, match=err_msg):
            ensemble.fit(X, y)
def test_partial_dependence_easy_target(est, power):
    # When the target depends on a single feature in an obvious way
    # (feature ** power), the partial dependence for that feature should
    # reflect it.
    # We fit a linear regression model (on polynomial features of the grid)
    # to the partial dependence curve and require an r_squared above 0.99.
    n_samples = 200
    target_variable = 2

    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, 5))
    y = X[:, target_variable] ** power

    est.fit(X, y)
    averaged_predictions, values = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000)

    grid = values[0].reshape(-1, 1)
    pd_curve = averaged_predictions[0]

    # expand the grid with polynomial terms up to `power`
    grid_poly = PolynomialFeatures(degree=power).fit_transform(grid)
    linear_fit = LinearRegression().fit(grid_poly, pd_curve)

    r2 = r2_score(pd_curve, linear_fit.predict(grid_poly))
    assert r2 > .99
def test_plot_partial_dependence_passing_numpy_axes(pyplot, clf_boston,
                                                    boston):
    # Plot once, then plot a second estimator while passing the axes array
    # of the first display through ``ax``: the second display must expose
    # the same axes and each of them must then hold two lines.
    shared_kwargs = dict(grid_resolution=25,
                         feature_names=boston.feature_names.tolist())

    disp1 = plot_partial_dependence(clf_boston, boston.data, ['CRIM', 'ZN'],
                                    **shared_kwargs)
    assert disp1.axes_.shape == (1, 2)

    left_ax, right_ax = disp1.axes_[0, 0], disp1.axes_[0, 1]
    assert left_ax.get_ylabel() == "Partial dependence"
    assert right_ax.get_ylabel() == ""
    assert len(left_ax.get_lines()) == 1
    assert len(right_ax.get_lines()) == 1

    lr = LinearRegression()
    lr.fit(boston.data, boston.target)

    disp2 = plot_partial_dependence(lr, boston.data, ['CRIM', 'ZN'],
                                    ax=disp1.axes_, **shared_kwargs)

    assert np.all(disp1.axes_ == disp2.axes_)
    for axis in (disp2.axes_[0, 0], disp2.axes_[0, 1]):
        assert len(axis.get_lines()) == 2
def test_omp_reaches_least_squares():
    # Sanity check on small simple data (OMP can stop early): with
    # n_nonzero_coefs equal to the number of features, the OMP coefficients
    # should match those of ordinary least squares.
    n_samples, n_features, n_targets = 10, 8, 3

    rng = check_random_state(0)
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_targets)

    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_features)
    omp.fit(X, Y)

    least_squares = LinearRegression()
    least_squares.fit(X, Y)

    assert_array_almost_equal(omp.coef_, least_squares.coef_)
def test_linear_regression_sparse(random_state=0):
    # LinearRegression should also work when X is a sparse matrix.
    # With X the identity, y equals beta, so coef_ + intercept_ must
    # recover beta and the predictions must reproduce y.
    rng = check_random_state(random_state)
    n = 100

    for _ in range(10):
        X = sparse.eye(n, n)
        beta = rng.rand(n)
        y_flat = (X * beta[:, np.newaxis]).ravel()

        ols = LinearRegression()
        ols.fit(X, y_flat)

        assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)
        assert_array_almost_equal(ols.predict(X) - y_flat, 0)
def test_transform_target_regressor_invertible():
    # np.sqrt / np.log are not inverses of each other: fitting must warn
    # when check_inverse=True and stay silent once it is set to False.
    X, y = friedman
    expected_warning = ("The provided functions or transformer"
                        " are not strictly inverse of each other.")

    checking_regr = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log,
        check_inverse=True)
    assert_warns_message(UserWarning, expected_warning,
                         checking_regr.fit, X, y)

    unchecked_regr = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log)
    unchecked_regr.set_params(check_inverse=False)
    assert_no_warnings(unchecked_regr.fit, X, y)
def test_transform_target_regressor_2d_transformer(X, y):
    # Consistency check with a transformer that only accepts 2D arrays
    # while y may be either 1D or 2D.
    target_is_1d = y.ndim == 1

    scaler = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=scaler)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # forward transform: a 1D target is reshaped to a column, transformed,
    # then squeezed back
    if target_is_1d:
        y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    else:
        y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape

    # inverse transform must give back the original target
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())

    # the inner regressor must match a LinearRegression fitted by hand on
    # the target scaled with a clone of the transformer
    reference_scaler = clone(scaler)
    reference_lr = LinearRegression()
    if target_is_1d:
        scaled_target = reference_scaler.fit_transform(
            y.reshape(-1, 1)).squeeze()
    else:
        scaled_target = reference_scaler.fit_transform(y)
    reference_lr.fit(X, scaled_target)

    reference_pred = reference_lr.predict(X)
    assert_allclose(y_pred,
                    reference_scaler.inverse_transform(reference_pred))
    assert_allclose(regr.regressor_.coef_, reference_lr.coef_)
def test_plot_partial_dependence_fig_deprecated(pyplot):
    # Passing ``fig`` must emit the deprecation FutureWarning, and the
    # given figure must still be the current figure afterwards.
    X, y = make_regression(n_samples=50, random_state=0)
    reg = LinearRegression()
    reg.fit(X, y)

    fig = pyplot.figure()
    expected_msg = ("The fig parameter is deprecated in version 0.22 and "
                    "will be removed in version 0.24")

    with pytest.warns(FutureWarning, match=expected_msg):
        plot_partial_dependence(reg, X, [0, 1], target=0,
                                grid_resolution=25, fig=fig)

    assert pyplot.gcf() is fig
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # The data is scaled up front (instead of inside a pipeline) to avoid
    # convergence warnings while keeping X_test directly comparable with
    # the passthrough columns asserted below.
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_diabetes), y_diabetes, random_state=42)

    reg = StackingRegressor(
        estimators=[('lr', LinearRegression()), ('svr', LinearSVR())],
        final_estimator=final_estimator, cv=cv, passthrough=passthrough)
    reg.fit(X_train, y_train)

    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # with extra predict parameters a pair of results is expected
        assert len(result) == 2

    # both estimators active
    X_trans = reg.transform(X_test)
    assert X_trans.shape[1] == (12 if passthrough else 2)
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    # same checks after dropping the 'lr' estimator
    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    assert X_trans.shape[1] == (11 if passthrough else 1)
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])
def test_transform_target_regressor_functions_multioutput():
    # log/exp target transformation with a two-column target built from
    # the friedman data.
    X = friedman[0]
    base_target = friedman[1]
    y = np.vstack((base_target, base_target ** 2 + 1)).T

    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log, inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)

    # the fitted transformer applies func / inverse_func
    y_tran = regr.transformer_.transform(y)
    assert_allclose(np.log(y), y_tran)
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran))

    # predictions are the inner predictions mapped through inverse_func
    assert y.shape == y_pred.shape
    inner_pred = regr.regressor_.predict(X)
    assert_allclose(y_pred, regr.inverse_func(inner_pred))

    # the inner regressor matches a LinearRegression fitted on func(y)
    reference_lr = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(),
                    reference_lr.coef_.ravel())
def test_transform_target_regressor_error():
    # Invalid configurations of TransformedTargetRegressor must raise at
    # fit time.
    X, y = friedman

    # a transformer and func/inverse_func given at the same time
    both_given = TransformedTargetRegressor(
        regressor=LinearRegression(), transformer=StandardScaler(),
        func=np.exp, inverse_func=np.log)
    conflict_msg = ("'transformer' and functions"
                    " 'func'/'inverse_func' cannot both be set.")
    with pytest.raises(ValueError, match=conflict_msg):
        both_given.fit(X, y)

    # sample_weight passed on to a regressor which does not support it
    weights = np.ones((y.shape[0], ))
    lasso_regr = TransformedTargetRegressor(regressor=Lasso(),
                                            transformer=StandardScaler())
    unexpected_kwarg_msg = (r"fit\(\) got an unexpected "
                            "keyword argument 'sample_weight'")
    with pytest.raises(TypeError, match=unexpected_kwarg_msg):
        lasso_regr.fit(X, y, sample_weight=weights)

    # func is given but inverse_func is not
    func_only = TransformedTargetRegressor(func=np.exp)
    missing_inverse_msg = ("When 'func' is provided, "
                           "'inverse_func' must also be provided")
    with pytest.raises(ValueError, match=missing_inverse_msg):
        func_only.fit(X, y)
def test_subsamples():
    # With n_subsamples equal to the number of samples, Theil-Sen should
    # give exactly the same coefficients as least squares (9 decimals).
    X, y, _, _ = gen_toy_problem_4d()

    n_samples = X.shape[0]
    theil_sen = TheilSenRegressor(n_subsamples=n_samples, random_state=0)
    theil_sen = theil_sen.fit(X, y)
    least_squares = LinearRegression().fit(X, y)

    assert_array_almost_equal(theil_sen.coef_, least_squares.coef_, 9)
def test_transform_target_regressor_functions():
    # log/exp target transformation with the 1D friedman target.
    X, y = friedman
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log, inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)

    # the fitted transformer applies func / inverse_func; the 1D target is
    # passed as a column and squeezed back
    y_column = y.reshape(-1, 1)
    y_tran = regr.transformer_.transform(y_column).squeeze()
    assert_allclose(np.log(y), y_tran)

    y_roundtrip = regr.transformer_.inverse_transform(
        y_tran.reshape(-1, 1)).squeeze()
    assert_allclose(y, y_roundtrip)

    # predictions are the inner predictions mapped through inverse_func
    assert y.shape == y_pred.shape
    inner_pred = regr.regressor_.predict(X)
    assert_allclose(y_pred, regr.inverse_func(inner_pred))

    # the inner regressor matches a LinearRegression fitted on func(y)
    reference_lr = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(),
                    reference_lr.coef_.ravel())
def test_theil_sen_1d():
    # On the 1D toy problem least squares is expected to miss the true
    # coefficient while Theil-Sen recovers coefficient and intercept.
    X, y, w, c = gen_toy_problem_1d()

    # least squares fails
    least_squares = LinearRegression().fit(X, y)
    ls_error = np.abs(least_squares.coef_ - w)
    assert ls_error > 0.9

    # Theil-Sen works (to 1 decimal)
    theil_sen = TheilSenRegressor(random_state=0).fit(X, y)
    assert_array_almost_equal(theil_sen.coef_, w, 1)
    assert_array_almost_equal(theil_sen.intercept_, c, 1)
def test_theil_sen_1d_no_intercept():
    # Same as the 1D case but with fit_intercept=False on both estimators;
    # the reference coefficient is then w + c.
    X, y, w, c = gen_toy_problem_1d(intercept=False)

    # least squares fails
    least_squares = LinearRegression(fit_intercept=False).fit(X, y)
    ls_error = np.abs(least_squares.coef_ - w - c)
    assert ls_error > 0.5

    # Theil-Sen works (to 1 decimal) and reports a zero intercept
    theil_sen = TheilSenRegressor(fit_intercept=False, random_state=0)
    theil_sen = theil_sen.fit(X, y)
    assert_array_almost_equal(theil_sen.coef_, w + c, 1)
    assert_almost_equal(theil_sen.intercept_, 0.)
def test_fit_intercept():
    # Assertions on the shape of coef_: it must not depend on
    # fit_intercept, and its number of dimensions must not depend on the
    # number of features.
    y = np.array([1, 1])
    X2 = np.array([[0.38349978, 0.61650022],
                   [0.58853682, 0.41146318]])
    X3 = np.array([[0.27677969, 0.70693172, 0.01628859],
                   [0.08385139, 0.20692515, 0.70922346]])

    # for each design matrix: (model without intercept, model with intercept)
    fitted = {}
    for key, data in (("2", X2), ("3", X3)):
        fitted[key] = (LinearRegression(fit_intercept=False).fit(data, y),
                       LinearRegression().fit(data, y))

    lr2_without, lr2_with = fitted["2"]
    lr3_without, lr3_with = fitted["3"]

    assert lr2_with.coef_.shape == lr2_without.coef_.shape
    assert lr3_with.coef_.shape == lr3_without.coef_.shape
    assert lr2_without.coef_.ndim == lr3_without.coef_.ndim
def test_ransac_stop_score():
    # With stop_score=0 a single trial is expected.
    # X and y are not defined in this function; they come from the
    # enclosing (module) scope.
    ransac_estimator = RANSACRegressor(LinearRegression(),
                                       min_samples=2,
                                       residual_threshold=5,
                                       stop_score=0,
                                       random_state=0)
    ransac_estimator.fit(X, y)

    assert ransac_estimator.n_trials_ == 1
def test_theil_sen_2d():
    # On the 2D toy problem least squares is expected to miss the true
    # coefficients while Theil-Sen recovers coefficients and intercept.
    X, y, w, c = gen_toy_problem_2d()

    # least squares fails
    least_squares = LinearRegression().fit(X, y)
    ls_error = norm(least_squares.coef_ - w)
    assert ls_error > 1.0

    # Theil-Sen works (to 1 decimal)
    theil_sen = TheilSenRegressor(max_subpopulation=1e3, random_state=0)
    theil_sen = theil_sen.fit(X, y)
    assert_array_almost_equal(theil_sen.coef_, w, 1)
    assert_array_almost_equal(theil_sen.intercept_, c, 1)
def test_regressormixin_score_multioutput():
    # score() must stay silent for a single-output target and emit the
    # FutureWarning about the `multioutput` default for a multi-output one.
    from sklearn_lib.linear_model import LinearRegression

    X = [[1], [2], [3]]

    # no warnings when y_type is continuous
    y_single = [1, 2, 3]
    single_reg = LinearRegression().fit(X, y_single)
    assert_no_warnings(single_reg.score, X, y_single)

    # warn when y_type is continuous-multioutput
    y_multi = [[1, 2], [2, 3], [3, 4]]
    multi_reg = LinearRegression().fit(X, y_multi)
    expected_msg = ("The default value of multioutput (not exposed in "
                    "score method) will change from 'variance_weighted' "
                    "to 'uniform_average' in 0.23 to keep consistent "
                    "with 'metrics.r2_score'. To specify the default "
                    "value manually and avoid the warning, please "
                    "either call 'metrics.r2_score' directly or make a "
                    "custom scorer with 'metrics.make_scorer' (the "
                    "built-in scorer 'r2' uses "
                    "multioutput='uniform_average').")
    assert_warns_message(FutureWarning, expected_msg,
                         multi_reg.score, X, y_multi)
def test_linear_regression_sample_weights():
    # Compare LinearRegression fitted with sample_weight against the
    # closed form of weighted least squares.
    # TODO: loop over sparse data as well
    rng = np.random.RandomState(0)

    # It would not work with under-determined systems
    for n_samples, n_features in ((6, 5), ):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)
        W = np.diag(sample_weight)

        for intercept in (True, False):
            # LinearRegression with explicit sample_weight
            reg = LinearRegression(fit_intercept=intercept)
            reg.fit(X, y, sample_weight=sample_weight)
            fitted_coef = reg.coef_
            fitted_intercept = reg.intercept_

            # sanity checks
            assert reg.coef_.shape == (X.shape[1], )
            assert reg.score(X, y) > 0.5

            # Closed form of the weighted least square
            # theta = (X^T W X)^(-1) * X^T W y
            # When an intercept is fitted, a leading column of ones makes
            # theta[0] the intercept.
            if intercept:
                ones_column = np.ones(shape=(n_samples, 1))
                X_aug = np.concatenate((ones_column, X), axis=1)
            else:
                X_aug = X

            weighted_gram = X_aug.T.dot(W).dot(X_aug)
            weighted_moment = X_aug.T.dot(W).dot(y)
            theta = linalg.solve(weighted_gram, weighted_moment)

            if intercept:
                assert_array_almost_equal(fitted_coef, theta[1:])
                assert_almost_equal(fitted_intercept, theta[0])
            else:
                assert_array_almost_equal(fitted_coef, theta)
def test_classes_property():
    # A pipeline ending in a regressor has no ``classes_`` even once
    # fitted; one ending in a classifier has it only after fitting.
    X, y = iris.data, iris.target

    reg_pipeline = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg_pipeline.fit(X, y)
    assert_raises(AttributeError, getattr, reg_pipeline, "classes_")

    clf_pipeline = make_pipeline(SelectKBest(k=1),
                                 LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf_pipeline, "classes_")

    clf_pipeline.fit(X, y)
    assert_array_equal(clf_pipeline.classes_, np.unique(y))
def test_linear_regression_multiple_outcome(random_state=0):
    # Multiple-outcome linear regression: fitting on two identical target
    # columns must predict the same as fitting on the single target.
    X, y = make_regression(random_state=random_state)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()

    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    multi_pred = reg.predict(X)

    reg.fit(X, y)
    single_pred = reg.predict(X)

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=3)
def test_linear_regression():
    # LinearRegression on tiny datasets with known solutions.
    # Each case: (X, Y, expected coef_, expected intercept_, expected
    # predictions on X).
    cases = [
        # a simple dataset
        ([[1], [2]], [1, 2], [1], [0], [1, 2]),
        # degenerate input
        ([[1]], [0], [0], [0], [0]),
    ]

    for X, Y, expected_coef, expected_intercept, expected_pred in cases:
        reg = LinearRegression()
        reg.fit(X, Y)

        assert_array_almost_equal(reg.coef_, expected_coef)
        assert_array_almost_equal(reg.intercept_, expected_intercept)
        assert_array_almost_equal(reg.predict(X), expected_pred)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    # Multiple-outcome linear regression with X given as a sparse COO
    # matrix: two identical target columns must predict the same as the
    # single target.
    rng = check_random_state(random_state)
    X_dense, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(X_dense)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()

    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    multi_pred = ols.predict(X)

    ols.fit(X, y.ravel())
    single_pred = ols.predict(X)

    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=3)
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
    # LinearRegression must give the same intercept and coefficients on a
    # dense array and on its CSR representation.
    n_samples, n_features = 200, 2

    rng = check_random_state(0)
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.  # introduce zeros before converting to sparse
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)

    params = dict(normalize=normalize, fit_intercept=fit_intercept)
    dense_model = LinearRegression(**params)
    sparse_model = LinearRegression(**params)
    dense_model.fit(X, y)
    sparse_model.fit(Xcsr, y)

    assert dense_model.intercept_ == pytest.approx(sparse_model.intercept_)
    assert_allclose(dense_model.coef_, sparse_model.coef_)
def test_ransac_predict():
    # The target is zero everywhere except its first two entries; the
    # RANSAC predictions are expected to be exactly zero for every sample.
    n_samples = 100
    X = np.arange(n_samples)[:, None]
    y = np.zeros((n_samples, ))
    y[0] = 1
    y[1] = 100

    ransac_estimator = RANSACRegressor(LinearRegression(),
                                       min_samples=2,
                                       residual_threshold=0.5,
                                       random_state=0)
    ransac_estimator.fit(X, y)

    assert_array_equal(ransac_estimator.predict(X), np.zeros(n_samples))
def test_ransac_no_valid_model():
    # When is_model_valid rejects every candidate, fit must raise and all
    # 5 trials must be counted as skipped for an invalid model.
    # X and y are not defined in this function; they come from the
    # enclosing (module) scope.
    def is_model_valid(estimator, X, y):
        return False

    ransac_estimator = RANSACRegressor(LinearRegression(),
                                       is_model_valid=is_model_valid,
                                       max_trials=5)

    expected_msg = "RANSAC could not find a valid consensus set"
    assert_raises_regexp(ValueError, expected_msg,
                         ransac_estimator.fit, X, y)

    assert ransac_estimator.n_skips_no_inliers_ == 0
    assert ransac_estimator.n_skips_invalid_data_ == 0
    assert ransac_estimator.n_skips_invalid_model_ == 5
def test_ransac_is_model_valid():
    # The is_model_valid callback checks it receives two samples
    # (min_samples=2) and always rejects the model, so fit must raise.
    # The X and y passed to fit are not defined in this function; they
    # come from the enclosing (module) scope.
    def is_model_valid(estimator, X, y):
        assert X.shape[0] == 2
        assert y.shape[0] == 2
        return False

    ransac_estimator = RANSACRegressor(LinearRegression(),
                                       min_samples=2,
                                       residual_threshold=5,
                                       is_model_valid=is_model_valid,
                                       random_state=0)

    assert_raises(ValueError, ransac_estimator.fit, X, y)
def test_less_samples_than_features():
    # Theil-Sen behaviour when there are fewer samples (10) than
    # features (20).
    n_samples, n_features = 10, 20
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    y = rng.normal(size=n_samples)

    # With fit_intercept=False Theil-Sen falls back to Least Squares
    ts_no_intercept = TheilSenRegressor(fit_intercept=False,
                                        random_state=0).fit(X, y)
    least_squares = LinearRegression(fit_intercept=False).fit(X, y)
    assert_array_almost_equal(ts_no_intercept.coef_,
                              least_squares.coef_, 12)

    # With fit_intercept=True the solution differs from Least Squares
    # since the intercept is calculated differently; the predictions
    # should still reproduce y.
    ts_with_intercept = TheilSenRegressor(fit_intercept=True,
                                          random_state=0).fit(X, y)
    assert_array_almost_equal(ts_with_intercept.predict(X), y, 12)
def test_ransac_default_residual_threshold():
    # Without an explicit residual_threshold, the inlier mask found by
    # RANSAC must flag exactly the `outliers` indices as outliers.
    # X, y and outliers are not defined in this function; they come from
    # the enclosing (module) scope.
    ransac_estimator = RANSACRegressor(LinearRegression(),
                                       min_samples=2,
                                       random_state=0)

    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, y)

    # Ground truth / reference inlier mask
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_)
    expected_mask = expected_mask.astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)