# Imports assumed for these tests; `_GeneralizedLinearRegressor` is the
# private GLM base estimator in scikit-learn.
import warnings

import numpy as np
import pytest
from numpy.testing import assert_allclose

from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Ridge
from sklearn.linear_model._glm import _GeneralizedLinearRegressor
from sklearn.model_selection import train_test_split


# Parametrization assumed for `fit_intercept`; both settings are exercised.
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_warm_start(fit_intercept):
    n_samples, n_features = 110, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        noise=0.5,
        random_state=42,
    )

    glm1 = _GeneralizedLinearRegressor(
        warm_start=False, fit_intercept=fit_intercept, max_iter=1000
    )
    glm1.fit(X, y)

    glm2 = _GeneralizedLinearRegressor(
        warm_start=True, fit_intercept=fit_intercept, max_iter=1
    )
    # As we intentionally set max_iter=1, L-BFGS-B will issue a
    # ConvergenceWarning which we here simply ignore.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        glm2.fit(X, y)
    assert glm1.score(X, y) > glm2.score(X, y)

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4)
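

# Parametrization assumed for the invalid-solver test below; the concrete
# rejected values are illustrative, not taken from the original module.
@pytest.mark.parametrize("solver", ["not a solver", 1, [1]])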
def test_glm_solver_argument(solver):
    """Test GLM for invalid solver argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = _GeneralizedLinearRegressor(solver=solver)
    with pytest.raises(ValueError):
        glm.fit(X, y)
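

# Parametrization assumed: illustrative non-bool values that the
# fit_intercept validation should reject.
@pytest.mark.parametrize("fit_intercept", ["not bool", 1, 0, [True]])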
def test_glm_fit_intercept_argument(fit_intercept):
    """Test GLM for invalid fit_intercept argument."""
    y = np.array([1, 2])
    X = np.array([[1], [1]])
    glm = _GeneralizedLinearRegressor(fit_intercept=fit_intercept)
    with pytest.raises(ValueError, match="fit_intercept must be bool"):
        glm.fit(X, y)
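

# Parametrization assumed: illustrative non-bool values that the
# warm_start validation should reject.
@pytest.mark.parametrize("warm_start", ["not bool", 1, 0, [True]])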
def test_glm_warm_start_argument(warm_start):
    """Test GLM for invalid warm_start argument."""
    y = np.array([1, 2])
    X = np.array([[1], [1]])
    glm = _GeneralizedLinearRegressor(warm_start=warm_start)
    with pytest.raises(ValueError, match="warm_start must be bool"):
        glm.fit(X, y)
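

# Parametrization assumed: the shapes cover both the n_samples > n_features
# branch (svd Ridge solver) and the n_samples < n_features branch (saga), and
# sample_weight toggles the weighted comparison. The concrete values are
# illustrative.
@pytest.mark.parametrize("n_samples, n_features", [(100, 10), (10, 100)])
@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("sample_weight", [None, True])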
def test_normal_ridge_comparison(
    n_samples, n_features, fit_intercept, sample_weight, request
):
    """Compare with Ridge regression for Normal distributions."""
    test_size = 10
    X, y = make_regression(
        n_samples=n_samples + test_size,
        n_features=n_features,
        n_informative=n_features - 2,
        noise=0.5,
        random_state=42,
    )

    if n_samples > n_features:
        ridge_params = {"solver": "svd"}
    else:
        ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7}

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    alpha = 1.0
    if sample_weight is None:
        sw_train = None
        alpha_ridge = alpha * n_samples
    else:
        sw_train = np.random.RandomState(0).rand(len(y_train))
        alpha_ridge = alpha * sw_train.sum()

    # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2
    ridge = Ridge(
        alpha=alpha_ridge,
        normalize=False,
        random_state=42,
        fit_intercept=fit_intercept,
        **ridge_params,
    )
    ridge.fit(X_train, y_train, sample_weight=sw_train)

    glm = _GeneralizedLinearRegressor(
        alpha=alpha,
        fit_intercept=fit_intercept,
        max_iter=300,
        tol=1e-5,
    )
    glm.fit(X_train, y_train, sample_weight=sw_train)
    assert glm.coef_.shape == (X.shape[1],)
    assert_allclose(glm.coef_, ridge.coef_, atol=5e-5)
    assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5)
    assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4)
    assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4)
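

# Parametrization assumed over both intercept settings.
@pytest.mark.parametrize("fit_intercept", [False, True])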
def test_glm_identity_regression(fit_intercept):
    """Test GLM regression with identity link on a simple dataset."""
    coef = [1.0, 2.0]
    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
    y = np.dot(X, coef)
    glm = _GeneralizedLinearRegressor(
        alpha=0,
        fit_intercept=fit_intercept,
        tol=1e-12,
    )
    if fit_intercept:
        glm.fit(X[:, 1:], y)
        assert_allclose(glm.coef_, coef[1:], rtol=1e-10)
        assert_allclose(glm.intercept_, coef[0], rtol=1e-10)
    else:
        glm.fit(X, y)
        assert_allclose(glm.coef_, coef, rtol=1e-12)


def test_sample_weights_validation():
    """Test the raised errors in the validation of sample_weight."""
    # scalar value but not positive
    X = [[1]]
    y = [1]
    weights = 0
    glm = _GeneralizedLinearRegressor()

    # Positive weights are accepted
    glm.fit(X, y, sample_weight=1)

    # 2d array
    weights = [[0]]
    with pytest.raises(ValueError, match="must be 1D array or scalar"):
        glm.fit(X, y, weights)

    # 1d but wrong length
    weights = [1, 0]
    msg = r"sample_weight.shape == \(2,\), expected \(1,\)!"
    with pytest.raises(ValueError, match=msg):
        glm.fit(X, y, weights)
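

# The `regression_data` fixture used by the convergence-warning test is not
# defined in this excerpt; a minimal sketch is given here, assuming a small
# dense regression problem. The dataset sizes are illustrative.
@pytest.fixture(scope="module")
def regression_data():
    X, y = make_regression(
        n_samples=100, n_features=10, n_informative=8, noise=0.5, random_state=2
    )
    return X, y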
def test_convergence_warning(regression_data):
    X, y = regression_data
    est = _GeneralizedLinearRegressor(max_iter=1, tol=1e-20)
    with pytest.warns(ConvergenceWarning):
        est.fit(X, y)