def test_prediction_between_low_up(return_pred: str) -> None:
    """Test that the prediction lies between the lower and upper prediction intervals."""
    mapie = MapieRegressor(LinearRegression(), return_pred=return_pred)
    mapie.fit(X_boston, y_boston)
    y_preds = mapie.predict(X_boston)
    y_pred, y_low, y_up = y_preds[:, 0], y_preds[:, 1], y_preds[:, 2]
    assert (y_pred >= y_low).all() & (y_pred <= y_up).all()
def test_results(method: str) -> None:
    """
    Test that MapieRegressor applied on a linear regression model
    fitted on a linear curve results in null uncertainty.
    """
    mapie = MapieRegressor(LinearRegression(), method=method, n_splits=3)
    mapie.fit(X_toy, y_toy)
    y_preds = mapie.predict(X_toy)
    y_low, y_up = y_preds[:, 1], y_preds[:, 2]
    assert_almost_equal(y_up, y_low, 10)
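# The X_toy/y_toy fixtures are defined elsewhere in the test module. A
# minimal sketch of such a fixture (an assumption, not the repository's
# actual definition) is a noiseless linear signal: LinearRegression fits
# it exactly, residuals vanish, and the interval bounds coincide.
import numpy as np

X_toy = np.arange(10).reshape(-1, 1)
y_toy = 2 * X_toy.ravel() + 1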
def test_linreg_results(method: str) -> None:
    """
    Test expected prediction intervals for a multivariate linear regression
    problem with a fixed random seed.
    """
    mapie = MapieRegressor(
        LinearRegression(), method=method, alpha=0.05, random_state=SEED
    )
    mapie.fit(X_reg, y_reg)
    y_preds = mapie.predict(X_reg)
    preds_low, preds_up = y_preds[:, 1], y_preds[:, 2]
    assert_almost_equal((preds_up - preds_low).mean(), expected_widths[method], 2)
    assert_almost_equal(
        coverage_score(y_reg, preds_low, preds_up), expected_coverages[method], 2
    )
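# For reference, coverage_score measures the effective coverage: the
# fraction of true targets that fall inside their prediction intervals.
# A minimal reimplementation sketch (mirroring the semantics of
# mapie.metrics.coverage_score as used above, not its actual source):
import numpy as np


def empirical_coverage(y_true, y_low, y_up) -> float:
    """Fraction of targets inside [y_low, y_up]."""
    y_true = np.asarray(y_true)
    return float(((y_true >= y_low) & (y_true <= y_up)).mean())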
def test_optional_input_values() -> None:
    """Test default values of input parameters."""
    mapie = MapieRegressor(DummyRegressor())
    assert mapie.method == "cv_plus"
    assert mapie.alpha == 0.1
    assert mapie.n_splits == 5
    assert mapie.shuffle
    assert mapie.return_pred == "single"
    assert mapie.random_state is None
def test_invalid_method_in_predict(monkeypatch: Any, method: str) -> None:
    """Test error message in predict when an invalid method is selected."""
    monkeypatch.setattr(MapieRegressor, "_check_parameters", lambda _: None)
    monkeypatch.setattr(MapieRegressor, "_select_cv", lambda _: LeaveOneOut())
    mapie = MapieRegressor(DummyRegressor(), method=method)
    mapie.fit(X_boston, y_boston)
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        mapie.predict(X_boston)
def test_predinterv_outputshape() -> None:
    """Test that predict returns one row per input sample and three columns."""
    mapie = MapieRegressor(DummyRegressor())
    mapie.fit(X_reg, y_reg)
    y_preds = mapie.predict(X_reg)
    assert y_preds.shape[0] == X_reg.shape[0]
    assert y_preds.shape[1] == 3
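# Downstream code relies on this (n_samples, 3) layout; for illustration,
# the three columns (point prediction, lower bound, upper bound) can be
# unpacked in one step, reusing the X_reg/y_reg fixtures assumed above:
mapie = MapieRegressor(DummyRegressor())
mapie.fit(X_reg, y_reg)
y_pred, y_low, y_up = mapie.predict(X_reg).T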
# Non-nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=n_cv,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    verbose=0,
    n_jobs=-1,
    random_state=random_state,
)
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_
mapie_non_nested = MapieRegressor(
    best_est,
    alpha=alpha,
    method="cv_plus",
    n_splits=n_cv,
    return_pred="median",
    random_state=random_state,
)
mapie_non_nested.fit(X_train, y_train)
y_preds_non_nested = mapie_non_nested.predict(X_test)
widths_non_nested = y_preds_non_nested[:, 2] - y_preds_non_nested[:, 1]
coverage_non_nested = coverage_score(
    y_test, y_preds_non_nested[:, 1], y_preds_non_nested[:, 2]
)
score_non_nested = mean_squared_error(
    y_test, y_preds_non_nested[:, 0], squared=False
)

# Nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=n_cv,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    verbose=0,
    n_jobs=-1,
    random_state=random_state,
)
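# The snippet stops right after re-creating the search object for the
# nested variant. Following MAPIE's nested cross-validation example, the
# idea is to hand the *unfitted* search object itself to MapieRegressor,
# so hyperparameter tuning is repeated inside every CV+ fold. A minimal
# sketch of that continuation, assuming the same API as the non-nested
# call above (not the snippet's original code):
mapie_nested = MapieRegressor(
    cv_obj,
    alpha=alpha,
    method="cv_plus",
    n_splits=n_cv,
    return_pred="median",
    random_state=random_state,
)
mapie_nested.fit(X_train, y_train)
y_preds_nested = mapie_nested.predict(X_test)
widths_nested = y_preds_nested[:, 2] - y_preds_nested[:, 1]
coverage_nested = coverage_score(
    y_test, y_preds_nested[:, 1], y_preds_nested[:, 2]
)
score_nested = mean_squared_error(y_test, y_preds_nested[:, 0], squared=False)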
======================================================
An example plot of :class:`mapie.estimators.MapieRegressor` used in the
Quickstart.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

from mapie.estimators import MapieRegressor
from mapie.metrics import coverage_score

regressor = LinearRegression()
X, y = make_regression(n_samples=500, n_features=1, noise=20, random_state=59)

mapie = MapieRegressor(regressor, method="jackknife_plus")
mapie.fit(X, y)
y_preds = mapie.predict(X)

plt.xlabel("x")
plt.ylabel("y")
plt.scatter(X, y, alpha=0.3)
plt.plot(X, y_preds[:, 0], color="C1")
order = np.argsort(X[:, 0])
plt.fill_between(
    X[order].ravel(), y_preds[:, 1][order], y_preds[:, 2][order], alpha=0.3
)
plt.title(
    f"Target coverage = 0.9; "
    f"Effective coverage = {coverage_score(y, y_preds[:, 1], y_preds[:, 2])}"
)
plt.show()
def test_none_estimator() -> None:
    """Test error raised when estimator is None."""
    mapie = MapieRegressor(None)
    with pytest.raises(ValueError, match=r".*Invalid none estimator.*"):
        mapie.fit(X_boston, y_boston)
def test_fitted() -> None:
    """Test that fit does not crash."""
    mapie = MapieRegressor(DummyRegressor())
    mapie.fit(X_reg, y_reg)
def test_not_fitted() -> None:
    """Test error message when predict is called before fit."""
    mapie = MapieRegressor(DummyRegressor())
    with pytest.raises(NotFittedError, match=r".*not fitted.*"):
        mapie.predict(X_reg)
    ax.plot(X_test, y_pred, label='Prediction intervals')
    ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3)
    ax.set_title(title)
    ax.legend()


X_train, y_train, X_test, y_test, y_test_sigma = get_homoscedastic_data(
    n_samples=200, n_test=200, sigma=0.1
)

polyn_model = Pipeline([
    ('poly', PolynomialFeatures(degree=4)),
    ('linear', LinearRegression(fit_intercept=False)),
])

methods = [
    'jackknife', 'jackknife_plus', 'jackknife_minmax',
    'cv', 'cv_plus', 'cv_minmax',
]

fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(3 * 6, 12))
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
for i, method in enumerate(methods):
    mapie = MapieRegressor(
        polyn_model,
        method=method,
        alpha=0.05,
        n_splits=10,
        return_pred='ensemble',
    )
    mapie.fit(X_train.reshape(-1, 1), y_train)
    y_preds = mapie.predict(X_test.reshape(-1, 1))
    plot_1d_data(
        X_train, y_train,
        X_test, y_test, y_test_sigma,
        y_preds[:, 0], y_preds[:, 1], y_preds[:, 2],
        axs[i], method,
    )
def test_invalid_alpha(alpha: int) -> None:
    """Test that invalid alphas raise errors."""
    mapie = MapieRegressor(DummyRegressor(), alpha=alpha)
    with pytest.raises(ValueError, match=r".*Invalid alpha.*"):
        mapie.fit(X_boston, y_boston)
def test_invalid_method_in_fit(monkeypatch: Any, method: str) -> None:
    """Test error in _select_cv when an invalid method is selected."""
    monkeypatch.setattr(MapieRegressor, "_check_parameters", lambda _: None)
    mapie = MapieRegressor(DummyRegressor(), method=method)
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        mapie.fit(X_boston, y_boston)
def test_invalid_method_in_check_parameters(method: str) -> None:
    """Test error in _check_parameters when an invalid method is selected."""
    mapie = MapieRegressor(DummyRegressor(), method=method)
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        mapie.fit(X_boston, y_boston)
def test_none_estimator() -> None:
    """Test that a None estimator defaults to LinearRegression."""
    mapie = MapieRegressor(None)
    mapie.fit(X_boston, y_boston)
    assert isinstance(mapie.estimator, LinearRegression)
def test_invalid_ensemble_in_check_parameters(ensemble: Any) -> None:
    """Test error in _check_parameters when an invalid ensemble is selected."""
    mapie = MapieRegressor(DummyRegressor(), ensemble=ensemble)
    with pytest.raises(ValueError, match=r".*Invalid ensemble.*"):
        mapie.fit(X_boston, y_boston)
def test_prediction_between_low_up(ensemble: bool) -> None:
    """Test that the prediction lies between the lower and upper prediction intervals."""
    mapie = MapieRegressor(LinearRegression(), ensemble=ensemble)
    mapie.fit(X_boston, y_boston)
    y_preds = mapie.predict(X_boston)
    y_pred, y_low, y_up = y_preds[:, 0], y_preds[:, 1], y_preds[:, 2]
    assert (y_pred >= y_low).all() & (y_pred <= y_up).all()


@pytest.mark.parametrize("method", all_methods)
def test_linreg_results(method: str) -> None:
    """
    Test expected prediction intervals for a multivariate linear regression
    problem with a fixed random seed.
    """
    mapie = MapieRegressor(
        LinearRegression(), method=method, alpha=0.05, random_state=SEED
    )
    mapie.fit(X_reg, y_reg)
    y_preds = mapie.predict(X_reg)
    preds_low, preds_up = y_preds[:, 1], y_preds[:, 2]
    assert_almost_equal((preds_up - preds_low).mean(), expected_widths[method], 2)
    assert_almost_equal(
        coverage_score(y_reg, preds_low, preds_up), expected_coverages[method], 2
    )


@parametrize_with_checks([MapieRegressor(LinearRegression())])  # type: ignore
def test_sklearn_compatible_estimator(estimator: Any, check: Any) -> None:
    """Check compatibility with sklearn, using the sklearn estimator checks API."""
    if check.func.__name__ not in SKLEARN_EXCLUDED_CHECKS:
        check(estimator)
def test_single_estimator_attribute(method: str) -> None:
    """Test class attributes shared by all PI methods."""
    mapie = MapieRegressor(DummyRegressor(), method=method)
    mapie.fit(X_reg, y_reg)
    assert hasattr(mapie, 'single_estimator_')
def test_quantile_attribute(method: str) -> None:
    """Test quantile attribute."""
    mapie = MapieRegressor(DummyRegressor(), method=method)
    mapie.fit(X_reg, y_reg)
    assert hasattr(mapie, 'quantile_')
    assert mapie.quantile_ >= 0
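# For intuition about the non-negativity assertion: in split/CV conformal
# regression, the fitted quantile is typically a finite-sample-corrected
# (1 - alpha) quantile of *absolute* residuals, hence always >= 0. A sketch
# of that computation (an assumption about this attribute, not MAPIE's
# actual source):
import numpy as np


def conformity_quantile(residuals: np.ndarray, alpha: float) -> float:
    """Finite-sample-corrected (1 - alpha) quantile of absolute residuals."""
    n = len(residuals)
    level = min(1.0, (1 - alpha) * (1 + 1 / n))
    return float(np.quantile(np.abs(residuals), level))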
def test_initialized() -> None:
    """Test that initialization does not crash."""
    MapieRegressor(DummyRegressor())
def test_jkcv_attribute(method: str) -> None:
    """Test class attributes shared by jackknife and CV methods."""
    mapie = MapieRegressor(DummyRegressor(), method=method)
    mapie.fit(X_reg, y_reg)
    assert hasattr(mapie, 'estimators_')
    assert hasattr(mapie, 'residuals_')
def test_predicted() -> None:
    """Test that predict does not crash."""
    mapie = MapieRegressor(DummyRegressor())
    mapie.fit(X_reg, y_reg)
    mapie.predict(X_reg)
def test_cv_attributes(method: str) -> None:
    """Test class attributes shared by CV methods."""
    mapie = MapieRegressor(DummyRegressor(), method=method, shuffle=False)
    mapie.fit(X_reg, y_reg)
    assert hasattr(mapie, 'k_')
    assert mapie.random_state is None
def PIs_vs_dimensions(
    methods: List[str],
    alpha: float,
    n_trial: int,
    dimensions: List[int],
) -> Dict[str, Dict[int, Dict[str, np.ndarray]]]:
    """
    Compute the prediction intervals for a linear regression problem.
    Function adapted from Foygel-Barber et al. (2020).

    It repeatedly generates linear data with random noise, with a
    signal-to-noise ratio of 10, for each dimension given in the
    ``dimensions`` list. Here we use MAPIE, with a LinearRegression base
    model, to estimate the mean interval widths and the coverage levels of
    the prediction intervals estimated by all the available methods as a
    function of the dataset dimension. This simulation is carried out to
    emphasize the instability of the prediction intervals estimated by the
    jackknife method when the dataset dimension is equal to the number of
    training samples (here 100).

    Parameters
    ----------
    methods : List[str]
        List of methods for estimating prediction intervals.
    alpha : float
        1 - (target coverage level).
    n_trial : int
        Number of trials for each dimension for estimating prediction
        intervals. For each trial, a new random noise is generated.
    dimensions : List[int]
        List of dimension values of input data.

    Returns
    -------
    Dict[str, Dict[int, Dict[str, np.ndarray]]]
        Prediction interval widths and coverages for each method, trial,
        and dimension value.
    """
    n_train = 100
    n_test = 100
    SNR = 10
    results: Dict[str, Dict[int, Dict[str, np.ndarray]]] = {
        method: {
            dimension: {
                "coverage": np.empty(n_trial),
                "width_mean": np.empty(n_trial),
            }
            for dimension in dimensions
        }
        for method in methods
    }
    for dimension in dimensions:
        for trial in range(n_trial):
            beta = np.random.normal(size=dimension)
            beta_norm = np.sqrt((beta**2).sum())
            beta = beta / beta_norm * np.sqrt(SNR)
            X_train = np.random.normal(size=(n_train, dimension))
            noise_train = np.random.normal(size=n_train)
            noise_test = np.random.normal(size=n_test)
            y_train = X_train.dot(beta) + noise_train
            X_test = np.random.normal(size=(n_test, dimension))
            y_test = X_test.dot(beta) + noise_test
            for method in methods:
                mapie = MapieRegressor(
                    LinearRegression(),
                    alpha=alpha,
                    method=method,
                    n_splits=5,
                    shuffle=False,
                    return_pred="ensemble",
                )
                mapie.fit(X_train, y_train)
                y_preds = mapie.predict(X_test)
                results[method][dimension]["coverage"][trial] = coverage_score(
                    y_test, y_preds[:, 1], y_preds[:, 2]
                )
                results[method][dimension]["width_mean"][trial] = (
                    y_preds[:, 2] - y_preds[:, 1]
                ).mean()
    return results
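# A usage sketch for the simulation above (the method names follow this
# document's conventions; runtime grows with n_trial and the dimensions):
results = PIs_vs_dimensions(
    methods=["jackknife", "jackknife_plus", "cv_plus"],
    alpha=0.1,
    n_trial=10,
    dimensions=[10, 100, 500],
)
mean_cov = results["jackknife_plus"][100]["coverage"].mean()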