예제 #1
0
def test_prediction_between_low_up(return_pred: str) -> None:
    """Check that every point prediction sits inside its prediction interval."""
    regressor = MapieRegressor(LinearRegression(), return_pred=return_pred)
    regressor.fit(X_boston, y_boston)
    predictions = regressor.predict(X_boston)
    # Column 0 is compared against column 1 (lower bound) and column 2
    # (upper bound).
    center = predictions[:, 0]
    lower = predictions[:, 1]
    upper = predictions[:, 2]
    above_lower = (center >= lower).all()
    below_upper = (center <= upper).all()
    assert above_lower & below_upper
예제 #2
0
def test_results(method: str) -> None:
    """
    Check that a linear model fitted on a linear curve has null uncertainty.

    The upper and lower bounds returned by MapieRegressor wrapping a
    LinearRegression must coincide to 10 decimal places.
    """
    regressor = MapieRegressor(LinearRegression(), method=method, n_splits=3)
    regressor.fit(X_toy, y_toy)
    predictions = regressor.predict(X_toy)
    lower = predictions[:, 1]
    upper = predictions[:, 2]
    assert_almost_equal(upper, lower, 10)
예제 #3
0
def test_linreg_results(method: str) -> None:
    """Compare interval widths and coverage with reference values for a seeded multivariate linear regression."""
    regressor = MapieRegressor(
        LinearRegression(), method=method, alpha=0.05, random_state=SEED
    )
    regressor.fit(X_reg, y_reg)
    predictions = regressor.predict(X_reg)
    lower = predictions[:, 1]
    upper = predictions[:, 2]
    # Both quantities are compared to their references at 2 decimal places.
    mean_width = (upper - lower).mean()
    assert_almost_equal(mean_width, expected_widths[method], 2)
    coverage = coverage_score(y_reg, lower, upper)
    assert_almost_equal(coverage, expected_coverages[method], 2)
예제 #4
0
def test_optional_input_values() -> None:
    """Check the values MapieRegressor gives to its optional parameters."""
    with_defaults = MapieRegressor(DummyRegressor())
    # Parameters compared by value.
    assert with_defaults.method == "cv_plus"
    assert with_defaults.alpha == 0.1
    assert with_defaults.n_splits == 5
    # Shuffling only needs to be truthy.
    assert with_defaults.shuffle
    assert with_defaults.return_pred == "single"
    # No random state is set unless the caller provides one.
    assert with_defaults.random_state is None
예제 #5
0
def test_invalid_method_in_predict(monkeypatch: Any, method: str) -> None:
    """Check the error message raised by predict for an invalid method."""

    def skip_parameter_check(_: Any) -> None:
        return None

    def leave_one_out_cv(_: Any) -> Any:
        return LeaveOneOut()

    # Replace validation with a no-op and force a leave-one-out splitter,
    # presumably so that fit completes and the invalid method is only
    # detected by predict.
    monkeypatch.setattr(MapieRegressor, "_check_parameters", skip_parameter_check)
    monkeypatch.setattr(MapieRegressor, "_select_cv", leave_one_out_cv)
    regressor = MapieRegressor(DummyRegressor(), method=method)
    regressor.fit(X_boston, y_boston)
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        regressor.predict(X_boston)
예제 #6
0
def test_predinterv_outputshape() -> None:
    """
    Test the shape of the array returned by the predict method.

    The output must contain one row per input observation and exactly
    three columns.
    """
    mapie = MapieRegressor(DummyRegressor())
    mapie.fit(X_reg, y_reg)
    # Predict once and reuse the result rather than recomputing the same
    # predictions for each assertion.
    y_preds = mapie.predict(X_reg)
    assert y_preds.shape[0] == X_reg.shape[0]
    assert y_preds.shape[1] == 3
예제 #7
0
# Non-nested approach with the CV+ method using the Random Forest model:
# the randomized search is fitted on the training set first, and its best
# estimator is then handed to MapieRegressor.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=n_cv,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    verbose=0,
    n_jobs=-1,
    random_state=random_state,
)
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_

mapie_non_nested = MapieRegressor(
    best_est,
    alpha=alpha,
    method='cv_plus',
    n_splits=n_cv,
    return_pred='median',
    random_state=random_state,
)
mapie_non_nested.fit(X_train, y_train)
y_preds_non_nested = mapie_non_nested.predict(X_test)

# Column 0 is scored as the point prediction; columns 1 and 2 are used as
# the lower and upper interval bounds.
lower_non_nested = y_preds_non_nested[:, 1]
upper_non_nested = y_preds_non_nested[:, 2]
widths_non_nested = upper_non_nested - lower_non_nested
coverage_non_nested = coverage_score(
    y_test, lower_non_nested, upper_non_nested
)
# squared=False turns the mean squared error into its root (RMSE).
score_non_nested = mean_squared_error(
    y_test, y_preds_non_nested[:, 0], squared=False
)

# Nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(rf_model,
                            param_distributions=rf_params,
                            n_iter=n_iter,
예제 #8
0
======================================================

An example plot of :class:`mapie.estimators.MapieRegressor` used
in the Quickstart.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from mapie.estimators import MapieRegressor
from mapie.metrics import coverage_score

# Noisy single-feature regression problem fitted with the jackknife+ method.
regressor = LinearRegression()
X, y = make_regression(n_samples=500, n_features=1, noise=20, random_state=59)

mapie = MapieRegressor(regressor, method="jackknife_plus")
mapie.fit(X, y)
y_preds = mapie.predict(X)

# Indices that sort the samples along the single feature; used below so the
# shaded band is drawn with increasing x.
order = np.argsort(X[:, 0])

plt.xlabel('x')
plt.ylabel('y')
plt.scatter(X, y, alpha=0.3)
plt.plot(X, y_preds[:, 0], color='C1')
plt.fill_between(
    X[order].ravel(),
    y_preds[:, 1][order],
    y_preds[:, 2][order],
    alpha=0.3,
)
plt.title(
    f"Target coverage = 0.9; Effective coverage = {coverage_score(y, y_preds[:, 1], y_preds[:, 2])}"
)
예제 #9
0
def test_none_estimator() -> None:
    """Check that fitting with a ``None`` estimator raises a ValueError."""
    regressor = MapieRegressor(None)
    # Only the call to fit is expected to raise, so construction stays
    # outside the context manager.
    with pytest.raises(ValueError, match=r".*Invalid none estimator.*"):
        regressor.fit(X_boston, y_boston)
예제 #10
0
def test_fitted() -> None:
    """Smoke test: fitting on the regression data must not raise."""
    regressor = MapieRegressor(DummyRegressor())
    regressor.fit(X_reg, y_reg)
예제 #11
0
def test_not_fitted() -> None:
    """Check that calling predict before fit raises NotFittedError."""
    unfitted = MapieRegressor(DummyRegressor())
    with pytest.raises(NotFittedError, match=r".*not fitted.*"):
        unfitted.predict(X_reg)
예제 #12
0
    ax.plot(X_test, y_pred, label='Prediction intervals')
    ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3)
    ax.set_title(title)
    ax.legend()


# Homoscedastic data set: 200 training samples and 200 test samples.
X_train, y_train, X_test, y_test, y_test_sigma = get_homoscedastic_data(
    n_samples=200, n_test=200, sigma=0.1
)

# Degree-4 polynomial features followed by a linear regression fitted
# without an intercept.
polyn_model = Pipeline([
    ('poly', PolynomialFeatures(degree=4)),
    ('linear', LinearRegression(fit_intercept=False)),
])

methods = [
    'jackknife',
    'jackknife_plus',
    'jackknife_minmax',
    'cv',
    'cv_plus',
    'cv_minmax',
]

# One panel per method on a 2 x 3 grid.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(
    2, 3, figsize=(3 * 6, 12)
)
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
for i, method in enumerate(methods):
    mapie = MapieRegressor(
        polyn_model,
        method=method,
        alpha=0.05,
        n_splits=10,
        return_pred='ensemble',
    )
    # The 1D inputs are reshaped into single-column 2D arrays for fit/predict.
    mapie.fit(X_train.reshape(-1, 1), y_train)
    y_preds = mapie.predict(X_test.reshape(-1, 1))
    plot_1d_data(
        X_train,
        y_train,
        X_test,
        y_test,
        y_test_sigma,
        y_preds[:, 0],
        y_preds[:, 1],
        y_preds[:, 2],
        axs[i],
        method,
    )
예제 #13
0
def test_invalid_alpha(alpha: int) -> None:
    """Check that fit rejects an invalid alpha with a ValueError."""
    regressor = MapieRegressor(DummyRegressor(), alpha=alpha)
    # The error is expected from fit, not from the constructor.
    with pytest.raises(ValueError, match=r".*Invalid alpha.*"):
        regressor.fit(X_boston, y_boston)
예제 #14
0
def test_invalid_method_in_fit(monkeypatch: Any, method: str) -> None:
    """Check that fit still raises for an invalid method once parameter checks are disabled."""

    def skip_parameter_check(_: Any) -> None:
        return None

    # With _check_parameters replaced by a no-op, the error must come from a
    # later step of fit (the original docstring names select_cv).
    monkeypatch.setattr(MapieRegressor, "_check_parameters", skip_parameter_check)
    regressor = MapieRegressor(DummyRegressor(), method=method)
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        regressor.fit(X_boston, y_boston)
예제 #15
0
def test_invalid_method_in_check_parameters(method: str) -> None:
    """Check that fit raises a ValueError when an invalid method is selected."""
    regressor = MapieRegressor(DummyRegressor(), method=method)
    # No patching here: the unmodified fit must reject the method.
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        regressor.fit(X_boston, y_boston)
예제 #16
0
def test_none_estimator() -> None:
    """Test that a LinearRegression is used when estimator is None."""
    mapie = MapieRegressor(None)
    # Fitting must succeed (no error is expected) and leave a
    # LinearRegression in the estimator attribute.
    mapie.fit(X_boston, y_boston)
    assert isinstance(mapie.estimator, LinearRegression)
예제 #17
0
def test_invalid_ensemble_in_check_parameters(ensemble: Any) -> None:
    """Check that fit raises a ValueError when an invalid ensemble value is given."""
    regressor = MapieRegressor(DummyRegressor(), ensemble=ensemble)
    with pytest.raises(ValueError, match=r".*Invalid ensemble.*"):
        regressor.fit(X_boston, y_boston)
예제 #18
0
def test_prediction_between_low_up(ensemble: bool) -> None:
    """Check that every point prediction sits inside its prediction interval."""
    regressor = MapieRegressor(LinearRegression(), ensemble=ensemble)
    regressor.fit(X_boston, y_boston)
    predictions = regressor.predict(X_boston)
    # Column 0 is compared against column 1 (lower bound) and column 2
    # (upper bound).
    center = predictions[:, 0]
    lower = predictions[:, 1]
    upper = predictions[:, 2]
    above_lower = (center >= lower).all()
    below_upper = (center <= upper).all()
    assert above_lower & below_upper


@pytest.mark.parametrize("method", all_methods)
def test_linreg_results(method: str) -> None:
    """Compare interval widths and coverage with reference values for a seeded multivariate linear regression."""
    regressor = MapieRegressor(
        LinearRegression(), method=method, alpha=0.05, random_state=SEED
    )
    regressor.fit(X_reg, y_reg)
    predictions = regressor.predict(X_reg)
    lower = predictions[:, 1]
    upper = predictions[:, 2]
    # Both quantities are compared to their references at 2 decimal places.
    mean_width = (upper - lower).mean()
    assert_almost_equal(mean_width, expected_widths[method], 2)
    coverage = coverage_score(y_reg, lower, upper)
    assert_almost_equal(coverage, expected_coverages[method], 2)


@parametrize_with_checks([MapieRegressor(LinearRegression())])  # type: ignore
def test_sklearn_compatible_estimator(estimator: Any, check: Any) -> None:
    """Run each sklearn estimator check that is not explicitly excluded."""
    check_name = check.func.__name__
    # Checks listed in SKLEARN_EXCLUDED_CHECKS are skipped silently.
    if check_name in SKLEARN_EXCLUDED_CHECKS:
        return
    check(estimator)
예제 #19
0
def test_single_estimator_attribute(method: str) -> None:
    """Check that fitting exposes ``single_estimator_`` for the given method."""
    regressor = MapieRegressor(DummyRegressor(), method=method)
    regressor.fit(X_reg, y_reg)
    assert hasattr(regressor, 'single_estimator_')
예제 #20
0
def test_quantile_attribute(method: str) -> None:
    """Check that fitting exposes a non-negative ``quantile_`` attribute."""
    regressor = MapieRegressor(DummyRegressor(), method=method)
    regressor.fit(X_reg, y_reg)
    assert hasattr(regressor, 'quantile_')
    is_non_negative = regressor.quantile_ >= 0
    assert is_non_negative
예제 #21
0
def test_initialized() -> None:
    """Smoke test: constructing a MapieRegressor must not raise."""
    base_estimator = DummyRegressor()
    MapieRegressor(base_estimator)
예제 #22
0
def test_jkcv_attribute(method: str) -> None:
    """Check the fitted attributes common to the jackknife and CV methods."""
    regressor = MapieRegressor(DummyRegressor(), method=method)
    regressor.fit(X_reg, y_reg)
    # Both attributes must exist after fitting.
    assert hasattr(regressor, 'estimators_')
    assert hasattr(regressor, 'residuals_')
예제 #23
0
def test_predicted() -> None:
    """Smoke test: predicting after fitting must not raise."""
    regressor = MapieRegressor(DummyRegressor())
    regressor.fit(X_reg, y_reg)
    regressor.predict(X_reg)
예제 #24
0
def test_cv_attributes(method: str) -> None:
    """Check the fitted attributes common to the CV methods."""
    regressor = MapieRegressor(DummyRegressor(), method=method, shuffle=False)
    regressor.fit(X_reg, y_reg)
    assert hasattr(regressor, 'k_')
    # Fitting without shuffling must leave random_state unset.
    assert regressor.random_state is None
예제 #25
0
def PIs_vs_dimensions(
    methods: List[str],
    alpha: float,
    n_trial: int,
    dimensions: List[int]
) -> Dict[str, Dict[int, Dict[str, np.ndarray]]]:
    """
    Estimate prediction intervals on simulated linear data of varying dimension.

    Function adapted from Foygel-Barber et al. (2020).

    For every dimension in ``dimensions`` and for ``n_trial`` independent
    trials, linear data with Gaussian noise are drawn with a signal-to-noise
    ratio of 10. Each method in ``methods`` is then run through MAPIE with a
    LinearRegression base model, and the coverage and mean width of the
    resulting prediction intervals on the test set are recorded.

    The simulation is meant to emphasize the instability of the prediction
    intervals estimated by the Jackknife method when the dataset dimension
    equals the number of training samples (here 100).

    Parameters
    ----------
    methods : List[str]
        List of methods for estimating prediction intervals.
    alpha : float
        1 - (target coverage level).
    n_trial : int
        Number of trials run for each dimension. New coefficients, features
        and noise are drawn for every trial.
    dimensions : List[int]
        List of dimension values of input data.

    Returns
    -------
    Dict[str, Dict[int, Dict[str, np.ndarray]]]
        For each method and each dimension, a dictionary with the keys
        ``"coverage"`` and ``"width_mean"``, each holding an array with one
        entry per trial.
    """
    n_train, n_test = 100, 100
    SNR = 10

    # Pre-allocate one slot per trial for every (method, dimension) pair.
    results: Dict[str, Dict[int, Dict[str, np.ndarray]]] = {}
    for method in methods:
        per_dimension: Dict[int, Dict[str, np.ndarray]] = {}
        for dimension in dimensions:
            per_dimension[dimension] = {
                "coverage": np.empty(n_trial),
                "width_mean": np.empty(n_trial),
            }
        results[method] = per_dimension

    for dimension in dimensions:
        for trial in range(n_trial):
            # The random draws are kept in this exact order so that a seeded
            # run yields the same data set.
            beta = np.random.normal(size=dimension)
            beta_norm = np.sqrt((beta**2).sum())
            # Rescale the coefficients so that their norm is sqrt(SNR).
            beta = beta/beta_norm*np.sqrt(SNR)
            X_train = np.random.normal(size=(n_train, dimension))
            noise_train = np.random.normal(size=n_train)
            noise_test = np.random.normal(size=n_test)
            y_train = X_train.dot(beta) + noise_train
            X_test = np.random.normal(size=(n_test, dimension))
            y_test = X_test.dot(beta) + noise_test

            # All methods are evaluated on the same simulated data set.
            for method in methods:
                mapie = MapieRegressor(
                    LinearRegression(),
                    alpha=alpha,
                    method=method,
                    n_splits=5,
                    shuffle=False,
                    return_pred="ensemble"
                )
                mapie.fit(X_train, y_train)
                y_preds = mapie.predict(X_test)
                lower, upper = y_preds[:, 1], y_preds[:, 2]
                scores = results[method][dimension]
                scores["coverage"][trial] = coverage_score(
                    y_test, lower, upper
                )
                scores["width_mean"][trial] = (upper - lower).mean()
    return results
예제 #26
0
# Non-nested approach with the CV+ method using the Random Forest model:
# the randomized search is fitted on the training set first, and its best
# estimator is then handed to MapieRegressor.
cv_obj = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_params,
    n_iter=n_iter,
    cv=n_cv,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    verbose=0,
    n_jobs=-1,
    random_state=random_state,
)
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_

mapie_non_nested = MapieRegressor(
    best_est,
    alpha=alpha,
    method="cv_plus",
    n_splits=n_cv,
    ensemble=True,
    random_state=random_state,
)
mapie_non_nested.fit(X_train, y_train)
y_preds_non_nested = mapie_non_nested.predict(X_test)

# Column 0 is scored as the point prediction; columns 1 and 2 are used as
# the lower and upper interval bounds.
lower_non_nested = y_preds_non_nested[:, 1]
upper_non_nested = y_preds_non_nested[:, 2]
widths_non_nested = upper_non_nested - lower_non_nested
coverage_non_nested = coverage_score(
    y_test, lower_non_nested, upper_non_nested
)
# squared=False turns the mean squared error into its root (RMSE).
score_non_nested = mean_squared_error(
    y_test, y_preds_non_nested[:, 0], squared=False
)

# Nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(rf_model,
                            param_distributions=rf_params,
                            n_iter=n_iter,