Пример #1
0
def test_invalid_method_in_predict(monkeypatch: Any, method: str) -> None:
    """Test the error raised by predict when an invalid method is selected."""
    # Replace the two MapieRegressor hooks with stubs (presumably so that
    # fit does not reject the invalid method before predict is reached).
    stubs = (
        ("_check_parameters", lambda _: None),
        ("_select_cv", lambda _: LeaveOneOut()),
    )
    for attr_name, stub in stubs:
        monkeypatch.setattr(MapieRegressor, attr_name, stub)

    estimator = MapieRegressor(DummyRegressor(), method=method)
    estimator.fit(X_boston, y_boston)
    with pytest.raises(ValueError, match=r".*Invalid method.*"):
        estimator.predict(X_boston)
Пример #2
0
def test_prediction_between_low_up(return_pred: str) -> None:
    """Check that every prediction lies within its lower and upper bounds."""
    regressor = MapieRegressor(LinearRegression(), return_pred=return_pred)
    regressor.fit(X_boston, y_boston)
    output = regressor.predict(X_boston)

    # Columns of the output: 0 = prediction, 1 = lower bound, 2 = upper bound.
    center = output[:, 0]
    lower = output[:, 1]
    upper = output[:, 2]

    not_below_lower = (center >= lower).all()
    not_above_upper = (center <= upper).all()
    assert not_below_lower & not_above_upper
Пример #3
0
def test_results(method: str) -> None:
    """
    Check that a linear regression fitted on a linear curve gives
    prediction intervals with null width.
    """
    regressor = MapieRegressor(LinearRegression(), method=method, n_splits=3)
    regressor.fit(X_toy, y_toy)
    intervals = regressor.predict(X_toy)
    lower = intervals[:, 1]
    upper = intervals[:, 2]
    # Upper and lower bounds are compared with a precision argument of 10.
    assert_almost_equal(upper, lower, 10)
Пример #4
0
def test_linreg_results(method: str) -> None:
    """Check interval widths and coverage against reference values for a seeded run."""
    regressor = MapieRegressor(
        LinearRegression(),
        method=method,
        alpha=0.05,
        random_state=SEED,
    )
    regressor.fit(X_reg, y_reg)
    intervals = regressor.predict(X_reg)
    lower = intervals[:, 1]
    upper = intervals[:, 2]

    # Mean interval width versus the reference stored for this method.
    mean_width = (upper - lower).mean()
    assert_almost_equal(mean_width, expected_widths[method], 2)

    # Coverage of the targets versus the reference stored for this method.
    coverage = coverage_score(y_reg, lower, upper)
    assert_almost_equal(coverage, expected_coverages[method], 2)
Пример #5
0
def PIs_vs_dimensions(
    methods: List[str],
    alpha: float,
    n_trial: int,
    dimensions: List[int]
) -> Dict[str, Dict[int, Dict[str, np.ndarray]]]:
    """
    Simulate prediction intervals on synthetic linear data of varying dimension.
    Function adapted from Foygel-Barber et al. (2020).

    For every dimension in ``dimensions`` and every trial, a fresh linear
    dataset is drawn: Gaussian features, a random coefficient vector rescaled
    to a norm of sqrt(10) (signal-to-noise of 10), and unit Gaussian noise.
    Each method is then run through MAPIE with a LinearRegression base model,
    and the coverage and mean width of its prediction intervals on the test
    set are recorded.

    The simulation is meant to emphasize the instability of the Jackknife
    prediction intervals when the dataset dimension equals the number of
    training samples (here 100).

    Parameters
    ----------
    methods : List[str]
        Names of the methods used for estimating prediction intervals.
    alpha : float
        1 - (target coverage level).
    n_trial : int
        Number of trials run for each dimension; new data and noise are
        drawn from the global NumPy random state at every trial.
    dimensions : List[int]
        Numbers of input features to simulate.

    Returns
    -------
    Dict[str, Dict[int, Dict[str, np.ndarray]]]
        ``results[method][dimension]`` holds two arrays of length ``n_trial``
        under the keys ``"coverage"`` and ``"width_mean"``.
    """
    n_train = 100
    n_test = 100
    SNR = 10

    # Pre-allocate one coverage array and one width array per
    # (method, dimension) pair; entries are filled in trial by trial.
    results: Dict[str, Dict[int, Dict[str, np.ndarray]]] = {}
    for name in methods:
        per_dimension: Dict[int, Dict[str, np.ndarray]] = {}
        for dim in dimensions:
            per_dimension[dim] = {
                "coverage": np.empty(n_trial),
                "width_mean": np.empty(n_trial),
            }
        results[name] = per_dimension

    for dim in dimensions:
        for trial_idx in range(n_trial):
            # The order of the random draws below is kept fixed so that
            # results stay reproducible under a given global seed.
            coefs = np.random.normal(size=dim)
            coefs_norm = np.sqrt((coefs**2).sum())
            coefs = coefs/coefs_norm*np.sqrt(SNR)
            train_features = np.random.normal(size=(n_train, dim))
            train_noise = np.random.normal(size=n_train)
            test_noise = np.random.normal(size=n_test)
            test_features = np.random.normal(size=(n_test, dim))

            train_target = train_features.dot(coefs) + train_noise
            test_target = test_features.dot(coefs) + test_noise

            # All methods are evaluated on the same simulated dataset.
            for name in methods:
                regressor = MapieRegressor(
                    LinearRegression(),
                    alpha=alpha,
                    method=name,
                    n_splits=5,
                    shuffle=False,
                    return_pred="ensemble"
                )
                regressor.fit(train_features, train_target)
                intervals = regressor.predict(test_features)
                lower = intervals[:, 1]
                upper = intervals[:, 2]

                record = results[name][dim]
                record["coverage"][trial_idx] = coverage_score(
                    test_target, lower, upper
                )
                record["width_mean"][trial_idx] = (upper - lower).mean()
    return results
Пример #6
0
def test_not_fitted() -> None:
    """Check that calling predict before fit raises NotFittedError."""
    unfitted = MapieRegressor(DummyRegressor())
    expected_error = pytest.raises(NotFittedError, match=r".*not fitted.*")
    with expected_error:
        unfitted.predict(X_reg)
Пример #7
0
def test_predicted() -> None:
    """Smoke test: fit followed by predict completes without raising."""
    regressor = MapieRegressor(DummyRegressor())
    regressor.fit(X_reg, y_reg)
    # The return value is deliberately ignored; only the absence of an
    # exception is being checked.
    regressor.predict(X_reg)
Пример #8
0
def test_predinterv_outputshape() -> None:
    """Check that predict returns one row per input sample and three columns."""
    regressor = MapieRegressor(DummyRegressor())
    regressor.fit(X_reg, y_reg)

    n_rows = regressor.predict(X_reg).shape[0]
    assert n_rows == X_reg.shape[0]

    n_cols = regressor.predict(X_reg).shape[1]
    assert n_cols == 3
Пример #9
0
    ax.plot(X_test, y_pred, label='Prediction intervals')
    ax.fill_between(X_test, y_pred_low, y_pred_up, alpha=0.3)
    ax.set_title(title)
    ax.legend()


# Toy dataset returned by get_homoscedastic_data: train and test samples
# plus the noise level of the test targets.
X_train, y_train, X_test, y_test, y_test_sigma = get_homoscedastic_data(
    n_samples=200,
    n_test=200,
    sigma=0.1,
)

# Base model: degree-4 polynomial features followed by a linear regression
# fitted without an intercept.
polyn_model = Pipeline(
    [
        ("poly", PolynomialFeatures(degree=4)),
        ("linear", LinearRegression(fit_intercept=False)),
    ]
)

methods = [
    "jackknife",
    "jackknife_plus",
    "jackknife_minmax",
    "cv",
    "cv_plus",
    "cv_minmax",
]

# One subplot per method, laid out on a 2 x 3 grid.
fig, axes_grid = plt.subplots(2, 3, figsize=(3 * 6, 12))
(ax1, ax2, ax3), (ax4, ax5, ax6) = axes_grid
axs = [ax1, ax2, ax3, ax4, ax5, ax6]

for i, method in enumerate(methods):
    mapie = MapieRegressor(
        polyn_model,
        method=method,
        alpha=0.05,
        n_splits=10,
        return_pred="ensemble",
    )
    # The inputs are reshaped into a single-column 2D array before being
    # passed to fit and predict.
    mapie.fit(X_train.reshape(-1, 1), y_train)
    y_preds = mapie.predict(X_test.reshape(-1, 1))
    plot_1d_data(
        X_train,
        y_train,
        X_test,
        y_test,
        y_test_sigma,
        y_preds[:, 0],
        y_preds[:, 1],
        y_preds[:, 2],
        axs[i],
        method,
    )
Пример #10
0
                            cv=n_cv,
                            scoring="neg_root_mean_squared_error",
                            return_train_score=True,
                            verbose=0,
                            n_jobs=-1,
                            random_state=random_state)
# Run the hyper-parameter search, then hand its best estimator to MAPIE.
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_
mapie_non_nested = MapieRegressor(
    best_est,
    alpha=alpha,
    method="cv_plus",
    n_splits=n_cv,
    return_pred="median",
    random_state=random_state,
)
mapie_non_nested.fit(X_train, y_train)
y_preds_non_nested = mapie_non_nested.predict(X_test)

# Column 0 holds the point prediction, columns 1 and 2 the interval bounds.
low_non_nested = y_preds_non_nested[:, 1]
up_non_nested = y_preds_non_nested[:, 2]
widths_non_nested = up_non_nested - low_non_nested
coverage_non_nested = coverage_score(y_test, low_non_nested, up_non_nested)
# squared=False asks scikit-learn for the root of the mean squared error.
score_non_nested = mean_squared_error(
    y_test,
    y_preds_non_nested[:, 0],
    squared=False,
)

# Nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(rf_model,
                            param_distributions=rf_params,
                            n_iter=n_iter,
                            cv=n_cv,
                            scoring="neg_root_mean_squared_error",
                            return_train_score=True,
                            verbose=0,
Пример #11
0
An example plot of :class:`mapie.estimators.MapieRegressor` used
in the Quickstart.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from mapie.estimators import MapieRegressor
from mapie.metrics import coverage_score

# Synthetic one-feature regression problem with a fixed seed.
regressor = LinearRegression()
X, y = make_regression(n_samples=500, n_features=1, noise=20, random_state=59)

# Fit MAPIE and predict on the training inputs.
mapie = MapieRegressor(regressor, method="jackknife_plus")
mapie.fit(X, y)
y_preds = mapie.predict(X)
y_low = y_preds[:, 1]
y_up = y_preds[:, 2]
coverage = coverage_score(y, y_low, y_up)

plt.xlabel("x")
plt.ylabel("y")
plt.scatter(X, y, alpha=0.3)
plt.plot(X, y_preds[:, 0], color="C1")
# The band is drawn with the samples sorted by feature value.
order = np.argsort(X[:, 0])
plt.fill_between(
    X[order].ravel(),
    y_low[order],
    y_up[order],
    alpha=0.3,
)
plt.title(f"Target coverage = 0.9; Effective coverage = {coverage}")
plt.show()