Example #1
def test_linreg_results(method: str) -> None:
    """Test expected PIs for a multivariate linear regression problem with fixed random seed."""
    mapie = MapieRegressor(LinearRegression(),
                           method=method,
                           alpha=0.05,
                           random_state=SEED)
    mapie.fit(X_reg, y_reg)
    y_preds = mapie.predict(X_reg)
    # Columns of y_preds: [point prediction, lower bound, upper bound].
    preds_low, preds_up = y_preds[:, 1], y_preds[:, 2]
    assert_almost_equal((preds_up - preds_low).mean(),
                        expected_widths[method], 2)
    assert_almost_equal(coverage_score(y_reg, preds_low, preds_up),
                        expected_coverages[method], 2)
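The test above relies on module-level fixtures (X_reg, y_reg, SEED, expected_widths, expected_coverages) that are not shown in this listing. A minimal sketch of what they could look like; the dataset parameters and the expected values are illustrative placeholders, not the original constants:

import numpy as np
from sklearn.datasets import make_regression

SEED = 59  # placeholder seed
X_reg, y_reg = make_regression(
    n_samples=500, n_features=10, noise=1.0, random_state=SEED
)
# Placeholder regression-test constants, one entry per method under test;
# the real values would be recorded from a reference run.
expected_widths = {"naive": 3.81, "cv_plus": 3.90}
expected_coverages = {"naive": 0.95, "cv_plus": 0.95}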
Example #2
def PIs_vs_dimensions(
    methods: List[str],
    alpha: float,
    n_trial: int,
    dimensions: List[int]
) -> Dict[str, Dict[int, Dict[str, np.ndarray]]]:
    """
    Compute the prediction intervals for a linear regression problem.
    Function adapted from Foygel-Barber et al. (2020).

    It repeatedly generates linear data with random noise, with a
    signal-to-noise ratio equal to 10, for each of the dimensions given in
    the dimensions list.

    Here we use MAPIE, with a LinearRegression base model, to estimate the
    mean widths and the coverage levels of the prediction intervals produced
    by all the available methods, as a function of the dataset dimension.

    This simulation is carried out to emphasize the instability of the prediction
    intervals estimated by the Jackknife method when the dataset dimension is
    equal to the number of training samples (here 100).

    Parameters
    ----------
    methods : List[str]
        List of methods for estimating prediction intervals.
    alpha : float
        1 - (target coverage level).
    n_trial : int
        Number of trials for each dimension for estimating prediction intervals.
        For each trial, a new random noise is generated.
    dimensions : List[int]
        List of dimension values of input data.

    Returns
    -------
    Dict[str, Dict[int, Dict[str, np.ndarray]]]
        Prediction interval widths and coverages for each method, trial,
        and dimension value.
    """
    n_train = 100
    n_test = 100
    SNR = 10
    results: Dict[str, Dict[int, Dict[str, np.ndarray]]] = {
        method: {
            dimension: {
                "coverage": np.empty(n_trial),
                "width_mean": np.empty(n_trial)
            } for dimension in dimensions
        } for method in methods
    }
    for dimension in dimensions:
        for trial in range(n_trial):
            # Draw a random coefficient vector and rescale it so that the
            # signal-to-noise ratio (||beta||^2, with unit noise variance)
            # equals SNR.
            beta = np.random.normal(size=dimension)
            beta_norm = np.sqrt((beta ** 2).sum())
            beta = beta / beta_norm * np.sqrt(SNR)
            X_train = np.random.normal(size=(n_train, dimension))
            noise_train = np.random.normal(size=n_train)
            noise_test = np.random.normal(size=n_test)
            y_train = X_train.dot(beta) + noise_train
            X_test = np.random.normal(size=(n_test, dimension))
            y_test = X_test.dot(beta) + noise_test

            for method in methods:
                mapie = MapieRegressor(
                    LinearRegression(),
                    alpha=alpha,
                    method=method,
                    n_splits=5,
                    shuffle=False,
                    return_pred="ensemble"
                )
                mapie.fit(X_train, y_train)
                y_preds = mapie.predict(X_test)
                results[method][dimension]["coverage"][trial] = coverage_score(
                    y_test, y_preds[:, 1], y_preds[:, 2]
                )
                results[method][dimension]["width_mean"][trial] = (
                    y_preds[:, 2] - y_preds[:, 1]
                ).mean()
    return results
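A sketch of how this function might be driven; the method names follow the early MAPIE API and, like the trial count and dimensions, are assumptions:

results = PIs_vs_dimensions(
    methods=["naive", "jackknife_plus", "cv_plus"],  # assumed method names
    alpha=0.05,
    n_trial=10,
    dimensions=[50, 100, 150],
)
# With n_train == 100, dimension 100 is where the plain jackknife is
# expected to become unstable.
for method, per_dim in results.items():
    print(method, per_dim[100]["coverage"].mean())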
Example #3
def test_ypredlow_type() -> None:
    "Test that list(y_pred_low) gives right coverage."
    assert coverage_score(y_toy, list(y_toy_preds[:, 1]),
                          y_toy_preds[:, 2]) == 0.8
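The toy-data tests in Examples #3 to #6 assume fixtures defined elsewhere in the test module. A minimal sketch consistent with the expected coverage of 0.8 (four of the five targets fall inside their interval); the exact values are assumptions:

import numpy as np

y_toy = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
# Columns: [point prediction, lower bound, upper bound].
# The last interval misses its target, giving coverage 4/5 = 0.8.
y_toy_preds = np.array([
    [0.0, -1.0, 1.0],
    [1.0, 0.0, 2.0],
    [2.0, 1.0, 3.0],
    [3.0, 2.0, 4.0],
    [4.0, 5.0, 6.0],
])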
Example #4
def test_ypredup_type() -> None:
    "Test that list(y_pred_up) gives right coverage."
    assert coverage_score(y_toy, y_toy_preds[:, 1],
                          list(y_toy_preds[:, 2])) == 0.8
Example #5
def test_ytrue_type() -> None:
    "Test that list(y_true) gives right coverage."
    assert coverage_score(list(y_toy), y_toy_preds[:, 1],
                          y_toy_preds[:, 2]) == 0.8
Example #6
def test_toydata() -> None:
    "Test coverage_score for toy data"
    assert coverage_score(y_toy, y_toy_preds[:, 1], y_toy_preds[:, 2]) == 0.8
Example #7
def test_same_length() -> None:
    "Test when y_true and y_preds have different lengths."
    with pytest.raises(ValueError, match=r".*could not be broadcast*"):
        coverage_score(y_toy, y_toy_preds[:-1, 1], y_toy_preds[:-1, 2])
Example #8
def test_ypredup_shape() -> None:
    "Test shape of y_pred_low."
    with pytest.raises(ValueError, match=r".*y should be a 1d array*"):
        coverage_score(y_toy, y_toy_preds[:, 1], y_toy_preds[:, 1:])
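Both failure modes are what a vectorized implementation would produce. A minimal sketch of a coverage_score-like function, assuming scikit-learn's column_or_1d is used for input validation (the actual implementation may differ):

from sklearn.utils.validation import column_or_1d

def coverage_score_sketch(y_true, y_pred_low, y_pred_up):
    # column_or_1d accepts lists (Examples #3-#5) and raises
    # "y should be a 1d array" on 2-D input (Example #8).
    y_true = column_or_1d(y_true)
    y_pred_low = column_or_1d(y_pred_low)
    y_pred_up = column_or_1d(y_pred_up)
    # Mismatched lengths raise "operands could not be broadcast
    # together" (Example #7).
    return float(((y_pred_low <= y_true) & (y_true <= y_pred_up)).mean())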
Example #9
# Non-nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(rf_model,
                            param_distributions=rf_params,
                            n_iter=n_iter,
                            cv=n_cv,
                            scoring="neg_root_mean_squared_error",
                            return_train_score=True,
                            verbose=0,
                            n_jobs=-1,
                            random_state=random_state)
cv_obj.fit(X_train, y_train)
best_est = cv_obj.best_estimator_
mapie_non_nested = MapieRegressor(best_est,
                                  alpha=alpha,
                                  method='cv_plus',
                                  n_splits=n_cv,
                                  return_pred='median',
                                  random_state=random_state)
mapie_non_nested.fit(X_train, y_train)
y_preds_non_nested = mapie_non_nested.predict(X_test)
widths_non_nested = y_preds_non_nested[:, 2] - y_preds_non_nested[:, 1]
coverage_non_nested = coverage_score(y_test, y_preds_non_nested[:, 1],
                                     y_preds_non_nested[:, 2])
# RMSE of the point predictions (squared=False returns the root of the MSE).
score_non_nested = mean_squared_error(y_test,
                                      y_preds_non_nested[:, 0],
                                      squared=False)

# Nested approach with the CV+ method using the Random Forest model.
cv_obj = RandomizedSearchCV(rf_model,
                            param_distributions=rf_params,
                            n_iter=n_iter,
                            cv=n_cv,
                            scoring="neg_root_mean_squared_error",
                            return_train_score=True,
                            verbose=0,
                            n_jobs=-1,
                            random_state=random_state)
mapie_nested = MapieRegressor(cv_obj,
                              alpha=alpha,
                              method='cv_plus',
                              n_splits=n_cv,
                              return_pred='median',
                              random_state=random_state)
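The original listing is truncated at this point; the nested pipeline presumably continues symmetrically to the non-nested one. A sketch of that continuation, mirroring the earlier code (an assumption, not the original source):

mapie_nested.fit(X_train, y_train)
y_preds_nested = mapie_nested.predict(X_test)
widths_nested = y_preds_nested[:, 2] - y_preds_nested[:, 1]
coverage_nested = coverage_score(y_test, y_preds_nested[:, 1],
                                 y_preds_nested[:, 2])
score_nested = mean_squared_error(y_test, y_preds_nested[:, 0],
                                  squared=False)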