def test_quantile_estimates_calibration(q):
    """Check that the fitted model is calibrated at quantile level ``q``.

    The share of training targets lying strictly below the prediction has
    to match ``q`` up to an absolute tolerance of 1e-2.
    """
    X, y = make_regression(n_samples=1000, n_features=20, random_state=0, noise=1.0)
    regressor = QuantileRegressor(
        quantile=q,
        alpha=0,
        solver_options={"lstsq": False},
    )
    regressor.fit(X, y)
    share_below = np.mean(y < regressor.predict(X))
    assert share_below == approx(q, abs=1e-2)
def test_linprog_failure():
    """Check that fitting warns when the solver is capped at one iteration."""
    X = np.linspace(0, 10, num=10).reshape(-1, 1)
    y = np.linspace(0, 10, num=10)
    # maxiter=1 is what this test relies on to make the linear program
    # stop before it has succeeded.
    solver_options = {"maxiter": 1}
    reg = QuantileRegressor(
        alpha=0, solver="interior-point", solver_options=solver_options
    )

    expected_warning = "Linear programming for QuantileRegressor did not succeed."
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        reg.fit(X, y)
def test_quantile_equals_huber_for_low_epsilon(fit_intercept):
    """Compare the default quantile fit with a Huber fit whose epsilon is ~1.

    Both estimators share the same ``alpha`` and ``fit_intercept``; their
    coefficients (and intercept, when fitted) must agree within 1e-1.
    """
    X, y = make_regression(n_samples=100, n_features=20, random_state=0, noise=1.0)
    penalty = 1e-4

    huber = HuberRegressor(
        epsilon=1 + 1e-4, alpha=penalty, fit_intercept=fit_intercept
    )
    huber.fit(X, y)
    quant = QuantileRegressor(alpha=penalty, fit_intercept=fit_intercept)
    quant.fit(X, y)

    assert_allclose(huber.coef_, quant.coef_, atol=1e-1)
    if fit_intercept:
        assert huber.intercept_ == approx(quant.intercept_, abs=1e-1)
        # check that we still predict fraction
        share_below = np.mean(y < quant.predict(X))
        assert share_below == approx(0.5, abs=1e-1)
def test_sparse_input(sparse_format, solver, fit_intercept):
    """Check that fitting on sparse ``X`` reproduces the dense fit.

    The dense reference is fitted with the default solver, the sparse model
    with the ``solver`` under test.
    """
    X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0)
    X_sparse = sparse_format(X)
    penalty = 1e-4

    quant_dense = QuantileRegressor(alpha=penalty, fit_intercept=fit_intercept)
    quant_dense.fit(X, y)
    quant_sparse = QuantileRegressor(
        alpha=penalty, fit_intercept=fit_intercept, solver=solver
    )
    quant_sparse.fit(X_sparse, y)

    assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2)
    if fit_intercept:
        assert quant_sparse.intercept_ == approx(quant_dense.intercept_)
        # check that we still predict fraction
        share_below = np.mean(y < quant_sparse.predict(X_sparse))
        assert 0.45 <= share_below <= 0.55
def test_incompatible_solver_for_sparse_input(X_y_data, solver):
    """Check that fitting sparse ``X`` with ``solver`` raises a ValueError."""
    X, y = X_y_data
    X_sparse = sparse.csc_matrix(X)
    err_msg = (
        f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
    )
    with pytest.raises(ValueError, match=err_msg):
        regressor = QuantileRegressor(solver=solver)
        regressor.fit(X_sparse, y)
def get_calibration(train_approximation_distances, train_target_distances,
                    val_approximation_distances, val_target_distances,
                    quantile):
    """Fit a quantile regression on training data and measure its coverage.

    A ``QuantileRegressor`` (``alpha=0``, ``solver='highs'``) is fitted to
    map the training approximation distances onto the training target
    distances. It then predicts target distances for the validation
    approximation distances.

    Parameters
    ----------
    train_approximation_distances, val_approximation_distances
        Array-likes exposing ``reshape``; each is reshaped to a single
        feature column before being passed to the regressor.
    train_target_distances
        Regression targets for the training fit.
    val_target_distances
        Sized iterable of validation targets compared against predictions.
    quantile
        Quantile level forwarded to ``QuantileRegressor``.

    Returns
    -------
    tuple
        ``(fraction, predicted_target_distances)`` where ``fraction`` is the
        number of validation targets that are ``<=`` their prediction divided
        by ``len(val_target_distances)``, and ``predicted_target_distances``
        is the flattened array of validation predictions.
    """
    qr = QuantileRegressor(quantile=quantile, alpha=0, solver='highs')
    qr.fit(train_approximation_distances.reshape(-1, 1), train_target_distances)

    # Flatten with reshape(-1) rather than squeeze(): squeeze() collapses a
    # single prediction into a 0-d array, which cannot be iterated below.
    predicted_target_distances = qr.predict(
        val_approximation_distances.reshape(-1, 1)).reshape(-1)

    num_leq = sum(
        1
        for predicted_target, val_target in zip(predicted_target_distances,
                                                val_target_distances)
        if val_target <= predicted_target
    )
    return num_leq / len(val_target_distances), predicted_target_distances
def test_quantile_toy_example(quantile, alpha, intercept, coef):
    """Check intercept and slope on a three-point example.

    ``coef`` may be ``None``, in which case the exact slope is not checked.
    """
    # test how different parameters affect a small intuitive example
    X = [[0], [1], [1]]
    y = [1, 2, 11]
    model = QuantileRegressor(quantile=quantile, alpha=alpha)
    model.fit(X, y)

    assert_allclose(model.intercept_, intercept, atol=1e-2)
    slope = model.coef_[0]
    if coef is not None:
        assert_allclose(slope, coef, atol=1e-2)
    if alpha < 100:
        assert slope >= 1
    assert slope <= 10
def test_error_interior_point_future(X_y_data, monkeypatch):
    """Check the error raised for `solver='interior-point'` with SciPy >= 1.11.

    The SciPy version seen by the quantile module is patched to 1.11.0 for
    the duration of the check.
    """
    X, y = X_y_data
    import sklearn.linear_model._quantile

    quantile_module = sklearn.linear_model._quantile
    with monkeypatch.context() as patcher:
        patcher.setattr(quantile_module, "sp_version", parse_version("1.11.0"))
        err_msg = "Solver interior-point is not anymore available in SciPy >= 1.11.0."
        with pytest.raises(ValueError, match=err_msg):
            regressor = QuantileRegressor(solver="interior-point")
            regressor.fit(X, y)
def test_asymmetric_error(quantile):
    """Check quantile regression on exponentially distributed targets.

    The targets are drawn so that their ``quantile``-level quantile equals
    ``X @ coef + intercept``. The unpenalized fit is compared with those true
    parameters, and the L1-penalized fit with a Nelder-Mead minimization of
    the same objective.
    """
    n_samples = 1000
    rng = np.random.RandomState(42)
    # The two columns are drawn in this order; swapping the draws would
    # change the random stream and therefore the data.
    positive_column = np.abs(rng.randn(n_samples)[:, None])
    negated_binary_column = -rng.randint(2, size=(n_samples, 1))
    X = np.concatenate((positive_column, negated_binary_column), axis=1)
    intercept = 1.23
    coef = np.array([0.5, -2])
    # Take care that X @ coef + intercept > 0
    true_quantile = X @ coef + intercept
    assert np.min(true_quantile) > 0
    # For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
    # the quantile at level q is:
    #   quantile(q) = - log(1 - q) / lambda
    #   scale = 1/lambda = -quantile(q) / log(1 - q)
    y = rng.exponential(
        scale=-true_quantile / np.log(1 - quantile), size=n_samples
    )

    # This test can be made to pass with any solver but in the interest
    # of sparing continuous integration resources, the test is performed
    # with the fastest solver only.
    model = QuantileRegressor(
        quantile=quantile,
        alpha=0,
        solver="highs",
    )
    model.fit(X, y)

    assert model.intercept_ == approx(intercept, rel=0.2)
    assert_allclose(model.coef_, coef, rtol=0.6)
    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)

    # Now compare to Nelder-Mead optimization with L1 penalty
    alpha = 0.01
    model.set_params(alpha=alpha)
    model.fit(X, y)
    model_coef = np.r_[model.intercept_, model.coef_]

    def penalized_loss(params):
        # params[0] is the intercept, params[1:] are the feature weights;
        # only the feature weights enter the L1 term.
        pinball = mean_pinball_loss(y, X @ params[1:] + params[0], alpha=quantile)
        l1_norm = np.sum(np.abs(params[1:]))
        return pinball + alpha * l1_norm

    res = minimize(
        fun=penalized_loss,
        x0=[1, 0, -1],
        method="Nelder-Mead",
        tol=1e-12,
        options={"maxiter": 2000},
    )

    assert penalized_loss(model_coef) == approx(penalized_loss(res.x))
    assert_allclose(model.intercept_, res.x[0])
    assert_allclose(model.coef_, res.x[1:])
    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)
def test_quantile_sample_weight():
    """Check that unequal sample weights yield the weighted fraction.

    Targets above the mean get weight 100. The plain share of targets below
    the median prediction must then exceed 0.5, while the weighted share
    stays at 0.5 within 3e-2.
    """
    # test that with unequal sample weights we still estimate weighted fraction
    n = 1000
    X, y = make_regression(n_samples=n, n_features=5, random_state=0, noise=10.0)
    weight = np.ones(n)
    # when we increase weight of upper observations,
    # estimate of quantile should go up
    upper_half = y > y.mean()
    weight[upper_half] = 100

    quant = QuantileRegressor(quantile=0.5, alpha=1e-8, solver_options={"lstsq": False})
    quant.fit(X, y, sample_weight=weight)

    fraction_below = np.mean(y < quant.predict(X))
    assert fraction_below > 0.5
    weighted_fraction_below = np.average(y < quant.predict(X), weights=weight)
    assert weighted_fraction_below == approx(0.5, abs=3e-2)
def test_too_new_solver_methods_raise_error(X_y_data, solver):
    """Check that fitting with ``solver`` raises an error citing scipy>=1.6.0."""
    X, y = X_y_data
    required_version_hint = "scipy>=1.6.0"
    with pytest.raises(ValueError, match=required_version_hint):
        regressor = QuantileRegressor(solver=solver)
        regressor.fit(X, y)
def test_warning_new_default(X_y_data):
    """Check that fitting with default settings warns about the solver change."""
    X, y = X_y_data
    # The estimator is built outside the `warns` block so that only
    # warnings emitted by `fit` are inspected.
    model = QuantileRegressor()
    expected_warning = "The default solver will change"
    with pytest.warns(FutureWarning, match=expected_warning):
        model.fit(X, y)
def fit(
    self,
    series: Union[TimeSeries, Sequence[TimeSeries]],
    past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    max_samples_per_ts: Optional[int] = None,
    n_jobs_multioutput_wrapper: Optional[int] = None,
    **kwargs,
):
    """
    Fit/train the model on one or multiple series.

    When the likelihood is ``"quantile"``, one ``QuantileRegressor`` is
    fitted per entry of ``self.quantiles`` and stored in the model container
    under that quantile. Otherwise the parent ``fit`` is called once.

    Parameters
    ----------
    series
        TimeSeries or Sequence[TimeSeries] object containing the target values.
    past_covariates
        Optionally, a series or sequence of series specifying past-observed covariates
    future_covariates
        Optionally, a series or sequence of series specifying future-known covariates
    max_samples_per_ts
        This is an integer upper bound on the number of tuples that can be produced
        per time series. It can be used in order to have an upper bound on the total size of the dataset and
        ensure proper sampling. If `None`, it will read all of the individual time series in advance (at dataset
        creation) to know their sizes, which might be expensive on big datasets.
        If some series turn out to have a length that would allow more than `max_samples_per_ts`, only the
        most recent `max_samples_per_ts` samples will be considered.
    n_jobs_multioutput_wrapper
        Number of jobs of the MultiOutputRegressor wrapper to run in parallel. Only used if the model doesn't
        support multi-output regression natively.
    **kwargs
        Additional keyword arguments passed to the `fit` method of the model.

    Returns
    -------
    self
        The fitted model instance.
    """
    # NOTE(review): `n_jobs_multioutput_wrapper` is accepted here but is not
    # forwarded to `super().fit` in either branch -- confirm whether the
    # parent `fit` is supposed to receive it.
    if self.likelihood != "quantile":
        super().fit(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            **kwargs,
        )
        return self

    # set solver for linear program, defaulting to the fast one
    if "solver" not in self.kwargs:
        self.kwargs["solver"] = "highs"

    # test solver availability with dummy problem
    dummy_cost = [1]
    try:
        linprog(c=dummy_cost, method=self.kwargs["solver"])
    except ValueError as ve:
        logger.warning(
            f"{ve}. Upgrading scipy enables significantly faster solvers"
        )
        # set solver to slow legacy
        self.kwargs["solver"] = "interior-point"

    # empty model container in case of multiple calls to fit, e.g. when backtesting
    self._model_container.clear()

    for quantile in self.quantiles:
        # The parent `fit` trains whatever `self.model` currently is, so a
        # fresh regressor is installed for every quantile.
        self.kwargs["quantile"] = quantile
        self.model = QuantileRegressor(**self.kwargs)
        super().fit(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            **kwargs,
        )
        self._model_container[quantile] = self.model

    return self
def test_init_parameters_validation(X_y_data, params, err_msg):
    """Check that invalid constructor parameters raise a matching ValueError."""
    X, y = X_y_data
    with pytest.raises(ValueError, match=err_msg):
        regressor = QuantileRegressor(**params)
        regressor.fit(X, y)
def __init__(
    self,
    lags: Union[int, list] = None,
    lags_past_covariates: Union[int, List[int]] = None,
    lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
    output_chunk_length: int = 1,
    likelihood: str = None,
    quantiles: List[float] = None,
    random_state: Optional[int] = None,
    **kwargs,
):
    """Linear regression model.

    Parameters
    ----------
    lags
        Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags
        are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0).
    lags_past_covariates
        Number of lagged past_covariates values used to predict the next time step. If an integer is given the last
        `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers
        with lags < 0 is required.
    lags_future_covariates
        Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is
        given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first
        `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list
        of integers with lags is required.
    output_chunk_length
        Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
        horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
        be useful if the covariates don't extend far enough into the future.
    likelihood
        Can be set to `quantile` or `poisson`. If set, the model will be probabilistic, allowing sampling at
        prediction time. If set to `quantile`, the `sklearn.linear_model.QuantileRegressor` is used. Similarly, if
        set to `poisson`, the `sklearn.linear_model.PoissonRegressor` is used.
    quantiles
        Fit the model to these quantiles if the `likelihood` is set to `quantile`.
    random_state
        Control the randomness of the sampling. Used as seed for
        `numpy.random.Generator <https://numpy.org/doc/stable/reference/random/generator.html#numpy.random.Generator>`_.
        Ignored when no `likelihood` is set.
        Default: ``None``.
    **kwargs
        Additional keyword arguments passed to `sklearn.linear_model.LinearRegression` (by default), to
        `sklearn.linear_model.PoissonRegressor` (if `likelihood="poisson"`), or to
        `sklearn.linear_model.QuantileRegressor` (if `likelihood="quantile"`).
    """
    self.kwargs = kwargs
    self.likelihood = likelihood
    # Probabilistic state; stays None unless a likelihood is configured.
    self.quantiles = None
    self._median_idx = None
    self._model_container = None
    self._rng = None

    # parse likelihood
    available_likelihoods = ["quantile", "poisson"]  # to be extended
    if likelihood is None:
        # No likelihood: plain deterministic linear regression.
        model = LinearRegression(**kwargs)
    else:
        self._check_likelihood(likelihood, available_likelihoods)
        self._rng = np.random.default_rng(seed=random_state)

        if likelihood == "poisson":
            model = PoissonRegressor(**kwargs)
        elif likelihood == "quantile":
            model = QuantileRegressor(**kwargs)
            prepared_quantiles = self._prepare_quantiles(quantiles)
            self.quantiles, self._median_idx = prepared_quantiles
            self._model_container = self._get_model_container()

    super().__init__(
        lags=lags,
        lags_past_covariates=lags_past_covariates,
        lags_future_covariates=lags_future_covariates,
        output_chunk_length=output_chunk_length,
        model=model,
    )
# Fitting a `QuantileRegressor`
# -----------------------------
#
# In this section, we want to estimate the conditional median as well as
# a low and high quantile fixed at 5% and 95%, respectively. Thus, we will get
# three linear models, one for each quantile.
#
# We will use the quantiles at 5% and 95% to find the outliers in the training
# sample beyond the central 90% interval.
from sklearn.linear_model import QuantileRegressor

quantiles = [0.05, 0.5, 0.95]
lowest_quantile, highest_quantile = min(quantiles), max(quantiles)
predictions = {}
out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_)
for quantile in quantiles:
    qr = QuantileRegressor(quantile=quantile, alpha=0)
    y_pred = qr.fit(X, y_normal).predict(X)
    predictions[quantile] = y_pred

    # Only the two extreme quantiles delimit the interval: a sample is out
    # of bounds when it lies at or below the lowest quantile's prediction,
    # or at or above the highest quantile's prediction.
    if quantile == lowest_quantile:
        out_bounds_predictions = np.logical_or(
            out_bounds_predictions, y_pred >= y_normal
        )
    elif quantile == highest_quantile:
        out_bounds_predictions = np.logical_or(
            out_bounds_predictions, y_pred <= y_normal
        )

# %%
# Now, we can plot the three linear models and the distinguished samples that
# are within the central 90% interval from samples that are outside this
def test_equivariance(quantile):
    """Test equivariance of quantile regression.

    Four properties are checked against a reference fit: scaling of ``y``,
    negative scaling of ``y`` with the mirrored quantile, shifting ``y`` by a
    linear function of ``X``, and a linear reparametrization of ``X``.

    See Koenker (2005) Quantile Regression, Chapter 2.2.3.
    """
    rng = np.random.RandomState(42)
    n_samples, n_features = 100, 5
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        noise=0,
        random_state=rng,
        shuffle=False,
    )
    # make y asymmetric
    y += rng.exponential(scale=100, size=y.shape)
    shared_params = dict(alpha=0, solver_options={"lstsq": True, "tol": 1e-10})
    reference = QuantileRegressor(quantile=quantile, **shared_params)
    reference.fit(X, y)

    # coef(q; a*y, X) = a * coef(q; y, X)
    a = 2.5
    scaled = QuantileRegressor(quantile=quantile, **shared_params)
    scaled.fit(X, a * y)
    assert scaled.intercept_ == approx(a * reference.intercept_, rel=1e-5)
    assert_allclose(scaled.coef_, a * reference.coef_, rtol=1e-5)

    # coef(1-q; -a*y, X) = -a * coef(q; y, X)
    mirrored = QuantileRegressor(quantile=1 - quantile, **shared_params)
    mirrored.fit(X, -a * y)
    assert mirrored.intercept_ == approx(-a * reference.intercept_, rel=1e-5)
    assert_allclose(mirrored.coef_, -a * reference.coef_, rtol=1e-5)

    # coef(q; y + X @ g, X) = coef(q; y, X) + g
    g_intercept, g_coef = rng.randn(), rng.randn(n_features)
    shifted = QuantileRegressor(quantile=quantile, **shared_params)
    shifted.fit(X, y + X @ g_coef + g_intercept)
    assert shifted.intercept_ == approx(reference.intercept_ + g_intercept)
    assert_allclose(shifted.coef_, reference.coef_ + g_coef, rtol=1e-6)

    # coef(q; y, X @ A) = A^-1 @ coef(q; y, X)
    A = rng.randn(n_features, n_features)
    reparametrized = QuantileRegressor(quantile=quantile, **shared_params)
    reparametrized.fit(X @ A, y)
    assert reparametrized.intercept_ == approx(reference.intercept_, rel=1e-5)
    assert_allclose(
        reparametrized.coef_, np.linalg.solve(A, reference.coef_), rtol=1e-5
    )