def forecast(us_counties: pd.DataFrame, log_metrics: bool, hp: dict, metric_threshold: int = 5): metrics = {} growth_rates = {} horizon = hp['horizon'] metric_skip = 0 for location in tqdm(us_counties['location'].unique(), unit=' counties'): if log_metrics: if metric_skip == metric_threshold: metric_skip = 0 else: metric_skip += 1 continue y = us_counties[us_counties.location == location].reset_index()['cases'] if len(y) < horizon: continue model = AutoARIMA(**hp) with warnings.catch_warnings(): # When there is no cases, it will throw a warning warnings.filterwarnings("ignore") try: if log_metrics: y, yv = train_test_split(y, test_size=horizon) model.fit(y) predictions = model.predict(n_periods=horizon) # Value error very rarely with weird/broken time series data except (ValueError, IndexError): continue if log_metrics: metrics[location] = np.mean( np.abs(yv - predictions) / (np.abs(yv) + np.abs(predictions))) last_forecast = predictions[len(predictions) - 1] todays_cases = y[len(y) - 1] # Places with very small amount of cases are hard to predict case_handicap = min(1.0, 0.5 + (todays_cases / 120)) growth = (last_forecast / todays_cases) * case_handicap growth_rates[location] = growth final_list = [ i[0] for i in sorted(growth_rates.items(), key=lambda i: i[1], reverse=True) ] def rank_risk(row) -> int: case_growth = growth_rates.get(row.location) if not case_growth: return 1 return round(max(0, (case_growth - 1) * 100)) if not log_metrics: us_counties['outbreak_risk'] = us_counties.apply(rank_risk, axis=1) return us_counties, final_list, metrics
def test_AutoARIMA_class(): train, test = wineind[:125], wineind[125:] mod = AutoARIMA(maxiter=5) mod.fit(train) endog = mod.model_.arima_res_.data.endog assert_array_almost_equal(train, endog) # update mod.update(test, maxiter=2) new_endog = mod.model_.arima_res_.data.endog assert_array_almost_equal(wineind, new_endog)
def test_issue_30(): # From the issue: vec = np.array([33., 44., 58., 49., 46., 98., 97.]) arm = AutoARIMA(out_of_sample_size=1, seasonal=False, suppress_warnings=True) arm.fit(vec) # This is a way to force it: ARIMA(order=(0, 1, 0), out_of_sample_size=1).fit(vec) # Want to make sure it works with exog arrays as well exog = np.random.RandomState(1).rand(vec.shape[0], 2) auto_arima(vec, exogenous=exog, out_of_sample_size=1, seasonal=False, suppress_warnings=True) # This is a way to force it: ARIMA(order=(0, 1, 0), out_of_sample_size=1).fit(vec, exogenous=exog)
class AutoArimaEstimator(BaseForecastEstimator): """Wrapper for ``pmdarima.arima.AutoARIMA``. It currently does not handle the regressor issue when there is gap between train and predict periods. Parameters ---------- score_func : callable see ``BaseForecastEstimator``. coverage : float between [0.0, 1.0] see ``BaseForecastEstimator``. null_model_params : dict with arguments to define DummyRegressor null model, optional, default=None see ``BaseForecastEstimator``. regressor_cols: `list` [`str`], optional, default None A list of regressor columns used during training and prediction. If None, no regressor columns are used. See ``AutoArima`` documentation for rest of the parameter descriptions: * https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.AutoARIMA.html#pmdarima.arima.AutoARIMA Attributes ---------- model : ``AutoArima`` object Auto arima model object fit_df : `pandas.DataFrame` or None The training data used to fit the model. forecast : `pandas.DataFrame` Output of the predict method of ``AutoArima``. """ def __init__( self, # Null model parameters score_func: callable = mean_squared_error, coverage: float = 0.90, null_model_params: Optional[Dict] = None, # Additional parameters regressor_cols: Optional[List[str]] = None, freq: Optional[float] = None, # pmdarima fit parameters start_p: Optional[int] = 2, d: Optional[int] = None, start_q: Optional[int] = 2, max_p: Optional[int] = 5, max_d: Optional[int] = 2, max_q: Optional[int] = 5, start_P: Optional[int] = 1, D: Optional[int] = None, start_Q: Optional[int] = 1, max_P: Optional[int] = 2, max_D: Optional[int] = 1, max_Q: Optional[int] = 2, max_order: Optional[int] = 5, m: Optional[int] = 1, seasonal: Optional[bool] = True, stationary: Optional[bool] = False, information_criterion: Optional[str] = 'aic', alpha: Optional[int] = 0.05, test: Optional[str] = 'kpss', seasonal_test: Optional[str] = 'ocsb', stepwise: Optional[bool] = True, n_jobs: Optional[int] = 1, start_params: Optional[Dict] = None, trend: Optional[str] = None, method: Optional[str] = 'lbfgs', maxiter: Optional[int] = 50, offset_test_args: Optional[Dict] = None, seasonal_test_args: Optional[Dict] = None, suppress_warnings: Optional[bool] = True, error_action: Optional[str] = 'trace', trace: Optional[Union[int, bool]] = False, random: Optional[bool] = False, random_state: Optional[Union[int, callable]] = None, n_fits: Optional[int] = 10, out_of_sample_size: Optional[int] = 0, scoring: Optional[str] = 'mse', scoring_args: Optional[Dict] = None, with_intercept: Optional[Union[bool, str]] = "auto", # pmdarima predict parameters return_conf_int: Optional[bool] = True, dynamic: Optional[bool] = False): # Every subclass of BaseForecastEstimator must call super().__init__ super().__init__( score_func=score_func, coverage=coverage, null_model_params=null_model_params) self.regressor_cols = regressor_cols self.freq = freq self.start_p = start_p self.d = d self.start_q = start_q self.max_p = max_p self.max_d = max_d self.max_q = max_q self.start_P = start_P self.D = D self.start_Q = start_Q self.max_P = max_P self.max_D = max_D self.max_Q = max_Q self.max_order = max_order self.m = m self.seasonal = seasonal self.stationary = stationary self.information_criterion = information_criterion self.alpha = alpha self.test = test self.seasonal_test = seasonal_test self.stepwise = stepwise self.n_jobs = n_jobs self.start_params = start_params self.trend = trend self.method = method self.maxiter = maxiter self.offset_test_args = offset_test_args self.seasonal_test_args = seasonal_test_args self.suppress_warnings = suppress_warnings self.error_action = error_action self.trace = trace self.random = random self.random_state = random_state self.n_fits = n_fits self.out_of_sample_size = out_of_sample_size self.scoring = scoring self.scoring_args = scoring_args self.with_intercept = with_intercept self.return_conf_int = return_conf_int self.coverage = coverage self.dynamic = dynamic # set by the fit method self.model = None self.fit_df = None # set by the predict method self.forecast = None def fit(self, X, y=None, time_col=TIME_COL, value_col=VALUE_COL, **fit_params): """Fits ``ARIMA`` forecast model. Parameters ---------- X : `pandas.DataFrame` Input timeseries, with timestamp column, value column, and any additional regressors. The value column is the response, included in X to allow transformation by `sklearn.pipeline.Pipeline` y : ignored The original timeseries values, ignored. (The y for fitting is included in ``X``.) time_col : `str` Time column name in ``X`` value_col : `str` Value column name in ``X`` fit_params : `dict` additional parameters for null model Returns ------- self : self Fitted model is stored in ``self.model``. """ X = X.sort_values(by=time_col) # fits null model super().fit(X, y=y, time_col=time_col, value_col=value_col, **fit_params) self.fit_df = X # fits AutoArima model self.model = AutoARIMA( start_p=self.start_p, d=self.d, start_q=self.start_q, max_p=self.max_p, max_d=self.max_d, max_q=self.max_q, start_P=self.start_P, D=self.D, start_Q=self.start_Q, max_P=self.max_P, max_D=self.max_D, max_Q=self.max_Q, max_order=self.max_order, m=self.m, seasonal=self.seasonal, stationary=self.stationary, information_criterion=self.information_criterion, alpha=self.alpha, test=self.test, seasonal_test=self.seasonal_test, stepwise=self.stepwise, n_jobs=self.n_jobs, start_params=self.start_params, trend=self.trend, method=self.method, maxiter=self.maxiter, offset_test_args=self.offset_test_args, seasonal_test_args=self.seasonal_test_args, suppress_warnings=self.suppress_warnings, error_action=self.error_action, trace=self.trace, random=self.random, random_state=self.random_state, n_fits=self.n_fits, out_of_sample_size=self.out_of_sample_size, scoring=self.scoring, scoring_args=self.scoring_args, with_intercept=self.with_intercept, return_conf_int=self.return_conf_int, dynamic=self.dynamic, regressor_cols=self.regressor_cols ) # fits auto-arima if self.regressor_cols is None: reg_df = None else: reg_df = X[self.regressor_cols] self.model.fit(y=X[[value_col]], X=reg_df) return self def predict(self, X, y=None): """Creates forecast for the dates specified in ``X``. Currently does not support the regressor case where there is gap between train and predict periods. Parameters ---------- X: `pandas.DataFrame` Input timeseries with timestamp column and any additional regressors. Timestamps are the dates for prediction. Value column, if provided in ``X``, is ignored. y: ignored. Returns ------- predictions: `pandas.DataFrame` Forecasted values for the dates in ``X``. Columns: - ``TIME_COL``: dates - ``PREDICTED_COL``: predictions - ``PREDICTED_LOWER_COL``: lower bound of predictions - ``PREDICTED_UPPER_COL``: upper bound of predictions """ X = X.sort_values(by=self.time_col_) # Returns the cached result if applicable cached_predictions = super().predict(X=X) if cached_predictions is not None: return cached_predictions # Currently does not support the regressor case where # there is gap between train and predict periods if self.regressor_cols is None: fut_reg_df = None else: fut_df = X[X[self.time_col_] > self.fit_df[self.time_col_].iloc[-1]] fut_reg_df = fut_df[self.regressor_cols] # Auto-arima only accepts regressor values beyond `fit_df` if self.freq is None: self.freq = pd.infer_freq(self.fit_df[self.time_col_]) if self.freq == "H": self.freq = self.freq.lower() # np.timedelta recognizes lower case letters chosen_d = self.model.model_.order[1] # This is the value of the d chosen by auto-arima forecast_start = int((X[self.time_col_].iloc[0] - self.fit_df[self.time_col_].iloc[0])/np.timedelta64(1, self.freq)) if forecast_start < chosen_d: append_length = chosen_d - forecast_start # Number of NaNs to append to `pred_df` forecast_start = chosen_d # Auto-arima can not predict below the chosen d else: append_length = 0 forecast_end = int((X[self.time_col_].iloc[-1] - self.fit_df[self.time_col_].iloc[0])/np.timedelta64(1, self.freq)) predictions = self.model.predict_in_sample( X=fut_reg_df, start=forecast_start, end=forecast_end, dynamic=self.dynamic, return_conf_int=self.return_conf_int, alpha=(1-self.coverage) ) if append_length > 0: pred_df = pd.DataFrame({ TIME_COL: X[self.time_col_], PREDICTED_COL: np.append(np.repeat(np.nan, append_length), predictions[0]), PREDICTED_LOWER_COL: np.append(np.repeat(np.nan, append_length), predictions[1][:, 0]), PREDICTED_UPPER_COL: np.append(np.repeat(np.nan, append_length), predictions[1][:, 1]) }) else: pred_df = pd.DataFrame({ TIME_COL: X[self.time_col_], PREDICTED_COL: predictions[0], PREDICTED_LOWER_COL: predictions[1][:, 0], PREDICTED_UPPER_COL: predictions[1][:, 1] }) self.forecast = pred_df # Caches the predictions self.cached_predictions_ = pred_df return pred_df def summary(self): BaseForecastEstimator.summary(self) # AutoArima summary return self.model.summary()