def predict(self, df, predict_col: str = "yhat"):
    """
    Generate forecasts for every group key / date combination in ``df``.

    ``df`` uses the same normalized layout that ``fit`` accepts, minus the
    target column, e.g.:

    ========== ==== =============
    region     zone ds
    ========== ==== =============
    northeast  1    '2021-10-01'
    northeast  2    '2021-10-01'
    northeast  1    '2021-10-02'
    ========== ==== =============

    :param df: Normalized DataFrame holding the grouping key columns and the
               dates to forecast for each group.
    :param predict_col: Name of the output column that carries the forecast
                        values. When it differs from ``"yhat"`` the
                        ``"yhat"`` column of the result is renamed to it.
    :return: A single DataFrame containing the forecasts of all groups.
    """
    # Checks run in this order: fit state first, then the key columns.
    self._fit_check()
    _validate_keys_in_df(df, self._group_key_columns)

    group_generator = PandasGroupGenerator(
        self._group_key_columns, self._datetime_col, self._y_col
    )
    prediction_groups = group_generator.generate_prediction_groups(df)
    forecast = self._run_predictions(prediction_groups)

    if predict_col == "yhat":
        return forecast

    # The frame is owned by this call, so the rename is done in place.
    forecast.rename(columns={"yhat": predict_col}, inplace=True)
    return forecast
def test_validate_keys_in_df():
    """
    Check grouping-key validation against the sample DataFrame.

    A key collection containing a column that is absent from the frame must
    raise ``DivinerException`` with the expected message, while key
    collections made up only of present columns must validate silently
    (returning ``None``).

    :return: None
    """
    sample_df = generate_sample()

    # Regex handed to ``pytest.raises``; brackets and parentheses are escaped.
    expected_error = (
        "Not all key grouping columns supplied: \\('a', 'b', 'q'\\) are present "
        "in the submitted df: \\['ds', 'y', 'a', 'b', 'c', 'z'\\]"
    )
    with pytest.raises(DivinerException, match=expected_error):
        _validate_keys_in_df(sample_df, ("a", "b", "q"))

    accepted_key_sets = (
        ("a", "b", "c"),  # missing the 'z' column but we don't raise on that.
        ("a", "b", "c", "z"),
    )
    for accepted_keys in accepted_key_sets:
        assert _validate_keys_in_df(sample_df, accepted_keys) is None
def _get_df_with_master_key_column(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with an extra column identifying each row's group.

    The extra column is named by ``self._master_group_key``. Each of its
    values is a tuple built from that row's values in the
    ``_group_key_columns``, so rows sharing the same tuple belong to the
    same series.

    For example, given:

    =========== ==== ============ ======
    region      zone ds           y
    =========== ==== ============ ======
    'northeast' 1    "2021-10-01" 1234.5
    'northeast' 2    "2021-10-01" 3255.6
    'northeast' 1    "2021-10-02" 1255.9
    =========== ==== ============ ======

    and group key columns ``('region', 'zone')``, the returned copy is:

    =========== ==== ============ ====== ================
    region      zone ds           y      <master key>
    =========== ==== ============ ====== ================
    'northeast' 1    "2021-10-01" 1234.5 ('northeast', 1)
    'northeast' 2    "2021-10-01" 3255.6 ('northeast', 2)
    'northeast' 1    "2021-10-02" 1255.9 ('northeast', 1)
    =========== ==== ============ ====== ================

    :param df: The normalized ``DataFrame``.
    :return: A copy of ``df`` with the master grouping key column added.
    """
    _validate_keys_in_df(df, self._group_key_columns)

    keyed_df = df.copy()
    key_columns = list(self._group_key_columns)
    # axis=1 walks the frame row by row; each row becomes one tuple.
    keyed_df[self._master_group_key] = keyed_df[key_columns].apply(tuple, axis=1)
    return keyed_df
def fit(self, df, group_key_columns, y_col="y", datetime_col="ds", **kwargs):
    """
    Fit one Prophet model per group found in ``df``.

    The input is split into per-group series according to
    ``group_key_columns`` and each series is fit individually; this method
    is a per-group wrapper around Prophet's ``fit``. See
    https://facebook.github.io/prophet/docs/quick_start.html for the basic
    Prophet API.

    For a full description of the parameters available to the optimizer,
    run the following in a shell:

    .. code-block:: python
        :caption: Retrieving pystan parameters

        import pystan
        help(pystan.StanModel.optimizing)

    The caller's ``df`` is never modified: when ``y_col`` / ``datetime_col``
    differ from ``"y"`` / ``"ds"``, the renaming happens on a new frame.

    :param df: Normalized pandas DataFrame containing the
               ``group_key_columns``, a datetime column and a target column.
               An example normalized data set:

               ========== ==== ============= ======
               region     zone ds            y
               ========== ==== ============= ======
               northeast  1    '2021-10-01'  1234.5
               northeast  2    '2021-10-01'  3255.6
               northeast  1    '2021-10-02'  1255.9
               ========== ==== ============= ======

    :param group_key_columns: The columns in ``df`` that define, in
                              aggregate, a unique time series. With the
                              example above this could be
                              (``'region'``, ``'zone'``). An incomplete
                              collection (e.g. only ``'region'``) is accepted
                              but mixes several series into one model, so
                              declare every relevant key.
    :param y_col: Name of the column in ``df`` holding the endogenous series
                  (the data used for training). Renamed to ``"y"`` internally.
    :param datetime_col: Name of the column in ``df`` holding the date or
                         datetime of each row. Renamed to ``"ds"`` internally.
    :param kwargs: Forwarded to the per-group fit; intended as overrides for
                   the underlying ``Prophet`` ``.fit()`` ``**kwargs`` (i.e.,
                   optimizer backend configuration). See
                   https://facebook.github.io/prophet/docs/diagnostics.html#hyperparameter-tuning
    :return: object instance (self) of GroupedProphet
    """
    self._model_init_check()
    self._group_key_columns = group_key_columns

    _validate_keys_in_df(df, self._group_key_columns)

    # Collect the renames into one mapping so they are applied together.
    # Sequential renames break when a source name collides with a target
    # name (e.g. y_col="ds"); a single mapping does not.
    column_aliases = {}
    if y_col != "y":
        column_aliases[y_col] = "y"
    if datetime_col != "ds":
        column_aliases[datetime_col] = "ds"
    if column_aliases:
        # Non-in-place rename: returns a new frame, leaving the caller's
        # DataFrame untouched.
        df = df.rename(columns=column_aliases)

    # NOTE(review): self._datetime_col / self._y_col are presumably "ds" / "y"
    # (set outside this method) so that they match the renamed columns.
    grouped_data = PandasGroupGenerator(
        self._group_key_columns, self._datetime_col, self._y_col
    ).generate_processing_groups(df)

    fit_model = []
    for group_key, group_df in grouped_data:
        group_model = self._fit_prophet(group_key, group_df, **kwargs)
        # Falsy results from the per-group fit are dropped from the payload.
        if group_model:
            fit_model.append(group_model)

    self.model = _restructure_fit_payload(fit_model)

    return self
def fit(
    self,
    df,
    group_key_columns,
    y_col: str,
    datetime_col: str,
    exog_cols: List[str] = None,
    ndiffs: Dict = None,
    nsdiffs: Dict = None,
    silence_warnings: bool = False,
    **fit_kwargs,
):
    """
    Train one ``pmdarima`` model per group of the normalized ``df``.

    The input is split into per-group data sets according to
    ``group_key_columns``; each group is given a datetime index and then fit
    individually. For the ``pmdarima`` ``ARIMA``, ``AutoARIMA`` and
    ``Pipeline`` APIs, see:
    https://alkaline-ml.com/pmdarima/modules/classes.html#api-ref

    :param df: A normalized group data set consisting of a datetime column
               that orders the series, an endogenous column with the series
               data for training (``y_col``), and the column(s) that define
               the grouping. An example normalized data set:

               =========== ===== ======== ============ ======
               region      zone  country  ds           y
               =========== ===== ======== ============ ======
               'northeast' 1     "US"     "2021-10-01" 1234.5
               'northeast' 2     "US"     "2021-10-01" 3255.6
               'northeast' 1     "US"     "2021-10-02" 1255.9
               =========== ===== ======== ============ ======

               Here the grouping columns could be one, some, or all of
               ``['region', 'zone', 'country']``, ``datetime_col`` would be
               ``'ds'`` and ``y_col`` would be ``'y'``.
    :param group_key_columns: The columns in ``df`` that define, in
                              aggregate, a unique time series, e.g.
                              ``('region', 'zone')`` or ``('region')`` or
                              ``('country', 'region', 'zone')``.
    :param y_col: Name of the column holding the endogenous series (the raw
                  data used for training and as the basis for forecasting).
    :param datetime_col: Name of the column holding the date or datetime
                         value associated with each row of ``y_col``.
    :param exog_cols: Optional collection of column names containing
                      exogenous regressors to use for fitting and predicting.
                      Default: ``None``
    :param ndiffs: Optional overrides of the ``d`` ``ARIMA`` differencing
                   term, as ``{<group_key>: <d_term>}``. Only stored when it
                   is a non-empty ``dict``. To calculate, use
                   ``diviner.PmdarimaAnalyzer.calculate_ndiffs()``.
                   Default: ``None``
    :param nsdiffs: Optional overrides of the ``D`` SARIMAX seasonal
                    differencing term, as ``{<group_key>: <D_term>}``. Only
                    stored when it is a non-empty ``dict``. To calculate, use
                    :py:meth:`diviner.PmdarimaAnalyzer.calculate_nsdiffs`.
                    Default: ``None``
    :param silence_warnings: If ``True``, removes ``SARIMAX`` and underlying
                             optimizer warning messages from stdout printing.
                             With a large number of groups the volume of
                             these messages can become very large.
                             Default: ``False``
    :param fit_kwargs: ``fit_kwargs`` for ``pmdarima``'s ``ARIMA``,
                       ``AutoARIMA``, or ``Pipeline`` stage overrides. See
                       https://alkaline-ml.com/pmdarima/index.html
    :return: object instance of ``GroupedPmdarima`` with the persisted fit
             model attached.
    """
    self._model_init_check()

    # Record the column configuration on the instance.
    self._group_key_columns = group_key_columns
    self._datetime_col = datetime_col
    self._y_col = y_col
    self._exog_cols = exog_cols

    # Differencing overrides are only stored when given as non-empty dicts;
    # anything else leaves the existing attribute value in place.
    for attribute, override in (("_ndiffs", ndiffs), ("_nsdiffs", nsdiffs)):
        if override and isinstance(override, dict):
            setattr(self, attribute, override)

    _validate_keys_in_df(df, self._group_key_columns)

    group_generator = PandasGroupGenerator(
        self._group_key_columns, self._datetime_col, self._y_col
    )
    raw_groups = group_generator.generate_processing_groups(df)
    indexed_groups = apply_datetime_index_to_groups(raw_groups, self._datetime_col)

    # Per-group datetime metadata is stored on the instance.
    self._max_datetime_per_group = _get_last_datetime_per_group(indexed_groups)
    self._datetime_freq_per_group = _get_datetime_freq_per_group(indexed_groups)

    fitted_groups = []
    for group_key, group_df in indexed_groups:
        fitted_groups.append(
            self._fit_individual_model(
                group_key, group_df, silence_warnings, **fit_kwargs
            )
        )

    self.model = _restructure_fit_payload(fitted_groups)

    return self