예제 #1
0
    def predict(self, df, predict_col: str = "yhat"):
        """
        Generate forecasts for every group key / date combination present in ``df``.

        The submitted DataFrame uses the same normalized layout that ``fit`` accepts: one row
        per group and date to forecast, e.g.:

        ========== ==== =============
        region     zone ds
        ========== ==== =============
        northeast  1    '2021-10-01'
        northeast  2    '2021-10-01'
        northeast  1    '2021-10-02'
        ========== ==== =============

        :param df: Normalized DataFrame holding the grouping key columns and the dates to
                   forecast for each group.
        :param predict_col: Name to give the forecast column in the returned ``DataFrame``.
                            When it differs from ``"yhat"``, the ``"yhat"`` column is renamed
                            to this value.
        :return: A single consolidated (unioned) DataFrame with the forecasts of all groups.
        """
        # Guard calls: both are invoked purely for their checks before any work is done.
        self._fit_check()
        _validate_keys_in_df(df, self._group_key_columns)

        group_generator = PandasGroupGenerator(
            self._group_key_columns, self._datetime_col, self._y_col
        )
        prediction_groups = group_generator.generate_prediction_groups(df)
        forecast = self._run_predictions(prediction_groups)

        if predict_col == "yhat":
            return forecast

        # The rename is applied in place on the frame produced by ``_run_predictions``.
        forecast.rename(columns={"yhat": predict_col}, inplace=True)
        return forecast
예제 #2
0
def test_validate_keys_in_df():
    """
    Verify the behaviour of ``_validate_keys_in_df``: grouping keys that are not columns of the
    DataFrame must raise a ``DivinerException`` carrying the expected message, while keys that
    are all present (whether a subset or the full set of grouping columns) return ``None``.
    :return: None
    """
    sample_df = generate_sample()

    # ``match`` is treated as a regular expression, hence the escaped brackets and parentheses.
    expected_message = (
        "Not all key grouping columns supplied: \\('a', 'b', 'q'\\) are present "
        "in the submitted df: \\['ds', 'y', 'a', 'b', 'c', 'z'\\]")

    # 'q' is not among the sample's columns, so validation has to fail.
    with pytest.raises(DivinerException, match=expected_message):
        _validate_keys_in_df(sample_df, ("a", "b", "q"))

    accepted_key_sets = (
        ("a", "b", "c"),  # missing the 'z' column but we don't raise on that.
        ("a", "b", "c", "z"),
    )
    for group_keys in accepted_key_sets:
        assert _validate_keys_in_df(sample_df, group_keys) is None
    def _get_df_with_master_key_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build a copy of ``df`` carrying an additional master group key column.

        The column, whose name is taken from ``self._master_group_key``, holds for every row a
        tuple of that row's values in each of the ``_group_key_columns``. It serves as a single
        aggregation key identifying the unique series a row belongs to.
        For example:

        =========== ==== ============ ======
        region      zone ds           y
        =========== ==== ============ ======
        'northeast' 1    "2021-10-01" 1234.5
        'northeast' 2    "2021-10-01" 3255.6
        'northeast' 1    "2021-10-02" 1255.9
        =========== ==== ============ ======

        With the above dataset and group key columns of ``('region', 'zone')``, the returned
        ``DataFrame`` looks as follows (the master key column is shown here as
        ``grouping_key``):

        =========== ==== ============ ====== ================
        region      zone ds           y      grouping_key
        =========== ==== ============ ====== ================
        'northeast' 1    "2021-10-01" 1234.5 ('northeast', 1)
        'northeast' 2    "2021-10-01" 3255.6 ('northeast', 2)
        'northeast' 1    "2021-10-02" 1255.9 ('northeast', 1)
        =========== ==== ============ ====== ================

        :param df: The normalized ``DataFrame``. It is copied; the new column is added to the
                   copy only.
        :return: A copy of the passed-in ``DataFrame`` with a master grouping key column added
                 that contains the group definition for each row of the input ``DataFrame``.
        """

        _validate_keys_in_df(df, self._group_key_columns)

        key_columns = list(self._group_key_columns)
        keyed_df = df.copy()

        # axis=1 hands each *row* of the key columns to the function, giving one tuple per row.
        row_keys = keyed_df[key_columns].apply(
            lambda row: tuple(row), axis=1)  # pylint: disable=unnecessary-lambda
        keyed_df[self._master_group_key] = row_keys
        return keyed_df
예제 #4
0
    def fit(self,
            df,
            group_key_columns,
            y_col="y",
            datetime_col="ds",
            **kwargs):
        """
        Main ``fit`` method for executing a Prophet ``fit`` on the submitted DataFrame, grouped by
        the ``group_key_columns`` submitted.
        When initiated, the input DataFrame ``df`` will be split into an iterable collection
        that represents a core series to be fit against.
        This ``fit`` method is a per-group wrapper around Prophet's ``fit`` implementation. See:
        https://facebook.github.io/prophet/docs/quick_start.html for information on the basic
        API, as well as links to the source code that will demonstrate all of the options
        available for overriding default functionality.
        For a full description of all parameters that are available to the optimizer, run the
        following in a shell:

        .. code-block:: python
            :caption: Retrieving pystan parameters

            import pystan

            help(pystan.StanModel.optimizing)


        :param df: Normalized pandas DataFrame containing ``group_key_columns``, a ``'ds'`` column,
                   and a target ``'y'`` column.
                   The submitted DataFrame is not modified: when ``y_col`` or ``datetime_col``
                   differ from ``'y'`` / ``'ds'``, the renaming is performed on a new DataFrame.
                   An example normalized data set to be used in this method:

                   ========== ==== ============= ======
                   region     zone ds            y
                   ========== ==== ============= ======
                   northeast  1    '2021-10-01'  1234.5
                   northeast  2    '2021-10-01'  3255.6
                   northeast  1    '2021-10-02'  1255.9
                   ========== ==== ============= ======

        :param group_key_columns: The columns in the ``df`` argument that define, in aggregate, a
                                  unique time series entry. For example, with the DataFrame
                                  referenced in the ``df`` param, group_key_columns could be:
                                  (``'region'``, ``'zone'``)
                                  Specifying an incomplete grouping collection, while valid
                                  through this API (i.e., ('region')), can cause serious problems
                                  with any forecast that is built with this API. Ensure that all
                                  relevant keys are defined in the input `df` and declared in this
                                  param to ensure that the appropriate per-univariate series data
                                  is used to train each model.
        :param y_col: The name of the column within the DataFrame input to any method within this
                      class that contains the endogenous regressor term (the raw data that will
                      be used to train and use as a basis for forecasting).
                      If it is not ``'y'``, the column is renamed to ``'y'`` before grouping.
        :param datetime_col: The name of the column within the DataFrame input that defines the
                             datetime or date values associated with each row of the endogenous
                             regressor (``y_col``) data.
                             If it is not ``'ds'``, the column is renamed to ``'ds'`` before
                             grouping.
        :param kwargs: overrides for underlying ``Prophet`` ``.fit()`` ``**kwargs`` (i.e., optimizer
                       backend library configuration overrides) for further information, see:
                       (https://facebook.github.io/prophet/docs/diagnostics.html\
                       #hyperparameter-tuning).
        :return: object instance (self) of GroupedProphet
        """

        self._model_init_check()
        self._group_key_columns = group_key_columns

        _validate_keys_in_df(df, self._group_key_columns)

        # Collect the required renames so that at most one new DataFrame is created.
        column_renames = {}
        if y_col != "y":
            column_renames[y_col] = "y"
        if datetime_col != "ds":
            column_renames[datetime_col] = "ds"
        if column_renames:
            # A non-inplace rename returns a new DataFrame, so the caller's ``df`` keeps its
            # original column names instead of being silently mutated.
            df = df.rename(columns=column_renames)

        grouped_data = PandasGroupGenerator(
            self._group_key_columns, self._datetime_col,
            self._y_col).generate_processing_groups(df)

        fit_model = []
        for group_key, group_df in grouped_data:
            group_model = self._fit_prophet(group_key, group_df, **kwargs)
            # Groups for which ``_fit_prophet`` returns a falsy value are left out.
            if group_model:
                fit_model.append(group_model)

        self.model = _restructure_fit_payload(fit_model)

        return self
예제 #5
0
    def fit(
        self,
        df,
        group_key_columns,
        y_col: str,
        datetime_col: str,
        exog_cols: List[str] = None,
        ndiffs: Dict = None,
        nsdiffs: Dict = None,
        silence_warnings: bool = False,
        **fit_kwargs,
    ):
        """
        Train one ``pmdarima`` model per group of the submitted normalized DataFrame.

        The input DataFrame is split into grouped data sets according to ``group_key_columns``;
        each group is then fit individually (as an ``ARIMA``, ``AutoARIMA``, or supplied
        ``Pipeline``) based on the templated object supplied as the class instance argument
        `model_template`.
        For API information for ``pmdarima``'s ``ARIMA``, ``AutoARIMA``, and ``Pipeline`` APIs, see:
        https://alkaline-ml.com/pmdarima/modules/classes.html#api-ref

        :param df: A normalized group data set consisting of a datetime column that defines
                   ordering of the series, an endogenous regressor column that specifies the
                   series data for training (e.g. ``y_col``), and column(s) that define the
                   grouping of the series data.

                   An example normalized data set:

                   =========== ===== ======== ============ ======
                   region      zone  country  ds           y
                   =========== ===== ======== ============ ======
                   'northeast' 1     "US"     "2021-10-01" 1234.5
                   'northeast' 2     "US"     "2021-10-01" 3255.6
                   'northeast' 1     "US"     "2021-10-02" 1255.9
                   =========== ===== ======== ============ ======

                   Wherein the grouping_key_columns could be one, some, or all of
                   ``['region', 'zone', 'country']``, the datetime_col would be the `'ds'` column,
                   and the series ``y_col`` (endogenous regressor) would be `'y'`.
        :param group_key_columns: The columns in the ``df`` argument that define, in aggregate, a
                                  unique time series entry. For example, with the DataFrame
                                  referenced in the ``df`` param, group_key_columns could be:
                                  ``('region', 'zone')`` or ``('region')`` or
                                  ``('country', 'region', 'zone')``
        :param y_col: The name of the column within the DataFrame input to any method within this
                      class that contains the endogenous regressor term (the raw data that will
                      be used to train and use as a basis for forecasting).
        :param datetime_col: The name of the column within the DataFrame input that defines the
                             datetime or date values associated with each row of the endogenous
                             regressor (``y_col``) data.
        :param exog_cols: An optional collection of column names within the submitted data to class
                          methods that contain exogenous regressor elements to use as part of model
                          fitting and predicting.

                          Default: ``None``
        :param ndiffs: optional overrides to the ``d`` ``ARIMA`` differencing term for stationarity
                       enforcement.
                       The structure of this argument is a dictionary in the form of:
                       ``{<group_key>: <d_term>}``. To calculate, use
                       ``diviner.PmdarimaAnalyzer.calculate_ndiffs()``
                       The value is only stored when it is a non-empty ``dict``.

                       Default: ``None``
        :param nsdiffs: optional overrides to the ``D`` SARIMAX seasonal differencing term for
                        seasonal stationarity enforcement.
                        The structure of this argument is a dictionary in the form of:
                        ``{<group_key>: <D_term>}``. To calculate, use
                        :py:meth:`diviner.PmdarimaAnalyzer.calculate_nsdiffs`
                        The value is only stored when it is a non-empty ``dict``.

                        Default: ``None``
        :param silence_warnings: If ``True``, removes ``SARIMAX`` and underlying optimizer warning
                                 message from stdout printing. With a sufficiently large number of
                                 groups to process, the volume of these messages to stdout may
                                 become very large.

                                 Default: ``False``
        :param fit_kwargs: ``fit_kwargs`` for ``pmdarima``'s ``ARIMA``, ``AutoARIMA``, or
                           ``Pipeline`` stage overrides.
                           For more information, see the ``pmdarima`` docs:
                           https://alkaline-ml.com/pmdarima/index.html
        :return: object instance of ``GroupedPmdarima`` with the persisted fit model attached.
        """

        self._model_init_check()

        # Record the column configuration on the instance.
        self._y_col = y_col
        self._datetime_col = datetime_col
        self._exog_cols = exog_cols
        self._group_key_columns = group_key_columns

        # Differencing overrides are only stored when given as a non-empty dict.
        if ndiffs and isinstance(ndiffs, dict):
            self._ndiffs = ndiffs
        if nsdiffs and isinstance(nsdiffs, dict):
            self._nsdiffs = nsdiffs

        _validate_keys_in_df(df, self._group_key_columns)

        group_generator = PandasGroupGenerator(
            self._group_key_columns, self._datetime_col, self._y_col
        )
        processing_groups = group_generator.generate_processing_groups(df)
        indexed_groups = apply_datetime_index_to_groups(
            processing_groups, self._datetime_col
        )

        # Per-group datetime metadata derived from the indexed training groups.
        self._max_datetime_per_group = _get_last_datetime_per_group(indexed_groups)
        self._datetime_freq_per_group = _get_datetime_freq_per_group(indexed_groups)

        fitted_groups = []
        for group_key, group_df in indexed_groups:
            fitted_groups.append(
                self._fit_individual_model(
                    group_key, group_df, silence_warnings, **fit_kwargs
                )
            )

        self.model = _restructure_fit_payload(fitted_groups)

        return self