Exemplo n.º 1
0
 def add_time_features(df):
     """Fills in the columns of ``match_cols`` that ``df`` is missing.

     Time features are built from ``df[time_col]`` with
     ``origin_for_time_vars`` as the continuous-time origin. ``time_col``,
     ``origin_for_time_vars`` and ``match_cols`` are not defined here; they
     come from the surrounding scope. ``df`` is modified in place and is also
     returned.
     """
     features = build_time_features_df(
         dt=df[time_col],
         conti_year_origin=origin_for_time_vars)
     for name in match_cols:
         if name in df.columns:
             # Columns already present are left as they are
             continue
         # ``.values`` drops the index, so rows are matched by position
         df[name] = features[name].values
     return df
Exemplo n.º 2
0
def test_build_time_features_df_leap_years():
    """Checks the "toy" feature (scaled by 365) over three days of hourly data
    starting Feb 28, once for 2019 and once for the leap year 2020."""

    def scaled_toy(start):
        # 3 days of hourly timestamps beginning at ``start``
        stamps = pd.date_range(start=start, periods=3 * 24, freq="H").tolist()
        frame = pd.DataFrame({"ts": stamps})
        features = build_time_features_df(dt=frame["ts"], conti_year_origin=2019)
        return 365.0 * features["toy"]

    hour_fraction = np.arange(24) / 24

    # 2019: the three days map to 58, 59, 60, each plus the fraction of the day
    expected_non_leap = (
        np.repeat([58.0, 59.0, 60.0], 24) + np.tile(hour_fraction, 3))
    assert np.allclose(scaled_toy(dt(2019, 2, 28)), expected_non_leap)

    # 2020: the middle day (Feb 29) is expected to stay at exactly 59.0 for all
    # of its hours, and the following day continues from 59.0
    expected_leap = (
        np.repeat([58.0, 59.0, 59.0], 24) +
        np.concatenate([hour_fraction, np.zeros(24), hour_fraction]))
    assert np.allclose(scaled_toy(dt(2020, 2, 28)), expected_leap)
def test_BuildTimeseriesFeaturesTransformer_1():
    """Verifies the transformer output equals a direct ``build_time_features_df`` call."""
    hourly_stamps = pd.date_range(
        start=datetime(2019, 1, 1), periods=100, freq="H").tolist()
    df = pd.DataFrame({"ts": hourly_stamps})

    transformer = BuildTimeseriesFeaturesTransformer(time_col="ts")
    result = transformer.fit_transform(df)

    # Expected output: the input frame with the directly computed features
    # appended column-wise
    direct_features = build_time_features_df(dt=df["ts"], conti_year_origin=2019)
    expected = pd.concat([df, direct_features], axis=1)
    assert result.equals(expected)
    def transform(self, X):
        """Computes time series features for the time column of ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Input data. The column named by ``self.time_col`` is read.

        Returns
        -------
        pd.DataFrame
            The time column of ``X`` followed by the columns returned by
            ``build_time_features_df``.

        Raises
        ------
        NotFittedError
            If ``self.origin_for_time_vars`` is None.
        """
        if self.origin_for_time_vars is None:
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments "
                "before calling 'transform'.")
        assert isinstance(X, pd.DataFrame)
        time_points = X[self.time_col]
        computed = build_time_features_df(
            time_points,
            conti_year_origin=self.origin_for_time_vars)
        # Time column first, then the computed feature columns
        return pd.concat([time_points, computed], axis=1)
Exemplo n.º 5
0
def test_build_time_features_df():
    """Checks ``build_time_features_df`` output on one year of hourly data,
    the quarter-related columns on hand-picked dates, and the error raised
    for empty input."""
    hourly_2019 = pd.date_range(
        start=dt(2019, 1, 1), periods=24 * 365, freq="H").tolist()
    df0 = pd.DataFrame({"ts": hourly_2019})
    time_df = build_time_features_df(dt=df0["ts"], conti_year_origin=2019)

    # (column, row, expected value). The data is hourly starting at
    # 2019-01-01 00:00, so row ``24 * d`` is midnight ``d`` days later.
    exact_checks = [
        ("datetime", 0, datetime.datetime(2019, 1, 1, 0, 0, 0)),
        ("date", 0, datetime.date(2019, 1, 1)),
        ("year", 0, 2019),
        ("year_length", 0, 365),
        ("quarter", 0, 1),
        ("quarter_start", 0, pd.to_datetime("2019-01-01")),
        ("quarter_start", 24 * 89, pd.to_datetime("2019-01-01")),
        ("quarter_start", 24 * 91, pd.to_datetime("2019-04-01")),
        ("toq", 0, 0),
        ("toq", 24 * 16, 16.0 / 90.0),
        ("toq", 24 * 10, 10.0 / 90.0),
        ("toq", 24 * 89, 89.0 / 90.0),
        ("toq", 24 * 91, 1.0 / 91.0),
        ("month", 0, 1),
        ("month_length", 0, 31),
        ("woy", 0, 1),
        ("doy", 0, 1),
        ("dom", 0, 1),
        ("dow", 0, 2),
        ("str_dow", 0, "2-Tue"),
        ("hour", 0, 0),
        ("minute", 0, 0),
        ("second", 0, 0),
        ("year_month", 0, "2019-01"),
        ("year_woy", 0, "2019_01"),
        ("month_dom", 0, "01/01"),
        ("year_woy_dow", 0, "2019_01_2"),
        ("dow_hr", 0, "2_00"),
        ("dow_hr_min", 0, "2_00_00"),
        ("tod", 0, 0.0),
        ("tow", 0, 1.0),
        ("tom", 0, 0.0 / 31),
        ("toy", 0, 0.0),
        ("conti_year", 0, 2019.0),
        ("dow_grouped", 0, "1234-MTuWTh"),
        ("dow_grouped", 24 * 3, "5-Fri"),
        ("dow_grouped", 24 * 4, "6-Sat"),
        ("dow_grouped", 24 * 5, "7-Sun"),
    ]
    for column, row, expected in exact_checks:
        assert time_df[column][row] == expected
    assert not time_df["is_weekend"][0]

    # detailed check on dow_hr: every 7th hour, first 25 samples
    sampled_dow_hr = list(time_df["dow_hr"])[::7][:25]
    assert sampled_dow_hr == [
        '2_00', '2_07', '2_14', '2_21', '3_04', '3_11', '3_18', '4_01', '4_08',
        '4_15', '4_22', '5_05', '5_12', '5_19', '6_02', '6_09', '6_16', '6_23',
        '7_06', '7_13', '7_20', '1_03', '1_10', '1_17', '2_00'
    ]  # noqa: E501

    # Continuous-time columns: zero at the first row, and a power of the
    # elapsed fraction of a year at row 50 (50 hours in)
    ct1 = 50.0 / 365 / 24
    ct_powers = {
        "ct1": 1,
        "ct2": 2,
        "ct3": 3,
        "ct_sqrt": 0.5,
        "ct_root3": 1 / 3,
    }
    for column in ct_powers:
        assert time_df[column][0] == 0.0
    for column, power in ct_powers.items():
        assert time_df[column][50] == pytest.approx(ct1**power, rel=1e-3)

    quarter_dates = [
        "2020-01-01", "2020-03-31",  # Q1 2020 (leap year)
        "2020-04-01", "2020-06-30",  # Q2 2020
        "2020-07-01", "2020-09-30",  # Q3 2020
        "2020-10-01", "2020-12-31",  # Q4 2020
        "2021-01-01", "2021-03-31",  # Q1 2021
        "2021-05-13-12", "2021-08-03-18",  # Q2/3 2021
    ]
    time_df = build_time_features_df(quarter_dates, conti_year_origin=2020.0)
    # Expected values per quarter-related column, one entry per date above
    expected_quarter_columns = {
        "quarter_start": pd.Series(pd.to_datetime([
            "2020-01-01", "2020-01-01",
            "2020-04-01", "2020-04-01",
            "2020-07-01", "2020-07-01",
            "2020-10-01", "2020-10-01",
            "2021-01-01", "2021-01-01",
            "2021-04-01", "2021-07-01",
        ])),
        "quarter_length": pd.Series(
            [91, 91, 91, 91, 92, 92, 92, 92, 90, 90, 91, 92]),
        "doq": pd.Series(
            [1, 91, 1, 91, 1, 92, 1, 92, 1, 90, 43, 34]),
        "toq": pd.Series([
            0.0, 90.0 / 91.0,
            0.0, 90.0 / 91.0,
            0.0, 91.0 / 92.0,
            0.0, 91.0 / 92.0,
            0.0, 89.0 / 90.0,
            42.5 / 91.0, 33.75 / 92.0,
        ]),
    }
    for column, expected in expected_quarter_columns.items():
        assert_equal(time_df[column], expected, check_names=False)

    # Checks for exception
    empty_dt = df0.iloc[0:0]["ts"]
    with pytest.raises(ValueError, match="Length of dt cannot be zero."):
        build_time_features_df(dt=empty_dt, conti_year_origin=2019)
Exemplo n.º 6
0
def test_get_changepoint_values_from_config(hourly_data):
    """Tests get_changepoint_values_from_config: invalid configs raise, and the
    "uniform" / "custom" methods agree with the dedicated helper functions."""
    train_df = hourly_data["train_df"]
    time_features_df = build_time_features_df(
        dt=train_df[TIME_COL],
        conti_year_origin=get_default_origin_for_time_vars(train_df, TIME_COL))
    # Arguments shared by every call to the function under test
    shared = dict(time_features_df=time_features_df, time_col="datetime")

    with pytest.raises(Exception,
                       match="changepoint method must be specified"):
        get_changepoint_values_from_config(
            changepoints_dict={"n_changepoints": 2}, **shared)

    with pytest.raises(NotImplementedError,
                       match="changepoint method.*not recognized"):
        get_changepoint_values_from_config(
            changepoints_dict={"method": "not implemented"}, **shared)

    # Each method is exercised with the default continuous time column and
    # with "ct2" given explicitly
    time_col_variants = ({}, {"continuous_time_col": "ct2"})

    # tests uniform method
    for variant in time_col_variants:
        observed = get_changepoint_values_from_config(
            changepoints_dict={
                "method": "uniform",
                "n_changepoints": 20,
                **variant
            },
            **shared)
        expected = get_evenly_spaced_changepoints_values(
            df=time_features_df, n_changepoints=20, **variant)
        assert np.array_equal(observed, expected)

    # tests custom method
    dates = ["2018-01-01", "2019-01-02-16", "2019-01-03", "2019-02-01"]
    for variant in time_col_variants:
        observed = get_changepoint_values_from_config(
            changepoints_dict={
                "method": "custom",
                "dates": dates,
                **variant
            },
            **shared)
        expected = get_custom_changepoints_values(
            df=time_features_df,
            changepoint_dates=dates,
            time_col="datetime",
            **variant)
        assert np.array_equal(observed, expected)
Exemplo n.º 7
0
def generate_df_for_tests(freq,
                          periods,
                          train_start_date=datetime.datetime(2018, 7, 1),
                          train_end_date=None,
                          train_frac=0.8,
                          conti_year_origin=None,
                          noise_std=2.0,
                          remove_extra_cols=True,
                          autoreg_coefs=None,
                          fs_coefs=(-1, 3, 4),
                          growth_coef=3.0,
                          growth_pow=1.1,
                          intercept=0.0):
    """Generates dataset for unit tests.

    The numpy global random seed is set to 123 on every call, so the
    generated noise is reproducible.

    :param freq: str
        pd.date_range freq parameter, e.g. H or D
    :param periods: int
        number of periods to generate
    :param train_start_date: datetime.datetime
        train start date
    :param train_end_date: Optional[datetime.datetime]
        train end date. Rows with timestamp <= this value form the training
        set; later rows form the test set.
    :param train_frac: Optional[float]
        fraction of data to use for training
        only used if train_end_date isn't provided.
        The timestamp at row ``floor(train_frac * nrows)`` (capped at the
        last row, so that ``train_frac=1.0`` is valid) is used as the train
        end date.
    :param noise_std: float
        standard deviation of gaussian noise
    :param conti_year_origin: float
        the time origin for continuous time variables.
        If None, it is obtained from ``get_default_origin_for_time_vars``.
    :param remove_extra_cols: bool
        whether to remove extra columns besides TIME_COL, VALUE_COL
    :param autoreg_coefs: Optional[List[int]]
        The coefficients for the autoregressive terms.
        If provided the generated series denoted mathematically by Y(t) will be
        converted as follows:
        Y(t) -> Y(t) + c1 Y(t-1) + c2 Y(t-2) + c3 Y(t-3) + ...
        where autoreg_coefs = [c1, c2, c3, ...]
        In this fashion, the obtained series will have autoregressive
        properties not explained by seasonality and growth.
        NOTE(review): the implementation below adds
        ``autoreg_coefs[i] * Y`` shifted by ``-i`` for ``i = 0, 1, ...``,
        i.e. the current and *following* values rather than the lagged ones
        in the formula above, and then back-fills. Confirm which is intended;
        the behavior is left unchanged here.
    :param fs_coefs: Sequence[float]
        The fourier series coefficients used. They multiply, in order, the
        first-order sine terms of "tod", "tow" and "toy".
    :param growth_coef: float
        Multiplier for growth
    :param growth_pow: float
        Power for growth, as function of continuous time
    :param intercept: float
        Constant term added to Y(t)

    :return: Dict[str, any]
        contains full dataframe, train dataframe, test dataframe,
        and nrows in test dataframe
    """
    np.random.seed(123)

    date_list = pd.date_range(start=train_start_date,
                              periods=periods,
                              freq=freq).tolist()

    df0 = pd.DataFrame({TIME_COL: date_list})
    if conti_year_origin is None:
        conti_year_origin = get_default_origin_for_time_vars(df0, TIME_COL)
    time_df = build_time_features_df(dt=df0[TIME_COL],
                                     conti_year_origin=conti_year_origin)
    df = pd.concat([df0, time_df], axis=1)
    df["growth"] = growth_coef * (df["ct1"]**growth_pow)

    # First-order fourier terms for yearly, weekly and daily seasonality
    func = fourier_series_multi_fcn(col_names=["toy", "tow", "tod"],
                                    periods=[1.0, 7.0, 24.0],
                                    orders=[1, 1, 1],
                                    seas_names=None)

    res = func(df)
    df_seas = res["df"]
    df = pd.concat([df, df_seas], axis=1)

    # Response = intercept + growth + seasonality (sine terms only) + noise
    df[VALUE_COL] = (
        intercept + df["growth"] +
        fs_coefs[0] * df[get_fourier_col_name(1, "tod", function_name="sin")] +
        fs_coefs[1] * df[get_fourier_col_name(1, "tow", function_name="sin")] +
        fs_coefs[2] * df[get_fourier_col_name(1, "toy", function_name="sin")] +
        noise_std * np.random.normal(size=df.shape[0]))

    if autoreg_coefs is not None:
        # Accumulates into a temporary column so that every term is computed
        # from the original (unmodified) VALUE_COL
        df["temporary_new_value"] = df[VALUE_COL]
        k = len(autoreg_coefs)
        for i in range(k):
            df["temporary_new_value"] = (
                df["temporary_new_value"] +
                autoreg_coefs[i] * df[VALUE_COL].shift(-i)).bfill()
        df[VALUE_COL] = df["temporary_new_value"]
        del df["temporary_new_value"]

    if train_end_date is None:
        train_rows = np.floor(train_frac * df.shape[0]).astype(int)
        # Caps at the last row: with train_frac=1.0 the computed row equals
        # the number of rows, which is not a valid index label
        train_rows = min(train_rows, df.shape[0] - 1)
        train_end_date = df[TIME_COL][train_rows]

    if remove_extra_cols:
        df = df[[TIME_COL, VALUE_COL]]
    # The row at train_end_date itself belongs to the training set
    train_df = df.loc[df[TIME_COL] <= train_end_date]
    test_df = df.loc[df[TIME_COL] > train_end_date]
    fut_time_num = test_df.shape[0]

    return {
        "df": df,
        "train_df": train_df.reset_index(drop=True),
        "test_df": test_df.reset_index(drop=True),
        "fut_time_num": fut_time_num,
    }
Exemplo n.º 8
0
# For a full list of such features, see `~greykite.common.features.timeseries_features.build_time_features_df`.
#
# If a feature is not automatically created by ``SILVERKITE``, we need to create it
# beforehand and append it to the data df.
# Here we create the "is_football_season" feature.
# Note that we also need to provide the customized column for the forecast horizon period.
# The way we do it is to first create the df with timestamps covering the forecast horizon.
# This can be done with the `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.make_future_dataframe`
# function within the `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` class.
# Then we create a new column of our customized regressor for this augmented df.

# Makes augmented df with forecast horizon 365 days
df_full = ts.make_future_dataframe(periods=365)
# Builds "df_features" that contains datetime information of the "df".
# The first timestamp is taken by position (``iloc``) so the lookup does not
# depend on how ``df_full`` is indexed; it is used as the origin of the
# continuous time variables.
df_features = build_time_features_df(
    dt=df_full["ts"],
    conti_year_origin=convert_date_to_continuous_time(df_full["ts"].iloc[0]))

# Roughly approximates the football season.
# "woy" is short for "week of year", created above.
# Football season is roughly the first 6 weeks and last 17 weeks in a year.
is_football_season = (df_features["woy"] <= 6) | (df_features["woy"] >= 36)
# Adds the new feature to the dataframe.
# ``tolist()`` assigns the values by position, ignoring the index of "df_features".
df_full["is_football_season"] = is_football_season.astype(int).tolist()
df_full.reset_index(drop=True, inplace=True)

# Configures regressor column.
regressors = {"regressor_cols": ["is_football_season"]}

# %%
# Interactions
def test_group_silverkite_seas_components():
    """Tests group_silverkite_seas_components for each seasonality column."""
    diagnostics = SilverkiteDiagnostics()
    time_col = "ts"

    # Each case: (date range start, end, freq, seasonality column name,
    #             time feature used as its values, expected grouped result)
    cases = [
        # Daily
        ("2018-01-01", "2018-01-07", "H", "DAILY_SEASONALITY", "hour", {
            "Hour of day": np.arange(24.0),
            "daily": np.arange(24.0),
        }),
        # Weekly
        ("2018-01-01", "2018-01-20", "D", "WEEKLY_SEASONALITY", "tow", {
            "Day of week": np.arange(7.0),
            "weekly": np.arange(7.0),
        }),
        # Monthly
        ("2018-01-01", "2018-01-31", "D", "MONTHLY_SEASONALITY", "dom", {
            "Time of month": np.arange(31.0) / 31,
            "monthly": np.arange(1.0, 32.0),
        }),
        # Quarterly (92 day quarters)
        ("2018-07-01", "2018-12-31", "D", "QUARTERLY_SEASONALITY", "toq", {
            "Time of quarter": np.arange(92.0) / 92,
            "quarterly": np.arange(92.0) / 92,
        }),
        # Quarterly (90 day quarter)
        ("2018-01-01", "2018-03-31", "D", "QUARTERLY_SEASONALITY", "toq", {
            "Time of quarter": np.arange(90.0) / 90,
            "quarterly": np.arange(90.0) / 90,
        }),
        # Yearly (non-leap years)
        ("2018-01-01", "2019-12-31", "D", "YEARLY_SEASONALITY", "toy", {
            "Time of year": np.arange(365.0) / 365,
            "yearly": np.arange(365.0) / 365,
        }),
    ]

    for start, end, freq, seas_col, feature, expected_columns in cases:
        timestamps = pd.date_range(start=start, end=end, freq=freq).tolist()
        features = build_time_features_df(timestamps, conti_year_origin=2018)
        seas_df = pd.DataFrame({
            time_col: features["datetime"],
            seas_col: features[feature]
        })
        grouped = diagnostics.group_silverkite_seas_components(seas_df)
        assert_frame_equal(grouped, pd.DataFrame(expected_columns))
Exemplo n.º 10
0
def add_groupby_column(df,
                       time_col,
                       groupby_time_feature=None,
                       groupby_sliding_window_size=None,
                       groupby_custom_column=None):
    """Extracts a column to group by from ``df``.

    Exactly one of ``groupby_time_feature``, ``groupby_sliding_window_size``,
    ``groupby_custom_column`` must be provided.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Contains the univariate time series / forecast
    time_col : `str`
        The name of the time column of the univariate time series / forecast
    groupby_time_feature : `str` or None, optional
        If provided, groups by a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
        See that function for valid values.
    groupby_sliding_window_size : `int` or None, optional
        If provided, sequentially partitions data into groups of size
        ``groupby_sliding_window_size``.
    groupby_custom_column : `pandas.Series` or None, optional
        If provided, groups by this column value.
        Should be same length as the ``df``.

    Returns
    -------
    result : `dict`
        Dictionary with two items:

        * ``"df"`` : `pandas.DataFrame`
            A copy of ``df`` with a grouping column added.
            The column can be used to group rows together.

        * ``"groupby_col"`` : `str`
            The name of the groupby column added to ``df``.
            The column name depends on the grouping method:

                - ``groupby_time_feature`` for ``groupby_time_feature``
                - ``{time_col}_downsample`` for ``groupby_sliding_window_size``
                - ``groupby_custom_column.name`` for ``groupby_custom_column``
                  (``"groups"`` if the series has no name).

    Raises
    ------
    ValueError
        If zero or more than one of the three groupby options is provided.
    """
    # Works on a copy so the caller's dataframe is not modified
    df = df.copy()
    # Resets index to support indexing in groupby_sliding_window_size
    dt = pd.Series(df[time_col].values)

    # Validates that exactly one grouping method is requested
    num_specified = sum(
        option is not None
        for option in (groupby_time_feature,
                       groupby_sliding_window_size,
                       groupby_custom_column))
    if num_specified != 1:
        raise ValueError(
            "Exactly one of (groupby_time_feature, groupby_sliding_window_size, groupby_custom_column) "
            "must be specified")

    # Determines the groups
    if groupby_time_feature is not None:
        # Group by a value derived from the time column
        time_features = build_time_features_df(dt,
                                               conti_year_origin=min(dt).year)
        groups = time_features[groupby_time_feature]
        groups.name = groupby_time_feature
    elif groupby_sliding_window_size is not None:
        # Group by sliding window for evaluation over time
        index_dates = split_range_into_groups(
            n=df.shape[0],
            group_size=groupby_sliding_window_size,
            which_group_complete="last"
        )  # ensures the last group is complete (first group may be partial)
        groups = dt[
            index_dates *
            groupby_sliding_window_size]  # uses first date in each group as grouping value
        groups.name = f"{time_col}_downsample"
    else:
        # Group by custom column
        groups = groupby_custom_column

    groups_col_name = groups.name if groups.name is not None else "groups"
    # ``.values`` assigns by position, ignoring the index of ``groups``
    df[groups_col_name] = groups.values
    if df.index.name in df.columns:
        # Removes ambiguity in case the index name is the same as the newly added column,
        # (or an existing column).
        df.index.name = None
    return {"df": df, "groupby_col": groups_col_name}