def test_load_peyton_manning_ts():
    """The Peyton Manning dataset loads with canonical column names and daily frequency."""
    loader = DataLoaderTS()
    series = loader.load_peyton_manning_ts()
    assert series.freq == "1D"
    assert series.original_time_col == TIME_COL
    assert series.original_value_col == VALUE_COL
    # The `y` accessor must mirror the canonical value column.
    assert_equal(series.df[VALUE_COL], series.y)
def test_load_hourly_bikesharing_ts():
    """The bike-sharing dataset loads hourly with its weather regressor columns."""
    loader = DataLoaderTS()
    series = loader.load_bikesharing_ts()
    assert series.freq == "H"
    assert series.original_time_col == "ts"
    assert series.original_value_col == "count"
    assert series.regressor_cols == ["tmin", "tmax", "pn"]
    # The `y` accessor must mirror the canonical value column.
    assert_equal(series.df[VALUE_COL], series.y)
def test_plot(): """Checks plot function""" # Plots with `color` ts = UnivariateTimeSeries() df = pd.DataFrame({ TIME_COL: [ dt(2018, 1, 1, 3, 0, 0), dt(2018, 1, 1, 4, 0, 0), dt(2018, 1, 1, 5, 0, 0) ], VALUE_COL: [1, 2, 3] }) ts.load_data(df, TIME_COL, VALUE_COL) fig = ts.plot(color="green") assert len(fig.data) == 1 assert fig.data[0].line.color == "green" with pytest.raises( ValueError, match= "There is no `anomaly_info` to show. `show_anomaly_adjustment` must be False." ): ts.plot(show_anomaly_adjustment=True) # Plots with `show_anomaly_adjustment` dl = DataLoaderTS() df = dl.load_beijing_pm() value_col = "pm" # Masks up to 2011-02-04-03, and adds 100.0 to the rest anomaly_df = pd.DataFrame({ START_DATE_COL: ["2010-01-01-00", "2011-02-04-03"], END_DATE_COL: ["2011-02-04-03", "2014-12-31-23"], ADJUSTMENT_DELTA_COL: [np.nan, 100.0], METRIC_COL: [value_col, value_col] }) anomaly_info = { "value_col": value_col, "anomaly_df": anomaly_df, "start_date_col": START_DATE_COL, "end_date_col": END_DATE_COL, "adjustment_delta_col": ADJUSTMENT_DELTA_COL, "filter_by_dict": { METRIC_COL: value_col }, "adjustment_method": "add" } ts = UnivariateTimeSeries() ts.load_data(df=df, value_col="pm", anomaly_info=anomaly_info) fig = ts.plot(show_anomaly_adjustment=True) assert len(fig.data) == 2 assert fig.data[0].name == value_col assert fig.data[1].name == f"{value_col}_unadjusted" assert fig.layout.xaxis.title.text == ts.original_time_col assert fig.layout.yaxis.title.text == ts.original_value_col assert fig.data[0].y.shape[0] == df.shape[0] assert fig.data[1].y.shape[0] == df.shape[0] # adjusted data has more NaNs, since anomalies are replaced with NaN assert sum(np.isnan(fig.data[0].y)) == 10906 assert sum(np.isnan(fig.data[1].y)) == 2067
def test_load_hourly_beijing_pm_ts():
    """The Beijing PM dataset loads hourly with its full set of regressor columns."""
    loader = DataLoaderTS()
    series = loader.load_beijing_pm_ts()
    assert series.freq == "H"
    assert series.original_time_col == TIME_COL
    assert series.original_value_col == "pm"
    expected_regressors = ["dewp", "temp", "pres", "cbwd", "iws", "is", "ir"]
    assert series.regressor_cols == expected_regressors
    # The `y` accessor must mirror the canonical value column.
    assert_equal(series.df[VALUE_COL], series.y)
def test_load_hourly_parking_ts():
    """The parking dataset loads at 30-minute frequency, with or without a lot filter."""
    loader = DataLoaderTS()

    # Loaded without selecting a specific system code number.
    series = loader.load_parking_ts(system_code_number=None)
    assert series.original_time_col == "LastUpdated"
    assert series.original_value_col == "OccupancyRatio"
    assert series.freq == "30min"
    assert series.df.shape == (3666, 4)

    # Loaded for one specific system code number.
    series = loader.load_parking_ts(system_code_number="NIA South")
    assert series.original_time_col == "LastUpdated"
    assert series.original_value_col == "OccupancyRatio"
    assert series.freq == "30min"
    assert series.df.shape == (3522, 5)
def test_load_data_anomaly():
    """Checks the `anomaly_info` parameter of `load_data`: no adjustment keeps
    `df_before_adjustment` unset; a two-entry adjustment matches
    `get_canonical_data` output."""
    loader = DataLoaderTS()
    df = loader.load_beijing_pm()
    value_col = "pm"

    # Without anomaly info, no pre-adjustment snapshot is stored.
    ts = UnivariateTimeSeries()
    ts.load_data(df=df, value_col=value_col)
    assert ts.df_before_adjustment is None

    # Adjusts two columns, each entry filtered to its own dimension.
    dim_one = "one"
    dim_two = "two"
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]})

    def anomaly_spec(col, dim, method):
        # One anomaly-adjustment entry for `col`, restricted to rows tagged `dim`.
        return {
            "value_col": col,
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim},
            "adjustment_method": method}

    anomaly_info = [
        anomaly_spec(value_col, dim_one, "add"),
        anomaly_spec("pres", dim_two, "subtract")]

    ts = UnivariateTimeSeries()
    ts.load_data(df=df, value_col=value_col, anomaly_info=anomaly_info)
    # Loading through the class must agree with the standalone canonicalizer.
    expected = get_canonical_data(df=df, value_col=value_col, anomaly_info=anomaly_info)
    assert_equal(ts.df, expected["df"])
    assert_equal(ts.df_before_adjustment, expected["df_before_adjustment"])
def test_load_data_ts():
    """Checks `load_data_ts`: it dispatches to the dataset-specific loaders,
    forwards extra keyword arguments, and raises on an unknown dataset name.
    """
    import re  # local import: only needed to escape the expected error message

    dl = DataLoaderTS()
    ts = dl.load_data_ts(data_name="daily_peyton_manning")
    expected_ts = dl.load_peyton_manning_ts()
    assert_equal(ts.df, expected_ts.df)

    # Extra keyword arguments are forwarded to the underlying loader.
    ts = dl.load_data_ts(data_name="hourly_parking", system_code_number="Shopping")
    expected_ts = dl.load_parking_ts(system_code_number="Shopping")
    assert_equal(ts.df, expected_ts.df)

    # Error due to unavailable data name.
    data_name = "dummy"
    data_inventory = dl.get_data_inventory()
    # `pytest.raises` treats `match` as a regex (searched with `re.search`),
    # and the repr of `data_inventory` contains regex metacharacters
    # (braces/brackets, quotes). Escape the whole expected message instead of
    # hand-escaping individual characters, which was brittle.
    expected_msg = (f"Input data name '{data_name}' is not recognized. "
                    f"Must be one of {data_inventory}.")
    with pytest.raises(ValueError, match=re.escape(expected_msg)):
        dl.load_data_ts(data_name=data_name)
from greykite.algo.forecast.silverkite.constants.silverkite_seasonality import SilverkiteSeasonalityEnum
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import cols_interact
from greykite.common import constants as cst
from greykite.common.features.timeseries_features import build_time_features_df
from greykite.common.features.timeseries_features import convert_date_to_continuous_time
from greykite.framework.benchmark.data_loader_ts import DataLoaderTS
from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

# Loads dataset into UnivariateTimeSeries
dl = DataLoaderTS()
ts = dl.load_peyton_manning_ts()
df = ts.df  # cleaned pandas.DataFrame

# %%
# Exploratory data analysis (EDA)
# --------------------------------
# After reading in a time series, we could first do some exploratory data analysis.
# The `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` class is
# used to store a timeseries and perform EDA.

# describe: print summaries of the time column and the value column
print(ts.describe_time_col())
print(ts.describe_value_col())

# %%
def test_init():
    """A freshly constructed loader exposes the full inventory via `available_datasets`."""
    loader = DataLoaderTS()
    assert loader.available_datasets == loader.get_data_inventory()
def test_get_quantiles_and_overlays():
    """Tests `get_quantiles_and_overlays`: every combination of `show_mean`,
    `show_quantiles`, and `show_overlays`; the accepted types for
    `show_quantiles`/`show_overlays`; the grouping/label options; value
    centering; and a custom `value_col`.
    """
    dl = DataLoaderTS()
    peyton_manning_ts = dl.load_peyton_manning_ts()

    # no columns are requested
    with pytest.raises(
            ValueError,
            match="Must enable at least one of: show_mean, show_quantiles, show_overlays."):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_time_feature="doy")

    # show_mean only
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        mean_col_name="custom_name")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP], ["custom_name"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dow"
    assert grouped_df.shape == (7, 1)
    assert grouped_df.index[0] == 1

    # show_quantiles only (bool): defaults to the 0.1 and 0.9 quantiles
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_quantiles=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP, QUANTILE_COL_GROUP], ["Q0.1", "Q0.9"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 2)
    assert grouped_df.index[0] == pd.Timestamp(2007, 12, 10)

    # show_quantiles only (list), with a custom grouping column and column prefix
    custom_col = pd.Series(
        np.random.choice(list("abcd"), size=peyton_manning_ts.df.shape[0]))
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_custom_column=custom_col,
        show_quantiles=[0, 0.25, 0.5, 0.75, 1],
        quantile_col_prefix="prefix")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP] * 5,
             ["prefix0", "prefix0.25", "prefix0.5", "prefix0.75", "prefix1"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "groups"
    assert grouped_df.shape == (4, 5)
    assert grouped_df.index[0] == "a"

    # checks quantile computation against a direct groupby aggregation
    df = peyton_manning_ts.df.copy()
    df["custom_col"] = custom_col.values
    quantile_df = df.groupby("custom_col")[VALUE_COL].agg(
        [np.nanmin, np.nanmedian, np.nanmax])
    assert_equal(
        grouped_df["quantile"]["prefix0"],
        quantile_df["nanmin"],
        check_names=False)
    assert_equal(
        grouped_df["quantile"]["prefix0.5"],
        quantile_df["nanmedian"],
        check_names=False)
    assert_equal(
        grouped_df["quantile"]["prefix1"],
        quantile_df["nanmax"],
        check_names=False)

    # show_overlays only (bool), no overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 9, [f"overlay{i}" for i in range(9)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 9)
    assert grouped_df.index[0] == 1

    # show_overlays only (int below the available number), time feature overlay label
    np.random.seed(123)  # overlays are subsampled at random; fix seed for stable labels
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=4,
        overlay_label_time_feature="year")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["2007", "2011", "2012", "2014"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (int above the available number), custom overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=200,
        overlay_label_custom_column=custom_col)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["a", "b", "c", "d"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of indices), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=[0, 4],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # show_overlays only (np.ndarray), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=np.arange(0, 6, 2),
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 3,
             ["2007-12-10 00:00:00", "2011-12-09 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 3)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of column names), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=["2007-12-10 00:00:00", "2015-12-08 00:00:00"],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # Show all 3 (no overlay label)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=50,  # 50 per group (50 overlays)
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 + [OVERLAY_COL_GROUP] * 50,
             ["mean", "Q0.05", "Q0.5", "Q0.95"] + [f"overlay{i}" for i in range(50)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (60, 54)
    assert grouped_df.index[-1] == pd.Timestamp(2016, 1, 7)

    # Show all 3 (with overlay label).
    # Pass overlay_pivot_table_kwargs.
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True,
        overlay_label_time_feature="dow",  # 7 possible values
        aggfunc="median")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 + [OVERLAY_COL_GROUP] * 7,
             ["mean", "Q0.05", "Q0.5", "Q0.95",
              "1", "2", "3", "4", "5", "6", "7"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 11)
    assert grouped_df.index[-1] == pd.Timestamp(2015, 10, 29)
    assert np.linalg.norm(
        grouped_df[OVERLAY_COL_GROUP].mean()) > 1.0  # not centered

    # Unknown pivot-table kwargs propagate as a TypeError from pandas.
    with pytest.raises(
            TypeError,
            match="pivot_table\\(\\) got an unexpected keyword argument 'aggfc'"):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_sliding_window_size=180,
            show_mean=True,
            show_quantiles=[0.05, 0.5, 0.95],
            show_overlays=True,
            overlay_label_time_feature="dow",
            aggfc=np.nanmedian)  # unrecognized parameter

    # center_values with show_mean=True
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(
        centered_df[[MEAN_COL_GROUP, OVERLAY_COL_GROUP]].mean()) < 1e-8  # centered at 0
    assert_equal(
        centered_df[QUANTILE_COL_GROUP],
        grouped_df[QUANTILE_COL_GROUP] - grouped_df[MEAN_COL_GROUP].mean()[0])

    # center_values with show_mean=False
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=False,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(
        centered_df[[OVERLAY_COL_GROUP]].mean()) < 1e-8  # centered at 0
    overall_mean = peyton_manning_ts.df[VALUE_COL].mean()
    assert_equal(centered_df[QUANTILE_COL_GROUP],
                 grouped_df[QUANTILE_COL_GROUP] - overall_mean)

    # new value_col: quantiles/overlays computed on a regressor instead of `y`
    df = generate_df_with_reg_for_tests(freq="D", periods=700)["df"]
    ts = UnivariateTimeSeries()
    ts.load_data(df=df)
    grouped_df = ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        show_quantiles=True,
        show_overlays=True,
        overlay_label_time_feature="woy",
        value_col="regressor1")
    df_dow = add_groupby_column(
        df=ts.df,
        time_col=TIME_COL,
        groupby_time_feature="dow")
    dow_mean = df_dow["df"].groupby("dow").agg(
        mean=pd.NamedAgg(column="regressor1", aggfunc=np.nanmean))
    assert_equal(grouped_df["mean"], dow_mean, check_names=False)
def test_plot_quantiles_and_overlays():
    """Tests `plot_quantiles_and_overlays`: each component plotted alone with
    axis/label options, all components together, and custom per-component
    styling (`mean_style`, `quantile_style`, `overlay_style`).
    """
    dl = DataLoaderTS()
    peyton_manning_ts = dl.load_peyton_manning_ts()

    # plots one at a time, with different axis options
    fig = peyton_manning_ts.plot_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_mean=True)
    assert fig.layout.showlegend
    assert fig.layout.title.text == "y vs doy"
    assert fig.layout.xaxis.title.text == "doy"
    assert fig.layout.yaxis.title.text == "y"
    assert len(fig.data) == 1
    assert fig.data[0].mode == "lines"
    assert fig.data[0].legendgroup == MEAN_COL_GROUP
    assert fig.data[0].line.color == "#595959"
    assert fig.data[0].line.width == 2
    assert fig.data[0].name == "mean"

    # quantiles only, with custom axis labels
    fig = peyton_manning_ts.plot_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_quantiles=True,
        ylabel="log(pageviews)",
        xlabel="day of year")
    assert fig.layout.showlegend
    assert fig.layout.title.text == "log(pageviews) vs day of year"
    assert fig.layout.xaxis.title.text == "day of year"
    assert fig.layout.yaxis.title.text == "log(pageviews)"
    assert len(fig.data) == 2
    assert fig.data[0].mode == "lines"
    assert fig.data[0].legendgroup == QUANTILE_COL_GROUP
    assert fig.data[0].line.color == "#1F9AFF"
    assert fig.data[0].line.width == 2
    assert fig.data[0].fill is None  # no fill from first line
    assert fig.data[1].fill == "tonexty"
    assert fig.data[0].name == "Q0.1"
    assert fig.data[1].name == "Q0.9"

    # overlays only, with title and legend disabled
    fig = peyton_manning_ts.plot_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=True,
        overlay_label_time_feature="year",
        ylabel="log(pageviews)",
        xlabel="day of year",
        title="Yearly seasonality patterns",
        showlegend=False)
    assert not fig.layout.showlegend
    assert fig.layout.title.text == "Yearly seasonality patterns"
    assert fig.layout.xaxis.title.text == "day of year"
    assert fig.layout.yaxis.title.text == "log(pageviews)"
    assert len(fig.data) == 10
    assert fig.data[0].mode == "lines"
    assert fig.data[0].legendgroup == OVERLAY_COL_GROUP
    assert fig.data[0].line.color == "#B3B3B3"
    assert fig.data[0].line.width == 1
    assert fig.data[0].line.dash == "solid"
    assert fig.data[0].name == "2007"
    assert fig.data[-1].name == "2016"

    # plots all at once
    fig = peyton_manning_ts.plot_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_mean=True,
        show_quantiles=True,
        show_overlays=True,
        overlay_label_time_feature="year",
        ylabel="log(pageviews)",
        xlabel="day of year",
        title="Yearly seasonality patterns",
        showlegend=True)
    assert fig.layout.showlegend
    assert fig.layout.title.text == "Yearly seasonality patterns"
    assert fig.layout.xaxis.title.text == "day of year"
    assert fig.layout.yaxis.title.text == "log(pageviews)"
    assert len(fig.data) == 13  # 1 (mean) + 2 (quantiles) + 10 (one per year)
    # Overlays are drawn first, then quantiles, then the mean.
    assert fig.data[0].mode == "lines"
    assert fig.data[0].legendgroup == OVERLAY_COL_GROUP
    assert fig.data[0].line.color == "#B3B3B3"
    assert fig.data[0].line.width == 1
    assert fig.data[0].line.dash == "solid"
    assert fig.data[0].opacity == 0.5
    assert fig.data[0].name == "2007"
    assert fig.data[9].name == "2016"
    assert fig.data[10].name == "Q0.1"
    assert fig.data[11].name == "Q0.9"
    assert fig.data[12].name == "mean"

    # plots all with custom style
    beijing_pm_ts = dl.load_beijing_pm_ts()
    mean_style = {
        "line": dict(width=2, color="#757575"),  # gray
        "legendgroup": MEAN_COL_GROUP
    }
    quantile_style = {
        "line": dict(width=2, color="#A3A3A3"),  # light gray
        "legendgroup": QUANTILE_COL_GROUP,
        "fill": "tonexty"
    }
    overlay_style = {
        # Different color for each line (unspecified)
        "line": dict(
            # No legendgroup to allow individual toggling of lines.
            width=1,
            dash="dot")
    }
    fig = beijing_pm_ts.plot_quantiles_and_overlays(
        groupby_time_feature="hour",
        show_mean=True,
        show_quantiles=[0.2, 0.8],
        show_overlays=True,
        overlay_label_time_feature="month",
        center_values=True,
        mean_col_name="avg",
        mean_style=mean_style,
        quantile_style=quantile_style,
        overlay_style=overlay_style,
        xlabel="hour of day",
        value_col="pres",
        title="Daily seasonality pattern: pres")
    assert fig.layout.showlegend
    assert fig.layout.title.text == "Daily seasonality pattern: pres"
    assert fig.layout.xaxis.title.text == "hour of day"
    assert fig.layout.yaxis.title.text == "pres"
    assert len(fig.data) == 15
    # The first traces are the 12 overlays (one per month), then Q0.2, Q0.8, then the mean
    assert fig.data[0].name == "1"
    assert fig.data[11].name == "12"
    assert fig.data[12].name == "Q0.2"
    assert fig.data[13].name == "Q0.8"
    assert fig.data[14].name == "avg"
    assert fig.data[0].mode == "lines"
    assert fig.data[0].legendgroup is None
    assert fig.data[0].line.color is None
    assert fig.data[12].line.color == "#A3A3A3"
    assert fig.data[14].line.color == "#757575"