def test_metadata_1():
    assert _util.metadata(
        freq="1H", prediction_length=20, cardinality=[10, 3]
    ) == {
        "freq": "1H",
        "prediction_length": 20,
        "feat_static_cat": [
            {"name": "feat_static_cat_0", "cardinality": "10"},
            {"name": "feat_static_cat_1", "cardinality": "3"},
        ],
    }
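# A minimal sketch (not the gluonts implementation) of what the `metadata`
# helper exercised by the test above is expected to produce. The name
# `_metadata_sketch` and the scalar-cardinality handling are assumptions for
# illustration only; the generators below sometimes pass a single int.
def _metadata_sketch(cardinality, freq: str, prediction_length: int) -> dict:
    # normalize a scalar cardinality into a one-element list
    if not isinstance(cardinality, (list, tuple)):
        cardinality = [cardinality]
    return {
        "freq": freq,
        "prediction_length": prediction_length,
        "feat_static_cat": [
            # cardinalities are serialized as strings, matching the test
            {"name": f"feat_static_cat_{i}", "cardinality": str(card)}
            for i, card in enumerate(cardinality)
        ],
    }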
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=ds_info.num_series,
                    freq=ds_info.freq,
                    prediction_length=ds_info.prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), (
        "expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"
    )

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                )
            )

    assert len(train_ts) == ds_info.num_series

    save_to_file(train_file, train_ts)

    # time of the first prediction
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = frequency_add(
                prediction_start_date, ds_info.prediction_length
            )
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                )
            )

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(test_file, test_ts)
def generate_lstnet_dataset(
    dataset_path: Path,
    dataset_name: str,
    prediction_length: Optional[int] = None,
):
    ds_info = datasets_info[dataset_name]

    ds_metadata = metadata(
        cardinality=ds_info.num_series,
        freq=ds_info.freq if ds_info.agg_freq is None else ds_info.agg_freq,
        prediction_length=prediction_length or ds_info.prediction_length,
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(ds_metadata, f)

    time_index = pd.period_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = cast(
        pd.DataFrame,
        pd.read_csv(ds_info.url, header=None),  # type: ignore
    )

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), (
        "expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"
    )

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training
    ts_index = cast(pd.PeriodIndex, timeseries[0].index)
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(train_ts) == ds_info.num_series

    save_to_file(dataset_path / "train" / "data.json", train_ts)

    # time of the first prediction
    prediction_dates = [
        training_end + i * ds_info.prediction_length
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = (
                prediction_start_date + ds_info.prediction_length
            )
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(dataset_path / "test" / "data.json", test_ts)
def generate_m5_dataset(
    dataset_path: Path,
    pandas_freq: str,
    prediction_length: int,
    m5_file_path: Path,
):
    cal_path = f"{m5_file_path}/calendar.csv"
    sales_path = f"{m5_file_path}/sales_train_validation.csv"

    if not os.path.exists(cal_path) or not os.path.exists(sales_path):
        raise RuntimeError(
            "M5 data is available on Kaggle"
            " (https://www.kaggle.com/c/m5-forecasting-accuracy/data). You"
            " first need to agree to the terms of the competition before"
            " being able to download the data. After you have done that,"
            f" please supply the files at {m5_file_path}."
        )

    # Prepare directory
    dataset_path.mkdir(exist_ok=True)

    # Read M5 data from dataset_path
    calendar = pd.read_csv(cal_path)
    sales_train_validation = pd.read_csv(sales_path)

    submission_prediction_length = prediction_length * 2

    # Build dynamic features
    cal_features = calendar.drop(
        [
            "date",
            "wm_yr_wk",
            "weekday",
            "wday",
            "month",
            "year",
            "event_name_1",
            "event_name_2",
            "d",
        ],
        axis=1,
    )
    cal_features["event_type_1"] = cal_features["event_type_1"].apply(
        lambda x: 0 if str(x) == "nan" else 1
    )
    cal_features["event_type_2"] = cal_features["event_type_2"].apply(
        lambda x: 0 if str(x) == "nan" else 1
    )

    test_cal_features = cal_features.values.T
    train_cal_features = test_cal_features[
        :, : -submission_prediction_length - prediction_length
    ]
    test_cal_features = test_cal_features[:, :-submission_prediction_length]

    test_cal_features_list = [test_cal_features] * len(sales_train_validation)
    train_cal_features_list = [train_cal_features] * len(
        sales_train_validation
    )

    # Build static features
    state_ids = (
        sales_train_validation["state_id"].astype("category").cat.codes.values
    )
    state_ids_un = np.unique(state_ids)
    store_ids = (
        sales_train_validation["store_id"].astype("category").cat.codes.values
    )
    store_ids_un = np.unique(store_ids)
    cat_ids = (
        sales_train_validation["cat_id"].astype("category").cat.codes.values
    )
    cat_ids_un = np.unique(cat_ids)
    dept_ids = (
        sales_train_validation["dept_id"].astype("category").cat.codes.values
    )
    dept_ids_un = np.unique(dept_ids)
    item_ids = (
        sales_train_validation["item_id"].astype("category").cat.codes.values
    )
    item_ids_un = np.unique(item_ids)

    stat_cat_list = [item_ids, dept_ids, cat_ids, store_ids, state_ids]
    stat_cat = np.concatenate(stat_cat_list)
    stat_cat = stat_cat.reshape(len(stat_cat_list), len(item_ids)).T

    cardinalities = [
        len(item_ids_un),
        len(dept_ids_un),
        len(cat_ids_un),
        len(store_ids_un),
        len(state_ids_un),
    ]

    # Build target series
    train_ids = sales_train_validation["id"]
    train_df = sales_train_validation.drop(
        ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
        axis=1,
    )
    test_target_values = train_df.values.copy()
    train_target_values = [ts[:-prediction_length] for ts in train_df.values]

    dates = [
        "2011-01-29 00:00:00" for _ in range(len(sales_train_validation))
    ]

    # Create metadata file
    meta_file = dataset_path / "metadata.json"
    with open(meta_file, "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=cardinalities,
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )
            )
        )

    # Build training set
    train_file = dataset_path / "train" / "data.json"
    train_ds = [
        {
            FieldName.TARGET: target.tolist(),
            FieldName.START: start,
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.ITEM_ID: id,
        }
        for (target, start, fdr, fsc, id) in zip(
            train_target_values,
            dates,
            train_cal_features_list,
            stat_cat,
            train_ids,
        )
    ]
    save_to_file(train_file, train_ds)

    # Build testing set
    test_file = dataset_path / "test" / "data.json"
    test_ds = [
        {
            FieldName.TARGET: target.tolist(),
            FieldName.START: start,
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.ITEM_ID: id,
        }
        for (target, start, fdr, fsc, id) in zip(
            test_target_values,
            dates,
            test_cal_features_list,
            stat_cat,
            train_ids,
        )
    ]
    save_to_file(test_file, test_ds)
def generate_retail_dataset(dataset_path: Path, split: str = "2011-11-24"):
    retail_dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
    df = pd.read_excel(retail_dataset_url)

    combination = ["StockCode", "Country"]
    df = _preprocess_retail_data(df, combination)

    # cache the preprocessed frame locally (assumes a ./tmp directory exists)
    df.to_pickle("tmp/temp.pkl")
    # df = pd.read_pickle("temp.pkl")

    idx = pd.IndexSlice[:, :, :split]
    train_df = df.loc[idx, :].reset_index()

    idx = pd.IndexSlice[:, :, split:]
    test_df = df.loc[idx, :].reset_index()

    full_df = df.reset_index()

    single_prediction_length = len(test_df["InvoiceDate"].unique())

    feat_static_cat = combination
    feat_dynamic_real = ["UnitPrice"]
    target = "Quantity"
    date_col = "InvoiceDate"

    os.makedirs(dataset_path, exist_ok=True)

    uniq_combs = train_df[combination].drop_duplicates().apply(tuple, axis=1)

    dynamic_real_train_l = []
    dynamic_real_test_l = []
    stat_cat_l = []
    start_l = []
    train_target_l = []
    test_target_l = []

    for stock_code, country in tqdm(uniq_combs):
        df = train_df[
            (train_df.StockCode == stock_code)
            & (train_df.Country == country)
        ]
        _df = full_df[
            (full_df.StockCode == stock_code)
            & (full_df.Country == country)
        ]

        train_ts = _df[target].values.ravel()
        if (train_ts > 0).sum() > (single_prediction_length + 13):
            test_feat_dyn_array = _df.loc[:, feat_dynamic_real].values.T
            train_feat_dyn_array = test_feat_dyn_array[
                :, :-single_prediction_length
            ]

            test_ts = train_ts.copy()
            train_ts = train_ts[:-single_prediction_length]

            dynamic_real_train_l.append(train_feat_dyn_array)
            dynamic_real_test_l.append(test_feat_dyn_array)
            start_l.append(df[date_col].min())
            train_target_l.append(train_ts)
            test_target_l.append(test_ts)
            stat_cat_l.append(
                np.squeeze(
                    df.loc[:, feat_static_cat].drop_duplicates().values
                )
            )

    stat_cat_cardinalities = [
        len(full_df[col].unique()) for col in feat_static_cat
    ]

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=stat_cat_cardinalities,
                    freq="1D",
                    prediction_length=single_prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    train_ds = [
        {
            FieldName.ITEM_ID: "|".join(map(str, uniq_comb)),
            FieldName.TARGET: target.tolist(),
            FieldName.START: str(start),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        }
        for uniq_comb, target, start, fdr, fsc in zip(
            uniq_combs,
            train_target_l,
            start_l,
            dynamic_real_train_l,
            stat_cat_l,
        )
    ]
    save_to_file(train_file, train_ds)

    test_ds = [
        {
            FieldName.ITEM_ID: "|".join(map(str, uniq_comb)),
            FieldName.TARGET: target.tolist(),
            FieldName.START: str(start),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        }
        for uniq_comb, target, start, fdr, fsc in zip(
            uniq_combs,
            test_target_l,
            start_l,
            dynamic_real_test_l,
            stat_cat_l,
        )
    ]
    save_to_file(test_file, test_ds)
def generate_m3_dataset(dataset_path: Path, m3_freq: str):
    from gluonts.dataset.repository.datasets import default_dataset_path

    m3_xls_path = default_dataset_path / "M3C.xls"
    if not os.path.exists(m3_xls_path):
        raise RuntimeError(
            "The m3 data is available at"
            " https://forecasters.org/resources/time-series-data/m3-competition/"
            " Please download the file and copy the files to this location:"
            f" {m3_xls_path}"
        )

    class M3Setting(NamedTuple):
        sheet_name: str
        prediction_length: int
        freq: str

    subsets = {
        "yearly": M3Setting("M3Year", 6, "12M"),
        "quarterly": M3Setting("M3Quart", 8, "3M"),
        "monthly": M3Setting("M3Month", 18, "1M"),
        "other": M3Setting("M3Other", 8, "3M"),
    }
    assert (
        m3_freq.lower() in subsets
    ), f"invalid m3_freq='{m3_freq}'. Allowed values: {subsets.keys()}"

    if m3_freq.lower() == "other":
        warnings.warn(
            "Be aware: The M3-other dataset does not have a known frequency."
            " Since gluonts needs a known frequency, we will generate the"
            " dataset with an artificial `quarterly` frequency."
        )

    subset = subsets[m3_freq.lower()]
    df = pd.read_excel(m3_xls_path, sheet_name=subset.sheet_name)

    def truncate_trailing_nan(v: np.ndarray):
        last_finite_index = np.where(np.isfinite(v))[0][-1]
        return v[: last_finite_index + 1]

    train_data = []
    test_data = []

    def normalize_category(c: str):
        return c.strip()

    df["Category"] = df["Category"].apply(normalize_category)
    categories = list(df["Category"].unique())
    cat_map = {c: i for i, c in enumerate(categories)}

    i = 0
    for _, row in df.iterrows():
        vals = row.values
        series, n, nf, category, starting_year, starting_offset = vals[:6]
        target = np.asarray(vals[6:], dtype=np.float64)
        target = truncate_trailing_nan(target)
        assert len(target) == n
        assert nf == subset.prediction_length

        mock_start = "1750-01-01 00:00:00"
        if starting_year == 0:
            assert starting_offset == 0
            starting_year = mock_start

        s = pd.Timestamp(str(starting_year), freq=subset.freq)
        offset = max(starting_offset - 1, 0)
        if offset:
            s += offset * s.freq
        start = str(s).split(" ")[0]

        cat = [i, cat_map[category]]

        d_train = to_dict(
            target_values=target[: -subset.prediction_length],
            start=start,
            cat=cat,
            item_id=series,
        )
        train_data.append(d_train)

        d_test = to_dict(
            target_values=target, start=start, cat=cat, item_id=series
        )
        test_data.append(d_test)
        i += 1

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=[len(train_data), len(categories)],
                    freq=subset.freq,
                    prediction_length=subset.prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    save_to_file(train_file, train_data)
    save_to_file(test_file, test_data)

    check_dataset(dataset_path, len(df))
def generate_m4_dataset(
    dataset_path: Path,
    m4_freq: str,
    pandas_freq: str,
    prediction_length: int,
):
    m4_dataset_url = (
        "https://github.com/M4Competition/M4-methods/raw/master/Dataset"
    )
    train_df = pd.read_csv(
        f"{m4_dataset_url}/Train/{m4_freq}-train.csv", index_col=0
    )
    test_df = pd.read_csv(
        f"{m4_dataset_url}/Test/{m4_freq}-test.csv", index_col=0
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(train_df),
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    train_target_values = [ts[~np.isnan(ts)] for ts in train_df.values]

    test_target_values = [
        np.hstack([train_ts, test_ts])
        for train_ts, test_ts in zip(train_target_values, test_df.values)
    ]

    if m4_freq == "Yearly":
        # some time series span more than 300 years, which cannot be
        # represented in pandas; this is probably due to a misclassification
        # of those time series as Yearly. For this reason we keep only the
        # time series with at most 300 values.
        train_target_values = [
            ts for ts in train_target_values if len(ts) <= 300
        ]
        test_target_values = [
            ts for ts in test_target_values if len(ts) <= 300
        ]

    # the original dataset did not include time stamps, so we use the
    # earliest point available in pandas as the start date for each series.
    mock_start_dataset = "1750-01-01 00:00:00"

    save_to_file(
        train_file,
        [
            to_dict(
                target_values=target,
                start=mock_start_dataset,
                cat=[cat],
                item_id=cat,
            )
            for cat, target in enumerate(train_target_values)
        ],
    )

    save_to_file(
        test_file,
        [
            to_dict(
                target_values=target,
                start=mock_start_dataset,
                cat=[cat],
                item_id=cat,
            )
            for cat, target in enumerate(test_target_values)
        ],
    )
def generate_uber_dataset(
    dataset_path: Path, uber_freq: str, prediction_length: int
):
    subsets = {"daily": "1D", "hourly": "1H"}
    assert (
        uber_freq.lower() in subsets
    ), f"invalid uber_freq='{uber_freq}'. Allowed values: {subsets.keys()}"
    freq_setting = subsets[uber_freq.lower()]

    # download the dataset and read the data
    with tempfile.TemporaryDirectory() as dir_path:
        temp_dir_path = Path(dir_path)
        temp_zip_path = temp_dir_path / "uber-dataset.zip"
        uber_url_path = (
            "http://raw.githubusercontent.com/fivethirtyeight/"
            "uber-tlc-foil-response/master/uber-trip-data/"
            "uber-raw-data-janjune-15.csv.zip"
        )
        request.urlretrieve(uber_url_path, temp_zip_path)

        with zipfile.ZipFile(temp_zip_path) as zf:
            zf.extractall(path=temp_dir_path)

        uber_file_path = temp_dir_path / "uber-raw-data-janjune-15.csv"
        uber_df = pd.read_csv(
            uber_file_path,
            header=0,
            usecols=["Pickup_date", "locationID"],
            index_col=0,
        )

    # We split the raw data by locationID. Each json line represents the
    # time series of one locationID; the targets are the number of pickup
    # events during a day or an hour.
    time_series_of_locations = list(uber_df.groupby(by="locationID"))
    dataset_path.mkdir(exist_ok=True)
    train_path = dataset_path / "train"
    test_path = dataset_path / "test"
    train_path.mkdir(exist_ok=True)
    test_path.mkdir(exist_ok=True)

    train_file = train_path / "data.json"
    test_file = test_path / "data.json"
    with open(train_file, "w") as o_train, open(test_file, "w") as o_test:
        for locationID, df in time_series_of_locations:
            df.index = pd.to_datetime(df.index)
            # `sort_index` returns a new frame; assign it so the earliest
            # pickup really is at position 0 when the start time is read.
            df = df.sort_index()

            count_series = df.resample(rule=freq_setting).size()
            start_time = pd.Timestamp(df.index[0]).strftime("%Y-%m-%d %X")
            target = count_series.values.tolist()
            feat_static_cat = [locationID]

            format_dict = {
                "start": start_time,
                "target": target,
                "feat_static_cat": feat_static_cat,
            }
            test_json_line = json.dumps(format_dict)
            o_test.write(test_json_line)
            o_test.write("\n")

            format_dict["target"] = format_dict["target"][:-prediction_length]
            train_json_line = json.dumps(format_dict)
            o_train.write(train_json_line)
            o_train.write("\n")

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(time_series_of_locations),
                    # freq_setting is "1D"/"1H"; index 1 picks the plain
                    # pandas offset alias ("D" or "H")
                    freq=freq_setting[1],
                    prediction_length=prediction_length,
                )
            )
        )
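# Illustrative usage sketch only: the output path and the M4 "Hourly"
# parameters below are assumed example values, not taken from this module.
if __name__ == "__main__":
    from pathlib import Path

    # generate the M4 hourly dataset into a local directory
    generate_m4_dataset(
        dataset_path=Path("datasets/m4_hourly"),
        m4_freq="Hourly",
        pandas_freq="H",
        prediction_length=48,
    )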