import json
import os
import shutil
import warnings
from pathlib import Path
from typing import NamedTuple, Optional, cast

import numpy as np
import pandas as pd

from gluonts.dataset.common import FileDataset
from gluonts.dataset.field_names import FieldName

# `metadata`, `to_dict`, `save_to_file`, `load_from_pandas`, `frequency_add`,
# `check_dataset`, `datasets_info`, and `GPCopulaDataset` are module-local
# helpers of the gluonts dataset-repository modules these functions come from.


def generate_m4_dataset(
    dataset_path: Path,
    m4_freq: str,
    pandas_freq: str,
    prediction_length: int,
):
    m4_dataset_url = (
        "https://github.com/M4Competition/M4-methods/raw/master/Dataset"
    )
    train_df = pd.read_csv(
        f"{m4_dataset_url}/Train/{m4_freq}-train.csv", index_col=0
    )
    test_df = pd.read_csv(
        f"{m4_dataset_url}/Test/{m4_freq}-test.csv", index_col=0
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(train_df),
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    train_target_values = [ts[~np.isnan(ts)] for ts in train_df.values]

    test_target_values = [
        np.hstack([train_ts, test_ts])
        for train_ts, test_ts in zip(train_target_values, test_df.values)
    ]

    if m4_freq == "Yearly":
        # Some time series span more than 300 years, which cannot be
        # represented in pandas; this is probably due to a misclassification
        # of those series as Yearly. We keep only the last 300 years for
        # training. This does not affect testing, since the prediction
        # length is far below 300 years.
        train_target_values = [ts[-300:] for ts in train_target_values]
        test_target_values = [ts[-300:] for ts in test_target_values]

    # The original dataset contains no timestamps, so we use the same mock
    # start date for every series -- an early date that pandas can represent.
    mock_start_dataset = "1750-01-01 00:00:00"

    save_to_file(
        train_file,
        [
            to_dict(target_values=target, start=mock_start_dataset, cat=[cat])
            for cat, target in enumerate(train_target_values)
        ],
    )

    save_to_file(
        test_file,
        [
            to_dict(target_values=target, start=mock_start_dataset, cat=[cat])
            for cat, target in enumerate(test_target_values)
        ],
    )
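# A minimal usage sketch. The output path and the Hourly settings below
# (pandas freq "H", prediction length 48) are illustrative assumptions, not
# values enforced by the function itself:
#
#     generate_m4_dataset(
#         dataset_path=Path("datasets/m4_hourly"),
#         m4_freq="Hourly",
#         pandas_freq="H",
#         prediction_length=48,
#     )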
def save_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    dataset = list(FileDataset(dataset_path, freq=ds_info.freq))
    shutil.rmtree(dataset_path)
    train_file = dataset_path / "data.json"
    save_to_file(
        train_file,
        [
            to_dict(
                target_values=data_entry[FieldName.TARGET],
                start=data_entry[FieldName.START],
                # Each rolling-evaluation copy of a series keeps the
                # category index of the original series.
                cat=[cat % ds_info.num_series],
            )
            for cat, data_entry in enumerate(dataset)
        ],
    )
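# The modulo above folds the rolling-evaluation copies back onto the original
# series indices; e.g. with num_series=3 and two rolling evaluations, the six
# enumerated entries map to categories 0, 1, 2, 0, 1, 2:
#
#     assert [cat % 3 for cat in range(6)] == [0, 1, 2, 0, 1, 2]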
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=ds_info.num_series,
                    freq=ds_info.freq,
                    prediction_length=ds_info.prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)
    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), (
        "expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"
    )

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                )
            )

    assert len(train_ts) == ds_info.num_series
    save_to_file(train_file, train_ts)

    # start date of each rolling-evaluation forecast window
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = frequency_add(
                prediction_start_date, ds_info.prediction_length
            )
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                )
            )

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations
    save_to_file(test_file, test_ts)
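# Usage sketch, assuming "electricity" is one of the keys registered in this
# module's datasets_info (the key name is an assumption, not guaranteed):
#
#     generate_lstnet_dataset(Path("datasets/electricity"), "electricity")
#
# Training keeps roughly the first 80% of each series (up to training_end);
# each of the rolling_evaluations test entries extends one prediction_length
# further into the remaining 20%.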
# A revised generate_lstnet_dataset: it indexes series with pandas Periods
# rather than Timestamps, tags entries with item_id, and lets callers
# override the prediction length.
def generate_lstnet_dataset(
    dataset_path: Path,
    dataset_name: str,
    prediction_length: Optional[int] = None,
):
    ds_info = datasets_info[dataset_name]

    ds_metadata = metadata(
        cardinality=ds_info.num_series,
        freq=ds_info.freq if ds_info.agg_freq is None else ds_info.agg_freq,
        prediction_length=prediction_length or ds_info.prediction_length,
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(ds_metadata, f)

    time_index = pd.period_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = cast(
        pd.DataFrame,
        pd.read_csv(ds_info.url, header=None),  # type: ignore
    )
    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), (
        "expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"
    )

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training
    ts_index = cast(pd.PeriodIndex, timeseries[0].index)
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(train_ts) == ds_info.num_series
    save_to_file(dataset_path / "train" / "data.json", train_ts)

    # start date of each rolling-evaluation forecast window
    prediction_dates = [
        training_end + i * ds_info.prediction_length
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = (
                prediction_start_date + ds_info.prediction_length
            )
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations
    save_to_file(dataset_path / "test" / "data.json", test_ts)
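# With a PeriodIndex, the rolling-window arithmetic is plain integer addition
# of periods; e.g. with an hourly index and prediction_length=24, consecutive
# evaluation windows start exactly one day apart:
#
#     end = pd.Period("2014-01-01 00:00", freq="H")
#     starts = [end + i * 24 for i in range(3)]
#     # Period('2014-01-01 00:00', 'H'), Period('2014-01-02 00:00', 'H'),
#     # Period('2014-01-03 00:00', 'H')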
def generate_m3_dataset(dataset_path: Path, m3_freq: str):
    from gluonts.dataset.repository.datasets import default_dataset_path

    m3_xls_path = default_dataset_path / "M3C.xls"
    if not os.path.exists(m3_xls_path):
        raise RuntimeError(
            "The M3 data is available at"
            " https://forecasters.org/resources/time-series-data/m3-competition/"
            f" Please download the file and copy it to this location: {m3_xls_path}"
        )

    class M3Setting(NamedTuple):
        sheet_name: str
        prediction_length: int
        freq: str

    subsets = {
        "yearly": M3Setting("M3Year", 6, "12M"),
        "quarterly": M3Setting("M3Quart", 8, "3M"),
        "monthly": M3Setting("M3Month", 18, "1M"),
        "other": M3Setting("M3Other", 8, "3M"),
    }
    assert (
        m3_freq.lower() in subsets
    ), f"invalid m3_freq='{m3_freq}'. Allowed values: {subsets.keys()}"

    if m3_freq.lower() == "other":
        warnings.warn(
            "Be aware: The M3-other dataset does not have a known frequency."
            " Since gluonts needs a known frequency, we will generate the"
            " dataset with an artificial `quarterly` frequency."
        )

    subset = subsets[m3_freq.lower()]
    df = pd.read_excel(m3_xls_path, sheet_name=subset.sheet_name)

    def truncate_trailing_nan(v: np.ndarray):
        last_finite_index = np.where(np.isfinite(v))[0][-1]
        return v[: last_finite_index + 1]

    train_data = []
    test_data = []

    def normalize_category(c: str):
        return c.strip()

    df["Category"] = df["Category"].apply(normalize_category)
    categories = list(df["Category"].unique())
    cat_map = {c: i for i, c in enumerate(categories)}

    i = 0
    for _, row in df.iterrows():
        vals = row.values
        series, n, nf, category, starting_year, starting_offset = vals[:6]
        target = np.asarray(vals[6:], dtype=np.float64)
        target = truncate_trailing_nan(target)

        assert len(target) == n
        assert nf == subset.prediction_length

        mock_start = "1750-01-01 00:00:00"
        if starting_year == 0:
            assert starting_offset == 0
            starting_year = mock_start

        s = pd.Timestamp(str(starting_year), freq=subset.freq)
        offset = max(starting_offset - 1, 0)
        if offset:
            s += offset * s.freq
        start = str(s).split(" ")[0]

        cat = [i, cat_map[category]]

        d_train = to_dict(
            target_values=target[: -subset.prediction_length],
            start=start,
            cat=cat,
            item_id=series,
        )
        train_data.append(d_train)

        d_test = to_dict(
            target_values=target,
            start=start,
            cat=cat,
            item_id=series,
        )
        test_data.append(d_test)
        i += 1

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=[len(train_data), len(categories)],
                    freq=subset.freq,
                    prediction_length=subset.prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"
    save_to_file(train_file, train_data)
    save_to_file(test_file, test_data)
    check_dataset(dataset_path, len(df))
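# Usage sketch (the output path is an illustrative assumption; the function
# expects M3C.xls to already be present at default_dataset_path):
#
#     generate_m3_dataset(Path("datasets/m3_monthly"), m3_freq="monthly")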