def get_dataset(
    dataset_name: str,
    path: Optional[Path] = None,
    regenerate: bool = False,
) -> TrainDatasets:
    """
    Get the repository dataset.

    Currently only the
    [Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail)
    is available.

    Parameters:
        dataset_name: name of the dataset, for instance "retail"
        regenerate: whether to regenerate the dataset even if a local file
            is present. If this flag is False and the file is present, the
            dataset will not be downloaded again.
        path: where the dataset should be saved

    Returns:
        dataset obtained by either downloading or reloading from local file.
    """
    if path is None:
        path = default_dataset_path
    dataset_path = materialize_dataset(dataset_name, path, regenerate)

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
def gluonts_model_eval_all(dataset_name, estimator):
    dataset = load_datasets(dataset_name)
    estimator = estimator(
        prediction_length=dataset.metadata.prediction_length,
        freq=dataset.metadata.freq,
        use_feat_static_cat=True,
        cardinality=[
            feat_static_cat.cardinality
            for feat_static_cat in dataset.metadata.feat_static_cat
        ],
    )
    log(f"evaluating {estimator} on {dataset}")

    predictor = estimator.train(dataset.train)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset.test, predictor=predictor, num_samples=100
    )
    agg_metrics, item_metrics = Evaluator()(
        ts_it, forecast_it, num_series=len(dataset.test)
    )
    log(agg_metrics)

    eval_dict = agg_metrics
    eval_dict["dataset"] = dataset_name
    eval_dict["estimator"] = type(estimator).__name__
    return eval_dict
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
) -> TrainDatasets:
    """
    Get a repository dataset.

    The datasets that can be obtained through this function have been used
    with different processing over time by several papers (e.g., [SFG17]_,
    [LCY+18]_, and [YRD15]_).

    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.
    path
        where the dataset should be saved

    Returns
    -------
        dataset obtained by either downloading or reloading from local file.
    """
    dataset_path = materialize_dataset(dataset_name, path, regenerate)

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
def generate_dataframes(dataset_path=None, index=None):
    all_targets = []
    all_dynamic = []
    all_static = []

    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )

    # collect the target, dynamic and static features of every training series
    for items in TD.train:
        target = np.transpose(items["target"]).tolist()
        dynamic = np.transpose(items["feat_dynamic_real"]).tolist()
        static = np.transpose(items["feat_static_cat"]).tolist()
        all_dynamic.append(dynamic)
        all_static.append(static)
        all_targets.append(target)

    df_timeseries = pd.DataFrame(all_targets)
    del all_targets
    df_dynamic = pd.DataFrame(all_dynamic)
    del all_dynamic
    df_static = pd.DataFrame(all_static)
    del all_static

    print(df_timeseries.head(10))

    return df_timeseries, df_dynamic, df_static
def get_dataset(dataset_name: str, regenerate: bool = False) -> TrainDatasets:
    """
    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.

    Returns
    -------
        dataset obtained by either downloading or reloading from local file.
    """
    assert (
        dataset_name in dataset_recipes.keys()
    ), f"{dataset_name} is not present, please choose one from {dataset_recipes.keys()}."

    dataset_path = Path(dataset_name)
    dataset_recipe = dataset_recipes[dataset_name]

    if not os.path.exists(dataset_path) or regenerate:
        logging.info(f"downloading and processing {dataset_name}")
        dataset_recipe(dataset_path=dataset_path)
    else:
        logging.info(f"using dataset already processed in path {dataset_path}.")

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
def get_dataset(dataset_name):
    if dataset_name == "wiki2000_nips":
        datasets_root_path = os.path.join(
            os.environ["HOME"], ".mxnet/gluon-ts/datasets"
        )
        dataset_path = os.path.join(datasets_root_path, dataset_name)
        if not os.path.exists(datasets_root_path):
            os.makedirs(datasets_root_path, exist_ok=True)
        if not os.path.exists(dataset_path):
            raise Exception(
                f"you must manually upload the wiki dataset "
                f"and place it in the following folder: {dataset_path}"
            )
        else:
            dataset = load_datasets(
                metadata=pathlib.PurePath(
                    os.path.join(dataset_path, "metadata")
                ),
                train=pathlib.PurePath(os.path.join(dataset_path, "train")),
                test=pathlib.PurePath(os.path.join(dataset_path, "test")),
            )
            if dataset.metadata.freq == "1D":
                # normalize the wiki frequency "1D" to the "D" alias used elsewhere
                dataset.metadata.freq = "D"
            return dataset
    else:
        return get_dataset_gts(dataset_name)
def test_pandas_to_gluonts(root):
    ##### Generate data using helper functions ################################
    # from zlocal import root
    dir0 = root + "/zsample/m5_dataset/"
    gluonts_path = dir0 + "/json/"
    df_timeseries, df_dynamic, df_static, pars_data = load_datasset_m5()

    ####### Generate the GluonTS-format dataset
    pandas_to_gluonts(
        df_timeseries,
        df_dynamic,
        df_static,
        pars=pars_data,
        path_save=gluonts_path,
        return_df=False,
    )

    ####### For model definition
    cardinalities = gluonts_static_cardinalities(
        df_static,
        submission=1,
        single_pred_length=28,
        submission_pred_length=10,
        n_timeseries=1,
        transpose=1,
    )
    test_ds = None
    train_ds = None

    # test gluonts data
    from gluonts.dataset.common import load_datasets

    dataset_path = gluonts_path
    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path + "/train",
        test=dataset_path + "/test",
    )
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
) -> TrainDatasets:
    """
    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.
    path
        where the dataset should be saved

    Returns
    -------
        dataset obtained by either downloading or reloading from local file.
    """
    dataset_path = materialize_dataset(dataset_name, path, regenerate)

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
def load_multivariate_datasets(path: Path) -> TrainDatasets:
    ds = load_datasets(path / "metadata", path / "train", path / "test")
    target_dim = ds.metadata.feat_static_cat[0].cardinality
    grouper_train = MultivariateGrouper(max_target_dim=target_dim)
    grouper_test = MultivariateGrouper(max_target_dim=target_dim)
    return TrainDatasets(
        metadata=ds.metadata,
        train=grouper_train(ds.train),
        test=grouper_test(ds.test),
    )
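A minimal usage sketch for the loader above; the dataset path is illustrative, and the exact target dimension depends on what was materialized on disk.

# Illustrative only: placeholder path for a dataset stored in the
# metadata/train/test layout expected by load_datasets.
ds = load_multivariate_datasets(Path("datasets/exchange_rate_nips"))

# MultivariateGrouper stacks the univariate series, so each entry now holds
# a 2D target array of shape (target_dim, num_timesteps).
entry = next(iter(ds.train))
print(entry["target"].shape)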
def train(arguments):
    """
    Generic train method that trains a specified estimator on a specified
    dataset.
    """
    logger.info("Downloading estimator config.")
    estimator_config = Path(arguments.estimator) / "estimator.json"
    with estimator_config.open() as config_file:
        estimator = serde.load_json(config_file.read())

    logger.info("Downloading dataset.")
    if arguments.s3_dataset is None:
        # load built-in dataset
        dataset = datasets.get_dataset(arguments.dataset)
    else:
        # load custom dataset
        s3_dataset_dir = Path(arguments.s3_dataset)
        dataset = common.load_datasets(
            metadata=s3_dataset_dir,
            train=s3_dataset_dir / "train",
            test=s3_dataset_dir / "test",
        )

    logger.info("Starting model training.")
    predictor = estimator.train(dataset.train)
    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=dataset.test,
        predictor=predictor,
        num_samples=int(arguments.num_samples),
    )

    logger.info("Starting model evaluation.")
    evaluator = Evaluator(quantiles=eval(arguments.quantiles))

    agg_metrics, item_metrics = evaluator(
        ts_it, forecast_it, num_series=len(list(dataset.test))
    )

    # required for metric tracking.
    for name, value in agg_metrics.items():
        logger.info(f"gluonts[metric-{name}]: {value}")

    # save the evaluation results
    metrics_output_dir = Path(arguments.output_data_dir)
    with open(metrics_output_dir / "agg_metrics.json", "w") as f:
        json.dump(agg_metrics, f)
    with open(metrics_output_dir / "item_metrics.csv", "w") as f:
        item_metrics.to_csv(f, index=False)

    # save the model
    model_output_dir = Path(arguments.model_dir)
    predictor.serialize(model_output_dir)
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
    prediction_length: Optional[int] = None,
) -> TrainDatasets:
    """
    Get a repository dataset.

    The datasets that can be obtained through this function have been used
    with different processing over time by several papers (e.g., [SFG17]_,
    [LCY+18]_, and [YRD15]_) or are obtained through the
    `Monash Time Series Forecasting Repository <https://forecastingdata.org/>`_.

    Parameters
    ----------
    dataset_name
        Name of the dataset, for instance "m4_hourly".
    regenerate
        Whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.
    path
        Where the dataset should be saved.
    prediction_length
        The prediction length to be used for the dataset. If None, the
        default prediction length will be used. If the dataset is already
        materialized, setting this option to a different value does not have
        an effect. Make sure to set `regenerate=True` in this case. Note
        that some datasets from the Monash Time Series Forecasting
        Repository do not actually have a default prediction length -- the
        default then depends on the frequency of the data:

        - Minutely data --> prediction length of 60 (one hour)
        - Hourly data --> prediction length of 48 (two days)
        - Daily data --> prediction length of 30 (one month)
        - Weekly data --> prediction length of 8 (two months)
        - Monthly data --> prediction length of 12 (one year)
        - Yearly data --> prediction length of 4 (four years)

    Returns
    -------
        Dataset obtained by either downloading or reloading from local file.
    """
    dataset_path = materialize_dataset(
        dataset_name, path, regenerate, prediction_length
    )

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
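For reference, a minimal usage sketch of the loader above; "m4_hourly" is one of the built-in repository dataset names, and the printed values are only indicative.

dataset = get_dataset("m4_hourly", regenerate=False)

# metadata carries the frequency and the default prediction length
print(dataset.metadata.freq)               # e.g. "H"
print(dataset.metadata.prediction_length)  # e.g. 48

# train and test are iterables of dicts with "start" and "target" fields
first_entry = next(iter(dataset.train))
print(first_entry["start"], len(first_entry["target"]))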
def load_dataset(args: Namespace) -> TrainDatasets:
    """Load data from channel or fall back to a named public dataset."""
    if args.s3_dataset is None:
        # load built-in dataset
        logger.info("Downloading dataset %s", args.dataset)
        dataset = datasets.get_dataset(args.dataset)
    else:
        # load custom dataset
        logger.info("Loading dataset from %s", args.s3_dataset)
        s3_dataset_dir = Path(args.s3_dataset)
        dataset = load_datasets(
            metadata=s3_dataset_dir / "metadata",
            train=s3_dataset_dir / "train",
            test=s3_dataset_dir / "test",
        )
    return dataset
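A hedged example of driving the loader above with a custom data channel; the directory path is hypothetical and must contain the metadata/, train/ and test/ subdirectories that load_datasets expects.

from argparse import Namespace

# Hypothetical channel directory: metadata/ holds metadata.json, while
# train/ and test/ hold the JSON Lines data files.
args = Namespace(s3_dataset="/opt/ml/input/data/dataset", dataset=None)
dataset = load_dataset(args)
print(len(list(dataset.train)), "training series")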
def check_dataset(dataset_path: Path, length: int, sheet_name):
    # check that things are correct
    from gluonts.dataset.common import load_datasets

    ds = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
    assert ds.test is not None
    assert len(list(ds.train)) == length
    assert len(list(ds.test)) == length
    assert ds.metadata.prediction_length is not None

    for ts_train, ts_test in tqdm(
        zip(ds.train, ds.test), total=length, desc="checking consistency"
    ):
        train_target = ts_train["target"]
        test_target = ts_test["target"]
        assert (
            len(train_target)
            == len(test_target) - ds.metadata.prediction_length
        )
        assert np.all(train_target == test_target[: len(train_target)])
        assert ts_train["start"] == ts_test["start"]
        start = ts_train["start"]
        regex = r"^(\d{4})-(\d{2})-(\d{2})( 00:00(:00)?)?$"
        m = re.match(regex, str(start))
        assert m
        month, day = m.group(2), m.group(3)
        if sheet_name in ["M3Quart", "Other"]:
            assert f"{month}-{day}" in [
                "03-31",
                "06-30",
                "09-30",
                "12-31",
            ], f"Invalid time stamp `{month}-{day}`"
        elif sheet_name == "M3Year":
            assert (
                f"{month}-{day}" == "12-31"
            ), f"Invalid time stamp {month}-{day}"
def check_dataset(dataset_path: Path, length: int):
    # check that things are correct
    from gluonts.dataset.common import load_datasets

    ds = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
    assert ds.test is not None
    assert len(list(ds.train)) == length
    assert len(list(ds.test)) == length
    assert ds.metadata.prediction_length is not None

    for ts_train, ts_test in tqdm(
        zip(ds.train, ds.test), total=length, desc="checking consistency"
    ):
        train_target = ts_train["target"]
        test_target = ts_test["target"]
        assert (
            len(train_target)
            == len(test_target) - ds.metadata.prediction_length
        )
        assert np.all(train_target == test_target[: len(train_target)])
from mxnet import gluon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
import os
from tqdm.autonotebook import tqdm
from pathlib import Path
from gluonts.dataset.common import ListDataset, load_datasets

# dataset_path = Path("time/data/gluonts_01")
data_folder = "kaggle_data/m5_dataset"
gluonts_datafolder = "gluonts_data/m5_dataset"
dataset_path = Path(gluonts_datafolder)

TD = load_datasets(
    metadata=dataset_path,
    train=dataset_path / "train",
    test=dataset_path / "test",
)

import json


def generate_dataframes(dataset_path=None, index=None):
    all_targets = []
    all_dynamic = []
    all_static = []
    start = []

    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )

    for items in TD.train:
def gluonts_to_pandas(dataset_path=None):
    from gluonts.dataset.common import load_datasets

    all_targets = []
    all_dynamic = []
    all_static = []
    start = []

    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
    instance = next(iter(TD.test))

    #### load decode pars ########
    decode_pars = json.load(
        open(dataset_path / "decode.json", mode="r"),
        object_hook=lambda d: {
            int(k) if k.lstrip("-").isdigit() else k: v for k, v in d.items()
        },
    )
    df_dynamic_labels = decode_pars["df_dynamic_labels"]
    df_dynamic_cols = decode_pars["df_dynamic_cols"]
    df_static_labels = decode_pars["df_static_labels"]
    df_static_cols = decode_pars["df_static_cols"]
    df_timeseries_cols = decode_pars["df_timeseries_cols"]
    df_timeseries_dtype = decode_pars["df_timeseries_dtype"]
    df_dynamic_dtype = decode_pars["df_dynamic_dtype"]
    df_static_dtype = decode_pars["df_static_dtype"]

    #################################################
    dynamic_features = np.transpose(instance["feat_dynamic_real"])

    for items in TD.test:
        target = np.transpose(items["target"]).tolist()
        static = np.transpose(items["feat_static_cat"]).tolist()
        all_static.append(static)
        all_targets.append(target)
    del TD

    df_timeseries = pd.DataFrame(all_targets)
    del all_targets
    df_dynamic = pd.DataFrame(dynamic_features)
    df_static = pd.DataFrame(all_static)

    ################ decode df_dynamic #############
    if df_dynamic_labels is not None:
        for key in df_dynamic_labels:
            col = key
            labels = df_dynamic_labels[key]
            for l in labels:
                v = labels[l]
                df_dynamic[col] = df_dynamic[col].apply(
                    lambda x: v if x == l else x
                )
        for col in df_dynamic.columns:
            df_dynamic[col] = df_dynamic[col].apply(
                lambda x: np.NAN if x == -l else x
            )
    if df_dynamic_cols is not None:
        df_dynamic.rename(columns=df_dynamic_cols, inplace=True)

    ##### decode df_timeseries ########################################
    if df_timeseries_cols is not None:
        df_timeseries.rename(columns=df_timeseries_cols, inplace=True)
    del all_dynamic

    ####### decode df_static ##########################################
    if df_static_labels is not None:
        for key in df_static_labels:
            d = df_static_labels[key]
            df_static[key] = df_static[key].map(d)
    if df_static_cols is not None:
        df_static.rename(columns=df_static_cols, inplace=True)

    #####################
    del all_static
    df_timeseries = df_timeseries.astype(df_timeseries_dtype)
    df_dynamic = df_dynamic.astype(df_dynamic_dtype)
    df_static = df_static.astype(df_static_dtype)

    return df_timeseries, df_dynamic, df_static
def gluonts_to_pandas(dataset_path, data_type):
    from gluonts.dataset.common import ListDataset, load_datasets
    from copy import deepcopy

    all_targets = []
    all_dynamic = []
    all_static = []
    all_static_Real = []
    start = []

    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )

    ### select the json iterator (train or test) ###############
    if data_type == "test":
        TD_current = deepcopy(TD.test)
    if data_type == "train":
        TD_current = deepcopy(TD.train)

    instance_iter = next(iter(TD_current))
    df_dynamic = None
    df_static = None
    df_static_real = None

    #### load decode pars ############################
    decode_pars = json.load(
        open(dataset_path / "decode.json", mode="r"),
        object_hook=lambda d: {
            int(k) if k.lstrip("-").isdigit() else k: v for k, v in d.items()
        },
    )
    print(decode_pars)
    df_dynamic_labels = decode_pars["df_dynamic_labels"]
    df_dynamic_cols = decode_pars["df_dynamic_cols"]
    df_static_labels = decode_pars["df_static_labels"]
    df_static_cols = decode_pars["df_static_cols"]
    df_static_real_cols = decode_pars["df_static_real_cols"]
    df_timeseries_cols = decode_pars["df_timeseries_cols"]
    df_timeseries_dtype = decode_pars["df_timeseries_dtype"]
    df_dynamic_dtype = decode_pars["df_dynamic_dtype"]
    df_static_dtype = decode_pars["df_static_dtype"]
    df_static_real_dtype = decode_pars["df_static_real_dtype"]

    #################################################
    if "feat_dynamic_real" in instance_iter:
        dynamic_features = np.transpose(instance_iter["feat_dynamic_real"])
    else:
        dynamic_features = None

    for items in TD_current:
        target = np.transpose(items["target"]).tolist()
        if "feat_static_cat" in items:
            static = np.transpose(items["feat_static_cat"]).tolist()
        if "feat_static_real" in items:
            static_real = items["feat_static_real"]
            all_static_Real.append(static_real)
        all_static.append(static)
        all_targets.append(target)

    df_timeseries = pd.DataFrame(all_targets)
    del all_targets

    ############# decode df_static_real ############################
    if len(all_static_Real) > 0:
        df_static_real = pd.DataFrame(all_static_Real)
        df_static_real = df_static_real.astype(df_static_real_dtype)

    ################ decode df_dynamic #####
    if dynamic_features:
        df_dynamic = pd.DataFrame(dynamic_features)
        if df_dynamic_labels is not None:
            for key in df_dynamic_labels:
                col = key
                labels = df_dynamic_labels[key]
                for l in labels:
                    v = labels[l]
                    df_dynamic[col] = df_dynamic[col].apply(
                        lambda x: v if x == l else x
                    )
            for col in df_dynamic.columns:
                df_dynamic[col] = df_dynamic[col].apply(
                    lambda x: np.NAN if x == -l else x
                )
        if df_dynamic_cols is not None:
            df_dynamic.rename(columns=df_dynamic_cols, inplace=True)
        df_dynamic = df_dynamic.astype(df_dynamic_dtype)

    if len(all_static) > 0:
        df_static = pd.DataFrame(all_static)

    ##### decode df_timeseries #############
    if df_timeseries_cols is not None:
        df_timeseries.rename(columns=df_timeseries_cols, inplace=True)
    df_timeseries = df_timeseries.astype(df_timeseries_dtype)

    ####### decode df_static ################
    if not df_static.empty:
        if df_static_labels is not None:
            for key in df_static_labels:
                d = df_static_labels[key]
                df_static[key] = df_static[key].map(d)
        if df_static_cols is not None:
            df_static.rename(columns=df_static_cols, inplace=True)
        df_static = df_static.astype(df_static_dtype)

    ########## df_static_real ###########
    #####################################
    return df_timeseries, df_dynamic, df_static, df_static_real
cardinalities = gluonts_static_cardinalities(
    df_static,
    submission=1,
    single_pred_length=28,
    submission_pred_length=10,
    n_timeseries=1,
    transpose=1,
)
test_ds = None
train_ds = None

# test gluonts data
from gluonts.dataset.common import ListDataset, load_datasets

dataset_path = gluonts_path
TD = load_datasets(
    metadata=dataset_path,
    train=dataset_path + "/train",
    test=dataset_path + "/test",
)

"""
a, b = gluonts_create_dynamic(
    df_dynamic,
    submission=True,
    single_pred_length=28,
    submission_pred_length=10,
    n_timeseries=1,
    transpose=1,
)

col = "event_type_1"
df_dynamic[col].apply(lambda x: to_flag(x))
to_flag('ok')
"""