Example #1
def get_dataset(
    dataset_name: str,
    path: Optional[Path] = None,
    regenerate: bool = False,
) -> TrainDatasets:
    """
    Get the repository dataset.
    Currently only the [Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail) is available.
    Parameters:
        dataset_name:
            name of the dataset, for instance "retail"
        regenerate:
            whether to regenerate the dataset even if a local file is present.
            If this flag is False and the file is present, the dataset will not
            be downloaded again.
        path:
            where the dataset should be saved
    Returns:
        dataset obtained by either downloading or reloading from local file.
    """
    if path is None:
        path = default_dataset_path
    dataset_path = materialize_dataset(dataset_name, path, regenerate)

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
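A minimal usage sketch for the helper above, assuming the returned TrainDatasets behaves like the GluonTS one (metadata, train and test attributes) and that "retail" is a registered dataset name, as the docstring suggests:

from pathlib import Path

# illustrative call only; "retail" and the path follow the docstring above
dataset = get_dataset("retail", path=Path("datasets"), regenerate=False)

print(dataset.metadata.freq)               # series frequency from metadata
print(dataset.metadata.prediction_length)  # forecast horizon from metadata

# each training entry is a dict with at least "start" and "target"
for entry in dataset.train:
    print(entry["start"], len(entry["target"]))
    break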
Example #2
def gluonts_model_eval_all(dataset_name, estimator):
    dataset = load_datasets(dataset_name)
    estimator = estimator(
        prediction_length=dataset.metadata.prediction_length,
        freq=dataset.metadata.freq,
        use_feat_static_cat=True,
        cardinality=[
            feat_static_cat.cardinality
            for feat_static_cat in dataset.metadata.feat_static_cat
        ],
    )

    log(f"evaluating {estimator} on {dataset}")
    predictor = estimator.train(dataset.train)
    forecast_it, ts_it = make_evaluation_predictions(dataset.test,
                                                     predictor=predictor,
                                                     num_samples=100)
    agg_metrics, item_metrics = Evaluator()(ts_it,
                                            forecast_it,
                                            num_series=len(dataset.test))

    log(agg_metrics)
    eval_dict = agg_metrics
    eval_dict["dataset"] = dataset_name
    eval_dict["estimator"] = type(estimator).__name__
    return eval_dict
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
) -> TrainDatasets:
    """
    Get a repository dataset.

    The datasets that can be obtained through this function have been used
    with different processing over time by several papers (e.g., [SFG17]_,
    [LCY+18]_, and [YRD15]_).

    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.
    path
        where the dataset should be saved
    Returns
    -------
        dataset obtained by either downloading or reloading from local file.
    """
    dataset_path = materialize_dataset(dataset_name, path, regenerate)

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
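A sketch of how gluonts_model_eval_all above might be driven. It assumes GluonTS's DeepAREstimator and Trainer (import paths differ slightly between GluonTS versions), uses functools.partial so the function can still inject prediction_length, freq and cardinality itself, and assumes the single-argument load_datasets(dataset_name) and log() helpers the example relies on are available:

from functools import partial

from gluonts.model.deepar import DeepAREstimator
from gluonts.trainer import Trainer  # gluonts.mx.trainer in newer releases

# bind only the training configuration; gluonts_model_eval_all() supplies
# prediction_length, freq, use_feat_static_cat and cardinality
estimator_factory = partial(DeepAREstimator, trainer=Trainer(epochs=5))

# dataset names are illustrative
results = [
    gluonts_model_eval_all(name, estimator_factory)
    for name in ["m4_hourly", "electricity"]
]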
Example #4
def generate_dataframes(dataset_path=None, index=None):
  # collect targets, dynamic and static features from a GluonTS dataset on disk
  from gluonts.dataset.common import load_datasets
  all_targets = []
  all_dynamic = []
  all_static = []
  TD = load_datasets(metadata=dataset_path,
                     train=dataset_path / "train",
                     test=dataset_path / "test")
  for items in TD.train:
    target = np.transpose(items["target"]).tolist()
    dynamic = np.transpose(items["feat_dynamic_real"]).tolist()
    static = np.transpose(items["feat_static_cat"]).tolist()
    all_dynamic.append(dynamic)
    all_static.append(static)
    all_targets.append(target)
  df_timeseries = pd.DataFrame(all_targets)
  del all_targets
  df_dynamic = pd.DataFrame(all_dynamic)
  del all_dynamic
  df_static = pd.DataFrame(all_static)
  del all_static
  print(df_timeseries.head(10))
  return df_timeseries, df_dynamic, df_static
Example #5
def get_dataset(dataset_name: str, regenerate: bool = False) -> TrainDatasets:
    """
    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to regenerate the dataset even if a local file is present. If this flag is False and the
        file is present, the dataset will not be downloaded again.

    Returns
    -------
        dataset obtained by either downloading or reloading from local file.
    """
    assert (
        dataset_name in dataset_recipes.keys()
    ), f"{dataset_name} is not present, please choose one from {dataset_recipes.keys()}."
    dataset_path = Path(dataset_name)

    dataset_recipe = dataset_recipes[dataset_name]

    if not os.path.exists(dataset_path) or regenerate:
        logging.info(f"downloading and processing {dataset_name}")
        dataset_recipe(dataset_path=dataset_path)
    else:
        logging.info(
            f"using dataset already processed in path {dataset_path}.")

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / 'train',
        test=dataset_path / 'test',
    )
Example #6
def get_dataset(dataset_name):
    if dataset_name == "wiki2000_nips":
        datasets_root_path = os.path.join(
            os.environ["HOME"], ".mxnet/gluon-ts/datasets"
        )
        dataset_path = os.path.join(datasets_root_path, dataset_name)

        if not os.path.exists(datasets_root_path):
            os.makedirs(datasets_root_path, exist_ok=True)
        if not os.path.exists(dataset_path):
            raise Exception(
                f"you must manually upload the wiki dataset "
                f"and place it to the following folder: {dataset_path}"
            )
        else:
            dataset = load_datasets(
                metadata=pathlib.PurePath(
                    os.path.join(dataset_path, "metadata")
                ),
                train=pathlib.PurePath(os.path.join(dataset_path, "train")),
                test=pathlib.PurePath(os.path.join(dataset_path, "test")),
            )
            if (
                dataset.metadata.freq == "1D"
            ):  # WHY IS WIKI "D" AND THIS IS "1D" ?!
                dataset.metadata.freq = "D"
            return dataset
    else:
        return get_dataset_gts(dataset_name)
Example #7
def test_pandas_to_gluonts(root):
    ##### Generate Data using functions  ############################################################################
    #from zlocal import root
    dir0 = root + "/zsample/m5_dataset/"
    gluonts_path = dir0 + "/json/"

    df_timeseries, df_dynamic, df_static, pars_data = load_datasset_m5()

    ####### Generate the Gluonts format dataset
    pandas_to_gluonts(df_timeseries,
                      df_dynamic,
                      df_static,
                      pars=pars_data,
                      path_save=gluonts_path,
                      return_df=False)

    ####### For Model definition
    cardinalities = gluonts_static_cardinalities(df_static,
                                                 submission=1,
                                                 single_pred_length=28,
                                                 submission_pred_length=10,
                                                 n_timeseries=1,
                                                 transpose=1)

    test_ds = None
    train_ds = None
    # test gluonts data
    from gluonts.dataset.common import load_datasets
    dataset_path = gluonts_path

    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path + "/train",
        test=dataset_path + "/test",
    )
Example #8
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
) -> TrainDatasets:
    """
    Parameters
    ----------
    dataset_name
        name of the dataset, for instance "m4_hourly"
    regenerate
        whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.
    path
        where the dataset should be saved
    Returns
    -------
        dataset obtained by either downloading or reloading from local file.
    """
    dataset_path = materialize_dataset(dataset_name, path, regenerate)

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
Example #9
def load_multivariate_datasets(path: Path) -> TrainDatasets:
    ds = load_datasets(path / "metadata", path / "train", path / "test")
    target_dim = ds.metadata.feat_static_cat[0].cardinality
    grouper_train = MultivariateGrouper(max_target_dim=target_dim)
    grouper_test = MultivariateGrouper(max_target_dim=target_dim)
    return TrainDatasets(
        metadata=ds.metadata,
        train=grouper_train(ds.train),
        test=grouper_test(ds.test),
    )
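A short usage sketch for load_multivariate_datasets above, assuming the directory contains the usual GluonTS layout (metadata, train/, test/); the path is purely illustrative:

from pathlib import Path

mv_dataset = load_multivariate_datasets(Path("datasets/exchange_rate_nips"))

# after grouping, each entry carries a (target_dim, num_timesteps) target array
first = next(iter(mv_dataset.train))
print(first["target"].shape)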
Example #10
def train(arguments):
    """
    Generic train method that trains a specified estimator on a specified
    dataset.
    """

    logger.info("Downloading estimator config.")
    estimator_config = Path(arguments.estimator) / "estimator.json"
    with estimator_config.open() as config_file:
        estimator = serde.load_json(config_file.read())

    logger.info("Downloading dataset.")
    if arguments.s3_dataset is None:
        # load built in dataset
        dataset = datasets.get_dataset(arguments.dataset)
    else:
        # load custom dataset
        s3_dataset_dir = Path(arguments.s3_dataset)
        dataset = common.load_datasets(
            metadata=s3_dataset_dir,
            train=s3_dataset_dir / "train",
            test=s3_dataset_dir / "test",
        )

    logger.info("Starting model training.")
    predictor = estimator.train(dataset.train)
    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=dataset.test,
        predictor=predictor,
        num_samples=int(arguments.num_samples),
    )

    logger.info("Starting model evaluation.")
    evaluator = Evaluator(quantiles=eval(arguments.quantiles))

    agg_metrics, item_metrics = evaluator(ts_it,
                                          forecast_it,
                                          num_series=len(list(dataset.test)))

    # required for metric tracking.
    for name, value in agg_metrics.items():
        logger.info(f"gluonts[metric-{name}]: {value}")

    # save the evaluation results
    metrics_output_dir = Path(arguments.output_data_dir)
    with open(metrics_output_dir / "agg_metrics.json", "w") as f:
        json.dump(agg_metrics, f)
    with open(metrics_output_dir / "item_metrics.csv", "w") as f:
        item_metrics.to_csv(f, index=False)

    # save the model
    model_output_dir = Path(arguments.model_dir)
    predictor.serialize(model_output_dir)
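train() above reads everything from an argparse-style namespace; below is a minimal sketch of a matching parser. The argument names are inferred from the attributes accessed above, and the defaults are placeholders rather than real values:

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--estimator", required=True,
                        help="directory containing estimator.json")
    parser.add_argument("--dataset", default="m4_hourly",
                        help="built-in dataset name, used when --s3-dataset is not given")
    parser.add_argument("--s3-dataset", dest="s3_dataset", default=None,
                        help="directory holding metadata, train/ and test/ of a custom dataset")
    parser.add_argument("--num-samples", dest="num_samples", type=int, default=100)
    parser.add_argument("--quantiles", default="[0.1, 0.5, 0.9]")
    parser.add_argument("--output-data-dir", dest="output_data_dir", default="output")
    parser.add_argument("--model-dir", dest="model_dir", default="model")
    return parser.parse_args()

# entry point: train(parse_args())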
Example #11
def get_dataset(
    dataset_name: str,
    path: Path = default_dataset_path,
    regenerate: bool = False,
    prediction_length: Optional[int] = None,
) -> TrainDatasets:
    """
    Get a repository dataset.

    The datasets that can be obtained through this function have been used
    with different processing over time by several papers (e.g., [SFG17]_,
    [LCY+18]_, and [YRD15]_) or are obtained through the `Monash Time Series
    Forecasting Repository <https://forecastingdata.org/>`_.

    Parameters
    ----------
    dataset_name
        Name of the dataset, for instance "m4_hourly".
    regenerate
        Whether to regenerate the dataset even if a local file is present.
        If this flag is False and the file is present, the dataset will not
        be downloaded again.
    path
        Where the dataset should be saved.
    prediction_length
        The prediction length to be used for the dataset. If None, the default
        prediction length will be used. If the dataset is already materialized,
        setting this option to a different value does not have an effect.
        Make sure to set `regenerate=True` in this case. Note that some
        datasets from the Monash Time Series Forecasting Repository do not
        actually have a default prediction length -- the default then depends
        on the frequency of the data:
        - Minutely data --> prediction length of 60 (one hour)
        - Hourly data --> prediction length of 48 (two days)
        - Daily data --> prediction length of 30 (one month)
        - Weekly data --> prediction length of 8 (two months)
        - Monthly data --> prediction length of 12 (one year)
        - Yearly data --> prediction length of 4 (four years)

    Returns
    -------
        Dataset obtained by either downloading or reloading from local file.
    """
    dataset_path = materialize_dataset(
        dataset_name, path, regenerate, prediction_length
    )

    return load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
def load_dataset(args: Namespace) -> TrainDatasets:
    """Load data from channel or fallback to named public dataset."""
    if args.s3_dataset is None:
        # load built in dataset
        logger.info("Downloading dataset %s", args.dataset)
        dataset = datasets.get_dataset(args.dataset)
    else:
        # load custom dataset
        logger.info("Loading dataset from %s", args.s3_dataset)
        s3_dataset_dir = Path(args.s3_dataset)
        dataset = load_datasets(
            metadata=s3_dataset_dir / "metadata", train=s3_dataset_dir / "train", test=s3_dataset_dir / "test",
        )
    return dataset
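A brief sketch of the regeneration behaviour documented above: a prediction_length override only takes effect when the dataset is (re)materialized, so regenerate=True is passed alongside it (the values are illustrative):

# illustrative call; 24 is an arbitrary override of the default horizon
dataset = get_dataset(
    "m4_hourly",
    regenerate=True,        # force re-materialization so the override applies
    prediction_length=24,
)
print(dataset.metadata.prediction_length)  # 24 after regeneration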
Example #13
def check_dataset(dataset_path: Path, length: int, sheet_name):
    # check that things are correct
    from gluonts.dataset.common import load_datasets

    ds = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )

    assert ds.test is not None
    assert len(list(ds.train)) == length
    assert len(list(ds.test)) == length

    assert ds.metadata.prediction_length is not None

    for ts_train, ts_test in tqdm(
        zip(ds.train, ds.test), total=length, desc="checking consistency"
    ):
        train_target = ts_train["target"]
        test_target = ts_test["target"]
        assert (
            len(train_target)
            == len(test_target) - ds.metadata.prediction_length
        )
        assert np.all(train_target == test_target[: len(train_target)])

        assert ts_train["start"] == ts_test["start"]
        start = ts_train["start"]
        regex = r"^(\d{4})-(\d{2})-(\d{2})( 00:00(:00)?)?$"
        m = re.match(regex, str(start))
        assert m
        month, day = m.group(2), m.group(3)
        if sheet_name in ["M3Quart", "Other"]:
            assert f"{month}-{day}" in [
                "03-31",
                "06-30",
                "09-30",
                "12-31",
            ], f"Invalid time stamp `{month}-{day}`"
        elif sheet_name == "M3Year":
            assert (
                f"{month}-{day}" == "12-31"
            ), f"Invalid time stamp {month}-{day}"
Example #14
def check_dataset(dataset_path: Path, length: int):
    # check that things are correct
    from gluonts.dataset.common import load_datasets

    ds = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )

    assert ds.test is not None
    assert len(list(ds.train)) == length
    assert len(list(ds.test)) == length

    assert ds.metadata.prediction_length is not None

    for ts_train, ts_test in tqdm(zip(ds.train, ds.test),
                                  total=length,
                                  desc="checking consistency"):
        train_target = ts_train["target"]
        test_target = ts_test["target"]
        assert (len(train_target) == len(test_target) -
                ds.metadata.prediction_length)
        assert np.all(train_target == test_target[:len(train_target)])
Example #15
from mxnet import gluon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
import os
from tqdm.autonotebook import tqdm
from pathlib import Path

from gluonts.dataset.common import ListDataset, load_datasets
#dataset_path=Path("time/data/gluonts_01")

data_folder = "kaggle_data/m5_dataset"
gluonts_datafolder = "gluonts_data/m5_dataset"
dataset_path = Path(gluonts_datafolder)
TD = load_datasets(metadata=dataset_path,
                   train=dataset_path / "train",
                   test=dataset_path / "test")

import pandas as pd
import json

def generate_dataframes(dataset_path=None, index=None):
  all_targets = []
  all_dynamic = []
  all_static = []
  start = []
  TD = load_datasets(metadata=dataset_path,
                     train=dataset_path / "train",
                     test=dataset_path / "test")
  all_targets = []
  all_dynamic = []
  all_static = []
  for items in TD.train:
Example #16
def gluonts_to_pandas(dataset_path=None):
    from gluonts.dataset.common import load_datasets
    all_targets = []
    all_dynamic = []
    all_static = []
    start = []
    TD = load_datasets(metadata=dataset_path,
                       train=dataset_path / "train",
                       test=dataset_path / "test")
    instance = next(iter(TD.test))
    #### load decode pars ########
    decode_pars = json.load(
        open(dataset_path / "decode.json", mode='r'),
        object_hook=lambda d:
        {int(k) if k.lstrip('-').isdigit() else k: v
         for k, v in d.items()})
    df_dynamic_labels = decode_pars["df_dynamic_labels"]
    df_dynamic_cols = decode_pars["df_dynamic_cols"]
    df_static_labels = decode_pars["df_static_labels"]
    df_static_cols = decode_pars["df_static_cols"]
    df_timeseries_cols = decode_pars["df_timeseries_cols"]
    df_timeseries_dtype = decode_pars["df_timeseries_dtype"]
    df_dynamic_dtype = decode_pars["df_dynamic_dtype"]
    df_static_dtype = decode_pars["df_static_dtype"]

    #################################################
    dynamic_features = np.transpose(instance["feat_dynamic_real"])
    for items in TD.test:
        #print(items)
        target = np.transpose(items["target"]).tolist()
        static = np.transpose(items["feat_static_cat"]).tolist()

        all_static.append(static)
        all_targets.append(target)
    del TD
    df_timeseries = pd.DataFrame(all_targets)
    del all_targets
    df_dynamic = pd.DataFrame(dynamic_features)
    df_static = pd.DataFrame(all_static)

    ################ decode  df_dynamic #############
    # map encoded labels in the dynamic features back to their original values
    if df_dynamic_labels is not None:
        for col, labels in df_dynamic_labels.items():
            for l, v in labels.items():
                df_dynamic[col] = df_dynamic[col].apply(
                    lambda x: v if x == l else x)
        # as written, -l (the negated last label code) marks missing values;
        # keeping this inside the block above guarantees `l` is defined
        for col in df_dynamic.columns:
            df_dynamic[col] = df_dynamic[col].apply(
                lambda x: np.nan if x == -l else x)
    if df_dynamic_cols is not None:
        df_dynamic.rename(columns=df_dynamic_cols, inplace=True)

    ##### decode df_timeseries########################################
    if df_timeseries_cols is not None:
        df_timeseries.rename(columns=df_timeseries_cols, inplace=True)
    del all_dynamic

    ####### decode df staic###########################################
    if df_static_labels is not None:
        for key in df_static_labels:
            d = df_static_labels[key]
            df_static[key] = df_static[key].map(d)
    if df_static_cols is not None:
        df_static.rename(columns=df_static_cols, inplace=True)

    #####################
    del all_static
    df_timeseries = df_timeseries.astype(df_timeseries_dtype)
    df_dynamic = df_dynamic.astype(df_dynamic_dtype)
    df_static = df_static.astype(df_static_dtype)

    return df_timeseries, df_dynamic, df_static
Example #17
def gluonts_to_pandas(dataset_path, data_type):
  from gluonts.dataset.common import ListDataset, load_datasets
  from copy import deepcopy
  all_targets = []
  all_dynamic = []
  all_static  = []
  all_static_Real = []
  start       = []
  TD = load_datasets(metadata=dataset_path,
                     train=dataset_path / "train",
                     test=dataset_path / "test")

  ### json iterator  Why Test only   ###############
  if data_type == "test":
    TD_current = deepcopy(TD.test)
  if data_type == "train":
    TD_current = deepcopy(TD.train)


  instance_iter = next(iter(TD_current))
  df_dynamic = None
  df_static = None
  df_static_real = None

  #### load decode pars ############################
  decode_pars = json.load(
      open(dataset_path / "decode.json", mode='r'),
      object_hook=lambda d: {int(k) if k.lstrip('-').isdigit() else k: v
                             for k, v in d.items()})
  print(decode_pars)
  df_dynamic_labels   = decode_pars["df_dynamic_labels"]
  df_dynamic_cols     = decode_pars["df_dynamic_cols"]
  df_static_labels    = decode_pars["df_static_labels"]
  df_static_cols      = decode_pars["df_static_cols"]
  df_static_real_cols = decode_pars["df_static_real_cols"]

  df_timeseries_cols  = decode_pars["df_timeseries_cols"]
  df_timeseries_dtype = decode_pars["df_timeseries_dtype"]
  df_dynamic_dtype    = decode_pars["df_dynamic_dtype"]
  df_static_dtype     = decode_pars["df_static_dtype"]
  df_static_real_dtype =decode_pars["df_static_real_dtype"]

  #################################################
  if "feat_dynamic_real" in instance_iter:
    dynamic_features = np.transpose(instance_iter["feat_dynamic_real"])
  else:
    dynamic_features = None


  for items in TD_current:
    target = np.transpose(items["target"]).tolist()
    # only collect static categorical features when the entry provides them;
    # otherwise `static` would be undefined (or stale) on this iteration
    if "feat_static_cat" in items:
      static = np.transpose(items["feat_static_cat"]).tolist()
      all_static.append(static)

    if "feat_static_real" in items:
      static_real = items["feat_static_real"]
      all_static_Real.append(static_real)

    all_targets.append(target)

  

  df_timeseries = pd.DataFrame(all_targets)
  del all_targets

  ############# decode df_static real ############################
  if len(all_static_Real) > 0:
    df_static_real = pd.DataFrame(all_static_Real)
    df_static_real = df_static_real.astype(df_static_real_dtype)

  ################ decode  df_dynamic #####
  # dynamic_features is a numpy array or None, so test it explicitly
  if dynamic_features is not None:
    df_dynamic = pd.DataFrame(dynamic_features)
    if df_dynamic_labels is not None:
      for col, labels in df_dynamic_labels.items():
        for l, v in labels.items():
          df_dynamic[col] = df_dynamic[col].apply(lambda x: v if x == l else x)
      # as written, -l (the negated last label code) marks missing values;
      # keeping this inside the block above guarantees `l` is defined
      for col in df_dynamic.columns:
        df_dynamic[col] = df_dynamic[col].apply(lambda x: np.nan if x == -l else x)
    if df_dynamic_cols is not None:
      df_dynamic.rename(columns=df_dynamic_cols, inplace=True)
    df_dynamic = df_dynamic.astype(df_dynamic_dtype)
  
  if len(all_static) > 0:
    df_static = pd.DataFrame(all_static)

  ##### decode df_timeseries #############
  if df_timeseries_cols is not None:
    df_timeseries.rename(columns=df_timeseries_cols, inplace=True)
  df_timeseries = df_timeseries.astype(df_timeseries_dtype)
  

  ####### decode df_static ################
  # df_static stays None when no static categorical features were collected
  if df_static is not None and not df_static.empty:
    if df_static_labels is not None:
      for key, d in df_static_labels.items():
        df_static[key] = df_static[key].map(d)
    if df_static_cols is not None:
      df_static.rename(columns=df_static_cols, inplace=True)
    df_static = df_static.astype(df_static_dtype)

  ########## df static real ###########

  
  
  #####################################
  return df_timeseries, df_dynamic, df_static, df_static_real
Example #18
    cardinalities = gluonts_static_cardinalities(df_static,
                                                 submission=1,
                                                 single_pred_length=28,
                                                 submission_pred_length=10,
                                                 n_timeseries=1,
                                                 transpose=1)

    test_ds = None
    train_ds = None
    # test gluonts data
    from gluonts.dataset.common import ListDataset, load_datasets
    dataset_path = gluonts_path

    TD = load_datasets(
        metadata=dataset_path,
        train=dataset_path + "/train",
        test=dataset_path + "/test",
    )
"""


a, b = gluonts_create_dynamic(df_dynamic, submission=True, single_pred_length=28,
                              submission_pred_length=10, n_timeseries=1, transpose=1)

col = "event_type_1"
df_dynamic[col].apply(lambda x: to_flag(x))


to_flag('ok')