Example #1
def test_metadata_1():
    assert _util.metadata(
        freq="1H", prediction_length=20, cardinality=[10, 3]
    ) == {
        "freq": "1H",
        "prediction_length": 20,
        "feat_static_cat": [
            {"name": "feat_static_cat_0", "cardinality": "10"},
            {"name": "feat_static_cat_1", "cardinality": "3"},
        ],
    }
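For reference, the following is a minimal sketch of a metadata() helper that satisfies the assertion above. It also accepts a scalar cardinality, since several of the later examples pass a single integer; the actual gluonts.dataset.repository._util.metadata may differ in signature and behavior.

from typing import List, Union


def metadata(
    freq: str,
    prediction_length: int,
    cardinality: Union[int, List[int]],
) -> dict:
    # Normalize a scalar cardinality to a one-element list (assumption: the
    # real helper accepts both forms, as suggested by the later call sites).
    cardinalities = (
        list(cardinality) if isinstance(cardinality, (list, tuple))
        else [cardinality]
    )
    return {
        "freq": freq,
        "prediction_length": prediction_length,
        "feat_static_cat": [
            {"name": f"feat_static_cat_{i}", "cardinality": str(card)}
            for i, card in enumerate(cardinalities)
        ],
    }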
Example #2
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=ds_info.num_series,
                    freq=ds_info.freq,
                    prediction_length=ds_info.prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), f"expected num_time_steps/num_series {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"

    timeseries = load_from_pandas(df=df,
                                  time_index=time_index,
                                  agg_freq=ds_info.agg_freq)

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                ))

    assert len(train_ts) == ds_info.num_series

    save_to_file(train_file, train_ts)

    # start dates of the rolling prediction windows
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = frequency_add(prediction_start_date,
                                                ds_info.prediction_length)
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                ))

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(test_file, test_ts)
Example #3
def generate_lstnet_dataset(
    dataset_path: Path,
    dataset_name: str,
    prediction_length: Optional[int] = None,
):
    ds_info = datasets_info[dataset_name]

    ds_metadata = metadata(
        cardinality=ds_info.num_series,
        freq=ds_info.freq if ds_info.agg_freq is None else ds_info.agg_freq,
        prediction_length=prediction_length or ds_info.prediction_length,
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(ds_metadata, f)

    time_index = pd.period_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = cast(
        pd.DataFrame,
        pd.read_csv(ds_info.url, header=None),  # type: ignore
    )

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), ("expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}")

    timeseries = load_from_pandas(df=df,
                                  time_index=time_index,
                                  agg_freq=ds_info.agg_freq)

    # the last date seen during training
    ts_index = cast(pd.PeriodIndex, timeseries[0].index)
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                ))

    assert len(train_ts) == ds_info.num_series

    save_to_file(dataset_path / "train" / "data.json", train_ts)

    # start dates of the rolling prediction windows
    prediction_dates = [
        training_end + i * ds_info.prediction_length
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = (prediction_start_date +
                                   ds_info.prediction_length)
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                ))

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(dataset_path / "test" / "data.json", test_ts)
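For context, a call along the following lines would materialize one of the LSTNet datasets on disk; the target directory and the dataset name "electricity" are illustrative and must correspond to a key of datasets_info.

from pathlib import Path

# Hypothetical invocation; "electricity" is assumed to be a key of datasets_info.
generate_lstnet_dataset(
    dataset_path=Path("datasets") / "electricity",
    dataset_name="electricity",
    prediction_length=24,
)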
Example #4
def generate_m5_dataset(
    dataset_path: Path,
    pandas_freq: str,
    prediction_length: int,
    m5_file_path: Path,
):
    cal_path = f"{m5_file_path}/calendar.csv"
    sales_path = f"{m5_file_path}/sales_train_validation.csv"

    if not os.path.exists(cal_path) or not os.path.exists(sales_path):
        raise RuntimeError(
            "M5 data is available on Kaggle"
            " (https://www.kaggle.com/c/m5-forecasting-accuracy/data). You"
            " first need to agree to the terms of the competition before"
            " being able to download the data. After you have done that,"
            f" please supply the files at {m5_file_path}.")

    # Prepare directory
    dataset_path.mkdir(exist_ok=True)

    # Read M5 data from dataset_path
    calendar = pd.read_csv(cal_path)
    sales_train_validation = pd.read_csv(sales_path)
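    # The M5 calendar extends two forecast horizons (public validation plus
    # final evaluation) past the last day in sales_train_validation, hence
    # the factor of 2 below.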
    submission_prediction_length = prediction_length * 2

    # Build dynamic features
    cal_features = calendar.drop(
        [
            "date",
            "wm_yr_wk",
            "weekday",
            "wday",
            "month",
            "year",
            "event_name_1",
            "event_name_2",
            "d",
        ],
        axis=1,
    )
    cal_features["event_type_1"] = cal_features["event_type_1"].apply(
        lambda x: 0 if str(x) == "nan" else 1)
    cal_features["event_type_2"] = cal_features["event_type_2"].apply(
        lambda x: 0 if str(x) == "nan" else 1)
    test_cal_features = cal_features.values.T
    train_cal_features = test_cal_features[:, :-submission_prediction_length -
                                           prediction_length]
    test_cal_features = test_cal_features[:, :-submission_prediction_length]

    test_cal_features_list = [test_cal_features] * len(sales_train_validation)
    train_cal_features_list = (
        [train_cal_features] * len(sales_train_validation))

    # Build static features
    state_ids = (
        sales_train_validation["state_id"].astype("category").cat.codes.values)
    state_ids_un = np.unique(state_ids)
    store_ids = (
        sales_train_validation["store_id"].astype("category").cat.codes.values)
    store_ids_un = np.unique(store_ids)
    cat_ids = (
        sales_train_validation["cat_id"].astype("category").cat.codes.values)
    cat_ids_un = np.unique(cat_ids)
    dept_ids = (
        sales_train_validation["dept_id"].astype("category").cat.codes.values)
    dept_ids_un = np.unique(dept_ids)
    item_ids = (
        sales_train_validation["item_id"].astype("category").cat.codes.values)
    item_ids_un = np.unique(item_ids)
    stat_cat_list = [item_ids, dept_ids, cat_ids, store_ids, state_ids]
    stat_cat = np.concatenate(stat_cat_list)
    stat_cat = stat_cat.reshape(len(stat_cat_list), len(item_ids)).T
    cardinalities = [
        len(item_ids_un),
        len(dept_ids_un),
        len(cat_ids_un),
        len(store_ids_un),
        len(state_ids_un),
    ]

    # Build target series
    train_ids = sales_train_validation["id"]
    train_df = sales_train_validation.drop(
        ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"], axis=1)
    test_target_values = train_df.values.copy()
    train_target_values = [ts[:-prediction_length] for ts in train_df.values]
    dates = ["2011-01-29 00:00:00" for _ in range(len(sales_train_validation))]

    # Create metadata file
    meta_file = dataset_path / "metadata.json"
    with open(meta_file, "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=cardinalities,
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )))

    # Build training set
    train_file = dataset_path / "train" / "data.json"
    train_ds = [{
        FieldName.TARGET: target.tolist(),
        FieldName.START: start,
        FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        FieldName.FEAT_STATIC_CAT: fsc.tolist(),
        FieldName.ITEM_ID: id,
    } for (target, start, fdr, fsc, id) in zip(
        train_target_values,
        dates,
        train_cal_features_list,
        stat_cat,
        train_ids,
    )]
    save_to_file(train_file, train_ds)

    # Build testing set
    test_file = dataset_path / "test" / "data.json"
    test_ds = [{
        FieldName.TARGET: target.tolist(),
        FieldName.START: start,
        FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        FieldName.FEAT_STATIC_CAT: fsc.tolist(),
        FieldName.ITEM_ID: id,
    } for (target, start, fdr, fsc, id) in zip(
        test_target_values,
        dates,
        test_cal_features_list,
        stat_cat,
        train_ids,
    )]
    save_to_file(test_file, test_ds)
Example #5
def generate_retail_dataset(dataset_path: Path, split: str = "2011-11-24"):
    retail_dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
    df = pd.read_excel(retail_dataset_url)
    combination = ["StockCode", "Country"]
    df = _preprocess_retail_data(df, combination)
    df.to_pickle("tmp/temp.pkl")
    # df = pd.read_pickle("temp.pkl")
    idx = pd.IndexSlice[:, :, :split]
    train_df = df.loc[idx, :].reset_index()
    idx = pd.IndexSlice[:, :, split:]
    test_df = df.loc[idx, :].reset_index()
    full_df = df.reset_index()
    single_prediction_length = len(test_df["InvoiceDate"].unique())
    feat_static_cat = combination
    feat_dynamic_real = ["UnitPrice"]
    target = "Quantity"
    date_col = "InvoiceDate"

    os.makedirs(dataset_path, exist_ok=True)

    uniq_combs = train_df[combination].drop_duplicates().apply(tuple, axis=1)
    item_id_l = []
    dynamic_real_train_l = []
    dynamic_real_test_l = []
    stat_cat_l = []
    start_l = []
    train_target_l = []
    test_target_l = []
    for stock_code, country in tqdm(uniq_combs):
        df = train_df[
            (train_df.StockCode == stock_code) & (train_df.Country == country)
        ]
        _df = full_df[(full_df.StockCode == stock_code) & (full_df.Country == country)]
        train_ts = _df[target].values.ravel()
        if (train_ts > 0).sum() > (single_prediction_length + 13):
            test_feat_dyn_array = _df.loc[:, feat_dynamic_real].values.T
            train_feat_dyn_array = test_feat_dyn_array[:, :-single_prediction_length]

            test_ts = train_ts.copy()
            train_ts = train_ts[:-single_prediction_length]

            dynamic_real_train_l.append(train_feat_dyn_array)
            dynamic_real_test_l.append(test_feat_dyn_array)
            start_l.append(df[date_col].min())
            train_target_l.append(train_ts)
            test_target_l.append(test_ts)
            # record which combinations passed the filter, so item_ids stay
            # aligned with the collected targets below
            item_id_l.append((stock_code, country))
            stat_cat_l.append(
                np.squeeze(df.loc[:, feat_static_cat].drop_duplicates().values)
            )
    stat_cat_cardinalities = [
        len(full_df[col].unique()) for col in feat_static_cat
    ]

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=stat_cat_cardinalities,
                    freq="1D",
                    prediction_length=single_prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"
    train_ds = [
        {
            FieldName.ITEM_ID: "|".join(map(str,uniq_comb)),
            FieldName.TARGET: target.tolist(),
            FieldName.START: str(start),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        }
        for uniq_comb, target, start, fdr, fsc in zip(
            item_id_l, train_target_l, start_l, dynamic_real_train_l, stat_cat_l,
        )
    ]
    save_to_file(train_file, train_ds)
    test_ds = [
        {
            FieldName.ITEM_ID: "|".join(map(str,uniq_comb)),
            FieldName.TARGET: target.tolist(),
            FieldName.START: str(start),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        }
        for uniq_comb, target, start, fdr, fsc in zip(
            item_id_l, test_target_l, start_l, dynamic_real_test_l, stat_cat_l,
        )
    ]
    save_to_file(test_file, test_ds)
Example #6
def generate_m3_dataset(dataset_path: Path, m3_freq: str):
    from gluonts.dataset.repository.datasets import default_dataset_path

    m3_xls_path = default_dataset_path / "M3C.xls"
    if not os.path.exists(m3_xls_path):
        raise RuntimeError(
            "The M3 data is available at"
            " https://forecasters.org/resources/time-series-data/m3-competition/"
            f" Please download M3C.xls and copy it to this location: {m3_xls_path}"
        )

    class M3Setting(NamedTuple):
        sheet_name: str
        prediction_length: int
        freq: str

    subsets = {
        "yearly": M3Setting("M3Year", 6, "12M"),
        "quarterly": M3Setting("M3Quart", 8, "3M"),
        "monthly": M3Setting("M3Month", 18, "1M"),
        "other": M3Setting("M3Other", 8, "3M"),
    }
    assert (
        m3_freq.lower() in subsets
    ), f"invalid m3_freq='{m3_freq}'. Allowed values: {list(subsets)}"

    if m3_freq.lower() == "other":
        warnings.warn(
            "Be aware: The M3-other dataset does not have a known frequency. Since gluonts needs a known frequency, "
            "we will generate the dataset with an artificial `quarterly` frequency."
        )

    subset = subsets[m3_freq.lower()]
    df = pd.read_excel(m3_xls_path, sheet_name=subset.sheet_name)

    def truncate_trailing_nan(v: np.ndarray):
        last_finite_index = np.where(np.isfinite(v))[0][-1]
        return v[:last_finite_index + 1]

    train_data = []
    test_data = []

    def normalize_category(c: str):
        return c.strip()

    df["Category"] = df["Category"].apply(normalize_category)
    categories = list(df["Category"].unique())

    cat_map = {c: i for i, c in enumerate(categories)}

    for i, (_, row) in enumerate(df.iterrows()):
        vals = row.values
        series, n, nf, category, starting_year, starting_offset = vals[:6]
        target = np.asarray(vals[6:], dtype=np.float64)
        target = truncate_trailing_nan(target)
        assert len(target) == n
        assert nf == subset.prediction_length
        mock_start = "1750-01-01 00:00:00"
        if starting_year == 0:
            assert starting_offset == 0
            starting_year = mock_start
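        # Note: constructing a Timestamp with freq and stepping by s.freq
        # relies on older pandas behavior; newer pandas would use pd.Period
        # for this kind of arithmetic instead.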
        s = pd.Timestamp(str(starting_year), freq=subset.freq)
        offset = max(starting_offset - 1, 0)
        if offset:
            s += offset * s.freq
        start = str(s).split(" ")[0]

        cat = [i, cat_map[category]]

        d_train = to_dict(
            target_values=target[:-subset.prediction_length],
            start=start,
            cat=cat,
            item_id=series,
        )
        train_data.append(d_train)

        d_test = to_dict(target_values=target,
                         start=start,
                         cat=cat,
                         item_id=series)
        test_data.append(d_test)

    os.makedirs(dataset_path, exist_ok=True)
    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=[len(train_data),
                                 len(categories)],
                    freq=subset.freq,
                    prediction_length=subset.prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    save_to_file(train_file, train_data)
    save_to_file(test_file, test_data)

    check_dataset(dataset_path, len(df))
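As a usage illustration (the path is hypothetical, and M3C.xls must already be present at default_dataset_path as the error message above describes), the monthly subset could be built like this:

from pathlib import Path

# Hypothetical invocation; builds the monthly M3 subset (18-step horizon, "1M" freq).
generate_m3_dataset(Path("datasets") / "m3_monthly", m3_freq="monthly")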
Example #7
def generate_m4_dataset(dataset_path: Path, m4_freq: str, pandas_freq: str,
                        prediction_length: int):
    m4_dataset_url = (
        "https://github.com/M4Competition/M4-methods/raw/master/Dataset")
    train_df = pd.read_csv(f"{m4_dataset_url}/Train/{m4_freq}-train.csv",
                           index_col=0)
    test_df = pd.read_csv(f"{m4_dataset_url}/Test/{m4_freq}-test.csv",
                          index_col=0)

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(train_df),
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    train_target_values = [ts[~np.isnan(ts)] for ts in train_df.values]

    test_target_values = [
        np.hstack([train_ts, test_ts])
        for train_ts, test_ts in zip(train_target_values, test_df.values)
    ]

    if m4_freq == "Yearly":
        # Some time series span more than 300 years, which cannot be
        # represented in pandas; this is probably due to a misclassification
        # of those series as Yearly. For this reason we keep only series
        # with at most 300 observations.
        train_target_values = [
            ts for ts in train_target_values if len(ts) <= 300
        ]
        test_target_values = [
            ts for ts in test_target_values if len(ts) <= 300
        ]

    # The original dataset does not include timestamps, so we use an
    # arbitrary early date (1750-01-01) as the start of every series.
    mock_start_dataset = "1750-01-01 00:00:00"

    save_to_file(
        train_file,
        [
            to_dict(
                target_values=target,
                start=mock_start_dataset,
                cat=[cat],
                item_id=cat,
            ) for cat, target in enumerate(train_target_values)
        ],
    )

    save_to_file(
        test_file,
        [
            to_dict(
                target_values=target,
                start=mock_start_dataset,
                cat=[cat],
                item_id=cat,
            ) for cat, target in enumerate(test_target_values)
        ],
    )
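For reference, a call such as the following would build the hourly M4 subset; the path is illustrative, and the 48-step horizon matches the M4 hourly setting.

from pathlib import Path

# Hypothetical invocation; m4_freq must match the CSV file names in the M4 repository.
generate_m4_dataset(
    dataset_path=Path("datasets") / "m4_hourly",
    m4_freq="Hourly",
    pandas_freq="1H",
    prediction_length=48,
)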
Example #8
def generate_uber_dataset(dataset_path: Path, uber_freq: str,
                          prediction_length: int):
    subsets = {"daily": "1D", "hourly": "1H"}
    assert (
        uber_freq.lower() in subsets
    ), f"invalid uber_freq='{uber_freq}'. Allowed values: {subsets.keys()}"
    freq_setting = subsets[uber_freq.lower()]

    # download the dataset and read the data
    with tempfile.TemporaryDirectory() as dir_path:
        temp_dir_path = Path(dir_path)
        temp_zip_path = temp_dir_path / "uber-dataset.zip"
        uber_url_path = ("http://raw.githubusercontent.com/fivethirtyeight/"
                         "uber-tlc-foil-response/master/uber-trip-data/"
                         "uber-raw-data-janjune-15.csv.zip")
        request.urlretrieve(uber_url_path, temp_zip_path)
        with zipfile.ZipFile(temp_zip_path) as zf:
            zf.extractall(path=temp_dir_path)
        uber_file_path = temp_dir_path / "uber-raw-data-janjune-15.csv"
        uber_df = pd.read_csv(
            uber_file_path,
            header=0,
            usecols=["Pickup_date", "locationID"],
            index_col=0,
        )

    # We split the raw data by locationID. Each JSON line represents the
    # time series of one locationID; the target is the number of pickup
    # events per day or per hour.
    time_series_of_locations = list(uber_df.groupby(by="locationID"))

    dataset_path.mkdir(exist_ok=True)
    train_path = dataset_path / "train"
    test_path = dataset_path / "test"
    train_path.mkdir(exist_ok=True)
    test_path.mkdir(exist_ok=True)

    train_file = train_path / "data.json"
    test_file = test_path / "data.json"
    with open(train_file, "w") as o_train, open(test_file, "w") as o_test:
        for locationID, df in time_series_of_locations:
            df = df.sort_index()
            df.index = pd.to_datetime(df.index)

            count_series = df.resample(rule=freq_setting).size()
            start_time = pd.Timestamp(df.index[0]).strftime("%Y-%m-%d %X")
            target = count_series.values.tolist()
            feat_static_cat = [locationID]
            format_dict = {
                "start": start_time,
                "target": target,
                "feat_static_cat": feat_static_cat,
            }
            test_json_line = json.dumps(format_dict)
            o_test.write(test_json_line)
            o_test.write("\n")
            format_dict["target"] = format_dict["target"][:-prediction_length]
            train_json_line = json.dumps(format_dict)
            o_train.write(train_json_line)
            o_train.write("\n")

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(time_series_of_locations),
                    freq=freq_setting[1],
                    prediction_length=prediction_length,
                )))