Example #1
    def __init__(self,
                 df,
                 lookback,
                 horizon,
                 feature_col,
                 target_col,
                 id_col=None):
        """
        A customized TorchDataset for rolling a dataframe in time series applications.

        :param df: The dataframe to roll on. The dataframe may contain a single id value or
            multiple id values. If the dataframe contains multiple ids, the rows of the same id
            should be consecutive, and the dataframe should be ordered by timestamp within each id.
        :param lookback: the length of the past sequence
        :param horizon: int or list,
           if `horizon` is an int, we will sample `horizon` steps
           continuously after the forecasting point.
           if `horizon` is a list, we will sample discretely according
           to the input list. 1 means the timestamp just after the observed data.
        :param feature_col: list, indicates the feature col names.
        :param target_col: list, indicates the target col names.
        :param id_col: (optional) a str indicates the col name of the dataframe id.

        :return:

        """
        df.reset_index(drop=True, inplace=True)
        feature_col = _to_list(feature_col, "feature_col")
        target_col = _to_list(target_col, "target_col")
        _check_cols_no_na(df, col_names=target_col + feature_col)
        # select target + feature columns; a single column name is passed as a
        # scalar, and the resulting 1-D array is expanded back to 2-D below
        cols = target_col + feature_col
        cols = cols[0] if len(cols) == 1 else cols
        self.arr = df.loc[:, cols].to_numpy()
        self.arr = np.expand_dims(self.arr,
                                  axis=1) if self.arr.ndim == 1 else self.arr
        max_horizon = horizon if isinstance(horizon, int) else max(horizon)
        window_size = lookback + max_horizon
        self.roll_start_idxes = get_roll_start_idx(df,
                                                   id_col,
                                                   window_size=window_size)
        self.lookback = lookback
        self.horizon = horizon
        self.target_num = len(target_col)
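
The indexing above can be summarized in a small, self-contained sketch (illustrative only, not part of the source): for each valid start index i, a sample's x covers [i, i + lookback) and its y covers [i + lookback, i + lookback + horizon) for an int horizon.

    import numpy as np

    def toy_roll(arr, lookback, horizon):
        # roll a (time, feature) array into (sample, window, feature) pairs
        window = lookback + horizon
        n_samples = len(arr) - window + 1
        x = np.stack([arr[i:i + lookback] for i in range(n_samples)])
        y = np.stack([arr[i + lookback:i + window] for i in range(n_samples)])
        return x, y

    arr = np.arange(10, dtype=np.float32).reshape(-1, 1)  # 10 steps, 1 column
    x, y = toy_roll(arr, lookback=3, horizon=2)
    print(x.shape, y.shape)  # (6, 3, 1) (6, 2, 1)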
Example #2
    def roll(self,
             lookback,
             horizon,
             feature_col=None,
             target_col=None,
             id_sensitive=False):
        '''
        Sampling by rolling for machine learning/deep learning models.

        :param lookback: int, lookback value.
        :param horizon: int or list,
               if `horizon` is an int, we will sample `horizon` steps
               continuously after the forecasting point.
               if `horizon` is a list, we will sample discretely according
               to the input list.
               In particular, when `horizon` is set to 0, ground truth will be generated as None.
        :param feature_col: str or list, indicates the feature col names. Defaults to None,
               in which case all available features are taken in rolling.
        :param target_col: str or list, indicates the target col names. Defaults to None,
               in which case all targets are taken in rolling. It should be a subset of the
               target_col you used to initialize the xshardtsdataset.
        :param id_sensitive: bool,
               |if `id_sensitive` is False, we will roll on each id's sub dataframe
               |and fuse the samplings.
               |The shape of rolling will be
               |x: (num_sample, lookback, num_feature_col + num_target_col)
               |y: (num_sample, horizon, num_target_col)
               |where num_sample is the sum of the sample numbers of each id's sub dataframe.
               |`id_sensitive=True` has not been implemented yet.

        :return: the xshardtsdataset instance.
        '''
        if id_sensitive:
            raise NotImplementedError(
                "id_sensitive option has not been implemented.")
        feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
            else self.feature_col
        target_col = _to_list(target_col, "target_col") if target_col is not None \
            else self.target_col
        self.numpy_shards = self.shards.transform_shard(
            roll_timeseries_dataframe, None, lookback, horizon, feature_col,
            target_col)
        return self
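
A hedged usage sketch (the csv path is illustrative; per the docstring, roll() stores the rolled result internally and returns the same instance):

    from zoo.orca.data.pandas import read_csv

    shards = read_csv("path/to/data.csv")  # illustrative path
    tsdata = XShardsTSDataset.from_xshards(shards, dt_col="datetime",
                                           target_col="value", id_col="id")
    tsdata.roll(lookback=24, horizon=1)  # look back 24 steps, predict 1 step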
Example #3
    def from_pandas(df,
                    dt_col,
                    target_col,
                    id_col=None,
                    extra_feature_col=None,
                    with_split=False,
                    val_ratio=0,
                    test_ratio=0.1,
                    largest_look_back=0,
                    largest_horizon=1):
        '''
        Initialize tsdataset(s) from a pandas dataframe.

        :param df: a pandas dataframe for your raw time series data.
        :param dt_col: a str indicates the col name of datetime
               column in the input data frame.
        :param target_col: a str or list indicates the col name of target column
               in the input data frame.
        :param id_col: (optional) a str indicates the col name of dataframe id. If
               it is not explicitly stated, then the data is interpreted as only
               containing a single id.
        :param extra_feature_col: (optional) a str or list indicates the col name
               of extra feature columns that are needed to predict the target column.
        :param with_split: (optional) bool, states if we need to split the dataframe
               into train, validation and test sets. The value defaults to False.
        :param val_ratio: (optional) float, validation ratio. Only effective when
               with_split is set to True. The value defaults to 0.
        :param test_ratio: (optional) float, test ratio. Only effective when with_split
               is set to True. The value defaults to 0.1.
        :param largest_look_back: (optional) int, the largest length to look back.
               Only effective when with_split is set to True. The value defaults to 0.
        :param largest_horizon: (optional) int, the largest num of steps to look
               forward. Only effective when with_split is set to True. The value defaults
               to 1.

        :return: a TSDataset instance when with_split is set to False,
                 three TSDataset instances when with_split is set to True.

        Create a tsdataset instance by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
        >>>                                   target_col="value", id_col="id",
        >>>                                   extra_feature_col=["extra feature 1",
        >>>                                                      "extra feature 2"])
        '''

        _check_type(df, "df", pd.DataFrame)

        tsdataset_df = df.copy(deep=True)
        target_col = _to_list(target_col, name="target_col")
        feature_col = _to_list(extra_feature_col, name="extra_feature_col")

        if id_col is None:
            tsdataset_df[_DEFAULT_ID_COL_NAME] = _DEFAULT_ID_PLACEHOLDER
            id_col = _DEFAULT_ID_COL_NAME

        if with_split:
            tsdataset_dfs = split_timeseries_dataframe(
                df=tsdataset_df,
                id_col=id_col,
                val_ratio=val_ratio,
                test_ratio=test_ratio,
                look_back=largest_look_back,
                horizon=largest_horizon)
            return [
                TSDataset(data=tsdataset_dfs[i],
                          id_col=id_col,
                          dt_col=dt_col,
                          target_col=target_col,
                          feature_col=feature_col) for i in range(3)
            ]

        return TSDataset(data=tsdataset_df,
                         id_col=id_col,
                         dt_col=dt_col,
                         target_col=target_col,
                         feature_col=feature_col)
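
When with_split is True, three instances come back, presumably in train/validation/test order; a hedged sketch (ratios and column names are illustrative):

    train_ts, val_ts, test_ts = TSDataset.from_pandas(df,
                                                      dt_col="datetime",
                                                      target_col="value",
                                                      id_col="id",
                                                      with_split=True,
                                                      val_ratio=0.1,
                                                      test_ratio=0.1)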
Example #4
    def to_torch_data_loader(
        self,
        batch_size=32,
        roll=False,
        lookback=None,
        horizon=None,
        feature_col=None,
        target_col=None,
    ):
        """
        Convert TSDataset to a PyTorch DataLoader with or without rolling. We recommend using
        to_torch_data_loader(roll=True) if you don't need to output the rolled numpy array. It is
        much more efficient than rolling separately, especially when the dataframe or lookback
        is large.

        :param batch_size: int, the batch_size for a PyTorch DataLoader. It defaults to 32.
        :param roll: Boolean. Whether to roll the dataframe before converting to DataLoader.
               If True, you must also specify lookback and horizon for rolling. If False, you must
               have called tsdataset.roll() before calling to_torch_data_loader(). Defaults to False.
        :param lookback: int, lookback value.
        :param horizon: int or list,
               if `horizon` is an int, we will sample `horizon` steps
               continuously after the forecasting point.
               if `horizon` is a list, we will sample discretely according
               to the input list.
               In particular, when `horizon` is set to 0, ground truth will be generated as None.
        :param feature_col: str or list, indicates the feature col names. Defaults to None,
               in which case all available features are taken in rolling.
        :param target_col: str or list, indicates the target col names. Defaults to None,
               in which case all targets are taken in rolling. It should be a subset of the
               target_col you used to initialize the tsdataset.

        :return: A PyTorch DataLoader instance.

        to_torch_data_loader() can be called by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
        >>>                                   target_col="value", id_col="id",
        >>>                                   extra_feature_col=["extra feature 1",
        >>>                                                      "extra feature 2"])
        >>> horizon, lookback = 1, 1
        >>> data_loader = tsdataset.to_torch_data_loader(batch_size=32,
        >>>                                              roll=True,
        >>>                                              lookback=lookback,
        >>>                                              horizon=horizon)
        >>> # or roll outside. That might be less efficient than the way above.
        >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=False)
        >>> x, y = tsdataset.to_numpy()
        >>> print(x, y) # x = [[[1.9, 1, 2 ]], [[2.3, 0, 9 ]]] y = [[[ 2.4 ]], [[ 2.6 ]]]
        >>> data_loader = tsdataset.to_torch_data_loader(batch_size=32)

        """
        from torch.utils.data import TensorDataset, DataLoader
        import torch
        if roll:
            if lookback is None:
                raise ValueError("You must input lookback if roll is True")
            if horizon is None:
                raise ValueError("You must input horizon if roll is True")
            from zoo.chronos.data.utils.roll_dataset import RollDataset
            feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
                else self.feature_col
            target_col = _to_list(target_col, "target_col") if target_col is not None \
                else self.target_col

            # set scaler index for unscale_numpy
            self.scaler_index = [self.target_col.index(t) for t in target_col]

            torch_dataset = RollDataset(self.df,
                                        lookback=lookback,
                                        horizon=horizon,
                                        feature_col=feature_col,
                                        target_col=target_col,
                                        id_col=self.id_col)
            return DataLoader(torch_dataset,
                              batch_size=batch_size,
                              shuffle=True)
        else:
            if self.numpy_x is None:
                raise RuntimeError(
                    "Please call \"roll\" method before transforming a TSDataset to "
                    "torch DataLoader without rolling (default roll=False)!")
            x, y = self.to_numpy()
            return DataLoader(TensorDataset(
                torch.from_numpy(x).float(),
                torch.from_numpy(y).float()),
                              batch_size=batch_size,
                              shuffle=True)
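
A hedged sketch of consuming the returned DataLoader (batch shapes follow the rolling shapes documented in roll(); tsdataset is assumed to be built as in the docstring example):

    data_loader = tsdataset.to_torch_data_loader(batch_size=32, roll=True,
                                                 lookback=1, horizon=1)
    for x_batch, y_batch in data_loader:
        # x_batch: float32 tensor, (batch, lookback, num_target_col + num_feature_col)
        # y_batch: float32 tensor, (batch, horizon, num_target_col)
        print(x_batch.shape, y_batch.shape)
        break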
Example #5
    def roll(self,
             lookback,
             horizon,
             feature_col=None,
             target_col=None,
             id_sensitive=False):
        '''
        Sampling by rolling for machine learning/deep learning models.

        :param lookback: int, lookback value.
        :param horizon: int or list,
               if `horizon` is an int, we will sample `horizon` steps
               continuously after the forecasting point.
               if `horizon` is a list, we will sample discretely according
               to the input list.
               In particular, when `horizon` is set to 0, ground truth will be generated as None.
        :param feature_col: str or list, indicates the feature col names. Defaults to None,
               in which case all available features are taken in rolling.
        :param target_col: str or list, indicates the target col names. Defaults to None,
               in which case all targets are taken in rolling. It should be a subset of the
               target_col you used to initialize the tsdataset.
        :param id_sensitive: bool,
               if `id_sensitive` is False, we will roll on each id's sub dataframe
               and fuse the samplings.
               The shape of rolling will be
               x: (num_sample, lookback, num_feature_col + num_target_col)
               y: (num_sample, horizon, num_target_col)
               where num_sample is the sum of the sample numbers of each id's sub dataframe.

               if `id_sensitive` is True, we will roll on the wide dataframe whose
               columns are the cartesian product of id_col and feature_col.
               The shape of rolling will be
               x: (num_sample, lookback, new_num_feature_col + new_num_target_col)
               y: (num_sample, horizon, new_num_target_col)
               where num_sample is the sample number of the wide dataframe,
               new_num_feature_col is the product of the number of ids and the number of feature cols,
               and new_num_target_col is the product of the number of ids and the number of target cols.

        :return: the tsdataset instance.

        roll() can be called by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
        >>>                                   target_col="value", id_col="id",
        >>>                                   extra_feature_col=["extra feature 1",
        >>>                                                      "extra feature 2"])
        >>> horizon, lookback = 1, 1
        >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=False)
        >>> x, y = tsdataset.to_numpy()
        >>> print(x, y) # x = [[[1.9, 1, 2 ]], [[2.3, 0, 9 ]]] y = [[[ 2.4 ]], [[ 2.6 ]]]
        >>> print(x.shape, y.shape) # x.shape = (2, 1, 3) y.shape = (2, 1, 1)
        >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
        >>> x, y = tsdataset.to_numpy()
        >>> print(x, y) # x = [[[ 1.9, 2.3, 1, 2, 0, 9 ]]] y = [[[ 2.4, 2.6]]]
        >>> print(x.shape, y.shape) # x.shape = (1, 1, 6) y.shape = (1, 1, 2)

        '''
        feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
            else self.feature_col
        target_col = _to_list(target_col, "target_col") if target_col is not None \
            else self.target_col
        if self.roll_additional_feature:
            additional_feature_col =\
                list(set(feature_col).intersection(set(self.roll_additional_feature)))
            feature_col =\
                list(set(feature_col) - set(self.roll_additional_feature))
            self.roll_feature = feature_col + additional_feature_col
        else:
            additional_feature_col = None
            self.roll_feature = feature_col

        self.roll_target = target_col
        num_id = len(self._id_list)
        num_feature_col = len(self.roll_feature)
        num_target_col = len(self.roll_target)
        self.id_sensitive = id_sensitive
        roll_feature_df = None if self.roll_feature_df is None \
            else self.roll_feature_df[additional_feature_col]

        rolling_result = \
            self.df.groupby([self.id_col]) \
                .apply(lambda df: roll_timeseries_dataframe(df=df,
                                                            roll_feature_df=roll_feature_df,
                                                            lookback=lookback,
                                                            horizon=horizon,
                                                            feature_col=feature_col,
                                                            target_col=target_col))

        # concat the result on required axis
        concat_axis = 2 if id_sensitive else 0
        self.numpy_x = np.concatenate(
            [rolling_result[i][0] for i in self._id_list],
            axis=concat_axis).astype(np.float32)
        if horizon != 0:
            self.numpy_y = np.concatenate(
                [rolling_result[i][1] for i in self._id_list],
                axis=concat_axis).astype(np.float32)
        else:
            self.numpy_y = None

        # target first: when id_sensitive, reorder the fused columns so that all
        # target columns come first, followed by each id's feature columns
        if self.id_sensitive:
            feature_start_idx = num_target_col * num_id
            reindex_list = [
                list(range(i * num_target_col, (i + 1) * num_target_col)) +
                list(
                    range(feature_start_idx + i * num_feature_col,
                          feature_start_idx + (i + 1) * num_feature_col))
                for i in range(num_id)
            ]
            reindex_list = functools.reduce(lambda a, b: a + b, reindex_list)
            sorted_index = sorted(range(len(reindex_list)),
                                  key=reindex_list.__getitem__)
            self.numpy_x = self.numpy_x[:, :, sorted_index]

        # scaler index
        num_roll_target = len(self.roll_target)
        repeat_factor = len(self._id_list) if self.id_sensitive else 1
        scaler_index = [
            self.target_col.index(self.roll_target[i])
            for i in range(num_roll_target)
        ] * repeat_factor
        self.scaler_index = scaler_index

        return self
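
The "target first" reindexing is easiest to see with concrete numbers; a minimal worked sketch (values are illustrative: 2 ids, 1 target col, 2 feature cols) reproducing only the index arithmetic above:

    import functools

    num_id, num_target_col, num_feature_col = 2, 1, 2
    feature_start_idx = num_target_col * num_id  # 2
    reindex_list = [
        list(range(i * num_target_col, (i + 1) * num_target_col)) +
        list(range(feature_start_idx + i * num_feature_col,
                   feature_start_idx + (i + 1) * num_feature_col))
        for i in range(num_id)
    ]
    reindex_list = functools.reduce(lambda a, b: a + b, reindex_list)  # [0, 2, 3, 1, 4, 5]
    sorted_index = sorted(range(len(reindex_list)), key=reindex_list.__getitem__)
    print(sorted_index)  # [0, 3, 1, 2, 4, 5]
    # applied to the fused columns [t_id0, f1_id0, f2_id0, t_id1, f1_id1, f2_id1],
    # this yields [t_id0, t_id1, f1_id0, f2_id0, f1_id1, f2_id1]: all targets first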
Example #6
    def from_parquet(path,
                     dt_col,
                     target_col,
                     id_col=None,
                     extra_feature_col=None,
                     with_split=False,
                     val_ratio=0,
                     test_ratio=0.1,
                     largest_look_back=0,
                     largest_horizon=1,
                     **kwargs):
        """
        Initialize tsdataset(s) from a path to a parquet file.

        :param path: A string path to a parquet file. The string could be a URL.
               Valid URL schemes include hdfs, http, ftp, s3, gs, and file. For file URLs, a host
               is expected. A local file could be: file://localhost/path/to/table.parquet.
               A file URL can also be a path to a directory that contains multiple partitioned
               parquet files.
        :param dt_col: a str indicates the col name of datetime
               column in the input data frame.
        :param target_col: a str or list indicates the col name of target column
               in the input data frame.
        :param id_col: (optional) a str indicates the col name of dataframe id. If
               it is not explicitly stated, then the data is interpreted as only
               containing a single id.
        :param extra_feature_col: (optional) a str or list indicates the col name
               of extra feature columns that are needed to predict the target column.
        :param with_split: (optional) bool, states if we need to split the dataframe
               into train, validation and test sets. The value defaults to False.
        :param val_ratio: (optional) float, validation ratio. Only effective when
               with_split is set to True. The value defaults to 0.
        :param test_ratio: (optional) float, test ratio. Only effective when with_split
               is set to True. The value defaults to 0.1.
        :param largest_look_back: (optional) int, the largest length to look back.
               Only effective when with_split is set to True. The value defaults to 0.
        :param largest_horizon: (optional) int, the largest num of steps to look
               forward. Only effective when with_split is set to True. The value defaults
               to 1.
        :param kwargs: Any additional kwargs are passed to pd.read_parquet
               and pyarrow.parquet.read_table.

        :return: a TSDataset instance when with_split is set to False,
                 three TSDataset instances when with_split is set to True.

        Create a tsdataset instance by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> tsdataset = TSDataset.from_parquet("hdfs://path/to/table.parquet", dt_col="datetime",
        >>>                                    target_col="value", id_col="id",
        >>>                                    extra_feature_col=["extra feature 1",
        >>>                                                       "extra feature 2"])
        """
        from zoo.chronos.data.utils.file import parquet2pd
        columns = _to_list(dt_col, name="dt_col") + \
            _to_list(target_col, name="target_col") + \
            _to_list(id_col, name="id_col") + \
            _to_list(extra_feature_col, name="extra_feature_col")
        df = parquet2pd(path, columns=columns, **kwargs)
        return TSDataset.from_pandas(
            df,
            dt_col=dt_col,
            target_col=target_col,
            id_col=id_col,
            extra_feature_col=extra_feature_col,
            with_split=with_split,
            val_ratio=val_ratio,
            test_ratio=test_ratio,
            largest_look_back=largest_look_back,
            largest_horizon=largest_horizon,
        )
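
Note that only the columns named by dt_col, target_col, id_col and extra_feature_col are read from the file. A hedged sketch of the with_split path (the local path and ratios are illustrative):

    train_ts, val_ts, test_ts = TSDataset.from_parquet(
        "file://localhost/path/to/table.parquet",  # illustrative path
        dt_col="datetime", target_col="value", id_col="id",
        with_split=True, val_ratio=0.1, test_ratio=0.1)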
Example #7
    def from_xshards(shards,
                     dt_col,
                     target_col,
                     id_col=None,
                     extra_feature_col=None,
                     with_split=False,
                     val_ratio=0,
                     test_ratio=0.1,
                     largest_look_back=0,
                     largest_horizon=1):
        '''
        Initialize xshardtsdataset(s) from an xshards pandas dataframe.

        :param shards: an xshards pandas dataframe for your raw time series data.
        :param dt_col: a str indicates the col name of datetime
               column in the input data frame.
        :param target_col: a str or list indicates the col name of target column
               in the input data frame.
        :param id_col: (optional) a str indicates the col name of dataframe id. If
               it is not explicitly stated, then the data is interpreted as only
               containing a single id.
        :param extra_feature_col: (optional) a str or list indicates the col name
               of extra feature columns that are needed to predict the target column.
        :param with_split: (optional) bool, states if we need to split the dataframe
               into train, validation and test sets. The value defaults to False.
        :param val_ratio: (optional) float, validation ratio. Only effective when
               with_split is set to True. The value defaults to 0.
        :param test_ratio: (optional) float, test ratio. Only effective when with_split
               is set to True. The value defaults to 0.1.
        :param largest_look_back: (optional) int, the largest length to look back.
               Only effective when with_split is set to True. The value defaults to 0.
        :param largest_horizon: (optional) int, the largest num of steps to look
               forward. Only effective when with_split is set to True. The value defaults
               to 1.

        :return: a XShardTSDataset instance when with_split is set to False,
                 three XShardTSDataset instances when with_split is set to True.

        Create an xshardtsdataset instance by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> from zoo.orca.data.pandas import read_csv
        >>> shards = read_csv(csv_path)
        >>> tsdataset = XShardsTSDataset.from_xshards(shards, dt_col="datetime",
        >>>                                           target_col="value", id_col="id",
        >>>                                           extra_feature_col=["extra feature 1",
        >>>                                                              "extra feature 2"])
        '''

        _check_type(shards, "shards", SparkXShards)

        target_col = _to_list(target_col, name="target_col")
        feature_col = _to_list(extra_feature_col, name="extra_feature_col")

        if id_col is None:
            shards = shards.transform_shard(add_row, _DEFAULT_ID_COL_NAME,
                                            _DEFAULT_ID_PLACEHOLDER)
            id_col = _DEFAULT_ID_COL_NAME

        # repartition to id
        shards = shards.partition_by(cols=id_col,
                                     num_partitions=len(
                                         shards[id_col].unique()))

        if with_split:
            tsdataset_shards\
                = shards.transform_shard(split_timeseries_dataframe,
                                         id_col, val_ratio, test_ratio,
                                         largest_look_back, largest_horizon).split()
            return [
                XShardsTSDataset(shards=tsdataset_shards[i],
                                 id_col=id_col,
                                 dt_col=dt_col,
                                 target_col=target_col,
                                 feature_col=feature_col) for i in range(3)
            ]

        return XShardsTSDataset(shards=shards,
                                id_col=id_col,
                                dt_col=dt_col,
                                target_col=target_col,
                                feature_col=feature_col)
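
As with the pandas variant, with_split=True returns three instances, presumably in train/validation/test order; a hedged sketch (ratios are illustrative):

    train_ts, val_ts, test_ts = XShardsTSDataset.from_xshards(
        shards, dt_col="datetime", target_col="value", id_col="id",
        with_split=True, val_ratio=0.1, test_ratio=0.1)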