def _check_basic_invariants(self):
    '''
    Verify the strict invariants of the internal dataframe (self.df).

    If any invariant does not hold, a clear and user-friendly error or
    warning is raised for the user. This check runs after every
    transformation method (e.g. impute, deduplicate, ...).
    '''
    # attribute types must match the declared contract
    for value, name, expected_type in ((self.df, "df", pd.DataFrame),
                                       (self.id_col, "id_col", str),
                                       (self.dt_col, "dt_col", str),
                                       (self.target_col, "target_col", list),
                                       (self.feature_col, "feature_col", list)):
        _check_type(value, name, expected_type)

    # every declared column must actually exist in the dataframe
    _check_col_within(self.df, self.id_col)
    _check_col_within(self.df, self.dt_col)
    for col in self.target_col:
        _check_col_within(self.df, col)
    for col in self.feature_col:
        # rolled additional features are kept outside self.df, so skip them
        if self.roll_additional_feature and col in self.roll_additional_feature:
            continue
        _check_col_within(self.df, col)

    # the critical columns (datetime and id) must be free of N/A values
    _check_col_no_na(self.df, self.dt_col)
    _check_col_no_na(self.df, self.id_col)
def from_pandas(df, dt_col, target_col, id_col=None, extra_feature_col=None,
                with_split=False, val_ratio=0, test_ratio=0.1,
                largest_look_back=0, largest_horizon=1):
    '''
    Initialize tsdataset(s) from pandas dataframe.

    :param df: a pandas dataframe for your raw time series data.
    :param dt_col: a str indicates the col name of datetime column
        in the input data frame.
    :param target_col: a str or list indicates the col name of target
        column in the input data frame.
    :param id_col: (optional) a str indicates the col name of dataframe id.
        If it is not explicitly stated, then the data is interpreted as only
        containing a single id.
    :param extra_feature_col: (optional) a str or list indicates the col
        name of extra feature columns that needs to predict the target column.
    :param with_split: (optional) bool, states if we need to split the
        dataframe to train, validation and test set. The value defaults
        to False.
    :param val_ratio: (optional) float, validation ratio. Only effective
        when with_split is set to True. The value defaults to 0.
    :param test_ratio: (optional) float, test ratio. Only effective when
        with_split is set to True. The value defaults to 0.1.
    :param largest_look_back: (optional) int, the largest length to look
        back. Only effective when with_split is set to True.
        The value defaults to 0.
    :param largest_horizon: (optional) int, the largest num of steps to
        look forward. Only effective when with_split is set to True.
        The value defaults to 1.

    :return: a TSDataset instance when with_split is set to False,
             three TSDataset instances when with_split is set to True.

    Create a tsdataset instance by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
    >>>                                   target_col="value", id_col="id",
    >>>                                   extra_feature_col=["extra feature 1",
    >>>                                                      "extra feature 2"])
    '''
    _check_type(df, "df", pd.DataFrame)

    # deep-copy so later in-place transformations never touch the user's df
    tsdataset_df = df.copy(deep=True)
    target_col = _to_list(target_col, name="target_col")
    feature_col = _to_list(extra_feature_col, name="extra_feature_col")

    if id_col is None:
        # no id column given -> inject a constant id so the whole frame
        # is treated as a single time series
        tsdataset_df[_DEFAULT_ID_COL_NAME] = _DEFAULT_ID_PLACEHOLDER
        id_col = _DEFAULT_ID_COL_NAME

    if not with_split:
        return TSDataset(data=tsdataset_df,
                         id_col=id_col,
                         dt_col=dt_col,
                         target_col=target_col,
                         feature_col=feature_col)

    # split into train / validation / test dataframes, then wrap each
    splits = split_timeseries_dataframe(df=tsdataset_df,
                                        id_col=id_col,
                                        val_ratio=val_ratio,
                                        test_ratio=test_ratio,
                                        look_back=largest_look_back,
                                        horizon=largest_horizon)
    return [TSDataset(data=splits[idx],
                      id_col=id_col,
                      dt_col=dt_col,
                      target_col=target_col,
                      feature_col=feature_col)
            for idx in (0, 1, 2)]
def from_xshards(shards, dt_col, target_col, id_col=None,
                 extra_feature_col=None, with_split=False,
                 val_ratio=0, test_ratio=0.1,
                 largest_look_back=0, largest_horizon=1):
    '''
    Initialize xshardtsdataset(s) from xshard pandas dataframe.

    :param shards: an xshards pandas dataframe for your raw time series data.
    :param dt_col: a str indicates the col name of datetime column
        in the input data frame.
    :param target_col: a str or list indicates the col name of target
        column in the input data frame.
    :param id_col: (optional) a str indicates the col name of dataframe id.
        If it is not explicitly stated, then the data is interpreted as only
        containing a single id.
    :param extra_feature_col: (optional) a str or list indicates the col
        name of extra feature columns that needs to predict the target column.
    :param with_split: (optional) bool, states if we need to split the
        dataframe to train, validation and test set. The value defaults
        to False.
    :param val_ratio: (optional) float, validation ratio. Only effective
        when with_split is set to True. The value defaults to 0.
    :param test_ratio: (optional) float, test ratio. Only effective when
        with_split is set to True. The value defaults to 0.1.
    :param largest_look_back: (optional) int, the largest length to look
        back. Only effective when with_split is set to True.
        The value defaults to 0.
    :param largest_horizon: (optional) int, the largest num of steps to
        look forward. Only effective when with_split is set to True.
        The value defaults to 1.

    :return: a XShardTSDataset instance when with_split is set to False,
             three XShardTSDataset instances when with_split is set to True.

    Create a xshardtsdataset instance by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> from zoo.orca.data.pandas import read_csv
    >>> shards = read_csv(csv_path)
    >>> tsdataset = XShardsTSDataset.from_xshards(shards, dt_col="datetime",
    >>>                                           target_col="value", id_col="id",
    >>>                                           extra_feature_col=["extra feature 1",
    >>>                                                              "extra feature 2"])
    '''
    _check_type(shards, "shards", SparkXShards)

    target_col = _to_list(target_col, name="target_col")
    feature_col = _to_list(extra_feature_col, name="extra_feature_col")

    if id_col is None:
        # no id column given -> add a constant id column on every shard so
        # all rows are interpreted as one single time series
        shards = shards.transform_shard(add_row,
                                        _DEFAULT_ID_COL_NAME,
                                        _DEFAULT_ID_PLACEHOLDER)
        id_col = _DEFAULT_ID_COL_NAME

    # repartition so each id's rows are grouped into their own partition
    num_ids = len(shards[id_col].unique())
    shards = shards.partition_by(cols=id_col, num_partitions=num_ids)

    if not with_split:
        return XShardsTSDataset(shards=shards,
                                id_col=id_col,
                                dt_col=dt_col,
                                target_col=target_col,
                                feature_col=feature_col)

    # split every shard into train / validation / test, then wrap each part
    split_shards = shards.transform_shard(split_timeseries_dataframe,
                                          id_col,
                                          val_ratio,
                                          test_ratio,
                                          largest_look_back,
                                          largest_horizon).split()
    return [XShardsTSDataset(shards=split_shards[idx],
                             id_col=id_col,
                             dt_col=dt_col,
                             target_col=target_col,
                             feature_col=feature_col)
            for idx in (0, 1, 2)]