def train(working_dir, dataset='56_sunspots', epochs=100, stopping_patience=3):

    # read in training df
    df = pd.read_csv(
        f'../../datasets/seed_datasets_current/{dataset}/TRAIN/dataset_TRAIN/tables/learningData.csv'
    )
    df = _multi_index_prep(df, dataset)
    df = _time_col_to_seconds(df, dataset)

    # create TimeSeries and Learner objects
    ds = _create_ts_object(df, dataset)
    learner = DeepARLearner(ds, verbose=1)

    # fit
    learner.fit(epochs=epochs,
                stopping_patience=stopping_patience,
                early_stopping=True,
                checkpoint_dir=os.path.join("./checkpoints", working_dir))

    # evaluate on test set
    test_df = pd.read_csv(
        f'../../datasets/seed_datasets_current/{dataset}/TEST/dataset_TEST/tables/learningData.csv'
    )
    test_df = _multi_index_prep(test_df, dataset)
    test_df = _time_col_to_seconds(test_df, dataset)
    test_ds = _create_ts_test_object(test_df, ds, dataset)
    preds = learner.predict(test_ds,
                            horizon=None,
                            samples=1,
                            include_all_training=True).reshape(-1)
    return preds
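# Example invocation (a minimal sketch, not part of the original script): train() above
# expects the D3M seed dataset layout at ../../datasets/seed_datasets_current/<dataset>/
# and writes checkpoints under ./checkpoints/<working_dir>. The 'sunspots_run' directory
# name is a hypothetical choice used only for illustration.
def _example_train_run():
    preds = train('sunspots_run', dataset='56_sunspots', epochs=10, stopping_patience=3)
    print(f'{preds.shape[0]} test-set predictions produced')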
class DeepArPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive that applies a deep autoregressive forecasting algorithm for time series
    prediction. The implementation is based on this paper: https://arxiv.org/pdf/1704.04110.pdf;
    the algorithm is also available through AWS's SageMaker interface.

    Training inputs: 1) Feature dataframe, 2) Target dataframe
    Outputs: Dataframe with predictions for specific time series at specific future time instances

    Arguments:
        hyperparams {Hyperparams} -- D3M Hyperparameter object

    Keyword Arguments:
        random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id": "3410d709-0a13-4187-a1cb-159dd24b584b",
        "version": __version__,
        "name": "DeepAR",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "time series",
            "forecasting",
            "recurrent neural network",
            "autoregressive",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives".format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__))),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path": "d3m.primitives.time_series_forecasting.lstm.DeepAR",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
"algorithm_types": [ metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK, ], "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING, }) def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed) # set seed for reproducibility tf.random.set_seed(random_seed) self._cols_after_drop = 0 self._is_fit = False def get_params(self) -> Params: if not self._is_fit: return Params(drop_cols_no_tgt=None, cols_after_drop=None, train_data=None, ts_frame=None, target_column=None, timestamp_column=None, ts_object=None, grouping_column=None, output_columns=None, min_train=None, freq=None, integer_timestamps=None, is_fit=None) return Params(drop_cols_no_tgt=self._drop_cols_no_tgt, cols_after_drop=self._cols_after_drop, train_data=self._train_data, ts_frame=self._ts_frame, target_column=self._target_column, timestamp_column=self._timestamp_column, ts_object=self._ts_object, grouping_column=self._grouping_column, output_columns=self._output_columns, min_train=self._min_train, freq=self.freq, integer_timestamps=self._integer_timestamps, is_fit=self._is_fit) def set_params(self, *, params: Params) -> None: self._drop_cols_no_tgt = params['drop_cols_no_tgt'] self._cols_after_drop = params['cols_after_drop'] self._train_data = params['train_data'] self._ts_frame = params['ts_frame'] self._target_column = params['target_column'] self._timestamp_column = params['timestamp_column'] self._ts_object = params['ts_object'] self._grouping_column = params['grouping_column'] self._output_columns = params['output_columns'] self._min_train = params['min_train'] self.freq = params['freq'] self._integer_timestamps = params['integer_timestamps'] self._is_fit = params['is_fit'] def _drop_multiple_special_cols(self, col_list, col_type): """ private util function that creates list of duplicated special columns (for deletion) Arguments: col_list {List[int]} -- list of column indices col_type {str} -- D3M semantic type Returns: int or None -- first column idx in col_list if any column idxs are marked (else None) """ if len(col_list) == 0: return None elif len(col_list) > 1: logger.warn( f"""There are more than one {col_type} marked. 
                    This primitive will use the first and drop the other {col_type}s."""
            )
            self._drop_cols += col_list[1:]
            if col_type != "target column":
                self._drop_cols_no_tgt += col_list[1:]
        return col_list[0]

    def _get_cols(self, input_metadata):
        """ private util function: get indices of important columns from metadata

            Arguments:
                input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

            Raises:
                ValueError: If Target column is not of type 'Integer' or 'Float'
        """

        self._drop_cols = []
        self._drop_cols_no_tgt = []

        # get target idx (first column by default)
        target_columns = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/Target",
        ))
        if len(target_columns) == 0:
            raise ValueError("At least one column must be marked as a target")
        self._target_column = self._drop_multiple_special_cols(
            target_columns, "target column")

        # get timestamp idx (first column by default)
        timestamp_columns = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/DateTime",
        ))
        self._timestamp_column = self._drop_multiple_special_cols(
            timestamp_columns, "timestamp column")

        # get grouping idx and add suggested grouping keys to drop_cols list
        grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        self._grouping_column = self._drop_multiple_special_cols(
            grouping_columns, "grouping column")
        suggested_grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey", ))
        self._drop_cols += suggested_grouping_columns
        self._drop_cols_no_tgt += suggested_grouping_columns

        # get index_col (first index column by default)
        index_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/PrimaryKey", ))
        self._index_column = self._drop_multiple_special_cols(
            index_columns, "index column")

        # determine whether targets are count data
        target_semantic_types = input_metadata.query_column_field(
            self._target_column, "semantic_types")
        if self.hyperparams["count_data"] is not None:
            self._count_data = self.hyperparams["count_data"]
        elif "http://schema.org/Integer" in target_semantic_types:
            if np.min(self._ts_frame.iloc[:, self._target_column]) > 0:
                self._count_data = True
            else:
                self._count_data = False
        elif "http://schema.org/Float" in target_semantic_types:
            self._count_data = False
        else:
            raise ValueError(
                "Target column is not of type 'Integer' or 'Float'")

    def _update_indices(self):
        """ private util function: subtract length of drop cols from each marked idx
            to account for smaller df
        """

        length = len(self._drop_cols)
        if self._target_column is not None:
            self._target_column -= length
        if self._timestamp_column is not None:
            self._timestamp_column -= length
        if self._grouping_column is not None:
            self._grouping_column -= length
        if self._index_column is not None:
            self._index_column -= length
        # record the number of columns remaining after the drop
        self._cols_after_drop = self._ts_frame.shape[1]

    def _create_data_object_and_learner(self, val_split):
        """ private util function: creates (or updates) train ds object and learner

            Arguments:
                val_split {float} -- proportion of training data to withhold for validation
        """

        # Create TimeSeries dataset objects
        self._ts_object = TimeSeriesTrain(
            self._ts_frame,
            target_idx=self._target_column,
            timestamp_idx=self._timestamp_column,
            grouping_idx=self._grouping_column,
            index_col=self._index_column,
            count_data=self._count_data,
            negative_obs=self.hyperparams["negative_obs"],
            val_split=val_split,
            integer_timestamps=self._integer_timestamps,
            freq=self.freq,
        )

        # Create learner
        self._learner = DeepARLearner(
            self._ts_object,
            emb_dim=self.hyperparams["emb_dim"],
            lstm_dim=self.hyperparams["lstm_dim"],
            dropout=self.hyperparams["dropout_rate"],
            lr=self.hyperparams["learning_rate"],
            batch_size=self.hyperparams["batch_size"],
            train_window=self.hyperparams["window_size"],
            verbose=0,
        )

        # save weights so we can restart fitting from scratch (if desired by caller)
        self._learner.save_weights(self.hyperparams['weights_filepath'])

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Sets primitive's training data

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets

            Raises:
                ValueError: If multiple columns are annotated with 'Time' or 'DateTime' metadata
        """

        # save copy of train data so we don't predict for each row in training
        self._output_columns = outputs.columns
        self._train_data = inputs.copy()

        # combine inputs and outputs for internal TimeSeries object
        self._ts_frame = inputs.append_columns(outputs)

        # Parse cols needed for ts object
        self._get_cols(self._ts_frame.metadata)

        # drop cols if multiple special type columns
        if len(self._drop_cols) > 0:
            self._ts_frame = self._ts_frame.remove_columns(self._drop_cols)
            self._update_indices()

        # assumption is that integer timestamps are days (treated this way by DeepAR objects)
        if "http://schema.org/Integer" in self._ts_frame.metadata.query_column_field(
                self._timestamp_column, "semantic_types"):
            self._integer_timestamps = True
        else:
            self._integer_timestamps = False

        # calculate frequency of time series
        # (only index the grouping column when it is present, since it may be None)
        t_col = self._ts_frame.columns[self._timestamp_column]
        if self._grouping_column is None:
            time_col_sorted = np.sort(self._ts_frame[t_col])
            self._min_train = time_col_sorted[0]
            self.freq = calculate_time_frequency(time_col_sorted[1] - self._min_train)
            # self._train_diff = int(
            #     np.diff(np.sort(self._ts_frame.iloc[:, self._timestamp_column]))[0]
            # )
        else:
            # assume frequency is the same across all time series
            g_col = self._ts_frame.columns[self._grouping_column]
            self.freq = calculate_time_frequency(
                int(
                    self._ts_frame.groupby(g_col)[t_col].apply(
                        lambda x: np.diff(np.sort(x))).iloc[0][0]))
            self._min_train = self._ts_frame.groupby(g_col)[t_col].agg("min").min()

        # Create TimeSeries dataset object and learner
        self._create_data_object_and_learner(self.hyperparams["val_split"])

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """ Fits DeepAR model using training data from set_training_data and hyperparameters

            Keyword Arguments:
                timeout {float} -- timeout, considered (default: {None})
                iterations {int} -- iterations, considered (default: {None})

            Returns:
                CallResult[None]
        """

        # special case for no validation
        if iterations is not None:
            self._create_data_object_and_learner(0)

        if iterations is None:
            iterations_set = False
            iterations = self.hyperparams["epochs"]
            validation = self.hyperparams["val_split"] > 0
        else:
            iterations_set = True
            validation = False

        # time training for 1 epoch so we can consider timeout argument thoughtfully
        if timeout:
            logger.info(
                "Timing the fitting procedure for one epoch so we can consider the timeout argument thoughtfully"
            )
            start_time = time.time()
            _, iterations_completed = self._learner.fit(
                validation=validation,
                steps_per_epoch=self.hyperparams["steps_per_epoch"],
                epochs=1,
                stopping_patience=self.hyperparams["early_stopping_patience"],
                stopping_delta=self.hyperparams["early_stopping_delta"],
                tensorboard=False,
            )
            epoch_time_estimate = time.time() - start_time

            # subtract 1 for the epoch that already happened and 1 more to be safe
            timeout_epochs = timeout // epoch_time_estimate - 2
            iters = int(min(timeout_epochs, iterations))
        else:
            iters = iterations

        # normal fitting
        logger.info(f"Fitting for {iters} iterations")
        start_time = time.time()
        _, iterations_completed = self._learner.fit(
            validation=validation,
            steps_per_epoch=self.hyperparams["steps_per_epoch"],
            epochs=iters,
            stopping_patience=self.hyperparams["early_stopping_patience"],
            stopping_delta=self.hyperparams["early_stopping_delta"],
            tensorboard=False,
        )
        logger.info(
            f"Fit for {iterations_completed} epochs, took {time.time() - start_time}s"
        )

        # maintain primitive state (mark that training data has been used)
        self._new_train_data = False
        self._is_fit = True

        # use fitting history to set CallResult return values
        if iterations_set:
            has_finished = False
        elif iters < iterations:
            has_finished = False
        else:
            has_finished = self._is_fit

        return CallResult(None,
                          has_finished=has_finished,
                          iterations_done=iterations_completed)

    def _get_pred_intervals(self, df, keep_all=False):
        """ private util function that retrieves unevenly spaced prediction intervals from data frame

            Arguments:
                df {pandas df} -- df of predictions from which to extract prediction intervals

            Keyword Arguments:
                keep_all {bool} -- if True, take every interval slice, otherwise only take
                    those given by the df

            Returns:
                pd Series -- series of intervals, indexed by group, granularity of 1 interval
        """

        # no grouping column
        if self._grouping_column is None:
            interval = discretize_time_difference(
                df.iloc[:, self._timestamp_column],
                self._min_train,
                self.freq,
                self._integer_timestamps,
            )
            if keep_all:
                interval = np.arange(min(interval), max(interval) + 1)
            return pd.Series([interval])

        # grouping column
        else:
            g_col, t_col = (
                df.columns[self._grouping_column],
                df.columns[self._timestamp_column],
            )
            all_intervals, groups = [], []
            for (group, vals) in df.groupby(g_col)[t_col]:
                interval = discretize_time_difference(vals, self._min_train,
                                                      self.freq,
                                                      self._integer_timestamps)
                if keep_all:
                    interval = np.arange(min(interval), max(interval) + 1)
                all_intervals.append(interval)
                groups.append(group)
            return pd.Series(all_intervals, index=groups)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """ Produce primitive's predictions for specific time series at specific future time instances
            * these specific timesteps / series are specified implicitly by input dataset

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested.
                prediction slice = specific horizon idx for specific series in specific regression
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        if len(self._drop_cols_no_tgt) > 0 and inputs.shape[1] != self._cols_after_drop:
            test_frame = inputs.remove_columns(self._drop_cols_no_tgt)
        else:
            test_frame = inputs.copy()

        # Create TimeSeriesTest object
        if self._train_data.equals(inputs):
            ts_test_object = TimeSeriesTest(self._ts_object)
        # test
        else:
            ts_test_object = TimeSeriesTest(self._ts_object, test_frame)

        # get prediction slices
        pred_intervals = self._get_pred_intervals(test_frame)

        # make predictions with learner
        learner = DeepARLearner(
            self._ts_object,
            emb_dim=self.hyperparams["emb_dim"],
            lstm_dim=self.hyperparams["lstm_dim"],
            dropout=self.hyperparams["dropout_rate"],
            lr=self.hyperparams["learning_rate"],
            batch_size=self.hyperparams["batch_size"],
            train_window=self.hyperparams["window_size"],
            verbose=0,
        )
        learner.load_weights(self.hyperparams['weights_filepath'])

        start_time = time.time()
        logger.info("Making predictions...")
        preds = learner.predict(ts_test_object, include_all_training=True)
        logger.info(
            f"Prediction took {time.time() - start_time}s. Predictions array shape: {preds.shape}"
        )

        # slice predictions with learned intervals
        all_preds = []
        for p, idxs in zip(preds, pred_intervals.values):
            # all_preds.extend(p[: len(idxs)])  # this takes first n predictions
            all_preds.extend([p[i] for i in idxs])  # this takes predictions at actual indices
        flat_list = np.array([p for pred_list in all_preds for p in pred_list])

        # if np.isinf(all_preds).any():
        #     logger.debug(f'There are {np.isinf(all_preds).sum()} inf preds')
        # if np.isnan(all_preds).any():
        #     logger.debug(f'There are {np.isnan(all_preds).sum()} nan preds')
        # logger.debug(f'Max: {preds.max()}, Min: {preds.min()}')

        # fill nans with 0s in case model predicted some (shouldn't need to - preventing edge case)
        flat_list = np.nan_to_num(flat_list)

        # create output frame
        result_df = container.DataFrame(
            {self._ts_frame.columns[self._target_column]: flat_list},
            generate_metadata=True,
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=self._is_fit)

    def produce_confidence_intervals(self,
                                     *,
                                     inputs: Inputs,
                                     timeout: float = None,
                                     iterations: int = None) -> CallResult[Outputs]:
        """ Produce confidence intervals for each series 'confidence_interval_horizon' periods
            into the future

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, considered (default: {None})

            Raises:
                PrimitiveNotFittedError:

            Returns:
                CallResult[Outputs] -- Ex.
                    series | timestep | mean | 0.05 | 0.95
                    --------------------------------------
                      a    |    0     |  5   |  3   |  7
                      a    |    1     |  6   |  4   |  8
                      b    |    0     |  5   |  3   |  7
                      b    |    1     |  6   |  4   |  8
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        alpha = self.hyperparams["confidence_interval_alpha"]

        if len(self._drop_cols_no_tgt) > 0 and inputs.shape[1] != self._cols_after_drop:
            test_frame = inputs.remove_columns(self._drop_cols_no_tgt)
        else:
            test_frame = inputs.copy()

        # Create TimeSeriesTest object
        if self._train_data.equals(inputs):
            ts_test_object = TimeSeriesTest(self._ts_object)
            include_all_training = True
            horizon = 0
        # test
        else:
            ts_test_object = TimeSeriesTest(self._ts_object, test_frame)
            include_all_training = self.hyperparams['seed_predictions_with_all_data']
            horizon = self.hyperparams["confidence_interval_horizon"]

        # make predictions with learner
        start_time = time.time()
        logger.info("Making predictions...")
        preds = self._learner.predict(
            ts_test_object,
            horizon=horizon,
            samples=self.hyperparams["confidence_interval_samples"],
            include_all_training=include_all_training,
            point_estimate=False)
        logger.info(
            f"Prediction took {time.time() - start_time}s. Predictions array shape: {preds.shape}"
        )

        # convert samples to percentiles
        means = np.percentile(preds, 50, axis=2).reshape(-1, 1)
        lowers = np.percentile(preds, alpha / 2 * 100, axis=2).reshape(-1, 1)
        uppers = np.percentile(preds, (1 - alpha / 2) * 100, axis=2).reshape(-1, 1)
        assert (lowers < means).all()
        assert (means < uppers).all()

        # convert to df
        if self._grouping_column is None:
            indices = np.repeat(self._output_columns[0], preds.shape[1])
        else:
            indices = np.repeat(
                test_frame[test_frame.columns[self._grouping_column]].unique(),
                preds.shape[1])
        interval_df = pd.DataFrame(
            np.concatenate((means, lowers, uppers), axis=1),
            columns=["mean", str(alpha / 2), str(1 - alpha / 2)],
            index=indices,
        )

        # add index column
        interval_df["horizon_index"] = np.tile(np.arange(preds.shape[1]),
                                               len(interval_df.index.unique()))
        logger.debug(interval_df.head())

        # structure return df
        return CallResult(
            container.DataFrame(interval_df, generate_metadata=True),
            has_finished=self._is_fit,
        )
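# Usage sketch (an assumption for illustration, not part of the original module): how a caller
# could exercise the primitive's D3M lifecycle directly, outside of a full pipeline. The
# `hyperparams`, `attribute_df`, `target_df`, and `test_df` arguments are hypothetical; in
# practice the D3M runtime constructs and supplies these objects.
def _example_primitive_usage(hyperparams, attribute_df, target_df, test_df):
    primitive = DeepArPrimitive(hyperparams=hyperparams, random_seed=0)
    primitive.set_training_data(inputs=attribute_df, outputs=target_df)
    primitive.fit()  # trains the underlying DeepAR learner
    point_preds = primitive.produce(inputs=test_df).value  # predicted target values
    intervals = primitive.produce_confidence_intervals(inputs=test_df).value  # mean + quantile bounds
    return point_preds, intervals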
def hp_search(working_dir,
              dataset='56_sunspots',
              epochs=100,
              metric='eval_mae_result',
              stopping_patience=5,
              stopping_delta=1):

    working_dir = os.path.join("./checkpoints", working_dir)

    # define domains for HP search
    HP_EMB_DIM = hp.HParam('emb_dim', hp.Discrete([32, 64, 128]))
    HP_LSTM_DIM = hp.HParam('lstm_dim', hp.Discrete([32, 64, 128]))
    HP_DROPOUT = hp.HParam('lstm_dropout', hp.Discrete([0.1, 0.2, 0.3]))
    HP_LR = hp.HParam('learning_rate', hp.Discrete([.0001, .001, .01]))
    HP_BS = hp.HParam('batch_size', hp.Discrete([32, 64, 128]))
    HP_WINDOW = hp.HParam('window_size', hp.Discrete([20, 40, 60]))

    # set up config
    with tf.summary.create_file_writer(working_dir).as_default():
        hp.hparams_config(
            hparams=[HP_EMB_DIM, HP_LSTM_DIM, HP_DROPOUT, HP_LR, HP_BS, HP_WINDOW],
            metrics=[hp.Metric(metric, display_name=metric)])

    # read in training df
    df = pd.read_csv(
        f'../../datasets/seed_datasets_current/{dataset}/TRAIN/dataset_TRAIN/tables/learningData.csv'
    )
    df = _multi_index_prep(df, dataset)
    df = _time_col_to_seconds(df, dataset)

    # create TimeSeries and Learner objects
    ds = _create_ts_object(df, dataset)

    # grid search over parameters
    run_num = 0
    total_run_count = len(HP_EMB_DIM.domain.values) * \
        len(HP_LSTM_DIM.domain.values) * \
        len(HP_DROPOUT.domain.values) * \
        len(HP_LR.domain.values) * \
        len(HP_BS.domain.values) * \
        len(HP_WINDOW.domain.values)

    # outfile for saving hp config and runtimes
    outfile = open(os.path.join(working_dir, "metrics.txt"), "w+", buffering=1)

    for emb_dim in HP_EMB_DIM.domain.values:
        for lstm_dim in HP_LSTM_DIM.domain.values:
            for dropout in HP_DROPOUT.domain.values:
                for lr in HP_LR.domain.values:
                    for bs in HP_BS.domain.values:
                        for window_size in HP_WINDOW.domain.values:

                            # create dict of parameters
                            hp_dict = {
                                'emb_dim': emb_dim,
                                'lstm_dim': lstm_dim,
                                'lstm_dropout': dropout,
                                'learning_rate': lr,
                                'batch_size': bs,
                                'window_size': window_size,
                            }
                            run_name = f'run-{run_num}'
                            logger.info(
                                f'--- Starting Run {run_name} of {total_run_count} ---'
                            )
                            # print_dict = {
                            #     h.name: hp_dict[h] for h in hp_dict
                            # }
                            logger.info(f'HP Dict: {hp_dict}')

                            # log this run's HPs, then create learner and fit with them;
                            # summaries must be written inside a per-run file writer context
                            run_dir = os.path.join(working_dir, run_name)
                            start_time = time.time()
                            with tf.summary.create_file_writer(run_dir).as_default():
                                hp.hparams(hp_dict)
                                learner = DeepARLearner(ds, verbose=1, hparams=hp_dict)
                                # learner.fit is assumed to return (final metric, epochs completed),
                                # matching the unpacking used in DeepArPrimitive.fit above
                                final_metric, _ = learner.fit(
                                    epochs=epochs,
                                    stopping_patience=stopping_patience,
                                    stopping_delta=stopping_delta,
                                    checkpoint_dir=run_dir)
                                tf.summary.scalar(metric, final_metric, step=1)

                            outfile.write(
                                f'HPs: {hp_dict} ---- Metric: {final_metric} ---- Time: {round(time.time() - start_time, 2)}\n'
                            )
                            run_num += 1

    outfile.close()
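# Example invocation (a minimal sketch, not part of the original script): kicking off the full
# grid search above. With three values per hyperparameter this is 3^6 = 729 runs, so callers may
# want to trim the domains first. 'sunspots_hp_search' is a hypothetical directory name, and the
# ./checkpoints/<working_dir> directory is assumed to exist before metrics.txt is opened.
if __name__ == '__main__':
    hp_search('sunspots_hp_search', dataset='56_sunspots', epochs=10, stopping_patience=5)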