def _hp_search(self, input_df, validation_df, metric, recipe, mc, resources_per_trial, remote_dir):
    """
    Run the hyper-parameter search described by ``recipe`` and build a
    pipeline from the best trial.

    :param input_df: one data frame or a list of data frames. When a list
        is given, only its first element is used to derive the feature list;
        the whole object is still handed to the search engine.
    :param validation_df: validation data forwarded to the search engine
    :param metric: metric forwarded to the search engine
    :param recipe: object providing ``search_space(feature_list)``,
        ``runtime_params()``, ``search_algorithm_params()``,
        ``search_algorithm()`` and ``fixed_params()``
    :param mc: forwarded unchanged to the search engine
    :param resources_per_trial: forwarded to ``RayTuneSearchEngine``
    :param remote_dir: forwarded to ``RayTuneSearchEngine`` and to
        ``self._make_pipeline``
    :return: the value returned by ``self._make_pipeline`` for the best trial
    """
    feature_transformer = TimeSequenceFeatureTransformer(
        self.future_seq_len,
        self.dt_col,
        self.target_col,
        self.extra_features_col,
        self.drop_missing,
    )
    sample_df = input_df[0] if isinstance(input_df, list) else input_df
    feature_list = feature_transformer.get_feature_list(sample_df)

    model = TimeSequenceModel(
        check_optional_config=False,
        future_seq_len=self.future_seq_len,
    )

    # Collect everything the search engine needs from the recipe.
    search_space = recipe.search_space(feature_list)
    # 'num_samples' travels as its own argument; every other runtime
    # parameter is used as a stop criterion.
    stop = dict(recipe.runtime_params())
    num_samples = stop.pop('num_samples')
    search_algorithm_params = recipe.search_algorithm_params()
    search_algorithm = recipe.search_algorithm()
    fixed_params = recipe.fixed_params()

    searcher = RayTuneSearchEngine(
        logs_dir=self.logs_dir,
        resources_per_trial=resources_per_trial,
        name=self.name,
        remote_dir=remote_dir,
    )
    searcher.compile(
        input_df,
        search_space=search_space,
        stop=stop,
        search_algorithm_params=search_algorithm_params,
        search_algorithm=search_algorithm,
        fixed_params=fixed_params,
        feature_transformers=feature_transformer,
        future_seq_len=self.future_seq_len,
        validation_df=validation_df,
        metric=metric,
        mc=mc,
        num_samples=num_samples,
    )
    searcher.run()

    # Only the single best trial is used for now.
    best_trial = searcher.get_best_trials(k=1)[0]
    return self._make_pipeline(
        best_trial,
        feature_transformers=feature_transformer,
        model=model,
        remote_dir=remote_dir,
    )
def _hp_search(self, input_df, validation_df, metric, recipe, mc, resources_per_trial, remote_dir):
    """
    Run the hyper-parameter search described by ``recipe`` and build a
    pipeline from the search result.

    :param input_df: one data frame or a list of data frames. When a list
        is given, only its first element is used to derive the feature list;
        the whole object is still handed to the search engine.
    :param validation_df: validation data forwarded to the search engine
    :param metric: metric forwarded to the search engine; also used to
        derive the metric mode via ``TimeSequencePredictor._get_metric_mode``
    :param recipe: object providing ``search_space(feature_list)``; it is
        also forwarded as-is to the search engine
    :param mc: forwarded unchanged to the search engine
    :param resources_per_trial: forwarded to ``RayTuneSearchEngine``
    :param remote_dir: forwarded to ``RayTuneSearchEngine`` and to
        ``self._make_pipeline``
    :return: the value returned by ``self._make_pipeline``
    """
    feature_transformer = TimeSequenceFeatureTransformer(
        self.future_seq_len,
        self.dt_col,
        self.target_col,
        self.extra_features_col,
        self.drop_missing,
    )
    sample_df = input_df[0] if isinstance(input_df, list) else input_df
    feature_list = feature_transformer.get_feature_list(sample_df)

    def _new_model():
        # Factory so that independent model instances can be created.
        return TimeSequenceModel(
            check_optional_config=False,
            future_seq_len=self.future_seq_len,
        )

    # This instance ends up in the returned pipeline.
    model = _new_model()

    search_space = recipe.search_space(feature_list)
    metric_mode = TimeSequencePredictor._get_metric_mode(metric)

    searcher = RayTuneSearchEngine(
        logs_dir=self.logs_dir,
        resources_per_trial=resources_per_trial,
        name=self.name,
        remote_dir=remote_dir,
    )
    searcher.compile(
        input_df,
        # NOTE(review): a freshly built model instance (not the factory
        # itself) is passed under the name `model_create_func` -- confirm
        # that this is what RayTuneSearchEngine.compile expects.
        model_create_func=_new_model(),
        search_space=search_space,
        recipe=recipe,
        feature_transformers=feature_transformer,
        future_seq_len=self.future_seq_len,
        validation_df=validation_df,
        metric=metric,
        metric_mode=metric_mode,
        mc=mc,
    )
    analysis = searcher.run()

    return self._make_pipeline(
        analysis,
        metric_mode,
        feature_transformers=feature_transformer,
        model=model,
        remote_dir=remote_dir,
    )
class TimeSequencePipeline(Pipeline):
    """
    Bundle of a feature transformer, a model and their configuration that can
    be fitted, evaluated, used for prediction and saved to disk.
    """

    def __init__(self, feature_transformers=None, model=None, config=None, name=None):
        """
        initialize a pipeline
        :param feature_transformers: the feature transformers
        :param model: the internal model
        :param config: dict of configs; may be None, in which case
            fit_with_fixed_configs falls back to the default configs
        :param name: pipeline name, used in the default save file names
        """
        self.feature_transformers = feature_transformers
        self.model = model
        self.config = config
        self.name = name
        # Creation timestamp; part of the default file names in save() and
        # config_save().
        self.time = time.strftime("%Y%m%d-%H%M%S")

    def describe(self):
        """
        Print the identity configs the pipeline was initialized with.
        Requires self.config to contain all of the printed keys.
        """
        init_info = [
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        ]
        print("**** Initialization info ****")
        for info in init_info:
            print(info + ":", self.config[info])
        print("")

    def fit(self, input_df, validation_df=None, mc=False, epoch_num=20):
        """
        Continue training the current model on new data, using the existing
        feature transformer.

        :param input_df: training data passed to the feature transformer
        :param validation_df: optional validation data: one data frame or a
            list of data frames. Empty or missing validation data is ignored.
        :param mc: forwarded to the model's fit_eval
        :param epoch_num: number of epochs, passed to fit_eval as 'epochs'
        :return:
        """
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        # Use the shared validity check so that a list of data frames is
        # handled the same way as in fit_with_fixed_configs (a bare
        # `.empty` access fails on a list).
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None
        new_config = {'epochs': epoch_num}
        self.model.fit_eval(x,
                            y,
                            validation_data,
                            mc=mc,
                            verbose=1,
                            **new_config)
        print('Fit done!')

    def _is_val_df_valid(self, validation_df):
        """
        Tell whether validation_df carries any usable validation data.

        :param validation_df: None, one data frame or a list of data frames
        :return: True for a non-empty data frame or for a non-empty list in
            which at least one data frame is non-empty; False otherwise
        """
        if isinstance(validation_df, pd.DataFrame):
            return not validation_df.empty
        if isinstance(validation_df, list):
            return bool(validation_df) and not all(
                d.empty for d in validation_df)
        return False

    def _check_configs(self):
        """
        Validate self.config: raise if a required config is missing and
        report (without applying) the default for every unspecified
        optional config.

        :raises ValueError: if a required config is not in self.config
        """
        required_configs = {'future_seq_len'}
        # Every required key must be present, not just any one of them.
        if required_configs.difference(self.config):
            raise ValueError("Missing required parameters in configuration. " +
                             "Required parameters are: " +
                             str(required_configs))
        default_config = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mse'
        }
        for config, value in default_config.items():
            if config not in self.config:
                print('Config: \'{}\' is not specified. '
                      'A default value of {} will be used.'.format(
                          config, value))

    def get_default_configs(self):
        """
        Print and return the default configs used by fit_with_fixed_configs
        when the pipeline has no config of its own.

        :return: a new dict of default configs
        """
        default_configs = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'future_seq_len': 1,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mean_squared_error'
        }
        print("**** default config: ****")
        for config, value in default_configs.items():
            print(config + ":", value)
        print(
            "You can change any fields in the default configs by passing into "
            "fit_with_fixed_configs(). Otherwise, the default values will be used."
        )
        return default_configs

    def fit_with_fixed_configs(self,
                               input_df,
                               validation_df=None,
                               mc=False,
                               **user_configs):
        """
        Fit pipeline with fixed configs. The model will be trained from
        initialization with the hyper-parameter specified in configs.
        The configs contain both identity configs (Eg. "future_seq_len",
        "dt_col", "target_col", "metric") and automl tunable configs
        (Eg. "past_seq_len", "batch_size").
        We recommend calling get_default_configs to see the name and default
        values of configs you can specify.
        :param input_df: one data frame or a list of data frames
        :param validation_df: one data frame or a list of data frames
        :param mc: forwarded to the model's fit_eval
        :param user_configs: you can overwrite or add more configs with
            user_configs. Eg. "epochs"
        :return:
        """
        # self._check_configs()
        if self.config is None:
            self.config = self.get_default_configs()
        # **user_configs is always a dict (possibly empty).
        self.config.update(user_configs)

        # Rebuild the feature transformer and the model from scratch using
        # the identity configs each of them needs.
        ft_id_config_set = {
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        }
        ft_id_configs = {a: self.config[a] for a in ft_id_config_set}
        self.feature_transformers = TimeSequenceFeatureTransformer(
            **ft_id_configs)

        model_id_config_set = {'future_seq_len'}
        model_id_configs = {a: self.config[a] for a in model_id_config_set}
        self.model = TimeSequenceModel(check_optional_config=False,
                                       **model_id_configs)

        # With fixed configs every available feature is selected.
        all_available_features = self.feature_transformers.get_feature_list(
            input_df)
        self.config.update({"selected_features": all_available_features})

        (x_train, y_train) = self.feature_transformers.fit_transform(
            input_df, **self.config)
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None

        self.model.fit_eval(x_train,
                            y_train,
                            validation_data=validation_data,
                            mc=mc,
                            verbose=1,
                            **self.config)

    def evaluate(self, input_df, metrics=["mse"], multioutput='raw_values'):
        """
        evaluate the pipeline
        :param input_df:
        :param metrics: one metric name or a list of metric names; each is
            passed to Evaluator.evaluate.
            NOTE(review): documented so far as a subset of
            ['mean_squared_error', 'r_square', 'sMAPE'] while the default is
            "mse" -- confirm the names Evaluator accepts.
        :param multioutput: string in ['raw_values', 'uniform_average']
                'raw_values' :
                    Returns a full set of errors in case of multioutput input.
                'uniform_average' :
                    Errors of all outputs are averaged with uniform weight.
        :return: list with one evaluation result per metric, in order
        """
        # The default list is never mutated; a single name is wrapped in a
        # new list.
        if isinstance(metrics, str):
            metrics = [metrics]
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        y_pred = self.model.predict(x)
        # With a single output column the results are always averaged.
        if y_pred.shape[1] == 1:
            multioutput = 'uniform_average'
        y_unscale, y_pred_unscale = self.feature_transformers.post_processing(
            input_df, y_pred, is_train=True)
        return [
            Evaluator.evaluate(m,
                               y_unscale,
                               y_pred_unscale,
                               multioutput=multioutput) for m in metrics
        ]

    def predict(self, input_df):
        """
        predict test data with the pipeline fitted
        :param input_df:
        :return: the post-processed prediction
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        return y_output

    def predict_with_uncertainty(self, input_df, n_iter=100):
        """
        predict test data together with an uncertainty estimate
        :param input_df:
        :param n_iter: forwarded to the model's predict_with_uncertainty
        :return: (post-processed prediction, unscaled uncertainty)
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred, y_pred_uncertainty = self.model.predict_with_uncertainty(
            x=x, n_iter=n_iter)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        y_uncertainty = self.feature_transformers.unscale_uncertainty(
            y_pred_uncertainty)
        return y_output, y_uncertainty

    def save(self, ppl_file=None):
        """
        save pipeline to file, contains feature transformer, model, trial config.
        :param ppl_file: target file; defaults to
            "<name>_<creation time>.ppl" under DEFAULT_PPL_DIR
        :return: the file the pipeline was saved to
        """
        ppl_file = ppl_file or os.path.join(
            DEFAULT_PPL_DIR, "{}_{}.ppl".format(self.name, self.time))
        save_zip(ppl_file, self.feature_transformers, self.model, self.config)
        print("Pipeline is saved in", ppl_file)
        return ppl_file

    def config_save(self, config_file=None):
        """
        save all configs to file.
        :param config_file: target file; defaults to
            "<name>_<creation time>.json" under DEFAULT_CONFIG_DIR
        :return: the file the configs were saved to
        """
        config_file = config_file or os.path.join(
            DEFAULT_CONFIG_DIR, "{}_{}.json".format(self.name, self.time))
        save_config(config_file, self.config, replace=True)
        return config_file
def _hp_search(self, input_df, validation_df, metric):
    """
    Run a hyper-parameter search with a hard-coded search space and build a
    pipeline from the best trial.

    :param input_df: data frame used to derive the candidate feature list;
        also handed to the search engine
    :param validation_df: validation data forwarded to the search engine
    :param metric: metric forwarded to the search engine
    :return: the value returned by ``self._make_pipeline`` for the best trial
    """
    ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                        self.dt_col,
                                        self.target_col,
                                        self.extra_features_col,
                                        self.drop_missing)

    feature_list = ft.get_feature_list(input_df)

    # Model handed to the search engine.
    model = VanillaLSTM(check_optional_config=False,
                        future_seq_len=self.future_seq_len)

    search_space = {
        # -------- feature related parameters
        # Random subset of the available features, drawn without
        # replacement.
        # NOTE(review): randint's `high` is exclusive and must exceed `low`,
        # so this needs more than 3 features and never selects all of
        # them -- confirm this is intended.
        "selected_features": RandomSample(lambda spec: np.random.choice(
            feature_list,
            size=np.random.randint(low=3, high=len(feature_list), size=1),
            replace=False)),

        # --------- model related parameters
        'out_units': self.future_seq_len,
        "lr": 0.001,
        "lstm_1_units": GridSearch([16, 32]),
        "dropout_1": 0.2,
        "lstm_2_units": 10,
        "dropout_2": RandomSample(lambda spec: np.random.uniform(0.2, 0.5)),
        "batch_size": 1024,
    }

    stop = {"reward_metric": -0.05, "training_iteration": 10}

    searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                   ray_num_cpus=6,
                                   resources_per_trial={"cpu": 2})
    searcher.compile(input_df,
                     search_space=search_space,
                     stop=stop,
                     feature_transformers=ft,
                     model=model,
                     validation_df=validation_df,
                     metric=metric)
    # The return value of run() is not needed; the best trial is fetched
    # from the searcher below.
    searcher.run()

    # Only the single best trial is used for now.
    best = searcher.get_best_trials(k=1)[0]
    # The pipeline gets a freshly constructed model, not the instance that
    # was passed to compile().
    pipeline = self._make_pipeline(
        best,
        feature_transformers=ft,
        model=VanillaLSTM(check_optional_config=False))
    return pipeline