def _make_pipeline(self, analysis, feature_transformers, model, remote_dir):
    metric = self.metric
    mode = Evaluator.get_metric_mode(metric)
    best_config = analysis.get_best_config(metric=metric, mode=mode)
    best_logdir = analysis.get_best_logdir(metric=metric, mode=mode)
    print("best log dir is ", best_logdir)
    dataframe = analysis.dataframe(metric=metric, mode=mode)
    model_path = os.path.join(best_logdir, dataframe["checkpoint"].iloc[0])
    config = convert_bayes_configs(best_config).copy()
    self._print_config(config)
    if remote_dir is not None:
        all_config = restore_hdfs(model_path,
                                  remote_dir,
                                  feature_transformers,
                                  model)
    else:
        all_config = restore_zip(model_path,
                                 feature_transformers,
                                 model)
    return TimeSequencePipeline(name=self.name,
                                feature_transformers=feature_transformers,
                                model=model,
                                config=all_config)
def train_func(config):
    # Make copies of the shared (ray.put) objects so each trial can
    # mutate them independently.
    global_ft = ray.get(ft_id)
    trial_ft = deepcopy(global_ft)
    if isinstance(model_create_func, ModelBuilder):
        trial_model = model_create_func.build(config)
    else:
        trial_model = model_create_func()

    # optional imputation of missing values before feature transformation
    imputer = None
    if "imputation" in config:
        if config["imputation"] == "LastFillImpute":
            imputer = LastFillImpute()
        elif config["imputation"] == "FillZeroImpute":
            imputer = FillZeroImpute()

    # handling input
    global_input_df = ray.get(input_df_id)
    trial_input_df = deepcopy(global_input_df)
    if imputer:
        trial_input_df = imputer.impute(trial_input_df)

    config = convert_bayes_configs(config).copy()
    (x_train, y_train) = trial_ft.fit_transform(trial_input_df, **config)

    # handling validation data
    validation_data = None
    if is_val_df_valid:
        global_validation_df = ray.get(validation_df_id)
        trial_validation_df = deepcopy(global_validation_df)
        validation_data = trial_ft.transform(trial_validation_df)

    # no need to call build since it is called the first time fit_eval is called
    # fit model
    best_reward_m = None
    for i in range(1, 101):
        result = trial_model.fit_eval(x_train,
                                      y_train,
                                      validation_data=validation_data,
                                      mc=mc,
                                      metric=metric,
                                      **config)
        # Normalize into a reward (higher is better) regardless of metric mode.
        reward_m = result if Evaluator.get_metric_mode(metric) == "max" else -result
        ckpt_name = "best.ckpt"
        if best_reward_m is None or reward_m > best_reward_m:
            best_reward_m = reward_m
            save_zip(ckpt_name, trial_ft, trial_model, config)
            if remote_dir is not None:
                upload_ppl_hdfs(remote_dir, ckpt_name)
        tune.track.log(training_iteration=i,
                       reward_metric=reward_m,
                       checkpoint=ckpt_name)
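# For context, a minimal, self-contained sketch of the Ray Tune flow a train
# function like the one above plugs into, assuming a legacy Ray Tune version
# (~0.8) where `tune.track.log` exists. `toy_trainable` and its config keys
# are hypothetical placeholders, not names from this module.
def _example_tune_run():
    import ray
    from ray import tune

    def toy_trainable(config):
        for i in range(1, 4):
            # report one score per iteration, mirroring train_func's reporting
            tune.track.log(training_iteration=i,
                           reward_metric=-config["lr"] * i)

    ray.init(ignore_reinit_error=True)
    analysis = tune.run(toy_trainable,
                        config={"lr": tune.grid_search([0.01, 0.1])},
                        stop={"training_iteration": 3})
    return analysis.get_best_config(metric="reward_metric", mode="max")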
def _detach_recipe(self, recipe):
    self.search_space = recipe.search_space()
    stop = recipe.runtime_params()
    self.metric_threshold = None
    if "reward_metric" in stop.keys():
        self.mode = Evaluator.get_metric_mode(self.metric)
        # "reward_metric" is always a reward (higher is better); flip the
        # sign for metrics that are minimized.
        self.metric_threshold = -stop["reward_metric"] \
            if self.mode == "min" else stop["reward_metric"]
    self.epochs = stop["training_iteration"]
    self.num_samples = stop["num_samples"]
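# A recipe consumed by _detach_recipe only needs the two methods used above.
# This toy recipe is an illustrative assumption, not a class from the library;
# its hyperparameter keys and values are hypothetical.
class _ToyRecipe:
    def search_space(self):
        return {"lr": [0.001, 0.01, 0.1]}

    def runtime_params(self):
        return {"reward_metric": -0.05,    # threshold expressed as a reward
                "training_iteration": 20,  # max epochs per trial
                "num_samples": 4}          # number of trials to sample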
def _validate_metric_mode(metric, mode):
    from zoo.automl.common.metrics import Evaluator
    if not mode:
        try:
            mode = Evaluator.get_metric_mode(metric)
        except ValueError:
            pass
    if not mode:
        raise ValueError(f"We cannot infer metric mode with metric name of {metric}. "
                         f"Please specify the `metric_mode` parameter in AutoEstimator.fit().")
    if mode not in ["min", "max"]:
        raise ValueError("`mode` has to be one of ['min', 'max']")
    return mode
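# Expected behavior of the helper above, assuming Evaluator maps "mse" to
# mode "min" and does not recognize "foo":
#
#   _validate_metric_mode("mse", None)   # -> "min" (inferred from the name)
#   _validate_metric_mode("mse", "max")  # -> "max" (an explicit mode is kept)
#   _validate_metric_mode("foo", None)   # -> ValueError: mode cannot be inferred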
def _train(self):
    result = self.trial_model.fit_eval(self.x_train,
                                       self.y_train,
                                       validation_data=self.validation_data,
                                       **self.config)
    # Normalize the result into a reward (higher is better) regardless of
    # whether the metric is maximized or minimized.
    self.reward_m = result if Evaluator.get_metric_mode(metric) == "max" else -result
    return {"reward_metric": self.reward_m, "checkpoint": self.ckpt_name}
def _validate_metric_mode(metric, mode):
    if not mode:
        if callable(metric):
            raise ValueError("You must specify `metric_mode` for your metric function")
        try:
            from zoo.automl.common.metrics import Evaluator
            mode = Evaluator.get_metric_mode(metric)
        except ValueError:
            pass
    if not mode:
        raise ValueError(f"We cannot infer metric mode with metric name of {metric}. Please"
                         f" specify the `metric_mode` parameter in AutoEstimator.fit().")
    if mode not in ["min", "max"]:
        raise ValueError("`mode` has to be one of ['min', 'max']")
    return mode
def train_func(config):
    train_data = ray.get(data_id)
    val_data = ray.get(validation_data_id)
    config = convert_bayes_configs(config).copy()
    if not isinstance(model_builder, ModelBuilder):
        raise ValueError("You must input a ModelBuilder instance for model_builder")
    trial_model = model_builder.build(config)

    # fit model
    best_reward = None
    for i in range(1, 101):
        result = trial_model.fit_eval(data=train_data,
                                      validation_data=val_data,
                                      mc=mc,
                                      metric=metric,
                                      **config)
        reward = result
        checkpoint_filename = "best.ckpt"
        # Save the checkpoint of the best iteration so far.
        mode = Evaluator.get_metric_mode(metric)
        if mode == "max":
            has_best_reward = best_reward is None or reward > best_reward
        else:
            has_best_reward = best_reward is None or reward < best_reward
        if has_best_reward:
            best_reward = reward
            trial_model.save(checkpoint_filename)
            # Save to hdfs
            if remote_dir is not None:
                put_ckpt_hdfs(remote_dir, checkpoint_filename)
        report_dict = {"training_iteration": i,
                       metric: reward,
                       "checkpoint": checkpoint_filename,
                       "best_" + metric: best_reward}
        tune.report(**report_dict)
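# The model_builder above only needs to expose the interface used in
# train_func: build(config) returning an object with fit_eval(...) and
# save(...). A minimal hypothetical sketch, assuming ModelBuilder can be
# subclassed by overriding build alone (its full abstract interface is not
# shown in this module):
class _ToyModelBuilder(ModelBuilder):
    def build(self, config):
        class _ToyModel:
            def fit_eval(self, data, validation_data=None, mc=False,
                         metric="mse", **config):
                x, y = data
                # ... train for one epoch, then return the validation score ...
                return 0.0

            def save(self, checkpoint_filename):
                pass  # write model state to checkpoint_filename

        return _ToyModel()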
def compile(self,
            data,
            model_builder,
            epochs=1,
            validation_data=None,
            metric="mse",
            metric_threshold=None,
            n_sampling=1,
            search_space=None,
            search_alg=None,
            search_alg_params=None,
            scheduler=None,
            scheduler_params=None,
            mc=False):
    """
    Do necessary preparations for the engine.

    :param data: data for training.
           Pandas Dataframe: a Pandas dataframe for training.
           Numpy ndarray: a tuple in the form of (x, y),
               x: ndarray for training input
               y: ndarray for training output
    :param model_builder: model creation function
    :param epochs: max epochs for training
    :param validation_data: data for validation, in the same format as data
    :param metric: metric name
    :param metric_threshold: a trial will be terminated when the metric threshold is met
    :param n_sampling: number of times to sample from the search space
    :param search_space: a dict for the search space
    :param search_alg: str, any searcher supported by ray tune
           (i.e. "variant_generator", "random", "ax", "dragonfly", "skopt",
           "hyperopt", "bayesopt", "bohb", "nevergrad", "optuna", "zoopt"
           and "sigopt")
    :param search_alg_params: extra parameters for the search algorithm
    :param scheduler: str, any scheduler supported by ray tune
    :param scheduler_params: parameters for the scheduler
    :param mc: whether to calculate uncertainty
    """
    # metric and metric's mode
    self.metric = metric
    self.mode = Evaluator.get_metric_mode(metric)
    self.num_samples = n_sampling
    self.stopper = TrialStopper(metric_threshold=metric_threshold,
                                epochs=epochs,
                                metric=self.metric,
                                mode=self.mode)
    self.search_space = search_space
    self._search_alg = RayTuneSearchEngine._set_search_alg(search_alg, search_alg_params,
                                                           self.metric, self.mode)
    self._scheduler = RayTuneSearchEngine._set_scheduler(scheduler, scheduler_params,
                                                         self.metric, self.mode)
    self.train_func = self._prepare_train_func(data=data,
                                               model_builder=model_builder,
                                               validation_data=validation_data,
                                               metric=metric,
                                               mc=mc,
                                               remote_dir=self.remote_dir)
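# A hedged usage sketch for this compile() variant. `engine` stands for an
# already constructed RayTuneSearchEngine and `my_model_builder` for a
# ModelBuilder instance; both, along with the toy search space, are
# assumptions for illustration only.
def _example_compile(engine, my_model_builder):
    import numpy as np
    x = np.random.randn(100, 10)
    y = np.random.randn(100, 1)
    engine.compile(data=(x, y),
                   model_builder=my_model_builder,
                   epochs=5,
                   metric="mse",
                   n_sampling=8,
                   search_space={"lr": 0.001})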
def compile(self,
            data,
            model_create_func,
            recipe,
            search_space=None,
            search_alg=None,
            search_alg_params=None,
            scheduler=None,
            scheduler_params=None,
            feature_transformers=None,
            mc=False,
            metric="mse"):
    """
    Do necessary preparations for the engine.

    :param data: data dictionary.
           Pandas Dataframe API keys:
               "df": dataframe for training
               "val_df": (optional) dataframe for validation
               "feature_cols": (optional) column names for extra features
               "target_col": (optional) column name for target
           Numpy ndarray API keys:
               "x": ndarray for training input
               "y": ndarray for training output
               "val_x": (optional) ndarray for validation input
               "val_y": (optional) ndarray for validation output
           Note: for the Pandas Dataframe API, if "feature_cols" or "target_col"
           is missing, then feature_transformers is required.
    :param model_create_func: model creation function
    :param recipe: search recipe
    :param search_space: search space, required if recipe is not provided
    :param search_alg: str, one of "skopt", "bayesopt" and "sigopt"
    :param search_alg_params: extra parameters for the search algorithm
    :param scheduler: str, any scheduler supported by ray tune
    :param scheduler_params: parameters for the scheduler
    :param feature_transformers: feature transformer instance
    :param mc: whether to calculate uncertainty
    :param metric: metric name
    """
    # data mode detection
    assert isinstance(data, dict), "ERROR: Argument 'data' should be a dictionary."
    data_mode = None  # data_mode can only be 'dataframe' or 'ndarray'
    data_schema = set(data.keys())
    if set(["df"]).issubset(data_schema):
        data_mode = 'dataframe'
    if set(["x", "y"]).issubset(data_schema):
        data_mode = 'ndarray'
    assert data_mode in ['dataframe', 'ndarray'], \
        "ERROR: Argument 'data' should fit either the dataframe schema " \
        "(include 'df' in keys) or the ndarray schema (include 'x' and 'y' in keys)."

    # data extraction
    if data_mode == 'dataframe':
        input_data = data['df']
        feature_cols = data.get("feature_cols", None)
        target_col = data.get("target_col", None)
        validation_data = data.get("val_df", None)
    else:
        input_data = {"x": data["x"], "y": data["y"]}
        if 'val_x' in data.keys():
            validation_data = {"x": data["val_x"], "y": data["val_y"]}
        else:
            validation_data = None

    # metric and metric's mode
    self.metric = metric
    self.mode = Evaluator.get_metric_mode(metric)

    # prepare parameters for search engine
    runtime_params = recipe.runtime_params()
    self.num_samples = runtime_params['num_samples']
    stop = dict(runtime_params)
    del stop['num_samples']

    # translate "reward_metric" in the stop criteria into the real metric
    redundant_stop_keys = stop.keys() - {"reward_metric", "training_iteration"}
    assert len(redundant_stop_keys) == 0, \
        f"{redundant_stop_keys} is not expected in stop criteria, " \
        f"only \"reward_metric\" and \"training_iteration\" are expected."
    if "reward_metric" in stop.keys():
        # "reward_metric" is always a reward (higher is better); flip the
        # sign for metrics that are minimized.
        stop[self.metric] = -stop["reward_metric"] \
            if self.mode == "min" else stop["reward_metric"]
        del stop["reward_metric"]
    stop.setdefault("training_iteration", 1)
    self.stopper = TrialStopper(stop=stop, metric=self.metric, mode=self.mode)

    if search_space is None:
        search_space = recipe.search_space()
    self.search_space = search_space
    self._search_alg = RayTuneSearchEngine._set_search_alg(search_alg, search_alg_params,
                                                           recipe, self.metric, self.mode)
    self._scheduler = RayTuneSearchEngine._set_scheduler(scheduler, scheduler_params,
                                                         self.metric, self.mode)

    if feature_transformers is None and data_mode == 'dataframe':
        feature_transformers = IdentityTransformer(feature_cols, target_col)

    numpy_format = (data_mode == 'ndarray')
    self.train_func = self._prepare_train_func(input_data=input_data,
                                               model_create_func=model_create_func,
                                               feature_transformers=feature_transformers,
                                               validation_data=validation_data,
                                               metric=metric,
                                               mc=mc,
                                               remote_dir=self.remote_dir,
                                               numpy_format=numpy_format)
def train_func(config):
    numpy_format = ray.get(numpy_format_id)
    if isinstance(model_create_func, ModelBuilder):
        trial_model = model_create_func.build(config)
    else:
        trial_model = model_create_func()

    if not numpy_format:
        # dataframe input: copy the shared transformer so the trial can fit it
        global_ft = ray.get(ft_id)
        trial_ft = deepcopy(global_ft)

        imputer = None
        if "imputation" in config:
            if config["imputation"] == "LastFillImpute":
                imputer = LastFillImpute()
            elif config["imputation"] == "FillZeroImpute":
                imputer = FillZeroImpute()

        # handling input
        global_input_df = ray.get(input_data_id)
        trial_input_df = deepcopy(global_input_df)
        if imputer:
            trial_input_df = imputer.impute(trial_input_df)

        config = convert_bayes_configs(config).copy()
        (x_train, y_train) = trial_ft.fit_transform(trial_input_df, **config)

        # handling validation data
        validation_data = None
        if is_val_valid:
            global_validation_df = ray.get(validation_data_id)
            trial_validation_df = deepcopy(global_validation_df)
            validation_data = trial_ft.transform(trial_validation_df)
    else:
        # ndarray input: use x/y directly, no feature transformer needed
        train_data = ray.get(input_data_id)
        x_train, y_train = (train_data["x"], train_data["y"])
        validation_data = None
        if is_val_valid:
            validation_data = ray.get(validation_data_id)
            validation_data = (validation_data["x"], validation_data["y"])
        trial_ft = None

    # no need to call build since it is called the first time fit_eval is called
    # fit model
    best_reward = None
    for i in range(1, 101):
        result = trial_model.fit_eval(x_train,
                                      y_train,
                                      validation_data=validation_data,
                                      mc=mc,
                                      metric=metric,
                                      **config)
        reward = result
        checkpoint_filename = "best.ckpt"
        # Save the checkpoint of the best iteration so far.
        mode = Evaluator.get_metric_mode(metric)
        if mode == "max":
            has_best_reward = best_reward is None or reward > best_reward
        else:
            has_best_reward = best_reward is None or reward < best_reward
        if has_best_reward:
            best_reward = reward
            if isinstance(model_create_func, ModelBuilder):
                trial_model.save(checkpoint_filename)
            else:
                save_zip(checkpoint_filename, trial_ft, trial_model, config)
            # Save to hdfs
            if remote_dir is not None:
                upload_ppl_hdfs(remote_dir, checkpoint_filename)
        report_dict = {"training_iteration": i,
                       metric: reward,
                       "checkpoint": checkpoint_filename,
                       "best_" + metric: best_reward}
        tune.report(**report_dict)
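# The *_id variables read above (ft_id, input_data_id, validation_data_id,
# numpy_format_id) are Ray object refs created once on the driver and fetched
# by every trial. A minimal sketch of that pattern; `_share` and the commented
# names are hypothetical:
import ray

def _share(obj):
    return ray.put(obj)  # store once in the Ray object store

# input_data_id = _share(input_df)
# ... then inside each trial:
# trial_input = deepcopy(ray.get(input_data_id))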
def compile(self,
            data,
            model_create_func,
            recipe,
            validation_data=None,
            search_space=None,
            search_alg=None,
            search_alg_params=None,
            scheduler=None,
            scheduler_params=None,
            feature_transformers=None,
            mc=False,
            metric="mse"):
    """
    Do necessary preparations for the engine.

    :param data: data for training.
           Pandas Dataframe: a Pandas dataframe for training.
           Numpy ndarray: a tuple in the form of (x, y),
               x: ndarray for training input
               y: ndarray for training output
    :param model_create_func: model creation function
    :param recipe: search recipe
    :param validation_data: data for validation, in the same format as data
    :param search_space: search space, required if recipe is not provided
    :param search_alg: str, any searcher supported by ray tune
           (i.e. "variant_generator", "random", "ax", "dragonfly", "skopt",
           "hyperopt", "bayesopt", "bohb", "nevergrad", "optuna", "zoopt"
           and "sigopt")
    :param search_alg_params: extra parameters for the search algorithm
    :param scheduler: str, any scheduler supported by ray tune
    :param scheduler_params: parameters for the scheduler
    :param feature_transformers: feature transformer instance
    :param mc: whether to calculate uncertainty
    :param metric: metric name
    """
    # metric and metric's mode
    self.metric = metric
    self.mode = Evaluator.get_metric_mode(metric)

    # prepare parameters for search engine
    runtime_params = recipe.runtime_params()
    self.num_samples = runtime_params['num_samples']
    stop = dict(runtime_params)
    del stop['num_samples']

    # translate "reward_metric" in the stop criteria into the real metric
    redundant_stop_keys = stop.keys() - {"reward_metric", "training_iteration"}
    assert len(redundant_stop_keys) == 0, \
        f"{redundant_stop_keys} is not expected in stop criteria, " \
        f"only \"reward_metric\" and \"training_iteration\" are expected."
    if "reward_metric" in stop.keys():
        stop[self.metric] = -stop["reward_metric"] \
            if self.mode == "min" else stop["reward_metric"]
        del stop["reward_metric"]
    stop.setdefault("training_iteration", 1)
    self.stopper = TrialStopper(stop=stop, metric=self.metric, mode=self.mode)

    if search_space is None:
        search_space = recipe.search_space()
    self.search_space = search_space
    self._search_alg = RayTuneSearchEngine._set_search_alg(
        search_alg, search_alg_params, recipe, self.metric, self.mode)
    self._scheduler = RayTuneSearchEngine._set_scheduler(
        scheduler, scheduler_params, self.metric, self.mode)

    self.train_func = self._prepare_train_func(
        data=data,
        model_create_func=model_create_func,
        feature_transformers=feature_transformers,
        validation_data=validation_data,
        metric=metric,
        mc=mc,
        remote_dir=self.remote_dir)