def setup_method(self, method):
    """Build a small XGBoost model plus random train/validation frames
    shared by the tests in this class.

    :param method: the test method about to run (pytest setup hook arg).
    """
    self.model = XGBoost(config={
        'n_estimators': 5,
        'max_depth': 2,
        'tree_method': 'hist'
    })
    feature_cols = ["f", "f2"]
    target_col = "t"
    # BUG FIX: np.random.randint(20) returns ONE scalar, which pandas
    # broadcasts into a constant target column; draw one value per row.
    train_df = pd.DataFrame({
        "f": np.random.randn(20),
        "f2": np.random.randn(20),
        "t": np.random.randint(0, 20, size=20)
    })
    val_df = pd.DataFrame({
        "f": np.random.randn(5),
        "f2": np.random.randn(5),
        "t": np.random.randint(0, 5, size=5)
    })
    ft = IdentityTransformer(feature_cols=feature_cols, target_col=target_col)
    self.x, self.y = ft.transform(train_df)
    self.val_x, self.val_y = ft.transform(val_df)
def compile(self, input_df, model_create_func, recipe, feature_cols=None,
            target_col=None, search_space=None, search_alg=None,
            search_alg_params=None, scheduler=None, scheduler_params=None,
            feature_transformers=None, validation_df=None, mc=False,
            metric="mse"):
    """Prepare the Ray Tune engine for a search run.

    Splits the recipe's runtime parameters into a trial budget and stop
    criteria, resolves the search algorithm and trial scheduler, and
    builds the per-trial training function.

    :param input_df: training data frame.
    :param model_create_func: zero-argument factory producing a model.
    :param recipe: search recipe supplying runtime params and search space.
    :param feature_cols: feature column names, used only when no
        feature_transformers is given.
    :param target_col: target column name, used only when no
        feature_transformers is given.
    :param search_space: explicit search space; defaults to the recipe's.
    :param search_alg: name of the search algorithm to use.
    :param search_alg_params: extra arguments for the search algorithm.
    :param scheduler: name of the trial scheduler.
    :param scheduler_params: extra arguments for the scheduler.
    :param feature_transformers: feature transformer; identity transform
        over feature_cols/target_col when omitted.
    :param validation_df: optional validation data frame.
    :param mc: whether to enable Monte Carlo sampling.
    :param metric: evaluation metric name.
    """
    # The recipe's runtime params carry both the trial budget
    # ('num_samples') and the per-trial stopping criteria (the rest).
    runtime_params = recipe.runtime_params()
    self.num_samples = runtime_params['num_samples']
    stop_conf = dict(runtime_params)
    del stop_conf['num_samples']
    self.stop_criteria = stop_conf

    if search_space is None:
        search_space = recipe.search_space(all_available_features=None)

    self._search_alg = RayTuneSearchEngine._set_search_alg(
        search_alg, search_alg_params, recipe, search_space)
    self._scheduler = RayTuneSearchEngine._set_scheduler(
        scheduler, scheduler_params)
    self.search_space = self._prepare_tune_config(search_space)

    if feature_transformers is None:
        feature_transformers = IdentityTransformer(feature_cols, target_col)

    self.train_func = self._prepare_train_func(
        input_df=input_df,
        model_create_func=model_create_func,
        feature_transformers=feature_transformers,
        validation_df=validation_df,
        metric=metric,
        mc=mc,
        remote_dir=self.remote_dir)
def load_xgboost_pipeline(file):
    """Restore a TimeSequencePipeline (identity transformer + XGBoost
    regressor) from a saved checkpoint zip.

    :param file: path of the checkpoint zip to restore from.
    :return: the restored TimeSequencePipeline.
    """
    from zoo.automl.feature.identity_transformer import IdentityTransformer
    from zoo.automl.model import XGBoostRegressor

    transformer = IdentityTransformer()
    regressor = XGBoostRegressor()
    # restore_zip repopulates both objects in place and returns the config.
    restored_config = restore_zip(file, transformer, regressor)
    pipeline = TimeSequencePipeline(feature_transformers=transformer,
                                    model=regressor,
                                    config=restored_config)
    print("Restore pipeline from", file)
    return pipeline
def _hp_search(
    self,
    input_df,
    validation_df,
    config,
    metric,
    recipe,
    mc,
    resources_per_trial,
    remote_dir,
):
    """Run a Ray Tune hyper-parameter search for an XGBoost model and
    return the pipeline built from the best trial.

    :param input_df: training data frame.
    :param validation_df: validation data frame.
    :param config: fixed model configuration applied to every trial.
    :param metric: evaluation metric name.
    :param recipe: search recipe providing the search space.
    :param mc: whether to enable Monte Carlo sampling.
    :param resources_per_trial: per-trial resource dict (e.g. {"cpu": 4}).
    :param remote_dir: remote directory for checkpoints.
    :return: the pipeline produced by _make_pipeline for the best trial.
    """
    def model_create_func():
        # Fresh model per trial; cap xgboost threads to the trial's CPUs.
        _model = XGBoost(model_type=self.model_type, config=config)
        if "cpu" in resources_per_trial:
            _model.set_params(n_jobs=resources_per_trial.get("cpu"))
        return _model

    model = model_create_func()
    ft = IdentityTransformer(self.feature_cols, self.target_col)

    # prepare parameters for search engine
    search_space = recipe.search_space(None)

    from zoo.automl.regression.time_sequence_predictor import \
        TimeSequencePredictor
    metric_mode = TimeSequencePredictor._get_metric_mode(metric)
    searcher = RayTuneSearchEngine(
        logs_dir=self.logs_dir,
        resources_per_trial=resources_per_trial,
        name=self.name,
        remote_dir=remote_dir,
    )
    # BUG FIX: pass the factory itself, not model_create_func() — the
    # engine expects a callable it invokes to build a model per trial.
    searcher.compile(
        input_df,
        model_create_func=model_create_func,
        search_space=search_space,
        recipe=recipe,
        feature_transformers=ft,
        validation_df=validation_df,
        metric=metric,
        metric_mode=metric_mode,
        mc=mc,
    )
    # searcher.test_run()
    analysis = searcher.run()
    pipeline = self._make_pipeline(analysis,
                                   metric_mode,
                                   feature_transformers=ft,
                                   model=model,
                                   remote_dir=remote_dir)
    return pipeline
def _hp_search(self, input_df, validation_df, config, metric, recipe, mc,
               resources_per_trial, remote_dir):
    """Run a Ray Tune hyper-parameter search over XGBoostRegressor and
    return the pipeline built from the best trial.

    :param input_df: training data frame.
    :param validation_df: validation data frame.
    :param config: fixed model configuration applied to every trial.
    :param metric: evaluation metric name.
    :param recipe: search recipe providing search space, runtime params,
        search algorithm and fixed params.
    :param mc: whether to enable Monte Carlo sampling.
    :param resources_per_trial: per-trial resource specification.
    :param remote_dir: remote directory for checkpoints.
    :return: the pipeline produced by _make_pipeline for the best trial.
    """
    def model_create_func():
        model = XGBoostRegressor(config)
        # NOTE(review): n_jobs receives the whole resources_per_trial
        # value; the sibling _hp_search uses resources_per_trial.get("cpu")
        # — confirm whether this variant expects a plain int here.
        model.set_params(n_jobs=resources_per_trial)
        return model

    model = model_create_func()
    ft = IdentityTransformer(self.feature_cols, self.target_col)

    # prepare parameters for search engine: split the recipe's runtime
    # params into trial budget (num_samples) and stop criteria (the rest).
    search_space = recipe.search_space(None)
    runtime_params = recipe.runtime_params()
    num_samples = runtime_params['num_samples']
    stop = dict(runtime_params)
    search_algorithm_params = recipe.search_algorithm_params()
    search_algorithm = recipe.search_algorithm()
    fixed_params = recipe.fixed_params()
    del stop['num_samples']

    from zoo.automl.regression.time_sequence_predictor import \
        TimeSequencePredictor
    metric_mode = TimeSequencePredictor._get_metric_mode(metric)
    searcher = RayTuneSearchEngine(
        logs_dir=self.logs_dir,
        resources_per_trial=resources_per_trial,
        name=self.name,
        remote_dir=remote_dir,
    )
    # BUG FIX: pass the factory itself, not model_create_func() — the
    # engine expects a callable it invokes to build a model per trial.
    searcher.compile(input_df,
                     model_create_func=model_create_func,
                     search_space=search_space,
                     stop=stop,
                     search_algorithm_params=search_algorithm_params,
                     search_algorithm=search_algorithm,
                     fixed_params=fixed_params,
                     feature_transformers=ft,
                     validation_df=validation_df,
                     metric=metric,
                     metric_mode=metric_mode,
                     mc=mc,
                     num_samples=num_samples)
    # searcher.test_run()
    analysis = searcher.run()
    pipeline = self._make_pipeline(analysis,
                                   metric_mode,
                                   feature_transformers=ft,
                                   model=model,
                                   remote_dir=remote_dir)
    return pipeline
def create_feature_transformer(self):
    """Return an IdentityTransformer wired to this object's configured
    feature and target columns."""
    return IdentityTransformer(self.feature_cols, self.target_col)
def compile(self, data, model_create_func, recipe, search_space=None, search_alg=None, search_alg_params=None, scheduler=None, scheduler_params=None, feature_transformers=None, mc=False, metric="mse"): """ Do necessary preparations for the engine :param input_df: :param search_space: :param num_samples: :param stop: :param search_algorithm: :param search_algorithm_params: :param fixed_params: :param feature_transformers: :param model: :param validation_df: :param metric: :return: """ # data mode detection assert isinstance( data, dict), 'ERROR: Argument \'data\' should be a dictionary.' data_mode = None # data_mode can only be 'dataframe' or 'ndarray' data_schema = set(data.keys()) if set(["df"]).issubset(data_schema): data_mode = 'dataframe' if set(["x", "y"]).issubset(data_schema): data_mode = 'ndarray' assert data_mode in ['dataframe', 'ndarray'],\ 'ERROR: Argument \'data\' should fit either \ dataframe schema (include \'df\' in keys) or\ ndarray (include \'x\' and \'y\' in keys) schema.' 
# data extract if data_mode == 'dataframe': input_df = data['df'] feature_cols = data.get("feature_cols", None) target_col = data.get("target_col", None) validation_df = data.get("val_df", None) else: if data["x"].ndim == 1: data["x"] = data["x"].reshape(-1, 1) if data["y"].ndim == 1: data["y"] = data["y"].reshape(-1, 1) if "val_x" in data.keys() and data["val_x"].ndim == 1: data["val_x"] = data["val_x"].reshape(-1, 1) if "val_y" in data.keys() and data["val_y"].ndim == 1: data["val_y"] = data["val_y"].reshape(-1, 1) input_data = {"x": data["x"], "y": data["y"]} if 'val_x' in data.keys(): validation_data = {"x": data["val_x"], "y": data["val_y"]} else: validation_data = None # prepare parameters for search engine runtime_params = recipe.runtime_params() self.num_samples = runtime_params['num_samples'] stop = dict(runtime_params) del stop['num_samples'] self.stop_criteria = stop if search_space is None: search_space = recipe.search_space(all_available_features=None) self._search_alg = RayTuneSearchEngine._set_search_alg( search_alg, search_alg_params, recipe, search_space) self._scheduler = RayTuneSearchEngine._set_scheduler( scheduler, scheduler_params) self.search_space = self._prepare_tune_config(search_space) if feature_transformers is None and data_mode == 'dataframe': feature_transformers = IdentityTransformer(feature_cols, target_col) if data_mode == 'dataframe': self.train_func = self._prepare_train_func( input_data=input_df, model_create_func=model_create_func, feature_transformers=feature_transformers, validation_data=validation_df, metric=metric, mc=mc, remote_dir=self.remote_dir, numpy_format=False) else: self.train_func = self._prepare_train_func( input_data=input_data, model_create_func=model_create_func, feature_transformers=None, validation_data=validation_data, metric=metric, mc=mc, remote_dir=self.remote_dir, numpy_format=True)
def compile(self, input_df, model_create_func, recipe, feature_cols=None,
            target_col=None, search_space=None, feature_transformers=None,
            validation_df=None, mc=False, metric="mse"):
    """
    Do necessary preparations for the engine.

    Reads the stop criteria, sample budget, scheduler and search-algorithm
    choices from the recipe, instantiates the corresponding Ray Tune
    objects, and builds the per-trial training function.

    :param input_df: training data frame.
    :param model_create_func: zero-argument factory producing a model.
    :param recipe: search recipe supplying runtime params, search space,
        search algorithm, fixed params and scheduler algorithm.
    :param feature_cols: feature column names, used only when no
        feature_transformers is given.
    :param target_col: target column name, used only when no
        feature_transformers is given.
    :param search_space: explicit search space; defaults to the recipe's.
    :param feature_transformers: feature transformer; identity transform
        over feature_cols/target_col when omitted.
    :param validation_df: optional validation data frame.
    :param mc: whether to enable Monte Carlo sampling.
    :param metric: evaluation metric name.
    """
    # prepare parameters for search engine: split the recipe's runtime
    # params into trial budget (num_samples) and stop criteria (the rest).
    runtime_params = recipe.runtime_params()
    num_samples = runtime_params['num_samples']
    stop = dict(runtime_params)
    search_algorithm_params = recipe.search_algorithm_params()
    search_algorithm = recipe.search_algorithm()
    fixed_params = recipe.fixed_params()
    schedule_algorithm = recipe.scheduler_algorithm()
    del stop['num_samples']
    if search_space is None:
        search_space = recipe.search_space(all_available_features=None)
    self.search_space = self._prepare_tune_config(search_space)
    self.stop_criteria = stop
    self.num_samples = num_samples
    # Trial scheduler: early-stop poor trials with AsyncHyperBand when the
    # recipe asks for it, otherwise run trials in plain FIFO order.
    if schedule_algorithm == 'AsyncHyperBand':
        from ray.tune.schedulers import AsyncHyperBandScheduler
        self.sched = AsyncHyperBandScheduler(
            time_attr="training_iteration",
            metric="reward_metric",
            mode="max",
            max_t=50,
            grace_period=1,
            reduction_factor=3,
            brackets=3,
        )
    else:
        from ray.tune.schedulers import FIFOScheduler
        self.sched = FIFOScheduler()
    # Search algorithm: BayesOpt, SkOpt, or None (Ray Tune's default
    # random/grid sampling).
    if search_algorithm == 'BayesOpt':
        self.search_algorithm = BayesOptSearch(
            self.search_space,
            metric="reward_metric",
            mode="max",
            utility_kwargs=search_algorithm_params["utility_kwargs"])
    elif search_algorithm == 'SkOpt':
        from skopt import Optimizer
        from ray.tune.suggest.skopt import SkOptSearch
        opt_params = recipe.opt_params()
        optimizer = Optimizer(opt_params)
        self.search_algorithm = SkOptSearch(
            optimizer,
            list(self.search_space.keys()),
            metric="reward_metric",
            mode="max",
        )
    else:
        self.search_algorithm = None
    self.fixed_params = fixed_params
    if feature_transformers is None:
        feature_transformers = \
            IdentityTransformer(feature_cols, target_col)
    self.train_func = self._prepare_train_func(
        input_df=input_df,
        model_create_func=model_create_func,
        feature_transformers=feature_transformers,
        validation_df=validation_df,
        metric=metric,
        mc=mc,
        remote_dir=self.remote_dir)