예제 #1
0
    def setup_method(self, method):
        """Create a small XGBoost model and random train/validation fixtures.

        Stores the model in ``self.model`` and the transformed data in
        ``self.x``/``self.y`` (20 rows) and ``self.val_x``/``self.val_y``
        (5 rows).

        :param method: not used by this setup.
        """
        # super().setup_method(method)
        self.model = XGBoost(config={
            'n_estimators': 5,
            'max_depth': 2,
            'tree_method': 'hist'
        })
        feature_cols = ["f", "f2"]
        target_col = "t"
        train_rows, val_rows = 20, 5
        # ``size`` is required here: without it randint returns one scalar,
        # which pandas broadcasts so that every target value is identical.
        train_df = pd.DataFrame({
            "f": np.random.randn(train_rows),
            "f2": np.random.randn(train_rows),
            "t": np.random.randint(20, size=train_rows)
        })
        val_df = pd.DataFrame({
            "f": np.random.randn(val_rows),
            "f2": np.random.randn(val_rows),
            "t": np.random.randint(5, size=val_rows)
        })

        ft = IdentityTransformer(feature_cols=feature_cols,
                                 target_col=target_col)

        self.x, self.y = ft.transform(train_df)
        self.val_x, self.val_y = ft.transform(val_df)
    def compile(self,
                input_df,
                model_create_func,
                recipe,
                feature_cols=None,
                target_col=None,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                feature_transformers=None,
                validation_df=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param input_df: training data, forwarded to the train function
        :param model_create_func: forwarded to the train function
        :param recipe: supplies runtime_params() and, when search_space is
            None, search_space(); also forwarded to _set_search_alg
        :param feature_cols: used to build the default IdentityTransformer
        :param target_col: used to build the default IdentityTransformer
        :param search_space: search space; taken from the recipe when None
        :param search_alg: forwarded to _set_search_alg
        :param search_alg_params: forwarded to _set_search_alg
        :param scheduler: forwarded to _set_scheduler
        :param scheduler_params: forwarded to _set_scheduler
        :param feature_transformers: transformer to use; an
            IdentityTransformer is created when None
        :param validation_df: validation data, forwarded to the train function
        :param mc: forwarded to the train function
        :param metric: forwarded to the train function
        :return: None, all results are stored as attributes on self
        """

        # the recipe's runtime parameters hold the sample count; every
        # other entry is kept as the stop criteria
        runtime_conf = recipe.runtime_params()
        self.num_samples = runtime_conf['num_samples']
        remaining = dict(runtime_conf)
        remaining.pop('num_samples')
        self.stop_criteria = remaining

        space = search_space
        if space is None:
            space = recipe.search_space(all_available_features=None)

        engine_cls = RayTuneSearchEngine
        self._search_alg = engine_cls._set_search_alg(
            search_alg, search_alg_params, recipe, space)
        self._scheduler = engine_cls._set_scheduler(
            scheduler, scheduler_params)
        self.search_space = self._prepare_tune_config(space)

        transformers = feature_transformers
        if transformers is None:
            transformers = IdentityTransformer(feature_cols, target_col)

        self.train_func = self._prepare_train_func(
            input_df=input_df,
            model_create_func=model_create_func,
            feature_transformers=transformers,
            validation_df=validation_df,
            metric=metric,
            mc=mc,
            remote_dir=self.remote_dir)
예제 #3
0
def load_xgboost_pipeline(file):
    """Restore a TimeSequencePipeline that wraps an XGBoostRegressor.

    :param file: forwarded to restore_zip as the source to restore from
    :return: TimeSequencePipeline built from a new transformer, a new model
        and the config returned by restore_zip
    """
    from zoo.automl.feature.identity_transformer import IdentityTransformer
    from zoo.automl.model import XGBoostRegressor
    transformer = IdentityTransformer()
    regressor = XGBoostRegressor()

    # presumably restore_zip loads the saved state into the two objects in
    # place and returns the stored config -- verify against restore_zip
    restored_config = restore_zip(file, transformer, regressor)
    restored_pipeline = TimeSequencePipeline(
        feature_transformers=transformer,
        model=regressor,
        config=restored_config)
    print("Restore pipeline from", file)
    return restored_pipeline
예제 #4
0
    def _hp_search(self, input_df, validation_df, config, metric, recipe, mc,
                   resources_per_trial, remote_dir):
        """Run a RayTuneSearchEngine search and build a pipeline from it.

        :param input_df: training data handed to the search engine
        :param validation_df: validation data handed to the search engine
        :param config: config passed to every XGBoost model created here
        :param metric: metric name; its mode is looked up through
            TimeSequencePredictor._get_metric_mode
        :param recipe: supplies the search space and is forwarded to the
            search engine
        :param mc: forwarded to the search engine
        :param resources_per_trial: forwarded to the search engine; when it
            contains a "cpu" entry that value is set as the model's n_jobs
        :param remote_dir: forwarded to the search engine and _make_pipeline
        :return: the pipeline produced by self._make_pipeline
        """
        def build_model():
            xgb_model = XGBoost(model_type=self.model_type, config=config)
            if "cpu" in resources_per_trial:
                cpu_count = resources_per_trial.get("cpu")
                xgb_model.set_params(n_jobs=cpu_count)
            return xgb_model

        pipeline_model = build_model()
        transformer = IdentityTransformer(self.feature_cols, self.target_col)

        # the search space comes straight from the recipe
        space = recipe.search_space(None)

        from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
        metric_mode = TimeSequencePredictor._get_metric_mode(metric)

        engine = RayTuneSearchEngine(
            logs_dir=self.logs_dir,
            resources_per_trial=resources_per_trial,
            name=self.name,
            remote_dir=remote_dir,
        )
        # NOTE(review): a freshly built model instance, not the factory
        # itself, is passed as model_create_func -- confirm that this is
        # what compile expects.
        engine.compile(
            input_df,
            model_create_func=build_model(),
            search_space=space,
            recipe=recipe,
            feature_transformers=transformer,
            validation_df=validation_df,
            metric=metric,
            metric_mode=metric_mode,
            mc=mc,
        )
        analysis = engine.run()

        return self._make_pipeline(analysis,
                                   metric_mode,
                                   feature_transformers=transformer,
                                   model=pipeline_model,
                                   remote_dir=remote_dir)
    def _hp_search(self, input_df, validation_df, config, metric, recipe, mc,
                   resources_per_trial, remote_dir):
        """Run a RayTuneSearchEngine search and build a pipeline from it.

        :param input_df: training data handed to the search engine
        :param validation_df: validation data handed to the search engine
        :param config: config passed to every XGBoostRegressor created here
        :param metric: metric name; its mode is looked up through
            TimeSequencePredictor._get_metric_mode
        :param recipe: supplies search space, runtime params, search
            algorithm (and its params) and fixed params
        :param mc: forwarded to the search engine
        :param resources_per_trial: forwarded to the search engine and set
            as the model's n_jobs
        :param remote_dir: forwarded to the search engine and _make_pipeline
        :return: the pipeline produced by self._make_pipeline
        """
        def build_model():
            regressor = XGBoostRegressor(config)
            # NOTE(review): the whole resources_per_trial object is used as
            # n_jobs -- confirm callers pass a plain CPU count here.
            regressor.set_params(n_jobs=resources_per_trial)
            return regressor

        pipeline_model = build_model()
        transformer = IdentityTransformer(self.feature_cols, self.target_col)

        # gather everything the search engine needs from the recipe
        space = recipe.search_space(None)
        runtime_conf = recipe.runtime_params()
        sample_count = runtime_conf['num_samples']
        stop_criteria = dict(runtime_conf)
        alg_params = recipe.search_algorithm_params()
        alg_name = recipe.search_algorithm()
        fixed = recipe.fixed_params()
        # whatever remains besides the sample count is the stop criteria
        stop_criteria.pop('num_samples')

        from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
        metric_mode = TimeSequencePredictor._get_metric_mode(metric)

        engine = RayTuneSearchEngine(
            logs_dir=self.logs_dir,
            resources_per_trial=resources_per_trial,
            name=self.name,
            remote_dir=remote_dir,
        )
        # NOTE(review): a freshly built model instance, not the factory
        # itself, is passed as model_create_func -- confirm that this is
        # what compile expects.
        engine.compile(input_df,
                       model_create_func=build_model(),
                       search_space=space,
                       stop=stop_criteria,
                       search_algorithm_params=alg_params,
                       search_algorithm=alg_name,
                       fixed_params=fixed,
                       feature_transformers=transformer,
                       validation_df=validation_df,
                       metric=metric,
                       metric_mode=metric_mode,
                       mc=mc,
                       num_samples=sample_count)
        analysis = engine.run()

        return self._make_pipeline(analysis,
                                   metric_mode,
                                   feature_transformers=transformer,
                                   model=pipeline_model,
                                   remote_dir=remote_dir)
예제 #6
0
 def create_feature_transformer(self):
     """Return an IdentityTransformer for self.feature_cols/self.target_col."""
     return IdentityTransformer(self.feature_cols, self.target_col)
    def compile(self,
                data,
                model_create_func,
                recipe,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                feature_transformers=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param data: dict in one of two layouts. Dataframe layout: key 'df'
            plus optional 'feature_cols', 'target_col' and 'val_df'.
            Ndarray layout: keys 'x' and 'y' plus optional 'val_x' and
            'val_y'. If both layouts match, the ndarray one is used. In
            ndarray mode 1-d arrays are reshaped to one column and written
            back into this dict.
        :param model_create_func: forwarded to the train function
        :param recipe: supplies runtime_params() and, when search_space is
            None, search_space(); also forwarded to _set_search_alg
        :param search_space: search space; taken from the recipe when None
        :param search_alg: forwarded to _set_search_alg
        :param search_alg_params: forwarded to _set_search_alg
        :param scheduler: forwarded to _set_scheduler
        :param scheduler_params: forwarded to _set_scheduler
        :param feature_transformers: only used in dataframe mode, where an
            IdentityTransformer is created when it is None
        :param mc: forwarded to the train function
        :param metric: forwarded to the train function
        :return: None, all results are stored as attributes on self
        """

        # data mode detection
        assert isinstance(
            data, dict), "ERROR: Argument 'data' should be a dictionary."
        keys = set(data.keys())
        data_mode = None  # stays None unless one of the layouts matches
        if "df" in keys:
            data_mode = 'dataframe'
        if keys >= {"x", "y"}:
            # checked last, so the ndarray layout wins when both match
            data_mode = 'ndarray'
        assert data_mode in ['dataframe', 'ndarray'],\
            'ERROR: Argument \'data\' should fit either \
                dataframe schema (include \'df\' in keys) or\
                     ndarray (include \'x\' and \'y\' in keys) schema.'

        # data extract
        is_dataframe = data_mode == 'dataframe'
        if is_dataframe:
            input_df = data['df']
            feature_cols = data.get("feature_cols")
            target_col = data.get("target_col")
            validation_df = data.get("val_df")
        else:
            # turn every 1-d array that is present into a single column
            for key in ("x", "y", "val_x", "val_y"):
                if key in data and data[key].ndim == 1:
                    data[key] = data[key].reshape(-1, 1)

            input_data = {"x": data["x"], "y": data["y"]}
            validation_data = ({"x": data["val_x"], "y": data["val_y"]}
                               if 'val_x' in data else None)

        # the recipe's runtime parameters hold the sample count; every
        # other entry is kept as the stop criteria
        runtime_conf = recipe.runtime_params()
        self.num_samples = runtime_conf['num_samples']
        remaining = dict(runtime_conf)
        remaining.pop('num_samples')
        self.stop_criteria = remaining

        space = search_space
        if space is None:
            space = recipe.search_space(all_available_features=None)

        engine_cls = RayTuneSearchEngine
        self._search_alg = engine_cls._set_search_alg(
            search_alg, search_alg_params, recipe, space)
        self._scheduler = engine_cls._set_scheduler(
            scheduler, scheduler_params)
        self.search_space = self._prepare_tune_config(space)

        # collect the arguments that differ between the two data modes
        if is_dataframe:
            transformers = feature_transformers
            if transformers is None:
                transformers = IdentityTransformer(feature_cols, target_col)
            mode_kwargs = dict(input_data=input_df,
                               feature_transformers=transformers,
                               validation_data=validation_df,
                               numpy_format=False)
        else:
            mode_kwargs = dict(input_data=input_data,
                               feature_transformers=None,
                               validation_data=validation_data,
                               numpy_format=True)

        self.train_func = self._prepare_train_func(
            model_create_func=model_create_func,
            metric=metric,
            mc=mc,
            remote_dir=self.remote_dir,
            **mode_kwargs)
예제 #8
0
    def compile(self,
                input_df,
                model_create_func,
                recipe,
                feature_cols=None,
                target_col=None,
                search_space=None,
                feature_transformers=None,
                validation_df=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param input_df: training data, forwarded to the train function
        :param model_create_func: forwarded to the train function
        :param recipe: supplies runtime_params(), search_algorithm_params(),
            search_algorithm(), fixed_params(), scheduler_algorithm(),
            search_space() when search_space is None, and opt_params() for
            the 'SkOpt' search algorithm
        :param feature_cols: used to build the default IdentityTransformer
        :param target_col: used to build the default IdentityTransformer
        :param search_space: search space; taken from the recipe when None
        :param feature_transformers: transformer to use; an
            IdentityTransformer is created when None
        :param validation_df: validation data, forwarded to the train function
        :param mc: forwarded to the train function
        :param metric: forwarded to the train function
        :return: None, all results are stored as attributes on self
        """

        # gather everything the engine needs from the recipe
        runtime_conf = recipe.runtime_params()
        sample_count = runtime_conf['num_samples']
        stop_criteria = dict(runtime_conf)
        alg_params = recipe.search_algorithm_params()
        alg_name = recipe.search_algorithm()
        fixed = recipe.fixed_params()
        scheduler_name = recipe.scheduler_algorithm()
        # whatever remains besides the sample count is the stop criteria
        stop_criteria.pop('num_samples')

        space = search_space
        if space is None:
            space = recipe.search_space(all_available_features=None)
        self.search_space = self._prepare_tune_config(space)
        self.stop_criteria = stop_criteria
        self.num_samples = sample_count

        # scheduler: AsyncHyperBand on request, FIFO for anything else
        if scheduler_name != 'AsyncHyperBand':
            from ray.tune.schedulers import FIFOScheduler
            self.sched = FIFOScheduler()
        else:
            from ray.tune.schedulers import AsyncHyperBandScheduler
            self.sched = AsyncHyperBandScheduler(
                time_attr="training_iteration",
                metric="reward_metric",
                mode="max",
                max_t=50,
                grace_period=1,
                reduction_factor=3,
                brackets=3,
            )

        # search algorithm: stays None unless the recipe names a known one
        chosen_alg = None
        if alg_name == 'BayesOpt':
            chosen_alg = BayesOptSearch(
                self.search_space,
                metric="reward_metric",
                mode="max",
                utility_kwargs=alg_params["utility_kwargs"])
        elif alg_name == 'SkOpt':
            from skopt import Optimizer
            from ray.tune.suggest.skopt import SkOptSearch
            skopt_optimizer = Optimizer(recipe.opt_params())
            chosen_alg = SkOptSearch(
                skopt_optimizer,
                list(self.search_space.keys()),
                metric="reward_metric",
                mode="max",
            )
        self.search_algorithm = chosen_alg

        self.fixed_params = fixed

        transformers = feature_transformers
        if transformers is None:
            transformers = IdentityTransformer(feature_cols, target_col)

        self.train_func = self._prepare_train_func(
            input_df=input_df,
            model_create_func=model_create_func,
            feature_transformers=transformers,
            validation_df=validation_df,
            metric=metric,
            mc=mc,
            remote_dir=self.remote_dir)