Example #1
 def evaluate_joint_solution(self):
     # Update the joint incumbent from FE and HPO.
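     # The evaluator is built from the current HPO incumbent (configuration) and the
     # FE incumbent (data node), then called with that configuration; the call returns
     # a scalar validation score that is compared against the global incumbent below.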
     _perf = None
     try:
         with time_limit(600):
             if self.task_type in CLS_TASKS:
                 _perf = ClassificationEvaluator(
                     self.local_inc['hpo'],
                     data_node=self.local_inc['fe'],
                     scorer=self.metric,
                     name='fe',
                     resampling_strategy=self.evaluation_type,
                     seed=self.seed)(self.local_inc['hpo'])
             else:
                 _perf = RegressionEvaluator(
                     self.local_inc['hpo'],
                     data_node=self.local_inc['fe'],
                     scorer=self.metric,
                     name='fe',
                     resampling_strategy=self.evaluation_type,
                     seed=self.seed)(self.local_inc['hpo'])
     except Exception as e:
         self.logger.error(str(e))
     # Update INC.
     if _perf is not None and np.isfinite(
             _perf) and _perf > self.incumbent_perf:
         self.inc['hpo'] = self.local_inc['hpo']
         self.inc['fe'] = self.local_inc['fe']
         self.incumbent_perf = _perf
def test_evaluator():
    config = {'colsample_bytree': 0.7214005546233202,
              'estimator': 'lightgbm',
              'learning_rate': 0.20740875048979773,
              'min_child_weight': 5,
              'n_estimators': 424,
              'num_leaves': 82,
              'reg_alpha': 0.001268145413023973,
              'reg_lambda': 0.15002466116267585,
              'subsample': 0.8110820196868197}
    config.pop('estimator', None)
    gbm = LightGBMRegressor(**config)
    scorer = make_scorer(smape, greater_is_better=False)
    raw_data, _ = fetch_data(task_id)
    evaluator = RegressionEvaluator(None, scorer, data_node=raw_data, name='fe', seed=1, estimator=gbm)
    print(evaluator(None))
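
# Note: `smape` and `fetch_data` used above are project-level helpers that are not
# shown in this example. A minimal sketch of a SMAPE metric compatible with
# make_scorer(smape, greater_is_better=False) (lower is better) might look like:
import numpy as np

def smape(y_true, y_pred):
    # Symmetric mean absolute percentage error, reported in percent.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom = np.where(denom == 0, 1.0, denom)  # avoid division by zero
    return float(np.mean(np.abs(y_pred - y_true) / denom) * 100.0)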
def evaluation_based_feature_engineering(time_limit, seed=1):
    if task_id == 3 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.556390018826356,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.027650212980431577,
                  'min_child_weight': 4,
                  'n_estimators': 1000,  # 2493,
                  'num_leaves': 818,
                  'reg_alpha': 0.00012695064964599962,
                  'reg_lambda': 0.0006320421481400761,
                  'subsample': 0.5611631795995178}
    elif task_id == 1 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.5836692544286752,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.025011125056624308,
                  'min_child_weight': 3,
                  'n_estimators': 1000,  # 2000,
                  'num_leaves': 958,
                  'reg_alpha': 0.00025307513851761005,
                  'reg_lambda': 0.01911305077512719,
                  'subsample': 0.7850946965061745
                  }
    elif task_id == 3 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.07215105304885769,
                  'n_estimators': 10000,
                  'min_child_samples': 7,
                  'max_depth': 8,
                  'reg_lambda': 4.084654778260157e-06,
                  'subsample': 0.9998568450178255
                  }
    elif task_id == 1 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.030167431274216235,
                  'n_estimators': 10000,
                  'min_child_samples': 2,
                  'max_depth': 11,
                  'reg_lambda': 0.00010924008880152775,
                  'subsample': 0.9996005646983249
                  }
    else:
        raise ValueError("Hyperparameters not available!")

    config.pop('estimator', None)
    if regressor_id == 'lightgbm':
        estimator = LightGBMRegressor(**config)
    elif 'catboost' in regressor_id:
        estimator = CatBoostRegressor(**config)
    scorer = make_scorer(smape, greater_is_better=False)
    evaluator = RegressionEvaluator(None, scorer, name='fe', seed=seed, estimator=estimator)
    train_data, test_data = fetch_data(task_id)

    X, y = train_data.data
    idxs = np.arange(X.shape[0])
    np.random.shuffle(idxs)
    sample_size = int(X.shape[0] * train_size)
    subset_ids = idxs[:sample_size]
    X, y = X.iloc[subset_ids, :], y[subset_ids]
    train_data.data = [X, y]
    print(train_data)
    """
    nystronem_sampler: 15 bad
    kitchen_sinks: 13 bad
    random_trees_embedding: 18 bad
    feature_agglomeration_decomposer: 11 timeout.
    """
    # TODO: fast_ica, kernel_pca, and polynomial_features.
    # trans_used = [0, 3, 4, 5, 12, 16, 19, 30, 31, 32]
    # trans_used = [0, 3, 4, 5, 10, 11, 12, 16, 17, 19]
    # trans_used = [17, 30, 31]
    # trans_used = [30]
    pipeline = FEPipeline(task_type='regression', task_id='anti_plague',
                          fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id='lightgbm',
                          time_limit_per_trans=900,
                          trans_set=None
                          )
    transformed_train_data = pipeline.fit_transform(train_data)
    print(pipeline.optimizer.get_incumbent_path())
    print('final train data shape & score', transformed_train_data.shape, transformed_train_data.score)
    transformed_test_datanode = pipeline.transform(test_data)
    print('final test data shape', transformed_test_datanode.shape)

    # Save results.
    np.save(data_dir + 'data/transformed_train_x-%d.csv' % task_id, transformed_train_data.data[0])
    np.save(data_dir + 'data/transformed_train_y-%d.csv' % task_id, transformed_train_data.data[1])
    np.save(data_dir + 'data/transformed_test-%d.csv' % task_id, transformed_test_datanode.data[0])
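    # Note: np.save stores NumPy binary data and appends a '.npy' suffix when the target
    # path does not already end in '.npy', so the files above are written as .npy files
    # despite their '.csv' names.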
        cs = _estimators[algo].get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", algo)
        cs.add_hyperparameter(model)
        default_hpo_config = cs.get_default_configuration()

        if task == 'cls':
            fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                                   name='fe', resampling_strategy=eval_type,
                                                   seed=1)
            hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                                    data_node=train_data, name='hpo',
                                                    resampling_strategy=eval_type,
                                                    seed=1)
        else:
            fe_evaluator = RegressionEvaluator(default_hpo_config, scorer=metric,
                                               name='fe', resampling_strategy=eval_type,
                                               seed=1)
            hpo_evaluator = RegressionEvaluator(default_hpo_config, scorer=metric,
                                                data_node=train_data, name='hpo',
                                                resampling_strategy=eval_type,
                                                seed=1)

        fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION if task == 'cls' else REGRESSION,
                                                     input_data=train_data,
                                                     evaluator=fe_evaluator,
                                                     model_id=algo,
                                                     time_limit_per_trans=600,
                                                     mem_limit_per_trans=5120,
                                                     number_of_unit_resource=10,
                                                     seed=1)
        hpo_optimizer = SMACOptimizer(evaluator=hpo_evaluator,
Example #5
    def prepare_optimizer(self, _arm):
        if _arm == 'fe':
            # Build the Feature Engineering component.
            self.original_data._node_id = -1
            inc_hpo = copy.deepcopy(self.inc['hpo'])
            if self.task_type in CLS_TASKS:
                fe_evaluator = ClassificationEvaluator(
                    inc_hpo,
                    scorer=self.metric,
                    name='fe',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed)
            elif self.task_type in REG_TASKS:
                fe_evaluator = RegressionEvaluator(
                    inc_hpo,
                    scorer=self.metric,
                    name='fe',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed)
            else:
                raise ValueError('Invalid task type!')
            self.optimizer[_arm] = build_fe_optimizer(
                self.fe_algo,
                self.evaluation_type,
                self.task_type,
                self.inc['fe'],
                fe_evaluator,
                self.estimator_id,
                self.per_run_time_limit,
                self.per_run_mem_limit,
                self.seed,
                shared_mode=self.share_fe,
                n_jobs=self.n_jobs)
        else:
            # trials_per_iter = self.optimizer['fe'].evaluation_num_last_iteration // 2
            # trials_per_iter = max(20, trials_per_iter)
            trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
            if self.task_type in CLS_TASKS:
                hpo_evaluator = ClassificationEvaluator(
                    self.default_config,
                    scorer=self.metric,
                    data_node=self.inc['fe'].copy_(),
                    name='hpo',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed)
            elif self.task_type in REG_TASKS:
                hpo_evaluator = RegressionEvaluator(
                    self.default_config,
                    scorer=self.metric,
                    data_node=self.inc['fe'].copy_(),
                    name='hpo',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed)
            else:
                raise ValueError('Invalid task type!')

            self.optimizer[_arm] = build_hpo_optimizer(
                self.evaluation_type,
                hpo_evaluator,
                self.config_space,
                output_dir=self.output_dir,
                per_run_time_limit=self.per_run_time_limit,
                trials_per_iter=trials_per_iter,
                seed=self.seed)

        self.logger.debug('=' * 30)
        self.logger.debug('UPDATE OPTIMIZER: %s' % _arm)
        self.logger.debug('=' * 30)
Example #6
    def __init__(self,
                 task_type,
                 estimator_id: str,
                 data: DataNode,
                 metric,
                 share_fe=False,
                 output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 dataset_id='default',
                 eval_type='holdout',
                 mth='rb',
                 sw_size=3,
                 n_jobs=1,
                 seed=1,
                 fe_algo='tree_based',
                 enable_intersection=True,
                 number_of_unit_resource=2,
                 total_resource=30):
        self.task_type = task_type
        self.metric = metric
        self.number_of_unit_resource = number_of_unit_resource
        # One unit of resource, i.e., the number of trials per iteration.
        self.one_unit_of_resource = 5
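        # With the defaults above (one_unit_of_resource=5, number_of_unit_resource=2),
        # each HPO iteration is therefore given 5 * 2 = 10 trials (see trials_per_iter below).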
        self.total_resource = total_resource
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.estimator_id = estimator_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.n_jobs = n_jobs
        self.mth = mth
        self.seed = seed
        self.sliding_window_size = sw_size
        task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
        self.logger = get_logger(self.__class__.__name__ + '-' + task_id)
        np.random.seed(self.seed)

        # Bandit settings.
        # self.arms = ['fe', 'hpo']
        self.arms = ['hpo', 'fe']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()
        # Global incumbent.
        self.inc = dict()
        self.local_inc = dict()
        self.local_hist = {'fe': [], 'hpo': []}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_perf = float("-INF")
        self.early_stopped_flag = False
        self.enable_intersection = enable_intersection

        # Fetch hyperparameter space.
        if self.task_type in CLS_TASKS:
            from solnml.components.models.classification import _classifiers, _addons
            if estimator_id in _classifiers:
                clf_class = _classifiers[estimator_id]
            elif estimator_id in _addons.components:
                clf_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = clf_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        elif self.task_type in REG_TASKS:
            from solnml.components.models.regression import _regressors, _addons
            if estimator_id in _regressors:
                reg_class = _regressors[estimator_id]
            elif estimator_id in _addons.components:
                reg_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = reg_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        else:
            raise ValueError("Unknown task type %s!" % self.task_type)

        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(
                self.default_config,
                scorer=self.metric,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
            hpo_evaluator = ClassificationEvaluator(
                self.default_config,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
        elif self.task_type in REG_TASKS:
            fe_evaluator = RegressionEvaluator(
                self.default_config,
                scorer=self.metric,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
            hpo_evaluator = RegressionEvaluator(
                self.default_config,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
        else:
            raise ValueError('Invalid task type!')

        self.fe_algo = fe_algo
        self.optimizer['fe'] = build_fe_optimizer(self.fe_algo,
                                                  self.evaluation_type,
                                                  self.task_type,
                                                  self.original_data,
                                                  fe_evaluator,
                                                  estimator_id,
                                                  per_run_time_limit,
                                                  per_run_mem_limit,
                                                  self.seed,
                                                  shared_mode=self.share_fe,
                                                  n_jobs=n_jobs)

        self.inc['fe'] = self.original_data
        self.local_inc['fe'] = self.original_data

        # Build the HPO component.
        # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

        self.optimizer['hpo'] = build_hpo_optimizer(
            self.evaluation_type,
            hpo_evaluator,
            cs,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            trials_per_iter=trials_per_iter,
            seed=self.seed,
            n_jobs=n_jobs)

        self.inc['hpo'] = self.default_config
        self.local_inc['hpo'] = self.default_config
        self.init_config = cs.get_default_configuration()
        self.local_hist['fe'].append(self.original_data)
        self.local_hist['hpo'].append(self.default_config)