示例#1
0
    def fit(self, datanode, solvers=None):
        """Fit and pickle each base model selected by ``self.base_model_mask``.

        Candidates are visited algorithm by algorithm, in the order given by
        ``self.stats["include_algorithms"]``. A single running index spans
        all candidates; it is used both to look up the mask and to name the
        output files.

        Args:
            datanode: Not read by this method.
            solvers: Optional mapping indexed by algorithm id. When it is not
                None, ``solvers[algo_id].optimizer['fe'].save`` is called with
                the candidate's node and a ``-bagging-fe<index>`` path.

        Returns:
            ``self``.
        """
        position = 0
        for algo_id in self.stats["include_algorithms"]:
            for node, config in self.stats[algo_id]['model_to_eval']:
                features, labels = node.data
                if self.base_model_mask[position] == 1:
                    estimator = fetch_predict_estimator(
                        self.task_type, config, features, labels,
                        weight_balance=node.enable_balance,
                        data_balance=node.data_balance)

                    model_path = os.path.join(
                        self.output_dir,
                        '%s-bagging-model%d' % (self.timestamp, position))
                    with open(model_path, 'wb') as sink:
                        pkl.dump(estimator, sink)

                    if solvers is not None:
                        fe_savepath = os.path.join(
                            self.output_dir,
                            '%s-bagging-fe%d' % (self.timestamp, position))
                        fe_optimizer = solvers[algo_id].optimizer['fe']
                        fe_optimizer.save(node, fe_savepath)

                # The index advances for every candidate, selected or not.
                position += 1
        return self
示例#2
0
    def refit(self, solvers=None):
        """Refit every candidate with a non-zero ensemble weight and pickle it.

        Each candidate is trained on the full data held by its node. Models
        are written to ``<output_dir>/<timestamp>-model<index>``, where the
        index counts all candidates across all included algorithms.

        Args:
            solvers: Optional mapping indexed by algorithm id. When it is not
                None, ``solvers[algo_id].optimizer['fe'].save`` is called with
                the candidate's node and a ``-fe<index>`` path.
        """
        position = 0
        for algo_id in self.stats["include_algorithms"]:
            for node, config in self.stats[algo_id]['model_to_eval']:
                features, labels = node.data
                if self.weights_[position] != 0:
                    self.logger.info("Refit model %d" % position)
                    estimator = fetch_predict_estimator(
                        self.task_type, config, features, labels,
                        weight_balance=node.enable_balance,
                        data_balance=node.data_balance)

                    model_path = os.path.join(
                        self.output_dir,
                        '%s-model%d' % (self.timestamp, position))
                    with open(model_path, 'wb') as sink:
                        pkl.dump(estimator, sink)

                    if solvers is not None:
                        print('saving fe ###########')
                        fe_savepath = os.path.join(
                            self.output_dir,
                            '%s-fe%d' % (self.timestamp, position))
                        fe_optimizer = solvers[algo_id].optimizer['fe']
                        fe_optimizer.save(node, fe_savepath)

                # Zero-weight candidates still consume an index.
                position += 1
示例#3
0
 def refit(self):
     """Refit every candidate with a non-zero ensemble weight and pickle it.

     Each candidate is trained on the full data held by its node, with
     ``combined=True`` passed to ``fetch_predict_estimator``. Models are
     written to ``<output_dir>/<timestamp>-model<index>``, where the index
     counts all candidates across all included algorithms.
     """
     position = 0
     for algo_id in self.stats["include_algorithms"]:
         for node, config in self.stats[algo_id]['model_to_eval']:
             features, labels = node.data
             if self.weights_[position] != 0:
                 self.logger.info("Refit model %d" % position)
                 estimator = fetch_predict_estimator(
                     self.task_type, config, features, labels,
                     weight_balance=node.enable_balance,
                     data_balance=node.data_balance,
                     combined=True,
                 )
                 model_path = os.path.join(
                     self.output_dir,
                     '%s-model%d' % (self.timestamp, position))
                 with open(model_path, 'wb') as sink:
                     pkl.dump(estimator, sink)
             # Zero-weight candidates still consume an index.
             position += 1
示例#4
0
    def fit(self, data):
        """Train the selected base models on a split and fit the blending meta-learner.

        For every stored ``(config, _, path)`` entry the pickled operation
        list at ``path`` is replayed on a copy of ``data`` and the result is
        split 80/20 with ``random_state=1``. For classification tasks the
        split is stratified on ``data.data[1]``. Entries whose mask value is
        1 are fitted on the 80% part and pickled to
        ``<output_dir>/<timestamp>-blending-model<index>``. Their predictions
        on the 20% part fill one column block of the meta-feature matrix.

        The meta-learner is finally fitted on that matrix and on the hold-out
        labels produced by the last split performed.

        Args:
            data: Data node providing ``copy_()`` and ``data``.

        Returns:
            ``self``.
        """
        holdout_fraction = 0.2

        position = 0   # index over all stored entries
        trained = 0    # index over entries that were actually fitted
        meta_features = None
        for algo_id in self.stats.keys():
            for config, _, path in self.stats[algo_id]:
                with open(path, 'rb') as source:
                    op_list, _model = pkl.load(source)
                _node = construct_node(data.copy_(), op_list, mode='train')

                X, y = _node.data
                split_options = {'test_size': holdout_fraction}
                if self.task_type in CLS_TASKS:
                    split_options['stratify'] = data.data[1]
                split_options['random_state'] = 1
                x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, **split_options)

                if self.base_model_mask[position] == 1:
                    estimator = fetch_predict_estimator(
                        self.task_type, algo_id, config[0], x_p1, y_p1,
                        weight_balance=_node.enable_balance,
                        data_balance=_node.data_balance)
                    model_path = os.path.join(
                        self.output_dir,
                        '%s-blending-model%d' % (self.timestamp, position))
                    with open(model_path, 'wb') as sink:
                        pkl.dump(estimator, sink)

                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: one column is enough.
                            n_dim = 1
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1

                    # Allocate the phase-2 training matrix on first use.
                    if meta_features is None:
                        meta_features = np.zeros(
                            (len(x_p2), self.ensemble_size * n_dim))

                    if self.task_type in CLS_TASKS and n_dim == 1:
                        block = pred[:, 1:2]
                    else:
                        block = pred
                    meta_features[:, trained * n_dim:(trained + 1) * n_dim] = block
                    trained += 1
                position += 1
        self.meta_learner.fit(meta_features, y_p2)

        return self
示例#5
0
def evaluate_bo_optimizer(dataset, time_limit, run_id, seed):
    """Run the BO feature-engineering optimizer on ``dataset`` and save its scores.

    A random-forest default configuration is used both inside the evaluator
    and for the final estimator. The validation score (``1 -`` the
    optimizer's evaluation of its incumbent config) and the balanced-accuracy
    test score are pickled, together with the dataset name, to
    ``save_dir + 'bo_fe_<dataset>_<time_limit>_<run_id>.pkl'``.

    Args:
        dataset: Dataset identifier passed to ``load_train_test_data``.
        time_limit: Passed to the optimizer as ``time_budget``; also part of
            the output file name.
        run_id: Part of the output file name.
        seed: Seed forwarded to the evaluator and the optimizer.
    """
    from solnml.components.fe_optimizers.bo_optimizer import BayesianOptimizationOptimizer
    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    cs.add_hyperparameter(
        UnParametrizedHyperparameter("estimator", 'random_forest'))
    print(cs.get_default_configuration())
    evaluator = ClassificationEvaluator(
        cs.get_default_configuration(),
        name='fe',
        seed=seed,
        resampling_strategy='holdout',
    )

    train_data, test_data = load_train_test_data(dataset)
    # Two distinct training labels means binary, anything else multiclass.
    if len(set(train_data.data[1])) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS

    optimizer = BayesianOptimizationOptimizer(
        cls_task_type, train_data, evaluator, 'random_forest',
        300, 10000, seed,
        time_budget=time_limit,
    )
    optimizer.optimize()
    val_score = 1 - optimizer.evaluate_function(optimizer.incumbent_config)
    print(val_score)
    print(optimizer.incumbent_score)

    optimizer.fetch_nodes(n=10)
    print("Refit finished!")

    # Apply the incumbent transformation to both splits.
    final_train_data = optimizer.apply(
        train_data, optimizer.incumbent, phase='train')
    X_train, y_train = final_train_data.data
    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_test, y_test = final_test_data.data

    clf = fetch_predict_estimator(
        cls_task_type,
        cs.get_default_configuration(),
        X_train,
        y_train,
        weight_balance=final_train_data.enable_balance,
        data_balance=final_train_data.data_balance,
    )

    from solnml.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, clf.predict(X_test))
    print('==> Test score', test_score)

    save_path = save_dir + 'bo_fe_%s_%d_%d.pkl' % (dataset, time_limit, run_id)
    with open(save_path, 'wb') as sink:
        pickle.dump([dataset, val_score, test_score], sink)
示例#6
0
    def fit(self, data):
        """Build out-of-fold predictions from the selected base models and fit the meta-learner.

        Each candidate whose entry in ``self.base_model_mask`` equals 1 is
        trained once per fold of a ``self.kfold``-way split of its node's
        data (stratified for classification tasks). Every fold model is
        pickled to ``<output_dir>/<timestamp>-model<index>_part<fold>``. Its
        predictions on the held-out fold fill the matching rows of that
        model's column block in the meta-feature matrix.

        The class/data balancing flags passed to the base estimator come from
        the candidate's own node, i.e. the node whose data is being fitted.
        This matches the other ensemble ``fit``/``refit`` methods.

        The meta-learner is fitted on the completed matrix and on the labels
        of the last candidate visited.

        Args:
            data: Kept for interface compatibility; not read by this method.

        Returns:
            ``self``.
        """
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data
        model_cnt = 0   # index over all candidates
        suc_cnt = 0     # index over candidates that were actually trained
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for node, config in model_to_eval:
                X, y = node.data
                if self.base_model_mask[model_cnt] == 1:
                    for j, (train, test) in enumerate(kf.split(X, y)):
                        x_p1, x_p2, y_p1 = X[train], X[test], y[train]
                        estimator = fetch_predict_estimator(
                            self.task_type, config, x_p1, y_p1,
                            weight_balance=node.enable_balance,
                            data_balance=node.data_balance)
                        with open(
                                os.path.join(self.output_dir, '%s-model%d_part%d' % (self.timestamp, model_cnt, j)),
                                'wb') as f:
                            pkl.dump(estimator, f)

                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification: one column is enough.
                                n_dim = 1
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1

                        # Initialize training matrix for phase 2
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))

                        if self.task_type in CLS_TASKS and n_dim == 1:
                            fold_pred = pred[:, 1:2]
                        else:
                            fold_pred = pred
                        feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = fold_pred
                    suc_cnt += 1
                model_cnt += 1
        # Train model for stacking using the other part of training data
        self.meta_learner.fit(feature_p2, y)
        return self
示例#7
0
def evaluate_ml_algorithm(dataset,
                          algo,
                          run_id,
                          obj_metric,
                          total_resource=20,
                          seed=1,
                          task_type=None):
    """Optimise one algorithm with a ``SecondLayerBandit`` and record its scores.

    After ``optimize_fixed_pipeline`` finishes, the incumbent data node of
    the feature-engineering optimizer and the incumbent ``'hpo'`` entry are
    used to fit a final estimator. That estimator is scored on the
    transformed test data with ``obj_metric`` (multiplied by the metric's
    ``_sign``). The result is pickled to
    ``save_dir + '<dataset>-<algo>-<obj_metric>-<run_id>-<total_resource>.pkl'``.

    Args:
        dataset: Dataset identifier passed to ``load_train_test_data``.
        algo: Algorithm identifier handed to the bandit.
        run_id: Printed and used in the output file name.
        obj_metric: Metric name resolved through ``get_metric``.
        total_resource: Forwarded to the bandit; also in the file name.
        seed: Seed forwarded to the bandit.
        task_type: Task type. When it is one of ``CLS_TASKS`` it is replaced
            by ``BINARY_CLS`` or ``MULTICLASS_CLS``, depending on whether the
            training labels contain exactly two distinct values.
    """
    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))
    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        if len(set(train_data.data[1])) == 2:
            task_type = BINARY_CLS
        else:
            task_type = MULTICLASS_CLS
    print(set(train_data.data[1]))

    metric = get_metric(obj_metric)
    bandit = SecondLayerBandit(
        task_type, algo, train_data, metric,
        per_run_time_limit=300,
        seed=seed,
        eval_type='holdout',
        fe_algo='bo',
        total_resource=total_resource,
    )
    bandit.optimize_fixed_pipeline()

    val_score = bandit.incumbent_perf
    best_config = bandit.inc['hpo']

    fe_optimizer = bandit.optimizer['fe']
    fe_optimizer.fetch_nodes(10)
    best_data_node = fe_optimizer.incumbent
    test_data_node = fe_optimizer.apply(test_data, best_data_node)

    estimator = fetch_predict_estimator(
        task_type,
        best_config,
        best_data_node.data[0],
        best_data_node.data[1],
        weight_balance=best_data_node.enable_balance,
        data_balance=best_data_node.data_balance,
    )
    raw_score = metric(estimator, test_data_node.data[0],
                       test_data_node.data[1])
    score = raw_score * metric._sign
    print('Test score', score)

    file_name = '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric,
                                        run_id, total_resource)
    save_path = save_dir + file_name
    with open(save_path, 'wb') as sink:
        pickle.dump([dataset, algo, score, val_score, task_type], sink)
示例#8
0
 def fit(self, datanode):
     """Fit and pickle each base model selected by ``self.base_model_mask``.

     Candidates are visited in the order given by
     ``self.stats["include_algorithms"]``. Selected ones are trained on
     their node's data with ``combined=True`` and written to
     ``<output_dir>/<timestamp>-bagging-model<index>``, where the index
     counts all candidates.

     Args:
         datanode: Not read by this method.

     Returns:
         ``self``.
     """
     position = 0
     for algo_id in self.stats["include_algorithms"]:
         for node, config in self.stats[algo_id]['model_to_eval']:
             features, labels = node.data
             if self.base_model_mask[position] == 1:
                 estimator = fetch_predict_estimator(
                     self.task_type, config, features, labels,
                     weight_balance=node.enable_balance,
                     data_balance=node.data_balance,
                     combined=True)
                 model_path = os.path.join(
                     self.output_dir,
                     '%s-bagging-model%d' % (self.timestamp, position))
                 with open(model_path, 'wb') as sink:
                     pkl.dump(estimator, sink)
             # The index advances for every candidate, selected or not.
             position += 1
     return self
    def optimize(self):
        """Run the configured bandit strategy, then persist the best model and optional ensemble.

        Steps, in order:

        1. Dispatch on ``self.inner_opt_algorithm`` to
           ``optimize_explore_first`` or ``optimize_equal_resource``.
        2. Read ``incumbent_perf`` from every arm's sub-bandit and store the
           best-scoring arm in ``self.optimal_algo_id``.
        3. Keep in ``self.nbest_algo_ids`` those arms among the five highest
           scores whose score reaches the cutoff. The cutoff is 0.90 times
           the best score when that score is non-negative, and the best score
           divided by 0.90 when it is negative. A plain ``0.90 * best`` would
           exceed a negative best score and reject every arm, including the
           best one.
        4. Fit the best arm's ``'hpo'`` incumbent on ``self.best_data_node``
           and pickle it to ``<output_dir>/<timestamp>-best_model``.
        5. When ``self.ensemble_method`` is not None, build an
           ``EnsembleBuilder`` from ``fetch_ensemble_members_ano()`` and fit
           it on ``self.original_data``.

        Raises:
            ValueError: If ``self.inner_opt_algorithm`` is not supported.
            AssertionError: If no arm reaches the cutoff.
        """
        if self.inner_opt_algorithm in ['rb_hpo', 'fixed']:
            self.optimize_explore_first()
        elif self.inner_opt_algorithm == 'equal':
            self.optimize_equal_resource()
        else:
            raise ValueError('Unsupported optimization method: %s!' %
                             self.inner_opt_algorithm)

        scores = np.array(
            [self.sub_bandits[_arm].incumbent_perf for _arm in self.arms])
        algo_idx = np.argmax(scores)
        self.optimal_algo_id = self.arms[algo_idx]
        _best_perf = scores[algo_idx]
        _threshold, _ensemble_size = 0.90, 5

        # Sign-aware cutoff: for a negative best score, scaling by 0.90 moves
        # the bar above the best score itself, so divide instead.
        if _best_perf < 0.:
            _cutoff = _best_perf / _threshold
        else:
            _cutoff = _threshold * _best_perf

        idxs = np.argsort(-scores)[:_ensemble_size]
        self.nbest_algo_ids = [
            self.arms[_idx] for _idx in idxs if scores[_idx] >= _cutoff
        ]
        assert len(self.nbest_algo_ids) > 0

        self.logger.info('=' * 50)
        self.logger.info('Best_algo_perf:  %s' % str(_best_perf))
        self.logger.info('Best_algo_id:    %s' % str(self.optimal_algo_id))
        self.logger.info('Nbest_algo_ids:  %s' % str(self.nbest_algo_ids))
        self.logger.info('Arm candidates:  %s' % str(self.arms))
        self.logger.info('Best val scores: %s' % str(list(scores)))
        self.logger.info('=' * 50)

        # Fit the best model
        self.fe_optimizer = self.sub_bandits[
            self.optimal_algo_id].optimizer['fe']
        if self.fe_algo == 'bo':
            self.fe_optimizer.fetch_nodes(1)

        best_config = self.sub_bandits[self.optimal_algo_id].inc['hpo']
        best_estimator = fetch_predict_estimator(
            self.task_type,
            best_config,
            self.best_data_node.data[0],
            self.best_data_node.data[1],
            weight_balance=self.best_data_node.enable_balance,
            data_balance=self.best_data_node.data_balance)

        with open(
                os.path.join(self.output_dir,
                             '%s-best_model' % self.timestamp), 'wb') as f:
            pkl.dump(best_estimator, f)

        if self.ensemble_method is not None:
            stats = self.fetch_ensemble_members_ano()

            # Ensembling all intermediate/ultimate models found in above optimization process.
            self.es = EnsembleBuilder(stats=stats,
                                      ensemble_method=self.ensemble_method,
                                      ensemble_size=self.ensemble_size,
                                      task_type=self.task_type,
                                      metric=self.metric,
                                      output_dir=self.output_dir)
            self.es.fit(data=self.original_data)
示例#10
0
    def __init__(self,
                 stats,
                 ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 base_save=False,
                 output_dir=None):
        """Train every candidate on a hold-out split and pick the base models.

        For each ``(node, config)`` candidate in ``stats`` a single shuffle
        split (33% validation, seeded with ``stats['split_seed']``,
        stratified for classification tasks) is drawn from the node's data.
        The candidate is fitted on the training part and its validation
        predictions are appended to ``self.train_predictions``. The
        validation labels of all candidates are asserted to be identical and
        are kept in ``self.train_labels``.

        ``self.ensemble_size`` is capped at the number of candidates. Unless
        ``ensemble_method`` is ``'ensemble_selection'``, a base-model mask is
        then chosen from the validation predictions and the ensemble size is
        set to the sum of that mask.

        Args:
            stats: Mapping with ``"include_algorithms"``, ``'split_seed'``
                and, per algorithm id, a ``'model_to_eval'`` sequence.
            ensemble_method: Name of the ensemble strategy.
            ensemble_size: Requested number of base models.
            task_type: Task type constant, compared against ``CLS_TASKS``.
            metric: Scorer stored on the instance.
            base_save: When true, each fitted candidate is pickled to
                ``<output_dir>/<timestamp>-model<index>``.
            output_dir: Directory for pickled models.
        """
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.metric = metric
        self.output_dir = output_dir

        self.train_predictions = []
        self.config_list = []
        self.train_data_dict = {}
        self.train_labels = None
        self.seed = self.stats['split_seed']
        self.timestamp = str(time.time())
        self.logger = get_logger('EnsembleBuilder')

        position = 0
        for algo_id in self.stats["include_algorithms"]:
            for node, config in self.stats[algo_id]['model_to_eval']:
                X, y = node.data

                # TODO: Hyperparameter
                holdout_fraction = 0.33

                if self.task_type in CLS_TASKS:
                    splitter_cls = StratifiedShuffleSplit
                else:
                    splitter_cls = ShuffleSplit
                splitter = splitter_cls(n_splits=1,
                                        test_size=holdout_fraction,
                                        random_state=self.seed)

                # n_splits=1, so the first split is the only one.
                train_index, test_index = next(iter(splitter.split(X, y)))
                X_train, X_holdout = X[train_index], X[test_index]
                y_train, y_holdout = y[train_index], y[test_index]

                # Every candidate must be validated on the same labels.
                if self.train_labels is None:
                    self.train_labels = y_holdout
                else:
                    assert (self.train_labels == y_holdout).all()

                estimator = fetch_predict_estimator(
                    self.task_type, config, X_train, y_train,
                    weight_balance=node.enable_balance,
                    data_balance=node.data_balance)
                if base_save:  # For ensemble selection
                    model_path = os.path.join(
                        self.output_dir,
                        '%s-model%d' % (self.timestamp, position))
                    with open(model_path, 'wb') as sink:
                        pkl.dump(estimator, sink)

                if self.task_type in CLS_TASKS:
                    holdout_pred = estimator.predict_proba(X_holdout)
                else:
                    holdout_pred = estimator.predict(X_holdout)
                self.train_predictions.append(holdout_pred)
                position += 1

        n_candidates = len(self.train_predictions)
        if n_candidates < self.ensemble_size:
            self.ensemble_size = n_candidates

        if ensemble_method == 'ensemble_selection':
            return

        stacked_predictions = np.array(self.train_predictions)
        if task_type in CLS_TASKS:
            self.base_model_mask = choose_base_models_classification(
                stacked_predictions, self.ensemble_size)
        else:
            self.base_model_mask = choose_base_models_regression(
                stacked_predictions, np.array(y_holdout),
                self.ensemble_size)
        self.ensemble_size = sum(self.base_model_mask)
示例#11
0
    def refit(self):
        """Refit the best model, or every saved top-k model, on the original data.

        Without an ensemble method, the incumbent configuration is refitted
        and pickled to
        ``<output_dir>/<timestamp>_<configuration id>.pkl``. A warning is
        issued and nothing is done when there is no incumbent.

        With an ensemble method, the configurations stored in
        ``<output_dir>/<timestamp>_topk_config.pkl`` are each refitted and
        written back to their recorded paths, after which ``fit_ensemble``
        is called. A warning is issued and nothing is done when that file
        does not exist.

        Every pickle written holds ``[op_list, estimator, None]``.
        """

        def _train(config):
            # Replay the pipeline described by `config` on a fresh copy of
            # the original data and fit its estimator.
            data_node, op_list = parse_config(self.original_data.copy_(),
                                              config,
                                              record=True,
                                              if_imbal=self.if_imbal)
            estimator = fetch_predict_estimator(
                self.task_type,
                config['algorithm'],
                config,
                data_node.data[0],
                data_node.data[1],
                weight_balance=data_node.enable_balance,
                data_balance=data_node.data_balance)
            return [op_list, estimator, None]

        if self.ensemble_method is None:
            self.logger.info('Start to refit the best model!')

            if self.incumbent is None:
                warnings.warn(
                    "The best config is None! Please check if all the evaluations are failed!"
                )
                return

            config_id = CombinedTopKModelSaver.get_configuration_id(
                self.incumbent)
            model_path = os.path.join(
                self.output_dir, '%s_%s.pkl' % (self.timestamp, config_id))
            payload = _train(self.incumbent.copy())
            with open(model_path, 'wb') as sink:
                pkl.dump(payload, sink)
            return

        self.logger.info('Start to refit all the well-performed models!')
        config_path = os.path.join(self.output_dir,
                                   '%s_topk_config.pkl' % self.timestamp)

        if not os.path.exists(config_path):
            warnings.warn(
                "Config path %s not found! Please check if all the evaluations are failed!"
                % config_path)
            return

        with open(config_path, 'rb') as source:
            stats = pkl.load(source)
        for group_key in stats.keys():
            for config, perf, path in stats[group_key]:
                payload = _train(config)
                with open(path, 'wb') as sink:
                    pkl.dump(payload, sink)

        self.fit_ensemble()
示例#12
0
    def optimize(self):
        """Run the configured bandit strategy, then persist the best model and optional ensemble.

        Steps, in order:

        1. Dispatch on ``self.inner_opt_algorithm`` to
           ``optimize_equal_resource`` (``'equal'``) or
           ``optimize_explore_first`` (the other supported values).
        2. Read ``incumbent_perf`` from every arm's sub-bandit. Store the
           best arm in ``self.optimal_algo_id`` and its score in
           ``self.incumbent_perf``.
        3. Keep in ``self.nbest_algo_ids`` those arms among the five highest
           scores whose score reaches the cutoff: 0.90 times the best score
           when it is non-negative, the best score divided by 0.9 otherwise.
        4. Fit the best configuration (through a temporary BO optimizer in
           ``'combined'`` mode, otherwise on ``self.best_data_node``) and
           pickle it to ``<output_dir>/<timestamp>-best_model``.
        5. When ``self.ensemble_method`` is not None, collect the ensemble
           members, build the matching ``EnsembleBuilder`` and fit it on
           ``self.original_data``.

        Raises:
            ValueError: If ``self.inner_opt_algorithm`` is not supported.
            AssertionError: If no arm reaches the cutoff.
        """
        if self.inner_opt_algorithm == 'equal':
            self.optimize_equal_resource()
        elif self.inner_opt_algorithm in ['rb_hpo', 'fixed', 'alter_hpo', 'alter', 'combined']:
            self.optimize_explore_first()
        else:
            raise ValueError('Unsupported optimization method: %s!' % self.inner_opt_algorithm)

        scores = np.array(
            [self.sub_bandits[_arm].incumbent_perf for _arm in self.arms])
        best_pos = np.argmax(scores)
        self.optimal_algo_id = self.arms[best_pos]
        self.incumbent_perf = scores[best_pos]

        # Sign-aware cutoff so that the best arm always qualifies.
        max_members = 5
        if self.incumbent_perf < 0.:
            cutoff = self.incumbent_perf / 0.9
        else:
            cutoff = self.incumbent_perf * 0.90

        top_positions = np.argsort(-scores)[:max_members]
        self.nbest_algo_ids = [
            self.arms[pos] for pos in top_positions if scores[pos] >= cutoff
        ]
        assert len(self.nbest_algo_ids) > 0

        banner = '=' * 50
        self.logger.info(banner)
        self.logger.info('Best_algo_perf:  %s' % str(self.incumbent_perf))
        self.logger.info('Best_algo_id:    %s' % str(self.optimal_algo_id))
        self.logger.info('Nbest_algo_ids:  %s' % str(self.nbest_algo_ids))
        self.logger.info('Arm candidates:  %s' % str(self.arms))
        self.logger.info('Best val scores: %s' % str(list(scores)))
        self.logger.info(banner)

        if self.inner_opt_algorithm == 'combined':
            # A tmp optimizer for recording fe transformations
            tmp_evaluator = ClassificationEvaluator(None)
            self.tmp_bo = AnotherBayesianOptimizationOptimizer(
                0, self.original_data, tmp_evaluator, 'adaboost', 1, 1, 1)

            # Fit the best model
            best_config = self.sub_bandits[self.optimal_algo_id].incumbent_config
            self.best_node = self.tmp_bo.fetch_nodes_by_config([best_config])[0]
            best_estimator = fetch_predict_estimator(
                self.task_type,
                best_config,
                self.best_node.data[0],
                self.best_node.data[1],
                weight_balance=self.best_node.enable_balance,
                data_balance=self.best_node.data_balance,
                combined=True)
        else:
            # Fit the best model
            self.fe_optimizer = self.sub_bandits[self.optimal_algo_id].optimizer['fe']
            if self.fe_algo == 'bo':
                self.fe_optimizer.fetch_nodes(1)

            best_config = self.sub_bandits[self.optimal_algo_id].inc['hpo']
            best_estimator = fetch_predict_estimator(
                self.task_type,
                best_config,
                self.best_data_node.data[0],
                self.best_data_node.data[1],
                weight_balance=self.best_data_node.enable_balance,
                data_balance=self.best_data_node.data_balance)

        best_model_path = os.path.join(self.output_dir, '%s-best_model' % self.timestamp)
        with open(best_model_path, 'wb') as sink:
            pkl.dump(best_estimator, sink)

        if self.ensemble_method is None:
            return

        if self.inner_opt_algorithm == 'combined':
            eval_dict = {key: self.sub_bandits[key].eval_dict for key in self.include_algorithms}
            stats = fetch_ensemble_members(self.nbest_algo_ids, self.seed, eval_dict, self.tmp_bo)
            from solnml.components.ensemble.combined_ensemble.ensemble_bulider import EnsembleBuilder
        else:
            stats = self.fetch_ensemble_members()

            from solnml.components.ensemble import EnsembleBuilder

        # Ensembling all intermediate/ultimate models found in above optimization process.
        self.es = EnsembleBuilder(stats=stats,
                                  ensemble_method=self.ensemble_method,
                                  ensemble_size=self.ensemble_size,
                                  task_type=self.task_type,
                                  metric=self.metric,
                                  output_dir=self.output_dir)
        self.es.fit(data=self.original_data)
示例#13
0
def execute_func(params):
    """Call ``fetch_predict_estimator`` with ``params`` unpacked as positional arguments.

    Args:
        params: Iterable of positional arguments.

    Returns:
        Whatever ``fetch_predict_estimator`` returns.
    """
    return fetch_predict_estimator(*params)
示例#14
0
def evaluate_evaluation_based_fe(dataset, time_limit, run_id, seed):
    """Run the evaluation-based feature-engineering optimizer and save its scores.

    The optimizer is iterated until ``time_limit`` seconds have elapsed or
    its ``early_stopped_flag`` is set. The incumbent transformation is then
    applied to the train and test data, and a random forest with the default
    configuration is fitted. The validation score and the balanced-accuracy
    test score are pickled, together with the dataset name, to
    ``save_dir + 'hmab_fe_<dataset>_<time_limit>_<run_id>.pkl'``.

    Args:
        dataset: Dataset identifier passed to ``load_train_test_data``.
        time_limit: Wall-clock budget in seconds for the iteration loop;
            also part of the output file name.
        run_id: Part of the output file name.
        seed: Seed forwarded to the evaluator and the optimizer.
    """
    from solnml.components.fe_optimizers.evaluation_based_optimizer import EvaluationBasedOptimizer

    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())
    # Example of the printed default configuration:
    #   Configuration:
    #     bootstrap, Value: 'True'
    #     criterion, Value: 'gini'
    #     estimator, Constant: 'random_forest'
    #     max_depth, Constant: 'None'
    #     max_features, Value: 0.5
    #     max_leaf_nodes, Constant: 'None'
    #     min_impurity_decrease, Constant: 0.0
    #     min_samples_leaf, Value: 1
    #     min_samples_split, Value: 2
    #     min_weight_fraction_leaf, Constant: 0.0
    #     n_estimators, Constant: 100
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    optimizer = EvaluationBasedOptimizer(MULTICLASS_CLS,
                                         train_data,
                                         evaluator,
                                         'random_forest',
                                         300,
                                         10000,
                                         seed,
                                         trans_set=None)

    # `score` is reported after the loop, so it must be bound even when the
    # loop body never runs (budget already exhausted or optimizer already
    # stopped).
    score = None
    _start_time = time.time()
    _iter_id = 0
    while (time.time() <= _start_time + time_limit
           and not optimizer.early_stopped_flag):
        score, iteration_cost, inc = optimizer.iterate()
        print('%d - %.4f' % (_iter_id, score))
        _iter_id += 1

    final_train_data = optimizer.apply(train_data, optimizer.incumbent)
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, score)

    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(MULTICLASS_CLS,
                                  cs.get_default_configuration(), X_train,
                                  y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)

    from solnml.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'hmab_fe_%s_%d_%d.pkl' % (dataset, time_limit,
                                                     run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
示例#15
0
    def fit(self, data, solvers=None):
        """Train the selected base models on a split and fit the blending meta-learner.

        Every candidate's node data is split 80/20 with
        ``random_state=self.seed``. For classification tasks the split is
        stratified on ``data.data[1]``. Candidates whose mask value is 1 are
        fitted on the 80% part and pickled to
        ``<output_dir>/<timestamp>-blending-model<index>``. Their predictions
        on the 20% part fill one column block of the meta-feature matrix.

        The meta-learner is finally fitted on that matrix and on the hold-out
        labels produced by the last split performed.

        Args:
            data: Data node whose labels (``data.data[1]``) drive the
                stratification.
            solvers: Optional mapping indexed by algorithm id. When it is not
                None, ``solvers[algo_id].optimizer['fe'].save`` is called with
                the candidate's node and a ``-blending-fe<index>`` path.

        Returns:
            ``self``.
        """
        holdout_fraction = 0.2

        position = 0   # index over all candidates
        trained = 0    # index over candidates that were actually fitted
        meta_features = None
        for algo_id in self.stats["include_algorithms"]:
            for node, config in self.stats[algo_id]['model_to_eval']:
                X, y = node.data
                split_options = {'test_size': holdout_fraction}
                if self.task_type in CLS_TASKS:
                    split_options['stratify'] = data.data[1]
                split_options['random_state'] = self.seed
                x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, **split_options)

                if self.base_model_mask[position] == 1:
                    estimator = fetch_predict_estimator(
                        self.task_type, config, x_p1, y_p1,
                        weight_balance=node.enable_balance,
                        data_balance=node.data_balance)
                    model_path = os.path.join(
                        self.output_dir,
                        '%s-blending-model%d' % (self.timestamp, position))
                    with open(model_path, 'wb') as sink:
                        pkl.dump(estimator, sink)

                    if solvers is not None:
                        fe_savepath = os.path.join(
                            self.output_dir,
                            '%s-blending-fe%d' % (self.timestamp, position))
                        fe_optimizer = solvers[algo_id].optimizer['fe']
                        fe_optimizer.save(node, fe_savepath)

                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: one column is enough.
                            n_dim = 1
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1

                    # Allocate the phase-2 training matrix on first use.
                    if meta_features is None:
                        meta_features = np.zeros(
                            (len(x_p2), self.ensemble_size * n_dim))

                    if self.task_type in CLS_TASKS and n_dim == 1:
                        block = pred[:, 1:2]
                    else:
                        block = pred
                    meta_features[:, trained * n_dim:(trained + 1) * n_dim] = block
                    trained += 1
                position += 1
        self.meta_learner.fit(meta_features, y_p2)

        return self