Example #1
def conduct_fe(dataset='pc4',
               classifier_id='random_forest',
               iter_num=100,
               run_id=0,
               seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(default_config,
                                        name='fe',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    val_acc = evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = EvaluationBasedOptimizer(task_type='classification',
                                         input_data=raw_data,
                                         evaluator=evaluator,
                                         model_id=classifier_id,
                                         time_limit_per_trans=240,
                                         mem_limit_per_trans=10000,
                                         seed=seed)

    task_id = 'fe-%s-%s-%d' % (dataset, classifier_id, iter_num)
    val_acc_list, test_acc_list = [], []

    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, incumbent = optimizer.iterate()
        val_acc_list.append(perf)
        train_node = optimizer.apply(raw_data, incumbent)
        test_node = optimizer.apply(test_raw_data, incumbent)
        estimator = fetch_predict_estimator(default_config, train_node.data[0],
                                            train_node.data[1])
        pred = estimator.predict(test_node.data[0])
        test_perf = balanced_accuracy(test_node.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
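This snippet (and Example #2 below) relies on module-level names the excerpt does not show: load_train_test_data, ClassificationEvaluator, SMACOptimizer, fetch_predict_estimator, balanced_accuracy, EvaluationBasedOptimizer, UnParametrizedHyperparameter, pickle and save_dir. A minimal assumed preamble is sketched below; the automlToolkit import paths for fetch_predict_estimator, balanced_accuracy and EvaluationBasedOptimizer are copied from Examples #5-#7, while the remaining automlToolkit paths and the save_dir value are guesses, not taken from the original script.

# Assumed module-level preamble for Examples #1 and #2 (not part of the original excerpt).
import os
import pickle

from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
# The three import paths below mirror those shown in Examples #5-#7.
from automlToolkit.components.evaluators.cls_evaluator import fetch_predict_estimator
from automlToolkit.components.metrics.cls_metrics import balanced_accuracy
from automlToolkit.components.fe_optimizers.evaluation_based_optimizer import EvaluationBasedOptimizer
# NOTE: the locations of the next three imports are assumptions; the excerpt does not
# show where ClassificationEvaluator, SMACOptimizer and load_train_test_data live.
from automlToolkit.components.evaluators.cls_evaluator import ClassificationEvaluator
from automlToolkit.components.hpo_optimizer.smac_optimizer import SMACOptimizer
from automlToolkit.datasets.utils import load_train_test_data

save_dir = './data/fe_results/'  # hypothetical output directory
os.makedirs(save_dir, exist_ok=True)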
Example #2
def conduct_hpo(dataset='pc4',
                classifier_id='random_forest',
                iter_num=100,
                run_id=0,
                seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='hpo',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    default_config = cs.get_default_configuration()
    val_acc = 1. - evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = SMACOptimizer(evaluator,
                              cs,
                              trials_per_iter=2,
                              output_dir='logs',
                              per_run_time_limit=180)
    task_id = 'hpo-%s-%s-%d' % (dataset, classifier_id, iter_num)

    val_acc_list, test_acc_list = [], []
    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, config = optimizer.iterate()
        val_acc_list.append(perf)
        estimator = fetch_predict_estimator(config, raw_data.data[0],
                                            raw_data.data[1])
        pred = estimator.predict(test_raw_data.data[0])
        test_perf = balanced_accuracy(test_raw_data.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
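Neither routine above is invoked in the excerpt; a hypothetical entry point that runs both over a few seeds could look like the following (the loop and argument values are illustrative, not from the original script).

if __name__ == '__main__':
    # Illustrative driver: repeat both experiments for five runs with different seeds.
    for rid in range(5):
        conduct_fe(dataset='pc4', classifier_id='random_forest',
                   iter_num=100, run_id=rid, seed=rid + 1)
        conduct_hpo(dataset='pc4', classifier_id='random_forest',
                    iter_num=100, run_id=rid, seed=rid + 1)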
Example #3
def evaluate(train_data, test_data, config):
    X_train, y_train = train_data.data
    X_test, y_test = test_data.data
    print('X_train/test shapes: %s, %s' %
          (str(X_train.shape), str(X_test.shape)))

    # Build the ML estimator.
    from automlToolkit.components.evaluators.evaluator import fetch_predict_estimator
    estimator = fetch_predict_estimator(config, X_train, y_train)

    y_pred = estimator.predict(X_test)
    return balanced_accuracy(y_test, y_pred)


def evaluate_2rd_layered_bandit(run_id, mth='rb', dataset='pc4', algo='libsvm_svc',
                                cv='holdout', time_limit=120000, seed=1):
    train_data, test_data = load_train_test_data(dataset)
    bandit = SecondLayerBandit(algo, train_data, dataset_id=dataset, mth=mth, seed=seed, eval_type=cv)

    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config, name='fe', seed=seed, resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_folder + '%s_%s_%d_%d_%s.pkl' % (mth, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2], f)
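A hedged driver for the second-layer bandit experiment above; the 1200-second budget is illustrative (the function default is 120000 seconds), and save_folder must be defined at module level as in the original script.

if __name__ == '__main__':
    # Illustrative call with a smaller time budget than the default.
    evaluate_2rd_layered_bandit(run_id=0, mth='rb', dataset='pc4',
                                algo='libsvm_svc', cv='holdout',
                                time_limit=1200, seed=1)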
Example #5
    def predict(self, test_data: DataNode):
        best_arm = self.optimal_algo_id
        sub_bandit = self.sub_bandits[best_arm]
        fe_optimizer = sub_bandit.optimizer['fe']

        train_data_node = sub_bandit.inc['fe']
        test_data_node = fe_optimizer.apply(test_data, sub_bandit.inc['fe'])
        config = sub_bandit.inc['hpo']

        # Check the validity of feature engineering.
        _train_data = fe_optimizer.apply(self.original_data, sub_bandit.inc['fe'])
        assert train_data_node == _train_data

        X_train, y_train = train_data_node.data
        X_test, y_test = test_data_node.data
        self.logger.info('X_train/test shapes: %s, %s' % (str(X_train.shape), str(X_test.shape)))

        # Build the ML estimator.
        from automlToolkit.components.evaluators.cls_evaluator import fetch_predict_estimator
        estimator = fetch_predict_estimator(config, X_train, y_train)
        y_pred = estimator.predict(X_test)
        return y_pred
Example #6
def evaluate_base_model(classifier_id, dataset):
    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset)

    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()
    X_train, y_train = train_data.data
    X_test, y_test = test_data.data
    print('X_train/test shapes: %s, %s' %
          (str(X_train.shape), str(X_test.shape)))

    # Build the ML estimator.
    from automlToolkit.components.evaluators.cls_evaluator import fetch_predict_estimator
    estimator = fetch_predict_estimator(default_config, X_train, y_train)

    y_pred = estimator.predict(X_test)
    print(balanced_accuracy(y_test, y_pred))
    print(balanced_accuracy(y_pred, y_test))
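A hypothetical invocation of the baseline helper above, using the dataset and classifier defaults that appear in Examples #1 and #2:

if __name__ == '__main__':
    # 'random_forest' on 'pc4' mirrors the defaults used elsewhere in these examples.
    evaluate_base_model('random_forest', 'pc4')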
Example #7
def evaluate_evaluation_based_fe(dataset, time_limit, run_id, seed):
    from automlToolkit.components.fe_optimizers.evaluation_based_optimizer import EvaluationBasedOptimizer

    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())
    """
    Configuration:
      bootstrap, Value: 'True'
      criterion, Value: 'gini'
      estimator, Constant: 'random_forest'
      max_depth, Constant: 'None'
      max_features, Value: 0.5
      max_leaf_nodes, Constant: 'None'
      min_impurity_decrease, Constant: 0.0
      min_samples_leaf, Value: 1
      min_samples_split, Value: 2
      min_weight_fraction_leaf, Constant: 0.0
      n_estimators, Constant: 100
    """
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    optimizer = EvaluationBasedOptimizer('classification',
                                         train_data,
                                         evaluator,
                                         'random_forest',
                                         300,
                                         10000,
                                         seed,
                                         trans_set=None)

    _start_time = time.time()
    _iter_id = 0
    while True:
        if time.time() > _start_time + time_limit or optimizer.early_stopped_flag:
            break
        score, iteration_cost, inc = optimizer.iterate()
        print('%d - %.4f' % (_iter_id, score))
        _iter_id += 1

    final_train_data = optimizer.apply(train_data, optimizer.incumbent)
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, score)

    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(cs.get_default_configuration(), X_train,
                                  y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)

    from automlToolkit.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'hmab_fe_%s_%d_%d.pkl' % (dataset, time_limit,
                                                     run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
Example #8
def evaluate_2rd_bandit(dataset, algo, time_limit, run_id, seed):
    print('HMAB-%s-%s: run_id=%d' % (dataset, algo, run_id))
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    train_data, test_data = load_train_test_data(dataset)
    enable_intersect = True
    bandit = SecondLayerBandit(algo,
                               train_data,
                               per_run_time_limit=300,
                               seed=seed,
                               eval_type='holdout',
                               mth='alter_hpo',
                               enable_intersection=enable_intersect)
    mth_id = 'hmab' if enable_intersect else 'hmab0'
    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config,
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_dir + '%s_2rd_bandit_%s_%d_%d_%s.pkl' % (
        mth_id, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2],
                    f)