Example No. 1
def conduct_fe(dataset='pc4',
               classifier_id='random_forest',
               iter_num=100,
               run_id=0,
               seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

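    # Build the classifier's hyperparameter search space, pin the 'estimator'
    # choice to the selected algorithm, and use its default configuration as
    # the fixed model while features are engineered.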
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(default_config,
                                        name='fe',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

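    # Baseline: score the default configuration on the raw features, both on
    # the holdout split and on the untouched test set.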
    val_acc = evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

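    # Evaluation-based feature-engineering optimizer with per-transformation
    # time and memory limits.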
    optimizer = EvaluationBasedOptimizer(task_type='classification',
                                         input_data=raw_data,
                                         evaluator=evaluator,
                                         model_id=classifier_id,
                                         time_limit_per_trans=240,
                                         mem_limit_per_trans=10000,
                                         seed=seed)

    task_id = 'fe-%s-%s-%d' % (dataset, classifier_id, iter_num)
    val_acc_list, test_acc_list = [], []

    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

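    # Each iteration proposes new transformations; the current incumbent is
    # applied to both train and test data so test performance can be tracked.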
    for _iter in range(iter_num):
        perf, _, incumbent = optimizer.iterate()
        val_acc_list.append(perf)
        train_node = optimizer.apply(raw_data, incumbent)
        test_node = optimizer.apply(test_raw_data, incumbent)
        estimator = fetch_predict_estimator(default_config, train_node.data[0],
                                            train_node.data[1])
        pred = estimator.predict(test_node.data[0])
        test_perf = balanced_accuracy(test_node.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
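
A hypothetical driver for this function might look as follows; save_dir and the remaining imports are assumed to be defined at module level, and the argument values are purely illustrative:

if __name__ == '__main__':
    # Run ten feature-engineering iterations for a random forest on pc4.
    conduct_fe(dataset='pc4', classifier_id='random_forest',
               iter_num=10, run_id=0, seed=1)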
Example No. 2
def evaluate_2rd_layered_bandit(run_id, mth='rb', dataset='pc4', algo='libsvm_svc',
                                cv='holdout', time_limit=120000, seed=1):
    train_data, test_data = load_train_test_data(dataset)
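    # Second-layer bandit that interleaves feature engineering ('fe') and
    # hyperparameter optimization ('hpo') for the chosen algorithm.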
    bandit = SecondLayerBandit(algo, train_data, dataset_id=dataset, mth=mth, seed=seed, eval_type=cv)

    _start_time = time.time()
    _iter_id = 0
    stats = list()

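    # Pull arms until the wall-clock budget is exhausted or the bandit
    # signals early stopping.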
    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

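    # Re-apply the incumbent feature-engineering result to the train/test data
    # and re-evaluate the incumbent hyperparameter configuration.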
    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config, name='fe', seed=seed, resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_folder + '%s_%s_%d_%d_%s.pkl' % (mth, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2], f)
Example No. 3
def conduct_hpo(dataset='pc4',
                classifier_id='random_forest',
                iter_num=100,
                run_id=0,
                seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='hpo',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    default_config = cs.get_default_configuration()
    val_acc = 1. - evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

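    # SMAC-based hyperparameter optimizer; each iterate() call runs
    # trials_per_iter trials under the per-run time limit.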
    optimizer = SMACOptimizer(evaluator,
                              cs,
                              trials_per_iter=2,
                              output_dir='logs',
                              per_run_time_limit=180)
    task_id = 'hpo-%s-%s-%d' % (dataset, classifier_id, iter_num)

    val_acc_list, test_acc_list = [], []
    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

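    # Each iteration retrains the returned configuration from scratch on the
    # raw training data and records its balanced accuracy on the test set.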
    for _iter in range(iter_num):
        perf, _, config = optimizer.iterate()
        val_acc_list.append(perf)
        estimator = fetch_predict_estimator(config, raw_data.data[0],
                                            raw_data.data[1])
        pred = estimator.predict(test_raw_data.data[0])
        test_perf = balanced_accuracy(test_raw_data.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
Example No. 4
def evaluate(train_data, test_data, config):
    X_train, y_train = train_data.data
    X_test, y_test = test_data.data
    print('X_train/test shapes: %s, %s' %
          (str(X_train.shape), str(X_test.shape)))

    # Build the ML estimator.
    from automlToolkit.components.evaluators.evaluator import fetch_predict_estimator
    estimator = fetch_predict_estimator(config, X_train, y_train)

    y_pred = estimator.predict(X_test)
    return balanced_accuracy(y_test, y_pred)
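
A minimal sketch of calling this helper, mirroring how the other examples construct a default configuration; the dataset id and classifier are illustrative, and load_train_test_data is assumed to be available at module level as in the examples above:

from autosklearn.pipeline.components.classification import _classifiers
from ConfigSpace.hyperparameters import UnParametrizedHyperparameter

train_data, test_data = load_train_test_data('pc4')
cs = _classifiers['random_forest'].get_hyperparameter_search_space()
cs.add_hyperparameter(UnParametrizedHyperparameter("estimator", 'random_forest'))
print(evaluate(train_data, test_data, cs.get_default_configuration()))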
Example No. 5
def evaluate_base_model(classifier_id, dataset):
    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset)

    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()
    X_train, y_train = train_data.data
    X_test, y_test = test_data.data
    print('X_train/test shapes: %s, %s' %
          (str(X_train.shape), str(X_test.shape)))

    # Build the ML estimator.
    from automlToolkit.components.evaluators.cls_evaluator import fetch_predict_estimator
    estimator = fetch_predict_estimator(default_config, X_train, y_train)

    y_pred = estimator.predict(X_test)
    print(balanced_accuracy(y_test, y_pred))
    print(balanced_accuracy(y_pred, y_test))
Example No. 6
def evaluate_autosklearn(algorithms,
                         dataset,
                         run_id,
                         trial_num,
                         seed,
                         time_limit=1200):
    print('==> Start to evaluate', dataset, 'budget', time_limit)
    include_models = algorithms
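    # Restrict the search to the given estimators, disable ensembling
    # (ensemble_size=1) and meta-learning warm starts; per_run_time_limit is
    # assumed to be defined at module level.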
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        per_run_time_limit=per_run_time_limit,
        include_preprocessors=None,
        exclude_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        ensemble_nbest=1,
        initial_configurations_via_metalearning=0,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    print(automl)

    train_data, test_data = load_train_test_data(dataset)
    X, y = train_data.data
    feat_type = [
        'Categorical' if _type == CATEGORICAL else 'Numerical'
        for _type in train_data.feature_types
    ]

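    # Fit with balanced accuracy as the optimization metric, passing
    # per-column feature types (categorical vs. numerical).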
    from autosklearn.metrics import balanced_accuracy
    automl.fit(X.copy(),
               y.copy(),
               metric=balanced_accuracy,
               feat_type=feat_type)
    model_desc = automl.show_models()
    print(model_desc)
    val_result = np.max(automl.cv_results_['mean_test_score'])
    print('Best validation accuracy', val_result)

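    # Refit the best pipeline on the full training set and evaluate on the
    # held-out test data.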
    X_test, y_test = test_data.data
    automl.refit(X.copy(), y.copy())
    y_pred = automl.predict(X_test)
    test_result = balanced_accuracy(y_test, y_pred)
    print('Test accuracy', test_result)
    save_path = project_dir + 'data/ausk_vanilla_%s_%d_%d_%d_%d.pkl' % (
        dataset, trial_num, len(algorithms), seed, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_result, test_result, model_desc], f)
Example No. 7
def evaluate_2rd_bandit(dataset, algo, time_limit, run_id, seed):
    print('HMAB-%s-%s: run_id=%d' % (dataset, algo, run_id))
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    train_data, test_data = load_train_test_data(dataset)
    enable_intersect = True
    bandit = SecondLayerBandit(algo,
                               train_data,
                               per_run_time_limit=300,
                               seed=seed,
                               eval_type='holdout',
                               mth='alter_hpo',
                               enable_intersection=enable_intersect)
    mth_id = 'hmab' if enable_intersect else 'hmab0'
    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config,
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_dir + '%s_2rd_bandit_%s_%d_%d_%s.pkl' % (
        mth_id, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2],
                    f)