示例#1
0
def evaluate_c():
    """Evaluate the 'mono_smbo' optimizer while sweeping its `param` value.

    For each dataset: repeat `rep_num` runs, and within each run try every
    parameter value in the sweep list. Accumulated test accuracies are
    pickled after every repetition so partial progress is kept on disk.
    """
    rep_num, run_count = 10, 500
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_c'
    print(rep_num, run_count, datasets, task_id)

    for dataset in datasets:
        # Ensure the per-dataset output directory exists.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        scores = {}
        seed_list = get_seeds(dataset, start_id + rep_num)
        for run_id in range(start_id, start_id + rep_num):
            run_seed = seed_list[run_id]

            # Hold out 20% of the data as a fixed, stratified test split.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            # Sweep the optimizer's parameter values.
            for p in [1, 4, 10, 14, 16, 20]:
                task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count,
                                                        run_id, p)
                mode, optimizer = 3, 'mono_smbo'
                print('Test %s optimizer => %s' % (optimizer, task_name))

                # Fit the AutoML classifier and score it on the test split.
                cls = Classifier(optimizer=optimizer, seed=run_seed).fit(
                    dm,
                    metric='accuracy',
                    runcount=run_count,
                    task_name=task_name,
                    update_mode=mode,
                    param=p)
                key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, run_id, p,
                                             optimizer)
                scores[key_id] = cls.score(X_test, y_test)

            # Persist the cumulative results after each repetition.
            print(scores)
            out_path = 'data/%s/%s_test_%s_%d_%d_%d.pkl' % (
                dataset_id, dataset, task_id, run_count, rep_num, start_id)
            with open(out_path, 'wb') as f:
                pickle.dump(scores, f)
示例#2
0
def test_cash_module():
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    optimizer_algos = args.opt_algo.split(',')
    task_id = args.task_id
    print(rep_num, run_count, datasets, optimizer_algos, task_id)

    result = dict()
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        seeds = get_seeds(dataset, rep_num)
        for run_id in range(start_id, rep_num):
            task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            # Test each optimizer algorithm:
            for optimizer in optimizer_algos:
                # Parse the parameters for each optimizer.
                mode = 2
                eta, r = 2, 2
                if optimizer.startswith('baseline'):
                    optimizer, mode = optimizer.split('_')
                    mode = 1 if mode == 'rand' else 2
                if optimizer.startswith('sh'):
                    if len(optimizer.split('_')) == 2:
                        optimizer, eta = optimizer.split('_')
                        eta = float(eta)
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('rl'):
                    if len(optimizer.split('_')) == 3:
                        _, mode, eta = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'rl_smbo'
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('ts_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'ts_smbo'
                if optimizer.startswith('mcmc_ts'):
                    _, _, mode, eta, r = optimizer.split('_')
                    mode = int(mode)
                    eta = int(eta)
                    r = int(r)
                    optimizer = 'mcmc_ts_smbo'

                if optimizer.startswith('ucb_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'ucb_smbo'

                if optimizer.startswith('mono_smbo'):
                    mode = 2
                    if len(optimizer.split('_')) == 4:
                        _, _, mode, r = optimizer.split('_')
                        mode, r = int(mode), int(r)
                        eta = 10
                        optimizer = 'mono_smbo'

                print('Test %s optimizer => %s' % (optimizer, task_name))

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode,
                                                eta=eta,
                                                r=r,
                                                param=eta)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = acc

            # Display and save the test result.
            print(result)
            with open(
                    'data/%s/%s_test_result_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset_id, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
示例#3
0
from time import time

# Silence library warnings so the benchmark output stays readable.
warnings.filterwarnings("ignore")

# CLI: which dataset to load and how many features AutoFeature should
# generate (a value <= 0 disables feature generation entirely).
parser = argparse.ArgumentParser()
parser.add_argument("--generated_feature", type=int, default=1)
parser.add_argument("--dataset", type=str)
args = parser.parse_args()

x, y, c = load_data(args.dataset)

dm = DataManager(x, y)

# Baseline: plain logistic regression on the raw features, scored on the
# DataManager's validation split.
lr = LogisticRegression()
lr.fit(dm.train_X, dm.train_y)
y_pred = lr.predict(dm.val_X)
print("original lr accu:", accuracy_score(dm.val_y, y_pred), flush=True)

# Optionally augment the data with automatically generated features.
if args.generated_feature > 0:
    af = AutoFeature("accuracy", "auto_cross")
    af.fit(dm, args.generated_feature)
    dm = af.transform(dm)

# Run the AutoML classifier on the (possibly augmented) data and time it.
clf = Classifier()
start_time = time()
clf.fit(dm, metric="accuracy", runcount=50)
print("alphaml time:", time() - start_time)
print("dataset:", args.dataset)
print("generated data:", args.generated_feature, ", alphaml score:",
      clf.score(dm.val_X, dm.val_y))
示例#4
0
def test_exp4_runtime():
    """Benchmark optimizers under a shared evaluation budget.

    'tpe' must be the LAST entry of ``optimizer_algos`` (asserted below):
    its run count is derived from the smallest mean number of evaluated
    configurations recorded for the preceding optimizers, so every
    optimizer gets a comparable budget. Results (CASH test accuracy plus
    ensemble validation/test accuracy) are pickled per optimizer.
    """
    rep_num = args.rep
    run_count = args.run_count
    B = args.B  # runtime budget; presumably seconds -- TODO confirm units
    if B > 0:
        # A positive runtime budget replaces the fixed run-count budget.
        run_count = 0

    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp4_runtime"

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        # Per-optimizer lists of observed configuration counts; these feed
        # the tpe budget estimate below.
        runcount_dict = dict()
        tpe_runcount = 0.

        optimizer_algos = ['mono_smbo_4', 'smbo', 'tpe']
        # optimizer_algos = ['mono_smbo_3_0']
        # Test each optimizer algorithm:
        assert optimizer_algos[-1] == 'tpe'
        for opt_algo in optimizer_algos:
            # if algo is tpe, we need to estimate its runcount in one hour.
            if opt_algo != 'tpe':
                runcount_dict[opt_algo] = list()
            else:
                # Give tpe the smallest mean run count seen so far.
                count_list = list()
                for key in runcount_dict.keys():
                    count_list.append(np.mean(runcount_dict[key]))
                assert len(count_list) > 0
                tpe_runcount = np.min(count_list)
                print('=' * 50, tpe_runcount)

            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('mono_smbo'):
                mode = 2
                if len(opt_algo.split('_')) == 3:
                    # Spec 'mono_smbo_<mode>': mode overridden, eta fixed.
                    _, _, mode = opt_algo.split('_')
                    mode = int(mode)
                    eta = 10
                    optimizer = 'mono_smbo'
            else:
                optimizer = opt_algo

            print('Test optimizer: %s' % optimizer)

            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                # Task name encodes the budget B only when it is active.
                if B > 0:
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, B,
                                                            run_count, run_id)
                else:
                    task_name = dataset + '_%s_%d_%d' % (task_id, run_count,
                                                         run_id)
                seed = seeds[run_id]

                runcount_const = run_count if opt_algo != 'tpe' else tpe_runcount
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=runcount_const,
                                                runtime=B,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)

                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)

                # Load CASH intermediate infos.
                if optimizer == 'smbo':
                    file_id = 'smac'
                elif optimizer == 'tpe':
                    file_id = 'hyperopt'
                elif optimizer == 'mono_smbo':
                    file_id = 'mm_bandit_%d_smac' % mode
                else:
                    raise ValueError('Invalid optimizer!')

                tmp_task_id = '%s_%d' % (task_id, B) if B > 0 else task_id
                tmp_configs, tmp_perfs = load_infos(dataset, tmp_task_id,
                                                    run_count, run_id, file_id)
                # Record how many configurations this optimizer evaluated.
                if opt_algo != 'tpe':
                    runcount_dict[opt_algo].append(len(tmp_configs))

                # Build an ensemble from the evaluated configurations.
                model_infos = (tmp_configs, tmp_perfs)
                ensemble_size = 50
                task_type = type_of_target(dm.train_y)
                if optimizer == 'tpe':
                    task_type = 'hyperopt_' + task_type
                metric = accuracy_score

                ensemble_model = EnsembleSelection(model_infos,
                                                   ensemble_size,
                                                   task_type,
                                                   metric,
                                                   n_best=20)
                ensemble_model.fit(dm)

                ens_val_pred = ensemble_model.predict(dm.val_X)
                ens_val_acc = accuracy_score(ens_val_pred, dm.val_y)

                ens_pred = ensemble_model.predict(X_test)
                ens_test_acc = accuracy_score(ens_pred, y_test)

                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                # [CASH test acc, ensemble val acc, ensemble test acc].
                result[key_id] = [cash_test_acc, ens_val_acc, ens_test_acc]
                print(result)

            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, opt_algo, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
示例#5
0
def test_exp2_evaluation():
    """Evaluate the bandit-style optimizers ('cmab_ts' and 'rl_smbo'
    variants) and pickle per-run CASH test accuracies for each algorithm."""
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp_2_evaluation"

    for dataset in datasets:
        # Create the per-dataset result directory on first use.
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # Fixed 80/20 stratified split shared by all optimizers.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        dm = DataManager(X_train, y_train)

        # Algorithm specs under test (an 'rl_1_0.3' variant was dropped).
        algo_specs = ['cmab_ts', 'rl_2_1', 'rl_3_0']
        for spec in algo_specs:
            scores = dict()
            mode, eta = None, None
            # Decode the spec: 'rl_<mode>_<eta>' maps onto 'rl_smbo'.
            if not spec.startswith('rl'):
                optimizer = spec
            elif len(spec.split('_')) == 3:
                _, mode, eta = spec.split('_')
                mode = int(mode)
                optimizer = 'rl_smbo'
                eta = float(eta)
            else:
                raise ValueError('Wrong params!')

            print('Test optimizer: %s' % optimizer)

            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                task_name = dataset + '_%s_%d_%d' % (task_id, run_count,
                                                     run_id)

                # Fit the AutoML classifier for this repetition.
                cls = Classifier(optimizer=optimizer,
                                 seed=seeds[run_id]).fit(dm,
                                                         metric='accuracy',
                                                         runcount=run_count,
                                                         runtime=None,
                                                         task_name=task_name,
                                                         update_mode=mode,
                                                         param=eta)

                # Record the CASH performance on the held-out test set.
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                scores[key_id] = [cls.score(X_test, y_test)]
                print(scores)

            # Save the accumulated results for this algorithm.
            out_file = 'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' % (
                dataset_id, dataset, spec, task_id, run_count, rep_num,
                start_id)
            with open(out_file, 'wb') as f:
                pickle.dump(scores, f)
示例#6
0
def evaluate_k():
    """Evaluate optimizers while varying K, the number of candidate model
    families exposed to the AutoML search (``n_est`` in the sweep below).

    Test accuracies are pickled incrementally after every repetition.
    """
    # Ordered candidate model pool; the first `n_est` entries are used.
    algo_list = [
        'xgboost', 'liblinear_svc', 'gradient_boosting', 'decision_tree',
        'passive_aggressive', 'qda', 'random_forest', 'sgd', 'extra_trees',
        'lda', 'gaussian_nb', 'libsvm_svc', 'logistic_regression', 'adaboost',
        'k_nearest_neighbors'
    ]

    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_k'
    print(rep_num, run_count, datasets, task_id)

    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        # opt_algos = ['mono_smbo_3_0', 'smbo', 'baseline_2', 'tpe']
        opt_algos = ['mono_smbo_3_0', 'smbo', 'baseline_2']
        for algo in opt_algos:
            result = dict()
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                seed = seeds[run_id]

                # Test each optimizer algorithm:
                for n_est in [15, 12, 8, 4, 2, 1]:
                    # Restrict the search to the first n_est model families.
                    algos = algo_list[:n_est]
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count,
                                                            run_id, n_est)
                    # Map the spec string onto optimizer name + parameters.
                    mode, param = 3, None
                    if algo.startswith('mono_smbo'):
                        optimizer = 'mono_smbo'
                        mode, param = 3, 10
                    elif algo.startswith('baseline'):
                        optimizer = 'baseline'
                        mode = 2
                    else:
                        optimizer = algo

                    print('Test %s optimizer => %s' % (optimizer, task_name))

                    # Construct the AutoML classifier.
                    cls = Classifier(optimizer=optimizer,
                                     seed=seed,
                                     include_models=algos).fit(
                                         dm,
                                         metric='accuracy',
                                         runcount=run_count,
                                         task_name=task_name,
                                         update_mode=mode,
                                         param=param)
                    acc = cls.score(X_test, y_test)
                    key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, n_est,
                                                 run_id, optimizer)
                    result[key_id] = acc

                # Display and save the test result.
                print(result)

                with open(
                        'data/%s/%s_test_%s_%d_%d_%d.pkl' %
                    (dataset_id, dataset, algo, run_count, rep_num, start_id),
                        'wb') as f:
                    pickle.dump(result, f)