Example #1
def test_estimator():
    import os
    import numpy as np
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

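    # `args` is the module-level argparse namespace (parsed outside this snippet).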
    rep_num = args.rep
    run_count = args.run_count
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        task_format = dataset + '_est_%d'
        X, y, _ = load_data(dataset)
        dm = DataManager(X, y)
        seed = np.random.random_integers(MAX_INT)
        for optimizer in ['smbo']:
            cls = Classifier(include_models=['gradient_boosting'],
                             optimizer=optimizer,
                             seed=seed).fit(dm,
                                            metric='accuracy',
                                            runcount=run_count,
                                            task_name=task_format)
            print(cls.predict(X))
Example #2
def test_hyperspace():
    import numpy as np
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.random_integers(MAX_INT)

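                # Evaluate both update modes of the ts_smbo optimizer.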
                for update_mode in [2, 3]:
                    task_format = dataset + '_mode_%d_%d' % (update_mode,
                                                             run_id)
                    cls = Classifier(optimizer='ts_smbo',
                                     seed=seed).fit(dm,
                                                    metric='accuracy',
                                                    runcount=run_count,
                                                    task_name=task_format,
                                                    update_mode=update_mode)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
Example #3
def evaluate_c():
    rep_num = 10
    run_count = 500
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_c'
    print(rep_num, run_count, datasets, task_id)

    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        result = dict()
        seeds = get_seeds(dataset, start_id + rep_num)
        for run_id in range(start_id, start_id + rep_num):
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            # Evaluate each value of the parameter p with the mono_smbo optimizer:
            for p in [1, 4, 10, 14, 16, 20]:
                task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count,
                                                        run_id, p)
                mode = 3
                optimizer = 'mono_smbo'

                print('Test %s optimizer => %s' % (optimizer, task_name))

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=p)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, run_id, p,
                                             optimizer)
                result[key_id] = acc

            # Display and save the test result.
            print(result)
            with open(
                    'data/%s/%s_test_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, task_id, run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)
Example #4
File: test.py  Project: zwt233/alpha-ml
def test_auto():
    from sklearn.datasets import load_breast_cancer
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier

    X, y = load_breast_cancer(return_X_y=True)
    # Classifier(exclude_models=['libsvm_svc']).fit(DataManager(X, y))

    for _ in range(5):
        Classifier(include_models=['adaboost', 'gradient_boosting', 'random_forest'],
                   optimizer='ts_smac').fit(DataManager(X, y))

    for _ in range(5):
        Classifier(include_models=['adaboost', 'gradient_boosting', 'random_forest'],
                   optimizer='smac').fit(DataManager(X, y))
Example #5
def test_cash_module():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score
    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("lyqdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []
        for i in range(2, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))

        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'random_forest', 'logistic_regression', 'mlp'],
            include_models=['mlp'],
            optimizer='smbo',
            cross_valid=False,
            ensemble_method='ensemble_selection',
            ensemble_size=args.ensemble_size,
            save_dir='data/save_models'
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)

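        # Load the held-out test set from a second workbook.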
        sheet = xlrd.open_workbook("lyqtestdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_test = []
        y_test = []
        for i in range(1, nrows):
            X_test.append(sheet.row_values(i, start_colx=1))
            y_test.append(int(sheet.cell_value(i, 0)))

        pred = cls.predict_proba(X_test)
        result.append(roc_auc_score(y_test, pred[:, 1]))
        print(result)

    import pickle
    with open('result.pkl', 'wb') as f:
        pickle.dump(result, f)
Example #6
def test_cash_module():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder

    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("ybai_Keratoconus_TJ_20190425.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []

        for i in range(1, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))

        encoder = OneHotEncoder()
        encoder.fit(np.reshape(y_train, (len(y_train), 1)))
        X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'xgboost', 'random_forest', 'logistic_regression', 'mlp'],
            optimizer='smbo',
            ensemble_method='bagging',
            ensemble_size=args.ensemble_size,
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)

        pred = cls.predict_proba(X_test)
        print(pred)
        y_test = encoder.transform(np.reshape(y_test, (len(y_test), 1))).toarray()
        result.append(roc_auc_score(y_test, pred))
        print(result)

        import pickle
        with open('result.pkl', 'wb') as f:
            pickle.dump(result, f)
Example #7
def test_cash_module():
    df = pd.read_csv("data/cls_data/santander/train.csv")
    df = df.drop(columns=["ID"])
    cls = Classifier(include_models=['xgboost', 'random_forest', 'decision_tree'],
                     optimizer='baseline',
                     ensemble_method='ensemble_selection',
                     ensemble_size=30,
                     ).fit(df, metric='auc', runcount=1)
    df = pd.read_csv("data/cls_data/santander/test.csv")
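    # Keep the raw values (including the ID column) before dropping it for prediction.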
    data = df.values
    df = df.drop(columns=["ID"])
    pred2 = cls.predict(df)
    print(pred2)

    import csv
    with open('data/cls_data/santander/submission.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ID', 'TARGET'])
        for i in range(len(pred2)):
            line = [int(data[i, 0]), pred2[i]]
            writer.writerow(line)
Example #8
def test_no_free_lunch():
    from sklearn.model_selection import train_test_split
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data

    for dataset in datasets:
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(rep_num):
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            for algo in algo_list:
                for optimizer in ['smbo']:
                    task_format = dataset + '_' + algo + '_%d_%d'
                    cls = Classifier(
                        include_models=[algo], optimizer=optimizer, seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count, task_name=task_format % (run_count, run_id))
                    print(cls.predict(X))
Example #9
def test_hyperspace():
    import numpy as np
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.random_integers(MAX_INT)

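                # Vary the number of candidate algorithms in the hyperspace.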
                for n_est in [1, 2, 4, 8, 12]:
                    algos = algo_list[:n_est]
                    task_format = dataset + '_hp_%d_%d' % (n_est, run_id)
                    cls = Classifier(
                        include_models=algos, optimizer='smbo', seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count, task_name=task_format)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
Example #10
def test_claim():
    import pickle
    import numpy as np
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    perfs_list = list()
    for dataset in datasets:
        for run_id in range(rep_num):
            X, y, _ = load_data(dataset)
            dm = DataManager(X, y)
            seed = np.random.random_integers(MAX_INT)
            task_format = dataset + '_claim_%d'

            for optimizer in ['smbo']:
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

                file_id = 'data/%s/%s_claim_%d_%s.data' % (dataset, dataset,
                                                           run_id, 'smac')
                with open(file_id, 'rb') as f:
                    data = pickle.load(f)

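                # If several configurations tie for the best score, pick the estimator that occurs most often among them.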
                best_id = np.argmax(data['perfs'])
                best_value = data['perfs'][best_id]
                if data['perfs'].count(best_value) > 1:
                    stats = dict()
                    for conf, perf in zip(data['configs'], data['perfs']):
                        if perf == best_value:
                            est = conf['estimator']
                            if est not in stats:
                                stats[est] = 0
                            stats[est] += 1
                    tmp_id = int(np.argmax(list(stats.values())))
                    best_estimator = list(stats.keys())[tmp_id]
                    print('=' * 20, best_value, stats)
                else:
                    best_estimator = data['configs'][best_id]['estimator']
                    print('=' * 20, data['perfs'][best_id],
                          data['configs'][best_id])

                run_cnts = len([
                    item for item in data['configs']
                    if item['estimator'] == best_estimator
                ])

                task_format = dataset + '_claim_single_%d'
                cls = Classifier(include_models=[best_estimator],
                                 optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_cnts,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

                file_id = 'data/%s/%s_claim_single_%d_%s.data' % (
                    dataset, dataset, run_id, 'smac')
                with open(file_id, 'rb') as f:
                    data_s = pickle.load(f)
                print('=' * 20 + 'single', max(data_s['perfs']))
                perfs_list.append((data['perfs'], data_s['perfs']))

    for item in perfs_list:
        item1, item2 = item
        print(len(item1), max(item1), len(item2), max(item2))
    print('=' * 50)
    print(perfs_list)
Example #11
def test_cash_module():
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    optimizer_algos = args.opt_algo.split(',')
    task_id = args.task_id
    print(rep_num, run_count, datasets, optimizer_algos, task_id)

    result = dict()
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        seeds = get_seeds(dataset, rep_num)
        for run_id in range(start_id, rep_num):
            task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            # Test each optimizer algorithm:
            for optimizer in optimizer_algos:
                # Parse the parameters for each optimizer.
                mode = 2
                eta, r = 2, 2
                if optimizer.startswith('baseline'):
                    optimizer, mode = optimizer.split('_')
                    mode = 1 if mode == 'rand' else 2
                if optimizer.startswith('sh'):
                    if len(optimizer.split('_')) == 2:
                        optimizer, eta = optimizer.split('_')
                        eta = float(eta)
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('rl'):
                    if len(optimizer.split('_')) == 3:
                        _, mode, eta = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'rl_smbo'
                    else:
                        raise ValueError('Wrong RL params!')
                if optimizer.startswith('ts_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'ts_smbo'
                if optimizer.startswith('mcmc_ts'):
                    _, _, mode, eta, r = optimizer.split('_')
                    mode = int(mode)
                    eta = int(eta)
                    r = int(r)
                    optimizer = 'mcmc_ts_smbo'

                if optimizer.startswith('ucb_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'ucb_smbo'

                if optimizer.startswith('mono_smbo'):
                    mode = 2
                    if len(optimizer.split('_')) == 4:
                        _, _, mode, r = optimizer.split('_')
                        mode, r = int(mode), int(r)
                        eta = 10
                        optimizer = 'mono_smbo'

                print('Test %s optimizer => %s' % (optimizer, task_name))

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode,
                                                eta=eta,
                                                r=r,
                                                param=eta)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = acc

            # Display and save the test result.
            print(result)
            with open(
                    'data/%s/%s_test_result_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset_id, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
Example #12
import argparse
import warnings
from time import time

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from alphaml.engine.components.data_manager import DataManager
from alphaml.estimators.classifier import Classifier
from alphaml.datasets.cls_dataset.dataset_loader import load_data

warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser()
parser.add_argument("--generated_feature", type=int, default=1)
parser.add_argument("--dataset", type=str)
args = parser.parse_args()

x, y, c = load_data(args.dataset)

dm = DataManager(x, y)

lr = LogisticRegression()
lr.fit(dm.train_X, dm.train_y)
y_pred = lr.predict(dm.val_X)
print("original lr accu:", accuracy_score(dm.val_y, y_pred), flush=True)

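# Optionally generate cross features with AutoFeature (its import is not shown in this snippet).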
if args.generated_feature > 0:
    af = AutoFeature("accuracy", "auto_cross")
    af.fit(dm, args.generated_feature)
    dm = af.transform(dm)

clf = Classifier()
start_time = time()
clf.fit(dm, metric="accuracy", runcount=50)
print("alphaml time:", time() - start_time)
print("dataset:", args.dataset)
print("generated data:", args.generated_feature, ", alphaml score:",
      clf.score(dm.val_X, dm.val_y))
Example #13
def test_exp4_runtime():
    rep_num = args.rep
    run_count = args.run_count
    B = args.B
    if B > 0:
        run_count = 0

    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp4_runtime"

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        runcount_dict = dict()
        tpe_runcount = 0.

        optimizer_algos = ['mono_smbo_4', 'smbo', 'tpe']
        # optimizer_algos = ['mono_smbo_3_0']
        # Test each optimizer algorithm:
        assert optimizer_algos[-1] == 'tpe'
        for opt_algo in optimizer_algos:
            # If the algorithm is TPE, estimate the run count it can afford within the time budget
            # from the other optimizers' evaluation counts.
            if opt_algo != 'tpe':
                runcount_dict[opt_algo] = list()
            else:
                count_list = list()
                for key in runcount_dict.keys():
                    count_list.append(np.mean(runcount_dict[key]))
                assert len(count_list) > 0
                tpe_runcount = np.min(count_list)
                print('=' * 50, tpe_runcount)

            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('mono_smbo'):
                mode = 2
                if len(opt_algo.split('_')) == 3:
                    _, _, mode = opt_algo.split('_')
                    mode = int(mode)
                    eta = 10
                    optimizer = 'mono_smbo'
            else:
                optimizer = opt_algo

            print('Test optimizer: %s' % optimizer)

            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                if B > 0:
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, B,
                                                            run_count, run_id)
                else:
                    task_name = dataset + '_%s_%d_%d' % (task_id, run_count,
                                                         run_id)
                seed = seeds[run_id]

                runcount_const = run_count if opt_algo != 'tpe' else tpe_runcount
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=runcount_const,
                                                runtime=B,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)

                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)

                # Load CASH intermediate infos.
                if optimizer == 'smbo':
                    file_id = 'smac'
                elif optimizer == 'tpe':
                    file_id = 'hyperopt'
                elif optimizer == 'mono_smbo':
                    file_id = 'mm_bandit_%d_smac' % mode
                else:
                    raise ValueError('Invalid optimizer!')

                tmp_task_id = '%s_%d' % (task_id, B) if B > 0 else task_id
                tmp_configs, tmp_perfs = load_infos(dataset, tmp_task_id,
                                                    run_count, run_id, file_id)
                if opt_algo != 'tpe':
                    runcount_dict[opt_algo].append(len(tmp_configs))

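                # Build an ensemble over the evaluated configurations via ensemble selection.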
                model_infos = (tmp_configs, tmp_perfs)
                ensemble_size = 50
                task_type = type_of_target(dm.train_y)
                if optimizer == 'tpe':
                    task_type = 'hyperopt_' + task_type
                metric = accuracy_score

                ensemble_model = EnsembleSelection(model_infos,
                                                   ensemble_size,
                                                   task_type,
                                                   metric,
                                                   n_best=20)
                ensemble_model.fit(dm)

                ens_val_pred = ensemble_model.predict(dm.val_X)
                ens_val_acc = accuracy_score(ens_val_pred, dm.val_y)

                ens_pred = ensemble_model.predict(X_test)
                ens_test_acc = accuracy_score(ens_pred, y_test)

                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = [cash_test_acc, ens_val_acc, ens_test_acc]
                print(result)

            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, opt_algo, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
Example #14
def test_exp2_evaluation():
    rep_num = args.rep
    run_count = args.run_count

    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp_2_evaluation"

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        # optimizer_algos = ['cmab_ts', 'rl_1_0.3', 'rl_2_1', 'rl_3_0']
        optimizer_algos = ['cmab_ts', 'rl_2_1', 'rl_3_0']
        # Test each optimizer algorithm:
        for opt_algo in optimizer_algos:
            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('rl'):
                if len(opt_algo.split('_')) == 3:
                    _, mode, eta = opt_algo.split('_')
                    mode = int(mode)
                    optimizer = 'rl_smbo'
                    eta = float(eta)
                else:
                    raise ValueError('Wrong params!')
            else:
                optimizer = opt_algo

            print('Test optimizer: %s' % optimizer)

            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                task_name = dataset + '_%s_%d_%d' % (task_id, run_count,
                                                     run_id)
                seed = seeds[run_id]

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                runtime=None,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)

                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = [cash_test_acc]
                print(result)

            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, opt_algo, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
Example #15
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1])

df.drop(columns="Ticket", axis=1, inplace=True)

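# Binarize the Cabin column: 0 for the one specific cabin string below, 1 otherwise.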
for i in range(df.shape[0]):
    if df.loc[i, "Cabin"] == "C23 C25 C27":
        df.loc[i, "Cabin"] = 0
    else:
        df.loc[i, "Cabin"] = 1

df["Cabin"] = df["Cabin"].astype("float")

df = pd.get_dummies(df)

x = df.values

x_train = x[:train_size]
x_test = x[train_size:]

dm = DataManager()
dm.train_X = x_train
dm.train_y = y_train


clf = Classifier(optimizer="smbo")
clf.fit(dm, metric="accuracy", runcount=200)

submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv")
submission["Survived"] = clf.predict(x_test)
submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)
Example #16
def evaluate_k():
    algo_list = [
        'xgboost', 'liblinear_svc', 'gradient_boosting', 'decision_tree',
        'passive_aggressive', 'qda', 'random_forest', 'sgd', 'extra_trees',
        'lda', 'gaussian_nb', 'libsvm_svc', 'logistic_regression', 'adaboost',
        'k_nearest_neighbors'
    ]

    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_k'
    print(rep_num, run_count, datasets, task_id)

    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        # opt_algos = ['mono_smbo_3_0', 'smbo', 'baseline_2', 'tpe']
        opt_algos = ['mono_smbo_3_0', 'smbo', 'baseline_2']
        for algo in opt_algos:
            result = dict()
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                seed = seeds[run_id]

                # Test each hyperspace size (number of candidate algorithms):
                for n_est in [15, 12, 8, 4, 2, 1]:
                    algos = algo_list[:n_est]
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count,
                                                            run_id, n_est)
                    mode, param = 3, None
                    if algo.startswith('mono_smbo'):
                        optimizer = 'mono_smbo'
                        mode, param = 3, 10
                    elif algo.startswith('baseline'):
                        optimizer = 'baseline'
                        mode = 2
                    else:
                        optimizer = algo

                    print('Test %s optimizer => %s' % (optimizer, task_name))

                    # Construct the AutoML classifier.
                    cls = Classifier(optimizer=optimizer,
                                     seed=seed,
                                     include_models=algos).fit(
                                         dm,
                                         metric='accuracy',
                                         runcount=run_count,
                                         task_name=task_name,
                                         update_mode=mode,
                                         param=param)
                    acc = cls.score(X_test, y_test)
                    key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, n_est,
                                                 run_id, optimizer)
                    result[key_id] = acc

                # Display and save the test result.
                print(result)

                with open(
                        'data/%s/%s_test_%s_%d_%d_%d.pkl' %
                    (dataset_id, dataset, algo, run_count, rep_num, start_id),
                        'wb') as f:
                    pickle.dump(result, f)