def test_estimator():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    rep_num = args.rep
    run_count = args.run_count
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        task_format = dataset + '_est_%d'
        X, y, _ = load_data(dataset)
        dm = DataManager(X, y)
        # np.random.random_integers is deprecated; draw the seed with randint.
        seed = np.random.randint(MAX_INT)
        for optimizer in ['smbo']:
            cls = Classifier(
                include_models=['gradient_boosting'],
                optimizer=optimizer,
                seed=seed
            ).fit(dm, metric='accuracy', runcount=run_count, task_name=task_format)
            print(cls.predict(X))
def test_hyperspace():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                # np.random.random_integers is deprecated; use randint instead.
                seed = np.random.randint(MAX_INT)
                for update_mode in [2, 3]:
                    task_format = dataset + '_mode_%d_%d' % (update_mode, run_id)
                    cls = Classifier(
                        optimizer='ts_smbo',
                        seed=seed
                    ).fit(dm, metric='accuracy', runcount=run_count,
                          task_name=task_format, update_mode=update_mode)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
def test_cash_module():
    df = pd.read_csv("data/cls_data/santander/train.csv")
    df = df.drop(columns=["ID"])
    cls = Classifier(
        include_models=['xgboost', 'random_forest', 'decision_tree'],
        optimizer='baseline',
        ensemble_method='ensemble_selection',
        ensemble_size=30,
    ).fit(df, metric='auc', runcount=1)

    df = pd.read_csv("data/cls_data/santander/test.csv")
    data = df.values  # keep the raw values so the ID column survives the drop below
    df = df.drop(columns=["ID"])
    pred2 = cls.predict(df)
    print(pred2)

    # Write the Kaggle-style submission file: one (ID, TARGET) row per prediction.
    import csv
    with open('data/cls_data/santander/submission.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ID', 'TARGET'])
        for i in range(len(pred2)):
            line = [int(data[i, 0]), pred2[i]]
            writer.writerow(line)
def test_no_free_lunch():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from sklearn.model_selection import train_test_split

    for dataset in datasets:
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(rep_num):
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            for algo in algo_list:
                for optimizer in ['smbo']:
                    task_format = dataset + '_' + algo + '_%d_%d'
                    cls = Classifier(
                        include_models=[algo],
                        optimizer=optimizer,
                        seed=seed
                    ).fit(dm, metric='accuracy', runcount=run_count,
                          task_name=task_format % (run_count, run_id))
                    print(cls.predict(X))
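
# `get_seeds` is not defined in this snippet. The sketch below is one plausible
# implementation (an assumption, not the project's actual helper): it derives a
# reproducible list of `rep_num` seeds from the dataset name, so repeated runs
# on the same dataset reuse the same seed sequence.
def get_seeds(dataset, rep_num):
    import zlib
    # crc32 gives a deterministic 32-bit base seed, unlike Python's salted hash().
    base = zlib.crc32(dataset.encode('utf-8'))
    rng = np.random.RandomState(base)
    return rng.randint(2 ** 31 - 1, size=rep_num).tolist()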
# Renamed from `test_hyperspace`: the module defined two functions under that
# name, so this second definition silently shadowed the first one above.
def test_hyperspace_size():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.randint(MAX_INT)
                # Grow the hyperparameter space by including more candidate algorithms.
                for n_est in [1, 2, 4, 8, 12]:
                    algos = algo_list[:n_est]
                    task_format = dataset + '_hp_%d_%d' % (n_est, run_id)
                    cls = Classifier(
                        include_models=algos,
                        optimizer='smbo',
                        seed=seed
                    ).fit(dm, metric='accuracy', runcount=run_count,
                          task_name=task_format)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
def test_claim():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    perfs_list = list()
    for dataset in datasets:
        for run_id in range(rep_num):
            X, y, _ = load_data(dataset)
            dm = DataManager(X, y)
            seed = np.random.randint(MAX_INT)

            task_format = dataset + '_claim_%d'
            for optimizer in ['smbo']:
                cls = Classifier(optimizer=optimizer, seed=seed).fit(
                    dm, metric='accuracy', runcount=run_count,
                    task_name=task_format % run_id)
                print(cls.predict(X))

            file_id = 'data/%s/%s_claim_%d_%s.data' % (dataset, dataset, run_id, 'smac')
            with open(file_id, 'rb') as f:
                data = pickle.load(f)

            best_id = np.argmax(data['perfs'])
            best_value = data['perfs'][best_id]
            if data['perfs'].count(best_value) > 1:
                # Break ties: among configurations that reached the best value,
                # pick the estimator that reached it most often.
                stats = dict()
                for conf, perf in zip(data['configs'], data['perfs']):
                    if perf == best_value:
                        est = conf['estimator']
                        if est not in stats:
                            stats[est] = 0
                        stats[est] += 1
                # np.argmax on a dict view is a bug; convert to a list first.
                tmp_id = np.argmax(list(stats.values()))
                best_estimator = list(stats.keys())[tmp_id]
                print('=' * 20, best_value, stats)
            else:
                best_estimator = data['configs'][best_id]['estimator']
                print('=' * 20, data['perfs'][best_id], data['configs'][best_id])

            # Give the single-estimator run the same budget the best estimator
            # consumed in the joint run.
            run_cnts = len([item for item in data['configs']
                            if item['estimator'] == best_estimator])

            task_format = dataset + '_claim_single_%d'
            cls = Classifier(
                include_models=[best_estimator],
                optimizer=optimizer,
                seed=seed
            ).fit(dm, metric='accuracy', runcount=run_cnts,
                  task_name=task_format % run_id)
            print(cls.predict(X))

            file_id = 'data/%s/%s_claim_single_%d_%s.data' % (
                dataset, dataset, run_id, 'smac')
            with open(file_id, 'rb') as f:
                data_s = pickle.load(f)
            print('=' * 20 + 'single', max(data_s['perfs']))
            perfs_list.append((data['perfs'], data_s['perfs']))

    for item in perfs_list:
        item1, item2 = item
        print(len(item1), max(item1), len(item2), max(item2))
        print('=' * 50)
    print(perfs_list)
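
# The test functions above reference module-level names (`args`, `datasets`,
# `rep_num`, `run_count`, `start_run`, `algo_list`, plus the `os`, `pickle`,
# `np`, and `pd` imports) that are not shown in this snippet. A minimal sketch
# of that setup follows; in the full module it would sit at the top of the
# file. The flag names, defaults, and the algorithm list are assumptions, not
# the project's actual values.
import argparse
import os
import pickle

import numpy as np
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument('--rep', type=int, default=5)
parser.add_argument('--run_count', type=int, default=100)
parser.add_argument('--datasets', type=str, default='iris')
parser.add_argument('--start_run', type=int, default=0)
args = parser.parse_args()

rep_num = args.rep
run_count = args.run_count
datasets = args.datasets.split(',')
start_run = args.start_run
# Candidate algorithms for the hyperspace/no-free-lunch experiments (assumed).
algo_list = ['xgboost', 'random_forest', 'decision_tree', 'gradient_boosting',
             'adaboost', 'extra_trees', 'k_nearest_neighbors',
             'logistic_regression', 'liblinear_svc', 'libsvm_svc',
             'lda', 'qda']

if __name__ == "__main__":
    test_estimator()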
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1]) df.drop(columns="Ticket", axis=1, inplace=True) for i in range(df.shape[0]): if df["Cabin"][i] == "C23 C25 C27": df["Cabin"][i] = 0 else: df["Cabin"][i] = 1 df["Cabin"] = df["Cabin"].astype("float") df = pd.get_dummies(df) x = df.values x_train = x[:train_size] x_test = x[train_size:] dm = DataManager() dm.train_X = x_train dm.train_y = y_train clf = Classifier(optimizer="smbo") clf.fit(dm, metric="accuracy", runcount=200) submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv") submission["Survived"] = clf.predict(x_test) submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)