def evaluate_evaluation_based_fe(dataset, time_limit, seed=1):
    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    evaluator = Evaluator(cs.get_default_configuration(), name='fe', seed=seed)

    raw_data = load_data(dataset, datanode_returned=True)
    pipeline = FEPipeline(fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id='random_forest',
                          time_limit_per_trans=300)
    train_data = pipeline.fit_transform(raw_data)
    score = evaluator(None, data_node=train_data)
    print('==> Base validation score', score)

    save_path = proj_dir + 'data/fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, score], f)
    return score
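
# Companion sketch: read back the [dataset, score] pair that
# evaluate_evaluation_based_fe pickles above. Assumes the same module-level
# proj_dir; the helper name itself is illustrative, not part of the toolkit.
def load_fe_result(dataset, time_limit):
    save_path = proj_dir + 'data/fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'rb') as f:
        dataset_name, score = pickle.load(f)
    return dataset_name, score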
def evaluate_fe_pipeline():
    from automlToolkit.utils.data_manager import DataManager
    dm = DataManager()
    # file_path = "data/proprocess_data.csv"
    file_path = 'data/a9a/dataset_183_adult.csv'
    dm.load_train_csv(file_path)

    pipeline = FEPipeline(fe_enabled=True).fit(dm)
    train_data = pipeline.transform(dm)
    print(train_data)
    print(train_data.data)
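
# Inspection sketch: a small helper to summarize a transformed DataNode.
# It relies only on the DataNode attributes already used in this file
# (.data and .feature_types); anything beyond those would be an assumption.
def describe_datanode(node):
    X, y = node.data
    print('X shape:', getattr(X, 'shape', None))
    print('y shape:', getattr(y, 'shape', None))
    print('feature types:', node.feature_types)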
def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    algorithms = [
        'lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd',
        'adaboost', 'random_forest', 'extra_trees', 'decision_tree'
    ]
    algo_id = np.random.choice(algorithms, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Prepare the default configuration for the sampled classifier.
    clf_class = _classifiers[algo_id]
    cs = clf_class.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", algo_id)
    cs.add_hyperparameter(clf_hp)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe', seed=seed,
                                        resampling_strategy='holdout')

    # per_run_time_limit is assumed to be defined at module level.
    pipeline = FEPipeline(fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,
                          task_id=task_id)

    raw_data, test_raw_data = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_data.copy_())
    test_data = pipeline.transform(test_raw_data.copy_())
    train_data_new = pipeline.transform(raw_data.copy_())

    # Transforming the same data twice must yield identical results.
    assert (train_data.data[0] == train_data_new.data[0]).all()
    assert (train_data.data[1] == train_data_new.data[1]).all()
    assert train_data_new == train_data

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
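
# Driver sketch (hypothetical budget): repeat the transform-consistency
# check several times so that different randomly sampled classifiers are
# exercised. The helper name and default values are illustrative only.
def run_fe_bug_checks(dataset, n_runs=5, time_limit=600):
    for run_id in range(n_runs):
        evaluate_fe_bugs(dataset, run_id, time_limit, seed=run_id + 1)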
def load_data(dataset, data_dir='./', datanode_returned=False):
    dm = DataManager()
    data_path = data_dir + 'data/datasets/%s.csv' % dataset
    if dataset in ['credit_default']:
        data_path = data_dir + 'data/datasets/%s.xls' % dataset

    # Load train data.
    if dataset in ['higgs', 'amazon_employee', 'spectf', 'usps']:
        label_column = 0
    else:
        label_column = -1

    if dataset in ['spambase', 'messidor_features']:
        header = None
    else:
        header = 'infer'

    if dataset in ['winequality_white', 'winequality_red']:
        sep = ';'
    else:
        sep = ','

    train_data_node = dm.load_train_csv(data_path, label_col=label_column,
                                        header=header, sep=sep)

    pipeline = FEPipeline(fe_enabled=False, metric='acc')
    train_data = pipeline.fit_transform(train_data_node)
    if datanode_returned:
        return train_data
    else:
        X, y = train_data.data
        feature_types = train_data.feature_types
        return X, y, feature_types
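
# Baseline sketch: load a dataset with the helper above and cross-validate a
# plain scikit-learn random forest as a sanity check. 'spambase' is just an
# example name; any dataset that load_data handles works.
def quick_rf_baseline(dataset='spambase'):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score
    X, y, _ = load_data(dataset)
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print('5-fold accuracy: %.4f +/- %.4f' % (scores.mean(), scores.std()))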
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

dm = DataManager()
train_node = dm.load_train_csv("train_dataset.csv", label_col=-1,
                               header='infer', na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv", header='infer', has_label=True)

from automlToolkit.components.utils.constants import REGRESSION
pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

save_dir = './data/eval_exps/automl-toolkit'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)
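
# Completion sketch (assumption: Regressor follows the fit/predict pattern
# used by the pipelines above; verify against the toolkit's actual API).
# Trains on the engineered train node and scores held-out predictions.
rgs.fit(train_data)
pred = rgs.predict(test_data)
from sklearn.metrics import mean_squared_error
print('==> Test MSE', mean_squared_error(test_data.data[1], pred))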
def evaluation_based_feature_engineering(time_limit, seed=1):
    # task_id, regressor_id, train_size, and data_dir are assumed to be
    # module-level globals.
    if task_id == 3 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.556390018826356,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.027650212980431577,
                  'min_child_weight': 4,
                  'n_estimators': 1000,  # 2493,
                  'num_leaves': 818,
                  'reg_alpha': 0.00012695064964599962,
                  'reg_lambda': 0.0006320421481400761,
                  'subsample': 0.5611631795995178}
    elif task_id == 1 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.5836692544286752,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.025011125056624308,
                  'min_child_weight': 3,
                  'n_estimators': 1000,  # 2000,
                  'num_leaves': 958,
                  'reg_alpha': 0.00025307513851761005,
                  'reg_lambda': 0.01911305077512719,
                  'subsample': 0.7850946965061745}
    elif task_id == 3 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.07215105304885769,
                  'n_estimators': 10000,
                  'min_child_samples': 7,
                  'max_depth': 8,
                  'reg_lambda': 4.084654778260157e-06,
                  'subsample': 0.9998568450178255}
    elif task_id == 1 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.030167431274216235,
                  'n_estimators': 10000,
                  'min_child_samples': 2,
                  'max_depth': 11,
                  'reg_lambda': 0.00010924008880152775,
                  'subsample': 0.9996005646983249}
    else:
        raise ValueError("Hyperparameters not available!")

    config.pop('estimator', None)
    if regressor_id == 'lightgbm':
        estimator = LightGBMRegressor(**config)
    elif 'catboost' in regressor_id:
        estimator = CatBoostRegressor(**config)

    scorer = make_scorer(smape, greater_is_better=False)
    evaluator = RegressionEvaluator(None, scorer, name='fe', seed=seed,
                                    estimator=estimator)

    # Subsample the training set to a train_size fraction.
    train_data, test_data = fetch_data(task_id)
    X, y = train_data.data
    idxs = np.arange(X.shape[0])
    np.random.shuffle(idxs)
    sample_size = int(X.shape[0] * train_size)
    subset_ids = idxs[:sample_size]
    X, y = X.iloc[subset_ids, :], y[subset_ids]
    train_data.data = [X, y]
    print(train_data)

    """
    nystronem_sampler: 15 bad
    kitchen_sinks: 13 bad
    random_trees_embedding: 18 bad
    feature_agglomeration_decomposer: 11 timeout.
    """
    # TODO: fast_ica, kernel_pca, and polynomial_features.
    # trans_used = [0, 3, 4, 5, 12, 16, 19, 30, 31, 32]
    # trans_used = [0, 3, 4, 5, 10, 11, 12, 16, 17, 19]
    # trans_used = [17, 30, 31]
    # trans_used = [30]
    pipeline = FEPipeline(task_type='regression', task_id='anti_plague',
                          fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id='lightgbm',
                          time_limit_per_trans=900,
                          trans_set=None)

    transformed_train_data = pipeline.fit_transform(train_data)
    print(pipeline.optimizer.get_incumbent_path())
    print('final train data shape & score',
          transformed_train_data.shape, transformed_train_data.score)
    transformed_test_datanode = pipeline.transform(test_data)
    print('final test data shape', transformed_test_datanode.shape)

    # Save results. Note: np.save writes .npy binaries and appends the
    # '.npy' suffix to these names despite the '.csv' extension.
    np.save(data_dir + 'data/transformed_train_x-%d.csv' % task_id,
            transformed_train_data.data[0])
    np.save(data_dir + 'data/transformed_train_y-%d.csv' % task_id,
            transformed_train_data.data[1])
    np.save(data_dir + 'data/transformed_test-%d.csv' % task_id,
            transformed_test_datanode.data[0])
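
# Reload sketch: since np.save appends '.npy' to names lacking that suffix,
# the arrays saved above are read back as below. allow_pickle=True may be
# needed if the saved arrays hold object dtypes; helper name is illustrative.
def load_transformed_arrays(task_id):
    train_x = np.load(data_dir + 'data/transformed_train_x-%d.csv.npy' % task_id)
    train_y = np.load(data_dir + 'data/transformed_train_y-%d.csv.npy' % task_id)
    test_x = np.load(data_dir + 'data/transformed_test-%d.csv.npy' % task_id)
    return train_x, train_y, test_x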