def load_data(dataset, data_dir='./', datanode_returned=False, preprocess=True, task_type=None):
    """Load a named dataset CSV, optionally preprocess it, and return its data.

    Parameters
    ----------
    dataset : str
        Dataset name; selects the CSV file and per-dataset parsing quirks.
    data_dir : str
        Root directory prefix (concatenated directly, so it should end with '/').
    datanode_returned : bool
        If True, return the (possibly preprocessed) data node itself.
    preprocess : bool
        If True, run the basic (non-FE) preprocessing pipeline on the node.
    task_type : optional
        None, a member of CLS_TASKS, or a member of REG_TASKS; anything else raises.

    Returns
    -------
    The data node when ``datanode_returned`` is True, otherwise ``(X, y, feature_types)``.
    """
    dm = DataManager()

    # Resolve the CSV path from the task type.
    if task_type is None:
        data_path = data_dir + 'data/datasets/%s.csv' % dataset
    elif task_type in CLS_TASKS:
        data_path = data_dir + 'data/cls_datasets/%s.csv' % dataset
    elif task_type in REG_TASKS:
        data_path = data_dir + 'data/rgs_datasets/%s.csv' % dataset
    else:
        raise ValueError("Unknown task type %s" % str(task_type))

    # Per-dataset parsing quirks: label position, header presence, separator.
    label_first = {'higgs', 'amazon_employee', 'spectf', 'usps',
                   'vehicle_sensIT', 'codrna'}
    if dataset in label_first:
        label_column = 0
    elif dataset == 'rmftsa_sleepdata(1)':
        label_column = 1
    else:
        label_column = -1
    header = None if dataset in ('spambase', 'messidor_features') else 'infer'
    sep = ';' if dataset in ('winequality_white', 'winequality_red') else ','

    node = dm.load_train_csv(data_path,
                             label_col=label_column,
                             header=header,
                             sep=sep,
                             na_values=["n/a", "na", "--", "-", "?"])

    if preprocess:
        # Basic preprocessing only (feature engineering disabled).
        node = FEPipeline(fe_enabled=False, metric='acc',
                          task_type=task_type).fit_transform(node)

    if datanode_returned:
        return node
    X, y = node.data
    return X, y, node.feature_types
def evaluate_fe_pipeline():
    """Smoke-test the feature-engineering pipeline on the adult dataset CSV."""
    from solnml.utils.data_manager import DataManager

    manager = DataManager()
    csv_path = 'data/a9a/dataset_183_adult.csv'
    manager.load_train_csv(csv_path)

    fitted = FEPipeline(fe_enabled=True).fit(manager)
    transformed = fitted.transform(manager)
    print(transformed)
    print(transformed.data)
def load_tabular_data(self, data_path):
    """Read a CSV into a train data node and run basic preprocessing on it.

    Stores the DataManager and the fitted preprocessing pipeline on ``self``
    so that later calls (e.g. test-data loading) can reuse them.
    """
    self.data_manager = DataManager()
    node = self.data_manager.load_train_csv(
        data_path,
        label_col=self.label_column,
        header=self.header,
        sep=self.sep,
        na_values=list(self.nan_values))
    # Preprocess only (feature engineering disabled); pipeline kept for reuse.
    self._process_pipeline = FEPipeline(
        fe_enabled=False,
        metric='acc',
        task_type=REGRESSION if self.is_regression else CLASSIFICATION)
    return self._process_pipeline.fit_transform(node)
class TabularDataset(BaseDataset):
    """Dataset wrapper around a tabular CSV file with basic preprocessing."""

    def __init__(self, data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        super().__init__()
        # Task flavour and validation-split configuration.
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size
        # CSV location and parsing settings.
        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values
        # Populated lazily by load_tabular_data().
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        """Load a train CSV into a data node and fit the preprocessing pipeline."""
        self.data_manager = DataManager()
        node = self.data_manager.load_train_csv(
            data_path,
            label_col=self.label_column,
            header=self.header,
            sep=self.sep,
            na_values=list(self.nan_values))
        self._process_pipeline = FEPipeline(
            fe_enabled=False,
            metric='acc',
            task_type=REGRESSION if self.is_regression else CLASSIFICATION)
        return self._process_pipeline.fit_transform(node)

    def load_data(self):
        """Load and preprocess the training dataset."""
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        """Load the test CSV and apply the already-fitted preprocessing pipeline.

        NOTE(review): relies on ``self.test_data_path`` which is not set in
        ``__init__`` — presumably provided by BaseDataset or set externally;
        confirm before calling. Must be called after ``load_data()``.
        """
        node = self.data_manager.load_test_csv(self.test_data_path,
                                               has_label=False,
                                               keep_default_na=True,
                                               header=self.header,
                                               sep=self.sep)
        self.test_dataset = self._process_pipeline.transform(node)
def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    """Run eval-based feature engineering with a randomly sampled classifier
    and sanity-check that transform() reproduces fit_transform() on train data.
    """
    candidate_algos = [
        'lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd', 'adaboost',
        'random_forest', 'extra_trees', 'decision_tree'
    ]
    algo_id = np.random.choice(candidate_algos, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Build the default configuration for the sampled classifier.
    config_space = _classifiers[algo_id].get_hyperparameter_search_space()
    config_space.add_hyperparameter(
        UnParametrizedHyperparameter("estimator", algo_id))
    evaluator = ClassificationEvaluator(config_space.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    # NOTE(review): `per_run_time_limit` is a module-level global — confirm
    # it is defined before this function runs.
    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,
                          task_id=task_id)

    raw_train, raw_test = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_train.copy_())
    test_data = pipeline.transform(raw_test.copy_())
    retransformed = pipeline.transform(raw_train.copy_())

    # transform() on the training data must reproduce fit_transform()'s output.
    assert (train_data.data[0] == retransformed.data[0]).all()
    assert (train_data.data[1] == retransformed.data[1]).all()
    assert (retransformed == train_data)

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
def evaluation_based_feature_engineering(time_limit, seed=1):
    """Run evaluation-based feature engineering for a regression task and save
    the transformed train/test arrays to disk.

    NOTE(review): reads module-level globals `task_id`, `regressor_id`,
    `train_size` and `data_dir` — confirm they are defined before calling.

    :param time_limit: total time budget (seconds) for the FE optimizer.
    :param seed: random seed for the evaluator and pipeline.
    :raises ValueError: if no hand-tuned config exists for the
        (task_id, regressor_id) combination.
    """
    # Hand-tuned hyperparameters per (task_id, regressor_id) pair.
    if task_id == 3 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.556390018826356,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.027650212980431577,
                  'min_child_weight': 4,
                  'n_estimators': 1000,  # 2493,
                  'num_leaves': 818,
                  'reg_alpha': 0.00012695064964599962,
                  'reg_lambda': 0.0006320421481400761,
                  'subsample': 0.5611631795995178}
    elif task_id == 1 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.5836692544286752,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.025011125056624308,
                  'min_child_weight': 3,
                  'n_estimators': 1000,  # 2000,
                  'num_leaves': 958,
                  'reg_alpha': 0.00025307513851761005,
                  'reg_lambda': 0.01911305077512719,
                  'subsample': 0.7850946965061745
                  }
    elif task_id == 3 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.07215105304885769,
                  'n_estimators': 10000,
                  'min_child_samples': 7,
                  'max_depth': 8,
                  'reg_lambda': 4.084654778260157e-06,
                  'subsample': 0.9998568450178255
                  }
    elif task_id == 1 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.030167431274216235,
                  'n_estimators': 10000,
                  'min_child_samples': 2,
                  'max_depth': 11,
                  'reg_lambda': 0.00010924008880152775,
                  'subsample': 0.9996005646983249
                  }
    else:
        raise ValueError("Hyperparameters not available!")

    # 'estimator' is a config-space tag, not a constructor kwarg — drop it.
    config.pop('estimator', None)
    if regressor_id == 'lightgbm':
        estimator = LightGBMRegressor(**config)
    elif 'catboost' in regressor_id:
        estimator = CatBoostRegressor(**config)

    # SMAPE is a loss, hence greater_is_better=False.
    scorer = make_scorer(smape, greater_is_better=False)
    evaluator = RegressionEvaluator(None, scorer, name='fe', seed=seed,
                                    estimator=estimator)

    train_data, test_data = fetch_data(task_id)
    # Subsample the training set to `train_size` fraction (random, unseeded
    # here — NOTE(review): reproducibility depends on global numpy RNG state).
    X, y = train_data.data
    idxs = np.arange(X.shape[0])
    np.random.shuffle(idxs)
    sample_size = int(X.shape[0] * train_size)
    subset_ids = idxs[:sample_size]
    X, y = X.iloc[subset_ids, :], y[subset_ids]
    train_data.data = [X, y]
    print(train_data)

    """
    nystronem_sampler: 15 bad
    kitchen_sinks: 13 bad
    random_trees_embedding: 18 bad
    feature_agglomeration_decomposer: 11 timeout.
    """
    # TODO: fast_ica, kernel_pca, and polynomial_features.
    # trans_used = [0, 3, 4, 5, 12, 16, 19, 30, 31, 32]
    # trans_used = [0, 3, 4, 5, 10, 11, 12, 16, 17, 19]
    # trans_used = [17, 30, 31]
    # trans_used = [30]
    pipeline = FEPipeline(task_type='regression',
                          task_id='anti_plague',
                          fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator, seed=seed,
                          model_id='lightgbm',
                          time_limit_per_trans=900,
                          trans_set=None
                          )
    transformed_train_data = pipeline.fit_transform(train_data)
    print(pipeline.optimizer.get_incumbent_path())
    print('final train data shape & score', transformed_train_data.shape,
          transformed_train_data.score)
    transformed_test_datanode = pipeline.transform(test_data)
    print('final test data shape', transformed_test_datanode.shape)

    # Save results.
    # NOTE(review): np.save writes .npy format and appends a '.npy' suffix,
    # so the '.csv' in these names is misleading — confirm downstream readers.
    np.save(data_dir + 'data/transformed_train_x-%d.csv' % task_id,
            transformed_train_data.data[0])
    np.save(data_dir + 'data/transformed_train_y-%d.csv' % task_id,
            transformed_train_data.data[1])
    np.save(data_dir + 'data/transformed_test-%d.csv' % task_id,
            transformed_test_datanode.data[0])
# --- Script fragment: build train/test data nodes and a Regressor. ---
# NOTE(review): `ensemble_method`, `time_limit`, `eval_type` and `n_jobs`
# are assumed to be defined earlier in the file (e.g. parsed CLI args) —
# confirm against the full script.
if ensemble_method == 'none':
    # The CLI sentinel 'none' means "no ensembling" for the Regressor.
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

dm = DataManager()
train_node = dm.load_train_csv("train_dataset.csv", label_col=-1,
                               header='infer', na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv", header='infer',
                             has_label=True)

from solnml.components.utils.constants import REGRESSION

# Basic preprocessing only (feature engineering disabled); the pipeline
# fitted on train data is reused to transform the test data.
pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

# Ensure the output directory exists before the Regressor writes to it.
save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)