Example #1
# Imports assumed from the solnml package layout:
from solnml.utils.data_manager import DataManager
from solnml.components.utils.constants import CLS_TASKS, REG_TASKS
from solnml.components.feature_engineering.fe_pipeline import FEPipeline


def load_data(dataset,
              data_dir='./',
              datanode_returned=False,
              preprocess=True,
              task_type=None):
    dm = DataManager()
    if task_type is None:
        data_path = data_dir + 'data/datasets/%s.csv' % dataset
    elif task_type in CLS_TASKS:
        data_path = data_dir + 'data/cls_datasets/%s.csv' % dataset
    elif task_type in REG_TASKS:
        data_path = data_dir + 'data/rgs_datasets/%s.csv' % dataset
    else:
        raise ValueError("Unknown task type %s" % str(task_type))

    # if dataset in ['credit_default']:
    #     data_path = data_dir + 'data/datasets/%s.xls' % dataset

    # Load train data. A few datasets keep the label in the first or second
    # column; the default is the last column.
    if dataset in [
            'higgs', 'amazon_employee', 'spectf', 'usps', 'vehicle_sensIT',
            'codrna'
    ]:
        label_column = 0
    elif dataset in ['rmftsa_sleepdata(1)']:
        label_column = 1
    else:
        label_column = -1

    # These datasets ship without a header row.
    if dataset in ['spambase', 'messidor_features']:
        header = None
    else:
        header = 'infer'

    # The wine-quality CSVs are semicolon-separated.
    if dataset in ['winequality_white', 'winequality_red']:
        sep = ';'
    else:
        sep = ','

    train_data_node = dm.load_train_csv(
        data_path,
        label_col=label_column,
        header=header,
        sep=sep,
        na_values=["n/a", "na", "--", "-", "?"])

    if preprocess:
        # Basic preprocessing only: feature engineering itself is disabled.
        pipeline = FEPipeline(fe_enabled=False,
                              metric='acc',
                              task_type=task_type)
        train_data = pipeline.fit_transform(train_data_node)
    else:
        train_data = train_data_node

    if datanode_returned:
        return train_data
    else:
        X, y = train_data.data
        feature_types = train_data.feature_types
        return X, y, feature_types
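
A minimal usage sketch for the helper above; 'spambase' is one of the datasets it special-cases, and the return values follow the datanode_returned=False branch:

X, y, feature_types = load_data('spambase', data_dir='./', task_type=None)
print(X.shape, len(y), feature_types[:5])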
Example #2
def evaluate_fe_pipeline():
    from solnml.utils.data_manager import DataManager
    # FEPipeline import path assumed from the solnml package layout:
    from solnml.components.feature_engineering.fe_pipeline import FEPipeline
    dm = DataManager()
    # file_path = "data/proprocess_data.csv"
    file_path = 'data/a9a/dataset_183_adult.csv'
    dm.load_train_csv(file_path)

    pipeline = FEPipeline(fe_enabled=True).fit(dm)
    train_data = pipeline.transform(dm)
    print(train_data)
    print(train_data.data)
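
Since FEPipeline also exposes fit_transform (used in Example #1), the two-step fit/transform above can be collapsed into one call; a minimal sketch, assuming the same dm:

train_data = FEPipeline(fe_enabled=True).fit_transform(dm)
print(train_data.data)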
Example #3
    # Method fragment; Example #4 below shows the full class it belongs to.
    def load_tabular_data(self, data_path):
        self.data_manager = DataManager()
        train_data_node = self.data_manager.load_train_csv(
            data_path,
            label_col=self.label_column,
            header=self.header,
            sep=self.sep,
            na_values=list(self.nan_values))

        task_type = REGRESSION if self.is_regression else CLASSIFICATION
        self._process_pipeline = FEPipeline(fe_enabled=False,
                                            metric='acc',
                                            task_type=task_type)
        return self._process_pipeline.fit_transform(train_data_node)
Example #4
# Imports assumed from the solnml package layout:
from solnml.utils.data_manager import DataManager
from solnml.components.utils.constants import CLASSIFICATION, REGRESSION
from solnml.components.feature_engineering.fe_pipeline import FEPipeline
from solnml.datasets.base_dataset import BaseDataset


class TabularDataset(BaseDataset):
    def __init__(self,
                 data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        super().__init__()
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size
        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        self.data_manager = DataManager()
        train_data_node = self.data_manager.load_train_csv(
            data_path,
            label_col=self.label_column,
            header=self.header,
            sep=self.sep,
            na_values=list(self.nan_values))

        task_type = REGRESSION if self.is_regression else CLASSIFICATION
        self._process_pipeline = FEPipeline(fe_enabled=False,
                                            metric='acc',
                                            task_type=task_type)
        return self._process_pipeline.fit_transform(train_data_node)

    def load_data(self):
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        # `self.test_data_path` is assumed to be set elsewhere (e.g. by BaseDataset).
        test_data_node = self.data_manager.load_test_csv(self.test_data_path,
                                                         has_label=False,
                                                         keep_default_na=True,
                                                         header=self.header,
                                                         sep=self.sep)
        self.test_dataset = self._process_pipeline.transform(test_data_node)
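
A hypothetical end-to-end use of the class above; the CSV path is a placeholder:

dataset = TabularDataset('data/train.csv', is_regression=True)
dataset.load_data()  # fits the FE pipeline on the training CSV
print(dataset.train_dataset)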
Example #5
import numpy as np
from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
# solnml import paths assumed from the package layout:
from solnml.components.models.classification import _classifiers
from solnml.components.evaluators.cls_evaluator import ClassificationEvaluator
from solnml.components.feature_engineering.fe_pipeline import FEPipeline
from solnml.datasets.utils import load_train_test_data


def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    algorithms = [
        'lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd', 'adaboost',
        'random_forest', 'extra_trees', 'decision_tree'
    ]
    algo_id = np.random.choice(algorithms, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Prepare the default configuration for the sampled algorithm.
    clf_class = _classifiers[algo_id]
    cs = clf_class.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", algo_id)
    cs.add_hyperparameter(clf_hp)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,  # module-level constant defined elsewhere
                          task_id=task_id)

    raw_data, test_raw_data = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_data.copy_())
    test_data = pipeline.transform(test_raw_data.copy_())
    train_data_new = pipeline.transform(raw_data.copy_())

    assert (train_data.data[0] == train_data_new.data[0]).all()
    assert (train_data.data[1] == train_data_new.data[1]).all()
    assert (train_data_new == train_data)

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
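
A hypothetical invocation of the check above; the dataset name, budget, and seed are placeholders:

evaluate_fe_bugs('spambase', run_id=0, time_limit=600, seed=1)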
Example #6
import numpy as np
from sklearn.metrics import make_scorer


def evaluation_based_feature_engineering(time_limit, seed=1):
    # `task_id`, `regressor_id`, `train_size`, `data_dir`, `fetch_data`, `smape`,
    # `LightGBMRegressor`, `CatBoostRegressor` and `RegressionEvaluator` are
    # module-level names defined elsewhere in the original script.
    if task_id == 3 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.556390018826356,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.027650212980431577,
                  'min_child_weight': 4,
                  'n_estimators': 1000,  # 2493,
                  'num_leaves': 818,
                  'reg_alpha': 0.00012695064964599962,
                  'reg_lambda': 0.0006320421481400761,
                  'subsample': 0.5611631795995178}
    elif task_id == 1 and regressor_id == 'lightgbm':
        config = {'colsample_bytree': 0.5836692544286752,
                  'estimator': 'lightgbm',
                  'learning_rate': 0.025011125056624308,
                  'min_child_weight': 3,
                  'n_estimators': 1000,  # 2000,
                  'num_leaves': 958,
                  'reg_alpha': 0.00025307513851761005,
                  'reg_lambda': 0.01911305077512719,
                  'subsample': 0.7850946965061745
                  }
    elif task_id == 3 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.07215105304885769,
                  'n_estimators': 10000,
                  'min_child_samples': 7,
                  'max_depth': 8,
                  'reg_lambda': 4.084654778260157e-06,
                  'subsample': 0.9998568450178255
                  }
    elif task_id == 1 and regressor_id == 'catboost_gpu':
        config = {'loss_function': 'RMSE',
                  'task_type': 'GPU',
                  'bootstrap_type': 'Poisson',
                  'learning_rate': 0.030167431274216235,
                  'n_estimators': 10000,
                  'min_child_samples': 2,
                  'max_depth': 11,
                  'reg_lambda': 0.00010924008880152775,
                  'subsample': 0.9996005646983249
                  }
    else:
        raise ValueError("Hyperparameters not available!")

    config.pop('estimator', None)
    if regressor_id == 'lightgbm':
        estimator = LightGBMRegressor(**config)
    elif 'catboost' in regressor_id:
        estimator = CatBoostRegressor(**config)
    scorer = make_scorer(smape, greater_is_better=False)
    evaluator = RegressionEvaluator(None, scorer, name='fe', seed=seed, estimator=estimator)
    train_data, test_data = fetch_data(task_id)

    X, y = train_data.data
    idxs = np.arange(X.shape[0])
    np.random.shuffle(idxs)
    sample_size = int(X.shape[0] * train_size)
    subset_ids = idxs[:sample_size]
    X, y = X.iloc[subset_ids, :], y[subset_ids]
    train_data.data = [X, y]
    print(train_data)
    """
    nystronem_sampler: 15 bad
    kitchen_sinks: 13 bad
    random_trees_embedding: 18 bad
    feature_agglomeration_decomposer: 11 timeout.
    """
    # TODO: fast_ica, kernel_pca, and polynomial_features.
    # trans_used = [0, 3, 4, 5, 12, 16, 19, 30, 31, 32]
    # trans_used = [0, 3, 4, 5, 10, 11, 12, 16, 17, 19]
    # trans_used = [17, 30, 31]
    # trans_used = [30]
    pipeline = FEPipeline(task_type='regression', task_id='anti_plague',
                          fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id='lightgbm',
                          time_limit_per_trans=900,
                          trans_set=None
                          )
    transformed_train_data = pipeline.fit_transform(train_data)
    print(pipeline.optimizer.get_incumbent_path())
    print('final train data shape & score', transformed_train_data.shape, transformed_train_data.score)
    transformed_test_datanode = pipeline.transform(test_data)
    print('final test data shape', transformed_test_datanode.shape)

    # Save results. np.save writes binary .npy files (appending the .npy
    # extension), despite the .csv suffixes below.
    np.save(data_dir + 'data/transformed_train_x-%d.csv' % task_id, transformed_train_data.data[0])
    np.save(data_dir + 'data/transformed_train_y-%d.csv' % task_id, transformed_train_data.data[1])
    np.save(data_dir + 'data/transformed_test-%d.csv' % task_id, transformed_test_datanode.data[0])
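
A hypothetical invocation, assuming the module-level names noted at the top of the function are defined:

evaluation_based_feature_engineering(time_limit=3600, seed=1)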
Example #7
import os

from solnml.utils.data_manager import DataManager
from solnml.components.utils.constants import REGRESSION
# FEPipeline and Regressor import paths assumed from the solnml package layout:
from solnml.components.feature_engineering.fe_pipeline import FEPipeline
from solnml.estimators import Regressor

# `ensemble_method`, `eval_type`, `time_limit` and `n_jobs` are parsed from the
# command line in the original script.
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

dm = DataManager()
train_node = dm.load_train_csv("train_dataset.csv",
                               label_col=-1,
                               header='infer',
                               na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv",
                             header='infer',
                             has_label=True)

pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)
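
A minimal sketch of how this script would continue with the usual solnml estimator API (fit on the train node, then predict on the test node):

rgs.fit(train_data)
predictions = rgs.predict(test_data)
print(predictions[:10])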