def evaluate_fe_pipeline():
    """Fit a feature-engineering pipeline on the adult dataset and print the result.

    Loads the training CSV through a ``DataManager``, fits an ``FEPipeline``
    with feature engineering enabled on that manager, transforms the manager
    with the fitted pipeline, and prints both the transformed object and its
    ``data`` attribute.
    """
    from mindware.utils.data_manager import DataManager

    csv_path = 'data/a9a/dataset_183_adult.csv'

    manager = DataManager()
    manager.load_train_csv(csv_path)

    fe_pipeline = FEPipeline(fe_enabled=True)
    fitted_pipeline = fe_pipeline.fit(manager)
    transformed = fitted_pipeline.transform(manager)

    print(transformed)
    print(transformed.data)
def load_data(dataset, data_dir='./', datanode_returned=False, preprocess=True, task_type=None):
    """Load a named CSV dataset and optionally run the preprocessing pipeline.

    Args:
        dataset: Dataset name; the file ``<dataset>.csv`` is read.
        data_dir: Root directory containing the ``data/`` folder. It may be
            given with or without a trailing path separator.
        datanode_returned: If true, return the data node object itself
            instead of unpacking it.
        preprocess: If true, pass the loaded node through
            ``FEPipeline(fe_enabled=False, ...).fit_transform``.
        task_type: ``None`` selects ``data/datasets``; a member of
            ``CLS_TASKS`` selects ``data/cls_datasets``; a member of
            ``RGS_TASKS`` selects ``data/rgs_datasets``.

    Returns:
        The data node when ``datanode_returned`` is true, otherwise the tuple
        ``(X, y, feature_types)`` taken from the node's ``data`` and
        ``feature_types`` attributes.

    Raises:
        ValueError: If ``task_type`` is neither ``None`` nor a known
            classification/regression task.
    """
    # Function-scope import keeps this block self-contained regardless of
    # what the enclosing file imports at the top.
    import os

    dm = DataManager()

    if task_type is None:
        sub_dir = 'datasets'
    elif task_type in CLS_TASKS:
        sub_dir = 'cls_datasets'
    elif task_type in RGS_TASKS:
        sub_dir = 'rgs_datasets'
    else:
        raise ValueError("Unknown task type %s" % str(task_type))

    # os.path.join inserts a separator only when data_dir lacks one, so
    # './' and '' yield the same path as plain concatenation did, while
    # '/some/root' no longer collapses into '/some/rootdata/...'.
    data_path = os.path.join(data_dir, 'data/%s/%s.csv' % (sub_dir, dataset))

    # Per-dataset CSV quirks: position of the label column, presence of a
    # header row, and the field separator.
    if dataset in ('higgs', 'amazon_employee', 'spectf', 'usps',
                   'vehicle_sensIT', 'codrna'):
        label_column = 0
    elif dataset in ('rmftsa_sleepdata(1)',):
        label_column = 1
    else:
        label_column = -1

    header = None if dataset in ('spambase', 'messidor_features') else 'infer'
    sep = ';' if dataset in ('winequality_white', 'winequality_red') else ','

    # Load train data.
    train_data_node = dm.load_train_csv(
        data_path,
        label_col=label_column,
        header=header,
        sep=sep,
        na_values=["n/a", "na", "--", "-", "?"],
    )

    if preprocess:
        # NOTE(review): metric='acc' is passed for every task type, including
        # regression ones - confirm the pipeline ignores it when
        # fe_enabled=False.
        pipeline = FEPipeline(fe_enabled=False, metric='acc', task_type=task_type)
        train_data = pipeline.fit_transform(train_data_node)
    else:
        train_data = train_data_node

    if datanode_returned:
        return train_data

    X, y = train_data.data
    return X, y, train_data.feature_types
class TabularDataset(BaseDataset):
    """Dataset wrapper that reads tabular CSV data and preprocesses it.

    Training data is loaded through a ``DataManager`` and passed through an
    ``FEPipeline`` created with ``fe_enabled=False``; the same fitted pipeline
    is later reused to transform the test data.
    """

    def __init__(self, data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        """Store the CSV parsing options and split settings; nothing is loaded yet.

        Args:
            data_path: Path of the training CSV file.
            is_regression: Selects ``REGRESSION`` instead of ``CLASSIFICATION``
                as the task type handed to the pipeline.
            label_column: Forwarded as ``label_col`` when loading the training CSV.
            header: Forwarded as ``header`` to the CSV loaders.
            sep: Forwarded as ``sep`` to the CSV loaders.
            nan_values: Strings forwarded (as a list) as ``na_values`` when
                loading the training CSV.
            train_val_split: Stored on the instance; not used inside this class.
            val_split_size: Stored on the instance; not used inside this class.
        """
        super().__init__()

        # Where and how to read the CSV.
        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values

        # Task and split configuration.
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size

        # Populated by load_tabular_data().
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        """Read ``data_path`` as training data and return the preprocessed node.

        Replaces ``self.data_manager`` and ``self._process_pipeline`` with
        fresh objects as a side effect.
        """
        manager = DataManager()
        self.data_manager = manager
        raw_node = manager.load_train_csv(
            data_path,
            label_col=self.label_column,
            header=self.header,
            sep=self.sep,
            na_values=list(self.nan_values),
        )

        if self.is_regression:
            task_type = REGRESSION
        else:
            task_type = CLASSIFICATION

        pipeline = FEPipeline(fe_enabled=False, metric='acc', task_type=task_type)
        self._process_pipeline = pipeline
        return pipeline.fit_transform(raw_node)

    def load_data(self):
        """Load and preprocess the training CSV into ``self.train_dataset``."""
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        """Load the test CSV and transform it with the fitted pipeline.

        NOTE(review): relies on ``load_data()`` having run first, because
        ``data_manager`` and ``_process_pipeline`` start out as ``None``.
        ``self.test_data_path`` is not assigned in this class - presumably
        supplied by ``BaseDataset`` or by the caller; confirm.
        """
        raw_test_node = self.data_manager.load_test_csv(
            self.test_data_path,
            has_label=False,
            keep_default_na=True,
            header=self.header,
            sep=self.sep,
        )
        self.test_dataset = self._process_pipeline.transform(raw_test_node)
choices=['none', 'bagging', 'blending', 'stacking', 'ensemble_selection']) parser.add_argument('--n_jobs', type=int, default=1) args = parser.parse_args() time_limit = args.time_limit eval_type = args.eval_type n_jobs = args.n_jobs ensemble_method = args.ens_method if ensemble_method == 'none': ensemble_method = None print('==> Start to evaluate with Budget %d' % time_limit) dm = DataManager() train_node = dm.load_train_csv("train_dataset.csv", label_col=-1, header='infer', na_values=['nan', '?']) test_node = dm.load_test_csv("test_dataset.csv", header='infer', has_label=True) from mindware.components.utils.constants import REGRESSION pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION) train_data = pipeline.fit_transform(train_node) test_data = pipeline.transform(test_node) save_dir = './data/eval_exps/soln-ml' if not os.path.exists(save_dir): os.makedirs(save_dir) rgs = Regressor(metric='mse', ensemble_method=ensemble_method, evaluation=eval_type, time_limit=time_limit,