import os
import time
import shutil

from sklearn.datasets import load_boston, load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# DataManager and the task constants are imported exactly as in the CSV examples
# below; the FEPipeline and estimator paths are assumed to follow the same
# mindware package layout.
from mindware.utils.data_manager import DataManager
from mindware.estimators import Classifier, Regressor
from mindware.components.feature_engineering.fe_pipeline import FEPipeline
from mindware.components.utils.constants import CLASSIFICATION, REGRESSION, CLS_TASKS, RGS_TASKS

# NOTE: load_boston is deprecated and was removed in scikit-learn 1.2; the
# regression examples below assume an older scikit-learn release.


def test_rgs():
    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)
    ensemble_method = 'stacking'
    eval_type = 'holdout'

    boston = load_boston()
    X, y = boston.data, boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    rgs = Regressor(metric='mse',
                    ensemble_method=ensemble_method,
                    enable_meta_algorithm_selection=False,
                    ensemble_size=4,
                    evaluation=eval_type,
                    time_limit=time_limit,
                    output_dir=save_dir)
    rgs.fit(train_data)
    print(rgs.summary())
    pred = rgs.predict(test_data)
    print(mean_squared_error(test_data.data[1], pred))

    shutil.rmtree(save_dir)
def main():
    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # UserDefinedDecisionTree and add_classifier are assumed to be defined or
    # imported elsewhere in this example; add_classifier registers the custom
    # model so it can be referenced by name in include_algorithms.
    add_classifier(UserDefinedDecisionTree)
    clf = Classifier(time_limit=time_limit,
                     output_dir=save_dir,
                     enable_meta_algorithm_selection=False,
                     include_algorithms=['UserDefinedDecisionTree'],
                     ensemble_method=None,
                     metric='acc')
    _start_time = time.time()
    clf.fit(train_data)
    print(clf.summary())
    print('Elapsed time: %.2f seconds' % (time.time() - _start_time))
    pred = clf.predict(test_data)
    print(accuracy_score(test_data.data[1], pred))

    shutil.rmtree(save_dir)
def test_cls():
    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)
    ensemble_method = 'stacking'
    eval_type = 'holdout'

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    clf = Classifier(time_limit=time_limit,
                     output_dir=save_dir,
                     ensemble_method=ensemble_method,
                     enable_meta_algorithm_selection=False,
                     ensemble_size=4,
                     evaluation=eval_type,
                     metric='acc')
    clf.fit(train_data)
    print(clf.summary())
    pred = clf.predict(test_data)
    print(accuracy_score(test_data.data[1], pred))

    shutil.rmtree(save_dir)
def evaluate():
    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    ensemble_method = 'stacking'
    eval_type = 'holdout'

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    try:
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)
        test_data = dm.get_data_node(X_test, y_test)

        clf = Classifier(dataset_name='iris',
                         time_limit=150,
                         output_dir=save_dir,
                         ensemble_method=ensemble_method,
                         evaluation=eval_type,
                         metric='acc')
        clf.fit(train_data)
        clf.refit()
        pred = clf.predict(test_data)
        print('final score', clf.score(test_data))
    except Exception:
        return False
    return True
def load_data(dataset, data_dir='./', datanode_returned=False, preprocess=True, task_type=None):
    dm = DataManager()
    if task_type is None:
        data_path = data_dir + 'data/datasets/%s.csv' % dataset
    elif task_type in CLS_TASKS:
        data_path = data_dir + 'data/cls_datasets/%s.csv' % dataset
    elif task_type in RGS_TASKS:
        data_path = data_dir + 'data/rgs_datasets/%s.csv' % dataset
    else:
        raise ValueError("Unknown task type %s" % str(task_type))

    # Load train data.
    if dataset in ['higgs', 'amazon_employee', 'spectf', 'usps', 'vehicle_sensIT', 'codrna']:
        label_column = 0
    elif dataset in ['rmftsa_sleepdata(1)']:
        label_column = 1
    else:
        label_column = -1

    if dataset in ['spambase', 'messidor_features']:
        header = None
    else:
        header = 'infer'

    if dataset in ['winequality_white', 'winequality_red']:
        sep = ';'
    else:
        sep = ','

    train_data_node = dm.load_train_csv(data_path, label_col=label_column, header=header,
                                        sep=sep, na_values=["n/a", "na", "--", "-", "?"])

    if preprocess:
        pipeline = FEPipeline(fe_enabled=False, metric='acc', task_type=task_type)
        train_data = pipeline.fit_transform(train_data_node)
    else:
        train_data = train_data_node

    if datanode_returned:
        return train_data
    else:
        X, y = train_data.data
        feature_types = train_data.feature_types
        return X, y, feature_types
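# Usage sketch for load_data (hedged: assumes a local copy of the corresponding
# CSV exists under data/datasets/; 'spambase' is one of the names special-cased
# above, so it is read without a header row):
#
#   X, y, feature_types = load_data('spambase', data_dir='./', preprocess=True)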
def evaluate_fe_pipeline():
    from mindware.utils.data_manager import DataManager
    dm = DataManager()
    # file_path = "data/proprocess_data.csv"
    file_path = 'data/a9a/dataset_183_adult.csv'
    train_data_node = dm.load_train_csv(file_path)

    # Fit the FE pipeline on the data node returned by load_train_csv, matching
    # the other feature-engineering examples in this file.
    pipeline = FEPipeline(fe_enabled=True).fit(train_data_node)
    train_data = pipeline.transform(train_data_node)
    print(train_data)
    print(train_data.data)
# BaseDataset is assumed to be imported from mindware's dataset utilities.
class TabularDataset(BaseDataset):
    def __init__(self, data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        super().__init__()
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size

        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        self.data_manager = DataManager()
        train_data_node = self.data_manager.load_train_csv(data_path,
                                                           label_col=self.label_column,
                                                           header=self.header,
                                                           sep=self.sep,
                                                           na_values=list(self.nan_values))

        task_type = REGRESSION if self.is_regression else CLASSIFICATION
        self._process_pipeline = FEPipeline(fe_enabled=False, metric='acc',
                                            task_type=task_type)
        return self._process_pipeline.fit_transform(train_data_node)

    def load_data(self):
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        test_data_node = self.data_manager.load_test_csv(self.test_data_path,
                                                         has_label=False,
                                                         keep_default_na=True,
                                                         header=self.header,
                                                         sep=self.sep)
        self.test_dataset = self._process_pipeline.transform(test_data_node)
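# Usage sketch for TabularDataset (hedged: the CSV paths are placeholders, and
# test_data_path is assumed to be provided by BaseDataset or set by the caller,
# since this subclass never assigns it):
#
#   dataset = TabularDataset('data/train.csv', is_regression=False)
#   dataset.load_data()                       # fits the FE pipeline on the train CSV
#   dataset.test_data_path = 'data/test.csv'
#   dataset.load_test_data()                  # reuses the fitted pipeline on the test CSV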
def main():
    tmp_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    time_limit = 60
    print('==> Start new AutoML task with budget - %d' % time_limit)
    ensemble_method = 'ensemble_selection'
    eval_type = 'holdout'

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    clf = Classifier(time_limit=time_limit,
                     output_dir=tmp_dir,
                     ensemble_method=ensemble_method,
                     enable_meta_algorithm_selection=False,
                     ensemble_size=10,
                     optimizer='random_search',
                     evaluation=eval_type,
                     metric='acc',
                     n_jobs=1)
    clf.fit(train_data, tree_id=2)
    print(clf.summary())
    pred = clf.predict(test_data)
    print(accuracy_score(test_data.data[1], pred))

    shutil.rmtree(tmp_dir)
def evaluate_data_manager():
    # from data_manager import DataManager
    # dm = DataManager()
    # train_df = dm.load_train_csv("data/proprocess_data.csv")
    # print(train_df)
    # print(dm.feature_types)
    # print(dm.missing_flags)
    from mindware.utils.data_manager import DataManager
    import numpy as np

    # Mixed numeric/string columns exercise DataManager's feature-type detection.
    X = np.array([[1, 2, 3, 4], [1, 'asfd', 2, 1.4]])
    y = [1, 2]
    dm = DataManager(X, y)
    print(dm.feature_types)
    print(dm.missing_flags)
time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

boston = load_boston()
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
dm = DataManager(X_train, y_train)
train_data = dm.get_data_node(X_train, y_train)
test_data = dm.get_data_node(X_test, y_test)

save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                dataset_name='boston',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)
parser.add_argument('--ens_method', default='ensemble_selection',
                    choices=['none', 'bagging', 'blending', 'stacking', 'ensemble_selection'])
parser.add_argument('--n_jobs', type=int, default=1)

args = parser.parse_args()
time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

dm = DataManager()
train_node = dm.load_train_csv("train_dataset.csv", label_col=-1, header='infer',
                               na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv", header='infer', has_label=True)

from mindware.components.utils.constants import REGRESSION
pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                n_jobs=n_jobs)
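# The rest of this CSV-based script would follow the same fit/predict pattern as
# the in-memory regression example above (a sketch; the mean_squared_error usage
# mirrors test_rgs):
#
#   rgs.fit(train_data)
#   pred = rgs.predict(test_data)
#   print(mean_squared_error(test_data.data[1], pred))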