def evaluate():
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    try:
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)
        test_data = dm.get_data_node(X_test, y_test)
        # save_dir, ensemble_method and eval_type are expected to be defined at module level.
        clf = Classifier(dataset_name='iris',
                         time_limit=150,
                         output_dir=save_dir,
                         ensemble_method=ensemble_method,
                         evaluation=eval_type,
                         metric='acc')
        clf.fit(train_data)
        clf.refit()
        pred = clf.predict(test_data)
        print('final score', clf.score(test_data))
    except Exception:
        return False
    return True
def main():
    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=1, stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    add_classifier(UserDefinedDecisionTree)
    clf = Classifier(time_limit=time_limit,
                     output_dir=save_dir,
                     enable_meta_algorithm_selection=False,
                     include_algorithms=['UserDefinedDecisionTree'],
                     ensemble_method=None,
                     metric='acc')
    _start_time = time.time()
    clf.fit(train_data)
    print(clf.summary())
    pred = clf.predict(test_data)
    print(accuracy_score(test_data.data[1], pred))

    shutil.rmtree(save_dir)
def fit(self, X, y):
    if isinstance(X, pd.DataFrame):
        self.headers = X.columns
    X = np.array(X)
    y = np.array(y)
    self.origin_X = X
    self.origin_y = y
    if np.sum(np.isnan(X)) == 0:
        self.impute = False
        self.impute_method = ['mean']
    self.preprocess(X, y)
    for key in self.impute_operator:
        X_train = self.train_data[key]
        self.dm = DataManager(X_train, y)
        train_data = self.dm.get_data_node(X_train, y)
        # Split the overall time budget evenly across the imputation variants.
        self.mdl[key] = soln_Regressor(time_limit=self.time_limit / len(self.impute_method),
                                       output_dir=self.output_dir,
                                       ensemble_method=self.ensemble_method,
                                       evaluation=self.evaluation,
                                       metric=self.metric,
                                       n_jobs=self.n_jobs)
        self.mdl[key].fit(train_data)
    return 0
def test_rgs():
    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)
    ensemble_method = 'bagging'
    eval_type = 'holdout'

    boston = load_boston()
    X, y = boston.data, boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    rgs = Regressor(metric='mse',
                    ensemble_method=ensemble_method,
                    enable_meta_algorithm_selection=False,
                    evaluation=eval_type,
                    time_limit=time_limit,
                    output_dir=save_dir)
    rgs.fit(train_data)
    print(rgs.summary())
    pred = rgs.predict(test_data)
    print(mean_squared_error(test_data.data[1], pred))

    shutil.rmtree(save_dir)
def main():
    ensemble_method = None
    time_limit = 120
    print('==> Start to evaluate with Budget %d' % time_limit)
    eval_type = 'holdout'

    boston = load_boston()
    X, y = boston.data, boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    rgs = Regressor(metric='mse',
                    ensemble_method=ensemble_method,
                    evaluation=eval_type,
                    time_limit=time_limit,
                    output_dir=save_dir)
    rgs.fit(train_data)
    pred = rgs.predict(test_data)
    print(mean_squared_error(test_data.data[1], pred))
def load_data(dataset, data_dir='./', datanode_returned=False, preprocess=True, task_type=None):
    dm = DataManager()
    if task_type is None:
        data_path = data_dir + 'data/datasets/%s.csv' % dataset
    elif task_type in CLS_TASKS:
        data_path = data_dir + 'data/cls_datasets/%s.csv' % dataset
    elif task_type in REG_TASKS:
        data_path = data_dir + 'data/rgs_datasets/%s.csv' % dataset
    else:
        raise ValueError("Unknown task type %s" % str(task_type))

    # if dataset in ['credit_default']:
    #     data_path = data_dir + 'data/datasets/%s.xls' % dataset

    # Load train data.
    if dataset in ['higgs', 'amazon_employee', 'spectf', 'usps', 'vehicle_sensIT', 'codrna']:
        label_column = 0
    elif dataset in ['rmftsa_sleepdata(1)']:
        label_column = 1
    else:
        label_column = -1

    if dataset in ['spambase', 'messidor_features']:
        header = None
    else:
        header = 'infer'

    if dataset in ['winequality_white', 'winequality_red']:
        sep = ';'
    else:
        sep = ','

    train_data_node = dm.load_train_csv(data_path, label_col=label_column, header=header,
                                        sep=sep, na_values=["n/a", "na", "--", "-", "?"])

    if preprocess:
        pipeline = FEPipeline(fe_enabled=False, metric='acc', task_type=task_type)
        train_data = pipeline.fit_transform(train_data_node)
    else:
        train_data = train_data_node

    if datanode_returned:
        return train_data
    else:
        X, y = train_data.data
        feature_types = train_data.feature_types
        return X, y, feature_types
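# Hedged usage sketch for load_data above, not part of the original snippet. It
# assumes a CSV exists at ./data/datasets/spambase.csv (the task_type=None path);
# any dataset following that layout works the same way.
if __name__ == '__main__':
    X, y, feature_types = load_data('spambase', data_dir='./')
    print(X.shape, len(y), feature_types)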
def test_cls():
    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)
    ensemble_method = 'bagging'
    eval_type = 'holdout'

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=1, stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    clf = Classifier(time_limit=time_limit,
                     output_dir=save_dir,
                     enable_meta_algorithm_selection=False,
                     ensemble_method=ensemble_method,
                     ensemble_size=10,
                     evaluation=eval_type,
                     metric='acc')
    clf.fit(train_data)
    print(clf.summary())
    pred = clf.predict(test_data)
    print(accuracy_score(test_data.data[1], pred))

    shutil.rmtree(save_dir)
def operate(self, data):
    dm = DataManager(data)
    data = dm.get_data_node(data, [])
    for fe in self.list:
        if fe is not None:
            data = fe.operate(data)
    return data.data[0]
def evaluate_fe_pipeline():
    from solnml.utils.data_manager import DataManager
    dm = DataManager()
    # file_path = "data/proprocess_data.csv"
    file_path = 'data/a9a/dataset_183_adult.csv'
    dm.load_train_csv(file_path)

    pipeline = FEPipeline(fe_enabled=True).fit(dm)
    train_data = pipeline.transform(dm)
    print(train_data)
    print(train_data.data)
def model_fit(_id, obj, paramsj, X_trainj, y_trainj):
    info_path = './models_information/' + _id + '_information'
    info_file = open(info_path, 'w')
    print('Model training begins!')
    try:
        # Read the JSON-encoded training data.
        X_train = np.array(pd.DataFrame(json.loads(X_trainj)))
        y_train = np.array(pd.DataFrame(json.loads(y_trainj)))[:, 0]
        params = json.loads(paramsj)
        # print(y_train)
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)

        save_dir = '../data/eval_exps/soln-ml'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Train the model.
        if obj == 'clf':
            mdl = Classifier(time_limit=params['time_limit'],
                             output_dir=save_dir,
                             ensemble_method=params['ensemble_method'],
                             evaluation=params['evaluation'],
                             metric=params['metric'],
                             n_jobs=4)
        elif obj == 'reg':
            mdl = Regressor(metric=params['metric'],
                            ensemble_method=params['ensemble_method'],
                            evaluation=params['evaluation'],
                            time_limit=params['time_limit'],
                            output_dir=save_dir,
                            random_state=1,
                            n_jobs=4)  # mirrors the classifier branch; n_jobs was undefined here
        mdl.fit(train_data)
    except Exception:
        print('Model training failed!')
        info_file.write('Model training failed!')
        info_file.close()
        return -1

    result = dict()
    result['best_algo_id'] = str(mdl.best_algo_id)
    result['best_hpo_config'] = str(mdl.best_hpo_config)
    result['nbest_algo_id'] = str(mdl.nbest_algo_id)
    result['best_perf'] = str(mdl.best_perf)
    result['best_fe_config'] = str(mdl.best_fe_config)
    # get_ens_model_info is not realized in this version yet.
    result['get_ens_model_info'] = str(mdl.get_ens_model_info)
    info_file.write(json.dumps(result))
    info_file.close()
    print('Model training finished!')
    return 0
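# Hedged caller sketch for model_fit above, not part of the original snippet. The
# payloads must be JSON strings that round-trip through json.loads -> pd.DataFrame;
# the parameter values are illustrative, and ./models_information/ is assumed to exist.
import json
from sklearn.datasets import load_iris

iris = load_iris()
X_trainj = json.dumps(iris.data.tolist())
y_trainj = json.dumps(iris.target.tolist())
paramsj = json.dumps({'time_limit': 60,
                      'ensemble_method': 'bagging',
                      'evaluation': 'holdout',
                      'metric': 'acc'})
model_fit('demo_model', 'clf', paramsj, X_trainj, y_trainj)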
def fetch_data(task_id):
    dm = DataManager()
    # data_dir and create_csv are expected to be defined at module level.
    train_data_path = data_dir + 'data/p%d.csv' % task_id
    test_data_path = data_dir + 'data/test_data.csv'
    if not os.path.exists(train_data_path) or not os.path.exists(test_data_path):
        create_csv(task_id)

    train_data_node = dm.load_train_csv(train_data_path, label_col=-1, header='infer', sep=',')
    print('loading train data finished.')
    test_data_node = dm.load_test_csv(test_data_path, has_label=False, header='infer', sep=',')
    print('loading test data finished.')
    return train_data_node, test_data_node
class TabularDataset(BaseDataset):
    def __init__(self, data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        super().__init__()
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size
        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        self.data_manager = DataManager()
        train_data_node = self.data_manager.load_train_csv(data_path,
                                                           label_col=self.label_column,
                                                           header=self.header,
                                                           sep=self.sep,
                                                           na_values=list(self.nan_values))
        task_type = REGRESSION if self.is_regression else CLASSIFICATION
        self._process_pipeline = FEPipeline(fe_enabled=False, metric='acc', task_type=task_type)
        return self._process_pipeline.fit_transform(train_data_node)

    def load_data(self):
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        # self.test_data_path is not set in __init__; it is expected to be provided
        # (e.g. by BaseDataset or the caller) before this method is invoked.
        test_data_node = self.data_manager.load_test_csv(self.test_data_path,
                                                         has_label=False,
                                                         keep_default_na=True,
                                                         header=self.header,
                                                         sep=self.sep)
        self.test_dataset = self._process_pipeline.transform(test_data_node)
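# Hedged usage sketch for TabularDataset above, not part of the original snippet.
# 'data/train.csv' and 'data/test.csv' are placeholder paths, and setting
# test_data_path directly on the instance is an assumption (see the note in
# load_test_data).
dataset = TabularDataset('data/train.csv', is_regression=False, label_column=-1)
dataset.load_data()
print(dataset.train_dataset)

dataset.test_data_path = 'data/test.csv'
dataset.load_test_data()
print(dataset.test_dataset)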
def post(self):
    start_time = time.time()
    args = self.parser.parse_args()
    _id = request.form['model_name']
    obj = request.form['objective']
    X_file = request.files['data_file_X']
    y_file = request.files['data_file_y']

    # Read the uploaded training data.
    X_train = np.array(pd.read_csv(X_file))
    y_train = np.array(pd.read_csv(y_file))[:, 0]
    print(y_train)
    if not obj:
        obj = 'clf'

    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Train the model.
    if obj == 'clf':
        mdl = Classifier(time_limit=100,
                         output_dir=save_dir,
                         ensemble_method='bagging',
                         evaluation='holdout',
                         metric='acc',
                         n_jobs=4)
    elif obj == 'reg':
        # Regressor settings mirror the classifier branch; the originals here
        # referenced undefined names.
        mdl = Regressor(metric='mse',
                        ensemble_method='bagging',
                        evaluation='holdout',
                        time_limit=100,
                        output_dir=save_dir,
                        random_state=1,
                        n_jobs=4)
    model_fit(_id, mdl, train_data)
    self.model_factory.add_pipeline(mdl, train_data, _id)
    print(self.model_factory)
    result = {'trainTime': time.time() - start_time,
              'trainShape': X_train.shape}
    self.model_factory.pipelines[_id]['stats'] = result
    return json.dumps(result)
def post(self):
    args = self.parser.parse_args()
    X_file = request.files['data_file_X']
    X_test = np.array(pd.read_csv(X_file))
    # The data node requires a label vector, so supply a dummy one.
    y_test = np.zeros(X_test.shape[0])
    dm = DataManager(X_test, y_test)
    test_data = dm.get_data_node(X_test, y_test)

    _id = request.form['model_name']
    # need_proba arrives as a form string; any non-empty value enables probabilities.
    proba = request.form['need_proba']
    mdl = self.model_factory.pipelines[_id]['model']
    if proba:
        try:
            y_pred = mdl.predict_proba(test_data)
        except Exception:
            y_pred = mdl.predict(test_data)
    else:
        y_pred = mdl.predict(test_data)
    print(y_pred)
    # params = read_params(args['params'].stream)
    return pd.DataFrame(y_pred).to_json()
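# Hedged client-side sketch for the prediction handler above, not part of the
# original snippet. The route '/predict', host and port are assumptions; the form
# and file field names match what the handler reads, and 'demo_model' is a
# placeholder for a previously trained pipeline id.
import requests

with open('X_test.csv', 'rb') as f:
    resp = requests.post('http://127.0.0.1:5000/predict',
                         data={'model_name': 'demo_model', 'need_proba': ''},
                         files={'data_file_X': f})
print(resp.text)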
def load_data(dataset, data_dir='./'):
    dm = DataManager(na_values=[])
    data_path = data_dir + 'data/openml100/%s.csv' % dataset

    # Load train data.
    if dataset in ['higgs', 'cjs', 'Australian', 'monks-problems-1',
                   'monks-problems-2', 'monks-problems-3', 'profb', 'JapaneseVowels']:
        label_column = 0
    else:
        label_column = -1

    header = 'infer'
    sep = ','
    train_data_node = dm.load_train_csv(data_path, label_col=label_column, header=header,
                                        sep=sep, na_values=["n/a", "na", "--", "?"],
                                        keep_default_na=False)
    train_data_node = dm.preprocess_fit(train_data_node)
    X, y = train_data_node.data
    feature_types = train_data_node.feature_types
    return X, y, feature_types
def evaluate_data_manager():
    # from data_manager import DataManager
    # dm = DataManager()
    # train_df = dm.load_train_csv("data/proprocess_data.csv")
    # print(train_df)
    # print(dm.feature_types)
    # print(dm.missing_flags)
    from solnml.utils.data_manager import DataManager
    import numpy as np
    X = np.array([[1, 2, 3, 4], [1, 'asfd', 2, 1.4]])
    y = [1, 2]
    dm = DataManager(X, y)
    print(dm.feature_types)
    print(dm.missing_flags)
class Classifier():
    def __init__(self, dataset_name='default_dataset_name',
                 time_limit=10800,
                 amount_of_resource=None,
                 metric='acc',
                 include_algorithms=None,
                 enable_meta_algorithm_selection=True,
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=150,
                 random_state=1,
                 n_jobs=1,
                 evaluation='holdout',
                 impute=False,
                 impute_method=['MatrixFactorization', 'KNN', 'IterativeSVD'],
                 pre_fs=True,
                 fb_k=300,
                 fb_r0=50,
                 fb_max_iter=50,
                 fb_population_size=10,
                 fb_n0=500,
                 fb_metric='accuracy',
                 output_dir="/tmp/"):
        self.dataset_name = dataset_name
        self.metric = metric
        self.task_type = None
        self.time_limit = time_limit
        self.amount_of_resource = amount_of_resource
        self.include_algorithms = include_algorithms
        self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.per_run_time_limit = per_run_time_limit
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.evaluation = evaluation
        self.output_dir = output_dir
        self.dm = None

        # Imputation arguments.
        self.impute = impute
        self.impute_method = impute_method
        self.impute_operator = {}

        # FeatureBand (fb) arguments.
        self.pre_fs = pre_fs
        self.fb_k = fb_k
        self.fb_r0 = fb_r0
        self.fb_max_iter = fb_max_iter
        self.fb_population_size = fb_population_size
        self.fb_n0 = fb_n0
        self.fb_metric = fb_metric  # ['accuracy', 'f1_score']

        self.origin_X = None
        self.origin_y = None
        self.fb_operator = None
        self.headers = None
        self.selected_headers = None
        self.train_data = {}
        self.test_data = {}
        self.mdl = {}
        self.y_pred = {}

        # Create output directory.
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if not self.impute:
            self.impute_method = ['mean']  # simple mean imputation anyway

    def run_impute(self, X, state='train'):
        if state == 'train':
            self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
            for imp_method in self.impute_method:
                print('Impute ' + imp_method + ' starts!')
                if imp_method == 'mean':
                    imp_ope = SimpleFill()
                if imp_method == 'KNN':
                    imp_ope = KNN()
                if imp_method == 'IterativeSVD':
                    imp_ope = IterativeSVD()
                if imp_method == 'MatrixFactorization':
                    imp_ope = MatrixFactorization()
                X_filled = imp_ope.fit_transform(X)
                self.train_data[imp_method] = X_filled
                self.impute_operator[imp_method] = imp_ope
                self.train_data['ave'] += X_filled
                print('Impute ' + imp_method + ' ends!')
            self.train_data['ave'] /= len(self.impute_method)
        return 0

    def feature_selection(self, X, y, state='train'):
        if state == 'train':
            print('Feature_selection starts!')
            fb = FeatureBand(r0=self.fb_r0, n0=self.fb_n0, clf=load_clf('logistic'),
                             max_iter=self.fb_max_iter, k=self.fb_k,
                             population_size=self.fb_population_size, local_search=True)
            times, iter_best, global_best = fb.fit(X, y, metrics=self.fb_metric)
            for key in self.train_data:
                self.train_data[key] = fb.transform(self.train_data[key])
            self.fb_operator = fb
            if self.headers is not None:
                # Note: 'featrue_selected' (sic) is the attribute name exposed by FeatureBand.
                self.selected_headers = self.headers[fb.featrue_selected]
        return 0

    def preprocess(self, X, y):
        self.run_impute(X)
        if X.shape[1] > self.fb_k and self.pre_fs:
            self.feature_selection(self.train_data['ave'], y)
        return 0

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            self.headers = X.columns
        X = np.array(X)
        y = np.array(y)
        self.origin_X = X
        self.origin_y = y
        if np.sum(np.isnan(X)) == 0:
            self.impute = False
            self.impute_method = ['mean']
        self.preprocess(X, y)
        for key in self.impute_operator:
            X_train = self.train_data[key]
            self.dm = DataManager(X_train, y)
            train_data = self.dm.get_data_node(X_train, y)
            # Split the overall time budget evenly across the imputation variants.
            self.mdl[key] = soln_Classifier(time_limit=self.time_limit / len(self.impute_method),
                                            output_dir=self.output_dir,
                                            ensemble_method=self.ensemble_method,
                                            evaluation=self.evaluation,
                                            metric=self.metric,
                                            n_jobs=self.n_jobs)
            self.mdl[key].fit(train_data)
        return 0

    def predict_proba(self, X_test):
        y_pred = None
        X_test = np.array(X_test)
        for key in self.impute_operator:
            if np.sum(np.isnan(X_test)) > 0:
                X_test_filled = self.impute_operator[key].fit_transform(X_test)
            else:
                X_test_filled = X_test
            if X_test.shape[1] > self.fb_k and self.pre_fs:
                X_test_filled = self.fb_operator.transform(X_test_filled)
            test_data = self.dm.get_data_node(X_test_filled, [])
            self.y_pred[key] = self.mdl[key].predict_proba(test_data)
            if y_pred is None:
                y_pred = self.y_pred[key]
            else:
                y_pred += self.y_pred[key]
        # Average the probabilities over the imputation variants.
        y_pred /= len(self.impute_operator)
        return y_pred

    def predict(self, X_test):
        return np.argmax(self.predict_proba(X_test), axis=1)

    @property
    def get_feature_selected(self):
        if self.fb_operator is not None:
            return self.fb_operator.featrue_selected

    def feature_analysis(self, topk=30):
        from lightgbm import LGBMClassifier
        relation_list = {}
        result = None
        importance_array = []
        for key in self.impute_operator:
            mdl = self.mdl[key]
            data = self.dm.get_data_node(self.train_data[key], self.origin_y)
            data_tf = mdl.data_transform(data).data[0]
            sub_topk = min(int(topk / len(self.impute_operator)), data_tf.shape[1])
            lgb = LGBMClassifier()
            lgb.fit(data_tf, self.origin_y)
            _importance = lgb.feature_importances_
            index_needed = np.argsort(-_importance)[:sub_topk]
            temp_array = _importance[index_needed]
            temp_array = temp_array / np.max(temp_array)
            importance_array.append(temp_array)
            relation_list[key] = mdl.feature_corelation(data, index_needed)
            relation_list[key].index = key + relation_list[key].index
            if self.selected_headers is not None:
                relation_list[key].columns = list(self.selected_headers)
            elif self.fb_operator is not None:
                relation_list[key].columns = ['origin_feature' + str(it)
                                              for it in self.fb_operator.featrue_selected]
            if result is None:
                result = relation_list[key]
            else:
                result = result.append(relation_list[key])
        importance_frame = pd.DataFrame(np.hstack(importance_array))
        importance_frame.columns = ['feature_importance']
        importance_frame.index = result.index
        return pd.concat([importance_frame, result], axis=1)

    def save(self, save_dir):
        saveloadmodel.save(self, save_dir, task_type='CLF')
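# Hedged usage sketch for the imputing Classifier wrapper above, not part of the
# original snippet. The injected NaNs exercise the run_impute path; pre_fs has no
# effect here since iris has fewer than fb_k features. All parameter values are
# illustrative.
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data.copy(), iris.target
X[::20, 0] = np.nan  # inject missing values so the impute branch is taken
clf = Classifier(time_limit=60, impute=True, impute_method=['mean'],
                 pre_fs=False, ensemble_method=None, output_dir='/tmp/')
clf.fit(X, y)
print(clf.predict(X[:10]))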
args = parser.parse_args()
time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

boston = load_boston()
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
dm = DataManager(X_train, y_train)
train_data = dm.get_data_node(X_train, y_train)
test_data = dm.get_data_node(X_test, y_test)

save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                dataset_name='boston',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)
                    default='ensemble_selection',
                    choices=['none', 'bagging', 'blending', 'stacking', 'ensemble_selection'])
parser.add_argument('--n_jobs', type=int, default=1)

args = parser.parse_args()
time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

dm = DataManager()
train_node = dm.load_train_csv("train_dataset.csv", label_col=-1, header='infer',
                               na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv", header='infer', has_label=True)

from solnml.components.utils.constants import REGRESSION
pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)