class AutoML(object): def __init__(self, time_limit=300, dataset_name='default_name', amount_of_resource=None, task_type=None, metric='bal_acc', include_algorithms=None, ensemble_method='ensemble_selection', enable_meta_algorithm_selection=True, per_run_time_limit=150, ensemble_size=50, evaluation='holdout', output_dir="logs", logging_config=None, random_state=1, n_jobs=1): self.metric_id = metric self.metric = get_metric(self.metric_id) self.dataset_name = dataset_name self.time_limit = time_limit self.seed = random_state self.per_run_time_limit = per_run_time_limit self.output_dir = output_dir self.logging_config = logging_config if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) self.logger = self._get_logger(self.dataset_name) self.evaluation_type = evaluation self.amount_of_resource = amount_of_resource self.ensemble_method = ensemble_method self.ensemble_size = ensemble_size self.enable_meta_algorithm_selection = enable_meta_algorithm_selection self.task_type = task_type self.n_jobs = n_jobs self.solver = None if include_algorithms is not None: self.include_algorithms = include_algorithms else: if task_type in CLS_TASKS: self.include_algorithms = list(classification_algorithms) elif task_type in REG_TASKS: self.include_algorithms = list(regression_algorithms) else: raise ValueError("Unknown task type %s" % task_type) if ensemble_method is not None and ensemble_method not in ensemble_list: raise ValueError("%s is not supported for ensemble!" % ensemble_method) def _get_logger(self, name): logger_name = 'SolnML-%s(%d)' % (name, self.seed) setup_logger( os.path.join(self.output_dir, '%s.log' % str(logger_name)), self.logging_config, ) return get_logger(logger_name) def fit(self, train_data: DataNode, dataset_id=None): """ this function includes this following two procedures. 1. tune each algorithm's hyperparameters. 2. engineer each algorithm's features automatically. :param train_data: :return: """ if self.enable_meta_algorithm_selection: try: alad = AlgorithmAdvisor(task_type=self.task_type, n_algorithm=9, metric=self.metric_id) n_algo = 5 model_candidates = alad.fetch_algorithm_set( train_data, dataset_id=dataset_id) include_models = list() for algo in model_candidates: if algo in self.include_algorithms and len( include_models) < n_algo: include_models.append(algo) self.include_algorithms = include_models self.logger.info( 'Executing meta-learning based algorithm recommendation!') self.logger.info('Algorithms recommended: %s' % ','.join(self.include_algorithms)) except Exception as e: self.logger.error("Meta-learning failed!") # Check whether this dataset is balanced or not. if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data): # self.include_algorithms = imb_classication_algorithms self.logger.info('Input dataset is imbalanced!') train_data = DataBalancer().operate(train_data) if self.amount_of_resource is None: trial_num = len(self.include_algorithms) * 30 else: trial_num = self.amount_of_resource self.solver = FirstLayerBandit( self.task_type, trial_num, self.include_algorithms, train_data, per_run_time_limit=self.per_run_time_limit, dataset_name=self.dataset_name, ensemble_method=self.ensemble_method, ensemble_size=self.ensemble_size, inner_opt_algorithm='fixed', metric=self.metric, fe_algo='bo', seed=self.seed, time_limit=self.time_limit, eval_type=self.evaluation_type, output_dir=self.output_dir) self.solver.optimize() def refit(self): self.solver.refit() def predict_proba(self, test_data: DataNode): return self.solver.predict_proba(test_data) def predict(self, test_data: DataNode): return self.solver.predict(test_data) def score(self, test_data: DataNode, metric_func=None): if metric_func is None: metric_func = self.metric return metric_func(self, test_data, test_data.data[1]) def get_ens_model_info(self): if self.ensemble_method is not None: return self.solver.es.get_ens_model_info() else: return None
class AutoML(object): def __init__(self, time_limit=300, dataset_name='default_name', amount_of_resource=None, task_type=None, metric='bal_acc', include_algorithms=None, include_preprocessors=None, ensemble_method='ensemble_selection', enable_meta_algorithm_selection=True, enable_fe=True, per_run_time_limit=150, ensemble_size=50, evaluation='holdout', output_dir="logs", logging_config=None, random_state=1, n_jobs=1): self.metric_id = metric self.metric = get_metric(self.metric_id) self.dataset_name = dataset_name self.time_limit = time_limit self.seed = random_state self.per_run_time_limit = per_run_time_limit self.output_dir = output_dir self.logging_config = logging_config self.logger = self._get_logger(self.dataset_name) self.evaluation_type = evaluation self.include_preprocessors = include_preprocessors self.amount_of_resource = amount_of_resource self.ensemble_method = ensemble_method self.ensemble_size = ensemble_size self.enable_meta_algorithm_selection = enable_meta_algorithm_selection self.enable_fe = enable_fe self.task_type = task_type self.n_jobs = n_jobs self.solver = None # Disable meta learning if self.include_preprocessors is not None: self.enable_meta_algorithm_selection = False if include_algorithms is not None: self.include_algorithms = include_algorithms else: if task_type in CLS_TASKS: if task_type in [IMG_CLS, TEXT_CLS]: raise ValueError( 'Please use AutoDL module, instead of AutoML.') else: self.include_algorithms = list(classification_algorithms) elif task_type in RGS_TASKS: self.include_algorithms = list(regression_algorithms) else: raise ValueError("Unknown task type %s" % task_type) if ensemble_method is not None and ensemble_method not in ensemble_list: raise ValueError("%s is not supported for ensemble!" % ensemble_method) def _get_logger(self, name): logger_name = 'SolnML-%s(%d)' % (name, self.seed) setup_logger( os.path.join(self.output_dir, '%s.log' % str(logger_name)), self.logging_config, ) return get_logger(logger_name) def fit(self, train_data: DataNode, **kwargs): """ This function includes this following two procedures. 1. tune each algorithm's hyperparameters. 2. engineer each algorithm's features automatically. :param train_data: :return: """ # Check whether this dataset is balanced or not. # if self.task_type in CLS_TASKS and is_imbalanced_dataset(train_data): # self.logger.info('Input dataset is imbalanced!') # train_data = DataBalancer().operate(train_data) dataset_id = kwargs.get('dataset_id', None) inner_opt_algorithm = kwargs.get('opt_strategy', 'alter_hpo') self.logger.info('Optimization algorithm in 2rd bandit: %s' % inner_opt_algorithm) if self.enable_meta_algorithm_selection: try: n_algo_recommended = 5 meta_datasets = kwargs.get('meta_datasets', None) self.logger.info( 'Executing Meta-Learning based Algorithm Recommendation.') alad = RankNetAdvisor(task_type=self.task_type, n_algorithm=n_algo_recommended, metric=self.metric_id) alad.fit() model_candidates = alad.fetch_algorithm_set( dataset_id, datanode=train_data) include_models = list() for algo in model_candidates: if algo in self.include_algorithms and len( include_models) < n_algo_recommended: include_models.append(algo) # if 'logistic_regression' in include_models: # include_models.remove('logistic_regression') # if 'adaboost' not in include_models: # include_models.append('adaboost') # include_models = ['extra_trees', 'adaboost', 'liblinear_svc', 'random_forest', # 'libsvm_svc', 'lightgbm'] self.include_algorithms = include_models self.logger.info('Final Algorithms Recommended: [%s]' % ','.join(self.include_algorithms)) except Exception as e: self.logger.error( "Meta-Learning based Algorithm Recommendation FAILED: %s." % str(e)) traceback.print_exc(file=sys.stdout) self.solver = FirstLayerBandit( self.task_type, self.amount_of_resource, self.include_algorithms, train_data, include_preprocessors=self.include_preprocessors, per_run_time_limit=self.per_run_time_limit, dataset_name=self.dataset_name, ensemble_method=self.ensemble_method, ensemble_size=self.ensemble_size, inner_opt_algorithm=inner_opt_algorithm, metric=self.metric, enable_fe=self.enable_fe, fe_algo='bo', seed=self.seed, time_limit=self.time_limit, eval_type=self.evaluation_type, output_dir=self.output_dir) self.solver.optimize() def refit(self): self.solver.refit() def predict_proba(self, test_data: DataNode): return self.solver.predict_proba(test_data) def predict(self, test_data: DataNode): return self.solver.predict(test_data) def score(self, test_data: DataNode, metric_func=None): if metric_func is None: metric_func = self.metric return metric_func(self, test_data, test_data.data[1]) def get_ens_model_info(self): if self.ensemble_method is not None: return self.solver.es.get_ens_model_info() else: return None def get_val_stats(self): return self.solver.get_stats()