def __init__(self, fixed_config=None, scorer=None, data_node=None, task_type=REGRESSION,
             resampling_strategy='cv', resampling_params=None,
             timestamp=None, output_dir=None, seed=1):
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.fixed_config = fixed_config
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.task_type = task_type
    self.data_node = data_node
    self.output_dir = output_dir
    self.seed = seed
    self.onehot_encoder = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.continue_training = False

    self.train_node = data_node.copy_()
    self.val_node = data_node.copy_()
    self.timestamp = timestamp
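# For context, a minimal sketch of what an evaluation under
# resampling_strategy='holdout' typically looks like. The DataNode access
# pattern, split ratio, and sklearn-style estimator here are illustrative
# assumptions, not the library's actual evaluator.
from sklearn.model_selection import train_test_split

def holdout_evaluate(estimator, data_node, scorer, test_size=0.33, seed=1):
    X, y = data_node.data
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=seed)
    estimator.fit(X_train, y_train)
    # scorer is a sklearn scorer: higher is better by convention.
    return scorer(estimator, X_val, y_val)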
def __init__(self, evaluator: _BaseEvaluator, config_space, name, timestamp,
             eval_type, output_dir=None, seed=None):
    self.evaluator = evaluator
    self.config_space = config_space

    assert name in ['hpo', 'fe']
    self.name = name
    # np.random.random_integers is deprecated and removed in NumPy >= 1.24;
    # use randint instead.
    self.seed = np.random.randint(MAX_INT) if seed is None else seed
    self.start_time = time.time()
    self.timing_list = list()
    self.incumbent = None
    self.eval_type = eval_type
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.init_hpo_iter_num = None
    self.early_stopped_flag = False
    self.timestamp = timestamp
    self.output_dir = output_dir
    self.topk_saver = CombinedTopKModelSaver(k=50, model_dir=self.output_dir,
                                             identifier=self.timestamp)
def _get_logger(self, name):
    import os
    logger_name = 'mindware-%s-%d:%s' % (self.task_id, self._seed, name)
    setup_logger(
        os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
        self.logging_config,
    )
    return get_logger(logger_name)
def __init__(self, estimator, master_ip, master_port, authkey, worker_port):
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.estimator = estimator
    self.evaluator = estimator.get_evaluator()
    self.master_ip = master_ip
    self.master_port = master_port
    self.worker_port = worker_port
    self.worker_messager = WorkerMessager(master_ip, master_port, authkey)
    self.receiver_messager = ReceiverMessager(ip='127.0.0.1', port=worker_port)
def __init__(self, eval_func, config_space, per_run_time_limit=600,
             seed=1, R=81, eta=3, n_jobs=1, output_dir='./'):
    self.eval_func = eval_func
    self.config_space = config_space
    self.n_workers = n_jobs
    self.per_run_time_limit = per_run_time_limit

    self.trial_cnt = 0
    self.configs = list()
    self.perfs = list()
    self.incumbent_perf = float("-INF")
    self.incumbent_config = self.config_space.get_default_configuration()
    self.incumbent_configs = list()
    self.incumbent_perfs = list()
    self.evaluation_stats = dict()
    self.evaluation_stats['timestamps'] = list()
    self.evaluation_stats['val_scores'] = list()
    self.global_start_time = time.time()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Parameters in the Hyperband framework.
    self.restart_needed = True
    self.R = R
    self.eta = eta
    self.seed = seed
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.B = (self.s_max + 1) * self.R
    self.s_values = list(reversed(range(self.s_max + 1)))
    self.inner_iter_id = 0

    # Parameters in MFSE-HB.
    self.iterate_r = []
    self.target_x = dict()
    self.target_y = dict()
    self.exp_output = dict()
    for index, item in enumerate(
            np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
        r = int(item)
        self.iterate_r.append(r)
        self.target_x[r] = list()
        self.target_y[r] = list()

    self.mf_advisor = MFBatchAdvisor(config_space, output_dir=output_dir)
    self.eval_dict = dict()
def __init__(self, rep=3, metric='acc', n_algorithm=3, task_type=None,
             total_resource=1200, exclude_datasets=None, meta_dir=None):
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    super().__init__(n_algorithm, task_type, metric, rep, total_resource,
                     'ranknet', exclude_datasets, meta_dir)
    self.model = None
def __init__(self, n_algorithm=3, task_type=None, metric='acc', exclude_datasets=None):
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    super().__init__(n_algorithm, task_type, metric=metric,
                     meta_algorithm='lightgbm', exclude_datasets=exclude_datasets)
    self.model = None
def __init__(self, task_type, architectures, time_limit,
             sampling_strategy='uniform', R=27, eta=3, N=9, n_jobs=1):
    self.architectures = architectures
    self.time_limit = time_limit
    self.task_type = task_type
    self.n_jobs = n_jobs
    self.R = R
    self.eta = eta
    self.N = N
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.sampling_strategy = sampling_strategy
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    from mindware.components.models.img_classification import _classifiers as _img_estimators, _addons as _img_addons
    from mindware.components.models.text_classification import _classifiers as _text_estimators, \
        _addons as _text_addons
    from mindware.components.models.object_detection import _classifiers as _od_estimators, _addons as _od_addons

    self.elimination_strategy = 'bandit'
    # Runtime stats.
    self.evaluation_stats = dict()

    self.update_cs = dict()

    if task_type == IMG_CLS:
        self._estimators = _img_estimators
        self._addons = _img_addons
    elif task_type == TEXT_CLS:
        self._estimators = _text_estimators
        self._addons = _text_addons
    elif task_type == OBJECT_DET:
        self._estimators = _od_estimators
        self._addons = _od_addons
    else:
        raise ValueError("Unknown task type %s" % task_type)

    self.eval_hist_configs = dict()
    self.eval_hist_perfs = dict()
    self.tpe_config_gen = dict()
    self.mfse_config_gen = dict()
def __init__(self, eval_func, config_space, seed=1, R=81, eta=3, n_jobs=1):
    self.eval_func = eval_func
    self.config_space = config_space
    self.n_workers = n_jobs

    self.trial_cnt = 0
    self.configs = list()
    self.perfs = list()
    self.incumbent_perf = float("-INF")
    self.incumbent_config = self.config_space.get_default_configuration()
    self.incumbent_configs = []
    self.incumbent_perfs = []
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Parameters in the Hyperband framework.
    self.restart_needed = True
    self.R = R
    self.eta = eta
    self.seed = seed
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.B = (self.s_max + 1) * self.R
    self.s_values = list(reversed(range(self.s_max + 1)))
    self.inner_iter_id = 0

    # Parameters in Hyperband.
    self.iterate_r = list()
    self.target_x = dict()
    self.target_y = dict()
    self.exp_output = dict()
    for index, item in enumerate(np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
        r = int(item)
        self.iterate_r.append(r)
        self.target_x[r] = list()
        self.target_y[r] = list()

    self.eval_dict = dict()
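# To make the bracket arithmetic above concrete (R, eta, s_max, s_values),
# here is a standalone sketch of the standard Hyperband schedule: each
# bracket s starts n configurations at resource r and successively reduces
# them by a factor of eta. This reproduces the textbook recursion, not this
# class's exact control flow.
from math import ceil, log

def hyperband_schedule(R=81, eta=3):
    s_max = int(log(R) / log(eta))
    for s in reversed(range(s_max + 1)):
        n = int(ceil((s_max + 1) * eta ** s / (s + 1)))  # initial configs
        r = R * eta ** (-s)                              # initial resource
        rungs = [(int(n * eta ** (-i)), int(r * eta ** i)) for i in range(s + 1)]
        print('bracket s=%d:' % s, rungs)

hyperband_schedule()
# bracket s=4: [(81, 1), (27, 3), (9, 9), (3, 27), (1, 81)]
# ...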
def __init__(self, eval_func, config_space, config_generator='tpe',
             seed=1, R=27, eta=3, n_jobs=1):
    self.eval_func = eval_func
    self.config_space = config_space
    self.config_generator = config_generator
    self.n_workers = n_jobs

    self.trial_cnt = 0
    self.configs = list()
    self.perfs = list()
    self.incumbent_perf = float("-INF")
    self.incumbent_config = self.config_space.get_default_configuration()
    self.incumbent_configs = list()
    self.incumbent_perfs = list()
    self.global_start_time = time.time()
    self.time_ticks = list()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Parameters in the Hyperband framework.
    self.restart_needed = True
    self.R = R
    self.eta = eta
    self.seed = seed
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.B = (self.s_max + 1) * self.R
    self.s_values = list(reversed(range(self.s_max + 1)))
    self.inner_iter_id = 0

    # Parameters in BOHB.
    self.iterate_r = list()
    self.target_x = dict()
    self.target_y = dict()
    self.exp_output = dict()
    for index, item in enumerate(
            np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
        r = int(item)
        self.iterate_r.append(r)
        self.target_x[r] = list()
        self.target_y[r] = list()

    types, bounds = get_types(self.config_space)
    self.num_config = len(bounds)
    self.surrogate = RandomForestWithInstances(types, bounds)
    # self.executor = ParallelEvaluator(self.eval_func, n_worker=n_jobs)
    # self.executor = ParallelProcessEvaluator(self.eval_func, n_worker=n_jobs)
    self.acquisition_func = EI(model=self.surrogate)
    self.acq_optimizer = RandomSampling(self.acquisition_func, self.config_space,
                                        n_samples=2000,
                                        rng=np.random.RandomState(seed))

    self.config_gen = TPE(config_space)
    self.eval_dict = dict()
def __init__(self, node_list, node_index, task_type, timestamp,
             fe_config_space: ConfigurationSpace,
             cash_config_space: ConfigurationSpace,
             data: DataNode,
             fixed_config=None,
             trial_num=0,
             time_limit=None,
             metric='acc',
             optimizer='smac',
             ensemble_method='ensemble_selection',
             ensemble_size=50,
             per_run_time_limit=300,
             output_dir="logs",
             dataset_name='default_dataset',
             eval_type='holdout',
             resampling_params=None,
             n_jobs=1,
             seed=1):
    # Tree setting
    self.node_list = node_list
    self.node_index = node_index

    # Set up backend.
    self.dataset_name = dataset_name
    self.trial_num = trial_num
    self.time_limit = time_limit
    self.per_run_time_limit = per_run_time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Basic settings.
    self.eval_type = eval_type
    self.resampling_params = resampling_params
    self.task_type = task_type
    self.timestamp = timestamp
    self.fe_config_space = fe_config_space
    self.cash_config_space = cash_config_space
    self.fixed_config = fixed_config
    self.original_data = data.copy_()
    self.metric = get_metric(metric)
    self.optimizer = optimizer
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.n_jobs = n_jobs
    self.seed = seed
    self.output_dir = output_dir

    self.early_stop_flag = False
    self.timeout_flag = False
    self.incumbent_perf = -float("INF")
    self.incumbent = None
    self.eval_dict = dict()

    if self.task_type in CLS_TASKS:
        self.if_imbal = is_imbalanced_dataset(self.original_data)
    else:
        self.if_imbal = False

    self.es = None
def __init__(self, stats, ensemble_method: str, ensemble_size: int,
             task_type: int, metric: _BaseScorer, data_node, output_dir=None):
    self.stats = stats
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.metric = metric
    self.output_dir = output_dir
    self.node = data_node

    self.predictions = []
    self.train_labels = None
    self.timestamp = str(time.time())
    logger_name = 'EnsembleBuilder'
    self.logger = get_logger(logger_name)

    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (_, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, model, _ = pkl.load(f)
            _node = self.node.copy_()
            _node = construct_node(_node, op_list)

            # TODO: Test size
            test_size = 0.33
            X, y = _node.data
            if self.task_type in CLS_TASKS:
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=1)
            else:
                ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
            for train_index, val_index in ss.split(X, y):
                X_valid = X[val_index]
                y_valid = y[val_index]

            if self.train_labels is not None:
                assert (self.train_labels == y_valid).all()
            else:
                self.train_labels = y_valid

            if self.task_type in CLS_TASKS:
                y_valid_pred = model.predict_proba(X_valid)
            else:
                y_valid_pred = model.predict(X_valid)
            self.predictions.append(y_valid_pred)

    if len(self.predictions) < self.ensemble_size:
        self.ensemble_size = len(self.predictions)

    if ensemble_method == 'ensemble_selection':
        return

    if task_type in CLS_TASKS:
        self.base_model_mask = choose_base_models_classification(
            np.array(self.predictions), self.ensemble_size)
    else:
        self.base_model_mask = choose_base_models_regression(
            np.array(self.predictions), np.array(y_valid), self.ensemble_size)
    self.ensemble_size = sum(self.base_model_mask)
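# Since ensemble_method='ensemble_selection' short-circuits the mask
# computation above, here is a hedged sketch of the greedy
# ensemble-selection idea (Caruana et al., 2004) such a branch typically
# feeds into. The scoring function and probability averaging are
# illustrative, not this class's implementation.
import numpy as np

def greedy_ensemble_selection(predictions, y_true, score_fn, ensemble_size=50):
    ensemble_sum = np.zeros_like(predictions[0])
    chosen = []
    for _ in range(ensemble_size):
        scores = [
            score_fn(y_true, (ensemble_sum + p) / (len(chosen) + 1))
            for p in predictions
        ]
        best = int(np.argmax(scores))     # models may be picked repeatedly
        ensemble_sum += predictions[best]
        chosen.append(best)
    return chosen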
def __init__(self):
    self.logger = get_logger(__name__)
def _get_logger(self, name):
    logger_name = 'MindWare-%s(%d)' % (name, self.seed)
    setup_logger(os.path.join(self.output_dir, '%s.log' % str(logger_name)),
                 self.logging_config)
    return get_logger(logger_name)
def __init__(self, configspace, min_points_in_model=None,
             top_n_percent=15, num_samples=64, random_fraction=1 / 3,
             bandwidth_factor=3, min_bandwidth=1e-3, **kwargs):
    """
    Fits, for each given budget, a kernel density estimator on the best N
    percent of the configurations evaluated on that budget.

    Parameters
    ----------
    configspace: ConfigSpace
        Configuration space object
    top_n_percent: int
        Percentile of configurations used as training data for the kernel
        density estimator, e.g. if set to 10, the best 10% of configurations
        are used for training.
    min_points_in_model: int
        Minimum number of data points needed to fit a model.
    num_samples: int
        Number of samples drawn to optimize EI via sampling.
    random_fraction: float
        Fraction of purely random configurations returned.
    bandwidth_factor: float
        Widens the bandwidth of continuous parameters for proposed points,
        to optimize EI.
    min_bandwidth: float
        To keep diversity even when all (good) samples have the same value
        for one of the parameters, a minimum bandwidth (default: 1e-3) is
        used instead of zero.
    """
    super().__init__(**kwargs)
    self.top_n_percent = top_n_percent
    self.configspace = configspace
    self.bw_factor = bandwidth_factor
    self.min_bandwidth = min_bandwidth

    # Never fit the model on fewer points than (number of hyperparameters + 1).
    n_min = len(self.configspace.get_hyperparameters()) + 1
    if min_points_in_model is None or min_points_in_model < n_min:
        min_points_in_model = n_min
    self.min_points_in_model = min_points_in_model

    self.num_samples = num_samples
    self.random_fraction = random_fraction

    hps = self.configspace.get_hyperparameters()

    self.kde_vartypes = ""
    self.vartypes = []
    for h in hps:
        if hasattr(h, 'sequence'):
            raise RuntimeError(
                'This version of BOHB does not support ordinal hyperparameters. '
                'Please encode %s as an integer parameter!' % h.name)
        if hasattr(h, 'choices'):
            self.kde_vartypes += 'u'
            self.vartypes += [len(h.choices)]
        else:
            self.kde_vartypes += 'c'
            self.vartypes += [0]

    self.vartypes = np.array(self.vartypes, dtype=int)

    # Store precomputed probs for the categorical parameters.
    self.cat_probs = []

    self.configs = dict()
    self.losses = dict()
    self.good_config_rankings = dict()
    self.kde_models = dict()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
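# Minimal sketch of the KDE-based proposal idea (BOHB/TPE) behind the class
# above: fit a 'good' and a 'bad' density and pick the sample maximizing
# their ratio l(x)/g(x). Uses scipy's gaussian_kde over purely continuous
# dimensions for brevity; the real generator handles mixed variable types
# via kde_vartypes, so this is an illustration, not the actual sampler.
import numpy as np
from scipy.stats import gaussian_kde

def propose(X, losses, top_n_percent=15, num_samples=64, rng=None):
    rng = rng or np.random.RandomState(0)
    n_good = max(X.shape[1] + 2, len(X) * top_n_percent // 100)
    order = np.argsort(losses)                       # smaller loss is better
    good = gaussian_kde(X[order[:n_good]].T)
    bad = gaussian_kde(X[order[n_good:]].T)
    candidates = good.resample(num_samples, seed=rng).T
    ratios = good.pdf(candidates.T) / np.maximum(bad.pdf(candidates.T), 1e-32)
    return candidates[int(np.argmax(ratios))]

X = np.random.RandomState(1).rand(50, 2)
losses = ((X - 0.3) ** 2).sum(axis=1)
print(propose(X, losses))                            # a point near (0.3, 0.3)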
def calculate_all_metafeatures(X, y, categorical, dataset_name, task_type,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    # The docstring must be the first statement of the function body, so the
    # logger is created after it.
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()
    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    func_cls = ['NumberOfClasses', 'LogNumberOfFeatures', 'ClassProbabilityMin',
                'ClassProbabilityMax', 'ClassProbabilityMean',
                'ClassProbabilitySTD', 'ClassEntropy', 'LandmarkLDA',
                'LandmarkNaiveBayes', 'LandmarkDecisionTree',
                'LandmarkDecisionNodeLearner', 'LandmarkRandomNodeLearner',
                'LandmarkWorstNodeLearner', 'Landmark1NN']

    # TODO: calculate the numpy metafeatures after all others to consume less
    # memory.
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue
        if name in func_cls and task_type not in CLS_TASKS:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO: make sure this is done as efficiently as possible (no
                # copy for sparse matrices because of wrong sparse format).
                sparse = scipy.sparse.issparse(X)

                imputer = SimpleImputer(strategy='most_frequent', copy=False)
                X_transformed = imputer.fit_transform(X.copy())
                if any(categorical):
                    categorical_idx = [idx for idx, i in enumerate(categorical) if i]
                    ohe = ColumnTransformer(
                        [('one-hot', OneHotEncoder(), categorical_idx)],
                        remainder="passthrough")
                    X_transformed = ohe.fit_transform(X_transformed)
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix.
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases. Because this is advanced indexing, a copy of
                # the data is returned!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO: shuffle in place.
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(dependency):
                logger.debug("%s: Going to calculate: %s", dataset_name, dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.debug("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_, task_type=task_type)
    return mf_
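# As an illustration of what an individual metafeature computes, here is a
# self-contained class-entropy-style metafeature with the same
# (X, y, categorical) call signature used above. It is written for
# illustration and is not the library's registered implementation.
import numpy as np

def class_entropy(X, y, categorical):
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return float(-(probs * np.log2(probs)).sum())

y = np.array([0, 0, 0, 1, 1, 2])
print(class_entropy(None, y, None))  # ~1.4591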
def __init__(self, n_algorithm=3, task_type=None, metric='bal_acc', rep=3,
             total_resource=1200, meta_algorithm='lightgbm',
             exclude_datasets=None, meta_dir=None):
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.n_algorithm = n_algorithm
    self.task_type = task_type
    self.meta_algo = meta_algorithm
    self.rep = rep
    self.metric = metric

    if task_type in CLS_TASKS:
        self.algorithms = _cls_builtin_algorithms
        self.n_algo_candidates = len(_cls_builtin_algorithms)
        if metric not in ['acc', 'bal_acc']:
            self.logger.info(
                'Meta information about metric-%s does not exist, use accuracy instead.' % str(metric))
            metric = 'acc'
    elif task_type in RGS_TASKS:
        self.algorithms = _rgs_builtin_algorithms
        self.n_algo_candidates = len(_rgs_builtin_algorithms)
        if metric not in ['mse']:
            self.logger.info(
                'Meta information about metric-%s does not exist, use mse instead.' % str(metric))
            metric = 'mse'
    else:
        raise ValueError('Invalid task type: %s.' % task_type)

    self.total_resource = total_resource
    self.exclude_datasets = exclude_datasets

    builtin_loc = os.path.dirname(__file__)
    builtin_loc = os.path.join(builtin_loc, '..')
    builtin_loc = os.path.join(builtin_loc, 'meta_resource')
    self.meta_dir = meta_dir if meta_dir is not None else builtin_loc

    if self.exclude_datasets is None:
        self.hash_id = 'none'
    else:
        self.exclude_datasets = list(set(exclude_datasets))
        exclude_str = ','.join(sorted(self.exclude_datasets))
        md5 = hashlib.md5()
        md5.update(exclude_str.encode('utf-8'))
        self.hash_id = md5.hexdigest()

    _folder = os.path.join(self.meta_dir, 'meta_dataset_vec')
    task_prefix = 'cls' if task_type in CLS_TASKS else 'rgs'
    embedding_path = os.path.join(
        _folder, '%s_meta_dataset_embedding.pkl' % task_prefix)
    with open(embedding_path, 'rb') as f:
        d = pkl.load(f)
        meta_datasets = d['task_ids']

    self._builtin_datasets = sorted(list(meta_datasets))
    self.metadata_manager = MetaDataManager(self.meta_dir, self.algorithms,
                                            self._builtin_datasets, metric,
                                            total_resource,
                                            task_type=task_type, rep=rep)
    self.meta_learner = None
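# To show how the pieces above (dataset embeddings, per-algorithm
# meta-performance, a lightgbm meta-model) typically fit together, a hedged
# sketch of metafeature-based algorithm recommendation: one regressor per
# candidate algorithm maps dataset metafeatures to an expected score, and
# the top-n algorithms by prediction are recommended. The feature layout and
# model choice are illustrative assumptions, not this class's fitted
# meta_learner.
import numpy as np
from lightgbm import LGBMRegressor

def fit_meta_models(meta_X, meta_scores):
    """meta_X: (n_datasets, n_metafeatures); meta_scores: (n_datasets, n_algos)."""
    models = []
    for j in range(meta_scores.shape[1]):
        m = LGBMRegressor(n_estimators=100)
        m.fit(meta_X, meta_scores[:, j])
        models.append(m)
    return models

def recommend(models, metafeatures, n_algorithm=3):
    preds = np.array([m.predict(metafeatures.reshape(1, -1))[0] for m in models])
    return np.argsort(-preds)[:n_algorithm]   # indices of the top-n algorithms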