import os
import time
import traceback
import pickle as pkl
from typing import List
from collections import OrderedDict

# NOTE: the project-internal names used below (BOBase, SyncBatchAdvisor, AsyncBatchAdvisor,
# Observation, MasterMessager, MAXINT, setup_logger, get_logger) are assumed to be importable
# from the surrounding package; their exact module paths are omitted here.


class mqSMBO(BOBase):
    def __init__(self, objective_function, config_space,
                 eval_type='holdout',
                 parallel_strategy='async',
                 batch_size=4,
                 batch_strategy='median_imputation',
                 num_constraints=0,
                 num_objs=1,
                 sample_strategy: str = 'bo',
                 runtime_limit=600,
                 time_limit_per_trial=180,
                 surrogate_type=None,
                 acq_type=None,
                 acq_optimizer_type='local_random',
                 initial_runs=3,
                 init_strategy='random_explore_first',
                 initial_configurations=None,
                 ref_point=None,
                 history_bo_data: List[OrderedDict] = None,
                 logging_dir='logs',
                 task_id='default',
                 random_state=1,
                 ip="",
                 port=13579,
                 authkey=b'abc'):
        self.task_info = {'num_constraints': num_constraints, 'num_objs': num_objs}
        self.FAILED_PERF = [MAXINT] * num_objs
        super().__init__(objective_function, config_space, task_id=task_id, output_dir=logging_dir,
                         random_state=random_state, initial_runs=initial_runs, max_runs=int(1e10),
                         runtime_limit=runtime_limit, sample_strategy=sample_strategy,
                         time_limit_per_trial=time_limit_per_trial, history_bo_data=history_bo_data)

        if parallel_strategy == 'sync':
            self.config_advisor = SyncBatchAdvisor(config_space, self.task_info,
                                                   batch_size=batch_size,
                                                   batch_strategy=batch_strategy,
                                                   initial_trials=initial_runs,
                                                   initial_configurations=initial_configurations,
                                                   init_strategy=init_strategy,
                                                   history_bo_data=history_bo_data,
                                                   optimization_strategy=sample_strategy,
                                                   surrogate_type=surrogate_type,
                                                   acq_type=acq_type,
                                                   acq_optimizer_type=acq_optimizer_type,
                                                   ref_point=ref_point,
                                                   task_id=task_id,
                                                   output_dir=logging_dir,
                                                   random_state=random_state)
        elif parallel_strategy == 'async':
            self.config_advisor = AsyncBatchAdvisor(config_space, self.task_info,
                                                    batch_size=batch_size,
                                                    batch_strategy=batch_strategy,
                                                    initial_trials=initial_runs,
                                                    initial_configurations=initial_configurations,
                                                    init_strategy=init_strategy,
                                                    history_bo_data=history_bo_data,
                                                    optimization_strategy=sample_strategy,
                                                    surrogate_type=surrogate_type,
                                                    acq_type=acq_type,
                                                    acq_optimizer_type=acq_optimizer_type,
                                                    ref_point=ref_point,
                                                    task_id=task_id,
                                                    output_dir=logging_dir,
                                                    random_state=random_state)
        else:
            raise ValueError('Invalid parallel strategy - %s.' % parallel_strategy)

        self.eval_type = eval_type
        self.parallel_strategy = parallel_strategy
        self.batch_size = batch_size
        max_queue_len = max(100, 3 * batch_size)
        self.master_messager = MasterMessager(ip, port, authkey, max_queue_len, max_queue_len)
        self.start_time = time.time()

        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.eval_dict = dict()
        self.workers = dict()

    def async_run(self):
        config_num = 0
        cur_num = 0
        while time.time() - self.start_time < self.runtime_limit:
            # Add jobs to masterQueue.
            while len(self.config_advisor.running_configs) < self.batch_size \
                    and config_num < self.max_iterations:
                config_num += 1
                config = self.config_advisor.get_suggestion()
                msg = [config, self.time_limit_per_trial]
                self.logger.info("Master: Add config %d." % config_num)
                self.master_messager.send_message(msg)

            # Get results from workerQueue.
            while True:
                observation = self.master_messager.receive_message()
                if observation is None:
                    # Wait for workers.
                    # self.logger.info("Master: wait for worker results. sleep 1s.")
                    time.sleep(1)
                    break
                # Report result.
                cur_num += 1
                config, trial_state, constraints, objs, elapsed_time, worker_info, extra_info = observation

                stored_info = list(self.workers.values())
                if worker_info not in stored_info:
                    self.workers[len(self.workers)] = worker_info

                _perf = float("INF") if objs is None else objs[0]
                self.configs.append(config)
                self.perfs.append(_perf)
                self.eval_dict[config] = [-_perf, time.time(), trial_state]
                if -_perf > self.incumbent_perf:
                    self.incumbent_perf = -_perf
                    self.incumbent_config = config

                if objs is None:
                    observation = Observation(config, trial_state, constraints, self.FAILED_PERF,
                                              elapsed_time, worker_info=worker_info, extra=extra_info)
                self.config_advisor.update_observation(observation)
                self.logger.info('Master: Get %d observation: %s' % (cur_num, str(observation)))

    def sync_run(self):
        batch_id = 0
        while time.time() - self.start_time < self.runtime_limit:
            configs = self.config_advisor.get_suggestions()
            # Add batch configs to masterQueue.
            for config in configs:
                msg = [config, self.time_limit_per_trial]
                self.master_messager.send_message(msg)
            self.logger.info('Master: %d-th batch. %d configs sent.' % (batch_id, len(configs)))

            # Get batch results from workerQueue.
            result_num = 0
            result_needed = len(configs)
            while True:
                observation = self.master_messager.receive_message()
                if observation is None:
                    # Wait for workers.
                    # self.logger.info("Master: wait for worker results. sleep 1s.")
                    time.sleep(1)
                    continue
                # Report result.
                result_num += 1
                config, trial_state, constraints, objs, elapsed_time, worker_info, extra_info = observation
                if objs is None:
                    observation = Observation(config, trial_state, constraints, self.FAILED_PERF,
                                              elapsed_time, worker_info, extra_info)
                self.config_advisor.update_observation(observation)
                self.logger.info('Master: In the %d-th batch [%d], observation is: %s'
                                 % (batch_id, result_num, str(observation)))
                if result_num == result_needed:
                    break
            batch_id += 1

    def run(self):
        if self.parallel_strategy == 'async':
            self.async_run()
        else:
            self.sync_run()
        return self.get_history()
class async_mqBaseFacade(object):
    def __init__(self, objective_func,
                 restart_needed=False,
                 need_lc=False,
                 method_name='default_method_name',
                 log_directory='logs',
                 data_directory='data',
                 time_limit_per_trial=600,
                 runtime_limit=None,
                 max_queue_len=300,
                 ip='',
                 port=13579,
                 authkey=b'abc',
                 sleep_time=0.1):
        self.log_directory = log_directory
        if not os.path.exists(self.log_directory):
            os.makedirs(self.log_directory)
        self.data_directory = data_directory
        if not os.path.exists(self.data_directory):
            os.makedirs(self.data_directory)
        self.logger = self._get_logger(method_name)

        self.objective_func = objective_func
        self.trial_statistics = list()
        self.recorder = list()

        self.global_start_time = time.time()
        self.runtime_limit = None
        self._history = {"time_elapsed": list(), "performance": list(),
                         "best_trial_id": list(), "configuration": list()}
        self.global_incumbent = 1e10
        self.global_incumbent_configuration = None
        self.global_trial_counter = 0
        self.restart_needed = restart_needed
        self.record_lc = need_lc
        self.method_name = method_name
        # evaluation metrics
        self.stage_id = 1
        self.stage_history = {'stage_id': list(), 'performance': list()}
        self.grid_search_perf = list()

        self.save_intermediate_record = False
        self.save_intermediate_record_id = 0
        self.save_intermediate_record_path = None

        if self.method_name is None:
            raise ValueError('Method name must be specified! NOT NONE.')

        self.time_limit_per_trial = time_limit_per_trial
        self.runtime_limit = runtime_limit
        assert self.runtime_limit is not None

        max_queue_len = max(300, max_queue_len)
        self.master_messager = MasterMessager(ip, port, authkey, max_queue_len, max_queue_len)

        self.sleep_time = sleep_time

    def set_restart(self):
        self.restart_needed = True

    def set_method_name(self, name):
        self.method_name = name

    def add_stage_history(self, stage_id, performance):
        self.stage_history['stage_id'].append(stage_id)
        self.stage_history['performance'].append(performance)

    def add_history(self, time_elapsed, performance, trial_id, config):
        self._history['time_elapsed'].append(time_elapsed)
        self._history['performance'].append(performance)
        self._history['best_trial_id'].append(trial_id)
        self._history['configuration'].append(config)

    def run(self):
        try:
            worker_num = 0
            while True:
                if self.runtime_limit is not None \
                        and time.time() - self.global_start_time > self.runtime_limit:
                    self.logger.info('RUNTIME BUDGET is RUNNING OUT.')
                    return

                # Get observation from worker
                observation = self.master_messager.receive_message()  # return_info, time_taken, trial_id, config
                if observation is None:
                    # Wait for workers.
                    time.sleep(self.sleep_time)
                    continue

                return_info, time_taken, trial_id, config = observation
                # worker init
                if config is None:
                    worker_num += 1
                    self.logger.info("Worker %d init." % (worker_num, ))
                # update observation
                else:
                    global_time = time.time() - self.global_start_time
                    self.logger.info('Master get observation: %s. Global time=%.2fs.'
                                     % (str(observation), global_time))
                    n_iteration = return_info['n_iteration']
                    perf = return_info['loss']
                    t = time.time()
                    self.update_observation(config, perf, n_iteration)
                    self.logger.info('update_observation() cost %.2fs.' % (time.time() - t,))
                    self.recorder.append({'trial_id': trial_id, 'time_consumed': time_taken,
                                          'configuration': config, 'n_iteration': n_iteration,
                                          'return_info': return_info, 'global_time': global_time})
                    if (not hasattr(self, 'R')) or n_iteration == self.R:
                        self.save_intermediate_statistics()

                # Send new job
                t = time.time()
                config, n_iteration, extra_conf = self.get_job()
                self.logger.info('get_job() cost %.2fs.' % (time.time() - t, ))
                msg = [config, extra_conf, self.time_limit_per_trial, n_iteration, self.global_trial_counter]
                self.master_messager.send_message(msg)
                self.global_trial_counter += 1
                self.logger.info('Master send job: %s.' % (msg,))
        except Exception as e:
            print(e)
            print(traceback.format_exc())
            self.logger.error(traceback.format_exc())

    def get_job(self):
        raise NotImplementedError

    def update_observation(self, config, perf, n_iteration):
        raise NotImplementedError

    def set_save_intermediate_record(self, dir_path, file_name):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        self.save_intermediate_record = True
        if file_name.endswith('.pkl'):
            file_name = file_name[:-4]
        self.save_intermediate_record_path = os.path.join(dir_path, file_name)
        self.logger.info('set save_intermediate_record to True. path: %s.'
                         % (self.save_intermediate_record_path,))

    def save_intermediate_statistics(self):
        if self.save_intermediate_record:
            self.save_intermediate_record_id += 1
            path = '%s_%d.pkl' % (self.save_intermediate_record_path, self.save_intermediate_record_id)
            with open(path, 'wb') as f:
                pkl.dump(self.recorder, f)
            global_time = time.time() - self.global_start_time
            self.logger.info('Intermediate record %s saved! global_time=%.2fs.' % (path, global_time))

    def _get_logger(self, name):
        logger_name = name
        setup_logger(os.path.join(self.log_directory, '%s.log' % str(logger_name)), None)
        return get_logger(self.__class__.__name__)
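
# Example: a minimal sketch of a concrete scheduler built on async_mqBaseFacade, assuming
# random sampling over a ConfigSpace search space. The class name `RandomSearchFacade`,
# the attribute `R` (full budget) and the empty `extra_conf` are illustrative assumptions;
# only the get_job()/update_observation() contracts are taken from the base class above.
# A Hyperband-style scheduler would keep bracket state here instead.
from ConfigSpace import ConfigurationSpace


class RandomSearchFacade(async_mqBaseFacade):
    def __init__(self, objective_func, config_space: ConfigurationSpace, R=81, **kwargs):
        super().__init__(objective_func, **kwargs)  # runtime_limit must be passed via kwargs
        self.config_space = config_space
        self.R = R  # full budget; run() saves intermediate records when n_iteration == R
        self.incumbent_config = None
        self.incumbent_perf = float('inf')

    def get_job(self):
        # Propose a random configuration, always at full budget.
        config = self.config_space.sample_configuration()
        extra_conf = dict()
        return config, self.R, extra_conf

    def update_observation(self, config, perf, n_iteration):
        # Track the best (lowest-loss) configuration observed so far.
        if perf < self.incumbent_perf:
            self.incumbent_perf = perf
            self.incumbent_config = config

# Usage (illustrative; workers are assumed to connect to the same ip/port/authkey):
#     facade = RandomSearchFacade(objective_func=None, config_space=cs,
#                                 method_name='random_search', runtime_limit=3600)
#     facade.run()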
class mqBaseFacade(object):
    def __init__(self, objective_func,
                 restart_needed=False,
                 need_lc=False,
                 method_name='default_method_name',
                 log_directory='logs',
                 data_directory='data',
                 time_limit_per_trial=600,
                 runtime_limit=None,
                 max_queue_len=300,
                 ip='',
                 port=13579,
                 authkey=b'abc'):
        self.log_directory = log_directory
        if not os.path.exists(self.log_directory):
            os.makedirs(self.log_directory)
        self.data_directory = data_directory
        if not os.path.exists(self.data_directory):
            os.makedirs(self.data_directory)
        self.logger = self._get_logger(method_name)

        self.objective_func = objective_func
        self.trial_statistics = []
        self.recorder = []

        self.global_start_time = time.time()
        self.runtime_limit = None
        self._history = {"time_elapsed": [], "performance": [],
                         "best_trial_id": [], "configuration": []}
        self.global_incumbent = 1e10
        self.global_incumbent_configuration = None
        self.global_trial_counter = 0
        self.restart_needed = restart_needed
        self.record_lc = need_lc
        self.method_name = method_name
        # evaluation metrics
        self.stage_id = 1
        self.stage_history = {'stage_id': [], 'performance': []}
        self.grid_search_perf = []

        if self.method_name is None:
            raise ValueError('Method name must be specified! NOT NONE.')

        self.time_limit_per_trial = time_limit_per_trial
        self.runtime_limit = runtime_limit

        max_queue_len = max(300, max_queue_len)
        self.master_messager = MasterMessager(ip, port, authkey, max_queue_len, max_queue_len)

    def set_restart(self):
        self.restart_needed = True

    def set_method_name(self, name):
        self.method_name = name

    def add_stage_history(self, stage_id, performance):
        self.stage_history['stage_id'].append(stage_id)
        self.stage_history['performance'].append(performance)

    def add_history(self, time_elapsed, performance, trial_id, config):
        self._history['time_elapsed'].append(time_elapsed)
        self._history['performance'].append(performance)
        self._history['best_trial_id'].append(trial_id)
        self._history['configuration'].append(config)

    def run_in_parallel(self, configurations, n_iteration, extra_info=None):
        n_configuration = len(configurations)
        performance_result = []
        early_stops = []

        # TODO: need systematic tests.
        # Check whether duplicate configurations exist in the batch.
        count_dict = dict()
        for i, config in enumerate(configurations):
            if config not in count_dict:
                count_dict[config] = 0
            count_dict[config] += 1

        # Incorporate reference info.
        conf_list = []
        for index, config in enumerate(configurations):
            extra_conf_dict = dict()
            if count_dict[config] > 1:
                extra_conf_dict['uid'] = count_dict[config]
                count_dict[config] -= 1

            if extra_info is not None:
                extra_conf_dict['reference'] = extra_info[index]
            extra_conf_dict['need_lc'] = self.record_lc
            extra_conf_dict['method_name'] = self.method_name
            conf_list.append((config, extra_conf_dict))

        # Add batch configs to masterQueue.
        for config, extra_conf in conf_list:
            msg = [config, extra_conf, self.time_limit_per_trial, n_iteration, self.global_trial_counter]
            self.master_messager.send_message(msg)
            self.global_trial_counter += 1
        self.logger.info('Master: %d configs sent.' % (len(conf_list)))

        # Get batch results from workerQueue.
        result_num = 0
        result_needed = len(conf_list)
        while True:
            if self.runtime_limit is not None \
                    and time.time() - self.global_start_time > self.runtime_limit:
                break
            observation = self.master_messager.receive_message()  # return_info, time_taken, trial_id, config
            if observation is None:
                # Wait for workers.
                # self.logger.info("Master: wait for worker results. sleep 1s.")
                time.sleep(1)
                continue
            # Report result.
            result_num += 1
            global_time = time.time() - self.global_start_time
            self.trial_statistics.append((observation, global_time))
            self.logger.info('Master: Get the [%d] result, observation is %s.'
                             % (result_num, str(observation)))
            if result_num == result_needed:
                break

        # sort by trial_id. FIX BUG
        self.trial_statistics.sort(key=lambda x: x[0][2])

        # get the evaluation statistics
        for observation, global_time in self.trial_statistics:
            return_info, time_taken, trial_id, config = observation

            performance = return_info['loss']
            if performance < self.global_incumbent:
                self.global_incumbent = performance
                self.global_incumbent_configuration = config

            self.add_history(global_time, self.global_incumbent, trial_id,
                             self.global_incumbent_configuration)
            # TODO: old version => performance_result.append(performance)
            performance_result.append(return_info)
            early_stops.append(return_info.get('early_stop', False))
            self.recorder.append({'trial_id': trial_id, 'time_consumed': time_taken,
                                  'configuration': config, 'n_iteration': n_iteration,
                                  'return_info': return_info, 'global_time': global_time})

        self.trial_statistics.clear()

        self.save_intemediate_statistics()
        if self.runtime_limit is not None \
                and time.time() - self.global_start_time > self.runtime_limit:
            raise ValueError('Runtime budget exceeded!')
        return performance_result, early_stops

    def save_intemediate_statistics(self, save_stage=False):
        # file_name = '%s.npy' % self.method_name
        # x = np.array(self._history['time_elapsed'])
        # y = np.array(self._history['performance'])
        # np.save(os.path.join(self.data_directory, file_name), np.array([x, y]))
        #
        # config_file_name = 'config_%s.pkl' % self.method_name
        # with open(os.path.join(self.data_directory, config_file_name), 'wb') as f:
        #     pkl.dump(self.global_incumbent_configuration, f)
        #
        # record_file_name = 'record_%s.pkl' % self.method_name
        # with open(os.path.join(self.data_directory, record_file_name), 'wb') as f:
        #     pkl.dump(self.recorder, f)
        #
        # if save_stage:
        #     stage_file_name = 'stage_%s.npy' % self.method_name
        #     stage_x = np.array(self.stage_history['stage_id'])
        #     stage_y = np.array(self.stage_history['performance'])
        #     np.save(os.path.join(self.data_directory, stage_file_name), np.array([stage_x, stage_y]))
        #
        # if PLOT:
        #     plt.plot(x, y)
        #     plt.xlabel('Time elapsed (sec)')
        #     plt.ylabel('Validation error')
        #     plt.savefig("data/%s.png" % self.method_name)
        return

    def _get_logger(self, name):
        logger_name = name
        setup_logger(os.path.join(self.log_directory, '%s.log' % str(logger_name)), None)
        return get_logger(self.__class__.__name__)
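
# Example: a minimal sketch of the synchronous batch API of mqBaseFacade, assuming a
# ConfigSpace search space `cs` and worker processes already connected to the same
# ip/port/authkey; all argument values below are illustrative assumptions.
#
#     facade = mqBaseFacade(objective_func=None, method_name='demo',
#                           runtime_limit=3600, ip='', port=13579, authkey=b'abc')
#     configs = cs.sample_configuration(size=4)
#     perf_infos, early_stops = facade.run_in_parallel(configs, n_iteration=27)
#     for info, stopped in zip(perf_infos, early_stops):
#         print(info['loss'], stopped)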
class mqSMBO(BOBase):
    def __init__(self, objective_function, config_space,
                 parallel_strategy='async',
                 batch_size=4,
                 batch_strategy='default',
                 num_constraints=0,
                 num_objs=1,
                 sample_strategy: str = 'bo',
                 max_runs=200,
                 time_limit_per_trial=180,
                 surrogate_type='auto',
                 acq_type='auto',
                 acq_optimizer_type='auto',
                 initial_runs=3,
                 init_strategy='random_explore_first',
                 initial_configurations=None,
                 ref_point=None,
                 history_bo_data: List[OrderedDict] = None,
                 logging_dir='logs',
                 task_id='default_task_id',
                 random_state=None,
                 advisor_kwargs: dict = None,
                 ip="",
                 port=13579,
                 authkey=b'abc'):

        if task_id is None:
            raise ValueError('Task id is not SPECIFIED. Please input task id first.')

        self.num_objs = num_objs
        self.num_constraints = num_constraints
        self.FAILED_PERF = [MAXINT] * num_objs
        super().__init__(objective_function, config_space, task_id=task_id, output_dir=logging_dir,
                         random_state=random_state, initial_runs=initial_runs, max_runs=max_runs,
                         sample_strategy=sample_strategy, time_limit_per_trial=time_limit_per_trial,
                         history_bo_data=history_bo_data)

        self.parallel_strategy = parallel_strategy
        self.batch_size = batch_size
        max_queue_len = max(100, 3 * batch_size)
        self.master_messager = MasterMessager(ip, port, authkey, max_queue_len, max_queue_len)

        advisor_kwargs = advisor_kwargs or {}
        if parallel_strategy == 'sync':
            self.config_advisor = SyncBatchAdvisor(config_space,
                                                   num_objs=num_objs,
                                                   num_constraints=num_constraints,
                                                   batch_size=batch_size,
                                                   batch_strategy=batch_strategy,
                                                   initial_trials=initial_runs,
                                                   initial_configurations=initial_configurations,
                                                   init_strategy=init_strategy,
                                                   history_bo_data=history_bo_data,
                                                   optimization_strategy=sample_strategy,
                                                   surrogate_type=surrogate_type,
                                                   acq_type=acq_type,
                                                   acq_optimizer_type=acq_optimizer_type,
                                                   ref_point=ref_point,
                                                   task_id=task_id,
                                                   output_dir=logging_dir,
                                                   random_state=random_state,
                                                   **advisor_kwargs)
        elif parallel_strategy == 'async':
            self.config_advisor = AsyncBatchAdvisor(config_space,
                                                    num_objs=num_objs,
                                                    num_constraints=num_constraints,
                                                    batch_size=batch_size,
                                                    batch_strategy=batch_strategy,
                                                    initial_trials=initial_runs,
                                                    initial_configurations=initial_configurations,
                                                    init_strategy=init_strategy,
                                                    history_bo_data=history_bo_data,
                                                    optimization_strategy=sample_strategy,
                                                    surrogate_type=surrogate_type,
                                                    acq_type=acq_type,
                                                    acq_optimizer_type=acq_optimizer_type,
                                                    ref_point=ref_point,
                                                    task_id=task_id,
                                                    output_dir=logging_dir,
                                                    random_state=random_state,
                                                    **advisor_kwargs)
        else:
            raise ValueError('Invalid parallel strategy - %s.' % parallel_strategy)

    def async_run(self):
        config_num = 0
        result_num = 0
        while result_num < self.max_iterations:
            # Add jobs to masterQueue.
            while len(self.config_advisor.running_configs) < self.batch_size \
                    and config_num < self.max_iterations:
                config_num += 1
                config = self.config_advisor.get_suggestion()
                msg = [config, self.time_limit_per_trial]
                self.logger.info("Master: Add config %d." % config_num)
                self.master_messager.send_message(msg)

            # Get results from workerQueue.
            while True:
                observation = self.master_messager.receive_message()
                if observation is None:
                    # Wait for workers.
                    # self.logger.info("Master: wait for worker results. sleep 1s.")
                    time.sleep(1)
                    break
                # Report result.
                result_num += 1
                if observation.objs is None:
                    observation = Observation(
                        config=observation.config,
                        objs=self.FAILED_PERF,
                        constraints=observation.constraints,
                        trial_state=observation.trial_state,
                        elapsed_time=observation.elapsed_time,
                    )
                self.config_advisor.update_observation(observation)
                self.logger.info('Master: Get %d observation: %s' % (result_num, str(observation)))

    def sync_run(self):
        batch_num = (self.max_iterations + self.batch_size - 1) // self.batch_size
        if self.batch_size > self.config_advisor.init_num:
            batch_num += 1  # fix bug
        batch_id = 0
        while batch_id < batch_num:
            configs = self.config_advisor.get_suggestions()
            # Add batch configs to masterQueue.
            for config in configs:
                msg = [config, self.time_limit_per_trial]
                self.master_messager.send_message(msg)
            self.logger.info('Master: %d-th batch. %d configs sent.' % (batch_id, len(configs)))

            # Get batch results from workerQueue.
            result_num = 0
            result_needed = len(configs)
            while True:
                observation = self.master_messager.receive_message()
                if observation is None:
                    # Wait for workers.
                    # self.logger.info("Master: wait for worker results. sleep 1s.")
                    time.sleep(1)
                    continue
                # Report result.
                result_num += 1
                if observation.objs is None:
                    observation = Observation(
                        config=observation.config,
                        objs=self.FAILED_PERF,
                        constraints=observation.constraints,
                        trial_state=observation.trial_state,
                        elapsed_time=observation.elapsed_time,
                    )
                self.config_advisor.update_observation(observation)
                self.logger.info('Master: In the %d-th batch [%d], observation is: %s'
                                 % (batch_id, result_num, str(observation)))
                if result_num == result_needed:
                    break
            batch_id += 1

    def run(self):
        if self.parallel_strategy == 'async':
            self.async_run()
        else:
            self.sync_run()
        return self.get_history()
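
# Example: a minimal sketch of driving the master side with the advisor-based mqSMBO above,
# assuming the ConfigSpace package for the search space and worker processes that connect to
# the same ip/port/authkey to evaluate the suggested configurations. Passing
# objective_function=None on the master side and the hyperparameter ranges are illustrative
# assumptions.
#
#     from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter
#
#     cs = ConfigurationSpace()
#     cs.add_hyperparameters([UniformFloatHyperparameter('x1', -5.0, 10.0),
#                             UniformFloatHyperparameter('x2', 0.0, 15.0)])
#
#     optimizer = mqSMBO(objective_function=None, config_space=cs,
#                        parallel_strategy='async', batch_size=4, max_runs=50,
#                        ip='', port=13579, authkey=b'abc')
#     history = optimizer.run()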