def _init_next_rung(self):
    """Init next rung to search.

    Carries every config over to the next rung, then replaces the
    worst-scoring workers by the best-scoring ones: the checkpoint folder
    of the better worker is copied over the worse worker's folder, the
    better worker's configs are copied, and the copied next-rung policy is
    passed through ``self.explore``. Finally the score board is reset for
    the new rung and ``self.rung_id`` is advanced.

    If the next rung would be past ``self.total_rungs`` only ``self.rung_id``
    is advanced.
    """
    import copy  # local import keeps this block self-contained

    next_rung_id = self.rung_id + 1
    if next_rung_id >= self.total_rungs:
        self.rung_id = next_rung_id
        return

    for i in range(self.config_count):
        self.all_config_dict[i][next_rung_id] = self.all_config_dict[i][self.rung_id]

    # Ascending by score: worst workers first, best workers last.
    current_score = [(i, self.best_score_dict[self.rung_id][i])
                     for i in range(self.config_count)]
    current_score.sort(key=lambda item: item[1])

    # Replace at most 4 workers, and never more than half of the population,
    # so that a worker is never paired with itself (which would delete its
    # own checkpoint folder right before copying from it).
    replace_count = min(4, self.config_count // 2)
    for i in range(replace_count):
        better_id = current_score[self.config_count - 1 - i][0]
        worse_id = current_score[i][0]
        better_worker_result_path = FileOps.join_path(
            self.local_base_path, 'cache', 'pba', str(better_id), 'checkpoint')
        FileOps.make_dir(better_worker_result_path)
        worse_worker_result_path = FileOps.join_path(
            self.local_base_path, 'cache', 'pba', str(worse_id), 'checkpoint')
        FileOps.make_dir(worse_worker_result_path)
        # copytree needs a non-existing destination, hence the rmtree.
        shutil.rmtree(worse_worker_result_path)
        shutil.copytree(better_worker_result_path, worse_worker_result_path)
        # Deep copy: sharing one object between both ids would make the
        # explored policy written below overwrite the better worker's
        # policy as well.
        self.all_config_dict[worse_id] = copy.deepcopy(self.all_config_dict[better_id])
        policy_unchange = self.all_config_dict[worse_id][next_rung_id]
        policy_changed = self.explore(policy_unchange)
        self.all_config_dict[worse_id][next_rung_id] = policy_changed

    for config_id in range(self.config_count):
        self.best_score_dict[next_rung_id][config_id] = -1 * float('inf')
        tmp_row_data = {'config_id': config_id,
                        'rung_id': next_rung_id,
                        'status': StatusType.WAITTING}
        self._add_to_board(tmp_row_data)
    self.rung_id = next_rung_id
def search(self):
    """Search one mutated model.

    Picks a random code from ``pareto_front.csv``, applies
    ``self.num_mutate`` effective mutations to it and stores the result in
    the custom search-space description.

    :return: current number of samples, and the model
    """
    desc = self.search_space.search_space.custom
    space_cfg = self.search_space.cfg

    # Default folder, optionally overridden by the ``pareto_folder`` config item.
    front_dir = FileOps.join_path(self.local_base_path, "result")
    if 'pareto_folder' in space_cfg and space_cfg.pareto_folder is not None:
        front_dir = space_cfg.pareto_folder.replace(
            "{local_base_path}", self.local_base_path)
    front_df = pd.read_csv(FileOps.join_path(front_dir, "pareto_front.csv"))

    parent_code = random.choice(front_df['Code'])
    child_code = parent_code
    candidate_total = len(desc["candidates"])

    # Only mutations that actually change the code are counted.
    applied = 0
    while applied < self.num_mutate:
        attempt = self.mutate_once(child_code, candidate_total)
        if attempt != child_code:
            applied += 1
            child_code = attempt

    logging.info("Mutate from {} to {}".format(parent_code, child_code))
    desc['code'] = child_code
    desc['method'] = "mutate"
    self.codec.decode(desc)
    self.sample_count += 1
    return self.sample_count, NetworkDesc(self.search_space.search_space)
def _save_checkpoint(self, epoch):
    """Save checkpoint.

    Pickles the trainer's model and writes a checkpoint holding the epoch
    and the state dicts of the model, optimizer and lr scheduler. Both file
    paths are recorded on the trainer afterwards.

    :param epoch: epoch number stored in the checkpoint
    """
    trainer = self.trainer

    logging.debug("Start Save Checkpoint, file_name=%s", trainer.checkpoint_file_name)
    ckpt_path = FileOps.join_path(
        trainer.get_local_worker_path(), trainer.checkpoint_file_name)
    logging.debug("Start Save Model, model_file=%s", trainer.model_pickle_file_name)
    pickle_path = FileOps.join_path(
        trainer.get_local_worker_path(), trainer.model_pickle_file_name)

    # Whole model object, pickled.
    with open(pickle_path, 'wb') as stream:
        pickle.dump(trainer.model, stream, protocol=pickle.HIGHEST_PROTOCOL)

    # Training state needed to resume.
    state = dict(
        epoch=epoch,
        weight=trainer.model.state_dict(),
        optimizer=trainer.optimizer.state_dict(),
        lr_scheduler=trainer.lr_scheduler.state_dict(),
    )
    torch.save(state, ckpt_path)

    trainer.checkpoint_file = ckpt_path
    trainer.model_path = pickle_path
def _save_best_model(self):
    """Save the model weights as ``best_model.pth`` and back up the worker folder.

    The state dict of ``self.model`` is written below the trainer's local
    worker path. When the trainer has a backup base path, the whole local
    worker folder is copied to ``<backup_base_path>/workers/<worker_id>``.
    """
    trainer = self.trainer
    target = FileOps.join_path(
        trainer.get_local_worker_path(), trainer.step_name, "best_model.pth")
    FileOps.make_base_dir(target)
    torch.save(self.model.state_dict(), target)

    if trainer.backup_base_path is None:
        return
    backup_dir = FileOps.join_path(
        trainer.backup_base_path, "workers", str(trainer.worker_id))
    FileOps.copy_folder(trainer.get_local_worker_path(), backup_dir)
def __init__(self):
    """Init HpoBase.

    Sets up the empty hps cache and the paths of the files kept in the
    step output folder (best hps, cache and score board).
    """
    super(HpoGenerator, self).__init__(self.cfg)
    self.hpo = None
    self.policy = self.cfg.get('policy')
    self._hps_cache = {}

    step_dir = FileOps.join_path(self.local_output_path, self.cfg.step_name)
    output_files = (
        ('_best_hps_file', 'best_hps.json'),
        ('_cache_file', 'cache.csv'),
        ('_board_file', 'score_board.csv'),
    )
    for attr_name, file_name in output_files:
        setattr(self, attr_name, FileOps.join_path(step_dir, file_name))
def _copy_needed_file(self):
    """Copy the configured pareto front and random files into the step output folder.

    The config items ``pareto_front_file`` and ``random_file`` may contain
    the placeholder ``{local_base_path}``, which is replaced by
    ``self.local_base_path``. The files are copied to ``pareto_front.csv``
    and ``random.csv`` below ``<local_output_path>/<step_name>``, and the
    destination paths are stored in ``self.pareto_front_file`` and
    ``self.random_file``.

    :raises FileNotFoundError: if one of the two config items is missing or None
    """
    if "pareto_front_file" not in self.cfg or self.cfg.pareto_front_file is None:
        # The message names the key that is actually looked up.
        raise FileNotFoundError("Config item pareto_front_file not found in config file.")
    init_pareto_front_file = self.cfg.pareto_front_file.replace(
        "{local_base_path}", self.local_base_path)
    self.pareto_front_file = FileOps.join_path(
        self.local_output_path, self.cfg.step_name, "pareto_front.csv")
    FileOps.make_base_dir(self.pareto_front_file)
    FileOps.copy_file(init_pareto_front_file, self.pareto_front_file)

    if "random_file" not in self.cfg or self.cfg.random_file is None:
        raise FileNotFoundError("Config item random_file not found in config file.")
    init_random_file = self.cfg.random_file.replace(
        "{local_base_path}", self.local_base_path)
    # Same folder as pareto_front.csv, which was created above.
    self.random_file = FileOps.join_path(
        self.local_output_path, self.cfg.step_name, "random.csv")
    FileOps.copy_file(init_random_file, self.random_file)
def save_metrics_value(self):
    """Save the metric value of the trained model.

    Appends one row (encoding, flops, parameters, metric) to
    ``performace.csv`` in the step output folder; the header is written
    only when the file does not exist yet. When the trainer has a backup
    base path, the local output folder is copied there afterwards.

    This method returns nothing.
    """
    trainer = self.trainer
    pd_path = FileOps.join_path(
        trainer.local_output_path, trainer.step_name, "performace.csv")
    FileOps.make_base_dir(pd_path)

    row = [
        self.model.nbit_w_list + self.model.nbit_a_list,
        self.flops_count,
        self.params_count,
        self.metric,
    ]
    header = ["encoding", "flops", "parameters", self.cfg.get("valid_metric", "acc")]
    df = pd.DataFrame([row], columns=header)

    # New file: write with header; existing file: append the bare row.
    already_there = os.path.exists(pd_path)
    with open(pd_path, "a" if already_there else "w") as file:
        df.to_csv(file, index=False, header=not already_there)

    if trainer.backup_base_path is not None:
        FileOps.copy_folder(trainer.local_output_path, trainer.backup_base_path)
def _get_performance(self, step_name, worker_id):
    """Read performance values from the worker's performance.txt.

    Each non-blank line is parsed as JSON; when a line holds a list only
    its first element is kept.

    :param step_name: step name in the pipeline.
    :type step_name: str.
    :param worker_id: the worker's worker id.
    :type worker_id: str.
    :return: performance values, one per non-blank line; empty list if the
        file does not exist
    :rtype: list
    """
    _file = FileOps.join_path(
        self.get_local_worker_path(step_name, worker_id), "performance.txt")
    if not os.path.isfile(_file):
        logger.info("Performance file is not exited, file={}".format(_file))
        return []

    performance = []
    with open(_file, 'r') as f:
        for raw in f:
            text = raw.strip()
            if not text:
                continue
            value = json.loads(text)
            performance.append(value[0] if isinstance(value, list) else value)
    logger.info("performance={}".format(performance))
    return performance
def __init__(self, **kwargs):
    """Construct the Imagenet class.

    Initialises the ``Dataset`` base with ``kwargs`` and then the
    ``ImageFolder`` base on the ``train`` or ``val`` sub folder of the
    configured data path.
    """
    Dataset.__init__(self, **kwargs)
    if self.train:
        subset = 'train'
    else:
        subset = 'val'
    ImageFolder.__init__(
        self,
        root=FileOps.join_path(self.args.data_path, subset),
        transform=Compose(self.transforms.__transform__),
    )
def __init__(self, search_space):
    """Init PruneEA.

    Copies the search settings from the policy, creates the codec, resets
    the counters and prepares the output paths below the step folder.

    :param search_space: search space handed to the base class and the codec
    """
    super(PruneEA, self).__init__(search_space)

    # Settings taken from the policy.
    policy = self.policy
    self.length = policy.length
    self.num_individual = policy.num_individual
    self.num_generation = policy.num_generation
    self.random_models = policy.random_models

    # Columns used as the two objectives.
    self.x_axis = 'flops'
    self.y_axis = 'acc'

    self.codec = Codec(self.cfg.codec, search_space)

    # Counters.
    self.random_count = 0
    self.ea_count = 0
    self.ea_epoch = 0

    # Output locations.
    step_dir = FileOps.join_path(self.local_output_path, self.cfg.step_name)
    self.step_path = step_dir
    self.pd_file_name = FileOps.join_path(step_dir, "performance.csv")
    self.pareto_front_file = FileOps.join_path(step_dir, "pareto_front.csv")
    self.pd_path = FileOps.join_path(step_dir, "pareto_front")
    FileOps.make_dir(self.pd_path)
def get_pareto_list_size(self):
    """Get the number of entries in the pareto list.

    Reads ``<local_base_path>/result/pareto_front.csv`` when it exists.

    :return: number of rows in the pareto front file, 0 if the file is missing
    :rtype: int
    """
    pareto_file_locate = FileOps.join_path(
        self.local_base_path, "result", "pareto_front.csv")
    if not os.path.exists(pareto_file_locate):
        return 0
    pareto_front_df = pd.read_csv(pareto_file_locate)
    # ``DataFrame.size`` would be rows * columns; the list size is the row count.
    return len(pareto_front_df)
def save_backup(self, performance):
    """Save checkpoints and performance file to backup path.

    Does nothing when ``self.backup_base_path`` is None. Otherwise the
    performance is written to ``performance.txt`` in the local worker
    folder and that folder is copied to the backup location.

    :param performance: validated performance
    :type performance: float, list or dict
    """
    import json  # local import keeps this block self-contained

    if self.backup_base_path is None:
        return
    pfm_file = os.path.join(self.get_local_worker_path(), 'performance.txt')
    # The file is parsed line by line with ``json.loads`` by the
    # ``_get_performance`` readers, so write JSON. Plain formatting would
    # break for dicts (single quotes), booleans, None and non-finite floats.
    try:
        serialized = json.dumps(performance)
    except (TypeError, ValueError):
        # Not JSON serializable: keep the previous plain-text behaviour.
        serialized = "{}".format(performance)
    with open(pfm_file, 'w') as f:
        f.write(serialized)
    backup_worker_path = FileOps.join_path(self.backup_base_path, self.get_worker_subpath())
    FileOps.copy_folder(self.get_local_worker_path(), backup_worker_path)
def update(self, record):
    """Update current performance into hpo score board.

    After the base-class update, the worker's result folder is mirrored
    into ``<local_base_path>/cache/pba/<worker_id>/checkpoint``, replacing
    whatever was there before.

    :param record: record providing ``worker_id`` and ``step_name`` via ``get``
    """
    super().update(record)
    config_id = str(record.get('worker_id'))
    source_dir = self.get_local_worker_path(record.get('step_name'), config_id)
    cache_dir = FileOps.join_path(
        self.local_base_path, 'cache', 'pba', config_id, 'checkpoint')
    for folder in (source_dir, cache_dir):
        FileOps.make_dir(folder)
    # copytree needs a non-existing destination.
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    shutil.copytree(source_dir, cache_dir)
def _save_model_desc(self):
    """Save final model desc of NAS.

    For every encoding listed in the step's ``pareto_front.csv`` a model
    description is built from the decoded encoding and handed to the
    trainer. Nothing happens when the file does not exist.
    """
    pf_file = FileOps.join_path(
        self.trainer.local_output_path, self.trainer.step_name, "pareto_front.csv")
    if not FileOps.exists(pf_file):
        return
    with open(pf_file, "r") as file:
        encodings = pd.read_csv(file)["encoding"].tolist()

    search_space = SearchSpace()
    codec = QuantCodec('QuantCodec', search_space)
    for index, text in enumerate(encodings):
        # The encoding is stored as text like "[a, b, c]": drop the
        # brackets and parse the integers.
        code = list(map(int, text[1:-1].split(',')))
        model_desc = Config()
        model_desc.modules = search_space.search_space.modules
        model_desc.backbone = codec.decode(code)._desc.backbone
        self.trainer.output_model_desc(index, model_desc)
def get_pareto_front(self):
    """Get pareto front from the performance result file.

    Selects up to ``self.num_individual`` rows via
    ``SortAndSelectPopulation`` on the (x_axis, 1 - y_axis) values. Whenever
    ``self.ea_count`` is a multiple of ``self.num_individual`` the selection
    is also written to ``<ea_epoch>_epoch.csv`` and to the pareto front
    file, and ``self.ea_epoch`` is incremented.

    :return: the selected rows
    :rtype: pandas.DataFrame
    """
    with open(self.pd_file_name, "r") as file:
        df = pd.read_csv(file)
    fitness = df[[self.x_axis, self.y_axis]].values.transpose()
    # acc2error
    fitness[1, :] = 1 - fitness[1, :]
    _, _, selected = SortAndSelectPopulation(fitness, self.num_individual)
    result = df.loc[selected, :]

    if self.ea_count % self.num_individual != 0:
        return result

    epoch_file = FileOps.join_path(
        self.pd_path, "{}_epoch.csv".format(str(self.ea_epoch)))
    for target in (epoch_file, self.pareto_front_file):
        with open(target, "w") as file:
            result.to_csv(file, index=False)
    self.ea_epoch += 1
    return result
def _get_performance(self, step_name, worker_id):
    """Read performance values from the worker's performance.txt.

    Each non-blank line is parsed as JSON; when a line holds a list only
    its first element is kept.

    :param step_name: step name in the pipeline
    :param worker_id: worker id of the worker
    :return: performance values, one per non-blank line; empty list if the
        file does not exist
    :rtype: list
    """
    performance_file = FileOps.join_path(
        self.get_local_worker_path(step_name, worker_id), "performance.txt")
    if not os.path.isfile(performance_file):
        logging.info("Performance file is not exited, file={}".format(
            performance_file))
        return []

    with open(performance_file, 'r') as f:
        stripped = [raw.strip() for raw in f.readlines()]
    performance = []
    for text in stripped:
        if text == "":
            continue
        entry = json.loads(text)
        if isinstance(entry, list):
            entry = entry[0]
        performance.append(entry)
    logging.info("performance={}".format(performance))
    return performance
def search(self):
    """Search an id and hps from hpo.

    :return: None when the hpo proposes nothing, otherwise a dict with
        ``worker_id``, ``desc`` (the hyper parameters) and ``info`` (the
        rung id)
    :rtype: dict or None
    """
    proposal = self.hpo.propose()
    if proposal is None:
        return None
    proposal = copy.deepcopy(proposal)

    worker_id = proposal.get('config_id')
    policy = proposal.get('configs')
    rung = proposal.get('rung_id')
    desc = {
        'dataset.transforms': [{'type': 'PBATransformer',
                                'para_array': policy,
                                'operation_names': self.operation_names}],
    }

    ckpt_dir = FileOps.join_path(
        self.local_base_path, 'cache', 'pba', str(worker_id), 'checkpoint')
    FileOps.make_dir(ckpt_dir)
    if os.path.exists(ckpt_dir):
        desc['trainer.checkpoint_path'] = ckpt_dir
    if 'epoch' in proposal:
        desc['trainer.epochs'] = proposal.get('epoch')
    return {'worker_id': worker_id, 'desc': desc, 'info': rung}
def update_performance(self, hps, performance):
    """Update current performance into hpo score board.

    The first performance value is added as score; when ``performance`` is
    not a non-empty list, -1 is added instead and an error is logged.
    Afterwards the worker's result folder is mirrored into
    ``<local_base_path>/cache/pba/<config_id>/checkpoint``.

    :param hps: hyper parameters need to update
    :param performance: trainer performance
    """
    has_score = isinstance(performance, list) and len(performance) > 0
    score = performance[0] if has_score else -1
    self.hpo.add_score(int(hps.get('config_id')), int(hps.get('rung_id')), score)
    if not has_score:
        logging.error("hpo get empty performance!")

    worker_key = str(hps.get('config_id'))
    source_dir = self.get_local_worker_path(self.cfg.step_name, worker_key)
    cache_dir = FileOps.join_path(
        self.local_base_path, 'cache', 'pba', worker_key, 'checkpoint')
    for folder in (source_dir, cache_dir):
        FileOps.make_dir(folder)
    # copytree needs a non-existing destination.
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    shutil.copytree(source_dir, cache_dir)
def _new_model_init(self, model_prune):
    """Init new model.

    When the configured init model file contains a ``:``, it is first
    copied into the local worker folder and the config is pointed at the
    local copy. The model is then built from a copy of the base network
    description with the base channel settings and the encoding of
    ``model_prune``.

    :param model_prune: searched pruned model
    :type model_prune: torch.nn.Module
    :return: model built from the base description
    :rtype: torch.nn.Module
    """
    source_file = self.config.init_model_file
    if ":" in source_file:
        local_copy = FileOps.join_path(
            self.trainer.get_local_worker_path(), os.path.basename(source_file))
        FileOps.copy_file(source_file, local_copy)
        self.config.init_model_file = local_copy

    desc = copy.deepcopy(self.base_net_desc)
    desc.backbone.chn = desc.backbone.base_chn
    desc.backbone.chn_node = desc.backbone.base_chn_node
    desc.backbone.encoding = model_prune.encoding
    return NetworkDesc(desc).to_model()
def __init__(self, search_space=None):
    """Init SpNas.

    Reads the search settings from the config. When the config contains
    ``last_search_result``, the previous search results are loaded and the
    selected architecture becomes the initial code of this search.

    :param search_space: search space handed to the base class and the codec
    """
    super(SpNas, self).__init__(search_space)
    self.search_space = search_space
    self.codec = Codec(self.cfg.codec, search_space)
    self.sample_level = self.cfg.sample_level
    self.max_sample = self.cfg.max_sample
    self.max_optimal = self.cfg.max_optimal
    self._total_list_name = self.cfg.total_list
    self.serial_settings = self.cfg.serial_settings
    self._total_list = ListDict()
    self.sample_count = 0
    self.init_code = None

    step_output = FileOps.join_path(self.local_output_path, self.cfg.step_name)
    if 'last_search_result' in self.cfg:
        result_name = self.cfg.last_search_result
        assert FileOps.exists(os.path.join(step_output, result_name)), \
            "Not found serial results!"
        # NOTE(review): existence is checked below the step folder, but the
        # file is loaded from local_output_path directly -- confirm intended.
        previous = ListDict.load_csv(
            os.path.join(self.local_output_path, result_name))
        pre_worker_id, pre_arch = self.select_from_remote(self.max_optimal, previous)

        # re-write config template
        if self.cfg.regnition:
            self.codec.config_template['model']['backbone']['reignition'] = True
            weight_name = pre_arch + '_imagenet.pth'
            assert FileOps.exists(os.path.join(step_output, weight_name)), \
                "Not found {} pretrained .pth file!".format(pre_arch)
            self.codec.config_template['model']['pretrained'] = os.path.join(
                self.local_output_path, weight_name)
            pre_worker_id = -1

        # update config template
        self.init_code = dict(arch=pre_arch,
                              pre_arch=pre_arch.split('_')[1],
                              pre_worker_id=pre_worker_id)
    logging.info("inited SpNas {}-level search...".format(self.sample_level))
def update(self, step_name, worker_id):
    """Update hpo score into score board.

    Reads the worker's performance and, when the worker id is known in the
    hps cache, stores it there, updates the hpo, saves the cache, score
    board and best hps, and optionally backs up the output folder. An
    unknown worker id is only logged as an error.

    :param step_name: step name in pipeline
    :param worker_id: worker id of worker
    """
    worker_id = str(worker_id)
    performance = self._get_performance(step_name, worker_id)
    if worker_id not in self._hps_cache:
        logger.error("worker_id not in hps_cache.")
        return

    hps = self._hps_cache[worker_id][0]
    self._hps_cache[worker_id][1] = copy.deepcopy(performance)
    logging.info("get hps need to update, worker_id=%s, hps=%s", worker_id, str(hps))
    self.update_performance(hps, performance)
    logging.info("hpo_id=%s, hps=%s, performance=%s",
                 worker_id, str(hps), str(performance))

    self._save_hpo_cache()
    self._save_score_board()
    self._save_best()

    backup_wanted = self.need_backup and self.backup_base_path is not None
    if backup_wanted:
        FileOps.copy_folder(
            self.local_output_path,
            FileOps.join_path(self.backup_base_path, self.output_subpath))
    logger.info("Hpo update finished.")
def sample(self):
    """Sample an id and hps from hpo.

    The proposed sample is remembered in the hps cache under its id,
    together with an empty performance list.

    :return: id, hps; ``(None, None)`` when the hpo proposes nothing
    :rtype: int, dict
    """
    proposal = self.hpo.propose()
    if proposal is None:
        return None, None

    proposal = copy.deepcopy(proposal)
    sample_id = proposal.get('config_id')
    self._hps_cache[str(sample_id)] = [copy.deepcopy(proposal), []]

    hps = {
        'dataset.transforms': [{'type': 'PBATransformer',
                                'para_array': proposal.get('configs'),
                                'operation_names': self.operation_names}],
    }
    ckpt_dir = FileOps.join_path(
        self.local_base_path, 'cache', 'pba', str(sample_id), 'checkpoint')
    FileOps.make_dir(ckpt_dir)
    if os.path.exists(ckpt_dir):
        hps['trainer.checkpoint_path'] = ckpt_dir
    if 'epoch' in proposal:
        hps['trainer.epochs'] = proposal.get('epoch')
    return sample_id, hps
def _load_tf_model(self, model_prune, model_init, chn_node_mask):
    """Load tensorflow pretrained model.

    Restores the graph and weights of ``self.config.init_model_file``,
    multiplies every variable (except the ``Momentum`` ones) by a 0/1 mask
    built from the channel masks, and saves the result as ``prune_model``
    in the local worker folder.

    :param model_prune: searched pruned model, provides ``chn_mask``
    :param model_init: returned unchanged
    :param chn_node_mask: channel masks indexed by layer
    :return: ``model_init``
    """

    def _zero_positions(keep_mask):
        # Indices at which the given 0/1 mask is zero.
        return np.squeeze(
            np.argwhere(np.asarray(np.ones(keep_mask.shape) - keep_mask)))

    with tf.Session(config=self.trainer._init_session_config()) as sess:
        saver = tf.train.import_meta_graph("{}.meta".format(
            self.config.init_model_file))
        saver.restore(sess, self.config.init_model_file)

        def _apply(variable, keep):
            # Multiply the variable by the 0/1 mask, in place in the session.
            sess.run(tf.assign(
                variable, variable * tf.constant(keep, dtype=variable.dtype)))

        chn_mask = model_prune.chn_mask
        start_mask = []
        end_mask = []
        variables = [
            v for v in tf.get_collection(tf.GraphKeys.VARIABLES)
            if not v.name.endswith('Momentum:0')
        ]
        for var in variables:
            name = var.name
            if name.startswith('conv_1'):
                end_mask = np.asarray(chn_node_mask[0])
                out_idx = _zero_positions(end_mask)
                keep = np.ones(var.get_shape())
                keep[:, :, :, out_idx.tolist()] = 0
                _apply(var, keep)
            elif name.startswith('bn_1'):
                out_idx = _zero_positions(end_mask)
                keep = np.ones(var.get_shape())
                keep[out_idx.tolist()] = 0
                _apply(var, keep)
            elif name.startswith('dense/kernel'):
                out_idx = _zero_positions(end_mask)
                keep = np.ones(var.get_shape())
                keep[out_idx.tolist(), :] = 0
                _apply(var, keep)
            elif name.startswith('layer'):
                parts = list(name.split('/'))
                # Last character of the first two name parts.
                layer_idx = parts[0][-1]
                block_idx = parts[1][-1]
                operation = parts[2]
                if operation.startswith('conv'):
                    if operation == 'conv_1':
                        start_mask = chn_node_mask[int(layer_idx) - 1]
                        end_mask = chn_mask[int(block_idx)]
                    elif operation == 'conv_2':
                        start_mask = end_mask
                        end_mask = chn_node_mask[int(layer_idx)]
                    elif operation == 'conv_3':
                        # shortcut
                        start_mask = chn_node_mask[int(layer_idx) - 1]
                        end_mask = chn_node_mask[int(layer_idx)]
                    start_mask = np.asarray(start_mask)
                    end_mask = np.asarray(end_mask)
                    in_idx = _zero_positions(start_mask)
                    out_idx = _zero_positions(end_mask)
                    keep = np.ones(var.get_shape())
                    keep[:, :, in_idx.tolist(), :] = 0
                    keep[:, :, :, out_idx.tolist()] = 0
                    _apply(var, keep)
                elif operation.startswith('bn'):
                    out_idx = _zero_positions(end_mask)
                    keep = np.ones(var.get_shape())
                    keep[out_idx.tolist()] = 0
                    _apply(var, keep)

        save_file = FileOps.join_path(
            self.trainer.get_local_worker_path(), 'prune_model')
        saver.save(sess, save_file)
    return model_init