def save_results(self):
    """Save the results of evolution, including the population and elitism information."""
    step_name = Config(deepcopy(UserConfig().data)).general.step_name
    _path = FileOps.join_path(self.local_output_path, step_name)
    FileOps.make_dir(_path)
    arch_file = FileOps.join_path(_path, 'arch.txt')
    arch_child = FileOps.join_path(_path, 'arch_child.txt')
    sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
    sel_arch = []
    with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
        writer_a = csv.writer(fw_a, lineterminator='\n')
        writer_ac = csv.writer(fw_ac, lineterminator='\n')
        writer_ac.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.individual_num):
            writer_ac.writerow(self._log_data(net_info_type='active_only',
                                              pop=self.pop[c],
                                              value=self.pop[c].fitness))
        writer_a.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.elitism_num):
            writer_a.writerow(self._log_data(net_info_type='active_only',
                                             pop=self.elitism[c],
                                             value=self.elit_fitness[c]))
            sel_arch.append(self.elitism[c].gene)
    sel_arch = np.stack(sel_arch)
    np.save(sel_arch_file, sel_arch)
    if self.backup_base_path is not None:
        FileOps.copy_folder(self.local_output_path, self.backup_base_path)
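# --- Hedged round-trip sketch (not part of the original code) ---
# save_results() above stacks the elitism genes with np.stack and writes them via
# np.save; the array can be restored with np.load. The gene shape here is invented
# purely for illustration.
import numpy as np

genes = [np.random.randint(0, 2, size=10) for _ in range(4)]   # stand-ins for elitism genes
sel_arch = np.stack(genes)                                     # shape (4, 10)
np.save('selected_arch.npy', sel_arch)
restored = np.load('selected_arch.npy')
assert (restored == sel_arch).all()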
def _save_performance(self, performance, model_desc=None):
    """Save the result of the model and update the pareto front.

    :param performance: dict that contains all the needed results
    :param model_desc: config of the model
    """
    performance_str = json.dumps(performance, indent=4, sort_keys=True)
    self.trainer._save_performance(performance_str)
    method = model_desc.method
    code = model_desc.code
    metric_method = self.cfg.metric.method
    FileOps.make_dir(self.result_path)
    result_file_name = FileOps.join_path(self.result_path, "{}.csv".format(method))
    header = "Code,GFlops,KParams,{0},Best {0},Worker_id\n".format(metric_method)
    if not os.path.exists(result_file_name):
        with open(result_file_name, 'w') as file:
            file.write(header)
    with open(result_file_name, 'a') as file:
        file.write('{},{},{},{},{},{}\n'.format(
            code, performance['gflops'], performance['kparams'],
            performance["cur_valid_perf"], performance["best_valid_perf"],
            self.trainer.worker_id))
    logging.info("Model result saved to {}".format(result_file_name))
    self._save_pareto_front("GFlops", "Best {}".format(metric_method))
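# --- Hedged sketch (not part of the original code) ---
# Shape of one row appended to "<method>.csv" by _save_performance() above. The
# 'performance' keys come from the function body; the metric name, code and values
# are invented for illustration.
performance = {
    "gflops": 0.42,
    "kparams": 310.5,
    "cur_valid_perf": 27.91,
    "best_valid_perf": 28.03,
}
metric_method = "PSNR"                                         # example metric name
header = "Code,GFlops,KParams,{0},Best {0},Worker_id\n".format(metric_method)
row = '{},{},{},{},{},{}\n'.format(
    "012_1_2", performance['gflops'], performance['kparams'],
    performance["cur_valid_perf"], performance["best_valid_perf"], 7)
print(header + row)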
def dump_model_visual_info(trainer, epoch, model, inputs):
    """Dump the model graph to tensorboard event files.

    :param trainer: trainer.
    :type trainer: object inherited from DistributedWorker.
    :param epoch: current epoch.
    :type epoch: int.
    :param model: model.
    :type model: model.
    :param inputs: input data.
    :type inputs: data.
    """
    (_, visual, interval, title, worker_id, output_path) = _get_trainer_info(trainer)
    if visual is not True:
        return
    if epoch % interval != 0:
        return
    title = str(worker_id)
    _path = FileOps.join_path(output_path, title)
    FileOps.make_dir(_path)
    try:
        with SummaryWriter(_path) as writer:
            writer.add_graph(model, (inputs,))
    except Exception as e:
        logging.error(
            "Failed to dump model visual info, worker id: {}, epoch: {}, error: {}".format(
                worker_id, epoch, str(e)))
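# --- Hedged standalone sketch (not part of the original code) ---
# The same SummaryWriter / add_graph pattern used by dump_model_visual_info() above,
# applied to a toy model. Requires torch with tensorboard support; the event
# directory name stands in for the per-worker path built from worker_id.
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())
inputs = torch.randn(1, 3, 32, 32)
with SummaryWriter('./tb_events/0') as writer:
    writer.add_graph(model, (inputs,))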
def _save_checkpoint(self, epoch, best=False):
    """Save model weights.

    :param epoch: current epoch
    :type epoch: int
    :param best: whether to also save the weights as the best model so far
    :type best: bool
    """
    save_dir = os.path.join(self.worker_path, str(epoch))
    FileOps.make_dir(save_dir)
    for name in self.model.model_names:
        if isinstance(name, str):
            save_filename = '%s_net_%s.pth' % (epoch, name)
            save_path = FileOps.join_path(save_dir, save_filename)
            net = getattr(self.model, 'net' + name)
            best_file = FileOps.join_path(self.worker_path, "model_{}.pth".format(name))
            if self.cfg.cuda and torch.cuda.is_available():
                # torch.save(net.module.cpu().state_dict(), save_path)
                torch.save(net.module.state_dict(), save_path)
                # net.cuda()
                if best:
                    torch.save(net.module.state_dict(), best_file)
            else:
                torch.save(net.cpu().state_dict(), save_path)
                if best:
                    torch.save(net.cpu().state_dict(), best_file)
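# --- Hedged illustration (not part of the original code) ---
# File layout produced by _save_checkpoint() above; worker_path, epoch and the model
# names below are hypothetical stand-ins for self.worker_path and self.model.model_names.
import os

worker_path, epoch = '/tmp/worker_0', 5
for name in ['G', 'D']:
    print(os.path.join(worker_path, str(epoch), '%s_net_%s.pth' % (epoch, name)))  # per-epoch weights
    print(os.path.join(worker_path, "model_{}.pth".format(name)))                  # written only when best=True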
def before_train(self, logs=None):
    """Called before the whole training process."""
    self.trainer.config.call_metrics_on_train = False
    self.cfg = self.trainer.config
    self.worker_id = self.trainer.worker_id
    self.local_base_path = self.trainer.local_base_path
    self.local_output_path = self.trainer.local_output_path
    self.result_path = FileOps.join_path(self.trainer.local_base_path, "result")
    FileOps.make_dir(self.result_path)
    self.logger_patch()
def copy_pareto_output(self, step_name=None, worker_ids=[]):
    """Copy files related to pareto from worker to output."""
    taskops = TaskOps()
    local_output_path = os.path.join(taskops.local_output_path, step_name)
    if not (step_name and os.path.exists(local_output_path)):
        return
    for worker_id in worker_ids:
        desDir = os.path.join(local_output_path, str(worker_id))
        FileOps.make_dir(desDir)
        local_worker_path = taskops.get_worker_subpath(step_name, str(worker_id))
        srcDir = FileOps.join_path(taskops.local_base_path, local_worker_path)
        copy_search_file(srcDir, desDir)
def _save_performance(self, results):
    """Save performance into performance.pkl and copy the latest checkpoint to the output dir.

    :param results: performance results
    :type results: dict
    """
    logging.info("performance=%s", str(results))
    performance_dir = os.path.join(self.worker_path, 'performance')
    FileOps.make_dir(performance_dir)
    FileOps.dump_pickle(results, os.path.join(performance_dir, 'performance.pkl'))
    logging.info("performance saved to %s", performance_dir)
    # copy pth to the output dir
    output_dir = os.path.join(self.output_path, str(self._worker_id))
    FileOps.make_dir(output_dir)
    shutil.copy(
        os.path.join(self.worker_path, 'latest.pth'),
        os.path.join(output_dir, results['arch'].split('_')[1] + '.pth'))
    logging.info("Latest checkpoint saved to %s", output_dir)
def before_train(self, logs=None):
    """Called before the training process."""
    self.cfg = self.trainer.cfg
    self.trainer.auto_save_ckpt = False
    self.trainer.auto_save_perf = False
    self.worker_id = self.trainer.worker_id
    self.local_base_path = self.trainer.local_base_path
    self.local_output_path = self.trainer.local_output_path
    self.result_path = FileOps.join_path(self.local_output_path, self.cfg.step_name)
    FileOps.make_dir(self.result_path)
    count_input = torch.FloatTensor(1, 3, 192, 192).cuda()
    flops_count, params_count = calc_model_flops_params(self.trainer.model, count_input)
    GFlops, KParams = flops_count * 1e-9, params_count * 1e-3
    logger.info("Flops: {:.2f} G, Params: {:.1f} K".format(GFlops, KParams))
    if GFlops > 0.6:
        logger.info("Flops too large, skipping training!")
        self.trainer.skip_train = True
    self._copy_needed_file()
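# --- Hedged worked example (not part of the original code) ---
# The unit conversion used in before_train() above: calc_model_flops_params is assumed
# to return raw FLOPs and raw parameter counts; the numbers here are invented.
flops_count, params_count = 4.2e8, 3.1e5
GFlops, KParams = flops_count * 1e-9, params_count * 1e-3
print("Flops: {:.2f} G, Params: {:.1f} K".format(GFlops, KParams))   # Flops: 0.42 G, Params: 310.0 K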
def _output_records(self, step_name, records, desc=True,
                    weights_file=False, performance=False):
    """Dump records to a CSV file and copy the requested worker outputs to the step directory."""
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for record in records:
        record = record.serialize()
        _record = {}
        for key in columns:
            _record[key] = record[key]
        outputs.append(deepcopy(_record))
    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    _file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(_file, index=False)
    except Exception:
        logging.error("Failed to save output file, file={}".format(_file))
    for record in outputs:
        worker_id = record["worker_id"]
        worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
        outputs_globs = []
        if desc:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "desc_*.json"))
        if weights_file:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "model_*.pth"))
        if performance:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "performance_*.json"))
        for _file in outputs_globs:
            FileOps.copy_file(_file, step_path)
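# --- Hedged standalone sketch (not part of the original code) ---
# The records -> pandas.DataFrame -> CSV step used by _output_records() above, with a
# hand-written dict standing in for a serialized record.
import pandas as pd

serialized = [{"worker_id": 3, "performance": {"accuracy": 0.91}, "desc": {"depth": 20}}]
columns = ["worker_id", "performance", "desc"]
data = pd.DataFrame([{key: record[key] for key in columns} for record in serialized])
data.to_csv("output.csv", index=False)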
def build(self, model=None, optimizer=None, loss=None,
          lr_scheduler=None, metrics=None, hps=None,
          callbacks=None, train_loader=None, valid_loader=None,
          make_batch=None, train_step=None, valid_step=None,
          load_ckpt_flag=False,
          checkpoint_file_name="weights.pth",
          model_pickle_file_name="model.pkl",
          performance_file_name="performance.txt"):
    """Build the trainer by assembling the necessary components."""
    # Initialize hyperparameters from parameters or configurations
    self.checkpoint_file_name = checkpoint_file_name
    self.model_pickle_file_name = model_pickle_file_name
    self.performance_file_name = performance_file_name
    self._init_cuda_setting()
    self._init_hps(hps)
    self.do_validation = self.cfg.with_valid
    self.model = self._init_model(model)
    self.load_ckpt_flag = load_ckpt_flag
    if self.load_ckpt_flag:
        self.load_checkpoint()
    else:
        self._load_pretrained_model()
    if self.model is not None and self.use_cuda:
        self.model = self.model.cuda()
    self.use_syncbn = self.cfg.get('syncbn', False)
    if self.use_syncbn:
        self.model = apex.parallel.convert_syncbn_model(self.model)
    self.optimizer = self._init_optimizer(optimizer)
    self.loss = self._init_loss(loss)
    self.lr_scheduler = self._init_lr_scheduler(lr_scheduler)
    # Some trainers use a different train batch size from the valid batch size
    self.train_metrics = self._init_metrics(metrics)
    self.valid_metrics = self._init_metrics(metrics)
    self.train_loader = self._init_dataloader(mode='train', loader=train_loader)
    self.valid_loader = self._init_dataloader(mode='test', loader=valid_loader)
    self._init_horovod_setting()
    self.use_amp = self.cfg.get('amp', False)
    if self.use_amp:
        self.model, self.optimizer = amp.initialize(self.model,
                                                    self.optimizer,
                                                    opt_level='O1')
    if self.callbacks is None:
        self.callbacks = callbacks
    self._init_step_functions(make_batch, train_step, valid_step)
    # self.output_model_desc()
    cur_working_dir = FileOps.join_path(self.local_output_path, self.step_name)
    FileOps.make_dir(cur_working_dir)
    # Make sure the Trainer has been built for training
    self.has_built = True
def performance_path(self, worker_result_path):
    """Get performance path."""
    performance_dir = os.path.join(worker_result_path, 'performance')
    if not os.path.exists(performance_dir):
        FileOps.make_dir(performance_dir)
    return os.path.join(performance_dir, 'performance.pkl')
def build(self, model=None, optimizer=None, loss=None,
          lr_scheduler=None, metrics=None, hps=None,
          callbacks=None, train_loader=None, valid_loader=None,
          make_batch=None, train_step=None, valid_step=None,
          model_fn=None, train_input_fn=None, valid_input_fn=None,
          load_ckpt_flag=False,
          checkpoint_file_name="checkpoint.pth",
          model_pickle_file_name="model.pkl"):
    """Build the trainer by assembling the necessary components."""
    # Initialize hyperparameters from parameters or configurations
    self._init_hps(hps)
    logging.debug("Trainer Config: {}".format(obj2config(self.config)))
    self.checkpoint_file_name = checkpoint_file_name
    self.model_pickle_file_name = model_pickle_file_name
    if vega.is_torch_backend():
        self._init_step_functions(make_batch, train_step, valid_step)
    elif vega.is_tf_backend():
        self._init_estimator_fn(model_fn, train_input_fn, valid_input_fn)
        self._init_tf_session()
        self._init_distributed_setting()
        self._init_cuda_setting()
        self._init_tf_estimator()
    self.do_validation = self.config.with_valid
    self.model = self._init_model(model)
    self.load_ckpt_flag = load_ckpt_flag
    if self.load_ckpt_flag:
        self.load_checkpoint()
    else:
        self._load_pretrained_model()
    self.use_syncbn = self.config.syncbn
    if self.use_syncbn and vega.is_torch_backend():
        self.model = apex.parallel.convert_syncbn_model(self.model)
    self.train_loader = self._init_dataloader(mode='train', loader=train_loader)
    self.valid_loader = self._init_dataloader(mode='val', loader=valid_loader)
    if vega.is_torch_backend():
        self.optimizer = Optimizer()(model=self.model, distributed=self.distributed) \
            if optimizer is None else optimizer
        self.loss = Loss()() if loss is None else loss
        self.lr_scheduler = LrScheduler()(self.optimizer) \
            if lr_scheduler is None else lr_scheduler
    # Some trainers use a different train batch size from the valid batch size
    self.train_metrics = self._init_metrics(metrics) if vega.is_torch_backend() else None
    self.valid_metrics = self._init_metrics(metrics)
    self._init_horovod_setting()
    if self.use_amp and vega.is_torch_backend():
        self.model, self.optimizer = amp.initialize(self.model,
                                                    self.optimizer,
                                                    opt_level='O1')
    if self.callbacks is None:
        self.callbacks = callbacks
    # self.output_model_desc()
    cur_working_dir = FileOps.join_path(self.local_output_path, self.step_name)
    FileOps.make_dir(cur_working_dir)
    # Make sure the Trainer has been built for training
    self.has_built = True
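# --- Hedged standalone sketch (not part of the original code) ---
# The "use the passed-in component, otherwise build the default" pattern that build()
# applies to the optimizer, loss and lr_scheduler under the torch backend. Plain torch
# objects stand in for the Vega Optimizer / Loss / LrScheduler wrappers.
import torch
from torch import nn

model = nn.Linear(4, 2)
optimizer, loss, lr_scheduler = None, None, None               # caller supplied nothing
optimizer = torch.optim.SGD(model.parameters(), lr=0.01) if optimizer is None else optimizer
loss = nn.CrossEntropyLoss() if loss is None else loss
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10) \
    if lr_scheduler is None else lr_scheduler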