def test(self, data_loader):
    """Run one test epoch over ``data_loader``, saving result images.

    Each batch is forwarded through ``self.model.test_step`` with a
    ``sample_id`` offset by this rank's position, so images saved by
    different ranks never collide.

    Args:
        data_loader: iterable of batches; batch[0] carries the batch dim.
    """
    self.mode = 'test'
    self.data_loader = data_loader
    self.call_hook('before_test_epoch')
    time.sleep(0.2)  # brief settle after the epoch hook (kept from original)
    total_samples = 0  # samples already produced across all iterations
    save_path = osp.join(self.work_dir, "test_results")
    mkdir_or_exist(save_path)
    for idx, batch in enumerate(data_loader):
        per_thread = batch[0].shape[0]
        self._inner_iter = idx
        self.call_hook('before_test_iter')
        self.outputs = self.model.test_step(
            batch,
            save_image=True,
            save_path=save_path,
            sample_id=total_samples + per_thread * self.local_rank,
            epoch=self._epoch)
        # NOTE(review): when ensemble is enabled, this epoch number is
        # appended to generated image names so they can later be averaged
        # and moved — confirm against test_step's save logic.
        total_samples += self.outputs[0].shape[0]
        self.call_hook('after_test_iter')
        self._iter += 1
    self.call_hook('after_test_epoch')
    self._epoch += 1
def save_checkpoint(self, out_dir, create_symlink=True):
    """Save per-submodule weights and optimizer states for the current epoch.

    A directory named ``epoch_{epoch+1}`` is created under ``out_dir``.
    For every key in ``self.model.optimizers`` the matching submodule's
    ``state_dict`` and the optimizer's ``state_dict`` are saved into it,
    using the ``module_ckpt_suffix`` / ``optim_ckpt_suffix`` filename
    suffixes.

    Args:
        out_dir (str): The directory that checkpoints are saved under.
        create_symlink (bool, optional): Whether to create a symlink
            "latest.pth" to point to the latest checkpoint. Currently a
            no-op — symlink creation is not implemented. Defaults to True.

    Raises:
        TypeError: If ``self.model.optimizers`` is not a dict.
    """
    filename = "epoch_{}".format(self.epoch + 1)
    filepath = osp.join(out_dir, filename)
    self.logger.info('save checkpoint to {}'.format(filepath))
    mkdir_or_exist(filepath)
    if isinstance(self.model.optimizers, dict):
        for key in self.model.optimizers.keys():
            # each optimizer key must name a Module attribute on the model
            submodule = getattr(self.model, key, None)
            assert submodule is not None, "model should have submodule {}".format(key)
            assert isinstance(submodule, Module), \
                "submodule should be instance of megengine.module.Module"
            mge.save(submodule.state_dict(),
                     osp.join(filepath, key + module_ckpt_suffix))
            mge.save(self.model.optimizers[key].state_dict(),
                     osp.join(filepath, key + optim_ckpt_suffix))
    else:
        raise TypeError(
            " the type of optimizers should be dict for save_checkpoint")
    if create_symlink:
        # TODO: create a "latest" symlink pointing at filepath.
        pass
def after_train_iter(self, runner):
    """Evaluate every ``self.interval`` train iterations; rank 0 does all work.

    Args:
        runner: the training runner holding the model and iteration counter.
    """
    if not self.every_n_iters(runner, self.interval):
        return
    self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
    save_path = os.path.join(self.save_path, "iter_{}".format(runner.iter + 1))
    mkdir_or_exist(save_path)
    results = []  # list of dict (one per frame)
    if self.multi_process:
        assert is_distributed(
        ), "when set multiprocess eval, you should use multi process training"
        raise NotImplementedError("not support multi process for eval now")
    elif self.local_rank == 0:
        # rank 0 handles the whole dataset by itself
        for data in self.dataloader:
            outputs = runner.model.test_step(data,
                                             save_image=self.save_image,
                                             save_path=save_path,
                                             ensemble=self.ensemble)
            batch_result = runner.model.cal_for_eval(outputs, data)
            assert isinstance(batch_result, list)
            results.extend(batch_result)
        self.evaluate(results, runner.iter + 1)
    # other ranks simply wait at the barrier below
    if is_distributed():
        dist.group_barrier()
def main():
    """Entry point for training: build config, scale lr, dispatch workers."""
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'if do not set work_dir in args, please set in config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))
    # set up the root logger and record basic run info
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:
        mge.set_default_device(device='cpux')  # cpu-only run
    else:
        mge.set_default_device(device='gpu' + args.gpuid)
    if world_size > 1:
        # scale every optimizer's learning rate by the number of gpus;
        # optimizers config must be a dict of per-optimizer dicts
        if not all(isinstance(opt_cfg, dict) for opt_cfg in cfg.optimizers.values()):
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")
        for opt_cfg in cfg.optimizers.values():
            opt_cfg['lr'] = opt_cfg['lr'] * world_size
        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        procs = [mp.Process(target=worker, args=(rank, world_size, cfg))
                 for rank in range(world_size)]
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()
    else:
        worker(0, 1, cfg)
def main():
    """Entry point for testing: parse gpu list, pick device(s), launch workers."""
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'if do not set work_dir in args, please set in config file'
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))
    # "-1" as first id means run on cpu only
    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0
        logger.info('test use only cpu')
    else:
        world_size = len(gpu_list)
        logger.info('test gpus num: {}'.format(world_size))
    # assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    if world_size > 1:
        # multi-gpu: one sub-process per device, coordinated via a dist server
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)  # noqa: F841 — must stay alive while workers run
        procs = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            proc = mp.Process(target=worker,
                              args=(rank, world_size, cfg, gpu_list[rank], port))
            proc.start()
            procs.append(proc)
        for rank, proc in enumerate(procs):
            proc.join()
            assert proc.exitcode == 0, "subprocess {} exit with code {}".format(
                rank, proc.exitcode)
    else:
        worker(0, 1, cfg)
def after_train_iter(self, runner):
    """The behavior after each train iteration.

    Args:
        runner (``edit.core.runner.BaseRunner``): The runner.
    """
    if not self.every_n_iters(runner, self.interval):
        return
    self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
    save_path = os.path.join(self.save_path, "iter_{}".format(runner.iter + 1))
    mkdir_or_exist(save_path)
    results = []  # list of dict
    samples_done = 0  # samples processed so far across all threads
    for data in self.dataloader:
        per_thread = data[0].shape[0]
        outputs = runner.model.test_step(
            data,
            save_image=self.save_image,
            save_path=save_path,
            sample_id=samples_done + per_thread * self.local_rank)
        if self.nranks > 1:
            # TODO: gather outputs and data from all ranks (GPU required);
            # gathered_outputs / gathered_batchdata left undefined until then.
            pass
        else:
            gathered_outputs = outputs  # list of tensor
            gathered_batchdata = data  # list of numpy
        # batch dims must match, and must reflect the gathered size
        assert gathered_batchdata[0].shape[0] == gathered_outputs[0].shape[0]
        assert gathered_batchdata[0].shape[0] == per_thread * self.nranks
        samples_done += gathered_outputs[0].shape[0]
        # all ranks run forward & save; rank 0 alone computes the metric.
        # later: spread metric computation over CPU processes too.
        if self.local_rank == 0:
            batch_result = runner.model.cal_for_eval(gathered_outputs,
                                                     gathered_batchdata)
            assert is_list_of(batch_result, dict)
            results += batch_result
    if self.local_rank == 0:
        self.evaluate(results, runner.iter + 1)
def test_aggre(self, save_path, padding_len=4, start_index=1): clip_names = sorted(self.frame_num.keys()) # e.g. [`city`, `walk`] frame_nums = [self.frame_num[clip] for clip in clip_names] do_frames = 0 now_clip_idx = 0 total_deal = 0 for _ in range(len(self)): do_frames += 1 if do_frames == frame_nums[now_clip_idx]: clip_name = clip_names[now_clip_idx] # move images to dir use shutil save_dir_path = osp.join(save_path, clip_name) mkdir_or_exist(save_dir_path) # index from [total_deal, total_deal + do_frames) for idx in range(total_deal, total_deal + do_frames): ensemble_path_1 = osp.join( save_path, "idx_{}_epoch_1.png".format(idx)) desti_path = osp.join( save_dir_path, str(idx - total_deal + start_index).zfill(padding_len) + ".png") if osp.exists(ensemble_path_1): # get the content path = osp.join(save_path, "idx_{}.png".format(idx)) sum_result = imread(path, flag='unchanged').astype( np.float32) os.remove(path) for e in range(1, 8): path = osp.join( save_path, "idx_{}_epoch_{}.png".format(idx, e)) sum_result = sum_result + imread( path, flag='unchanged').astype(np.float32) os.remove(path) sum_result = sum_result / 8 # 四舍五入 sum_result = sum_result.round().astype(np.uint8) # save imwrite(sum_result, desti_path) else: # move shutil.move( osp.join(save_path, "idx_" + str(idx) + ".png"), desti_path) total_deal += do_frames do_frames = 0 now_clip_idx += 1
def main():
    """Entry point for testing: build config, set the device, dispatch workers."""
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'if do not set work_dir in args, please set in config file'
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))
    # set up the root logger and record basic run info
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:
        mge.set_default_device(device='cpux')  # cpu-only run
    else:
        mge.set_default_device(device='gpu' + args.gpuid)
    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        procs = [mp.Process(target=worker, args=(rank, world_size, cfg))
                 for rank in range(world_size)]
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()
    else:
        worker(0, 1, cfg)
def imwrite(img, file_path, params=None, auto_mkdir=True):
    """Write image to file.

    Args:
        img (ndarray): Image array to be written.
        file_path (str): Image file path.
        params (None or list): Same as opencv's :func:`imwrite` interface.
        auto_mkdir (bool): If the parent folder of `file_path` does not
            exist, whether to create it automatically.

    Returns:
        bool: Successful or not.
    """
    if auto_mkdir:
        mkdir_or_exist(osp.abspath(osp.dirname(file_path)))
    return cv2.imwrite(file_path, img, params)
def test(self, data_loader):
    """Run one plain test epoch over ``data_loader``, saving result images."""
    self.mode = 'test'
    self.data_loader = data_loader
    self.call_hook('before_test_epoch')
    time.sleep(0.05)  # brief settle after the epoch hook (kept from original)
    save_path = osp.join(self.work_dir, "test_results")
    mkdir_or_exist(save_path)
    for idx, batch in enumerate(data_loader):
        self._inner_iter = idx
        self.call_hook('before_test_iter')
        self.outputs = self.model.test_step(batch,
                                            save_image=True,
                                            save_path=save_path)
        self.call_hook('after_test_iter')
        self._iter += 1
    self.call_hook('after_test_epoch')
    self._epoch += 1
def __init__(self, dataloader, **eval_kwargs):
    """Eval hook ctor: validate the dataloader, unpack options, set up logging.

    Args:
        dataloader (DataLoader): MegEngine dataloader to evaluate on.
        **eval_kwargs: interval / save_image / save_path / log_path plus any
            extra kwargs forwarded to evaluation.
    """
    if not isinstance(dataloader, DataLoader):
        raise TypeError('dataloader must be a mge DataLoader, '
                        f'but got { type(dataloader)}')
    self.dataloader = dataloader
    self.eval_kwargs = eval_kwargs
    pop = self.eval_kwargs.pop
    self.interval = pop('interval', 10000)
    self.save_image = pop('save_image', False)
    self.save_path = pop('save_path', None)
    self.log_dir_path = pop('log_path', None)
    self.log_path = os.path.join(self.log_dir_path, "eval.log")
    mkdir_or_exist(self.log_dir_path)
    self.logger = get_logger(name="EvalIterHook", log_file=self.log_path)
    # distributed rank info
    if is_distributed():
        self.local_rank = get_rank()
        self.nranks = get_world_size()
    else:
        self.local_rank = 0
        self.nranks = 1
def __init__(self, dataloader, **eval_kwargs):
    """Eval hook ctor: validate the dataloader and unpack evaluation options.

    Args:
        dataloader (DataLoader): MegEngine dataloader to evaluate on.
        **eval_kwargs: interval / save_image / save_path / log_path /
            multi_process / ensemble plus extras forwarded to evaluation.
    """
    if not isinstance(dataloader, DataLoader):
        raise TypeError(
            'dataloader must be a mge DataLoader, but got {}'.format(
                type(dataloader)))
    self.dataloader = dataloader
    self.eval_kwargs = eval_kwargs
    pop = self.eval_kwargs.pop
    self.interval = pop('interval', 10000)
    self.save_image = pop('save_image', False)
    self.save_path = pop('save_path', None)
    self.log_path = pop('log_path', None)
    self.multi_process = pop('multi_process', False)
    self.ensemble = pop('ensemble', False)
    mkdir_or_exist(self.save_path)
    # logger used only by rank 0
    self.logger = get_logger(name="EvalIterHook", log_file=self.log_path)
    if is_distributed():
        self.local_rank = dist.get_rank()
        self.nranks = dist.get_world_size()
    else:
        self.local_rank = 0
        self.nranks = 1
def evaluate(self, results, save_path):
    """Evaluate with different metrics.

    Groups per-frame results into clips (using ``self.frame_num``), saves
    one metric-curve SVG per (clip, metric) pair under ``save_path/SVG``,
    and returns each metric averaged first per clip, then over clips.

    Args:
        results (list of dict): for every dict, record metric -> value
            for one frame.
        save_path (str): directory under which the ``SVG`` folder is created.

    Returns:
        dict: metric name -> value averaged over all clips.
    """
    save_SVG_path = osp.join(save_path, "SVG")
    mkdir_or_exist(save_SVG_path)
    assert is_list_of(
        results,
        dict), f'results must be a list of dict, but got {type(results)}'
    # multi-card eval may produce duplicates at the tail; keep only the
    # first len(self) entries
    assert len(results) >= len(
        self
    ), "results length should >= dataset length, due to multicard eval"
    self.logger.info(
        "eval samples length: {}, dataset length: {}, only select front {} results"
        .format(len(results), len(self), len(self)))
    results = results[:len(self)]
    clip_names = sorted(self.frame_num.keys())  # e.g. ['city', 'walk']
    frame_nums = [self.frame_num[clip] for clip in clip_names]
    eval_results = defaultdict(list)  # a dict of list
    do_frames = 0        # frames consumed for the current clip
    now_clip_idx = 0     # index into clip_names
    eval_results_one_clip = defaultdict(list)
    for res in results:
        for metric, val in res.items():
            eval_results_one_clip[metric].append(val)
        do_frames += 1
        if do_frames == frame_nums[now_clip_idx]:
            # finished one clip: plot its curves and record the averages
            clip_name = clip_names[now_clip_idx]
            self.logger.info("{}: {} is ok".format(now_clip_idx, clip_name))
            for metric, values in eval_results_one_clip.items():
                # save one SVG per (metric, clip) pair
                average = sum(values) / len(values)
                save_filename = clip_name + "_" + metric
                title = "{} for {}, length: {}, average: {:.4f}".format(
                    metric, clip_name, len(values), average)
                plt.figure(figsize=(len(values) // 4 + 1, 8))
                plt.plot(list(range(len(values))), values,
                         label=metric)  # promise that <= 10000
                plt.title(title)
                plt.xlabel('frame idx')
                plt.ylabel('{} value'.format(metric))
                plt.legend()
                fig = plt.gcf()
                fig.savefig(osp.join(save_SVG_path, save_filename + '.svg'),
                            dpi=600,
                            bbox_inches='tight')
                # plt.show()
                plt.clf()
                plt.close()
                eval_results[metric].append(average)
            do_frames = 0
            now_clip_idx += 1
            eval_results_one_clip = defaultdict(list)
    for metric, val_list in eval_results.items():
        assert len(val_list) == len(clip_names), (
            f'Length of evaluation result of {metric} is {len(val_list)}, '
            f'should be {len(clip_names)}')
    # average the results
    eval_results = {
        metric: sum(values) / len(values)
        for metric, values in eval_results.items()
    }
    return eval_results