Example #1
    def test(self, data_loader):
        self.mode = 'test'
        self.data_loader = data_loader
        self.call_hook('before_test_epoch')
        time.sleep(0.2)
        sample_nums_all_threads = 0
        save_path = osp.join(self.work_dir, "test_results")
        mkdir_or_exist(save_path)
        for i, data_batch in enumerate(data_loader):
            batchdata = data_batch
            sample_nums_for_one_thread = batchdata[0].shape[0]
            self._inner_iter = i
            self.call_hook('before_test_iter')
            self.outputs = self.model.test_step(
                batchdata,
                save_image=True,
                save_path=save_path,
                sample_id=sample_nums_all_threads +
                sample_nums_for_one_thread * self.local_rank,
                epoch=self._epoch
            )  # for ensemble: if ensemble is enabled, this epoch id is appended to the saved image name so the outputs can be averaged and moved later
            sample_nums_all_threads += self.outputs[0].shape[0]
            self.call_hook('after_test_iter')
            self._iter += 1

        self.call_hook('after_test_epoch')
        self._epoch += 1
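
All of the examples in this listing lean on the same mkdir_or_exist helper to create output directories. Its definition is not shown here; a minimal sketch, assuming it simply wraps os.makedirs with exist_ok semantics (as mmcv's helper of the same name does):

import os

def mkdir_or_exist(dir_name, mode=0o777):
    # Hypothetical helper: create dir_name (and any parents) if missing; no-op otherwise.
    if dir_name == '':
        return
    dir_name = os.path.expanduser(dir_name)
    os.makedirs(dir_name, mode=mode, exist_ok=True)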
Example #2
    def save_checkpoint(self, out_dir, create_symlink=True):
        """Save the checkpoint.

        Args:
            out_dir (str): The directory where checkpoints are saved.
            create_symlink (bool, optional): Whether to create a symlink
                "latest.pth" to point to the latest checkpoint.
                Defaults to True.
        """
        filename_tmpl = "epoch_{}"
        filename = filename_tmpl.format(self.epoch + 1)
        filepath = osp.join(out_dir, filename)
        self.logger.info('save checkpoint to {}'.format(filepath))
        mkdir_or_exist(filepath)
        if isinstance(self.model.optimizers, dict):
            for key in self.model.optimizers.keys():
                submodule = getattr(self.model, key, None)
                assert submodule is not None, "model should have submodule {}".format(
                    key)
                assert isinstance(
                    submodule, Module
                ), "submodule should be instance of megengine.module.Module"
                mge.save(submodule.state_dict(),
                         osp.join(filepath, key + module_ckpt_suffix))
                mge.save(self.model.optimizers[key].state_dict(),
                         osp.join(filepath, key + optim_ckpt_suffix))
        else:
            raise TypeError(
                "the type of optimizers should be a dict for save_checkpoint")

        if create_symlink:
            pass
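
The create_symlink branch above is left empty. The sketch below is one possible fill-in, not part of the original example: since filepath here is a directory (one sub-checkpoint file per submodule), it links out_dir/latest to the newest epoch directory rather than a latest.pth file.

import os
import os.path as osp

def create_latest_symlink(out_dir, filename):
    # Hypothetical helper: repoint out_dir/latest at the newest checkpoint directory.
    link_path = osp.join(out_dir, 'latest')
    if osp.lexists(link_path):
        os.remove(link_path)
    os.symlink(filename, link_path)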
Example #3
    def after_train_iter(self, runner):
        if not self.every_n_iters(runner, self.interval):
            return

        self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
        save_path = os.path.join(self.save_path,
                                 "iter_{}".format(runner.iter + 1))
        mkdir_or_exist(save_path)
        results = []  # list of dict
        if self.multi_process:
            assert is_distributed(
            ), "multi-process eval requires distributed (multi-process) training"
            raise NotImplementedError(
                "multi-process eval is not supported yet")
        elif self.local_rank == 0:  # let rank 0 handle all the evaluation
            for data in self.dataloader:
                outputs = runner.model.test_step(data,
                                                 save_image=self.save_image,
                                                 save_path=save_path,
                                                 ensemble=self.ensemble)
                result = runner.model.cal_for_eval(outputs, data)
                assert isinstance(result, list)
                results += result
            self.evaluate(results, runner.iter + 1)
        else:
            pass

        if is_distributed():
            dist.group_barrier()
Example #4
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either via args or in the config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('number of training GPUs: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus
        is_dict_of_dict = True
        for _, cfg_ in cfg.optimizers.items():
            if not isinstance(cfg_, dict):
                is_dict_of_dict = False
        if is_dict_of_dict:
            for _, cfg_ in cfg.optimizers.items():
                cfg_['lr'] = cfg_['lr'] * world_size
        else:
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
Example #5
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either via args or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('testing on CPU only')
    else:
        world_size = len(gpu_list)
        logger.info('number of test GPUs: {}'.format(world_size))

    # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        pass

    if world_size > 1:
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)
        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            p = mp.Process(target=worker,
                           args=(rank, world_size, cfg, gpu_list[rank], port))
            p.start()
            processes.append(p)

        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(
                rank, code)
    else:
        worker(0, 1, cfg)
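
The worker function dispatched above is not included in this listing. The sketch below is only an assumption about its shape: it takes the arguments passed by mp.Process and, in the multi-process case, joins the process group served by dist.Server(port); the init_process_group keyword names follow MegEngine's distributed API.

import megengine.distributed as dist

def worker(rank, world_size, cfg, gpu_id="0", port=None):
    # Hypothetical per-process entry point matching the mp.Process call above.
    if world_size > 1:
        dist.init_process_group(master_ip="localhost",
                                port=port,
                                world_size=world_size,
                                rank=rank,
                                device=int(gpu_id))
    # ... build the model, runner and dataloader from cfg and run the test loop ...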
Example #6
    def after_train_iter(self, runner):
        """The behavior after each train iteration.

        Args:
            runner (``edit.core.runner.BaseRunner``): The runner.
        """
        if not self.every_n_iters(runner, self.interval):
            return

        # for key, para in runner.model.generator.named_parameters():
        #     para.requires_grad = False

        self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
        save_path = os.path.join(self.save_path,
                                 "iter_{}".format(runner.iter + 1))
        mkdir_or_exist(save_path)
        results = []  # list of dict
        sample_nums_all_threads = 0
        for _, data in enumerate(self.dataloader):
            batchdata = data
            sample_nums_for_one_thread = batchdata[0].shape[0]
            outputs = runner.model.test_step(
                batchdata,
                save_image=self.save_image,
                save_path=save_path,
                sample_id=sample_nums_all_threads +
                sample_nums_for_one_thread * self.local_rank)
            if self.nranks > 1:
                # TODO: GPU is required here; gather the outputs and data
                # from all processes
                # gathered_outputs = xxx
                # gathered_batchdata = xxx
                pass
            else:
                gathered_outputs = outputs  # list of tensor
                gathered_batchdata = batchdata  # list of numpy
            assert gathered_batchdata[0].shape[0] == gathered_outputs[0].shape[
                0]  # batch dimensions must match
            assert gathered_batchdata[0].shape[
                0] == sample_nums_for_one_thread * self.nranks  # make sure these are the gathered results
            sample_nums_all_threads += gathered_outputs[0].shape[0]
            # for now every process runs the forward pass and saves its results, and rank 0
            # computes the metrics; CPU inter-process communication will be added later to
            # distribute the metric computation across processes as well
            if self.local_rank == 0:
                result = runner.model.cal_for_eval(gathered_outputs,
                                                   gathered_batchdata)
                assert is_list_of(result, dict)
                # self.logger.info(result)
                results += result
            else:
                pass
        if self.local_rank == 0:
            self.evaluate(results, runner.iter + 1)
Example #7
    def test_aggre(self, save_path, padding_len=4, start_index=1):
        clip_names = sorted(self.frame_num.keys())  # e.g. [`city`, `walk`]
        frame_nums = [self.frame_num[clip] for clip in clip_names]

        do_frames = 0
        now_clip_idx = 0
        total_deal = 0
        for _ in range(len(self)):
            do_frames += 1
            if do_frames == frame_nums[now_clip_idx]:
                clip_name = clip_names[now_clip_idx]
                # move images into the per-clip directory using shutil
                save_dir_path = osp.join(save_path, clip_name)
                mkdir_or_exist(save_dir_path)
                # index from [total_deal, total_deal + do_frames)
                for idx in range(total_deal, total_deal + do_frames):
                    ensemble_path_1 = osp.join(
                        save_path, "idx_{}_epoch_1.png".format(idx))
                    desti_path = osp.join(
                        save_dir_path,
                        str(idx - total_deal + start_index).zfill(padding_len)
                        + ".png")
                    if osp.exists(ensemble_path_1):
                        # get the content
                        path = osp.join(save_path, "idx_{}.png".format(idx))
                        sum_result = imread(path, flag='unchanged').astype(
                            np.float32)
                        os.remove(path)
                        for e in range(1, 8):
                            path = osp.join(
                                save_path,
                                "idx_{}_epoch_{}.png".format(idx, e))
                            sum_result = sum_result + imread(
                                path, flag='unchanged').astype(np.float32)
                            os.remove(path)
                        sum_result = sum_result / 8
                        # round to the nearest integer
                        sum_result = sum_result.round().astype(np.uint8)
                        # save
                        imwrite(sum_result, desti_path)
                    else:
                        # move
                        shutil.move(
                            osp.join(save_path, "idx_" + str(idx) + ".png"),
                            desti_path)

                total_deal += do_frames
                do_frames = 0
                now_clip_idx += 1
Example #8
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either via args or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('number of test GPUs: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
Example #9
def imwrite(img, file_path, params=None, auto_mkdir=True):
    """Write image to file.

    Args:
        img (ndarray): Image array to be written.
        file_path (str): Image file path.
        params (None or list): Same as opencv's :func:`imwrite` interface.
        auto_mkdir (bool): If the parent folder of `file_path` does not exist,
            whether to create it automatically.

    Returns:
        bool: Successful or not.
    """
    if auto_mkdir:
        dir_name = osp.abspath(osp.dirname(file_path))
        mkdir_or_exist(dir_name)
    return cv2.imwrite(file_path, img, params)
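
A short usage sketch for the wrapper above; the array and output path below are placeholders.

import numpy as np

# write a 64x64 black BGR image; the parent directory is created automatically
dummy = np.zeros((64, 64, 3), dtype=np.uint8)
success = imwrite(dummy, "/tmp/demo_out/black.png")
assert success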
Example #10
    def test(self, data_loader):
        self.mode = 'test'
        self.data_loader = data_loader
        self.call_hook('before_test_epoch')
        time.sleep(0.05)
        save_path = osp.join(self.work_dir, "test_results")
        mkdir_or_exist(save_path)
        for i, data_batch in enumerate(data_loader):
            batchdata = data_batch
            self._inner_iter = i
            self.call_hook('before_test_iter')
            self.outputs = self.model.test_step(batchdata,
                                                save_image=True,
                                                save_path=save_path)
            self.call_hook('after_test_iter')
            self._iter += 1

        self.call_hook('after_test_epoch')
        self._epoch += 1
Example #11
    def __init__(self, dataloader, **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError('dataloader must be a mge DataLoader, '
                            f'but got {type(dataloader)}')
        self.dataloader = dataloader
        self.eval_kwargs = eval_kwargs
        self.interval = self.eval_kwargs.pop('interval', 10000)
        self.save_image = self.eval_kwargs.pop('save_image', False)
        self.save_path = self.eval_kwargs.pop('save_path', None)
        self.log_dir_path = self.eval_kwargs.pop('log_path', None)
        self.log_path = os.path.join(self.log_dir_path, "eval.log")
        mkdir_or_exist(self.log_dir_path)
        self.logger = get_logger(name="EvalIterHook", log_file=self.log_path)

        # dist
        if is_distributed():
            self.local_rank = get_rank()
            self.nranks = get_world_size()
        else:
            self.local_rank = 0
            self.nranks = 1
Example #12
    def __init__(self, dataloader, **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError(
                'dataloader must be a mge DataLoader, but got {}'.format(
                    type(dataloader)))
        self.dataloader = dataloader
        self.eval_kwargs = eval_kwargs
        self.interval = self.eval_kwargs.pop('interval', 10000)
        self.save_image = self.eval_kwargs.pop('save_image', False)
        self.save_path = self.eval_kwargs.pop('save_path', None)
        self.log_path = self.eval_kwargs.pop('log_path', None)
        self.multi_process = self.eval_kwargs.pop('multi_process', False)
        self.ensemble = self.eval_kwargs.pop('ensemble', False)
        mkdir_or_exist(self.save_path)
        self.logger = get_logger(name="EvalIterHook",
                                 log_file=self.log_path)  # only for rank0

        if is_distributed():
            self.local_rank = dist.get_rank()
            self.nranks = dist.get_world_size()
        else:
            self.local_rank = 0
            self.nranks = 1
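
A hypothetical instantiation matching the keyword arguments popped above; the interval and paths are placeholders.

eval_hook = EvalIterHook(dataloader,
                         interval=5000,
                         save_image=True,
                         save_path="./workdir/eval_results",
                         log_path="./workdir/eval.log",
                         multi_process=False,
                         ensemble=False)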
Example #13
    def evaluate(self, results, save_path):
        """ Evaluate with different metrics.
            Args:
                results (list of dict): for every dict, record metric -> value for one frame

            Return:
                dict: Evaluation results dict.
        """
        save_SVG_path = osp.join(save_path, "SVG")
        mkdir_or_exist(save_SVG_path)
        assert is_list_of(
            results,
            dict), f'results must be a list of dict, but got {type(results)}'
        assert len(results) >= len(
            self
        ), "results length should be >= dataset length (due to multi-card eval)"
        self.logger.info(
            "eval samples length: {}, dataset length: {}, only the first {} results are used"
            .format(len(results), len(self), len(self)))
        results = results[:len(self)]

        clip_names = sorted(self.frame_num.keys())  # e.g. [`city`, `walk`]
        frame_nums = [self.frame_num[clip] for clip in clip_names]

        eval_results = defaultdict(list)  # a dict of list

        do_frames = 0
        now_clip_idx = 0
        eval_results_one_clip = defaultdict(list)
        for res in results:
            for metric, val in res.items():
                eval_results_one_clip[metric].append(val)

            do_frames += 1
            if do_frames == frame_nums[now_clip_idx]:  # finished one clip
                clip_name = clip_names[now_clip_idx]
                self.logger.info("{}: {} is ok".format(now_clip_idx,
                                                       clip_name))
                for metric, values in eval_results_one_clip.items():
                    # metric clip_name values   to save an svg
                    average = sum(values) / len(values)
                    save_filename = clip_name + "_" + metric
                    title = "{} for {}, length: {}, average: {:.4f}".format(
                        metric, clip_name, len(values), average)
                    plt.figure(figsize=(len(values) // 4 + 1, 8))
                    plt.plot(list(range(len(values))), values,
                             label=metric)  # assumes len(values) <= 10000
                    plt.title(title)
                    plt.xlabel('frame idx')
                    plt.ylabel('{} value'.format(metric))
                    plt.legend()
                    fig = plt.gcf()
                    fig.savefig(osp.join(save_SVG_path,
                                         save_filename + '.svg'),
                                dpi=600,
                                bbox_inches='tight')
                    # plt.show()
                    plt.clf()
                    plt.close()

                    eval_results[metric].append(average)

                do_frames = 0
                now_clip_idx += 1
                eval_results_one_clip = defaultdict(list)

        for metric, val_list in eval_results.items():
            assert len(val_list) == len(clip_names), (
                f'Length of evaluation result of {metric} is {len(val_list)}, '
                f'should be {len(clip_names)}')

        # average the results
        eval_results = {
            metric: sum(values) / len(values)
            for metric, values in eval_results.items()
        }

        return eval_results
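
For reference, a toy results list in the format the docstring describes, assuming a dataset of two clips with two frames each; the metric names are illustrative.

results = [
    {"PSNR": 30.12, "SSIM": 0.912},  # clip 1, frame 1
    {"PSNR": 30.54, "SSIM": 0.918},  # clip 1, frame 2
    {"PSNR": 27.80, "SSIM": 0.874},  # clip 2, frame 1
    {"PSNR": 28.03, "SSIM": 0.880},  # clip 2, frame 2
]
# evaluate() averages each metric per clip, saves one SVG curve per
# (clip, metric) pair under save_path/SVG, and returns the cross-clip averages.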