Example #1
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either in the arguments or in the config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus
        is_dict_of_dict = True
        for _, cfg_ in cfg.optimizers.items():
            if not isinstance(cfg_, dict):
                is_dict_of_dict = False
        if is_dict_of_dict:
            for _, cfg_ in cfg.optimizers.items():
                cfg_['lr'] = cfg_['lr'] * world_size
        else:
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
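
Example #1 scales each optimizer's learning rate by the number of GPUs and therefore requires the optimizers config to be written in a 'dict of dict' style. A minimal sketch of such a config fragment is shown below; the entry names (generator, discriminator) and the Adam settings are illustrative assumptions, not taken from the project.

# Hypothetical config fragment in the 'dict of dict' style expected by Example #1.
# Each top-level key maps to one optimizer config dict, which is what the
# lr-scaling loop iterates over.
optimizers = dict(
    generator=dict(type='Adam', lr=1e-4, weight_decay=0),
    discriminator=dict(type='Adam', lr=4e-4, weight_decay=0),
)
# With world_size == 2 the loop rewrites the learning rates to 2e-4 and 8e-4.
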
Example #2
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either in the arguments or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('testing with CPU only')
    else:
        world_size = len(gpu_list)
        logger.info('test gpus num: {}'.format(world_size))

    # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        pass

    if world_size > 1:
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)
        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            p = mp.Process(target=worker,
                           args=(rank, world_size, cfg, gpu_list[rank], port))
            p.start()
            processes.append(p)

        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(
                rank, code)
    else:
        worker(0, 1, cfg)
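
Example #2 derives world_size from the comma-separated args.gpuids string, with "-1" meaning CPU-only. The sketch below isolates that parsing rule as a standalone function so the mapping is easy to verify; the helper name is hypothetical.

# Hypothetical helper mirroring the gpuids parsing in Example #2.
def parse_gpuids(gpuids):
    gpu_list = [item.strip() for item in gpuids.split(",")]
    if gpu_list[0] == "-1":
        return [], 0                # CPU-only: no GPU ids, world_size 0
    return gpu_list, len(gpu_list)  # one worker process per GPU id

# parse_gpuids("-1")      -> ([], 0)
# parse_gpuids("0")       -> (["0"], 1)
# parse_gpuids("0, 1, 3") -> (["0", "1", "3"], 3)
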
Example #3
 def __init__(self, optimizer_cfg, paramwise_cfg=None):
     if not isinstance(optimizer_cfg, dict):
         raise TypeError('optimizer_cfg should be a dict, '
                         f'but got {type(optimizer_cfg)}')
     self.optimizer_cfg = optimizer_cfg
     self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg
     self.base_lr = optimizer_cfg.get('lr', None)
     self.base_wd = optimizer_cfg.get('weight_decay', None)
     self.logger = get_root_logger()
     self._validate_cfg()
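
Examples #3 and #9 appear to be the __init__ and __call__ of the same optimizer-constructor class. A minimal usage sketch under that assumption is shown below; the class name DefaultOptimizerConstructor and the Adam settings are made up for illustration.

# Sketch only: the class name and config values are assumptions.
optimizer_cfg = dict(type='Adam', lr=1e-4, weight_decay=1e-5)
constructor = DefaultOptimizerConstructor(optimizer_cfg)  # __init__ from Example #3
optimizer = constructor(model)                            # __call__ from Example #9
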
Example #4
File: test.py  Project: xxoox168/MgeEditing
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either in the arguments or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
Example #5
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # create a logger again in each process
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    model = build_model(cfg.model,
                        train_cfg=cfg.train_cfg,
                        eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
Example #6
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False

    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
        log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
        logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    model = build_model(cfg.model,
                        train_cfg=cfg.train_cfg,
                        eval_cfg=cfg.eval_cfg)  # parameters have already been randomly initialized at this point
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
Example #7
    def __init__(self, model, optimizers_cfg=None, work_dir=None):
        assert hasattr(model, 'train_step')
        assert hasattr(model, 'test_step')
        assert hasattr(model, 'create_optimizers')

        self.model = model
        self.optimizers_cfg = optimizers_cfg
        self.logger = get_root_logger()
        self.work_dir = work_dir
        assert self.work_dir is not None

        # get model name from the model class
        self._model_name = self.model.__class__.__name__
        self.mode = None
        self._hooks = []
        self._epoch = 0
        self._iter = 0
        self._inner_iter = 0
        self._max_epochs = 0
        self._max_iters = 0
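
Example #7 only checks that the model object exposes train_step, test_step, and create_optimizers. A minimal stub that would pass those hasattr asserts is sketched below; everything in it is illustrative, not project code.

class DummyModel:
    """Hypothetical stub satisfying the interface checked in Example #7."""

    def train_step(self, data, *args, **kwargs):
        return {}  # e.g. a dict of losses expected by the training loop

    def test_step(self, data, *args, **kwargs):
        return {}  # e.g. a dict of outputs / metrics

    def create_optimizers(self, optimizers_cfg):
        return {}  # build and return optimizers from the given config
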
Example #8
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False

    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
        log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
        logger = get_root_logger(
            log_file=log_file, log_level=cfg.log_level
        )  # give each process its own root logger; only rank 0 writes a log file, the others do not and use ERROR level
    model = build_model(
        cfg.model, eval_cfg=cfg.eval_cfg
    )  # eval cfg can provide some useful info, e.g. the padding multi
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
Example #9
    def __call__(self, model):
        optimizer_cfg = self.optimizer_cfg.copy()
        # if no paramwise option is specified, just use the global setting
        logger = get_root_logger()
        param_nums = 0
        for item in model.parameters():
            param_nums += np.prod(np.array(item.shape))
        logger.info("model: {} 's total parameter nums: {}".format(model.__class__.__name__, param_nums))
        
        if not self.paramwise_cfg:
            optimizer_cfg['params'] = model.parameters()
            return build_from_cfg(optimizer_cfg, OPTIMIZERS)
        else:
            raise NotImplementedError("paramwise_cfg not implemented now")

            # set param-wise lr and weight decay recursively
            params = []
            self.add_params(params, model)
            optimizer_cfg['params'] = params

            return build_from_cfg(optimizer_cfg, OPTIMIZERS)
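
The parameter count in Example #9 is just the sum, over all parameters, of the product of each parameter's shape. A framework-agnostic sketch of the same computation (the helper name is made up):

import numpy as np

def count_parameters(parameters):
    # `parameters` is any iterable of objects exposing a .shape attribute,
    # e.g. model.parameters() in Example #9.
    return int(sum(np.prod(np.array(p.shape)) for p in parameters))
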
Example #10
File: test.py  Project: xxoox168/MgeEditing
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # create a logger again in each process

    # set dynamic graph for debug
    if cfg.dynamic:
        trace.enabled = False

    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    model = build_model(
        cfg.model, eval_cfg=cfg.eval_cfg
    )  # eval cfg can provide some useful info, e.g. the padding multi
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
Example #11
 def __init__(self, pipeline, mode='train'):
     super(BaseDataset, self).__init__()
     assert mode in ("train", "test", "eval")
     self.mode = mode
     self.pipeline = Compose(pipeline)
     self.logger = get_root_logger()
Example #12
 def __init__(self, pipeline, test_mode=False):
     super(BaseDataset, self).__init__()
     self.test_mode = test_mode
     self.pipeline = Compose(pipeline)
     self.logger = get_root_logger()
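
Examples #11 and #12 are two revisions of the BaseDataset constructor, which stores a Compose pipeline and a root logger. A minimal sketch of a concrete subclass built on the Example #11 version is shown below; the class name, the items argument, and the returned keys are assumptions, not part of the project.

class ToyDataset(BaseDataset):
    """Hypothetical subclass sketch; not part of the project."""

    def __init__(self, items, pipeline, mode='train'):
        super().__init__(pipeline, mode=mode)
        self.items = items  # e.g. a list of dicts with file paths / annotations

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        results = dict(self.items[idx])  # copy so the pipeline can mutate it
        return self.pipeline(results)    # run the Compose pipeline from __init__
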