def main():
    """Entry point for (multi-)GPU training.

    Parses CLI args, merges them into the config, prepares a timestamped
    work dir and root logger, scales learning rates by the number of GPUs,
    then either runs a single worker or spawns one process per GPU.
    """
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'if do not set work_dir in args, please set in config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    # each run gets its own timestamped sub-directory
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus; requires the
        # 'dict of dict' style: {opt_name: {..., 'lr': value}, ...}
        if not all(
                isinstance(cfg_, dict) for cfg_ in cfg.optimizers.values()):
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")
        for cfg_ in cfg.optimizers.values():
            cfg_['lr'] = cfg_['lr'] * world_size

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)
        for rank, p in enumerate(processes):
            p.join()
            # surface sub-process failures instead of silently continuing
            # (previously exit codes were never checked)
            assert p.exitcode == 0, "subprocess {} exit with code {}".format(
                rank, p.exitcode)
    else:
        worker(0, 1, cfg)
def main():
    """Entry point for (multi-)GPU testing driven by a comma-separated GPU list.

    Reads the config and CLI args, prepares a timestamped work dir and root
    logger, parses the requested GPU ids, then runs a single worker (CPU or
    one GPU) or spawns one sub-process per GPU for distributed testing.
    """
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'if do not set work_dir in args, please set in config file'
    # each run gets its own timestamped sub-directory
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))
    # init the root logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))
    # parse requested devices; a leading "-1" selects CPU-only testing
    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('test use only cpu')
    else:
        world_size = len(gpu_list)
        logger.info('test gpus num: {}'.format(world_size))
    # assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        # multi-gpu: each worker selects its own device via its gpu id
        pass

    if world_size > 1:
        # distributed testing: one sub-process per GPU coordinated
        # through a dist server on a free port
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)
        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            p = mp.Process(target=worker,
                           args=(rank, world_size, cfg, gpu_list[rank],
                                 port))
            p.start()
            processes.append(p)
        # join all workers and fail loudly if any of them crashed
        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(
                rank, code)
    else:
        worker(0, 1, cfg)
def __init__(self, optimizer_cfg, paramwise_cfg=None): if not isinstance(optimizer_cfg, dict): raise TypeError('optimizer_cfg should be a dict', f'but got {type(optimizer_cfg)}') self.optimizer_cfg = optimizer_cfg self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg self.base_lr = optimizer_cfg.get('lr', None) self.base_wd = optimizer_cfg.get('weight_decay', None) self.logger = get_root_logger() self._validate_cfg()
def main():
    """Entry point for (multi-)GPU testing.

    Parses CLI args, merges them into the config, prepares a timestamped
    work dir and root logger, then either runs a single worker or spawns
    one process per GPU for distributed testing.
    """
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'if do not set work_dir in args, please set in config file'
    # each run gets its own timestamped sub-directory
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)
        for rank, p in enumerate(processes):
            p.join()
            # surface sub-process failures instead of silently continuing
            # (previously exit codes were never checked)
            assert p.exitcode == 0, "subprocess {} exit with code {}".format(
                rank, p.exitcode)
    else:
        worker(0, 1, cfg)
def worker(rank, world_size, cfg): logger = get_root_logger() # 每个进程再创建一个logger if world_size > 1: # Initialize distributed process group logger.info("init distributed process group {} / {}".format( rank, world_size)) dist.init_process_group( master_ip="localhost", master_port=23333, world_size=world_size, rank=rank, dev=rank % 8, ) model = build_model(cfg.model, train_cfg=cfg.train_cfg, eval_cfg=cfg.eval_cfg) datasets = [build_dataset(cfg.data.train)] train(model, datasets, cfg, rank)
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    """Per-process training entry.

    Args:
        rank (int): rank of this process in the distributed group.
        world_size (int): total number of processes.
        cfg: full training config.
        gpu_id (str): id of the GPU this process uses.
        port (int): master port for the distributed group.
    """
    # dynamic graph mode for debugging: disable tracing
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
    # every rank logs into its own file under the work dir
    log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    model = build_model(
        cfg.model, train_cfg=cfg.train_cfg,
        eval_cfg=cfg.eval_cfg)  # parameters are already randomly initialized here
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
def __init__(self, model, optimizers_cfg=None, work_dir=None):
    """Set up runner state for the given model.

    Args:
        model: must provide ``train_step``, ``test_step`` and
            ``create_optimizers``.
        optimizers_cfg (dict, optional): optimizer configuration.
        work_dir (str): output directory; must not be None.
    """
    for required in ('train_step', 'test_step', 'create_optimizers'):
        assert hasattr(model, required)
    self.model = model
    self.optimizers_cfg = optimizers_cfg
    self.logger = get_root_logger()
    self.work_dir = work_dir
    assert self.work_dir is not None
    # derive a readable model name from its class
    self._model_name = type(self.model).__name__
    # bookkeeping for the training/testing loop
    self.mode = None
    self._hooks = []
    self._epoch = 0
    self._iter = 0
    self._inner_iter = 0
    self._max_epochs = 0
    self._max_iters = 0
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    """Per-process testing entry.

    Args:
        rank (int): rank of this process in the distributed group.
        world_size (int): total number of processes.
        cfg: full test config.
        gpu_id (str): id of the GPU this process uses.
        port (int): master port for the distributed group.
    """
    # dynamic graph mode for debugging: disable tracing
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
    log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
    logger = get_root_logger(
        log_file=log_file, log_level=cfg.log_level
    )  # give each process its own root logger; only rank 0 creates the log file, the others do not and are set to error level
    model = build_model(
        cfg.model, eval_cfg=cfg.eval_cfg
    )  # eval cfg can provide some useful info, e.g. the padding multi
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
def __call__(self, model):
    """Build an optimizer for ``model`` from the stored config.

    Logs the total parameter count, then builds the optimizer from
    ``self.optimizer_cfg`` over all model parameters.

    Raises:
        NotImplementedError: if a non-empty ``paramwise_cfg`` was given
            (per-parameter lr/weight-decay is not supported yet).
    """
    optimizer_cfg = self.optimizer_cfg.copy()
    logger = get_root_logger()
    # count parameters for an informative log line
    param_nums = sum(
        int(np.prod(np.array(item.shape))) for item in model.parameters())
    logger.info("model: {} 's total parameter nums: {}".format(
        model.__class__.__name__, param_nums))
    # if no paramwise option is specified, just use the global setting
    if not self.paramwise_cfg:
        optimizer_cfg['params'] = model.parameters()
        return build_from_cfg(optimizer_cfg, OPTIMIZERS)
    # per-parameter options are unsupported; the original had unreachable
    # add_params/build code after this raise, which has been removed
    raise NotImplementedError("paramwise_cfg not implemented now")
def worker(rank, world_size, cfg): logger = get_root_logger() # 每个进程再创建一个logger # set dynamic graph for debug if cfg.dynamic: trace.enabled = False if world_size > 1: # Initialize distributed process group logger.info("init distributed process group {} / {}".format( rank, world_size)) dist.init_process_group( master_ip="localhost", master_port=23333, world_size=world_size, rank=rank, dev=rank % 8, ) model = build_model( cfg.model, eval_cfg=cfg.eval_cfg ) # eval cfg can provide some useful info, e.g. the padding multi datasets = [build_dataset(cfg.data.test)] test(model, datasets, cfg, rank)
def __init__(self, pipeline, mode='train'):
    """Store the dataset mode and build its processing pipeline.

    Args:
        pipeline (list): transform configs composed into the pipeline.
        mode (str): one of 'train', 'test' or 'eval'.
    """
    super(BaseDataset, self).__init__()
    valid_modes = ("train", "test", "eval")
    assert mode in valid_modes
    self.mode = mode
    self.pipeline = Compose(pipeline)
    self.logger = get_root_logger()
def __init__(self, pipeline, test_mode=False):
    """Build the dataset's processing pipeline and record the test flag.

    Args:
        pipeline (list): transform configs composed into the pipeline.
        test_mode (bool): whether the dataset is used for testing.
    """
    super(BaseDataset, self).__init__()
    # remember whether we run in test mode before building transforms
    self.test_mode = test_mode
    self.pipeline = Compose(pipeline)
    self.logger = get_root_logger()