def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'work_dir must be set either in args or in the config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size and choose the default device
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus
        is_dict_of_dict = True
        for _, cfg_ in cfg.optimizers.items():
            if not isinstance(cfg_, dict):
                is_dict_of_dict = False
        if is_dict_of_dict:
            for _, cfg_ in cfg.optimizers.items():
                cfg_['lr'] = cfg_['lr'] * world_size
        else:
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
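# The training entry point above reads args.config, args.gpus, args.gpuid,
# args.dynamic, args.work_dir and args.resume_from, but parse_args() itself is
# not part of this excerpt. The sketch below is only an assumption
# reconstructed from those attribute accesses (flag names, defaults and help
# strings are hypothetical), not the repository's actual implementation.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='train a model')
    parser.add_argument('config', help='path to the config file')
    parser.add_argument('--gpus', type=int, default=1,
                        help='number of gpus to use; 0 means cpu')
    parser.add_argument('--gpuid', type=str, default='0',
                        help='gpu index used when running on a single device')
    parser.add_argument('--dynamic', action='store_true',
                        help='run in dynamic graph mode')
    parser.add_argument('--work_dir', default=None,
                        help='directory for logs and checkpoints')
    parser.add_argument('--resume_from', default=None,
                        help='checkpoint to resume training from')
    return parser.parse_args()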
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'work_dir must be set either in args or in the config file'
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger and log the config
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    # parse the comma-separated gpu ids; "-1" means run on cpu
    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('testing with cpu only')
    else:
        world_size = len(gpu_list)
        logger.info('test gpus num: {}'.format(world_size))
    # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        pass  # with multiple gpus, each worker binds to its own device

    if world_size > 1:
        # launch a rendezvous server and dispatch one sub-process per gpu
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)
        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            p = mp.Process(target=worker,
                           args=(rank, world_size, cfg, gpu_list[rank], port))
            p.start()
            processes.append(p)
        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(
                rank, code)
    else:
        worker(0, 1, cfg)
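# The multi-gpu branch above hands each subprocess its rank, the shared config,
# a gpu id string and the rendezvous port, but worker() itself is not part of
# this excerpt. The sketch below only illustrates how such a worker might join
# the process group; the "localhost" master address is an assumption and the
# exact keyword names of dist.init_process_group can differ across MegEngine
# versions, so treat this as a sketch rather than the repository's code.
import megengine.distributed as dist


def worker(rank, world_size, cfg, gpuid='0', port=None):
    # only the multi-gpu path needs to join the process group; in the
    # single-process path main() has already selected the default device
    if world_size > 1:
        dist.init_process_group(
            master_ip='localhost',
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpuid),
        )
    # build the model and dataloader from cfg and run the test loop here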
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'work_dir must be set either in args or in the config file'
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size and choose the default device
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
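# Each of the scripts above is meant to be invoked from the command line, so a
# standard entry-point guard like the one below normally closes the file. The
# example invocations are assumptions: the script names and config path are
# placeholders, and the flags merely mirror the attributes read from
# parse_args() in the code above.
if __name__ == '__main__':
    main()

# e.g. training on 4 gpus:  python train.py path/to/config.py --gpus 4
# e.g. testing on 2 gpus:   python test.py path/to/config.py --gpuids 0,1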