Example #1
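The snippets below are standalone main() entry points from a MegEngine training/testing project. They rely on imports roughly along the following lines (a sketch; the project-specific helpers parse_args, worker, Config, mkdir_or_exist and get_root_logger are defined elsewhere in the source tree):

import os
import time
import multiprocessing as mp

import megengine as mge
import megengine.distributed as dist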
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir is not set; set it via args or in the config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('number of training GPUs: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus
        is_dict_of_dict = True
        for _, cfg_ in cfg.optimizers.items():
            if not isinstance(cfg_, dict):
                is_dict_of_dict = False
        if is_dict_of_dict:
            for _, cfg_ in cfg.optimizers.items():
                cfg_['lr'] = cfg_['lr'] * world_size
        else:
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
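For reference, the "dict of dict" optimizers layout that the learning-rate scaling loop above expects could look like this (field names and values are purely illustrative):

optimizers = dict(
    generator=dict(type='Adam', lr=1e-4, betas=(0.9, 0.999)),
    discriminator=dict(type='Adam', lr=4e-4, betas=(0.9, 0.999)),
)
# with N training GPUs, each sub-dict's lr is multiplied by N (linear scaling rule)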
Example #2
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir is not set; set it via args or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('testing on CPU only')
    else:
        world_size = len(gpu_list)
        logger.info('number of test GPUs: {}'.format(world_size))

    # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        pass  # with multiple GPUs, each sub-process selects its own device inside worker()

    if world_size > 1:
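        # rendezvous for the sub-processes: pick a free TCP port and start a dist.Server on it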
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)
        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            p = mp.Process(target=worker,
                           args=(rank, world_size, cfg, gpu_list[rank], port))
            p.start()
            processes.append(p)

        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(
                rank, code)
    else:
        worker(0, 1, cfg)
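parse_args() is not shown in the example; a minimal argparse-based sketch that would provide the attributes Example #2 reads (flag names and defaults are assumptions) might be:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='test script')
    parser.add_argument('config', help='path to the config file')
    parser.add_argument('--work_dir', default=None, help='directory for logs and outputs')
    parser.add_argument('--gpuids', default='-1', help='comma-separated GPU ids, "-1" for CPU')
    parser.add_argument('--dynamic', action='store_true', help='run in dynamic graph mode')
    parser.add_argument('--ensemble', action='store_true', help='enable ensemble testing')
    return parser.parse_args()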
Example #3
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir is not set; set it via args or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('number of test GPUs: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
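worker() itself is defined elsewhere in the project. A minimal sketch of what such a worker could look like for the spawn-based branch above (MegEngine's init_process_group signature is assumed here and may differ between versions; the model and test loop are omitted):

import megengine as mge
import megengine.distributed as dist

def worker(rank, world_size, cfg):
    # hypothetical skeleton, not the project's actual worker
    if world_size > 1:
        dist.init_process_group(
            master_ip='localhost',
            port=23333,           # illustrative fixed port; Example #2 passes a free port instead
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        mge.set_default_device('gpu{}'.format(rank))
    # ... build the model and dataset from cfg and run the evaluation loop ...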