Example #1: a per-process run() entry point that trains with torch.nn.parallel.DistributedDataParallel, with optional SyncBatchNorm.
import torch
import torch.distributed as dist
import torch.nn as nn

# Project-level helpers (config, AttrDict, make_model, make_dataloader,
# make_learningrate, make_optimizer, trainer, merge_dict) are assumed to come
# from the surrounding training framework.


def run(local_rank,
        config_path,
        model_dir,
        cpu_mode=False,
        after_construct_launcher_callbacks=None,
        opts=None):
    """Per-process entry point for DistributedDataParallel (DDP) training."""
    # 0. config
    cfg = config.import_config(config_path)
    cfg = AttrDict.from_dict(cfg)
    if opts is not None:
        cfg.update_from_list(opts)
    # 1. model
    model = make_model(cfg['model'])
    if cfg['train'].get('sync_bn', False):
        # replace BatchNorm layers with SyncBatchNorm for cross-GPU statistics
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

    if not cpu_mode:
        if torch.cuda.is_available():
            # bind this process to its GPU and join the NCCL process group
            torch.cuda.set_device(local_rank)
            dist.init_process_group(backend="nccl", init_method="env://")
        model.to(torch.device('cuda'))
        if dist.is_available():
            # wrap the model so gradients are all-reduced across ranks
            model = nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                output_device=local_rank,
            )

    # 2. data
    traindata_loader = make_dataloader(cfg['data']['train'])
    testdata_loader = make_dataloader(
        cfg['data']['test']) if 'test' in cfg['data'] else None

    # 3. optimizer
    lr_schedule = make_learningrate(cfg['learning_rate'])
    cfg['optimizer']['params']['lr'] = lr_schedule.base_lr
    optimizer = make_optimizer(cfg['optimizer'], params=model.parameters())
    tl = trainer.Launcher(model_dir=model_dir,
                          model=model,
                          optimizer=optimizer,
                          lr_schedule=lr_schedule)

    if after_construct_launcher_callbacks is not None:
        for f in after_construct_launcher_callbacks:
            f(tl)

    tl.logger.info('sync bn: {}'.format(
        'True' if cfg['train'].get('sync_bn', False) else 'False'))
    tl.logger.info('external parameter: {}'.format(opts))
    tl.train_by_config(traindata_loader,
                       config=merge_dict(cfg['train'], cfg['test']),
                       test_data_loader=testdata_loader)
    return dict(config=cfg, launcher=tl)
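
A minimal launch sketch for this entry point, assuming it lives in a script (the name train.py and the --config_path/--model_dir flags are hypothetical) started with torch.distributed.launch, which invokes the script once per GPU and passes --local_rank to each process:

# Hypothetical launcher script (train.py); a sketch, not part of the original code.
# Example launch on one node with 4 GPUs:
#   python -m torch.distributed.launch --nproc_per_node=4 train.py \
#       --config_path configs/model.py --model_dir ./log/model
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)  # injected by torch.distributed.launch
    parser.add_argument('--config_path', type=str, required=True)
    parser.add_argument('--model_dir', type=str, required=True)
    args = parser.parse_args()
    # run() is the function defined above, assumed importable in this script
    run(args.local_rank, args.config_path, args.model_dir)
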
Example #2: a single-process run() entry point that trains with nn.DataParallel across all visible GPUs.
import torch
import torch.nn as nn

# The same project-level helpers as in Example #1 are assumed to be available.


def run(config_path,
        model_dir,
        cpu_mode=False,
        after_construct_launcher_callbacks=None,
        opts=None):
    """Entry point for single-process, multi-GPU training with nn.DataParallel."""
    # 0. config
    cfg = config.import_config(config_path)
    cfg = AttrDict.from_dict(cfg)
    if opts is not None:
        cfg.update_from_list(opts)
    # 1. model
    model = make_model(cfg['model'])

    if not cpu_mode:
        if torch.cuda.is_available():
            # replicate the model across every visible GPU within this single process
            model.to(torch.device('cuda'))
            model = nn.DataParallel(model,
                                    device_ids=list(
                                        range(torch.cuda.device_count())))

    # 2. data
    traindata_loader = make_dataloader(cfg['data']['train'])
    testdata_loader = make_dataloader(
        cfg['data']['test']) if 'test' in cfg['data'] else None

    # 3. optimizer
    lr_schedule = make_learningrate(cfg['learning_rate'])
    cfg['optimizer']['params']['lr'] = lr_schedule.base_lr
    optimizer = make_optimizer(cfg['optimizer'], params=model.parameters())

    tl = trainer.Launcher(model_dir=model_dir,
                          model=model,
                          optimizer=optimizer,
                          lr_schedule=lr_schedule)

    if after_construct_launcher_callbacks is not None:
        for f in after_construct_launcher_callbacks:
            f(tl)

    tl.logger.info('external parameter: {}'.format(opts))
    tl.train_by_config(traindata_loader,
                       config=merge_dict(cfg['train'], cfg['test']),
                       test_data_loader=testdata_loader)
    return dict(config=cfg, launcher=tl)
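
Because this variant wraps the model in nn.DataParallel inside a single process, there is no rank argument and the function can be called directly; a minimal, hypothetical invocation (both paths are placeholders):

# Hypothetical direct invocation; a sketch with placeholder paths.
result = run(config_path='configs/model.py',  # placeholder config file
             model_dir='./log/model')         # placeholder output directory
launcher = result['launcher']                 # the trainer.Launcher built inside run()
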
Example #3: a per-process run() entry point that combines NVIDIA apex mixed precision (amp) with apex DistributedDataParallel.
import torch
import torch.distributed as dist

import apex
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

# The same project-level helpers are assumed, plus default_backward, which
# supplies the amp-aware backward pass used below.


def run(local_rank,
        config_path,
        model_dir,
        opt_level='O0',
        cpu_mode=False,
        after_construct_launcher_callbacks=None,
        opts=None):
    """Per-process DDP entry point using NVIDIA apex automatic mixed precision."""
    # 0. config
    cfg = config.import_config(config_path)
    cfg = AttrDict.from_dict(cfg)
    if opts is not None:
        cfg.update_from_list(opts)
    # 1. model
    model = make_model(cfg['model'])
    if cfg['train'].get('apex_sync_bn', False):
        # apex's SyncBatchNorm conversion (cross-GPU batch statistics)
        model = apex.parallel.convert_syncbn_model(model)
    # 2. optimizer
    lr_schedule = make_learningrate(cfg['learning_rate'])
    cfg['optimizer']['params']['lr'] = lr_schedule.base_lr
    optimizer = make_optimizer(cfg['optimizer'], params=model.parameters())

    if not cpu_mode:
        if torch.cuda.is_available():
            # bind this process to its GPU and join the NCCL process group
            torch.cuda.set_device(local_rank)
            dist.init_process_group(backend="nccl", init_method="env://")
        model.to(torch.device('cuda'))
        if dist.is_available():
            # if OPT_LEVELS.index(opt_level) < 2:
            #     keep_batchnorm_fp32 = None
            # patch model and optimizer for mixed precision at the requested opt_level
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=opt_level,
            )
            # apex DDP: all-reduce gradients once the whole backward pass has finished
            model = DDP(
                model,
                delay_allreduce=True,
            )
    # 3. data
    traindata_loader = make_dataloader(cfg['data']['train'])
    testdata_loader = make_dataloader(
        cfg['data']['test']) if 'test' in cfg['data'] else None
    tl = trainer.Launcher(model_dir=model_dir,
                          model=model,
                          optimizer=optimizer,
                          lr_schedule=lr_schedule)
    # log dist train info
    tl.logger.info(
        '[NVIDIA/apex] amp optimizer. opt_level = {}'.format(opt_level))
    tl.logger.info('apex sync bn: {}'.format(
        'on' if cfg['train'].get('apex_sync_bn', False) else 'off'))
    tl.logger.info('external parameter: {}'.format(opts))
    # route backward through apex amp (scaled loss) instead of plain loss.backward()
    tl.override_backward(default_backward.amp_backward)

    if after_construct_launcher_callbacks is not None:
        for f in after_construct_launcher_callbacks:
            f(tl)

    tl.train_by_config(traindata_loader,
                       config=merge_dict(cfg['train'], cfg['test']),
                       test_data_loader=testdata_loader)
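
Like Example #1, this function is expected to run once per process under a distributed launcher; a hypothetical launch sketch that additionally exposes the apex opt_level (the script name and all flags other than --local_rank are assumptions):

# Hypothetical launcher script (train_amp.py); a sketch, not part of the original code.
#   python -m torch.distributed.launch --nproc_per_node=4 train_amp.py \
#       --config_path configs/model.py --model_dir ./log/model --opt_level O1
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)  # injected by torch.distributed.launch
    parser.add_argument('--config_path', type=str, required=True)
    parser.add_argument('--model_dir', type=str, required=True)
    parser.add_argument('--opt_level', type=str, default='O0')  # apex amp level, e.g. O0-O3
    args = parser.parse_args()
    run(args.local_rank, args.config_path, args.model_dir, opt_level=args.opt_level)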