def run(local_rank,
        config_path,
        model_dir,
        cpu_mode=False,
        after_construct_launcher_callbacks=None,
        opts=None):
    # 0. config
    cfg = config.import_config(config_path)
    cfg = AttrDict.from_dict(cfg)
    if opts is not None:
        cfg.update_from_list(opts)
    # 1. model
    model = make_model(cfg['model'])
    if cfg['train'].get('sync_bn', False):
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if not cpu_mode:
        if torch.cuda.is_available():
            torch.cuda.set_device(local_rank)
            dist.init_process_group(backend="nccl", init_method="env://")
            model.to(torch.device('cuda'))
            if dist.is_available():
                model = nn.parallel.DistributedDataParallel(
                    model,
                    device_ids=[local_rank],
                    output_device=local_rank,
                )
    # 2. data
    traindata_loader = make_dataloader(cfg['data']['train'])
    testdata_loader = make_dataloader(
        cfg['data']['test']) if 'test' in cfg['data'] else None
    # 3. optimizer
    lr_schedule = make_learningrate(cfg['learning_rate'])
    cfg['optimizer']['params']['lr'] = lr_schedule.base_lr
    optimizer = make_optimizer(cfg['optimizer'], params=model.parameters())

    tl = trainer.Launcher(model_dir=model_dir,
                          model=model,
                          optimizer=optimizer,
                          lr_schedule=lr_schedule)

    if after_construct_launcher_callbacks is not None:
        for f in after_construct_launcher_callbacks:
            f(tl)

    tl.logger.info('sync bn: {}'.format(
        'True' if cfg['train'].get('sync_bn', False) else 'False'))
    tl.logger.info('external parameter: {}'.format(opts))

    tl.train_by_config(traindata_loader,
                       config=merge_dict(cfg['train'], cfg['test']),
                       test_data_loader=testdata_loader)

    return dict(config=cfg, launcher=tl)
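# --- Usage sketch (illustrative, not part of the training module) ---
# A minimal way to drive the DDP `run` above on a single node, assuming the
# default "env://" rendezvous used by init_process_group. The config path and
# model_dir are hypothetical placeholders; adapt them to a real config file
# and output directory.
import os

import torch
import torch.multiprocessing as mp


def _ddp_worker(local_rank, world_size):
    # "env://" initialization reads the rendezvous info from these variables.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['RANK'] = str(local_rank)  # single node: rank == local_rank
    run(local_rank,
        config_path='configs/example_config.py',  # hypothetical
        model_dir='./log/example_ddp')            # hypothetical


def _launch_ddp_example():
    # one process per visible GPU
    num_gpus = torch.cuda.device_count()
    mp.spawn(_ddp_worker, args=(num_gpus,), nprocs=num_gpus)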
def run(config_path,
        model_dir,
        cpu_mode=False,
        after_construct_launcher_callbacks=None,
        opts=None):
    # 0. config
    cfg = config.import_config(config_path)
    cfg = AttrDict.from_dict(cfg)
    if opts is not None:
        cfg.update_from_list(opts)
    # 1. model
    model = make_model(cfg['model'])
    if not cpu_mode:
        if torch.cuda.is_available():
            model.to(torch.device('cuda'))
            model = nn.DataParallel(model, device_ids=list(
                range(torch.cuda.device_count())))
    # 2. data
    traindata_loader = make_dataloader(cfg['data']['train'])
    testdata_loader = make_dataloader(
        cfg['data']['test']) if 'test' in cfg['data'] else None
    # 3. optimizer
    lr_schedule = make_learningrate(cfg['learning_rate'])
    cfg['optimizer']['params']['lr'] = lr_schedule.base_lr
    optimizer = make_optimizer(cfg['optimizer'], params=model.parameters())

    tl = trainer.Launcher(model_dir=model_dir,
                          model=model,
                          optimizer=optimizer,
                          lr_schedule=lr_schedule)

    if after_construct_launcher_callbacks is not None:
        for f in after_construct_launcher_callbacks:
            f(tl)

    tl.logger.info('external parameter: {}'.format(opts))

    tl.train_by_config(traindata_loader,
                       config=merge_dict(cfg['train'], cfg['test']),
                       test_data_loader=testdata_loader)

    return dict(config=cfg, launcher=tl)
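# --- Usage sketch (illustrative, not part of the training module) ---
# The DataParallel `run` above is a single-process entry point and can be
# called directly. The paths below are hypothetical placeholders.
def _dp_example():
    result = run(config_path='configs/example_config.py',  # hypothetical
                 model_dir='./log/example_dp')             # hypothetical
    # The returned launcher can be reused after training, e.g. for inspection.
    return result['launcher']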
def run(local_rank,
        config_path,
        model_dir,
        opt_level='O0',
        cpu_mode=False,
        after_construct_launcher_callbacks=None,
        opts=None):
    # 0. config
    cfg = config.import_config(config_path)
    cfg = AttrDict.from_dict(cfg)
    if opts is not None:
        cfg.update_from_list(opts)
    # 1. model
    model = make_model(cfg['model'])
    if cfg['train'].get('apex_sync_bn', False):
        model = apex.parallel.convert_syncbn_model(model)
    # 2. optimizer
    lr_schedule = make_learningrate(cfg['learning_rate'])
    cfg['optimizer']['params']['lr'] = lr_schedule.base_lr
    optimizer = make_optimizer(cfg['optimizer'], params=model.parameters())

    if not cpu_mode:
        if torch.cuda.is_available():
            torch.cuda.set_device(local_rank)
            dist.init_process_group(backend="nccl", init_method="env://")
            model.to(torch.device('cuda'))
            if dist.is_available():
                # if OPT_LEVELS.index(opt_level) < 2:
                #     keep_batchnorm_fp32 = None
                model, optimizer = amp.initialize(
                    model,
                    optimizer,
                    opt_level=opt_level,
                )
                model = DDP(
                    model,
                    delay_allreduce=True,
                )
    # 3. data
    traindata_loader = make_dataloader(cfg['data']['train'])
    testdata_loader = make_dataloader(
        cfg['data']['test']) if 'test' in cfg['data'] else None

    tl = trainer.Launcher(model_dir=model_dir,
                          model=model,
                          optimizer=optimizer,
                          lr_schedule=lr_schedule)

    # log dist train info
    tl.logger.info(
        '[NVIDIA/apex] amp optimizer. opt_level = {}'.format(opt_level))
    tl.logger.info('apex sync bn: {}'.format(
        'on' if cfg['train'].get('apex_sync_bn', False) else 'off'))
    tl.logger.info('external parameter: {}'.format(opts))

    tl.override_backward(default_backward.amp_backward)

    if after_construct_launcher_callbacks is not None:
        for f in after_construct_launcher_callbacks:
            f(tl)

    tl.train_by_config(traindata_loader,
                       config=merge_dict(cfg['train'], cfg['test']),
                       test_data_loader=testdata_loader)
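# --- Usage sketch (illustrative, not part of the training module) ---
# The apex/amp `run` above expects one process per GPU. A common pattern is to
# let `python -m torch.distributed.launch --nproc_per_node=N this_script.py`
# spawn the processes and inject `--local_rank` into each of them. The config
# path, model_dir, and opt_level 'O1' (mixed precision) are hypothetical choices.
import argparse


def _apex_ddp_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    run(args.local_rank,
        config_path='configs/example_config.py',  # hypothetical
        model_dir='./log/example_apex_ddp',       # hypothetical
        opt_level='O1')                           # O1 = mixed precision


if __name__ == '__main__':
    _apex_ddp_main()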