def _dist_train(model, dataset, cfg, logger, validate=False):
    """Run distributed training of ``model`` on ``dataset``.

    Wraps the model in ``MMDistributedDataParallel`` on the current CUDA
    device, builds an ``EpochBasedRunner``, registers the training,
    sampler-seed and (optionally) evaluation hooks, restores a checkpoint
    if the config names one, and finally runs the configured workflow.

    Args:
        model: Model to train; moved to the GPU with ``model.cuda()``.
        dataset: Dataset handed to ``build_dataloader``.
        cfg: Config object. Reads ``data``, ``optimizer``,
            ``optimizer_config``, ``lr_config``, ``checkpoint_config``,
            ``log_config``, ``work_dir``, ``resume_from``, ``load_from``,
            ``workflow``, ``total_epochs`` and, optionally,
            ``find_unused_parameters``.
        logger: Logger passed to the runner.
        validate: When true, register an evaluation hook chosen from
            ``cfg.data.val.type``.
    """
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.videos_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True)
    ]

    # put model on gpus
    # The flag used to be read from the config and then unconditionally
    # overwritten with True, which made the config key dead. Keep True as
    # the default (same behaviour when the key is absent) but let an
    # explicit config value take effect.
    find_unused_parameters = cfg.get('find_unused_parameters', True)
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer,
                              cfg.work_dir, logger)

    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))
        if cfg.data.val.type == 'AVADataset':
            runner.register_hook(AVADistEvalmAPHook(cfg.data.val))

    # resume_from takes precedence over load_from
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False):
    """Train ``model`` on ``dataset`` in distributed mode.

    The model is moved to the GPU and wrapped in
    ``MMDistributedDataParallel`` bound to the current CUDA device. A
    ``Runner`` is then created, the training / sampler-seed hooks (and,
    when requested, an evaluation hook) are registered, an optional
    checkpoint is restored, and the workflow from the config is executed.

    Args:
        model: Model to train.
        dataset: Dataset passed to ``build_dataloader``.
        cfg: Config object; supplies data-loader sizes, optimizer,
            hook configs, ``work_dir``, ``log_level``, ``resume_from``,
            ``load_from``, ``workflow`` and ``total_epochs``.
        validate: Register an evaluation hook selected by
            ``cfg.data.val.type`` when true.
    """
    # One distributed loader for the single training dataset.
    train_loader = build_dataloader(
        dataset,
        cfg.data.videos_per_gpu,
        cfg.data.workers_per_gpu,
        dist=True)

    # Wrap the GPU copy of the model for distributed execution.
    ddp_model = MMDistributedDataParallel(
        model.cuda(), device_ids=[torch.cuda.current_device()])

    trainer = Runner(ddp_model, batch_processor, cfg.optimizer,
                     cfg.work_dir, cfg.log_level)

    # Standard training hooks, then the per-epoch sampler seeding hook.
    grad_hook = DistOptimizerHook(**cfg.optimizer_config)
    trainer.register_training_hooks(
        cfg.lr_config,
        grad_hook,
        cfg.checkpoint_config,
        cfg.log_config)
    trainer.register_hook(DistSamplerSeedHook())

    # Evaluation hook depends on the validation dataset type.
    if validate:
        val_cfg = cfg.data.val
        if val_cfg.type in ('RawFramesDataset', 'VideoDataset'):
            trainer.register_hook(
                DistEvalTopKAccuracyHook(val_cfg, k=(1, 5)))
        elif val_cfg.type == 'AVADataset':
            trainer.register_hook(AVADistEvalmAPHook(val_cfg))

    # Resuming wins over plain weight loading when both are configured.
    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.load_checkpoint(cfg.load_from)

    trainer.run([train_loader], cfg.workflow, cfg.total_epochs)
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                ignores=None):
    """Train ``model`` on ``dataset`` in distributed mode.

    Before the model is wrapped in ``MMDistributedDataParallel`` this
    variant optionally informs the model of the data-loader length, loads
    weights from ``cfg.load_from`` non-strictly, and resets weights when
    ``cfg.model_partial_init`` is set. It then builds a ``Runner``,
    registers hooks, optionally resumes, and runs the workflow.

    Args:
        model: Model to train.
        dataset: Dataset passed to ``build_dataloader``.
        cfg: Config object; supplies data-loader sizes, optimizer, hook
            configs, ``work_dir``, ``log_level``, ``load_from``,
            ``resume_from``, ``workflow``, ``total_epochs`` and the
            optional ``model_partial_init`` / ``eval_epoch`` entries.
        validate: Register an evaluation hook selected by
            ``cfg.data.val.type`` when true.
        logger: Forwarded to ``load_checkpoint``.
        ignores: Forwarded to ``load_checkpoint``.
    """
    train_loader = build_dataloader(
        dataset,
        cfg.data.videos_per_gpu,
        cfg.data.workers_per_gpu,
        dist=True)
    steps_per_epoch = len(train_loader)

    # Models that expose ``update_state`` receive the loader length.
    if hasattr(model, 'update_state'):
        model.update_state(steps_per_epoch)

    # Weights are loaded into the bare model, before the DDP wrapper.
    if cfg.load_from:
        load_checkpoint(
            model,
            cfg.load_from,
            strict=False,
            logger=logger,
            show_converted=True,
            ignores=ignores)

    if getattr(cfg, 'model_partial_init', False):
        model.reset_weights()

    ddp_model = MMDistributedDataParallel(model.cuda())

    trainer = Runner(ddp_model, batch_processor, cfg.optimizer,
                     cfg.work_dir, cfg.log_level)

    # Warm-up fix: scale ``warmup_iters`` by the loader length unless
    # ``by_epoch`` is present and falsy (presumably converting an
    # epoch count into iterations -- confirm against the LR hook).
    lr_cfg = cfg.lr_config
    if hasattr(lr_cfg, 'warmup_iters') and getattr(lr_cfg, 'by_epoch', True):
        lr_cfg.warmup_iters *= steps_per_epoch

    grad_hook = DistOptimizerHook(**cfg.optimizer_config)
    trainer.register_training_hooks(
        cfg.lr_config,
        grad_hook,
        cfg.checkpoint_config,
        cfg.log_config)
    trainer.register_hook(DistSamplerSeedHook())

    # Evaluation hook depends on the validation dataset type.
    if validate:
        eval_epoch = getattr(cfg, 'eval_epoch', 1)
        val_type = cfg.data.val.type
        if val_type in ('RawFramesDataset', 'StreamDataset', 'VideoDataset'):
            trainer.register_hook(
                DistEvalTopKAccuracyHook(
                    cfg.data.val,
                    eval_epoch,
                    k=(1, 5),
                    num_valid_classes=cfg.data.num_test_classes))
        elif val_type == 'AVADataset':
            trainer.register_hook(
                AVADistEvalmAPHook(cfg.data.val, eval_epoch))

    if cfg.resume_from:
        trainer.resume(cfg.resume_from)

    trainer.run([train_loader], cfg.workflow, cfg.total_epochs)