def main(gpu, ngpus, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare for training
    model_cfg, eval_dir, logger = eval_utils.prepare_environment(args, cfg)
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg['model'], eval_dir, args, logger)
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(), cfg['optimizer'], logger)
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], False, logger)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume'] or cfg['test_only']:
        start_epoch = ckp_manager.restore(model, optimizer, scheduler, restore_last=True, logger=logger)

    ######################### TRAINING #########################
    if not cfg['test_only']:
        logger.add_line("=" * 30 + " Training " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            train_loader.dataset.shuffle_dataset()
            test_loader.dataset.shuffle_dataset()

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args, cfg, logger)
            top1, _ = run_phase('test', test_loader, model, None, epoch, args, cfg, logger)
            ckp_manager.save(model, optimizer, scheduler, epoch, eval_metric=top1)
            scheduler.step()

    ######################### TESTING #########################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], False, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args, cfg, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None, end_epoch, args, cfg, logger)

    ######################### LOG RESULTS #########################
    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
    logger.add_line('Clip@5: {:6.2f}'.format(top5))
    logger.add_line('Video@1: {:6.2f}'.format(top1_dense))
    logger.add_line('Video@5: {:6.2f}'.format(top5_dense))
def main_worker(gpu, ngpus_per_node, args, cfg):
    args.gpu = gpu

    # Setup environment
    args = main_utils.initialize_distributed_backend(args, ngpus_per_node)
    logger, tb_writter, model_dir = main_utils.prep_environment(args, cfg)

    # Define model
    model = main_utils.build_model(cfg['model'], logger)
    model, args, cfg['dataset']['batch_size'], cfg['num_workers'] = main_utils.distribute_model_to_cuda(
        model, args, cfg['dataset']['batch_size'], cfg['num_workers'], ngpus_per_node)

    # Define dataloaders
    train_loader, test_loader = main_utils.build_dataloaders(cfg['dataset'], cfg['num_workers'], args.distributed, logger)

    # Define criterion
    train_criterion = main_utils.build_criterion(cfg['loss'], logger=logger).cuda(gpu)

    # Define optimizer
    optimizer, scheduler = main_utils.build_optimizer(
        params=list(model.parameters()) + list(train_criterion.parameters()),
        cfg=cfg['optimizer'],
        logger=logger)
    ckp_manager = main_utils.CheckpointManager(model_dir, rank=args.rank)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume']:
        if ckp_manager.checkpoint_exists(last=True):
            start_epoch = ckp_manager.restore(restore_last=True, model=model, optimizer=optimizer,
                                              train_criterion=train_criterion)
            scheduler.step(start_epoch)
            logger.add_line("Checkpoint loaded: '{}' (epoch {})".format(ckp_manager.last_checkpoint_fn(), start_epoch))
        else:
            logger.add_line("No checkpoint found at '{}'".format(ckp_manager.last_checkpoint_fn()))

    cudnn.benchmark = True

    ############################ TRAIN #########################################
    for epoch in range(start_epoch, end_epoch):
        if epoch in cfg['optimizer']['lr']['milestones']:
            ckp_manager.save(epoch, model=model, train_criterion=train_criterion, optimizer=optimizer,
                             filename='checkpoint-ep{}.pth.tar'.format(epoch))
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        train_loader.dataset.shuffle_dataset()

        # Train for one epoch
        logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
        logger.add_line('LR: {}'.format(scheduler.get_lr()[0]))
        run_phase('train', train_loader, model, optimizer, train_criterion, epoch, args, cfg, logger, tb_writter)
        run_phase('test', test_loader, model, optimizer, train_criterion, epoch, args, cfg, logger, tb_writter)
        ckp_manager.save(epoch + 1, model=model, optimizer=optimizer, train_criterion=train_criterion)
        scheduler.step()
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args, logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(), cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if (cfg['resume'] or args.test_only) and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model, optimizer, scheduler, restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + " Training " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args, cfg, logger)
            run_phase('test', test_loader, model, None, epoch, args, cfg, logger)
            ckp_manager.save(model, optimizer, scheduler, epoch)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None, end_epoch, args, cfg, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args, cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    for ft in top1:
        logger.add_line('')
        logger.add_line('[{}] Clip@1: {:6.2f}'.format(ft, top1[ft]))
        logger.add_line('[{}] Clip@5: {:6.2f}'.format(ft, top5[ft]))
        logger.add_line('[{}] Video@1: {:6.2f}'.format(ft, top1_dense[ft]))
        logger.add_line('[{}] Video@5: {:6.2f}'.format(ft, top5_dense[ft]))
def main_worker(gpu, ngpus, args, cfg):
    args.gpu = gpu
    ngpus_per_node = ngpus

    # Setup environment
    args = main_utils.initialize_distributed_backend(args, ngpus_per_node)  ### Use other method instead
    logger, tb_writter, model_dir = main_utils.prep_environment(args, cfg)

    # Define model
    model = main_utils.build_model(cfg['model'], logger)
    model, args = main_utils.distribute_model_to_cuda(model, args)

    # Define dataloaders
    train_loader = main_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], args.multiprocessing_distributed, logger)

    # Define criterion
    train_criterion = main_utils.build_criterion(cfg['loss'], logger=logger)
    train_criterion = train_criterion.cuda()

    # Define optimizer
    optimizer, scheduler = main_utils.build_optimizer(
        params=list(model.parameters()) + list(train_criterion.parameters()),
        cfg=cfg['optimizer'],
        logger=logger)
    ckp_manager = main_utils.CheckpointManager(
        model_dir, rank=args.rank, dist=args.multiprocessing_distributed)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume']:
        if ckp_manager.checkpoint_exists(last=True):
            start_epoch = ckp_manager.restore(restore_last=True, model=model, optimizer=optimizer,
                                              train_criterion=train_criterion)
            scheduler.step(start_epoch)
            logger.add_line("Checkpoint loaded: '{}' (epoch {})".format(
                ckp_manager.last_checkpoint_fn(), start_epoch))
        else:
            logger.add_line("No checkpoint found at '{}'".format(ckp_manager.last_checkpoint_fn()))

    cudnn.benchmark = True

    ############################ TRAIN #########################################
    test_freq = cfg['test_freq'] if 'test_freq' in cfg else 1
    for epoch in range(start_epoch, end_epoch):
        if (epoch % 10) == 0:
            ckp_manager.save(epoch, model=model, train_criterion=train_criterion, optimizer=optimizer,
                             filename='checkpoint-ep{}.pth.tar'.format(epoch))
        if args.multiprocessing_distributed:
            train_loader.sampler.set_epoch(epoch)

        # Train for one epoch
        logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
        logger.add_line('LR: {}'.format(scheduler.get_lr()))
        run_phase('train', train_loader, model, optimizer, train_criterion, epoch, args, cfg, logger, tb_writter)
        scheduler.step(epoch)
        if ((epoch % test_freq) == 0) or (epoch == end_epoch - 1):
            ckp_manager.save(epoch + 1, model=model, optimizer=optimizer, train_criterion=train_criterion)
def main(gpu, ngpus, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare for training
    model_cfg, eval_dir, logger = eval_utils.prepare_environment(args, cfg)
    if 'scratch' not in cfg:
        cfg['scratch'] = False
    if 'ft_all' not in cfg:
        cfg['ft_all'] = False
    model, ckp_manager, ckp = eval_utils.build_model(model_cfg, cfg['model'], eval_dir, args, logger,
                                                     return_ckp=True, scratch=cfg['scratch'])
    params = list(model.parameters()) if cfg['ft_all'] else model.head_params()
    if cfg['use_transf'] != 'none':
        loss_cfg = yaml.safe_load(open(args.model_cfg))['loss']
        align_criterion = main_utils.build_criterion(loss_cfg, logger=logger).cuda(gpu)
        align_criterion.load_state_dict(ckp['train_criterion'])
        if type(align_criterion).__name__ == 'MultiTask':
            align_criterion = align_criterion.losses[0]  # MultiTask
        if cfg['ft_all']:
            params += list(align_criterion.parameters())
    else:
        align_criterion = None
    optimizer, scheduler = main_utils.build_optimizer(params, cfg['optimizer'], logger)
    train_loader, test_loader = build_dataloaders(cfg['dataset'], cfg['num_workers'], args.distributed, logger)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if 'resume' in cfg:
        args.resume = cfg['resume']
    if 'test_only' in cfg:
        args.test_only = cfg['test_only']
    if args.resume or args.test_only:
        start_epoch = ckp_manager.restore(model, optimizer, scheduler, restore_last=True, logger=logger)

    ######################### TRAINING #########################
    if not args.test_only:
        logger.add_line("=" * 30 + " Training " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)
            train_loader.dataset.shuffle_dataset()
            test_loader.dataset.shuffle_dataset()

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args, cfg, logger, align_criterion)
            top1 = run_phase('test', test_loader, model, None, epoch, args, cfg, logger, align_criterion)
            ckp_manager.save(model, optimizer, scheduler, epoch, criterion=align_criterion, eval_metric=top1)
            scheduler.step()

    ######################### TESTING #########################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    top1 = run_phase('test', test_loader, model, None, end_epoch, args, cfg, logger, align_criterion)

    ######################### LOG RESULTS #########################
    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args, logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(), cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume'] and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model, optimizer, scheduler, restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + " Training " + "=" * 30)

        # Warmup. Train classifier for a few epochs.
        if start_epoch == 0 and 'warmup_classifier' in cfg['optimizer'] and cfg['optimizer']['warmup_classifier']:
            n_wu_epochs = cfg['optimizer']['warmup_epochs'] if 'warmup_epochs' in cfg['optimizer'] else 5
            cls_opt, _ = main_utils.build_optimizer(
                params=[p for n, p in model.named_parameters() if 'feature_extractor' not in n],
                cfg={'lr': {'base_lr': cfg['optimizer']['lr']['base_lr'],
                            'milestones': [n_wu_epochs],
                            'gamma': 1.},
                     'weight_decay': cfg['optimizer']['weight_decay'],
                     'name': cfg['optimizer']['name']})
            for epoch in range(n_wu_epochs):
                run_phase('train', train_loader, model, cls_opt, epoch, args, cfg, logger)
                top1, _ = run_phase('test', test_loader, model, None, epoch, args, cfg, logger)

        # Main training loop
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args, cfg, logger)
            top1, _ = run_phase('test', test_loader, model, None, epoch, args, cfg, logger)
            ckp_manager.save(model, optimizer, scheduler, epoch, eval_metric=top1)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25  # Evaluate clip-level predictions with 25 clips per video for metric stability
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args, cfg, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None, end_epoch, args, cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
    logger.add_line('Clip@5: {:6.2f}'.format(top5))
    logger.add_line('Video@1: {:6.2f}'.format(top1_dense))
    logger.add_line('Video@5: {:6.2f}'.format(top5_dense))
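The worker entry points above follow the torch.multiprocessing.spawn calling convention, where the spawned process index is passed as the first argument. The launcher below is a minimal, hypothetical sketch of how such a worker might be started; the flag names are assumptions and not part of the original scripts, the fold-based workers would additionally receive a fold argument, and a real launcher would also define the remaining flags the workers read (e.g. resume, test_only, distributed settings).

import argparse

import torch
import torch.multiprocessing as mp
import yaml


def launch(worker_fn):
    # Hypothetical launcher sketch: parse a YAML experiment config and
    # spawn one worker process per visible GPU.
    parser = argparse.ArgumentParser()
    parser.add_argument('cfg', help='path to a YAML experiment config')
    parser.add_argument('--gpu', type=int, default=None, help='run on a single, fixed GPU')
    args = parser.parse_args()

    with open(args.cfg) as f:
        cfg = yaml.safe_load(f)

    ngpus = torch.cuda.device_count()
    if args.gpu is None and ngpus > 1:
        # mp.spawn calls worker_fn(i, *args) with i = process index,
        # matching the (gpu, ngpus, args, cfg) signatures used above.
        mp.spawn(worker_fn, nprocs=ngpus, args=(ngpus, args, cfg))
    else:
        worker_fn(args.gpu if args.gpu is not None else 0, ngpus, args, cfg)


if __name__ == '__main__':
    launch(main_worker)  # or launch(main), depending on the script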