# Module-level imports assumed by the functions below. The standard-library and
# PyTorch imports are certain from usage; the project-internal import paths
# (utils.logger, utils.main_utils, utils.eval_utils) are assumptions based on
# how those modules are referenced in this excerpt.
import os
import time

import yaml
import torch
import torch.distributed as dist
import torch.backends.cudnn as cudnn

import utils.logger
from utils import main_utils, eval_utils
def prepare_environment(args, cfg, fold):
    # Initialize the distributed backend, retrying on successive ports if the
    # requested one is already in use.
    if args.distributed:
        while True:
            try:
                dist.init_process_group(
                    backend='nccl',
                    init_method='tcp://localhost:{}'.format(args.port),
                    world_size=args.world_size,
                    rank=args.gpu)
                break
            except RuntimeError:
                args.port = str(int(args.port) + 1)

    # Create the evaluation folder and dump the config for reproducibility.
    with open(args.model_cfg) as f:
        model_cfg = yaml.safe_load(f)['model']
    eval_dir = '{}/{}/eval-{}/fold-{:02d}'.format(model_cfg['model_dir'],
                                                  model_cfg['name'],
                                                  cfg['benchmark']['name'],
                                                  fold)
    os.makedirs(eval_dir, exist_ok=True)
    with open('{}/config.yaml'.format(eval_dir), 'w') as f:
        yaml.safe_dump(cfg, f)

    logger = utils.logger.Logger(quiet=args.quiet,
                                 log_fn='{}/eval.log'.format(eval_dir),
                                 rank=args.gpu)

    # Log the SLURM environment (if any) and the full configs.
    if any('SLURM' in env for env in os.environ):
        logger.add_line("=" * 30 + " SLURM " + "=" * 30)
        for env in os.environ:
            if 'SLURM' in env:
                logger.add_line('{:30}: {}'.format(env, os.environ[env]))

    logger.add_line("=" * 30 + " Config " + "=" * 30)

    def print_dict(d, ident=''):
        for k in d:
            if isinstance(d[k], dict):
                logger.add_line("{}{}".format(ident, k))
                print_dict(d[k], ident=' ' + ident)
            else:
                logger.add_line("{}{}: {}".format(ident, k, str(d[k])))

    print_dict(cfg)
    logger.add_line("=" * 30 + " Model Config " + "=" * 30)
    print_dict(model_cfg)

    return eval_dir, model_cfg, logger
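# Illustration only: a minimal sketch of the model config file that
# `prepare_environment` (and `build_model` below) assume, inferred from the keys
# they read ('name', 'model_dir', 'arch', 'args'). The concrete values shown
# here are hypothetical placeholders, not the repository's actual config.
EXAMPLE_MODEL_CFG = """
model:
  name: pretrained-run          # run name; used to build eval/checkpoint paths
  model_dir: ./checkpoints      # root folder containing <name>/checkpoint.pth.tar
  arch: some_model              # key into models.__dict__ (hypothetical value)
  args: {}                      # kwargs forwarded to the model constructor
"""


def _load_example_model_cfg():
    # Mirrors the yaml.safe_load(...)['model'] access pattern used above.
    import yaml
    return yaml.safe_load(EXAMPLE_MODEL_CFG)['model']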
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args, logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(), cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if (cfg['resume'] or args.test_only) and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model, optimizer, scheduler, restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(
            ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + " Training " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args, cfg, logger)
            run_phase('test', test_loader, model, None, epoch, args, cfg, logger)
            ckp_manager.save(model, optimizer, scheduler, epoch)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args, cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    for ft in top1:
        logger.add_line('')
        logger.add_line('[{}] Clip@1: {:6.2f}'.format(ft, top1[ft]))
        logger.add_line('[{}] Clip@5: {:6.2f}'.format(ft, top5[ft]))
        logger.add_line('[{}] Video@1: {:6.2f}'.format(ft, top1_dense[ft]))
        logger.add_line('[{}] Video@5: {:6.2f}'.format(ft, top5_dense[ft]))
def run_phase(phase, loader, model, optimizer, epoch, args, cfg, logger):
    from utils import metrics_utils
    logger.add_line('\n{}: Epoch {}'.format(phase, epoch))

    # One loss/accuracy meter per evaluated feature (classification head).
    feature_names = cfg['model']['args']['feat_names']
    batch_time = metrics_utils.AverageMeter('Time', ':6.3f', 100)
    data_time = metrics_utils.AverageMeter('Data', ':6.3f', 100)
    loss_meters = {ft: metrics_utils.AverageMeter('Loss', ':.4e', 0) for ft in feature_names}
    top1_meters = {ft: metrics_utils.AverageMeter('Acc@1', ':6.2f', 0) for ft in feature_names}
    top5_meters = {ft: metrics_utils.AverageMeter('Acc@5', ':6.2f', 0) for ft in feature_names}
    progress = {'timers': utils.logger.ProgressMeter(len(loader),
                                                     meters=[batch_time, data_time],
                                                     phase=phase, epoch=epoch, logger=logger)}
    progress.update({ft: utils.logger.ProgressMeter(
        len(loader),
        meters=[loss_meters[ft], top1_meters[ft], top5_meters[ft]],
        phase=phase, epoch=epoch, logger=logger) for ft in feature_names})

    # switch to train/test mode
    model.train(phase == 'train')
    if phase in {'test_dense', 'test'}:
        # Wrap the model so large (dense) test batches are processed in chunks.
        model = BatchWrapper(model, cfg['dataset']['batch_size'])

    end = time.time()
    criterion = torch.nn.CrossEntropyLoss()
    softmax = torch.nn.Softmax(dim=1)
    for it, sample in enumerate(loader):
        data_time.update(time.time() - end)

        video = sample['frames']
        target = sample['label'].cuda()
        if args.gpu is not None:
            video = video.cuda(args.gpu, non_blocking=True)
        if phase == 'test_dense':
            # Fold the clips dimension into the batch dimension.
            batch_size, clips_per_sample = video.shape[0], video.shape[1]
            video = video.flatten(0, 1).contiguous()

        # compute outputs
        if phase == 'train':
            logits = model(video)
        else:
            with torch.no_grad():
                logits = model(video)

        # compute loss and measure accuracy
        total_loss = 0.
        for ft in feature_names:
            if phase == 'test_dense':
                # Average clip-level confidences to obtain video-level predictions.
                confidence = softmax(logits[ft]).view(batch_size, clips_per_sample, -1).mean(1)
                target_tiled = target.unsqueeze(1).repeat(1, clips_per_sample).view(-1)
                loss = criterion(logits[ft], target_tiled)
            else:
                confidence = softmax(logits[ft])
                loss = criterion(logits[ft], target)
            total_loss += loss

            with torch.no_grad():
                acc1, acc5 = metrics_utils.accuracy(confidence, target, topk=(1, 5))
                loss_meters[ft].update(loss.item(), target.size(0))
                top1_meters[ft].update(acc1[0].item(), target.size(0))
                top5_meters[ft].update(acc5[0].item(), target.size(0))

        # compute gradient and do SGD step
        if phase == 'train':
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if (it + 1) % 100 == 0 or it == 0 or it + 1 == len(loader):
            for ft in progress:
                progress[ft].display(it + 1)

    if args.distributed:
        for ft in progress:
            progress[ft].synchronize_meters(args.gpu)
            progress[ft].display(len(loader) * args.world_size)

    return ({ft: top1_meters[ft].avg for ft in feature_names},
            {ft: top5_meters[ft].avg for ft in feature_names})
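# `BatchWrapper` is used above but not defined in this excerpt. Below is a
# minimal sketch of the behaviour it is assumed to have: during dense evaluation
# the flattened batch (batch_size * clips_per_sample) may be too large for one
# forward pass, so it is pushed through the model in chunks of `batch_size` and
# the per-feature logits are concatenated back together. Treat this as an
# illustration, not the repository's actual implementation.
import torch


class BatchWrapperSketch:
    def __init__(self, model, batch_size):
        self.model = model
        self.batch_size = batch_size

    def __call__(self, x):
        outs = []
        for i in range(0, x.shape[0], self.batch_size):
            outs.append(self.model(x[i:i + self.batch_size]))
        if isinstance(outs[0], dict):
            # One logits tensor per feature name, as expected by run_phase.
            return {k: torch.cat([o[k] for o in outs], dim=0) for k in outs[0]}
        return torch.cat(outs, dim=0)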
def build_model(feat_cfg, eval_cfg, eval_dir, args, logger):
    import models
    pretrained_net = models.__dict__[feat_cfg['arch']](**feat_cfg['args'])

    # Load from checkpoint
    checkpoint_fn = '{}/{}/checkpoint.pth.tar'.format(feat_cfg['model_dir'], feat_cfg['name'])
    ckp = torch.load(checkpoint_fn, map_location='cpu')
    # Strip the 'module.' prefix added by DistributedDataParallel before loading.
    pretrained_net.load_state_dict(
        {k.replace('module.', ''): ckp['model'][k] for k in ckp['model']})

    # Wrap with linear-head classifiers
    if eval_cfg['model']['name'] == 'ClassificationWrapper':
        model = ClassificationWrapper(feature_extractor=pretrained_net.video_model,
                                      **eval_cfg['model']['args'])
        ckp_manager = CheckpointManager(eval_dir, rank=args.gpu)
    elif eval_cfg['model']['name'] == 'MOSTWrapper':
        model = MOSTModel(feature_extractor=pretrained_net.video_model,
                          **eval_cfg['model']['args'])
        ckp_manager = MOSTCheckpointManager(eval_dir, rank=args.gpu)
    else:
        raise ValueError('Unknown evaluation model: {}'.format(eval_cfg['model']['name']))

    # Log model description
    logger.add_line("=" * 30 + " Model " + "=" * 30)
    logger.add_line(str(model))
    logger.add_line("=" * 30 + " Parameters " + "=" * 30)
    logger.add_line(main_utils.parameter_description(model))
    logger.add_line("=" * 30 + " Pretrained model " + "=" * 30)
    logger.add_line("File: {}\nEpoch: {}".format(checkpoint_fn, ckp['epoch']))

    # Distribute
    model = distribute_model_to_cuda(model, args, eval_cfg)

    return model, ckp_manager
def build_dataloaders(cfg, fold, num_workers, distributed, logger):
    logger.add_line("=" * 30 + " Train DB " + "=" * 30)
    train_loader = build_dataloader(cfg, cfg['train'], fold, num_workers, distributed)
    logger.add_line(str(train_loader.dataset))

    logger.add_line("=" * 30 + " Test DB " + "=" * 30)
    test_loader = build_dataloader(cfg, cfg['test'], fold, num_workers, distributed)
    logger.add_line(str(test_loader.dataset))

    logger.add_line("=" * 30 + " Dense DB " + "=" * 30)
    dense_loader = build_dataloader(cfg, cfg['test_dense'], fold, num_workers, distributed)
    logger.add_line(str(dense_loader.dataset))

    return train_loader, test_loader, dense_loader
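# `build_dataloader` (singular) is referenced above but not shown in this
# excerpt. A minimal sketch of the standard pattern it is assumed to follow:
# attach a DistributedSampler when running distributed, then wrap the dataset in
# a DataLoader. The dataset construction itself is omitted; the function name
# and argument list here are hypothetical.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def build_dataloader_sketch(dataset, batch_size, num_workers, distributed):
    # With a DistributedSampler, shuffling is handled by the sampler
    # (via set_epoch) rather than by the DataLoader itself.
    sampler = DistributedSampler(dataset) if distributed else None
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=(sampler is None),
                      sampler=sampler,
                      num_workers=num_workers,
                      pin_memory=True,
                      drop_last=False)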
def main_worker(gpu, ngpus, args, cfg):
    args.gpu = gpu
    ngpus_per_node = ngpus

    # Setup environment
    args = main_utils.initialize_distributed_backend(args, ngpus_per_node)  ### Use other method instead
    logger, tb_writter, model_dir = main_utils.prep_environment(args, cfg)

    # Define model
    model = main_utils.build_model(cfg['model'], logger)
    model, args = main_utils.distribute_model_to_cuda(model, args)

    # Define dataloaders
    train_loader = main_utils.build_dataloaders(cfg['dataset'], cfg['num_workers'],
                                                args.multiprocessing_distributed, logger)

    # Define criterion
    train_criterion = main_utils.build_criterion(cfg['loss'], logger=logger)
    train_criterion = train_criterion.cuda()

    # Define optimizer
    optimizer, scheduler = main_utils.build_optimizer(
        params=list(model.parameters()) + list(train_criterion.parameters()),
        cfg=cfg['optimizer'],
        logger=logger)
    ckp_manager = main_utils.CheckpointManager(model_dir,
                                               rank=args.rank,
                                               dist=args.multiprocessing_distributed)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume']:
        if ckp_manager.checkpoint_exists(last=True):
            start_epoch = ckp_manager.restore(restore_last=True,
                                              model=model,
                                              optimizer=optimizer,
                                              train_criterion=train_criterion)
            scheduler.step(start_epoch)
            logger.add_line("Checkpoint loaded: '{}' (epoch {})".format(
                ckp_manager.last_checkpoint_fn(), start_epoch))
        else:
            logger.add_line("No checkpoint found at '{}'".format(
                ckp_manager.last_checkpoint_fn()))

    cudnn.benchmark = True

    ############################ TRAIN #########################################
    test_freq = cfg['test_freq'] if 'test_freq' in cfg else 1
    for epoch in range(start_epoch, end_epoch):
        if (epoch % 10) == 0:
            ckp_manager.save(epoch,
                             model=model,
                             train_criterion=train_criterion,
                             optimizer=optimizer,
                             filename='checkpoint-ep{}.pth.tar'.format(epoch))
        if args.multiprocessing_distributed:
            train_loader.sampler.set_epoch(epoch)

        # Train for one epoch
        logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
        logger.add_line('LR: {}'.format(scheduler.get_lr()))
        run_phase('train', train_loader, model, optimizer, train_criterion, epoch,
                  args, cfg, logger, tb_writter)
        scheduler.step(epoch)

        if ((epoch % test_freq) == 0) or (epoch == end_epoch - 1):
            ckp_manager.save(epoch + 1,
                             model=model,
                             optimizer=optimizer,
                             train_criterion=train_criterion)
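# Illustration only: `main_worker(gpu, ngpus, args, cfg)` has the signature
# expected by torch.multiprocessing.spawn (the worker receives its process index
# as the first argument). A hedged sketch of the kind of entry point that would
# launch one worker per visible GPU; the argument handling below is an
# assumption, not the repository's actual main().
def _launch_sketch(args, cfg):
    import torch
    import torch.multiprocessing as mp

    ngpus = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # One process per GPU; each process gets its GPU index as the first arg.
        mp.spawn(main_worker, nprocs=ngpus, args=(ngpus, args, cfg))
    else:
        main_worker(args.gpu if args.gpu is not None else 0, ngpus, args, cfg)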
def run_phase(phase, loader, model, optimizer, criterion, epoch, args, cfg, logger, tb_writter):
    from utils import metrics_utils
    logger.add_line('\n{}: Epoch {}'.format(phase, epoch))
    batch_time = metrics_utils.AverageMeter('Time', ':6.3f', window_size=100)
    data_time = metrics_utils.AverageMeter('Data', ':6.3f', window_size=100)
    loss_meter = metrics_utils.AverageMeter('Loss', ':.3e')
    loss_meter_npid1 = metrics_utils.AverageMeter('Loss_npid1', ':.3e')
    loss_meter_npid2 = metrics_utils.AverageMeter('Loss_npid2', ':.3e')
    loss_meter_cmc1 = metrics_utils.AverageMeter('Loss_cmc1', ':.3e')
    loss_meter_cmc2 = metrics_utils.AverageMeter('Loss_cmc2', ':.3e')
    progress = utils.logger.ProgressMeter(
        len(loader),
        [batch_time, data_time, loss_meter, loss_meter_npid1, loss_meter_npid2,
         loss_meter_cmc1, loss_meter_cmc2],
        phase=phase, epoch=epoch, logger=logger, tb_writter=tb_writter)

    # switch to train mode
    model.train(phase == 'train')

    end = time.time()
    device = args.gpu if args.gpu is not None else 0
    for i, sample in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if phase == 'train':
            embedding = model(sample)
        else:
            with torch.no_grad():
                embedding = model(sample)

        # compute loss
        loss, loss_debug = criterion(embedding)
        loss_meter.update(loss.item(), embedding[0].size(0))
        loss_meter_npid1.update(loss_debug[0].item(), embedding[0].size(0))
        loss_meter_npid2.update(loss_debug[1].item(), embedding[0].size(0))
        loss_meter_cmc1.update(loss_debug[2].item(), embedding[0].size(0))
        loss_meter_cmc2.update(loss_debug[3].item(), embedding[0].size(0))

        # compute gradient and do SGD step during training
        if phase == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to terminal and tensorboard
        step = epoch * len(loader) + i
        if (i + 1) % cfg['print_freq'] == 0 or i == 0 or i + 1 == len(loader):
            progress.display(i + 1)

    # Sync metrics across all GPUs and print final averages
    if args.multiprocessing_distributed:
        progress.synchronize_meters(args.gpu)
        progress.display(len(loader) * args.world_size)

    if tb_writter is not None:
        for meter in progress.meters:
            tb_writter.add_scalar('{}-epoch/{}'.format(phase, meter.name),
                                  meter.avg, epoch)
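# `metrics_utils.AverageMeter` is used throughout with a name, a format string
# and an optional window_size. A minimal sketch of the assumed interface
# (update(val, n) and .avg, with an optional sliding window), given as an
# illustration of the standard meter pattern rather than the repository's
# exact implementation.
from collections import deque


class AverageMeterSketch:
    def __init__(self, name, fmt=':f', window_size=0):
        self.name, self.fmt = name, fmt
        # window_size > 0: average over the last `window_size` updates only.
        self.window = deque(maxlen=window_size) if window_size > 0 else None
        self.sum, self.count = 0.0, 0

    def update(self, val, n=1):
        if self.window is not None:
            self.window.append((val, n))
            self.sum = sum(v * k for v, k in self.window)
            self.count = sum(k for _, k in self.window)
        else:
            self.sum += val * n
            self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)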
def run_phase(phase, loader, model, optimizer, criterion, epoch, args, cfg, logger, tb_writter):
    from utils import metrics_utils
    logger.add_line('\n{}: Epoch {}'.format(phase, epoch))
    batch_time = metrics_utils.AverageMeter('Time', ':6.3f', window_size=100)
    data_time = metrics_utils.AverageMeter('Data', ':6.3f', window_size=100)
    loss_meter = metrics_utils.AverageMeter('Loss', ':.3e')
    progress = utils.logger.ProgressMeter(len(loader),
                                          [batch_time, data_time, loss_meter],
                                          phase=phase, epoch=epoch,
                                          logger=logger, tb_writter=tb_writter)

    # switch to train mode
    model.train(phase == 'train')

    end = time.time()
    device = args.gpu if args.gpu is not None else 0
    for i, sample in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Prepare batch
        video, audio, index = sample['frames'], sample['audio'], sample['index']
        video = video.cuda(device, non_blocking=True)
        audio = audio.cuda(device, non_blocking=True)
        index = index.cuda(device, non_blocking=True)

        # compute audio and video embeddings
        if phase == 'train':
            video_emb, audio_emb = model(video, audio)
        else:
            with torch.no_grad():
                video_emb, audio_emb = model(video, audio)

        # compute loss
        loss, loss_debug = criterion(video_emb, audio_emb, index)
        loss_meter.update(loss.item(), video.size(0))

        # compute gradient and do SGD step during training
        if phase == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to terminal and tensorboard
        step = epoch * len(loader) + i
        if (i + 1) % cfg['print_freq'] == 0 or i == 0 or i + 1 == len(loader):
            progress.display(i + 1)
            if tb_writter is not None:
                for key in loss_debug:
                    tb_writter.add_scalar('{}-batch/{}'.format(phase, key),
                                          loss_debug[key].item(), step)

    # Sync metrics across all GPUs and print final averages
    if args.distributed:
        progress.synchronize_meters(args.gpu)
        progress.display(len(loader) * args.world_size)

    if tb_writter is not None:
        for meter in progress.meters:
            tb_writter.add_scalar('{}-epoch/{}'.format(phase, meter.name),
                                  meter.avg, epoch)
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args, logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(), cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume'] and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model, optimizer, scheduler, restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(
            ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + " Training " + "=" * 30)

        # Warmup. Train the classifier alone for a few epochs.
        if (start_epoch == 0 and 'warmup_classifier' in cfg['optimizer']
                and cfg['optimizer']['warmup_classifier']):
            n_wu_epochs = cfg['optimizer'].get('warmup_epochs', 5)
            cls_opt, _ = main_utils.build_optimizer(
                params=[p for n, p in model.named_parameters() if 'feature_extractor' not in n],
                cfg={'lr': {'base_lr': cfg['optimizer']['lr']['base_lr'],
                            'milestones': [n_wu_epochs],
                            'gamma': 1.},
                     'weight_decay': cfg['optimizer']['weight_decay'],
                     'name': cfg['optimizer']['name']})
            for epoch in range(n_wu_epochs):
                run_phase('train', train_loader, model, cls_opt, epoch, args, cfg, logger)
                top1, _ = run_phase('test', test_loader, model, None, epoch, args, cfg, logger)

        # Main training loop
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args, cfg, logger)
            top1, _ = run_phase('test', test_loader, model, None, epoch, args, cfg, logger)
            ckp_manager.save(model, optimizer, scheduler, epoch, eval_metric=top1)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    # Evaluate clip-level predictions with 25 clips per video for metric stability.
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args, cfg, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
    logger.add_line('Clip@5: {:6.2f}'.format(top5))
    logger.add_line('Video@1: {:6.2f}'.format(top1_dense))
    logger.add_line('Video@5: {:6.2f}'.format(top5_dense))