def _run_act_layer_grad(act_type, inplace=True):
    x = torch.rand(10, 1000) * 10
    m = MLP(act_layer=act_type, inplace=inplace)

    def _run(x, act_layer=''):
        if act_layer:
            # replace act layer if set
            m.act = create_act_layer(act_layer, inplace=inplace)
        out = m(x)
        l = (out - 0).pow(2).sum()
        return l

    out_me = _run(x)

    with set_layer_config(scriptable=True):
        out_jit = _run(x, act_type)
    assert torch.isclose(out_jit, out_me)

    with set_layer_config(no_jit=True):
        out_basic = _run(x, act_type)
    assert torch.isclose(out_basic, out_jit)
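# NOTE: the test above references an MLP helper that is not shown in this
# excerpt. A minimal sketch of what the test assumes: an `act` attribute that
# can be swapped via create_act_layer, and an input width matching
# torch.rand(10, 1000). The hidden/output sizes here are assumptions.
class MLP(torch.nn.Module):
    def __init__(self, act_layer="relu", inplace=True):
        super(MLP, self).__init__()
        self.fc1 = torch.nn.Linear(1000, 100)
        self.act = create_act_layer(act_layer, inplace=inplace)
        self.fc2 = torch.nn.Linear(100, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x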
def validate(args):
    setup_default_logging()

    if args.amp:
        if has_apex:
            args.apex_amp = True
        elif has_native_amp:
            args.native_amp = True
    assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set."
    args.pretrained = args.pretrained or not args.checkpoint  # might as well try to validate something
    args.prefetcher = not args.no_prefetcher

    # create model
    with set_layer_config(scriptable=args.torchscript):
        bench = create_model(
            args.model,
            bench_task='predict',
            num_classes=args.num_classes,
            pretrained=args.pretrained,
            redundant_bias=args.redundant_bias,
            soft_nms=args.soft_nms,
            checkpoint_path=args.checkpoint,
            checkpoint_ema=args.use_ema,
        )
    model_config = bench.config

    param_count = sum([m.numel() for m in bench.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))

    bench = bench.cuda()

    amp_autocast = suppress
    if args.apex_amp:
        bench = amp.initialize(bench, opt_level='O1')
        print('Using NVIDIA APEX AMP. Validating in mixed precision.')
    elif args.native_amp:
        amp_autocast = torch.cuda.amp.autocast
        print('Using native Torch AMP. Validating in mixed precision.')
    else:
        print('AMP not enabled. Validating in float32.')

    if args.num_gpu > 1:
        bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu)))

    dataset = create_dataset(args.dataset, args.root, args.split)
    input_config = resolve_input_config(args, model_config)
    loader = create_loader(
        dataset,
        input_size=input_config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=args.prefetcher,
        interpolation=input_config['interpolation'],
        fill_color=input_config['fill_color'],
        mean=input_config['mean'],
        std=input_config['std'],
        num_workers=args.workers,
        pin_mem=args.pin_mem)

    evaluator = create_evaluator(args.dataset, dataset, pred_yxyx=False)
    bench.eval()
    batch_time = AverageMeter()
    end = time.time()
    last_idx = len(loader) - 1
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            with amp_autocast():
                output = bench(input, img_info=target)
            evaluator.add_predictions(output, target)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.log_freq == 0 or i == last_idx:
                print(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    .format(i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg))

    mean_ap = 0.
    if dataset.parser.has_labels:
        mean_ap = evaluator.evaluate()
    else:
        evaluator.save(args.results)

    return mean_ap
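# NOTE: validate() above times batches with an AverageMeter that is not shown
# in this excerpt. A minimal sketch matching the .update()/.val/.avg usage
# above; any fields beyond those three are assumptions.
class AverageMeter:
    """Tracks the most recent value and a running average."""
    def __init__(self):
        self.val = 0.
        self.sum = 0.
        self.count = 0
        self.avg = 0.

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count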
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.pretrained_backbone = not args.no_pretrained_backbone
    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        os.environ['NCCL_DEBUG'] = 'INFO'
        #os.environ['CUDA_VISIBLE_DEVICES'] = [0, 1]
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    print("args: ", args)
    print("args text: ", args_text)
    if args.distributed:
        args.device = 'cuda:%d' % args.local_rank
        print("args.device: ", args.device)
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on 1 GPU.')

    use_amp = None
    if args.amp:
        # for backwards compat, `--amp` arg tries apex before native amp
        if has_apex:
            args.apex_amp = True
        elif has_native_amp:
            args.native_amp = True
        else:
            logging.warning(
                "Neither APEX nor native Torch AMP is available, using float32. "
                "Install NVIDIA apex or upgrade to PyTorch 1.6.")
    if args.apex_amp:
        if has_apex:
            use_amp = 'apex'
        else:
            logging.warning("APEX AMP not available, using float32. Install NVIDIA apex.")
    elif args.native_amp:
        if has_native_amp:
            use_amp = 'native'
        else:
            logging.warning("Native AMP not available, using float32. Upgrade to PyTorch 1.6.")

    torch.manual_seed(args.seed + args.rank)

    with set_layer_config(scriptable=args.torchscript):
        model = create_model(
            args.model,
            bench_task='train',
            num_classes=args.num_classes,
            pretrained=args.pretrained,
            pretrained_backbone=args.pretrained_backbone,
            redundant_bias=args.redundant_bias,
            label_smoothing=args.smoothing,
            legacy_focal=args.legacy_focal,
            jit_loss=args.jit_loss,
            soft_nms=args.soft_nms,
            bench_labeler=args.bench_labeler,
            checkpoint_path=args.initial_checkpoint,
        )
    model_config = model.config  # grab before we obscure with DP/DDP wrappers

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    model.cuda()
    if args.channels_last:
        model = model.to(memory_format=torch.channels_last)

    if args.distributed and args.sync_bn:
        if has_apex and use_amp != 'native':
            model = convert_syncbn_model(model)
        else:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        if args.local_rank == 0:
            logging.info(
                'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')

    if args.torchscript:
        assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model, force native amp with `--native-amp` flag'
        assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model. Use `--dist-bn reduce` instead of `--sync-bn`'
        model = torch.jit.script(model)

    optimizer = create_optimizer(args, model)

    amp_autocast = suppress  # do nothing
    loss_scaler = None
    if use_amp == 'apex':
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        loss_scaler = ApexScaler()
        if args.local_rank == 0:
            logging.info('Using NVIDIA APEX AMP. Training in mixed precision.')
    elif use_amp == 'native':
        amp_autocast = torch.cuda.amp.autocast
        loss_scaler = NativeScaler()
        if args.local_rank == 0:
            logging.info('Using native Torch AMP. Training in mixed precision.')
    else:
        if args.local_rank == 0:
            logging.info('AMP not enabled. Training in float32.')

    # optionally resume from a checkpoint
    resume_epoch = None
    if args.resume:
        resume_epoch = resume_checkpoint(
            unwrap_bench(model), args.resume,
            optimizer=None if args.no_resume_opt else optimizer,
            loss_scaler=None if args.no_resume_opt else loss_scaler,
            log_info=args.local_rank == 0)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEmaV2(model, decay=args.model_ema_decay)
        if args.resume:
            load_checkpoint(unwrap_bench(model_ema), args.resume, use_ema=True)

    if args.distributed:
        if has_apex and use_amp != 'native':
            if args.local_rank == 0:
                logging.info("Using apex DistributedDataParallel.")
            model = ApexDDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info("Using torch DistributedDataParallel.")
            model = NativeDDP(model, device_ids=[args.device])
        # NOTE: EMA model does not need to be wrapped by DDP...
        if model_ema is not None and not args.resume:
            # ...but it is a good idea to sync EMA copy of weights
            # NOTE: ModelEma init could be moved after DDP wrapper if using PyTorch DDP, not Apex.
            model_ema.set(model)

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    loader_train, loader_eval, evaluator = create_datasets_and_loaders(args, model_config)

    if model_config.num_classes < loader_train.dataset.parser.max_label:
        logging.error(
            f'Model {model_config.num_classes} has fewer classes than dataset {loader_train.dataset.parser.max_label}.')
        exit(1)
    if model_config.num_classes > loader_train.dataset.parser.max_label:
        logging.warning(
            f'Model {model_config.num_classes} has more classes than dataset {loader_train.dataset.parser.max_label}.')

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([datetime.now().strftime("%Y%m%d-%H%M%S"), args.model])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(
            model, optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler,
            checkpoint_dir=output_dir, decreasing=decreasing, unwrap_fn=unwrap_bench)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(
                epoch, model, loader_train, optimizer, args,
                lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
                amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logging.info("Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            # the overhead of evaluating with coco style datasets is fairly high, so just ema or non, not both
            if model_ema is not None:
                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
                eval_metrics = validate(model_ema.module, loader_eval, args, evaluator, log_suffix=' (EMA)')
            else:
                eval_metrics = validate(model, loader_eval, args, evaluator)

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            if saver is not None:
                update_summary(
                    epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'),
                    write_header=best_metric is None)

                # save proper checkpoint with eval metric
                best_metric, best_epoch = saver.save_checkpoint(epoch=epoch, metric=eval_metrics[eval_metric])

    except KeyboardInterrupt:
        pass

    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
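# NOTE: main() is never invoked in this excerpt. A typical entry point, with
# the single-node distributed launch that populates the WORLD_SIZE env var
# checked above (the module filename `train.py` is an assumption):
#
#   python -m torch.distributed.launch --nproc_per_node=2 train.py <args>
#
if __name__ == '__main__':
    main()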
def validate(args):
    setup_default_logging()

    if args.amp:
        if has_apex:
            args.apex_amp = True
        elif has_native_amp:
            args.native_amp = True
    assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set."
    args.pretrained = args.pretrained or not args.checkpoint  # might as well try to validate something
    args.prefetcher = not args.no_prefetcher

    # create model
    with set_layer_config(scriptable=args.torchscript):
        bench = create_model(
            args.model,
            bench_task='predict',
            num_classes=args.num_classes,
            pretrained=args.pretrained,
            redundant_bias=args.redundant_bias,
            soft_nms=args.soft_nms,
            checkpoint_path=args.checkpoint,
            checkpoint_ema=args.use_ema,
        )
    model_config = bench.config

    param_count = sum([m.numel() for m in bench.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))

    bench = bench.cuda()

    amp_autocast = suppress
    if args.apex_amp:
        bench = amp.initialize(bench, opt_level='O1')
        print('Using NVIDIA APEX AMP. Validating in mixed precision.')
    elif args.native_amp:
        amp_autocast = torch.cuda.amp.autocast
        print('Using native Torch AMP. Validating in mixed precision.')
    else:
        print('AMP not enabled. Validating in float32.')

    if args.num_gpu > 1:
        bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu)))

    dataset = create_dataset(args.dataset, args.root, args.split)
    input_config = resolve_input_config(args, model_config)
    loader = create_loader(
        dataset,
        input_size=input_config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=args.prefetcher,
        interpolation=input_config['interpolation'],
        fill_color=input_config['fill_color'],
        mean=input_config['mean'],
        std=input_config['std'],
        num_workers=args.workers,
        pin_mem=args.pin_mem)

    evaluator = create_evaluator(args.dataset, dataset, pred_yxyx=False)
    bench.eval()
    batch_time = AverageMeter()
    end = time.time()
    last_idx = len(loader) - 1
    imgs = []
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            # keep a CPU copy of every image for visualization after evaluation
            for b in range(input.shape[0]):
                imgs.append(input[b].cpu().numpy())
                # targets.append(target[b].cpu().numpy())
            with amp_autocast():
                output = bench(input, img_info=target)
            evaluator.add_predictions(output, target)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.log_freq == 0 or i == last_idx:
                print(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    .format(i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg))

    mean_ap = 0.
    if dataset.parser.has_labels:
        # keep the first two detections per image
        preds = [p[:2, :] for p in evaluator.predictions]
        # NOTE: indexing imgToAnns by loader order assumes image ids run 0..N-1
        # and that every image has at least two annotations
        anns = evaluator.coco_api.imgToAnns
        targets = [
            np.asarray((anns[k][0]['bbox'], anns[k][1]['bbox']))
            for k in range(len(imgs))]
        mean_ap = evaluator.evaluate()
        if not os.path.exists(args.out_dir):
            os.mkdir(args.out_dir)
        for i, img in enumerate(imgs):
            # average the channels to grayscale, broadcast back to 3 channels
            img_m = np.mean(img, axis=0)
            for c in range(3):
                img[c] = img_m
            img_ = img.transpose(1, 2, 0)
            # min-max normalize to 0..255 so OpenCV can write the image
            m = img_.min()
            M = img_.max()
            img_ = ((img_ - m) / (M - m) * 255).astype('uint8').copy()
            img_ = draw_bbox(img_, preds[i], targets[i])
            cv2.imwrite(os.path.join(args.out_dir, '%d.jpg' % i), img_)
    else:
        evaluator.save(args.results)

    return mean_ap
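# NOTE: draw_bbox() used above is not defined in this excerpt. A minimal
# sketch under the assumption that pred rows are [xmin, ymin, xmax, ymax,
# score, class] (consistent with pred_yxyx=False) and target rows are
# COCO-style [x, y, w, h]; verify against the real helper before relying on it.
def draw_bbox(img, preds, targets):
    for p in preds:
        x1, y1, x2, y2 = [int(v) for v in p[:4]]
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)  # predictions in green
    for t in targets:
        x, y, w, h = [int(v) for v in t[:4]]
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 2)  # ground truth in red
    return img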