def flops(model, model_info):
    """
    Count FLOPs and params.

    :param model:
    :param model_info:
    :return:
    """
    from utils.flops_counter import get_model_complexity_info
    import copy

    model = copy.deepcopy(model).cpu()
    model.eval()
    inputs = tuple(
        torch.ones(model_info['input_shapes'][k], dtype=torch.float32)
        for k in model_info['input_names'])
    macs, params = get_model_complexity_info(model,
                                             inputs,
                                             as_strings=True,
                                             print_per_layer_stat=True,
                                             verbose=True)
    _logger.info('{:<30} {:<8}'.format('Computational complexity: ', macs))
    _logger.info('{:<30} {:<8}'.format('Number of parameters: ', params))

def setup_model(self, options, model):
    dev = options['device']
    model = model.to(dev)
    input_shape = model.input_shape
    input_type = model.input_type if hasattr(model, 'input_type') else None
    self.flops, self.params_num, self.model_bytes = \
        get_model_complexity_info(model, input_shape,
                                  input_type=input_type, device=dev)
    return model

def test():
    net = cp_spp_se_resnet152()
    y = net(torch.randn(1, 3, 224, 224))
    print(y.size())
    pytorch_total_params = sum(p.numel() for p in net.parameters())
    pytorch_trainable_params = sum(p.numel() for p in net.parameters()
                                   if p.requires_grad)
    print('Total params:' + str(pytorch_total_params))
    print('Trainable params:' + str(pytorch_trainable_params))
    flops, params = get_model_complexity_info(net, (224, 224),
                                              as_strings=True,
                                              print_per_layer_stat=False)
    print('Flops: ' + flops)
    print('Params: ' + params)

def main():
    args, args_text = _parse_args()

    time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    output_base = args.output if args.output else './output'
    exp_name = '-'.join(
        [socket.gethostname(), time_stamp, args.model, 'logger.log'])
    os.makedirs(os.path.join(output_base, 'log'), exist_ok=True)
    logging.basicConfig(filename=os.path.join(output_base, 'log', exp_name),
                        filemode='w')
    setup_default_logging()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed and args.num_gpu > 1:
        logging.warning(
            'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
        args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    # use grid mask augmentation
    if args.grid:
        # import pdb;pdb.set_trace()
        grid = GridMask(args.d1, args.d2, args.rotate, args.ratio, args.mode,
                        args.prob)
    else:
        grid = None

    if args.distributed:
        logging.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on %d GPUs.' %
                     args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         drop_connect_rate=args.drop_connect,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)
    logging.info(model)

    with torch.cuda.device(0):
        # input = torch.randn(1, 3, 224, 224)
        # # scope(model, input_size=(3,224,224))
        # # import pdb; pdb.set_trace()
        size_for_madd = 224 if args.img_size is None else args.img_size
        flops, params = get_model_complexity_info(
            model, (3, size_for_madd, size_for_madd),
            as_strings=True,
            print_per_layer_stat=True)
        print("Image size used for madd cal is: ", size_for_madd)
        print("=>Flops: " + flops)
        print("=>Params: " + params)

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model,
                      sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)

    if args.num_gpu > 1:
        if args.amp:
            logging.warning(
                'AMP does not work well with nn.DataParallel, disabling. Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    label_smoothing_param = None
    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
        # parameter_alpha = train_loss_fn.alpha
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    optimizer = create_optimizer(args, model)
    # import pdb;pdb.set_trace()

    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed',
            'on' if use_amp else 'off'))

    # optionally resume from a checkpoint
    resume_state = {}
    resume_epoch = None
    if args.resume:
        resume_state, resume_epoch = resume_checkpoint(model, args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__:
            if args.local_rank == 0:
                logging.info('Restoring NVIDIA AMP state from checkpoint')
            amp.load_state_dict(resume_state['amp'])
    del resume_state

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                        model)
                if args.local_rank == 0:
                    logging.info(
                        'Converted model to use Synchronized BatchNorm.')
            except Exception as e:
                logging.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info(
                    "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.")
            model = DDP(model,
                        device_ids=[args.local_rank],
                        find_unused_parameters=True
                        )  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        logging.error(
            'Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing,
                                      args.num_classes)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        rand_erase_prob=args.reprob,
        rand_erase_mode=args.remode,
        rand_erase_count=args.recount,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        interpolation='random',  # FIXME cleanly resolve this? data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
    )
    # import pdb;pdb.set_trace()

    eval_dir = os.path.join(args.data, 'val')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            logging.error(
                'Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
    )

    # if args.mixup > 0.:
    #     # smoothing is handled with mixup label transform
    #     train_loss_fn = SoftTargetCrossEntropy().cuda()
    #     validate_loss_fn = nn.CrossEntropyLoss().cuda()
    # elif args.smoothing:
    #     train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
    #     validate_loss_fn = nn.CrossEntropyLoss().cuda()
    # else:
    #     train_loss_fn = nn.CrossEntropyLoss().cuda()
    #     validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            socket.gethostname(), time_stamp, args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir,
                                decreasing=decreasing)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.grid:
                grid.set_prob(epoch, args.st_epochs)
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            if not args.eval_only:
                train_metrics = train_epoch(epoch,
                                            model,
                                            loader_train,
                                            optimizer,
                                            train_loss_fn,
                                            args,
                                            lr_scheduler=lr_scheduler,
                                            saver=saver,
                                            output_dir=output_dir,
                                            use_amp=use_amp,
                                            model_ema=model_ema,
                                            grid=grid)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logging.info(
                        "Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size,
                              args.dist_bn == 'reduce')

            eval_metrics = validate(model, loader_eval, validate_loss_fn,
                                    args)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast',
                                                         'reduce'):
                    distribute_bn(model_ema, args.world_size,
                                  args.dist_bn == 'reduce')
                ema_eval_metrics = validate(model_ema.ema,
                                            loader_eval,
                                            validate_loss_fn,
                                            args,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    model,
                    optimizer,
                    args,
                    epoch=epoch,
                    model_ema=model_ema,
                    metric=save_metric,
                    use_amp=use_amp)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))

        out = []
        x_s, x_m, x_l = self.__yolov4(x)

        out.append(self.__head_s(x_s))
        out.append(self.__head_m(x_m))
        out.append(self.__head_l(x_l))

        if self.training:
            p, p_d = list(zip(*out))
            return p, p_d  # small, medium, large
        else:
            p, p_d = list(zip(*out))
            return p, torch.cat(p_d, 0)


if __name__ == '__main__':
    from utils.flops_counter import get_model_complexity_info

    net = Build_Model()
    print(net)

    in_img = torch.randn(1, 3, 416, 416)
    p, p_d = net(in_img)
    flops, params = get_model_complexity_info(net, (224, 224),
                                              as_strings=False,
                                              print_per_layer_stat=False)
    print('GFlops: %.3fG' % (flops / 1e9))
    print('Params: %.2fM' % (params / 1e6))

    for i in range(3):
        print(p[i].shape)
        print(p_d[i].shape)

            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 1.0 / float(n))
                m.bias.data.zero_()

    def load_pretrain(self, path):
        state_dict = torch.load(path)
        self.load_state_dict(state_dict, strict=True)


def build_efficientnet_lite(name, num_classes):
    width_coefficient, depth_coefficient, _, dropout_rate = \
        efficientnet_lite_params[name]
    model = EfficientNetLite(width_coefficient, depth_coefficient,
                             num_classes, 0.2, dropout_rate)
    return model


if __name__ == '__main__':
    model_name = 'efficientnet_lite0'
    model = build_efficientnet_lite(model_name, 1000)
    model.eval()

    from utils.flops_counter import get_model_complexity_info

    wh = efficientnet_lite_params[model_name][2]
    input_shape = (3, wh, wh)
    flops, params = get_model_complexity_info(model, input_shape)
    split_line = '=' * 30
    print(f'{split_line}\nInput shape: {input_shape}\n'
          f'Flops: {flops}\nParams: {params}\n{split_line}')