def main():
    args, args_text = _parse_args()
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()

    model_name = 'efficientdet_d0'
    data_config = get_efficientdet_config(model_name)

    train_anno_set = 'train2017'
    train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json')
    train_image_dir = train_anno_set
    dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path, data_config)
    print("Length of training dataset {}".format(len(dataset_train)))

    loader_train = create_loader(
        dataset_train,
        input_size=args.input_size,
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        # FIXME add back various augmentations
        # re_prob=args.reprob, re_mode=args.remode, re_count=args.recount, re_split=args.resplit,
        # color_jitter=args.color_jitter, auto_augment=args.aa,
        interpolation=args.train_interpolation,
        # mean=data_config['mean'], std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        # collate_fn=collate_fn,
        pin_mem=args.pin_mem,
    )

    print("Iterations per epoch {}".format(
        math.ceil(len(dataset_train) / (args.batch_size * args.world_size))))

    data_time_m = AverageMeter()
    end = time.time()
    if args.local_rank == 0:
        print("Starting to test...")
    for batch_idx, (input, target) in enumerate(loader_train):
        data_time_m.update(time.time() - end)
        if args.local_rank == 0 and batch_idx % 20 == 0:
            print("batch time till {} is {}".format(batch_idx, data_time_m.avg))
        end = time.time()
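# The data-loading benchmark above only needs a simple running-average meter.
# A minimal sketch of such a meter (assumed to behave like the timm-style
# AverageMeter imported by these scripts, not a copy of it):
class AverageMeterSketch:
    """Tracks the most recent value and the running average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count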
def validate(args):
    # might as well try to validate something
    args.pretrained = args.pretrained or not args.checkpoint
    args.prefetcher = not args.no_prefetcher
    args.redundant_bias = not args.no_redundant_bias

    # create model
    config = get_efficientdet_config(args.model)
    config.redundant_bias = args.redundant_bias
    model = EfficientDet(config)
    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))

    bench = DetBenchEval(model, config)
    bench = bench.cuda()
    if has_amp:
        print('Using AMP mixed precision.')
        bench = amp.initialize(bench, opt_level='O1')
    else:
        print('AMP not installed, running network in FP32.')

    if args.num_gpu > 1:
        bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu)))

    if 'test' in args.anno:
        annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json')
        image_dir = 'test2017'
    else:
        annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json')
        image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path)

    loader = create_loader(
        dataset,
        input_size=config.image_size,
        batch_size=args.batch_size,
        use_prefetcher=args.prefetcher,
        interpolation=args.interpolation,
        fill_color=args.fill_color,
        num_workers=args.workers,
        pin_mem=args.pin_mem)

    img_ids = []
    results = []
    model.eval()
    batch_time = AverageMeter()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            output = bench(input, target['scale'])
            output = output.cpu()
            sample_ids = target['img_id'].cpu()
            for index, sample in enumerate(output):
                image_id = int(sample_ids[index])
                for det in sample:
                    score = float(det[4])
                    if score < .001:  # stop when below this threshold, scores in descending order
                        break
                    coco_det = dict(
                        image_id=image_id,
                        bbox=det[0:4].tolist(),
                        score=score,
                        category_id=int(det[5]))
                    img_ids.append(image_id)
                    results.append(coco_det)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    .format(
                        i, len(loader), batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                    ))

    json.dump(results, open(args.results, 'w'), indent=4)
    if 'test' not in args.anno:
        coco_results = dataset.coco.loadRes(args.results)
        coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
        coco_eval.params.imgIds = img_ids  # score only ids we've used
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

    return results
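# The COCO-style scoring at the end of validate() can also be run offline on the saved
# results file. A hedged sketch using the standard pycocotools API (the file paths
# below are placeholders, not taken from these scripts):
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('annotations/instances_val2017.json')  # ground-truth annotations
coco_dt = coco_gt.loadRes('results.json')             # detections in COCO result format
coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()                                 # prints the AP/AR table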
def validate(args): setup_default_logging() def setthresh(): if args.checkpoint.split("/")[-1].split( "_")[0] in getthresholds.keys(): return getthresholds[args.checkpoint.split("/")[-1].split("_")[0]] else: a = [] [a.append(args.threshold) for x in range(4)] return a threshs = setthresh() print(threshs) # might as well try to validate something args.pretrained = args.pretrained or not args.checkpoint args.prefetcher = not args.no_prefetcher # create model bench = create_model( args.model, bench_task='predict', pretrained=args.pretrained, redundant_bias=args.redundant_bias, checkpoint_path=args.checkpoint, checkpoint_ema=args.use_ema, ) input_size = bench.config.image_size param_count = sum([m.numel() for m in bench.parameters()]) print('Model %s created, param count: %d' % (args.model, param_count)) bench = bench.cuda() if has_amp: print('Using AMP mixed precision.') bench = amp.initialize(bench, opt_level='O1') else: print('AMP not installed, running network in FP32.') if args.num_gpu > 1: bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu))) if 'test' in args.anno: annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json') image_dir = args.anno else: annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json') image_dir = args.anno print(os.path.join(args.data, image_dir), annotation_path) dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path) loader = create_loader(dataset, input_size=input_size, batch_size=args.batch_size, use_prefetcher=args.prefetcher, interpolation=args.interpolation, fill_color=args.fill_color, num_workers=args.workers, pin_mem=args.pin_mem, mean=args.mean, std=args.std) if 'test' in args.anno: threshold = float(args.threshold) else: threshold = .001 img_ids = [] results = [] writetofilearrtay = [] bench.eval() batch_time = AverageMeter() end = time.time() with torch.no_grad(): for i, (input, target) in enumerate(loader): output = bench(input, target['img_scale'], target['img_size']) output = output.cpu() # print(target['img_id']) sample_ids = target['img_id'].cpu() for index, sample in enumerate(output): image_id = int(sample_ids[index]) for det in sample: score = float(det[4]) if score < threshold: # stop when below this threshold, scores in descending order coco_det = dict(image_id=image_id, category_id=-1) img_ids.append(image_id) results.append(coco_det) break coco_det = dict(image_id=image_id, bbox=det[0:4].tolist(), score=score, category_id=int(det[5]), sizes=target['img_size'].tolist()[0]) img_ids.append(image_id) results.append(coco_det) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.log_freq == 0: print( 'Test: [{0:>4d}/{1}] ' 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' .format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, )) if 'test' in args.anno: from itertools import groupby results.sort(key=lambda x: x['image_id']) f = open( str(args.model) + "-" + str(args.anno) + "-" + str(min(threshs)) + ".txt", "w+") # for item in tqdm(writetofilearrtay): xxx = 0 for k, v in tqdm(groupby(results, key=lambda x: x['image_id'])): xxx += 1 f.write(getimageNamefromid(k) + ",") #print(getimageNamefromid(k),", ") for i in v: if i['category_id'] > 0: if (i['category_id'] ==1 and i['score'] >= threshs[0] ) or (i['category_id'] ==2 and i['score'] >= threshs[1] ) or \ (i['category_id'] ==3 and i['score'] >= threshs[2] ) or (i['category_id'] ==4 and i['score'] >= 
threshs[3] ) : f.write( str(round(i['category_id'])) + " " + str(round(i['bbox'][0])) + " " + str(round(i['bbox'][1])) + " " + str( round( float(i['bbox'][0]) + float(i['bbox'][2]))) + " " + str( round( float(i['bbox'][1]) + float(i['bbox'][3]))) + " ") f.write('\n') # print(i['category_id']," ",i['bbox'][0]," ",i['bbox'][1]," ",i['bbox'][2]," ",i['bbox'][3]," ") print("generated lines:", xxx) f.close() # f.close() if 'test' not in args.anno: array_of_dm = [] array_of_gt = [] i = 0 # if 'test' in args.anno : for _, item in tqdm(dataset): # if item["img_id"] == "1000780" : # print(item) for i in range(len(item['cls'])): # print(str(item["img_id"]),) array_of_gt.append( BoundingBox(imageName=str(item["img_id"]), classId=item["cls"][i], x=item["bbox"][i][1] * item['img_scale'], y=item["bbox"][i][0] * item['img_scale'], w=item["bbox"][i][3] * item['img_scale'], h=item["bbox"][i][2] * item['img_scale'], typeCoordinates=CoordinatesType.Absolute, bbType=BBType.GroundTruth, format=BBFormat.XYX2Y2, imgSize=(item['img_size'][0], item['img_size'][1]))) for item in tqdm(results): if item["category_id"] >= 0: array_of_dm.append( BoundingBox(imageName=str(item["image_id"]), classId=item["category_id"], classConfidence=item["score"], x=item['bbox'][0], y=item['bbox'][1], w=item['bbox'][2], h=item['bbox'][3], typeCoordinates=CoordinatesType.Absolute, bbType=BBType.Detected, format=BBFormat.XYWH, imgSize=(item['sizes'][0], item['sizes'][1]))) myBoundingBoxes = BoundingBoxes() # # # # Add all bounding boxes to the BoundingBoxes object: for box in (array_of_gt): myBoundingBoxes.addBoundingBox(box) for dm in array_of_dm: myBoundingBoxes.addBoundingBox(dm) evaluator = Evaluator() f1res = [] f1resd0 = [] f1resd10 = [] f1resd20 = [] f1resd40 = [] for conf in tqdm(range(210, 600, 1)): metricsPerClass = evaluator.GetPascalVOCMetrics( myBoundingBoxes, IOUThreshold=0.5, ConfThreshold=conf / 1000.0) totalTP = 0 totalp = 0 totalFP = 0 tp = [] fp = [] ta = [] # print('-------') for mc in metricsPerClass: tp.append(mc['total TP']) fp.append(mc['total FP']) ta.append(mc['total positives']) totalFP = totalFP + mc['total FP'] totalTP = totalTP + mc['total TP'] totalp = totalp + (mc['total positives']) # print(totalTP," ",totalFP," ",totalp) if totalTP + totalFP == 0: p = -1 else: p = totalTP / (totalTP + totalFP) if totalp == 0: r = -1 else: r = totalTP / (totalp) f1_dict = dict(tp=totalTP, fp=totalFP, totalp=totalp, conf=conf / 1000.0, prec=p, rec=r, f1score=(2 * p * r) / (p + r)) f1res.append(f1_dict) #must clean these parts f1resd0.append( dict(tp=tp[0], fp=fp[0], totalp=ta[0], conf=conf / 1000.0, prec=tp[0] / (tp[0] + fp[0]), rec=tp[0] / ta[0], f1score=(2 * (tp[0] / (tp[0] + fp[0])) * (tp[0] / ta[0])) / ((tp[0] / (tp[0] + fp[0])) + (tp[0] / ta[0])))) f1resd10.append( dict(tp=tp[1], fp=fp[1], totalp=ta[1], conf=conf / 1000.0, prec=tp[1] / (tp[1] + fp[1]), rec=tp[1] / ta[1], f1score=(2 * (tp[1] / (tp[1] + fp[1])) * (tp[1] / ta[1])) / ((tp[1] / (tp[1] + fp[1])) + (tp[1] / ta[1])))) f1resd20.append( dict(tp=tp[2], fp=fp[2], totalp=ta[2], conf=conf / 1000.0, prec=tp[2] / (tp[2] + fp[2]), rec=tp[2] / ta[2], f1score=(2 * (tp[2] / (tp[2] + fp[2])) * (tp[2] / ta[2])) / ((tp[2] / (tp[2] + fp[2])) + (tp[2] / ta[2])))) f1resd40.append( dict(tp=tp[3], fp=fp[3], totalp=ta[3], conf=conf / 1000.0, prec=tp[3] / (tp[3] + fp[3]), rec=tp[3] / ta[3], f1score=(2 * (tp[3] / (tp[3] + fp[3])) * (tp[3] / ta[3])) / ((tp[3] / (tp[3] + fp[3])) + (tp[3] / ta[3])))) sortedf1 = sorted(f1res, key=lambda k: k['f1score'], reverse=True) f1resd0 = 
sorted(f1resd0, key=lambda k: k['f1score'], reverse=True)
        f1resd10 = sorted(f1resd10, key=lambda k: k['f1score'], reverse=True)
        f1resd20 = sorted(f1resd20, key=lambda k: k['f1score'], reverse=True)
        f1resd40 = sorted(f1resd40, key=lambda k: k['f1score'], reverse=True)

        print(sortedf1[0])
        print("\n\n")
        print(f1resd0[0])
        print(f1resd10[0])
        print(f1resd20[0])
        print(f1resd40[0])
        # sortedf1 = sorted(f1res, key=lambda k: k['f1score'], reverse=True)
        # print(sortedf1[0:2])

    # write detections once; the commented COCO-style scoring is kept for reference
    json.dump(results, open(args.results, 'w'), indent=4)
    # coco_results = dataset.coco.loadRes(args.results)
    # coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
    # coco_eval.params.imgIds = img_ids  # score only ids we've used
    # coco_eval.evaluate()
    # coco_eval.accumulate()
    # coco_eval.summarize()
    # print(coco_eval.eval['params'])
    return results
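# The confidence sweep above recomputes precision/recall/F1 inline, and the per-class
# blocks divide by zero whenever a class has no predictions or no positives at a given
# threshold. A small guarded helper (an illustrative sketch, not part of the original
# scripts) avoids that:
def safe_prf1(tp, fp, total_positives):
    """Return (precision, recall, f1), using 0.0 where a ratio is undefined."""
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / total_positives if total_positives > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

# Example: safe_prf1(30, 10, 50) -> (0.75, 0.6, 0.666...)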
def main(): setup_default_logging() args, args_text = _parse_args() args.prefetcher = not args.no_prefetcher args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.device = 'cuda:0' args.world_size = 1 args.rank = 0 # global rank if args.distributed: args.device = 'cuda:%d' % args.local_rank torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.rank = torch.distributed.get_rank() assert args.rank >= 0 if args.distributed: logging.info( 'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' % (args.rank, args.world_size)) else: logging.info('Training with a single process on 1 GPU.') #torch.manual_seed(args.seed + args.rank) # create model config = get_efficientdet_config(args.model) config.redundant_bias = args.redundant_bias # redundant conv + BN bias layers (True to match official models) model = EfficientDet(config) if args.initial_checkpoint: load_checkpoint(model, args.initial_checkpoint) config.num_classes = 5 model.class_net.predict.conv_pw = create_conv2d(config.fpn_channels, 9 * 5, 1, padding=config.pad_type, bias=True) variance_scaling(model.class_net.predict.conv_pw.weight) model.class_net.predict.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) model = DetBenchTrain(model, config) model.cuda() print(model.model.class_net.predict.conv_pw) # FIXME create model factory, pretrained zoo # model = create_model( # args.model, # pretrained=args.pretrained, # num_classes=args.num_classes, # drop_rate=args.drop, # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path # drop_path_rate=args.drop_path, # drop_block_rate=args.drop_block, # global_pool=args.gp, # bn_tf=args.bn_tf, # bn_momentum=args.bn_momentum, # bn_eps=args.bn_eps, # checkpoint_path=args.initial_checkpoint) if args.local_rank == 0: logging.info('Model %s created, param count: %d' % (args.model, sum([m.numel() for m in model.parameters()]))) optimizer = create_optimizer(args, model) use_amp = False if has_apex and args.amp: model, optimizer = amp.initialize(model, optimizer, opt_level='O1') use_amp = True if args.local_rank == 0: logging.info('NVIDIA APEX {}. 
AMP {}.'.format( 'installed' if has_apex else 'not installed', 'on' if use_amp else 'off')) # optionally resume from a checkpoint resume_state = {} resume_epoch = None if args.resume: resume_state, resume_epoch = resume_checkpoint(_unwrap_bench(model), args.resume) if resume_state and not args.no_resume_opt: if 'optimizer' in resume_state: if args.local_rank == 0: logging.info('Restoring Optimizer state from checkpoint') optimizer.load_state_dict(resume_state['optimizer']) if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__: if args.local_rank == 0: logging.info('Restoring NVIDIA AMP state from checkpoint') amp.load_state_dict(resume_state['amp']) del resume_state model_ema = None if args.model_ema: # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper model_ema = ModelEma(model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else '') #resume=args.resume) # FIXME bit of a mess with bench if args.resume: load_checkpoint(_unwrap_bench(model_ema), args.resume, use_ema=True) if args.distributed: if args.sync_bn: try: if has_apex: model = convert_syncbn_model(model) else: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm( model) if args.local_rank == 0: logging.info( 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.' ) except Exception as e: logging.error( 'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1' ) if has_apex: model = DDP(model, delay_allreduce=True) else: if args.local_rank == 0: logging.info( "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP." ) model = DDP(model, device_ids=[args.local_rank ]) # can use device str in Torch >= 1.1 # NOTE: EMA model does not need to be wrapped by DDP lr_scheduler, num_epochs = create_scheduler(args, optimizer) start_epoch = 0 if args.start_epoch is not None: # a specified start_epoch will always override the resume epoch start_epoch = args.start_epoch elif resume_epoch is not None: start_epoch = resume_epoch if lr_scheduler is not None and start_epoch > 0: lr_scheduler.step(start_epoch) if args.local_rank == 0: logging.info('Scheduled epochs: {}'.format(num_epochs)) train_anno_set = 'train_small' train_annotation_path = os.path.join(args.data, 'annotations_small', f'train_annotations.json') train_image_dir = train_anno_set dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path) # FIXME cutmix/mixup worth investigating? 
# collate_fn = None # if args.prefetcher and args.mixup > 0: # collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes) loader_train = create_loader( dataset_train, input_size=config.image_size, batch_size=args.batch_size, is_training=True, use_prefetcher=args.prefetcher, #re_prob=args.reprob, # FIXME add back various augmentations #re_mode=args.remode, #re_count=args.recount, #re_split=args.resplit, #color_jitter=args.color_jitter, #auto_augment=args.aa, interpolation=args.train_interpolation, #mean=data_config['mean'], #std=data_config['std'], num_workers=args.workers, distributed=args.distributed, #collate_fn=collate_fn, pin_mem=args.pin_mem, ) #train_anno_set = 'valid_small' #train_annotation_path = os.path.join(args.data, 'annotations_small', f'valid_annotations.json') #train_image_dir = train_anno_set dataset_eval = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path) loader_eval = create_loader( dataset_eval, input_size=config.image_size, batch_size=args.validation_batch_size_multiplier * args.batch_size, is_training=False, use_prefetcher=args.prefetcher, interpolation=args.interpolation, #mean=data_config['mean'], #std=data_config['std'], num_workers=args.workers, #distributed=args.distributed, pin_mem=args.pin_mem, ) evaluator = COCOEvaluator(dataset_eval.coco, distributed=args.distributed) eval_metric = args.eval_metric best_metric = None best_epoch = None saver = None output_dir = '' if args.local_rank == 0: output_base = args.output if args.output else './output' exp_name = '-'.join( [datetime.now().strftime("%Y%m%d-%H%M%S"), args.model]) output_dir = get_outdir(output_base, 'train', exp_name) decreasing = True if eval_metric == 'loss' else False saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing) with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: f.write(args_text) try: for epoch in range(start_epoch, num_epochs): if args.distributed: loader_train.sampler.set_epoch(epoch) train_metrics = train_epoch(epoch, model, loader_train, optimizer, args, lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, use_amp=use_amp, model_ema=model_ema) if args.distributed and args.dist_bn in ('broadcast', 'reduce'): if args.local_rank == 0: logging.info( "Distributing BatchNorm running means and vars") distribute_bn(model, args.world_size, args.dist_bn == 'reduce') eval_metrics = validate(model, loader_eval, args, evaluator) if model_ema is not None and not args.model_ema_force_cpu: if args.distributed and args.dist_bn in ('broadcast', 'reduce'): distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') ema_eval_metrics = validate(model_ema.ema, loader_eval, args, log_suffix=' (EMA)') eval_metrics = ema_eval_metrics if lr_scheduler is not None: # step LR for next epoch lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) if saver is not None: update_summary(epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), write_header=best_metric is None) # save proper checkpoint with eval metric save_metric = eval_metrics[eval_metric] best_metric, best_epoch = saver.save_checkpoint( _unwrap_bench(model), optimizer, args, epoch=epoch, model_ema=_unwrap_bench(model_ema), metric=save_metric, use_amp=use_amp) except KeyboardInterrupt: pass if best_metric is not None: logging.info('*** Best metric: {0} (epoch {1})'.format( best_metric, best_epoch))
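# The class-head surgery in main() above (conv_pw re-created with 9 anchors x 5 classes
# outputs, bias filled with -log((1 - 0.01) / 0.01)) follows the RetinaNet focal-loss
# prior initialisation: the bias is chosen so every anchor starts with roughly a 1%
# foreground probability. A standalone sketch of that value:
import math

prior_prob = 0.01
bias_init = -math.log((1 - prior_prob) / prior_prob)  # ~= -4.595
# sigmoid(bias_init) ~= prior_prob, so the classification logits start near 0.01.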
def validate(args): setup_default_logging() # might as well try to validate something args.pretrained = args.pretrained or not args.checkpoint args.prefetcher = not args.no_prefetcher # create model bench = create_model(args.model, bench_task='predict', pretrained=args.pretrained, redundant_bias=args.redundant_bias, checkpoint_path=args.checkpoint, checkpoint_ema=args.use_ema) input_size = bench.config.image_size param_count = sum([m.numel() for m in bench.parameters()]) print('Model %s created, param count: %d' % (args.model, param_count)) bench = bench.cuda() if has_amp: print('Using AMP mixed precision.') bench = amp.initialize(bench, opt_level='O1') else: print('AMP not installed, running network in FP32.') if args.num_gpu > 1: bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu))) if 'test' in args.anno: annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json') image_dir = args.anno else: annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json') image_dir = args.anno dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path) loader = create_loader(dataset, input_size=input_size, batch_size=args.batch_size, use_prefetcher=args.prefetcher, interpolation=args.interpolation, fill_color=args.fill_color, num_workers=args.workers, mean=args.mean, std=args.std, pin_mem=args.pin_mem) img_ids = [] results = [] bench.eval() for i, (input, target) in enumerate(loader, 1): dumm_inp = input tisc = target['img_scale'] tisz = target['img_size'] break starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event( enable_timing=True) # repetitions = 300 # timings=np.zeros((repetitions,1)) #GPU-WARM-UP # print(enumerate()) for _ in range(10): _ = bench(dumm_inp, tisc, tisz) # MEASURE PERFORMANCE # dummy_input = torch.randn(1, 3,bench.config.image_size,bench.config.image_size,dtype=torch.float).to("cuda") print("starting") batch_time = AverageMeter() # end = time.time() with torch.no_grad(): for _ in range(2000): starter.record() _ = bench(dumm_inp, tisc, tisz) ender.record() # measure elapsed time torch.cuda.synchronize() curr_time = starter.elapsed_time(ender) batch_time.update(curr_time) # print(curr_time) # end = time.time() # if i % args.log_freq == 0: print( 'Test: [{0:>4d}/{1}] ' 'Time: {batch_time.val:.3f}ms ({batch_time.avg:.3f}ms, {rate_avg:>7.2f}/s) ' .format( i, len(loader), batch_time=batch_time, rate_avg=dumm_inp.size(0) / batch_time.avg, )) # json.dump(results, open(args.results, 'w'), indent=4) # if 'test' not in args.anno: # coco_results = dataset.coco.loadRes(args.results) # coco_eval = COCOeval(dataset.coco, coco_results, 'bbox') # coco_eval.params.imgIds = img_ids # score only ids we've used # coco_eval.evaluate() # coco_eval.accumulate() # coco_eval.summarize() return results
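# The latency benchmark above uses CUDA events rather than time.time(), because GPU
# kernels launch asynchronously. A minimal, self-contained sketch of the same timing
# pattern (the model and input here are placeholders, not the detection bench above):
import torch

model = torch.nn.Conv2d(3, 8, 3).cuda().eval()
dummy = torch.randn(1, 3, 224, 224, device='cuda')

starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)
with torch.no_grad():
    for _ in range(10):            # warm-up iterations
        model(dummy)
    starter.record()
    model(dummy)
    ender.record()
    torch.cuda.synchronize()       # wait for the GPU to finish before reading the timer
print('latency: {:.3f} ms'.format(starter.elapsed_time(ender)))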
def main(): setup_default_logging() args, args_text = _parse_args() args.pretrained_backbone = not args.no_pretrained_backbone args.prefetcher = not args.no_prefetcher args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.device = 'cuda:0' args.world_size = 1 args.rank = 0 # global rank if args.distributed: args.device = 'cuda:%d' % args.local_rank torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.rank = torch.distributed.get_rank() assert args.rank >= 0 if args.distributed: logging.info( 'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' % (args.rank, args.world_size)) else: logging.info('Training with a single process on 1 GPU.') torch.manual_seed(args.seed + args.rank) model = create_model( args.model, bench_task='train', pretrained=args.pretrained, pretrained_backbone=args.pretrained_backbone, redundant_bias=args.redundant_bias, checkpoint_path=args.initial_checkpoint, ) # FIXME decide which args to keep and overlay on config / pass to backbone # num_classes=args.num_classes, # drop_rate=args.drop, # drop_path_rate=args.drop_path, # drop_block_rate=args.drop_block, input_size = model.config.image_size if args.local_rank == 0: logging.info('Model %s created, param count: %d' % (args.model, sum([m.numel() for m in model.parameters()]))) model.cuda() optimizer = create_optimizer(args, model) use_amp = False if has_apex and args.amp: model, optimizer = amp.initialize(model, optimizer, opt_level='O1') use_amp = True if args.local_rank == 0: logging.info('NVIDIA APEX {}. AMP {}.'.format( 'installed' if has_apex else 'not installed', 'on' if use_amp else 'off')) # optionally resume from a checkpoint resume_state = {} resume_epoch = None if args.resume: resume_state, resume_epoch = resume_checkpoint(unwrap_bench(model), args.resume) if resume_state and not args.no_resume_opt: if 'optimizer' in resume_state: if args.local_rank == 0: logging.info('Restoring Optimizer state from checkpoint') optimizer.load_state_dict(resume_state['optimizer']) if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__: if args.local_rank == 0: logging.info('Restoring NVIDIA AMP state from checkpoint') amp.load_state_dict(resume_state['amp']) del resume_state model_ema = None if args.model_ema: # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper model_ema = ModelEma(model, decay=args.model_ema_decay) #resume=args.resume) # FIXME bit of a mess with bench if args.resume: load_checkpoint(unwrap_bench(model_ema), args.resume, use_ema=True) if args.distributed: if args.sync_bn: try: if has_apex: model = convert_syncbn_model(model) else: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm( model) if args.local_rank == 0: logging.info( 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.' ) except Exception as e: logging.error( 'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1' ) if has_apex: model = DDP(model, delay_allreduce=True) else: if args.local_rank == 0: logging.info( "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP." 
) model = DDP(model, device_ids=[args.local_rank ]) # can use device str in Torch >= 1.1 # NOTE: EMA model does not need to be wrapped by DDP lr_scheduler, num_epochs = create_scheduler(args, optimizer) start_epoch = 0 if args.start_epoch is not None: # a specified start_epoch will always override the resume epoch start_epoch = args.start_epoch elif resume_epoch is not None: start_epoch = resume_epoch if lr_scheduler is not None and start_epoch > 0: lr_scheduler.step(start_epoch) if args.local_rank == 0: logging.info('Scheduled epochs: {}'.format(num_epochs)) # train_anno_set = 'train2017' # train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json') # train_image_dir = train_anno_set #dataset_train = CocoDetection("/workspace/data/images", # "/workspace/data/datatrain90n.json") train_anno_set = 'train' train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json') train_image_dir = train_anno_set dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path) # FIXME cutmix/mixup worth investigating? # collate_fn = None # if args.prefetcher and args.mixup > 0: # collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes) loader_train = create_loader( dataset_train, input_size=input_size, batch_size=args.batch_size, is_training=True, use_prefetcher=args.prefetcher, #re_prob=args.reprob, # FIXME add back various augmentations #re_mode=args.remode, #re_count=args.recount, #re_split=args.resplit, #color_jitter=args.color_jitter, #auto_augment=args.aa, interpolation=args.train_interpolation, mean=[0.4533, 0.4744, 0.4722], #[0.4846, 0.5079, 0.5005],#[0.485, 0.456, 0.406], std=[0.2823, 0.2890, 0.3084], #[0.2687, 0.2705, 0.2869],#[0.485, 0.456, 0.406], num_workers=args.workers, distributed=args.distributed, #collate_fn=collate_fn, pin_mem=args.pin_mem, ) train_anno_set = 'val' train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json') train_image_dir = train_anno_set dataset_eval = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path) # train_anno_set = 'val' # train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json') # train_image_dir = train_anno_set # dataset_eval = CocoDetection("/workspace/data/val/images", # "/workspace/data/dataval90n.json") loader_eval = create_loader( dataset_eval, input_size=input_size, batch_size=args.validation_batch_size_multiplier * args.batch_size, is_training=False, use_prefetcher=args.prefetcher, interpolation=args.interpolation, mean=[0.4535, 0.4744, 0.4724], #[0.4851, 0.5083, 0.5009], std=[0.2835, 0.2903, 0.3098], #[0.2690, 0.2709, 0.2877], num_workers=args.workers, #distributed=args.distributed, pin_mem=args.pin_mem, ) # for xx,item in dataset_train : # print("out",type(xx)) # break # exit() array_of_gt = [] if args.local_rank == 0: for _, item in tqdm(dataset_eval): # print(item) for i in range(len(item['cls'])): array_of_gt.append( BoundingBox(imageName=str(item["img_id"]), classId=item["cls"][i], x=item["bbox"][i][1] * item['img_scale'], y=item["bbox"][i][0] * item['img_scale'], w=item["bbox"][i][3] * item['img_scale'], h=item["bbox"][i][2] * item['img_scale'], typeCoordinates=CoordinatesType.Absolute, bbType=BBType.GroundTruth, format=BBFormat.XYX2Y2, imgSize=(item['img_size'][0], item['img_size'][1]))) evaluator = COCOEvaluator(dataset_eval.coco, distributed=args.distributed, gtboxes=array_of_gt) eval_metric = args.eval_metric 
best_metric = None best_epoch = None saver = None output_dir = '' if args.local_rank == 0: output_base = args.output if args.output else './output' exp_name = '-'.join( [datetime.now().strftime("%Y%m%d-%H%M%S"), args.model]) output_dir = get_outdir(output_base, 'train', exp_name) decreasing = True if eval_metric == 'loss' else False saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing) with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: f.write(args_text) # print(model) try: for epoch in range(start_epoch, num_epochs): if args.distributed: loader_train.sampler.set_epoch(epoch) train_metrics = train_epoch(epoch, model, loader_train, optimizer, args, lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, use_amp=use_amp, model_ema=model_ema) if args.distributed and args.dist_bn in ('broadcast', 'reduce'): if args.local_rank == 0: logging.info( "Distributing BatchNorm running means and vars") distribute_bn(model, args.world_size, args.dist_bn == 'reduce') # the overhead of evaluating with coco style datasets is fairly high, so just ema or non, not both if model_ema is not None: if args.distributed and args.dist_bn in ('broadcast', 'reduce'): distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') eval_metrics = validate(model_ema.ema, loader_eval, args, evaluator, log_suffix=' (EMA)') else: eval_metrics = validate(model, loader_eval, args, evaluator) if lr_scheduler is not None: # step LR for next epoch lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) if saver is not None: update_summary(epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), write_header=best_metric is None) # save proper checkpoint with eval metric save_metric = eval_metrics[eval_metric] best_metric, best_epoch = saver.save_checkpoint( unwrap_bench(model), optimizer, args, epoch=epoch, model_ema=unwrap_bench(model_ema), metric=save_metric, use_amp=use_amp) except KeyboardInterrupt: pass if best_metric is not None: logging.info('*** Best metric: {0} (epoch {1})'.format( best_metric, best_epoch))
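# ModelEma used in the training loop above keeps an exponential moving average of the
# weights; each update is essentially ema_param = decay * ema_param + (1 - decay) * param.
# A hedged, minimal sketch of that update rule (not the timm implementation itself,
# which also tracks buffers via the state dict; decay value here is only illustrative):
import torch

def ema_update(ema_model, model, decay=0.9998):
    with torch.no_grad():
        for ema_p, p in zip(ema_model.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)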
def eval_model(model_name, paper_model_name, paper_arxiv_id, batch_size=64, model_description=''):
    # create model
    bench = create_model(
        model_name,
        bench_task='predict',
        pretrained=True,
    )
    bench.eval()
    input_size = bench.config.image_size
    param_count = sum([m.numel() for m in bench.parameters()])
    print('Model %s created, param count: %d' % (model_name, param_count))

    bench = bench.cuda()
    if has_amp:
        print('Using AMP mixed precision.')
        bench = amp.initialize(bench, opt_level='O1')
    else:
        print('AMP not installed, running network in FP32.')

    annotation_path = os.path.join(DATA_ROOT, 'annotations', f'instances_{ANNO_SET}.json')
    evaluator = COCOEvaluator(
        root=DATA_ROOT,
        model_name=paper_model_name,
        model_description=model_description,
        paper_arxiv_id=paper_arxiv_id)

    dataset = CocoDetection(os.path.join(DATA_ROOT, ANNO_SET), annotation_path)

    loader = create_loader(
        dataset,
        input_size=input_size,
        batch_size=batch_size,
        use_prefetcher=True,
        fill_color='mean',
        num_workers=4,
        pin_mem=True)

    iterator = tqdm.tqdm(loader, desc="Evaluation", mininterval=5)
    evaluator.reset_time()

    with torch.no_grad():
        for i, (input, target) in enumerate(iterator):
            output = bench(input, target['img_scale'], target['img_size'])
            output = output.cpu()
            sample_ids = target['img_id'].cpu()
            results = []
            for index, sample in enumerate(output):
                image_id = int(sample_ids[index])
                for det in sample:
                    score = float(det[4])
                    if score < .001:  # stop when below this threshold, scores in descending order
                        break
                    coco_det = dict(
                        image_id=image_id,
                        bbox=det[0:4].tolist(),
                        score=score,
                        category_id=int(det[5]))
                    results.append(coco_det)
            evaluator.add(results)

            if evaluator.cache_exists:
                break

    evaluator.save()
def validate(args): setup_dllogger(0, filename=args.dllogger_file) if args.checkpoint != '': args.pretrained = True args.prefetcher = not args.no_prefetcher if args.waymo: assert args.waymo_val is not None memory_format = (torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format) args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.device = 'cuda:0' args.world_size = 1 args.rank = 0 # global rank if args.distributed: torch.cuda.manual_seed_all(args.seed) args.device = 'cuda:%d' % args.local_rank torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.rank = torch.distributed.get_rank() # Set device limit on the current device # cudaLimitMaxL2FetchGranularity = 0x05 pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int)) _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) assert pValue.contents.value == 128 assert args.rank >= 0 # create model bench = create_model(args.model, input_size=args.input_size, num_classes=args.num_classes, bench_task='predict', pretrained=args.pretrained, redundant_bias=args.redundant_bias, checkpoint_path=args.checkpoint, checkpoint_ema=args.use_ema, soft_nms=args.use_soft_nms, strict_load=False) input_size = bench.config.image_size data_config = bench.config param_count = sum([m.numel() for m in bench.parameters()]) print('Model %s created, param count: %d' % (args.model, param_count)) bench = bench.cuda().to(memory_format=memory_format) if args.distributed > 1: raise ValueError( "Evaluation is supported only on single GPU. args.num_gpu must be 1" ) bench = DDP( bench, device_ids=[args.device] ) # torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu))) if args.waymo: annotation_path = args.waymo_val_annotation image_dir = args.waymo_val else: if 'test' in args.anno: annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json') image_dir = 'test2017' else: annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json') image_dir = args.anno dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path, data_config) evaluator = COCOEvaluator(dataset.coco, distributed=args.distributed, waymo=args.waymo) loader = create_loader(dataset, input_size=input_size, batch_size=args.batch_size, use_prefetcher=args.prefetcher, interpolation=args.interpolation, fill_color=args.fill_color, num_workers=args.workers, distributed=args.distributed, pin_mem=args.pin_mem, memory_format=memory_format) img_ids = [] results = [] dllogger_metric = {} bench.eval() batch_time = AverageMeter() throughput = AverageMeter() end = time.time() total_time_start = time.time() with torch.no_grad(): for i, (input, target) in enumerate(loader): with torch.cuda.amp.autocast(enabled=args.amp): output = bench(input, target['img_scale'], target['img_size']) batch_time.update(time.time() - end) throughput.update(input.size(0) / batch_time.val) evaluator.add_predictions(output, target) torch.cuda.synchronize() # measure elapsed time if i == 9: batch_time.reset() throughput.reset() if args.rank == 0 and i % args.log_freq == 0: print( 'Test: [{0:>4d}/{1}] ' 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' .format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, )) end = time.time() 
    dllogger_metric['total_inference_time'] = time.time() - total_time_start
    dllogger_metric['inference_throughput'] = throughput.avg
    dllogger_metric['inference_time'] = 1000 / throughput.avg
    total_time_start = time.time()
    mean_ap = 0.
    if not args.inference:
        if 'test' not in args.anno:
            mean_ap = evaluator.evaluate()
        else:
            evaluator.save_predictions(args.results)
        dllogger_metric['map'] = mean_ap
        dllogger_metric['total_eval_time'] = time.time() - total_time_start
    else:
        evaluator.save_predictions(args.results)

    if not args.distributed or args.rank == 0:
        dllogger.log(step=(), data=dllogger_metric, verbosity=0)

    return results
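# Units sanity check for the metrics above (assumed): throughput.avg is images per
# second, so 1000 / throughput.avg is the average per-image latency in milliseconds.
throughput_avg = 40.0                 # images / second (example value)
latency_ms = 1000.0 / throughput_avg  # 25.0 ms per image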
def validate(args): setup_default_logging() def setthresh(): if args.checkpoint.split("/")[-1].split( "_")[0] in getthresholds.keys(): return getthresholds[args.checkpoint.split("/")[-1].split("_")[0]] else: a = [] [a.append(args.threshold) for x in range(4)] return a # might as well try to validate something args.pretrained = args.pretrained or not args.checkpoint args.prefetcher = not args.no_prefetcher # create model bench = create_model( args.model, bench_task='predict', pretrained=args.pretrained, redundant_bias=args.redundant_bias, checkpoint_path=args.checkpoint, checkpoint_ema=args.use_ema, ) input_size = bench.config.image_size param_count = sum([m.numel() for m in bench.parameters()]) print('Model %s created, param count: %d' % (args.model, param_count)) bench = bench.cuda() if has_amp: print('Using AMP mixed precision.') bench = amp.initialize(bench, opt_level='O1') else: print('AMP not installed, running network in FP32.') if args.num_gpu > 1: bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu))) if 'test' in args.anno: annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json') image_dir = args.anno elif 'val' in args.anno: annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json') image_dir = args.anno # else: # annotation_path = os.path.join(args.data, f'{args.anno}.json') # image_dir = args.anno print(os.path.join(args.data, image_dir), annotation_path) dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path) loader = create_loader(dataset, input_size=input_size, batch_size=args.batch_size, use_prefetcher=args.prefetcher, interpolation=args.interpolation, fill_color=args.fill_color, num_workers=args.workers, pin_mem=args.pin_mem, mean=args.mean, std=args.std) if 'test' in args.anno: threshold = float(args.threshold) # elif 'detector' in args.anno: # threshold = min(getthresholds['d0']) else: threshold = .001 img_ids = [] results = [] writetofilearrtay = [] bench.eval() batch_time = AverageMeter() end = time.time() with torch.no_grad(): for i, (input, target) in enumerate(loader): output = bench(input, target['img_scale'], target['img_size']) output = output.cpu() # print(target['img_id']) sample_ids = target['img_id'].cpu() for index, sample in enumerate(output): image_id = int(sample_ids[index]) # if 'test' in args.anno : # tempWritetoFile = [] # tempWritetoFile.append(getimageNamefromid(image_id)) for det in sample: score = float(det[4]) if score < threshold: # stop when below this threshold, scores in descending order coco_det = dict(image_id=image_id, category_id=-1) img_ids.append(image_id) results.append(coco_det) break coco_det = dict(image_id=image_id, bbox=det[0:4].tolist(), score=score, category_id=int(det[5]), sizes=target['img_size'].tolist()[0]) img_ids.append(image_id) results.append(coco_det) # exit() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.log_freq == 0: print( 'Test: [{0:>4d}/{1}] ' 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' .format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, )) # if 'test' in args.anno : if not os.path.exists(args.tosave): os.makedirs(args.tosave) from itertools import groupby results.sort(key=lambda x: x['image_id']) count = 0 for k, v in tqdm(groupby(results, key=lambda x: x['image_id'])): # print(args.data +"/" + str(getimageNamefromid(k))) img = drawonimage( os.path.join(args.data, image_dir, str(getimageNamefromid(k))), 
                v, setthresh())
            cv2.imwrite(args.tosave + "/" + str(getimageNamefromid(k)), img)
            count += 1
            # print(i['category_id'], " ", i['bbox'][0], " ", i['bbox'][1], " ", i['bbox'][2], " ", i['bbox'][3], " ")
        print("generated predictions for ", count, " images.")
    return results
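# drawonimage() is not shown in this section; a hypothetical minimal box-drawing helper
# built on the standard OpenCV API might look like the sketch below (detections assumed
# to be dicts with 'bbox' in [x, y, w, h] pixels, 'score', and 'category_id').
import cv2

def draw_detections_sketch(image_path, detections, score_thresh=0.3):
    img = cv2.imread(image_path)
    for det in detections:
        if det['category_id'] < 0 or det['score'] < score_thresh:
            continue
        x, y, w, h = [int(round(v)) for v in det['bbox']]
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(img, str(det['category_id']), (x, max(y - 5, 0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    return img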
def validate(args): # might as well try to validate something args.pretrained = args.pretrained or not args.checkpoint args.prefetcher = not args.no_prefetcher # create model config = get_efficientdet_config(args.model) model = EfficientDet(config) if args.checkpoint: load_checkpoint(model, args.checkpoint) param_count = sum([m.numel() for m in model.parameters()]) logging.info('Model %s created, param count: %d' % (args.model, param_count)) bench = DetBenchEval(model, config) bench.model = bench.model.cuda() if has_amp: bench.model = amp.initialize(bench.model, opt_level='O1') if args.num_gpu > 1: bench.model = torch.nn.DataParallel(bench.model, device_ids=list(range( args.num_gpu))) if 'test' in args.anno: annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json') image_dir = 'test2017' else: annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json') image_dir = args.anno dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path) loader = create_loader(dataset, input_size=config.image_size, batch_size=args.batch_size, use_prefetcher=args.prefetcher, interpolation=args.interpolation, num_workers=args.workers) img_ids = [] results = [] model.eval() batch_time = AverageMeter() end = time.time() with torch.no_grad(): for i, (input, target) in enumerate(loader): output = bench(input, target['img_id'], target['scale']) for batch_out in output: for det in batch_out: image_id = int(det[0]) score = float(det[5]) coco_det = { 'image_id': image_id, 'bbox': det[1:5].tolist(), 'score': score, 'category_id': int(det[6]), } img_ids.append(image_id) results.append(coco_det) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.log_freq == 0: print( 'Test: [{0:>4d}/{1}] ' 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' .format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, )) json.dump(results, open(args.results, 'w'), indent=4) if 'test' not in args.anno: coco_results = dataset.coco.loadRes(args.results) coco_eval = COCOeval(dataset.coco, coco_results, 'bbox') coco_eval.params.imgIds = img_ids # score only ids we've used coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return results
def main(): setup_default_logging() ## TODO(sugh) replace args, args_text = _parse_args() set_affinity(args.local_rank) random.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) args.prefetcher = not args.no_prefetcher args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.device = 'cuda:0' args.world_size = 1 args.rank = 0 # global rank if args.distributed: torch.cuda.manual_seed_all(args.seed) args.device = 'cuda:%d' % args.local_rank torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.rank = torch.distributed.get_rank() # Set device limit on the current device # cudaLimitMaxL2FetchGranularity = 0x05 pValue = ctypes.cast((ctypes.c_int*1)(), ctypes.POINTER(ctypes.c_int)) _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) assert pValue.contents.value == 128 assert args.rank >= 0 setup_dllogger(args.rank, filename=args.dllogger_file) if args.distributed: logging.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' % (args.rank, args.world_size)) else: logging.info('Training with a single process on 1 GPU.') if args.waymo: if (args.waymo_train is not None and args.waymo_val is None) or (args.waymo_train is None and args.waymo_val is not None): raise Exception("waymo_train or waymo_val is not set") memory_format = ( torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format ) model = create_model( args.model, input_size=args.input_size, num_classes=args.num_classes, bench_task='train', pretrained=args.pretrained, pretrained_backbone_path=args.pretrained_backbone_path, redundant_bias=args.redundant_bias, checkpoint_path=args.initial_checkpoint, label_smoothing=args.smoothing, fused_focal_loss=args.fused_focal_loss, remove_params=args.remove_weights, freeze_layers=args.freeze_layers, strict_load=False ) # FIXME decide which args to keep and overlay on config / pass to backbone # num_classes=args.num_classes, input_size = model.config.image_size data_config = model.config print("Input size to be passed to dataloaders: {}".format(input_size)) print("Image size used in model: {}".format(model.config.image_size)) if args.rank == 0: dllogger.log(step='PARAMETER', data={'model_name':args.model, 'param_count': sum([m.numel() for m in model.parameters()])}) model = model.cuda().to(memory_format=memory_format) # # optionally resume from a checkpoint if args.distributed: if args.sync_bn: try: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if args.local_rank == 0: logging.info( 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') except Exception as e: logging.error('Failed to enable Synchronized BatchNorm. 
Install Apex or Torch >= 1.1') optimizer = create_optimizer(args, model) scaler = torch.cuda.amp.GradScaler(enabled=args.amp) resume_state = {} resume_epoch = None output_base = args.output if args.output else './output' resume_checkpoint_path = get_latest_checkpoint(os.path.join(output_base, 'train')) if args.resume and resume_checkpoint_path is not None: print("Trying to load checkpoint from {}".format(resume_checkpoint_path)) resume_state, resume_epoch = resume_checkpoint(unwrap_bench(model), resume_checkpoint_path) if resume_epoch is not None: print("Resume training from {} epoch".format(resume_epoch)) if resume_state and not args.no_resume_opt: if 'optimizer' in resume_state: if args.local_rank == 0: logging.info('Restoring Optimizer state from checkpoint') optimizer.load_state_dict(resume_state['optimizer']) if args.amp and 'scaler' in resume_state: if args.local_rank == 0: logging.info('Restoring NVIDIA AMP state from checkpoint') scaler.load_state_dict(resume_state['scaler']) del resume_state model_ema = None if args.model_ema: # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper if args.resume and resume_checkpoint_path is not None: resume_path = resume_checkpoint_path else: resume_path = '' model_ema = ModelEma( model, decay=args.model_ema_decay, resume=resume_path) if args.distributed: if args.local_rank == 0: logging.info("Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.") model = DDP(model, device_ids=[args.device]) # can use device str in Torch >= 1.1 # NOTE: EMA model does not need to be wrapped by DDP lr_scheduler, num_epochs = create_scheduler(args, optimizer) start_epoch = 0 if args.start_epoch is not None: # a specified start_epoch will always override the resume epoch start_epoch = args.start_epoch elif resume_epoch is not None: start_epoch = resume_epoch if lr_scheduler is not None and start_epoch > 0: lr_scheduler.step(start_epoch) if args.local_rank == 0: dllogger.log(step="PARAMETER", data={'Scheduled_epochs': num_epochs}, verbosity=0) # Benchmark will always override every other setting. 
if args.benchmark: start_epoch = 0 num_epochs = args.epochs if args.waymo: train_annotation_path = args.waymo_train_annotation train_image_dir = args.waymo_train else: train_anno_set = 'train2017' train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json') train_image_dir = train_anno_set dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path, data_config) loader_train = create_loader( dataset_train, input_size=input_size, batch_size=args.batch_size, is_training=True, use_prefetcher=args.prefetcher, interpolation=args.train_interpolation, num_workers=args.workers, distributed=args.distributed, pin_mem=args.pin_mem, memory_format=memory_format ) loader_train_iter = iter(loader_train) steps_per_epoch = int(np.ceil( len(dataset_train) / (args.world_size * args.batch_size) )) if args.waymo: val_annotation_path = args.waymo_val_annotation val_image_dir = args.waymo_val else: val_anno_set = 'val2017' val_annotation_path = os.path.join(args.data, 'annotations', f'instances_{val_anno_set}.json') val_image_dir = val_anno_set dataset_eval = CocoDetection(os.path.join(args.data, val_image_dir), val_annotation_path, data_config) loader_eval = create_loader( dataset_eval, input_size=input_size, batch_size=args.validation_batch_size_multiplier * args.batch_size, is_training=False, use_prefetcher=args.prefetcher, interpolation=args.interpolation, num_workers=args.workers, distributed=args.distributed, pin_mem=args.pin_mem, memory_format=memory_format ) evaluator = COCOEvaluator(dataset_eval.coco, distributed=args.distributed, waymo=args.waymo) eval_metric = args.eval_metric eval_metrics = None train_metrics = {} best_metric = -1 is_best = False best_epoch = None saver = None output_dir = '' if args.rank == 0: output_base = args.output if args.output else './output' output_dir = get_outdirectory(output_base, 'train') decreasing = True if eval_metric == 'loss' else False saver = CheckpointSaver(checkpoint_dir=output_dir) with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: f.write(args_text) try: for epoch in range(start_epoch, num_epochs): if args.distributed: loader_train.sampler.set_epoch(epoch) train_metrics = train_epoch( epoch, steps_per_epoch, model, loader_train_iter, optimizer, args, lr_scheduler=lr_scheduler, output_dir=output_dir, use_amp=args.amp, scaler=scaler, model_ema=model_ema) if args.distributed and args.dist_bn in ('broadcast', 'reduce'): if args.local_rank == 0: logging.info("Distributing BatchNorm running means and vars") distribute_bn(model, args.world_size, args.dist_bn == 'reduce') # the overhead of evaluating with coco style datasets is fairly high, so just ema or non, not both if model_ema is not None: if args.distributed and args.dist_bn in ('broadcast', 'reduce'): distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') if epoch >= args.eval_after: eval_metrics = validate(model_ema.ema, loader_eval, args, evaluator, epoch, log_suffix=' (EMA)') else: eval_metrics = validate(model, loader_eval, args, evaluator, epoch) lr_scheduler.step(epoch + 1) if saver is not None and args.rank == 0 and epoch % args.save_checkpoint_interval == 0: if eval_metrics is not None: # save proper checkpoint with eval metric is_best = eval_metrics[eval_metric] > best_metric best_metric = max( eval_metrics[eval_metric], best_metric ) best_epoch = epoch else: is_best = False best_metric = 0 saver.save_checkpoint(model, optimizer, epoch, model_ema=model_ema, metric=best_metric, is_best=is_best) except 
KeyboardInterrupt:
        dllogger.flush()
        torch.cuda.empty_cache()

    if best_metric > 0:
        train_metrics.update({'best_map': best_metric, 'best_epoch': best_epoch})
    if eval_metrics is not None:
        train_metrics.update(eval_metrics)
    dllogger.log(step=(), data=train_metrics, verbosity=0)