def train(hyp, opt, device, tb_writer=None, wandb=None):
    logger.info(f'Hyperparameters {hyp}')
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Directories
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = save_dir / 'results.txt'

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        # with torch_distributed_zero_first(rank):
        #     attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = []  # parameter names to freeze (full or partial)
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            print('freezing %s' % k)
            v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if rank in [-1, 0] and wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(config=opt, resume="allow",
                               project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
                               name=save_dir.stem,
                               id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
    loggers = {'wandb': wandb}  # loggers dict

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[-1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # TODO: make the Darknet cfg path a configurable option
    cfg_model = Darknet('cfg/yolov5s_v4.cfg', (opt.img_size[0], opt.img_size[0])).to(device)
    # cfg_model = Darknet('cfg/yolov5s_v3.cfg', (416, 416)).to(device)
    copy_weight_v4(model, cfg_model)

    # Pruning setup: opt.sr enables sparse training, opt.prune selects the pruning strategy
    if opt.prune == 1:
        CBL_idx, _, prune_idx, shortcut_idx, _ = parse_module_defs2(cfg_model.module_defs)
        if opt.sr:
            print('shortcut sparse training')
    elif opt.prune == 0:
        CBL_idx, _, prune_idx = parse_module_defs(cfg_model.module_defs)
        if opt.sr:
            print('normal sparse training ')

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # EMA
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images, rect=opt.rect, rank=rank,
                                            world_size=opt.world_size, workers=opt.workers,
                                            image_weights=opt.image_weights, quad=opt.quad)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,  # testloader
                                       hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True,
                                       rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5)[0]

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, save_dir, loggers)
                if tb_writer:
                    tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale hyp['cls'] to class count
    hyp['obj'] *= imgsz ** 2 / 640. ** 2 * 3. / nl  # scale hyp['obj'] to image size and output layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # BN layer weights of the prunable layers before training
    for idx in prune_idx:
        bn_weights = gather_bn_weights(cfg_model.module_list, [idx])
        tb_writer.add_histogram('before_train_perlayer_bn_weights/hist', bn_weights.numpy(), idx, bins='doane')

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@0.5, mAP@0.5:0.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\n'
                'Using %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()

        sr_flag = get_sr_flag(epoch, opt.sr)

        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi,
                                        [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward (AMP path disabled)
            # with amp.autocast(enabled=cuda):
            #     pred = model(imgs)  # forward
            #     loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
            #     if rank != -1:
            #         loss *= opt.world_size  # gradient averaged between devices in DDP mode
            #     if opt.quad:
            #         loss *= 4.

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            # scaler.scale(loss).backward()
            loss.backward()

            idx2mask = None
            # if opt.sr and opt.prune == 1 and epoch > opt.epochs * 0.5:
            #     idx2mask = get_mask2(model, prune_idx, 0.85)
            # copy_weight(model, cfg_model)
            BNOptimizer.updateBN(sr_flag, cfg_model.module_list, opt.s, prune_idx, epoch, idx2mask, opt)

            # Optimize
            if ni % accumulate == 0:
                # scaler.step(optimizer)  # optimizer.step
                # scaler.update()
                optimizer.step()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if plots and ni < 3:
                    f = save_dir / f'train_batch{ni}.jpg'  # filename
                    Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 3 and wandb:
                    wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name)
                                           for x in save_dir.glob('train*.jpg')]})

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=save_dir,
                                                 plots=plots and final_epoch,
                                                 log_imgs=opt.log_imgs if wandb else 0)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP@0.5, mAP@0.5:0.95, val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Log
            tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                    'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                    'x/lr0', 'x/lr1', 'x/lr2']  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
                if wandb:
                    wandb.log({tag: x})  # W&B

            # BN layer weights after pruning/sparse training
            bn_weights = gather_bn_weights(cfg_model.module_list, prune_idx)
            tb_writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane')

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@0.5, mAP@0.5:0.95]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict(),
                            'wandb_id': wandb_run.id if wandb else None}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    # BN layer weights of the prunable layers after training
    for idx in prune_idx:
        bn_weights = gather_bn_weights(cfg_model.module_list, [idx])
        tb_writer.add_histogram('after_train_perlayer_bn_weights/hist', bn_weights.numpy(), idx, bins='doane')

    if rank in [-1, 0]:
        # Strip optimizers
        final = best if best.exists() else last  # final model
        for f in [last, best]:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
        if opt.bucket:
            os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

        # Plots
        if plots:
            plot_results(save_dir=save_dir)  # save as results.png
            if wandb:
                files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png']
                wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files
                                       if (save_dir / f).exists()]})
                if opt.log_artifacts:
                    wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem)

        # Test best.pt
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
            for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]):  # speed, mAP tests
                results, _, _ = test.test(opt.data,
                                          batch_size=total_batch_size,
                                          imgsz=imgsz_test,
                                          conf_thres=conf,
                                          iou_thres=iou,
                                          model=attempt_load(final, device).half(),
                                          single_cls=opt.single_cls,
                                          dataloader=testloader,
                                          save_dir=save_dir,
                                          save_json=save_json,
                                          plots=False)

    else:
        dist.destroy_process_group()

    wandb.run.finish() if wandb and wandb.run else None
    torch.cuda.empty_cache()
    return results
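
# Note: the scheduler above uses one_cycle(1, hyp['lrf'], epochs), a helper that is imported rather
# than defined in this file. A minimal sketch is given below, assumed to match the upstream YOLOv5
# utility of the same name (a cosine ramp from 1 down to hyp['lrf'] over `epochs` steps); it is shown
# here only for reference and is not part of the original script.
def one_cycle(y1=0.0, y2=1.0, steps=100):
    # sinusoidal ramp from y1 to y2 over `steps` (https://arxiv.org/pdf/1812.01187.pdf)
    return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1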
def train(hyp, opt, device, tb_writer=None):
    print(f'Hyperparameters {hyp}')
    log_dir = tb_writer.log_dir if tb_writer else 'runs/evolve'  # run directory
    wdir = str(Path(log_dir) / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = log_dir + os.sep + 'results.txt'
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank
    # TODO: Use DDP logging. Only the first process is allowed to log.

    # Save run settings
    with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(Path(log_dir) / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Remove previous results
    if rank in [-1, 0]:
        for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
            os.remove(f)

    # Create model
    model = Model(opt.cfg, nc=nc).to(device)
    # TODO: make the Darknet cfg path a configurable option
    cfg_model = Darknet('cfg/yolov5s_v2_hand.cfg', (opt.img_size[0], opt.img_size[0])).to(device)

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html
    # all-reduce operation is carried out during loss.backward().
    # Thus, there would be redundant all-reduce communications in an accumulation procedure,
    # which means, the result is still right but the training speed gets slower.
    # TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation
    # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg2.append(v)  # biases
            elif '.weight' in k and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Load Model
    with torch_distributed_zero_first(rank):
        attempt_download(weights)
    start_epoch, best_fitness = 0, 0.0
    if weights.endswith('.pt'):  # pytorch format
        ckpt = torch.load(weights, map_location=device)  # load checkpoint

        # load model
        try:
            exclude = ['anchor']  # exclude keys
            ckpt['model'] = {k: v for k, v in ckpt['model'].float().state_dict().items()
                             if k in model.state_dict() and not any(x in k for x in exclude)
                             and model.state_dict()[k].shape == v.shape}
            model.load_state_dict(ckpt['model'], strict=False)
            print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights))
        except KeyError as e:
            s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \
                "Please delete or update %s and try again, or use --weights '' to train from scratch." \
                % (weights, opt.cfg, weights, weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                  % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    # Copy the model weights into the Darknet cfg model; currently only yolov5s is supported
    copy_weight(model, cfg_model)

    # Pruning setup: opt.sr enables sparse training, opt.prune selects the pruning strategy
    if opt.prune == 1:
        CBL_idx, _, prune_idx, shortcut_idx, _ = parse_module_defs2(cfg_model.module_defs)
        if opt.sr:
            print('shortcut sparse training')
    elif opt.prune == 0:
        CBL_idx, _, prune_idx = parse_module_defs(cfg_model.module_defs)
        if opt.sr:
            print('normal sparse training ')

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[rank], output_device=rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images, rect=opt.rect, local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        # local_rank is set to -1 because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False,
                                       cache=opt.cache_images, rect=True, local_rank=-1,
                                       world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
            tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # BN layer weights before pruning
    for idx in prune_idx:
        bn_weights = gather_bn_weights(cfg_model.module_list, [idx])
        tb_writer.add_histogram('before_train_perlayer_bn_weights/hist', bn_weights.numpy(), idx, bins='doane')

    # Start training
    t0 = time.time()
    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    # scaler = amp.GradScaler(enabled=cuda)
    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
                dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)  # was torch.from_tensor (no such API)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()

        sr_flag = get_sr_flag(epoch, opt.sr)

        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            loss.backward()

            # Autocast (disabled)
            # with amp.autocast(enabled=cuda):
            #     # Forward
            #     pred = model(imgs)
            #     # Loss
            #     loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
            #     if rank != -1:
            #         loss *= opt.world_size  # gradient averaged between devices in DDP mode
            #     # if not torch.isfinite(loss):
            #     #     print('WARNING: non-finite loss, ending training ', loss_items)
            #     #     return results
            # # Backward
            # scaler.scale(loss).backward()

            idx2mask = None
            # if opt.sr and opt.prune == 1 and epoch > opt.epochs * 0.5:
            #     idx2mask = get_mask2(model, prune_idx, 0.85)
            # copy_weight(model, cfg_model)
            BNOptimizer.updateBN(sr_flag, cfg_model.module_list, opt.s, prune_idx, epoch, idx2mask, opt)

            # Optimize
            if ni % accumulate == 0:
                # scaler.step(optimizer)  # optimizer.step
                # scaler.update()
                optimizer.step()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(Path(log_dir) / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
                                                 model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    tb_writer.add_scalar(tag, x, epoch)

                # BN layer weights after pruning/sparse training
                bn_weights = gather_bn_weights(cfg_model.module_list, prune_idx)
                tb_writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane')

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema.module if hasattr(ema, 'module') else ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
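
# Both train() variants above call BNOptimizer.updateBN() after loss.backward() to drive sparse
# training. That helper is imported, not defined here; the sketch below is a hypothetical stand-in
# (the function name, the module_list[idx][1] layout, and the signature are assumptions based on
# common BN channel-pruning implementations, not this repo's confirmed API). The idea is an L1
# subgradient added to the gradient of each prunable BatchNorm layer's scale factor (gamma), so that
# unimportant channels are pushed toward zero before pruning.
def _update_bn_sketch(sr_flag, module_list, s, prune_idx):
    """Hypothetical stand-in for BNOptimizer.updateBN: L1 penalty on BN scale factors."""
    if not sr_flag:
        return  # sparse training disabled for this epoch
    for idx in prune_idx:
        bn = module_list[idx][1]  # assumes each entry is Sequential(Conv2d, BatchNorm2d, activation)
        # d/dw |w| = sign(w): add s * sign(gamma) to the gradient before optimizer.step()
        bn.weight.grad.data.add_(s * torch.sign(bn.weight.data))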
def train(hyp):
    cfg = opt.cfg
    # data = opt.data
    epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = max(round(64 / batch_size), 1)  # accumulate n times before optimizer update (bs 64)
    weights = opt.weights  # initial training weights

    # Image Sizes
    gs = 32  # (pixels) grid size max stride

    # Configure run
    init_seeds()
    nc = 1 if opt.single_cls else int(len(open(opt.names_classes).readlines()))  # number of classes
    hyp['cls'] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    # model = Darknet(opt.cfg, opt.input_size, opt.algorithm_type).to(device)
    from utils.model_prune import Darknet_sss
    model = Darknet_sss(opt.cfg, opt.input_size, opt.algorithm_type).to(device)
    CBL_idx, _, prune_idx, ignore_idx = parse_module_defs(model.module_defs)

    # Optimizer
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2 += [v]  # biases
        elif 'Conv2d.weight' in k:
            pg1 += [v]  # apply weight_decay
        else:
            pg0 += [v]  # all else

    if opt.adam:
        # hyp['lr0'] *= 0.1  # reduce lr (i.e. SGD=5E-3, Adam=5E-4)
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    if opt.sss:
        optimizer2 = APGNAG([{'params': model.lambda_block}], lr=hyp['lr0'], momentum=opt.momentum,
                            gamma=opt.gamma_data)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    start_epoch = 0
    best_fitness = 0.0
    # attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        ckpt = torch.load(weights, map_location=device)

        # load model
        try:
            # ckpt['model'] = {k: v for k, v in ckpt['model'].state_dict().items()
            #                  if model.state_dict()[k].numel() == v.numel()}
            # model.load_state_dict(ckpt['model'], strict=False)
            model.load_state_dict(torch.load(opt.weights, map_location=device)['model'])
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        # if ckpt['optimizer'] is not None:
        #     optimizer.load_state_dict(ckpt['optimizer'])
        #     best_fitness = ckpt['best_fitness']
        best_fitness = 1e-5

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                  % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)

    if opt.freeze_layers:
        output_layer_indices = [idx - 1 for idx, module in enumerate(model.module_list)
                                if isinstance(module, YOLOLayer)]
        freeze_layer_indices = [x for x in range(len(model.module_list))
                                if (x not in output_layer_indices) and (x - 1 not in output_layer_indices)]
        for idx in freeze_layer_indices:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch - 1  # see link below
    if opt.sss:
        scheduler2 = lr_scheduler.LambdaLR(optimizer2, lr_lambda=lf)
        scheduler2.last_epoch = start_epoch - 1  # see link below
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, '.-', label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    model = torch.nn.DataParallel(model).to(device)
    model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataloaders
    dataloader, dataset = create_dataloader(opt.train_path, opt.input_size, batch_size, gs, hyp=hyp, augment=True,
                                            cache=False, rect=False, local_rank=-1, world_size=1)
    testloader = create_dataloader(opt.val_path, opt.input_size, 4, gs, hyp=hyp, augment=False, cache=False,
                                   rect=True, local_rank=-1, world_size=1)[0]
    nw = 8

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.module_list = model.module.module_list

    # Model EMA
    ema = torch_utils.ModelEMA(model)

    # Start training
    nb = len(dataloader)  # number of batches
    n_burn = max(3 * nb, 500)  # burn-in iterations, max(3 epochs, 500 iterations)
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    # print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test))
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)

    start_epoch = 0
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        # for i, (imgs, targets, paths, _) in enumerate(dataloader):  # batch -------------------------------------------
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

            if opt.multi_scale:
                sz = random.randrange(opt.input_size * 0.5, opt.input_size * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            targets = targets.to(device)

            # Burn-in
            if ni <= n_burn:
                xi = [0, n_burn]  # x interp
                model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, 64 / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    x['weight_decay'] = np.interp(ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Forward
            # if opt.sss:
            pred = model(imgs)
            # else:
            #     pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            loss *= batch_size / 64  # scale loss
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            idx2mask = None
            # if opt.sr and opt.prune == 1 and epoch > opt.epochs * 0.5:
            #     idx2mask = get_mask2(model, prune_idx, 0.85)
            # BNOptimizer.updateBN(opt.sr_flag, model.module_list, opt.gamma, prune_idx, idx2mask)
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                if opt.sss:
                    optimizer2.step()
                    optimizer2.zero_grad()
                ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), imgs.shape[-1])
            pbar.set_description(s)

            # Plot
            if ni < 1:
                f = 'train_batch%g.jpg' % i  # filename
                res = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                if tb_writer:
                    tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch)
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            results, maps, times = test.test(cfg=opt.cfg,
                                             names_file=opt.names_classes,
                                             batch_size=8,
                                             img_size=opt.input_size,
                                             conf_thres=0.01,
                                             save_json=False,
                                             # model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                                             model=ema.ema,
                                             single_cls=False,
                                             dataloader=testloader,
                                             save_dir=wdir)
        if opt.sss:
            print("lambda '{}'\n".format(model.module.lambda_block if hasattr(model, 'module')
                                         else model.lambda_block))

        if epoch % opt.interval_prune == 0:
            # ------------------------------ begin soft prune ---------------------------------------------#
            obtain_num_parameters = lambda m: sum([param.nelement() for param in m.parameters()])
            origin_nparameters = obtain_num_parameters(model)
            from utils.prune_utils import parse_module_defs2, get_global_norm_thr, obtain_filters_mask_norm, \
                merge_mask, prune_soft_model_code, get_layer_norm_thr, obtain_filters_mask_norm_per_layer

            CBL_idx, Conv_idx, prune_idx, _, _ = parse_module_defs2(
                model.module.module_defs if hasattr(model, 'module') else model.module_defs)
            # norm_thr = get_global_norm_thr(model, prune_idx, opt.global_percent)
            norm_thr_list, norm_prune_index = get_layer_norm_thr(model, prune_idx, opt.global_percent)
            print("norm_thr_list is", norm_thr_list)
            # print("norm index is", norm_prune_index)

            num_filters_l2, filters_mask_l2 = obtain_filters_mask_norm_per_layer(model, norm_thr_list, CBL_idx,
                                                                                 prune_idx,
                                                                                 layer_keep=opt.layer_keep)
            CBLidx2mask = {idx: mask for idx, mask in zip(CBL_idx, filters_mask_l2)}
            CBLidx2filters = {idx: filters for idx, filters in zip(CBL_idx, num_filters_l2)}

            for i in (model.module.module_defs if hasattr(model, 'module') else model.module_defs):
                if i['type'] == 'shortcut':
                    i['is_access'] = False
            # print('merge the mask of layers connected to shortcut!')
            merge_mask(model.module if hasattr(model, 'module') else model, CBLidx2mask, CBLidx2filters)
            prune_soft_model_code(model.module if hasattr(model, 'module') else model, CBL_idx, CBLidx2mask)

            print("after soft prune, map test ")
            results_sfp, maps_sfp, times_sfp = test.test(cfg=opt.cfg,
                                                         names_file=opt.names_classes,
                                                         batch_size=8,
                                                         img_size=opt.input_size,
                                                         conf_thres=0.01,
                                                         save_json=False,
                                                         model=model.module if hasattr(model, 'module') else model,
                                                         single_cls=False,
                                                         dataloader=testloader,
                                                         save_dir=wdir)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name))

        # Tensorboard
        if tb_writer:
            tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1',
                    'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
            for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

            bn_weights = gather_bn_weights(model.module_list, prune_idx)
            tb_writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane')

        # Update best mAP
        fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        # fisfp = fitness(np.array(results_sfp).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        # if fi - fisfp < 0.05:
        #     with open(results_file, 'r') as f:  # create checkpoint
        #         ckpt = {'epoch': epoch,
        #                 'best_fitness': str(fisfp),
        #                 'training_results': f.read(),
        #                 # 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(),
        #                 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema,
        #                 'optimizer': None if final_epoch else optimizer.state_dict()}
        #     torch.save(ckpt, last)
        #     if (best_fitness == fi) and not final_epoch:
        #         torch.save(ckpt, best)
        #     del ckpt
        if fi > best_fitness:
            best_fitness = fi

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:  # create checkpoint
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': f.read(),
                        # 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(),
                        'model': ema.ema.module if hasattr(ema, 'module') else ema.ema,
                        'optimizer': None if final_epoch else optimizer.state_dict()}

            # Save last, best and delete
            torch.save(ckpt, last)
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload

    if not opt.evolve:
        plot_results()  # save as results.png

    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
def main():
    """
    Train and test.
    :param opt: args
    :param writer: tensorboard
    :return:
    """
    global opt
    opt = parse()
    arc = opt.arc
    cfg = opt.cfg
    teacher_cfg = opt.teacher_cfg
    img_size = opt.img_size
    epochs = opt.epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights
    teacher_weights = opt.teacher_weights
    multi_scale = opt.multi_scale
    sparsity_training = opt.st
    opt.weights = last if opt.resume else opt.weights

    # Initial logging
    logging.basicConfig(format="%(message)s", level=logging.INFO if opt.local_rank in [-1, 0] else logging.WARN)

    # Train
    logger.info(opt)
    if opt.local_rank in [-1, 0]:
        logger.info('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        writer = SummaryWriter()

    # Hyperparameters
    with open(opt.hyp) as f_hyp:
        hyp = yaml.safe_load(f_hyp)
    # data dict
    with open(opt.data) as f_data:
        data = yaml.safe_load(f_data)

    # Distributed training initialize
    device = select_device(opt.device)
    if opt.local_rank != -1:
        dist.init_process_group(init_method="env://", backend='nccl')
        torch.cuda.set_device(opt.local_rank)
        device = torch.device(f"cuda:{opt.local_rank}")
        # world_size = torch.distributed.get_world_size()

    init_seeds()
    cuda = device.type != 'cpu'
    torch.backends.cudnn.benchmark = True

    if multi_scale:
        img_size_min = round(img_size / 32 / 1.5) + 1
        img_size_max = round(img_size / 32 * 1.5) - 1
        img_size = img_size_max * 32  # initiate with maximum multi_scale size
        logger.info(f'Using multi-scale {img_size_min * 32} - {img_size}')

    train_path = data['train']
    num_classes = int(data['num_classes'])  # number of classes

    # Load dataset
    dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, hyp=hyp, rect=opt.rect)
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if opt.local_rank != -1 else None
    num_worker = os.cpu_count() // torch.cuda.device_count()
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=min([num_worker, batch_size, 8]),
                                             shuffle=not (opt.rect or train_sampler),
                                             sampler=train_sampler,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Load model
    model = Model(cfg, img_size, arc=arc).to(device)

    # Load teacher model
    if teacher_cfg:
        teacher_model = Model(teacher_cfg, img_size, arc).to(device)

    # Optimizer parameter groups
    param_group0, param_group1 = [], []
    for key, value in model.named_parameters():
        if 'Conv2d.weight' in key:
            param_group1.append(value)
        else:
            param_group0.append(value)

    if opt.adam:
        optimizer = optim.Adam(param_group0, lr=hyp['lr0'])
    else:
        optimizer = optim.SGD(param_group0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    # add param_group1 with weight_decay
    optimizer.add_param_group({'params': param_group1, 'weight_decay': hyp['weight_decay']})
    logger.info(f'Optimizer groups: {len(param_group1)} conv.weight, {len(param_group0)} other')
    del param_group0, param_group1

    start_epoch = 0
    best_fitness = 0.
    if weights.endswith('.pt'):
        checkpoint = torch.load(weights, map_location=device)
        state_dict = intersect_dicts(checkpoint['model'], model.state_dict())
        model.load_state_dict(state_dict, strict=False)
        print('loaded weights from', weights, '\n')

        # load optimizer
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_fitness = checkpoint['best_fitness']

        # load results
        if checkpoint.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(checkpoint['training_results'])

        # resume
        if opt.resume:
            start_epoch = checkpoint['epoch'] + 1

        del checkpoint

    elif len(weights) > 0:
        # weights are 'yolov4.weights', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)
        logger.info(f'loaded weights from {weights}\n')

    # Load teacher weights
    if teacher_cfg:
        if teacher_weights.endswith('.pt'):
            teacher_model.load_state_dict(torch.load(teacher_weights, map_location=device)['model'])
        elif teacher_weights.endswith('.weights'):
            load_darknet_weights(teacher_model, teacher_weights)
        else:
            raise Exception('pls provide proper teacher weights for knowledge distillation')
        if not mixed_precision:
            teacher_model.eval()
        logger.info('<......................using knowledge distillation....................>')
        logger.info(f'teacher model: {teacher_weights}\n')

    # Sparsity training
    if opt.prune == 0:
        _, _, prune_index = parse_module_index(model.module_dicts)
        if sparsity_training:
            logger.info('normal sparse training')

    if mixed_precision:
        if teacher_cfg:
            [model, teacher_model], optimizer = amp.initialize([model, teacher_model], optimizer,
                                                               opt_level='O1', verbosity=1)
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=1)

    # SyncBatchNorm and distributed training
    if cuda and opt.local_rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        model = model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[opt.local_rank])
        model.module_list = model.module.module_list
        model.yolo_layers = model.module.yolo_layers

    # BN weights of the prunable layers before training
    for index in prune_index:
        bn_weights = gather_bn_weights(model.module_list, [index])
        if opt.local_rank == 0:
            writer.add_histogram('before_train_per_layer_bn_weights/hist', bn_weights.numpy(), index, bins='doane')

    # Start training
    model.num_classes = num_classes
    model.arc = opt.arc
    model.hyp = hyp
    num_batch_size = len(dataloader)
    # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    results = (0, 0, 0, 0, 0, 0, 0)
    start_train_time = time.time()
    logger.info('Image sizes %d \n Starting training for %d epochs...', img_size, epochs)

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()
        mean_losses = torch.zeros(4).to(device)
        mean_soft_target = torch.zeros(1).to(device)
        pbar = enumerate(dataloader)
        logger.info(('\n %10s %10s %10s %10s %10s %10s %10s %10s'),
                    'Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')
        if opt.local_rank in [-1, 0]:
            pbar = tqdm(pbar, total=num_batch_size)
        optimizer.zero_grad()

        for i, (imgs, targets, _, _) in pbar:  # batch -----------------------------------------------------------------
            num_integrated_batches = i + num_batch_size * epoch

            # Adjust the learning rate
            learning_rate = adjust_learning_rate(optimizer, num_integrated_batches, num_batch_size, hyp, epoch, epochs)
            if i == 0 and opt.local_rank in [-1, 0]:
                logger.info(f'learning rate: {learning_rate}')
            imgs = imgs.to(device) / 255.0
            targets = targets.to(device)

            # Multi-Scale training
            if multi_scale:
                if num_integrated_batches / accumulate % 10 == 0:
                    img_size = random.randrange(img_size_min, img_size_max + 1) * 32
                scale_factor = img_size / max(imgs.shape[2:])
                if scale_factor != 1:
                    new_shape = [math.ceil(x * scale_factor / 32.) * 32 for x in imgs.shape[2:]]
                    imgs = F.interpolate(imgs, size=new_shape, mode='bilinear', align_corners=False)

            # Forward
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)

            # Knowledge distillation
            soft_target = 0
            if teacher_cfg:
                if mixed_precision:
                    with torch.no_grad():
                        output_teacher = teacher_model(imgs)
                else:
                    _, output_teacher = teacher_model(imgs)
                soft_target = distillation_loss(pred, output_teacher, model.num_classes, imgs.size(0))
                loss += soft_target

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Sparsify the BN layers that need pruning
            if sparsity_training:
                # bn_l1_regularization(model.module_list, opt.penalty_factor, cba_index, epoch, epochs)
                bn_l1_regularization(model.module_list, opt.penalty_factor, prune_index, epoch, epochs)

            # Accumulate gradient for x batches before optimizing
            if num_integrated_batches % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Print
            if opt.local_rank in [-1, 0]:
                mean_losses = (mean_losses * i + loss_items) / (i + 1)
                mean_soft_target = (mean_soft_target * i + soft_target) / (i + 1)
                memory = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0  # (GB)
                description = ('%10s' * 2 + '%10.3g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), '%.3gG' % memory, *mean_losses, mean_soft_target, img_size)
                pbar.set_description(description)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        # scheduler.step()

        if opt.local_rank in [-1, 0]:
            final_epoch = epoch + 1 == epochs

            # Calculate mAP
            if not (opt.notest or opt.nosave) or final_epoch:
                with torch.no_grad():
                    results, _ = test(cfg, data, batch_size=batch_size, img_size=opt.img_size, model=model,
                                      conf_thres=0.001 if final_epoch and epoch > 0 else 0.1,  # 0.1 for speed
                                      save_json=final_epoch and epoch > 0)

            # Write epoch results
            with open(results_file, 'a') as file:
                # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                file.write(description + '%10.3g' * 7 % results + '\n')

            # Write Tensorboard results
            if writer:
                outputs = list(mean_losses) + list(results)
                titles = ['GIoU', 'Objectness', 'Classification', 'Train loss',
                          'Precision', 'Recall', 'mAP', 'F1',
                          'val GIoU', 'val Objectness', 'val Classification']
                for output, title in zip(outputs, titles):
                    writer.add_scalar(title, output, epoch)
                bn_weights = gather_bn_weights(model.module_list, prune_index)
                writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane')

            # Update best mAP
            fitness = results[2]
            if fitness > best_fitness:
                best_fitness = fitness

            # Save training results
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save and opt.local_rank == 0:
                with open(results_file, 'r') as file:
                    # Create checkpoint
                    checkpoint = {'epoch': epoch,
                                  'best_fitness': best_fitness,
                                  'training_results': file.read(),
                                  'model': model.module.state_dict()
                                  if isinstance(model, nn.parallel.DistributedDataParallel) else model.state_dict(),
                                  'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last checkpoint
                torch.save(checkpoint, last)

                # Save best checkpoint
                if best_fitness == fitness:
                    torch.save(checkpoint, best)

                # Delete checkpoint
                del checkpoint
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if opt.local_rank in [-1, 0]:
        if len(opt.name):
            os.rename('results.txt', 'results_%s.txt' % opt.name)
        plot_results()  # save as results.png
        print(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - start_train_time) / 3600:.3f} hours.\n')
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results
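

# Assumed entry point (not shown in the excerpt): main() parses its own arguments via parse(),
# so a minimal launcher, sketched here for reference only, just needs to call it.
if __name__ == '__main__':
    main()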