def train(hyp, opt, device, tb_writer=None): print(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = str(log_dir / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank, loss_name = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank, opt.loss if loss_name == 'ciou': loss_fn = compute_loss_ciou elif loss_name == 'giou': loss_fn = compute_loss_giou elif loss_name == 'gioupp': loss_fn = compute_loss_gioupp else: raise ValueError('unknown loss %s, expected one of ciou/giou/gioupp' % loss_name) # TODO: Use DDP logging. Only the first process is allowed to log. # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Darknet(opt.cfg).to(device) # create state_dict = { k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(state_dict, strict=False) print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Darknet(opt.cfg).to(device) # create # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2.append(v) # biases elif 'Conv2d.weight' in k: pg1.append(v) # apply weight_decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs
start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = 32 # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates # local_rank is set to -1 because only the first process is expected to do evaluation testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Check anchors # if not opt.noautoanchor: # check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...'
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred = model(imgs) # Loss, using the loss function selected via opt.loss above loss, loss_items = loss_fn(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # if not torch.isfinite(loss): # print('WARNING: non-finite loss, ending training ', loss_items) # return results # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: if loss_items is not None: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model,
imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir, compute_loss=loss_fn) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: if loss_items is not None: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) else: tags = [ "train/" + loss_name + "_loss", 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', "val/" + loss_name + "_loss" ] for x, tag in zip(list(mloss[:1]) + list(results[:5]), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(ema.ema, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if epoch >= (epochs - 5): torch.save(ckpt, last.replace('.pt', '_{:03d}.pt'.format(epoch))) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
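# --- Hedged example (added): the cosine LR multiplier used by the scheduler above.
# A minimal standalone sketch, assuming nothing beyond the lambda already defined
# in train(): LambdaLR multiplies hyp['lr0'] by this value each epoch, decaying
# from 1.0 down to the hard-coded floor of 0.2 over the run.
import math


def cosine_lf(epoch, epochs, floor=0.2):
    # same curve as `lf` above: half a cosine period, clamped to `floor`
    return ((1 + math.cos(epoch * math.pi / epochs)) / 2) * (1 - floor) + floor

# e.g. over a 10-epoch run:
# cosine_lf(0, 10) == 1.0, cosine_lf(5, 10) == 0.6, cosine_lf(9, 10) ~= 0.22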
def train(hyp, device): init_seeds(1) cuda = device.type != 'cpu' # v5 Model weights = './weights/ckpt_model_599_800_0.07873.pt' model = attempt_load(weights, map_location=device) # load MNN Model model_file = './weights/20201231_exp25_599_800_forT_bs128_320x320.mnn' net = nn.load_module_from_file(model_file, for_training=True) nn.compress.train_quant(net, quant_bits=8) mnn_opt = MNN.optim.SGD(5e-6, 0.9, 0) mnn_opt.append(net.parameters) MNNF.set_thread_number(32) net.train(True) # Image sizes gs = 32 # grid size (max stride) imgsz = 320 batch_size = 128 # verify imgsz are gs-multiples stride = [8, 16, 32] train_path = '/home/sysman/gate_Sample/VOCdevkit/VOC2017/ImageSets/train_5th_add.txt' opt = '' f = open('20210104_quan_train.txt', 'w') # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=False, rect=False, rank=-1, world_size=1, workers=32) for epoch in range( 5 ): # epoch ------------------------------------------------------------------ t0 = time.time() total_loss = 0 # mnn_opt.learning_rate = learning_rate_scheduler(mnn_opt.learning_rate, epoch) for i, (imgs, targets, paths, _) in enumerate( dataloader ): # batch ------------------------------------------------------------- t1 = time.time() imgs = imgs.to(device, non_blocking=True).float() / 255.0 # data_image1 = imgs[0].cpu() # images_t3 = tf.ToPILImage()(data_image1) # images_t3.show() # Forward bs, c, h, w = imgs.shape # 20201231 by zlf # MNN forward data = MNNF.const(imgs.flatten().tolist(), [bs, 3, 320, 320], MNNF.data_format.NCHW) predict = net.forward(data) predict.read() p1 = MNNF.Var.read(predict) p1 = torch.tensor(p1).cuda() x1, x2, x3 = torch.split(p1, [4800, 1200, 300], 1) x1 = x1.view(-1, 3, 109, 40, 40).permute(0, 1, 3, 4, 2).contiguous() x2 = x2.view(-1, 3, 109, 20, 20).permute(0, 1, 3, 4, 2).contiguous() x3 = x3.view(-1, 3, 109, 10, 10).permute(0, 1, 3, 4, 2).contiguous() x = [x1, x2, x3] loss1, loss_items1 = compute_loss(x, targets.to(device), model) total_loss += loss_items1[3].item() avg_total_loss = total_loss / (i + 1) # if i == 0: # print('loss0:',loss1) # print('iou loss:%.4f,obj loss:%.4f,cls loss:%.4f,total:%.4f'%(loss_items1[0].item(),loss_items1[1].item(),loss_items1[2].item(),loss_items1[3].item())) loss1 = np.array(loss1.cpu()) loss1 = MNNF.const(loss1.flatten().tolist(), [1], MNNF.data_format.NCHW) # Backward mnn_opt.step(loss1) t2 = time.time() line = '[%d|%d|%d]iou_loss:%.4f,obj_loss:%.4f,cls_loss:%.4f,total:%.4f,mean_total:%.4f,time:%.3f' % ( epoch, i, len(dataloader) - 1, loss_items1[0].item(), loss_items1[1].item(), loss_items1[2].item(), loss_items1[3].item(), avg_total_loss, (t2 - t1)) # print("[%d|%d|%d]train loss:%.5f,time:%.3f "%(epoch,i,len(dataloader)-1,loss1.read(),(t2-t1))) f.write(line + '\n') print(line) # save model file_name = './weights/%d_20201231test.mnn' % epoch net.train(False) predict = net.forward(MNNF.placeholder([1, 3, 192, 320], MNNF.NC4HW4)) print("Save to " + file_name) MNNF.save([predict], file_name) print('Epoch:', (time.time() - t0)) # end epoch ---------------------------------------------------------------------------------------------------- # end training f.close()
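# --- Hedged example (added): shape bookkeeping behind the MNN head split above.
# Shape-only sketch; the [bs, 6300, 109] layout is an assumption read off the
# split/view calls: 3*40*40=4800, 3*20*20=1200, 3*10*10=300 anchor-cells for a
# 320x320 input at strides 8/16/32, and 109 channels = 4 box + 1 obj + 104 classes
# (the class count implied by the hyp['cls'] *= nc / 104. scaling further below).
import torch


def split_mnn_prediction(p1, no=109):
    # p1 stands in for torch.tensor(MNNF.Var.read(predict)), shape [bs, 6300, no];
    # reshape (rather than view) tolerates the non-contiguous split result
    x1, x2, x3 = torch.split(p1, [4800, 1200, 300], 1)
    x1 = x1.reshape(-1, 3, no, 40, 40).permute(0, 1, 3, 4, 2).contiguous()
    x2 = x2.reshape(-1, 3, no, 20, 20).permute(0, 1, 3, 4, 2).contiguous()
    x3 = x3.reshape(-1, 3, no, 10, 10).permute(0, 1, 3, 4, 2).contiguous()
    return [x1, x2, x3]

# split_mnn_prediction(torch.randn(2, 6300, 109))[0].shape -> [2, 3, 40, 40, 109],
# i.e. the (bs, anchors, ny, nx, outputs) layout that compute_loss expects.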
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') # Resolve the directory for training logs # if hyperparameter evolution is enabled, no tb_writer is passed in (it is None) and an 'evolve' folder is used as the log directory log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory # Directory for saved weights wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' # Path for saving results results_file = str(log_dir / 'results.txt') # Unpack epochs, batch size, total batch size (for distributed training), weights and process rank (mainly used for distributed training) epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # rank = -1 # Save run settings # save hyp and opt with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = (device.type != 'cpu') # set the random seeds init_seeds(2 + rank) with open(opt.data) as f: # load the dataset configuration data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first( rank): # torch_distributed_zero_first synchronizes all processes check_dataset( data_dict ) # check_dataset verifies the dataset and downloads it if missing (only for the yaml datasets bundled with the project) # train/test image paths train_path = data_dict['train'] test_path = data_dict['val'] # number of classes and class names; a single class if opt.single_cls is set nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: # using pretrained weights # the weights are downloaded automatically from Google Drive, # but the download often fails, so it is best to fetch them beforehand and put them in the weights directory with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally # load the checkpoint ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor """ The model can be built either from opt.cfg or from ckpt['model'].yaml. The difference is whether this is a resume: on resume, opt.cfg is set to empty and the model is built from ckpt['model'].yaml. This also decides whether the anchor keys are dropped below (i.e. anchors are not loaded); on resume they are not loaded, because the saved model stores its anchors, and if a user defined custom anchors and then resumed, the original COCO-based anchors would overwrite the custom ones, see https://github.com/ultralytics/yolov5/issues/459. That is why intersect_dicts is used below: it skips the keys listed in exclude. """ model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load # report how many pretrained key/value pairs were transferred into the freshly created model # with resume set, two fewer pairs are loaded (anchors, anchor_grid) logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: # build the model; ch is the number of input image channels model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze """ Freeze model layers by listing their (full or partial) parameter names. See https://github.com/ultralytics/yolov5/issues/679. The author discourages freezing: in his experiments it did not give better performance, see https://github.com/ultralytics/yolov5/pull/707. To keep the optimizer parameter grouping working, requires_grad is set back to True for all parameters below; this is only an example of how to freeze. """ freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): # print(k,v) if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer """ nbs is the simulated (nominal) batch size: with the default opt.batch_size of 16 and nbs of 64, gradients are accumulated over 64/16=4 (accumulate) steps before the model is updated once, effectively enlarging the batch size. """ nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss
before optimizing accumulate = 4 # scale the weight-decay coefficient according to accumulate hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay # split the parameters into three groups for the optimizer (conv/bn weights, biases, everything else) pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): # print(k) v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else # choose the optimizer and set the update rule for group pg0 if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) # update rule for the weight group optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay # update rule for the biases optimizer.add_param_group({'params': pg2}) # add pg2 (biases) # report the optimizer grouping logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Learning-rate decay by cosine annealing, # following the lf formula below, driven by epoch and the hyperparameter hyp['lrf'] # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[ 'lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume # initialize the starting epoch and the best result so far # best_fitness is the sum of [precision, recall, mAP@0.5, mAP@0.5:0.95] weighted by [0.0, 0.0, 0.1, 0.9] # best.pt is saved according to best_fitness start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer # restore the optimizer state and best_fitness if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results # restore the training results file result.txt if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs # restore the epoch counter # print(ckpt['epoch']) start_epoch = ckpt['epoch'] + 1 # ckpt['epoch'] = -1 """ On resume, back up the weights. Resume now works nearly 100% of the time, see https://github.com/ultralytics/yolov5/pull/756, but to guard against other resume problems overwriting the previous weights, a backup is made here, see https://github.com/ultralytics/yolov5/pull/765. """ if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}' ) # save previous weights """ If the newly requested epochs value is smaller than the loaded epoch, treat it as the number of additional epochs to train rather than the total. """ if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes # overall model stride and input image resolution gs = int(max(model.stride)) # grid size (max stride) # check that the input resolutions are divisible by the overall stride gs imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # imgsz and imgsz_test are both 640 # DP mode # distributed training, see https://github.com/ultralytics/yolov5/issues/475 # DataParallel mode only supports multiple GPUs on a single machine # rank is the process index; rank=-1 here selects DataParallel mode # with rank=-1 and a single GPU, no parallelism is used if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # this branch runs # SyncBatchNorm # synchronize BN statistics across GPUs if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average (a.k.a. exponentially weighted average) # create an EMA of the model; skipped when the GPU process count is greater than 1 ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode # if rank is not -1, use DistributedDataParallel mode # local_rank is the GPU index, rank the process index; e.g. rank=3, local_rank=0 means the 1st GPU inside the 4th process if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader # build the training dataloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) """ Take the largest class id in the labels and compare it with the class count; it must stay below nc, otherwise the labels are inconsistent. """ mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: # update the EMA updates counter to keep the EMA smooth ema.updates = start_epoch * nb // accumulate # set EMA updates # build the test dataloader testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: # concatenate all labels into one array of shape (total, 5) for statistics and visualization labels = np.concatenate(dataset.labels, 0) # classes of all samples c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) # visualize the class distribution, box centers xy and box sizes wh from the statistics above plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Anchors """ Compare the default anchors against the label boxes of the dataset: the ratios of label height h and width w to anchor height h_a and width w_a, i.e. h/h_a and w/w_a, are acceptable when they fall inside (1/hyp['anchor_t'], hyp['anchor_t']). If fewer than 99% of the label boxes satisfy this, new anchors are clustered with k-means. """ if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters # scale the classification-loss gain to the class count of this dataset hyp['cls'] *= nc / 80.
# scale coco-tuned hyp['cls'] to current dataset # attach the class count and hyperparameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model """ model.gr sets how much the giou value weights the objectness-loss targets, used as: tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype) With model.gr=1, the giou between the label box and the predicted box is used directly as the objectness target of that prediction. """ model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) # initialize image sampling weights from the labels model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # class names model.names = names # Start training t0 = time.time() # warmup iteration count nw = max(round(hyp['warmup_epochs'] * nb), 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training # initialize mAP and results maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' """ Set the epoch the LR schedule has advanced to, so that after an interruption, --resume continues the learning-rate decay exactly where the previous run stopped. """ scheduler.last_epoch = start_epoch - 1 # do not move # mixed-precision training via the API built into torch 1.6 scaler = amp.GradScaler(enabled=cuda) """ Log the train/test input resolutions, the number of dataloader worker processes, and the epoch training starts from. """ logger.info( 'Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices """ If image-weighted sampling is enabled, combine the sampling weights initialized above (model.class_weights), the per-class maps, and the classes present in each image, then draw the image indices with random.choices. """ if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP # in DDP mode, broadcast the sampling to the other processes if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() # broadcast the indices to the other group members dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders # initialize the mean losses printed during training mloss = torch.zeros(4, device=device) # mean losses if rank != -1: # shuffle the data in DDP mode: the ddp sampler seeds its random sampling with epoch+seed, # so each epoch uses a different random seed dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: # tqdm progress bar for convenient status display during training pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- # number of iterations so far ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup """ Warmup training (the first nw iterations): during the first nw iterations, accumulate and the learning rates are chosen as follows. """ if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 """ The bias learning rate falls from 0.1 to the base rate lr*lf(epoch),
while the other learning rates rise from 0 to lr*lf(epoch); lf is the cosine-annealing decay function defined above. """ x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) # momentum also ramps from 0.9 up to hyp['momentum'] (default=0.937) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale # multi-scale training: pick a random size from imgsz * 0.5 to imgsz * 1.5 + gs if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward # mixed precision with amp.autocast(enabled=cuda): pred = model(imgs) # forward pass # Loss # compute the loss: classification loss, objectness loss and box regression loss # loss is the total; loss_items holds the classification, objectness and box losses plus the total loss, loss_items = compute_loss( pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: # average the gradients across GPUs loss *= opt.world_size # gradient averaged between devices in DDP mode # Backward # backpropagation scaler.scale(loss).backward() # Optimize # after accumulate backward passes, update the parameters once from the accumulated gradients if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: # report GPU memory, current epoch, losses, number of targets and image size mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) # show the above on the progress bar pbar.set_description(s) # Plot # draw the label boxes of the first three batches onto the images and save them if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler # step the learning-rate decay lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: # update the EMA attributes # copy over the attributes listed in include ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) # check whether this is the final epoch final_epoch = epoch + 1 == epochs # evaluate on the test set and compute mAP and the other metrics # evaluation uses the EMA model if not opt.notest or final_epoch: # Calculate mAP if final_epoch: # replot predictions [ os.remove(x) for x in glob.glob( str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x) ] results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write # append the metrics to result.txt with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) # if opt.bucket is set, upload results.txt to Google Cloud Storage if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard # log the metrics and losses to tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x,
epoch) # Update best mAP # update best_fitness fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model """ Save the model together with the epoch, results, optimizer and other state; the optimizer is not saved once the final epoch has completed. The saved model is the EMA model. """ save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers """ After training, strip_optimizer removes the optimizer from the checkpoint and applies model.half(), converting the Float32 model to Float16; this shrinks the file and speeds up inference. """ n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer # upload the results to Google Cloud Storage os.system( 'gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish # visualize results.txt if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) # free GPU memory dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
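# --- Hedged example (added): the idea behind the ModelEMA used in these variants.
# An illustrative sketch, not the ultralytics implementation: keep a frozen
# shadow copy of the weights and blend the live weights in after every optimizer
# step; evaluation and checkpointing then use the smoother shadow copy.
import copy
import math
import torch


class SimpleEMA:
    def __init__(self, model, decay=0.9999):
        self.ema = copy.deepcopy(model).eval()  # shadow model
        self.updates = 0
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def update(self, model):
        self.updates += 1
        # ramp the decay up from 0 so early updates still track the live model
        d = self.decay * (1 - math.exp(-self.updates / 2000))
        with torch.no_grad():
            msd = model.state_dict()
            for k, v in self.ema.state_dict().items():
                if v.dtype.is_floating_point:
                    v.mul_(d).add_(msd[k].detach(), alpha=1 - d)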
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[ 'lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # Resume start_epoch, best_fitness = 0, 0.0 # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # 
number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes plot_labels(labels, save_dir=log_dir) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info( 'Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * 
gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP if final_epoch: # replot predictions [ os.remove(x) for x in glob.glob( str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x) ] results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', 
wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system( 'gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
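# --- Hedged example (added): the accumulate-to-nominal-batch pattern used above.
# A minimal self-contained sketch of stepping the optimizer once every
# nbs/batch_size batches under AMP, as the training loops above do; the tiny
# linear model and random data are placeholders, not part of the original code.
import torch
from torch import nn, optim
from torch.cuda import amp


def accumulation_demo(nbs=64, batch_size=16, steps=8, cuda=False):
    model = nn.Linear(10, 1)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    scaler = amp.GradScaler(enabled=cuda)
    accumulate = max(round(nbs / batch_size), 1)  # 4: grads add up over 4 batches
    optimizer.zero_grad()
    for ni in range(1, steps + 1):
        x, y = torch.randn(batch_size, 10), torch.randn(batch_size, 1)
        with amp.autocast(enabled=cuda):
            loss = nn.functional.mse_loss(model(x), y)
        scaler.scale(loss).backward()   # gradients accumulate across batches
        if ni % accumulate == 0:        # one optimizer step per ~nbs images
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()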
def train_aux_for_LCP(cfg, backbone, neck, data_loader, weights, aux_weight, hyp, device, resume, epochs): init_seeds() batch_size = data_loader.batch_size accumulate = 64 // batch_size model = Darknet(cfg).to(device) model_chkpt = torch.load(weights, map_location=device) model.load_state_dict(model_chkpt['model'], strict=True) del model_chkpt aux_util = AuxNetUtils(model, hyp, backbone, neck, strategy="LCP") hook_util = HookUtils() start_epoch = 0 aux_model_list = [] pg = [] for layer in aux_util.aux_in_layer: aux_model = aux_util.creat_aux_model(layer) aux_model.to(device) for v in aux_model.parameters(): pg += [v] aux_model_list.append(aux_model) optimizer = optim.SGD(pg, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) del pg if resume: chkpt = torch.load(aux_weight, map_location=device) for i, layer in enumerate(aux_util.aux_in_layer): aux_model_list[i].load_state_dict(chkpt['aux_in{}'.format(layer)], strict=True) if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) start_epoch = chkpt['epoch'] + 1 scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[epochs // 3, 2 * epochs // 3], gamma=0.1) scheduler.last_epoch = start_epoch - 1 handles = [] # the hook handles must be released once training ends for name, child in model.module_list.named_children(): if name in aux_util.aux_in_layer: handles.append( child.register_forward_hook(hook_util.hook_origin_output)) if device.type != 'cpu' and torch.cuda.device_count() > 1: model = nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers nb = len(data_loader) model.nc = 80 model.hyp = hyp model.arc = 'default' print('Starting training for %g epochs...' % epochs) for epoch in range(start_epoch, epochs): for aux_model in aux_model_list: aux_model.train() print(('\n' + '%10s' * 6) % ('Stage', 'Epoch', 'gpu_mem', 'AuxID', 'cls', 'targets')) # -----------------start batch----------------- pbar = tqdm(enumerate(data_loader), total=nb) model.train() for i, (imgs, targets, _, _) in pbar: if len(targets) == 0: continue ni = i + nb * epoch imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) with torch.no_grad(): prediction = model(imgs) hook_util.cat_to_gpu0() for aux_idx, aux_model in enumerate(aux_model_list): pred, loc_loss = aux_model( hook_util.origin_features['gpu0'][aux_idx], targets, prediction) loss = compute_loss_for_LCP(pred, loc_loss, targets) loss *= batch_size / 64 loss.backward() mem = torch.cuda.memory_cached( ) / 1E9 if torch.cuda.is_available() else 0 # (GB) s = ('%10s' * 3 + '%10.3g' * 3) % ('Train Aux', '%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, aux_idx, loss, len(targets)) pbar.set_description(s) # the hooked outputs must be cleared after every batch hook_util.clean_hook_out() if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() # -----------------end batches----------------- scheduler.step() final_epoch = epoch + 1 == epochs chkpt = { 'epoch': epoch, 'optimizer': None if final_epoch else optimizer.state_dict() } for i, layer in enumerate(aux_util.aux_in_layer): chkpt['aux_in{}'.format(layer)] = aux_model_list[i].state_dict() torch.save(chkpt, aux_weight) torch.save(chkpt, "../weights/LCP/aux-coco.pt") del chkpt with open("./LCP/aux_result.txt", 'a') as f: f.write(s + '\n') # finally, remove all of the hooks for handle in handles: handle.remove() torch.cuda.empty_cache()
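# --- Hedged example (added): the forward-hook capture train_aux_for_LCP relies on.
# A minimal self-contained sketch of the same mechanism HookUtils wraps:
# register_forward_hook stashes intermediate activations so auxiliary heads can
# be trained on them, and every handle must be removed when training ends.
import torch
import torch.nn as nn

features = {}


def hook_origin_output(module, inputs, output):
    features[module] = output.detach()  # stash this layer's activation


net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                    nn.Conv2d(8, 16, 3, padding=1))
handles = [net[0].register_forward_hook(hook_origin_output)]
_ = net(torch.randn(1, 3, 32, 32))
assert features[net[0]].shape == (1, 8, 32, 32)
for handle in handles:  # mirror of the cleanup loop above
    handle.remove()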
def init_seeds(seed=0): random.seed(seed) np.random.seed(seed) torch_utils.init_seeds(seed=seed)
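# --- Hedged example (added): what full determinism typically layers on top of
# the helper above. The torch and cuDNN settings here are an assumption, not
# part of the original torch_utils.init_seeds; they trade speed for
# reproducibility.
import random
import numpy as np
import torch


def init_seeds_deterministic(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True   # reproducible conv algorithms
    torch.backends.cudnn.benchmark = False      # disable the auto-tuner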
default=0, help='optional test variable') parser.add_argument('--s', type=float, default=0.0001, help='sparsity') parser.add_argument('--cfg', type=str, default='mul_sparsity/yolov3_5.cfg', help='cfg file path') parser.add_argument('--weights-path', type=str, default='mul_sparsity/yolov3_5.weights', help='path to store weights') opt = parser.parse_args() print(opt, end='\n\n') torch_utils.init_seeds() torch.cuda.empty_cache() train( opt.cfg, opt.data_config, img_size=opt.img_size, resume=opt.resume, epochs=opt.epochs, batch_size=opt.batch_size, weights_path=opt.weights_path, report=opt.report, multi_scale=opt.multi_scale, freeze_backbone=opt.freeze, var=opt.var, s=opt.s,
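# --- Hedged example (added): how a sparsity factor like --s is typically used.
# The training code that consumes opt.s is not shown here, so this is an
# assumption based on common channel-pruning ("network slimming") practice:
# after backward(), add an L1 subgradient on every BatchNorm scale so channel
# scales are pushed toward zero and can later be pruned.
import torch
import torch.nn as nn


def apply_bn_sparsity(model: nn.Module, s: float = 0.0001):
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d) and m.weight.grad is not None:
            m.weight.grad.add_(s * torch.sign(m.weight.detach()))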
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory log_dir = Path('/home/data/yolov5_pt', log_dir) wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict print(data_dict) with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # load MNN Model model_file = './weights/20201231_exp25_599_800_forT_bs128_320x320.mnn' net = nn.load_module_from_file(model_file, for_training=True) nn.compress.train_quant(net, quant_bits=8) mnn_opt = MNN.optim.SGD(1e-9, 0.9, 0) mnn_opt.append(net.parameters) net.train(True) # Freeze freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # 
Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[ 'lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) # shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}') # save previous weights if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader t0 = time.time() dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) t1 = time.time() print('dataloader is created in {}s'.format(t1 - t0)) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 104. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info( 'Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) # add by zlf on 20201113 # write down all the results # save_path = '/home/data/yolov5_pt/new_val_work/val_result/20201124_finetune_800/' # if not os.path.exists(save_path): # os.makedirs(save_path) # count_f = open(save_path+'result_count.txt','w') # add by zlf on 20201113 for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): 
                # data_image1 = imgs[0].cpu()
                # images_t3 = tf.ToPILImage()(data_image1)
                # images_t3.show()
                # for n, p in model.named_parameters():
                #     print(p.dtype)
                # pred = model(imgs)

                # 20201231 by zlf
                # MNN forward
                data = MNNF.const(imgs.flatten().tolist(), [128, 3, 320, 320], MNNF.data_format.NCHW)
                predict = net.forward(data)
                predict.read()
                p1 = MNNF.Var.read(predict)
                print(predict.shape)
                print(predict.size)
                print(predict.dtype)
                p1 = torch.tensor(p1).cuda()
                x1, x2, x3 = torch.split(p1, [4800, 1200, 300], 1)
                x1 = x1.view(-1, 3, 109, 40, 40).permute(0, 1, 3, 4, 2).contiguous()
                x2 = x2.view(-1, 3, 109, 20, 20).permute(0, 1, 3, 4, 2).contiguous()
                x3 = x3.view(-1, 3, 109, 10, 10).permute(0, 1, 3, 4, 2).contiguous()
                x = [x1, x2, x3]

                # by zlf 20201016: now pred is (bs, 327, 6, 10),
                # but compute loss needs (bs, 3, 6, 10, 109)
                # for idx in range(3):
                #     bs, _, ny, nx = pred[idx].shape
                #     pred[idx] = pred[idx].view(bs, 3, 109, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

                # 20201016 by zlf
                # loss, loss_items = compute_loss(pred, targets.to(device), model)
                loss1, loss_items1 = compute_loss(x, targets.to(device), model)
                loss1 = np.array(loss1.cpu())
                loss1 = MNNF.const(loss1.flatten().tolist(), [1], MNNF.data_format.NCHW)
                # print('loss:', loss)  # loss scaled by batch_size
                # if rank != -1:
                #     loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            # scaler.scale(loss).backward()
            mnn_opt.step(loss1)

            # Optimize
            # if ni % accumulate == 0:
            #     scaler.step(optimizer)  # optimizer.step
            #     scaler.update()
            #     optimizer.zero_grad()
            #     if ema:
            #         ema.update(model)
            #
            # # Print
            # if rank in [-1, 0]:
            #     mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            #     mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            #     s = ('%10s' * 2 + '%10.4g' * 6) % (
            #         '%g/%g/%g' % (epoch, epochs - 1, i), mem, *mloss, targets.shape[0], imgs.shape[-1])
            #     pbar.set_description(s)
            #
            #     # Plot
            #     if ni < 3:
            #         f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
            #         result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
            #         if tb_writer and result is not None:
            #             tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
            #             # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # 20201016 by zlf
            # save model
            # with open(results_file, 'a') as f:
            #     f.write(s + '%10.4g' * 7 % results + '\n')
            #
            # with open(results_file, 'r') as f:  # create checkpoint
            #     ckpt = {'epoch': epoch,
            #             'best_fitness': best_fitness,
            #             'training_results': None,
            #             'model': ema.ema,
            #             'optimizer': None}
            # if epoch > 120 and (i % 450 == 0) and (i != 0):
            #     torch.save(ckpt, wdir / 'ckpt_model_{}_{}_{}.pt'.format(epoch, i, round((mloss.sum() / 2.).item(), 5)))
            #     # add by zlf on 20201113 New Val Project
            #     wdir_s = str(wdir) + '/'
            #     modelv = ModelV(wdir_s + 'ckpt_model_{}_{}_{}.pt'.format(epoch, i, round((mloss.sum() / 2.).item(), 5)))
            #     imgs_dir = '%s_%s/' % (epoch, i)
            #     if not os.path.exists(save_path + imgs_dir):
            #         os.mkdir(save_path + imgs_dir)
            #     l = val(modelv, 'new_val_work/val.txt',
            #             issave=False,
            #             save_path=save_path + imgs_dir,
            #             result_txt=save_path + imgs_dir + 'result.txt')
            #     result_txt = save_path + imgs_dir + 'result.txt'
            #     d1, d2, miss_detect, wrong_detect, extra_detect, total_error = check_result(result_txt)
            #     line1 = 'model_epoch%s_%s: ' % (epoch, i)
            #     line2 = '{"total":%d,"miss":%d,"wrong":%d,"extra":%d}' % (total_error, miss_detect, wrong_detect, extra_detect)
            #     print(line1 + line2)
            #     line = line1 + line2 + '\n' + str(d2)
            #     count_f.write(line + '\n')
            # del ckpt
            # end batch ------------------------------------------------------------------------------------------------
        # save model
        file_name = './weights/%d_20201231test.mnn' % epoch
        net.train(False)
        predict = net.forward(MNNF.placeholder([1, 3, 192, 320], MNNF.NC4HW4))
        print("Save to " + file_name)
        MNNF.save([predict], file_name)

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if final_epoch:  # replot predictions
                    [os.remove(x) for x in glob.glob(str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x)]
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                # 's' is built by the progress-string block, which is commented out above in this variant
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                        'x/lr0', 'x/lr1', 'x/lr2']  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': None,
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    # best = wdir / 'epoch%d_best.pt' % epoch
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    # count_f.close()
    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        # wdir is a plain string in this variant, so join with '+' rather than Path '/'
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir + f'last{n}.pt', wdir + f'best{n}.pt'
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', results_file], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None  # upload

    # Finish
    if not opt.evolve:
        plot_results(save_dir=log_dir)  # save as results.png
    logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
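# A minimal, self-contained sketch of the head-split performed in the MNN branch above:
# the flat (bs, 6300, 109) prediction is split into the three YOLO grids
# (3 anchors x 40x40, 20x20, 10x10 cells) and permuted to (bs, 3, ny, nx, 109),
# the layout compute_loss expects. The batch size of 2 below is illustrative only.
import torch

def _split_flat_pred(p1):
    x1, x2, x3 = torch.split(p1, [4800, 1200, 300], 1)  # 3*40*40, 3*20*20, 3*10*10
    x1 = x1.view(-1, 3, 109, 40, 40).permute(0, 1, 3, 4, 2).contiguous()
    x2 = x2.view(-1, 3, 109, 20, 20).permute(0, 1, 3, 4, 2).contiguous()
    x3 = x3.view(-1, 3, 109, 10, 10).permute(0, 1, 3, 4, 2).contiguous()
    return [x1, x2, x3]

# [t.shape for t in _split_flat_pred(torch.randn(2, 6300, 109))]
# -> [(2, 3, 40, 40, 109), (2, 3, 20, 20, 109), (2, 3, 10, 10, 109)]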
action='store_true', help='use torch.optim.Adam() optimizer') parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') parser.add_argument('--logdir', type=str, default='runs/', help='logging directory') opt = parser.parse_args() init_seeds(0) # clw modify # Resume if opt.resume: last = get_latest_run( ) if opt.resume == 'get_last' else opt.resume # resume from most recent run if last and not opt.weights: print(f'Resuming training from {last}') opt.weights = last if opt.resume and not opt.weights else opt.weights # if opt.local_rank == -1 or ("RANK" in os.environ and os.environ["RANK"] == "0"): # clw delete # check_git_status() opt.hyp = opt.hyp or ('data/hyp.finetune.yaml' if opt.weights else 'data/hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file( opt.cfg), check_file(opt.hyp) # check files
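# The resume branch above asks get_latest_run() for the most recent checkpoint.
# A minimal sketch of such a helper, assuming runs live under './runs' and
# checkpoints are named last*.pt (the project's actual utility may differ):
import glob
import os

def _get_latest_run_sketch(search_dir='./runs'):
    candidates = glob.glob(f'{search_dir}/**/last*.pt', recursive=True)  # all last*.pt under search_dir
    return max(candidates, key=os.path.getctime) if candidates else ''   # newest by creation time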
def train(hyp, opt, device, tb_writer=None): print(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = str(log_dir / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # TODO: Use DDP logging. Only the first process is allowed to log. # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict root_path = data_dict['root'] train_path = data_dict['train'] test_emb_path = data_dict['test_emb'] test_path = data_dict['test'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Remove previous results if rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg, nc=nc).to(device) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz = imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. 
# TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # Load Model with torch_distributed_zero_first(rank): attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: exclude = ['anchor'] # exclude keys ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and not any(x in k for x in exclude) and model.state_dict()[k].shape == v.shape } model.load_state_dict(ckpt['model'], strict=False) print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights)) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank)) # Trainloader dataloader, dataset = create_dataloader(root_path, train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) # Testloader testloader = create_dataloader(root_path, test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # model.nID = dataset.nID # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) if rank in [0, -1]: print('Image sizes {} train, {} test'.format(str(imgsz), str(imgsz_test))) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() mloss = torch.zeros(5, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print(('\n' + '%10s' * 8 + '%13s') % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'lid', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale and random.random() < 0.5: candidate_shapes = [[608, 1088], [480, 864], [320, 576], [512, 960], [384, 640]] curr_shapes = candidate_shapes[random.randint(0, 4)] imgs = F.interpolate(imgs, size=curr_shapes, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred_detect, pred_emb = model(imgs) # Loss loss, loss_items = compute_loss(pred_detect, pred_emb, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6 + "%10s") % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], ' [%g,%g]' % (imgs.shape[-1], imgs.shape[-2])) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir, emb_dim=model.module.emb_dim) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 8 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, 
cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
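# The warmup block in the loop above linearly interpolates lr, momentum and
# accumulate over the first nw iterations. A small self-contained illustration
# of the same arithmetic (nw=1000 and lr0=0.01 are example values, not this run's):
import numpy as np

def _warmup_values(ni, nw=1000, lr0=0.01, warm_momentum=0.9, momentum=0.937):
    xi = [0, nw]
    bias_lr = np.interp(ni, xi, [0.1, lr0])   # bias lr falls from 0.1 to lr0
    other_lr = np.interp(ni, xi, [0.0, lr0])  # other lrs rise from 0.0 to lr0
    mom = np.interp(ni, xi, [warm_momentum, momentum])
    return bias_lr, other_lr, mom

# _warmup_values(0)   -> (0.1, 0.0, 0.9)
# _warmup_values(500) -> (0.055, 0.005, 0.9185)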
def train(hyp, opt, device, tb_writer=None):
    logger.info(f'Hyperparameters {hyp}')
    print("tb_writer.log_dir: ", tb_writer.log_dir)  # when resuming, the run directory can be recovered from here
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    print("log_dir: ", log_dir)
    print("wdir: ", wdir)
    print("last: ", last)
    print("best: ", best)
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
    print("epochs: ", epochs)
    print("batch_size: ", batch_size)
    print("total_batch_size: ", total_batch_size)
    print("weights: ", weights)
    print("rank: ", rank)

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    print("cuda: ", cuda)
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    print("data_dict: ", data_dict)
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    print("pretrained: ", pretrained)
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        """ckpt holds ['epoch'], ['best_fitness'], ['training_results'], ['model'], ['optimizer']"""
        # print("ckpt: ", ckpt)
        print("ckpt: ['epoch'], ['best_fitness'], ['training_results'], ['optimizer']: ",
              ckpt['epoch'], ckpt['best_fitness'], ckpt['training_results'], ckpt['optimizer'])
        # print("ckpt['model']: ", ckpt['model'])
        # print("ckpt['model'].model: ", ckpt['model'].model)
        # print("ckpt['model'].state_dict(): ", ckpt['model'].state_dict())
        print("ckpt['model'].save: ", ckpt['model'].save)
        print("ckpt['model'].yaml: ", ckpt['model'].yaml)
        print("hyp.get('anchors'): ", hyp.get('anchors'))
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(hyp['anchors'])  # force autoanchor
        print("opt.cfg: ", opt.cfg)
        # create; when both operands are truthy, `or` returns the first one, so the contents of opt.cfg
        # (e.g. yolov5s.yaml, yolov5x.yaml, ...) take priority, with ckpt['model'].yaml as the fallback
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)
        exclude = (['anchor'] if opt.cfg or hyp.get('anchors') else [])  # exclude keys; anchors from opt.cfg take priority, then those in hyp
        print("exclude: ", exclude)
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        # print("state_dict: ", state_dict)
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['', ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    """Update the model parameters only after gradients have accumulated for
    (nbs / total_batch_size) batches, which effectively enlarges the batch size."""
    nbs = 64  # nominal batch size
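    # A worked example of the accumulation arithmetic below (the batch size is
    # illustrative, not read from this run): with total_batch_size=16 and nbs=64,
    #   accumulate = max(round(64 / 16), 1) = 4   -> step the optimizer every 4 batches
    #   hyp['weight_decay'] *= 16 * 4 / 64 = 1.0  -> decay is unchanged when the
    #                                                effective batch equals nbs
    # so four batches of 16 images update the weights like one nominal batch of 64.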
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    # optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Set up learning-rate decay; cosine annealing is used here
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # each epoch gets its own learning rate
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
            shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}')  # save previous weights
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride); the model's max stride sets the required multiple
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples (check that the input resolution is a multiple of gs=32)
    print("imgsz: ", imgsz)
    print("imgsz_test: ", imgsz_test)

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        print("DP mode...............................................................................")
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images, rect=opt.rect, rank=rank,
                                            world_size=opt.world_size, workers=opt.workers)
    print("dataloader: ", dataloader)
    print("dataset: ", dataset)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)
    nb = len(dataloader)  # number of batches
    print("nb: ", nb)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        print("ema.updates: ", ema.updates)
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False,
                                       cache=opt.cache_images and not opt.notest, rect=True, rank=-1,
                                       world_size=opt.world_size, workers=opt.workers)[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names
    # print("model: ", model)

    # Start training
    t0 = time.time()
    n_warmup = max(round(hyp['warmup_epochs'] * nb), 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # n_warmup = min(n_warmup, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    print("n_warmup: ", n_warmup)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)  # mixed-precision gradient scaler
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    logger.info('Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= n_warmup:
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, [0, n_warmup], [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, [0, n_warmup],
                                        [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, [0, n_warmup], [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):  # automatic mixed precision
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            """Step the optimizer once every `accumulate` batches; within the first n_warmup
            iterations, accumulate gradually grows from 1 to 4."""
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if final_epoch:  # replot predictions
                    [os.remove(x) for x in glob.glob(str(log_dir / 'test_batch*_pred.jpg')) if os.path.exists(x)]
                results, maps, times = evaluate.test(opt.data,
                                                     batch_size=total_batch_size,
                                                     imgsz=imgsz_test,
                                                     model=ema.ema,
                                                     single_cls=opt.single_cls,
                                                     dataloader=testloader,
                                                     save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                        'x/lr0', 'x/lr1', 'x/lr2']  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ''
        fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None  # upload

    # Finish
    if not opt.evolve:
        plot_results(save_dir=log_dir)  # save as results.png
    logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
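# The scheduler above applies the cosine lambda defined earlier:
# lf(x) = ((1 + cos(x*pi/epochs)) / 2) * (1 - lrf) + lrf, so the lr factor decays
# smoothly from 1.0 at epoch 0 down to lrf at the final epoch. A quick check with
# illustrative values (epochs=100, lrf=0.2, matching the hard-coded 0.8/0.2 variants):
import math

def _cosine_factor(x, epochs=100, lrf=0.2):
    return ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - lrf) + lrf

# _cosine_factor(0)   -> 1.0
# _cosine_factor(50)  -> 0.6
# _cosine_factor(100) -> 0.2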
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint # added by jiangrong if not opt.resume: ckpt['epoch'] = -1 if opt.nas: model = NasModel(opt.cfg, ch=3, nc=nc, nas=opt.nas, nas_stage=opt.nas_stage).to(device) # create else: model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: if opt.nas: model = NasModel(opt.cfg, ch=3, nc=nc, nas=opt.nas, nas_stage=opt.nas_stage).to(device) # create if opt.nas_stage == 3: # TODO, Remapping with BN Statistics on Width-level model.re_organize_middle_weights() else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create if opt.nas and opt.nas_stage > 0: from models.experimental import attempt_load """ P R [email protected] 0.535 0.835 0.742 python test.py \ --weights /workspace/yolov5-v3/yolov5/runs/exp122/weights/best.pt \ --data ./data/baiguang.yaml \ --device 1 \ --conf-thres 0.2 """ teacher_model = attempt_load( "/workspace/yolov5-v3/yolov5/runs/exp259/weights/best.pt", map_location='cuda:1') teacher_model.eval() # Freeze freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, 
lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None and not opt.nas > 0: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}' ) # save previous weights if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode # TheModel = model if cuda and rank == -1 and torch.cuda.device_count() > 1 and not ( opt.nas and opt.nas_stage > 0): # https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html # >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) # >>> output = net(input_var) # input_var can be on any device, including CPU model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank)) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # only runs on process 0 # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move # scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test)) logger.info('Using %g dataloader workers' % dataloader.num_workers) logger.info('Starting training for %g epochs...' % epochs) # torch.autograd.set_detect_anomaly(True) plot_csum = 0 for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- # print(type(targets), targets.size()) # [[_,classid(start from 0), x,y,w,h (0-1)]] # print('---> targets: ', targets) ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * 
lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward ###### jiangrong, turn off mixed precision ########## # with amp.autocast(enabled=cuda): if 1 == 1: pred = model(imgs) # forward, format x(bs,3,20,20,80+1+4) loss, loss_items = compute_loss( pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # z= [] # for i in range(TheModel._modules['model'][-1].nl): # bs, _, ny, nx, _ = pred[i].shape # if TheModel._modules['model'][-1].grid[i].shape[2:4] != pred[i].shape[2:4]: # TheModel._modules['model'][-1].grid[i] = TheModel._modules['model'][-1]._make_grid(nx, ny).to(pred[i].device) # y = pred[i].sigmoid() # y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + TheModel._modules['model'][-1].grid[i].to(pred[i].device)) * TheModel._modules['model'][-1].stride[i] # xy # y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * TheModel._modules['model'][-1].anchor_grid[i] # wh # z.append(y.view(bs, -1, TheModel._modules['model'][-1].no)) # inf_out = torch.cat(z, 1) # teacher_pred = non_max_suppression(inf_out, conf_thres=0.2, iou_thres=0.6, merge=False) # assert len(teacher_pred) == imgs.size()[0] # for i, (det, plot_img) in enumerate(zip(teacher_pred, imgs.detach().cpu().numpy())): # plot_img = np.transpose(plot_img, (1,2,0)) # plot_img = np.uint8(plot_img * 255.0) # plot_csum += 1 # cv2.imwrite('./tmp/{}.jpg'.format(plot_csum), plot_img) # plot_img = cv2.imread('./tmp/{}.jpg'.format(plot_csum)) # for tgt in targets.detach().cpu().numpy(): # _, tgt_class_id, c_x, c_y, c_w, c_h = tgt # c_x, c_y, c_w, c_h = float(c_x), float(c_y), float(c_w), float(c_h) # c_x, c_y, c_w, c_h = c_x * plot_img.shape[1], c_y * plot_img.shape[0], c_w * plot_img.shape[1], c_h * plot_img.shape[0] # cv2.rectangle(plot_img, (int(c_x - c_w / 2), int(c_y - c_h / 2)), (int(c_x + c_w / 2), int(c_y + c_h / 2)), (0,0,255), 2) # print('===> ', int(c_x - c_w / 2), int(c_y - c_h / 2), int(c_x + c_w / 2), int(c_y + c_h / 2), tgt_class_id) # if det is not None: # det = det.detach().cpu().numpy() # for each_b in det: # pass # cv2.rectangle(plot_img, (int(each_b[0]), int(each_b[1])), (int(each_b[2]), int(each_b[3])), (255,0,0), 2) # print('---> ', int(each_b[0]), int(each_b[1]), int(each_b[2]), int(each_b[3]), float(each_b[4]), int(each_b[5])) # cv2.imwrite('./tmp/{}.jpg'.format(plot_csum), plot_img) if opt.nas and opt.nas_stage > 0: teacher_imgs = imgs.to('cuda:1') with torch.no_grad(): inf_out, _ = teacher_model(teacher_imgs) # forward # filter by obj confidence 0.05 teacher_pred = non_max_suppression_teacher( inf_out, conf_thres=0.05, iou_thres=0.6, merge=False ) # (x1, y1, x2, y2, conf, cls) in resized image size teacher_targets = teacher2targets(teacher_pred, teacher_imgs) # print('---> teacher_pred', teacher_pred) # print('---> targets', targets) # print('---> teacher_targets', teacher_targets) # TODO: apply soft label loss teacher_loss, teacher_loss_items = compute_teacher_loss( pred, teacher_targets.to(device), model) # loss scaled by batch_size # print("===> origin loss", loss, loss_items) # print("===> teacher loss", teacher_loss, teacher_loss_items) teacher_loss_scale = 2.0 loss += 
teacher_loss * teacher_loss_scale
                    loss_items += teacher_loss_items * teacher_loss_scale
                    ########## the targets and the teacher predictions are matched, but neither can yet be restored back to the image; TODO ###########

            # Backward
            # scaler.scale(loss).backward()
            loss.backward()

            # Optimize
            if ni % accumulate == 0:
                # scaler.step(optimizer)  # optimizer.step
                # scaler.update()
                optimizer.step()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                if opt.nas:  # only evaluate the super network
                    ema.ema.nas_stage = 0
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)
                if opt.nas:
                    ema.ema.nas_stage = opt.nas_stage

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                        'x/lr0',
'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system( 'gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
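# ModelEMA is used by every variant above. A minimal sketch of the usual
# exponential-moving-average update; the base decay 0.9999 and the 2000-step
# ramp constant are the common defaults, assumed here rather than read from this file:
import copy
import math
import torch

class _EMASketch:
    def __init__(self, model, decay=0.9999):
        self.ema = copy.deepcopy(model).eval()  # shadow copy holding the averaged weights
        self.updates = 0
        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # ramp decay up early in training
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def update(self, model):
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            msd = model.state_dict()
            for k, v in self.ema.state_dict().items():
                if v.dtype.is_floating_point:
                    v *= d                          # keep d of the running average
                    v += (1. - d) * msd[k].detach() # blend in (1 - d) of the live weights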
def train(opts_dict, args): train_epoch = int(opts_dict['epoch']) batch_size = int(opts_dict['batch_size']) pretrain_weight = opts_dict['pretrained'] num_workers = int(opts_dict['workers']) data_dir = opts_dict['data_dir'] crop_size = int(opts_dict['crop_size']) learn_rate = float(opts_dict['lr']) # if use_cuda: init_seeds() train_dir = opts_dict['train_dir'] val_dir = opts_dict['val_dir'] class_name = opts_dict['class_name'] class_name = class_name.split(",") freeze_layer = opts_dict["freeze_layer"] checkpoint_dir = opts_dict["checkpoint"] resume = opts_dict["resume"] try: os.stat(checkpoint_dir) except: os.mkdir(checkpoint_dir) embedding_log = 5 ###---------------------------------------------------------------### # 1.load data ## reference https://github.com/tanglang96/DataLoaders_DALI pip_train = HybridTrainPipe(batch_size=batch_size, num_threads=num_workers, device_id=args.local_rank, data_dir=data_dir + '/train', crop=crop_size, shard_id=args.local_rank, num_shards=args.world_size) pip_val = HybridValPipe(batch_size=batch_size, num_threads=num_workers, device_id=args.local_rank, data_dir=data_dir + '/val', crop=crop_size, size=crop_size, shard_id=args.local_rank, num_shards=args.world_size) pip_train.build() pip_val.build() # train_loader = DALIDataloader(pipeline=pip_train, size=SHIP_IMAGES_NUM_TRAIN, batch_size=batch_size, # onehot_label=True) # val_loader = DALIDataloader(pipeline=pip_val, size=SHIP_IMAGES_NUM_VAL, batch_size=batch_size, # onehot_label=True) train_loader = DALIClassificationIterator(pip_train, reader_name="Reader", fill_last_batch=True) val_loader = DALIClassificationIterator(pip_val, reader_name="Reader", fill_last_batch=False) # print("[DALI] train dataloader length: %d" % len(train_loader))## len(train_loader)*batch_size = total_image //8 # print("[DALI] val dataloader length: %d" % len(val_loader)) ## len(train_loader)*batch_size = total_image //8 # print('[DALI] start iterate train dataloader') # time_start = time.time() # for i, data in enumerate(train_loader): # Using it just like PyTorch dataloader # images = data[0].cuda(non_blocking=True) # labels = data[1].cuda(non_blocking=True) # time_end = time.time() # train_time = time_end - time_start # print('[DALI] iteration time: %fs [train]' % (train_time)) ###--------------------Pytorch dataloader test------------------### # transform_train = trns.Compose([ # trns.RandomResizedCrop(crop_size, scale=(0.08, 1.25)), # trns.RandomHorizontalFlip(), # trns.ToTensor(), # trns.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # ]) # train_dst = datasets.ImageFolder(data_dir + '/train', transform_train) # train_loader = torch.utils.data.DataLoader(train_dst, batch_size=batch_size, shuffle=True, pin_memory=True, # num_workers=num_workers) # print("[PyTorch] train dataloader length: %d" % len(train_loader)) # print('[PyTorch] start iterate train dataloader') # time_start = time.time() # for i, data in enumerate(train_loader): # images = data[0].cuda(non_blocking=True) # labels = data[1].cuda(non_blocking=True) # time_end = time.time() # train_time = time_end - time_start # print('[PyTorch] iteration time: %fs [train]' % (train_time) ###---------------------------------------------------------------### # 2.load model ###vgg16 model = models.vgg16(pretrained=False).cuda() model.load_state_dict(torch.load("./pretrained_weight/vgg16.pth")) num_features = model.classifier[6].in_features features = list(model.classifier.children())[:-1] # Remove last layer features.extend([ nn.Linear(num_features, 1024), 

    ###--------------------PyTorch dataloader test---------------------###
    # transform_train = trns.Compose([
    #     trns.RandomResizedCrop(crop_size, scale=(0.08, 1.25)),
    #     trns.RandomHorizontalFlip(),
    #     trns.ToTensor(),
    #     trns.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # ])
    # train_dst = datasets.ImageFolder(data_dir + '/train', transform_train)
    # train_loader = torch.utils.data.DataLoader(train_dst, batch_size=batch_size, shuffle=True, pin_memory=True,
    #                                            num_workers=num_workers)
    # print("[PyTorch] train dataloader length: %d" % len(train_loader))
    # print('[PyTorch] start iterate train dataloader')
    # time_start = time.time()
    # for i, data in enumerate(train_loader):
    #     images = data[0].cuda(non_blocking=True)
    #     labels = data[1].cuda(non_blocking=True)
    # time_end = time.time()
    # train_time = time_end - time_start
    # print('[PyTorch] iteration time: %fs [train]' % train_time)

    ###---------------------------------------------------------------###
    # 2. load model
    # VGG16 backbone
    model = models.vgg16(pretrained=False).cuda()
    model.load_state_dict(torch.load("./pretrained_weight/vgg16.pth"))
    num_features = model.classifier[6].in_features
    features = list(model.classifier.children())[:-1]  # remove the last layer
    features.extend([
        nn.Linear(num_features, 1024),
        nn.ReLU(inplace=True),
        nn.Linear(1024, 256),
        nn.ReLU(inplace=True),
        nn.Linear(256, 64),
        nn.ReLU(inplace=True),
        nn.Linear(64, len(class_name))
    ])  # add our classifier head with len(class_name) outputs
    model.classifier = nn.Sequential(*features)  # replace the model classifier

    # model = resnest50(num_classes=1000)
    # model = model.cuda()
    #
    # if pretrain_weight:  # if a pretrained model is available
    #     model.load_state_dict(torch.load(pretrain_weight))
    #
    # num_features = model.fc.in_features
    # features = list(model.fc.children())[:-1]  # remove the last layer
    # features.extend([nn.Linear(num_features, 512), nn.ReLU(inplace=True), nn.Linear(512, 128),
    #                  nn.ReLU(inplace=True), nn.Linear(128, len(class_name))])
    # model.fc = nn.Sequential(*features)  # replace the model classifier
    #
    # if resume:
    #     num_features = model.fc.in_features
    #     features = list(model.fc.children())[:-1]  # remove the last layer
    #     features.extend([nn.Linear(num_features, 512), nn.ReLU(inplace=True), nn.Linear(512, 128),
    #                      nn.ReLU(inplace=True), nn.Linear(128, len(class_name))])
    #     model.fc = nn.Sequential(*features)  # replace the model classifier
    #     model.load_state_dict(torch.load(resume))

    if freeze_layer:  # freeze the first `freeze_layer` parameter tensors
        ct = 0
        for name, param in model.named_parameters():
            ct += 1
            if ct < int(freeze_layer):
                param.requires_grad_(False)
            print(ct, name, param.requires_grad)

    if used_multi_gpu:  # module-level flag set elsewhere in this file
        model = parallel.convert_syncbn_model(model)  # apex synchronized BatchNorm
    model = model.cuda()
    # model.to(device)
    # summary(model, (3, 224, 224))  # if you need a model summary

    ###---------------------------------------------------------------###
    # 3. set optimizer
    # optimizer = optim.Adam(model.parameters(), lr=learn_rate)
    # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (epoch + 1))
    args.lr = args.lr * float(int(opts_dict['batch_size']) * args.world_size) / 256.  # linear scaling rule (base batch size 256)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    scheduler_steplr = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=5,
                                              after_scheduler=scheduler_steplr)
    optimizer.zero_grad()
    optimizer.step()  # dummy step so the warmup scheduler can set the first lr
    print("initial_learning_rate:", optimizer.defaults['lr'])

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:  # module-level flag set elsewhere in this file
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=args.loss_scale)
    if used_multi_gpu:
        model = DDP(model, delay_allreduce=True)  # apex DistributedDataParallel
        # model = DDP(model)

    criterion = CutMixCrossEntropyLoss(True)  # CutMix loss
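
    # Illustrative arithmetic for the linear LR scaling rule above (a sketch,
    # not project code): with a base lr of 0.1 per 256 images, batch_size=64
    # and world_size=8 give an effective batch of 512, so the lr doubles to
    # 0.2; GradualWarmupScheduler then ramps from 0 up to that lr over the
    # first 5 epochs before handing off to StepLR.
    # base_lr, batch, world = 0.1, 64, 8
    # scaled_lr = base_lr * batch * world / 256.  # -> 0.2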

    ###---------------------------------------------------------------###
    # 4. training
    writer = SummaryWriter()
    top1 = AverageMeter()
    top3 = AverageMeter()
    e = 0
    for epoch in range(1, train_epoch + 1):
        epoch_loss = 0.0
        model.train()
        scheduler_warmup.step(epoch)  # pytorch-gradual-warmup-lr https://github.com/ildoonet/pytorch-gradual-warmup-lr
        # print(epoch, optimizer.param_groups[0]['lr'])
        for batch_idx, data in enumerate(train_loader):
            # n_iter = (epoch * len(train_loader)) + batch_idx
            # if you want to dump dataloader images:
            # data_2 = data[0][1].cpu().numpy()
            # data_2 = data_2.transpose(1, 2, 0)
            # data_2 -= data_2.min()
            # data_2 /= data_2.max()
            # data_2 *= 255
            # cv.imwrite("./data_aug_sample/{}.jpeg".format(batch_idx), data_2)

            # images = data[0].cuda(non_blocking=True)
            # labels = data[1].cuda(non_blocking=True)
            images = data[0]["data"]
            labels = data[0]["label"].squeeze().cuda().long()
            output = model(images)
            # loss = F.cross_entropy(output, labels)
            loss = criterion(output, labels)  # CutMix loss
            optimizer.zero_grad()
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * images.size(0)
            # if batch_idx % embedding_log == 0:
            #     out = torch.cat((output.data, torch.ones(len(output), 1, device=device)), 1)
            #     writer.add_embedding(out, metadata=labels.data, label_img=images.data, global_step=n_iter)

        if epoch % 10 == 0:  # evaluate accuracy every 10 epochs
            model.eval()
            for iii, val_data in enumerate(val_loader):
                # val_images = val_data[0].cuda(non_blocking=True)
                # val_labels = val_data[1].cuda(non_blocking=True)
                val_images = val_data[0]["data"]
                val_labels = val_data[0]["label"].squeeze().cuda().long()
                with torch.no_grad():
                    val_output = model(val_images)
                acc1, acc2 = accuracy_multi_gpu(val_output.data, val_labels, topk=(1, 3))
                if used_multi_gpu:
                    acc1 = reduce_tensor(acc1)
                    acc2 = reduce_tensor(acc2)
                top1.update(to_python_float(acc1), val_images.size(0))
                top3.update(to_python_float(acc2), val_images.size(0))
                # out = torch.cat((val_output.data, torch.ones(len(val_output), 1, device=device)), 1)
                # writer.add_embedding(out, metadata=val_labels.data, label_img=val_images.data,
                #                      global_step=(epoch * len(val_loader)) + iii)
            if args.local_rank == 0:
                print('Top1 Acc: %.3f | Top3 Acc: %.3f ' % (top1.avg, top3.avg))
                e = int(epoch)
                torch.save(model.state_dict(), "{}/{}.pt".format(checkpoint_dir, e))
            val_loader.reset()  # DALI iterators must be reset after each full pass

        torch.save(model.state_dict(), "{}/last.pt".format(checkpoint_dir))
        # if epoch % 25 == 0:
        #     scheduler.step()
        print("epoch:{}, loss:{:.6f}".format(epoch, epoch_loss))

        # visualization
        writer.add_scalar('./tensorboard/acc', top1.avg, epoch)
        writer.add_scalar('./tensorboard/total_loss', epoch_loss, epoch)
        writer.add_scalar('./tensorboard/lr', optimizer.param_groups[0]['lr'], epoch)
        train_loader.reset()  # DALI iterators must be reset after each full pass

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
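
# reduce_tensor() and to_python_float() above come from the apex examples; a
# minimal sketch of the usual reduce_tensor implementation is given here for
# reference (an assumption, not this project's exact code). Averaging by world
# size is what makes the per-GPU accuracies comparable to single-GPU numbers.
def reduce_tensor_sketch(tensor):
    import torch.distributed as dist
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum across all processes
    rt /= dist.get_world_size()                # average
    return rt
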
        init_weights=opt.init_weights,
    )

    # Evolve hyperparameters (optional)
    if opt.evolve:
        best_fitness = results[2]  # use mAP for fitness

        # Write mutation results
        print_mutation(hyp, results)

        gen = 50  # generations to evolve
        for _ in range(gen):
            # Mutate hyperparameters
            old_hyp = hyp.copy()
            torch_utils.init_seeds(seed=int(time.time()))
            s = [.2, .2, .2, .2, .2, .3, .2, .2, .02, .3]  # sigmas
            for i, k in enumerate(hyp.keys()):
                x = (np.random.randn(1) * s[i] + 1) ** 1.1
                # plt.hist(x.ravel(), 100)
                hyp[k] = hyp[k] * float(x)  # vary by about 30% 1sigma

            # Clip to limits
            keys = ['iou_t', 'momentum', 'weight_decay']
            limits = [(0, 0.90), (0.80, 0.95), (0, 0.01)]
            for k, v in zip(keys, limits):
                hyp[k] = np.clip(hyp[k], v[0], v[1])

            # Normalize loss components (sum to 1)
            keys = ['xy', 'wh', 'cls', 'conf']
            s = sum([v for k, v in hyp.items() if k in keys])
            for k in keys:
                hyp[k] /= s
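
# Illustration (assumption-level, not part of the training code) of the
# mutation magnitude used above: the multiplicative noise (randn * s + 1) ** 1.1
# with s = 0.2 gives roughly a 20-25% one-sigma variation around 1.0.
def mutation_demo():
    import numpy as np
    np.random.seed(0)
    x = (np.random.randn(10000) * 0.2 + 1) ** 1.1
    print('mean %.3f, std %.3f' % (x.mean(), x.std()))  # approx. mean 1.00, std 0.22
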
def train(hyp, opt, device, tb_writer=None):
    logger.info(f"Hyperparameters {hyp}")
    log_dir = (
        Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / "evolve"
    )  # logging directory
    wdir = log_dir / "weights"  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / "last.pt"
    best = wdir / "best.pt"
    results_file = str(log_dir / "results.txt")
    epochs, batch_size, total_batch_size, weights, rank = (
        opt.epochs,
        opt.batch_size,
        opt.total_batch_size,
        opt.weights,
        opt.global_rank,
    )

    # Save run settings
    with open(log_dir / "hyp.yaml", "w") as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / "opt.yaml", "w") as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != "cpu"
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict["train"]
    test_path = train_path.replace("train", "val")
    nc, names = (
        (1, ["item"]) if opt.single_cls else (int(data_dict["nc"]), data_dict["names"])
    )  # number classes, names
    assert len(names) == nc, "%g names found for nc=%g dataset in %s" % (
        len(names),
        nc,
        opt.data,
    )  # check

    # Model
    pretrained = weights.endswith(".pt")
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get("anchors"):
            ckpt["model"].yaml["anchors"] = round(hyp["anchors"])  # force autoanchor
        model = Model(opt.cfg or ckpt["model"].yaml, ch=3, nc=nc).to(device)  # create
        exclude = ["anchor"] if opt.cfg or hyp.get("anchors") else []  # exclude keys
        state_dict = ckpt["model"].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info(
            "Transferred %g/%g items from %s"
            % (len(state_dict), len(model.state_dict()), weights)
        )  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = [""]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print("freezing %s" % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp["weight_decay"] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if ".bias" in k:
            pg2.append(v)  # biases
        elif ".weight" in k and ".bn" not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp["lr0"], betas=(hyp["momentum"], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp["lr0"], momentum=hyp["momentum"], nesterov=True)
    optimizer.add_param_group({"params": pg1, "weight_decay": hyp["weight_decay"]})  # add pg1 with weight_decay
    optimizer.add_param_group({"params": pg2})  # add pg2 (biases)
    logger.info(
        "Optimizer groups: %g .bias, %g conv.weight, %g other"
        % (len(pg2), len(pg1), len(pg0))
    )
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp["lrf"]) + hyp["lrf"]  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
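    # Endpoint check for the cosine lambda above (illustrative): since cos(0) = 1
    # and cos(pi) = -1, lf(0) == 1.0 (full lr0) and lf(epochs) == hyp["lrf"]
    # (the final lr fraction).
    # assert abs(lf(0) - 1.0) < 1e-9 and abs(lf(epochs) - hyp["lrf"]) < 1e-9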
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt["optimizer"] is not None:
            optimizer.load_state_dict(ckpt["optimizer"])
            best_fitness = ckpt["best_fitness"]

        # Results
        if ckpt.get("training_results") is not None:
            with open(results_file, "w") as file:
                file.write(ckpt["training_results"])  # write results.txt

        # Epochs
        start_epoch = ckpt["epoch"] + 1
        if opt.resume:
            assert start_epoch > 0, "%s training to %g epochs is finished, nothing to resume." % (
                weights,
                epochs,
            )
            shutil.copytree(wdir, wdir.parent / f"weights_backup_epoch{start_epoch - 1}")  # save previous weights
        if epochs < start_epoch:
            logger.info(
                "%s has been trained for %g epochs. Fine-tuning for %g additional epochs."
                % (weights, ckpt["epoch"], epochs)
            )
            epochs += ckpt["epoch"]  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info("Using SyncBatchNorm()")

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(
        train_path,
        imgsz,
        batch_size,
        gs,
        opt,
        hyp=hyp,
        augment=True,
        cache=opt.cache_images,
        rect=opt.rect,
        rank=rank,
        world_size=opt.world_size,
        workers=opt.workers,
    )
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, "Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g" % (
        mlc,
        nc,
        opt.data,
        nc - 1,
    )

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(
            test_path,
            imgsz_test,
            total_batch_size,
            gs,
            opt,
            hyp=hyp,
            augment=False,
            cache=opt.cache_images,
            rect=True,
            rank=-1,
            world_size=opt.world_size,
            workers=opt.workers,
        )[0]  # testloader

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            plot_labels(labels, save_dir=log_dir)
            if tb_writer:
                # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
                tb_writer.add_histogram("classes", c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz)

    # Model parameters
    hyp["cls"] *= nc / 80.0  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info(
        "Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n"
        "Starting training for %g epochs..."
        % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)
    )
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(
            ("\n" + "%10s" * 8)
            % ("Epoch", "gpu_mem", "GIoU", "obj", "cls", "total", "targets", "img_size")
        )
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x["lr"] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x["initial_lr"] * lf(epoch)])
                    if "momentum" in x:
                        x["momentum"] = np.interp(ni, xi, [0.9, hyp["momentum"]])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode="bilinear", align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = "%.3gG" % (torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0)  # (GB)
                s = ("%10s" * 2 + "%10.4g" * 6) % (
                    "%g/%g" % (epoch, epochs - 1),
                    mem,
                    *mloss,
                    targets.shape[0],
                    imgs.shape[-1],
                )
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ("train_batch%g.jpg" % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats="HWC", global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x["lr"] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=["yaml", "nc", "hyp", "gr", "names", "stride"])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # calculate mAP
                if final_epoch:  # replot predictions
                    [os.remove(x) for x in glob.glob(str(log_dir / "test_batch*_pred.jpg")) if os.path.exists(x)]
                results, maps, times = test.test(
                    opt.data,
                    batch_size=total_batch_size,
                    imgsz=imgsz_test,
                    model=ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=log_dir,
                )

            # Write
            with open(results_file, "a") as f:
                f.write(s + "%10.4g" * 7 % results + "\n")  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system("gsutil cp %s gs://%s/results/results%s.txt" % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = [
                    "train/giou_loss",
                    "train/obj_loss",
                    "train/cls_loss",  # train loss
                    "metrics/precision",
                    "metrics/recall",
                    "metrics/mAP_0.5",
                    "metrics/mAP_0.5:0.95",
                    "val/giou_loss",
                    "val/obj_loss",
                    "val/cls_loss",  # val loss
                    "x/lr0",
                    "x/lr1",
                    "x/lr2",
                ]  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, "r") as f:  # create checkpoint
                    ckpt = {
                        "epoch": epoch,
                        "best_fitness": best_fitness,
                        "training_results": f.read(),
                        "model": ema.ema,
                        "optimizer": None if final_epoch else optimizer.state_dict(),
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = opt.name if opt.name.isnumeric() else ""
        fresults, flast, fbest = (
            log_dir / f"results{n}.txt",
            wdir / f"last{n}.pt",
            wdir / f"best{n}.pt",
        )
        for f1, f2 in zip(
            [wdir / "last.pt", wdir / "best.pt", results_file],
            [flast, fbest, fresults],
        ):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith(".pt"):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system("gsutil cp %s gs://%s/weights" % (f2, opt.bucket)) if opt.bucket else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info(
            "%g epochs completed in %.3f hours.\n"
            % (epoch - start_epoch + 1, (time.time() - t0) / 3600)
        )

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
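
# strip_optimizer() used above is imported from the project's utils; a minimal
# sketch of what such a helper typically does is included for reference
# (assumed behaviour, not the project's exact implementation).
def strip_optimizer_sketch(f):
    import torch
    ckpt = torch.load(f, map_location="cpu")
    ckpt["optimizer"] = None  # drop optimizer state
    ckpt["model"].half()  # FP16 weights to shrink the checkpoint
    for p in ckpt["model"].parameters():
        p.requires_grad = False
    torch.save(ckpt, f)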