def train(hyp, opt, device, tb_writer=None): print(f'Hyperparameters {hyp}') log_dir = tb_writer.log_dir if tb_writer else 'runs/evolve' # run directory wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = log_dir + os.sep + 'results.txt' epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank # Save run settings with open(Path(log_dir) / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(Path(log_dir) / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict # train_path = data_dict['train'] train_path = f'{opt.trainset_path}/images/train' # test_path = data_dict['val'] test_path = f'{opt.trainset_path}/images/test' nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Remove previous results if rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model if opt.not_use_SE: model = Model(opt.cfg, nc=nc).to(device) else: model = ModelSE(opt.cfg, nc=nc).to(device) # print(model) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. 
# in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Load Model with torch_distributed_zero_first(rank): attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: exclude = ['anchor'] # exclude keys ckpt['model'] = {k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and not any(x in k for x in exclude) and model.state_dict()[k].shape == v.shape} model.load_state_dict(ckpt['model'], strict=False) print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights)) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: # model = torch.nn.DataParallel(model) model = torch.nn.DataParallel(model, device_ids=[0, 1]) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[rank], output_device=rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Testloaderc if rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred = model(imgs) # print([x.shape for x in pred]) # [1, 3, 76, 76, 25] [1, 3, 38, 38, 25] [1, 3, 19, 19, 25]) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # if not torch.isfinite(loss): # print('WARNING: non-finite loss, ending training ', loss_items) # return results # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f,
result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
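# --- Hedged sketch: the accumulate-then-step pattern used by the train() above, in isolation. ---
# The function accumulates gradients over several small batches to approximate a nominal batch
# size of 64 (nbs) and wraps the forward pass in torch.cuda.amp. Toy model and random data are
# stand-ins; nbs=64 and the step-every-`accumulate`-batches rule are taken from the snippet.
import torch
import torch.nn as nn
from torch.cuda import amp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cuda = device.type != 'cpu'
model = nn.Linear(10, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scaler = amp.GradScaler(enabled=cuda)

nbs = 64                                       # nominal batch size the hyperparameters assume
batch_size = 16                                # what actually fits on the device
accumulate = max(round(nbs / batch_size), 1)   # optimizer step every `accumulate` batches

optimizer.zero_grad()
for ni in range(32):                           # ni = integrated batch counter
    imgs = torch.randn(batch_size, 10, device=device)
    targets = torch.randn(batch_size, 1, device=device)
    with amp.autocast(enabled=cuda):           # mixed-precision forward
        loss = nn.functional.mse_loss(model(imgs), targets)
    scaler.scale(loss).backward()              # gradients keep accumulating
    if ni % accumulate == 0:                   # step and reset only every `accumulate` batches
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()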
def train(): cfg = opt.cfg data = opt.data img_size, img_size_test = opt.img_size if len( opt.img_size) == 2 else opt.img_size * 2 # train, test sizes epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights # Initialize init_seeds() if opt.multi_scale: img_sz_min = round(img_size / 32 / 1.5) img_sz_max = round(img_size / 32 * 1.5) img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes # Remove previous results for f in glob.glob('*_batch*.png') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) del pg0, pg1, pg2 start_epoch = 0 best_fitness = 0.0 attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. chkpt = torch.load(weights, map_location=device) # load model try: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. 
load_darknet_weights(model, weights) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://github.com/ultralytics/yolov3/issues/238 lf = lambda x: (1 + math.cos(x * math.pi / epochs) ) / 2 # cosine https://arxiv.org/pdf/1812.01187.pdf scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1) # scheduler = lr_scheduler.MultiStepLR(optimizer, [round(epochs * x) for x in [0.8, 0.9]], 0.1, start_epoch - 1) # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, '.-', label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels( test_path, img_size_test, batch_size * 2, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size * 2, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 0.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # Model EMA # ema = torch_utils.ModelEMA(model, decay=0.9998) # Start training nb = len(dataloader) # number of batches prebias = start_epoch == 0 maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Prebias if prebias: ne = 3 # number of prebias epochs ps = 0.1, 0.9 # prebias settings (lr=0.1, momentum=0.9) if epoch == ne: ps = hyp['lr0'], hyp['momentum'] # normal training settings model.gr = 1.0 # giou loss ratio (obj_loss = giou) print_model_biases(model) prebias = False # Bias optimizer settings optimizer.param_groups[2]['lr'] = ps[0] if optimizer.param_groups[2].get( 'momentum') is not None: # for SGD but not Adam optimizer.param_groups[2]['momentum'] = ps[1] # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Hyperparameter Burn-in n_burn = 200 # number of burn-in batches if ni <= n_burn: # g = (ni / n_burn) ** 2 # gain for x in model.named_modules( ): # initial stats may be poor, wait to track if x[0].endswith('BatchNorm2d'): x[1].track_running_stats = ni == n_burn # for x in optimizer.param_groups: # x['lr'] = x['initial_lr'] * lf(epoch) * g # gain rises from 0 - 1 # if 'momentum' in x: # x['momentum'] = hyp['momentum'] * g # Plot images with bounding boxes if ni < 1: f = 'train_batch%g.png' % i # filename plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') # Multi-Scale training if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / 32.) 
* 32 for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Scale loss by nominal batch_size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize accumulated gradient if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() # ema.update(model) # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) pbar.set_description(s) # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results # ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([ x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data'] ]) and model.nc == 80 results, maps = test.test( cfg, data, batch_size=batch_size * 2, img_size=img_size_test, model=model, conf_thres=0.001 if final_epoch else 0.01, # 0.001 for best mAP, 0.01 for speed iou_thres=0.6, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader) # Write epoch results with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Write Tensorboard results if tb_writer: x = list(mloss) + list(results) titles = [ 'GIoU', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' ] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save training results save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': model.module.state_dict() if hasattr(model, 'module') else model.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) # Save best checkpoint if best_fitness == fi: torch.save(chkpt, best) # Save backup every 10 epochs (optional) # if epoch > 0 and epoch % 10 == 0: # torch.save(chkpt, wdir + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n os.rename('results.txt', fresults) os.rename(wdir + 'last.pt', wdir + flast) if os.path.exists(wdir + 'last.pt') else None os.rename(wdir + 'best.pt', wdir + fbest) if os.path.exists(wdir + 'best.pt') else None if opt.bucket: # save to cloud 
os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket)) os.system('gsutil cp %s gs://%s/weights' % (wdir + flast, opt.bucket)) # os.system('gsutil cp %s gs://%s/weights' % (wdir + fbest, opt.bucket)) if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
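# --- Hedged sketch: the cosine LambdaLR schedule both train() variants above rely on. ---
# The first variant keeps a floor of 0.2 ("* 0.8 + 0.2"), the second decays to zero.
# Toy optimizer and epoch count are illustrative only.
import math
import torch
from torch.optim import lr_scheduler

epochs = 10
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * 0.8 + 0.2  # multiplier: 1.0 -> 0.2
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

for epoch in range(epochs):
    optimizer.step()      # placeholder for one epoch of training
    scheduler.step()      # LambdaLR sets lr = base_lr * lf(epoch)
    print(epoch, optimizer.param_groups[0]['lr'])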
def cleanup():
    dist.destroy_process_group()
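# --- Hedged sketch: destroy_process_group() only makes sense paired with an earlier
# init_process_group() in the same process. Minimal single-process pairing; the gloo
# backend and the MASTER_ADDR/MASTER_PORT values are placeholders so it runs standalone.
import os
import torch.distributed as dist

def setup(rank: int = 0, world_size: int = 1) -> None:
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')   # real jobs get these from the launcher
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group(backend='gloo', rank=rank, world_size=world_size)

def teardown() -> None:
    dist.destroy_process_group()

if __name__ == '__main__':
    setup()
    # ... distributed work goes here ...
    teardown()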
def run_test_multiple_groups(rank, world_size, tempfile_name): # Only work with the even ranks, to check that the global_rank indexing is properly used dist_init(rank=rank, world_size=world_size, tempfile_name=tempfile_name, backend="gloo") sub_group_ranks = [0, 2, 4] process_group = torch.distributed.new_group(ranks=sub_group_ranks, backend="gloo") # Make sure that all the ranks get different training data # So that the sync check in between their models is meaningful torch.manual_seed(rank) np.random.seed(rank) # Standard deep learning setup device = "cpu" epochs, batch, input_width, hidden, target_width = 5, 3, 20, 10, 5 loss_fn = torch.nn.L1Loss().to(device) def check(optimizer): # Just run a couple of epochs, check that the model is properly updated for _ in range(epochs): target = torch.rand((batch, target_width), device=device) inputs = torch.rand((batch, input_width), device=device) def closure(): optimizer.zero_grad() output = model(inputs) loss = loss_fn(output, target) loss /= world_size loss.backward() dist.all_reduce(loss, group=process_group ) # Not strictly needed for the test below return loss _ = optimizer.step(closure=closure) # Check that all the params are the same on all ranks for pg in optimizer.param_groups: for p in pg["params"]: receptacle = [p.clone() for _ in sub_group_ranks ] if rank == 0 else [] dist.gather(p, receptacle, dst=0, group=process_group) if rank == 0: for sync_p in receptacle[1:]: assert torch.all( torch.eq(receptacle[0], sync_p) ), "Models differ in between ranks {} - {}".format( torch.norm(receptacle[0]), torch.norm(sync_p)) if rank in sub_group_ranks: # Model fitting in the broadcast bucket model = torch.nn.Sequential(torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, target_width)).to(device) # With SGD, Momentum is required to get a state to shard optimizer = optim.OSS(model.parameters(), lr=0.1, momentum=0.99, group=process_group, broadcast_buffer_size=2**20) check(optimizer) # Model not-fitting in the broadcast bucket model = torch.nn.Sequential(torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, target_width)).to(device) # With SGD, Momentum is required to get a state to shard optimizer = optim.OSS(model.parameters(), lr=0.1, momentum=0.99, group=process_group, broadcast_buffer_size=0) check(optimizer) dist.destroy_process_group(process_group)
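# --- Hedged sketch: the closure form of optimizer.step() that the OSS test above exercises. ---
# The closure protocol is plain PyTorch: any built-in optimizer accepts a callable that re-runs
# forward/backward and returns the loss. Non-distributed toy version of the check() helper.
import torch

model = torch.nn.Sequential(torch.nn.Linear(20, 10), torch.nn.Linear(10, 5))
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.99)
loss_fn = torch.nn.L1Loss()
inputs, target = torch.rand(3, 20), torch.rand(3, 5)

def closure():
    # Must zero grads, recompute the loss, backprop, and return the loss tensor;
    # the optimizer may invoke it more than once.
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), target)
    loss.backward()
    return loss

loss = optimizer.step(closure=closure)
print(float(loss))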
def run_state_dict_distributed(rank, world_size, tempfile_name): dist_init(rank, world_size, tempfile_name, backend="gloo") device = torch.device(rank) torch.manual_seed( rank) # make sure that the different rank get different data # Setup two problems in parallel, we'll make sure that the second track (with save/load) follows the first one(untouched) # We split the model in two to test the multiple param groups support batch, input_width, hidden, target_width = 3, 20, 10, 5 target = torch.rand((batch, target_width), device=device) inputs = torch.rand((batch, input_width), device=device) model_oss1 = torch.nn.Sequential(torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, hidden)).to(device) head_oss1 = torch.nn.Linear(hidden, target_width).to(device) model_oss2 = copy.deepcopy(model_oss1) head_oss2 = copy.deepcopy(head_oss1) # For this test the gradients are (all) reduced in the same way in between the torch reference and fairscale. # Normally OSS would use ShardedDDP and only reduce to the proper rank, but this does not change the # gradient norm computation from OSS and adds a dependency. # to keep the comparison apples-to-apples DDP is used in both cases model_oss1 = DDP( module=model_oss1, device_ids=[rank], ) sharded_optimizer1 = optim.OSS(model_oss1.parameters(), lr=0.1, momentum=0.99) sharded_optimizer1.add_param_group({"params": head_oss1.parameters()}) model_oss2 = DDP( module=model_oss2, device_ids=[rank], ) sharded_optimizer2 = optim.OSS(model_oss2.parameters(), lr=0.1, momentum=0.99) sharded_optimizer2.add_param_group({"params": head_oss2.parameters()}) loss_fn = torch.nn.L1Loss().to(device) def run_grad_step(model, head, optimizer): model.zero_grad() outputs = head(model(inputs)) # pull the current state, broadcast it to all ranks sharded_optimizer2.consolidate_state_dict( recipient_rank=RECIPIENT_RANK) # all ranks state_dict2 = sharded_optimizer2.state_dict( ) if rank == RECIPIENT_RANK else {} state_dict2 = sync_object_ranks(state_dict2, RECIPIENT_RANK, device) # re-create a new optimizer from scratch with absurd values, load the previous state sharded_optimizer2 = optim.OSS(model_oss2.parameters(), lr=1e6, momentum=0.0001) sharded_optimizer2.add_param_group({"params": head_oss2.parameters()}) sharded_optimizer2.load_state_dict(state_dict2) check_same_model_params( model_oss1, model_oss2, "parameters of the two identical models have diverged (before any steps)" ) # now take a step and check that parameters are equal run_grad_step(model_oss1, head_oss1, sharded_optimizer1) run_grad_step(model_oss2, head_oss2, sharded_optimizer2) check_same_model_params( model_oss1, model_oss2, "parameters of the two identical models have diverged (after stepping)" ) # save the state dict for one model only, then distribute to the other ranks sharded_optimizer2.consolidate_state_dict( recipient_rank=RECIPIENT_RANK) # all ranks state_dict2 = sharded_optimizer2.state_dict( ) if rank == RECIPIENT_RANK else {} state_dict2 = sync_object_ranks(state_dict2, RECIPIENT_RANK, device) # Check that the pulled state and the .param_groups attribute are in sync for replica in range(len(state_dict2["param_groups"])): for k in state_dict2["param_groups"][replica].keys(): if k != "params": assert state_dict2["param_groups"][replica][ k] == sharded_optimizer2.param_groups[0][k] # take a step run_grad_step(model_oss1, head_oss1, sharded_optimizer1) run_grad_step(model_oss2, head_oss2, sharded_optimizer2) check_same_model_params( model_oss1, model_oss2, "parameters of the two identical models have 
diverged (after consolidating)" ) # save again for one rank, then distribute to the others sharded_optimizer2.consolidate_state_dict( recipient_rank=RECIPIENT_RANK) # all ranks state_dict2 = sharded_optimizer2.state_dict( ) if rank == RECIPIENT_RANK else {} state_dict2 = sync_object_ranks(state_dict2, RECIPIENT_RANK, device) # reload the state_dict sharded_optimizer2 = optim.OSS(model_oss2.parameters(), lr=0.1, momentum=0.99) sharded_optimizer2.add_param_group({"params": head_oss2.parameters()}) sharded_optimizer2.load_state_dict(state_dict2) # take a step run_grad_step(model_oss1, head_oss1, sharded_optimizer1) run_grad_step(model_oss2, head_oss2, sharded_optimizer2) check_same_model_params( model_oss1, model_oss2, "parameters of the two identical models have diverged (after reloading)" ) dist.destroy_process_group()
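# --- Hedged sketch: the optimizer state_dict round-trip that run_state_dict_distributed()
# verifies, stripped of sharding and cross-rank broadcast. Plain SGD, single process. ---
import torch

model = torch.nn.Linear(20, 5)
loss_fn = torch.nn.L1Loss()
inputs, target = torch.rand(3, 20), torch.rand(3, 5)

def grad_step(optimizer):
    optimizer.zero_grad()
    loss_fn(model(inputs), target).backward()
    optimizer.step()

opt1 = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.99)
grad_step(opt1)                        # creates momentum buffers in opt1.state

state = opt1.state_dict()              # snapshot: hyperparameters + per-parameter state

opt2 = torch.optim.SGD(model.parameters(), lr=1e6, momentum=0.0001)  # absurd values on purpose
opt2.load_state_dict(state)            # restores lr, momentum and the momentum buffers

assert opt2.param_groups[0]['lr'] == 0.1
assert opt2.param_groups[0]['momentum'] == 0.99
grad_step(opt2)                        # training resumes from the restored state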
def destroy_process_group():
    dist.destroy_process_group()
def train(hyp): cfg = opt.cfg data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = max(round(64 / batch_size), 1) # accumulate n times before optimizer update (bs 64) weights = opt.weights # initial training weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) # Image Sizes gs = 32 # (pixels) grid size assert math.fmod( imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs) img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0']) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 start_epoch = 0 best_fitness = 0.0 ckpt = torch.load(weights, map_location=device) # load model try: ckpt['model'] = { k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(ckpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (opt.weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] del ckpt if opt.freeze_layers: output_layer_indices = [ idx - 1 for idx, module in enumerate(model.module_list) if isinstance(module, YOLOLayer) ] freeze_layer_indices = [ x for x in range(len(model.module_list)) if (x not in output_layer_indices) and ( x - 1 not in output_layer_indices) ] for idx in freeze_layer_indices: for parameter in model.module_list[idx].parameters(): parameter.requires_grad_(False) lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.95 + 0.05 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, hyp=hyp, rect=opt.rect, cache_images=opt.cache_images, single_cls=opt.single_cls) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt.rect, pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels( test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # Model EMA ema = torch_utils.ModelEMA(model) # Start training nb = len(dataloader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Burn-in if ni <= n_burn: xi = [0, n_burn] # x interp model.gr = np.interp( ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, 64 / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) x['weight_decay'] = np.interp( ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-Scale if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward loss *= batch_size / 64 # scale loss loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch + 1, epochs), mem, *mloss, len(targets), img_size) pbar.set_description(s) # Plot if ni < 1: f = 'train_batch%g.jpg' % i # filename res = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([ x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data'] ]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, imgsz=imgsz_test, model=ema.ema, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader, multi_label=ni > n_burn) # Write with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, 
test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
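# --- Hedged sketch: the burn-in interpolation used in the train(hyp) above. Over the first
# n_burn integrated batches, lr/momentum/weight_decay and the accumulate count are ramped with
# np.interp; the bias group (j == 2) starts at 0.1 and falls, the others rise from 0.
# The hyp values and the lf_epoch stand-in are made up for illustration.
import numpy as np

hyp = {'lr0': 0.01, 'momentum': 0.937, 'weight_decay': 5e-4}   # illustrative values
n_burn, batch_size, lf_epoch = 500, 16, 1.0                    # lf_epoch ~ initial_lr * lf(epoch)
xi = [0, n_burn]                                               # interpolation endpoints

for ni in (0, 100, 250, 500):
    accumulate = max(1, np.interp(ni, xi, [1, 64 / batch_size]).round())
    for j in range(3):                                         # j == 2 is the bias group
        lr = np.interp(ni, xi, [0.1 if j == 2 else 0.0, hyp['lr0'] * lf_epoch])
        wd = np.interp(ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0])
        momentum = np.interp(ni, xi, [0.9, hyp['momentum']])
        print('ni=%d group=%d lr=%.4f wd=%.1e momentum=%.3f accumulate=%d' % (ni, j, lr, wd, momentum, accumulate))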
def train(hyp): cfg = opt.cfg t_cfg = opt.t_cfg # teacher model cfg for knowledge distillation data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = max(round(64 / batch_size), 1) # accumulate n times before optimizer update (bs 64) weights = opt.weights # initial training weights t_weights = opt.t_weights # teacher model weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) # Image Sizes gs = 32 # (pixels) grid size start_epoch = 0 assert math.fmod( imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs) img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # DDP init if opt.local_rank != -1: if opt.local_rank == 0: print("--------------using ddp---------------") assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) dist.init_process_group(backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.batch_size // opt.world_size else: dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank # Initialize model steps = math.ceil(len(open(train_path).readlines()) / batch_size) * epochs model = Darknet(cfg, quantized=opt.quantized, a_bit=opt.a_bit, w_bit=opt.w_bit, FPGA=opt.FPGA, steps=steps, is_gray_scale=opt.gray_scale).to(device) if t_cfg: t_model = Darknet(t_cfg).to(device) # print('<.....................using gridmask.......................>') # gridmask = GridMask(d1=96, d2=224, rotate=360, ratio=0.6, mode=1, prob=0.8) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 best_fitness = 0.0 if weights != 'None': attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. 
chkpt = torch.load(weights, map_location=device) # load model try: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) if chkpt.get('best_fitness') is not None: best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt if chkpt.get('epoch') is not None: start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. load_darknet_weights(model, weights, pt=opt.pt, FPGA=opt.FPGA) if t_cfg: if t_weights.endswith('.pt'): t_model.load_state_dict( torch.load(t_weights, map_location=device)['model']) elif t_weights.endswith('.weights'): load_darknet_weights(t_model, t_weights) else: raise Exception( 'pls provide proper teacher weights for knowledge distillation' ) t_model.eval() print( '<.....................using knowledge distillation.......................>' ) print('teacher model:', t_weights, '\n') # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.95 + 0.05 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 # see link below # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, '.-', label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Initialize distributed training if opt.local_rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank, find_unused_parameters=True) else: model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls, rank=opt.local_rank, is_gray_scale=True if opt.gray_scale else False) testset = LoadImagesAndLabels( test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls, rank=opt.local_rank, is_gray_scale=True if opt.gray_scale else False) # 获得要剪枝的层 if hasattr(model, 'module'): print('muti-gpus sparse') if opt.prune == 0: print('normal sparse training ') _, _, prune_idx = parse_module_defs(model.module.module_defs) elif opt.prune == 1: print('shortcut sparse training') _, _, prune_idx, _, _ = parse_module_defs2( model.module.module_defs) elif opt.prune == 2: print('layer sparse training') _, _, prune_idx = parse_module_defs4(model.module.module_defs) else: print('single-gpu sparse') if opt.prune == 0: print('normal sparse training') _, _, prune_idx = parse_module_defs(model.module_defs) elif opt.prune == 1: print('shortcut sparse training') _, _, 
prune_idx, _, _ = parse_module_defs2(model.module_defs) elif opt.prune == 2: print('layer sparse training') _, _, prune_idx = parse_module_defs4(model.module_defs) train_sampler = torch.utils.data.distributed.DistributedSampler( dataset) # ddp sampler test_sampler = torch.utils.data.distributed.DistributedSampler(testset) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader( dataset, batch_size=int(batch_size / opt.world_size), num_workers=nw, shuffle=False if (opt.local_rank != -1) else not opt.rect, pin_memory=True, collate_fn=dataset.collate_fn, sampler=train_sampler if (opt.local_rank != -1) else None) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels( test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls, rank=opt.local_rank, is_gray_scale=True if opt.gray_scale else False), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) if opt.prune != -1: for idx in prune_idx: if hasattr(model, 'module'): bn_weights = gather_bn_weights(model.module.module_list, [idx]) else: bn_weights = gather_bn_weights(model.module_list, [idx]) tb_writer.add_histogram('before_train_perlayer_bn_weights/hist', bn_weights.numpy(), idx, bins='doane') # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # Model EMA if opt.ema: ema = torch_utils.ModelEMA(model) # Start training nb = len(dataloader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() if opt.local_rank == -1 or opt.local_rank == 0: print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) if opt.mpt: cuda = device.type != 'cpu' scaler = amp.GradScaler(enabled=cuda) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ if opt.local_rank != -1: dataloader.sampler.set_epoch(epoch) # DDP set seed # gridmask.set_prob(epoch, max_epoch) model.train() # 稀疏化标志 if opt.prune == -1: sr_flag = False else: sr_flag = True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses if opt.local_rank == -1 or opt.local_rank == 0: print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Burn-in if ni <= n_burn: xi = [0, n_burn] # x interp model.gr = np.interp( ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, 64 / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) x['weight_decay'] = np.interp( ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-Scale if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward if opt.mpt: with amp.autocast(enabled=cuda): targets = targets.to(device) pred, feature_s = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results soft_target = 0 if t_cfg: _, output_t, feature_t = t_model(imgs) if opt.KDstr == 1: soft_target = compute_lost_KD( pred, output_t, model.nc, imgs.size(0)) elif opt.KDstr == 2: soft_target, reg_ratio = compute_lost_KD2( model, targets, pred, output_t) elif opt.KDstr == 3: soft_target = compute_lost_KD3( model, targets, pred, output_t) elif opt.KDstr == 4: soft_target = compute_lost_KD4( model, targets, pred, output_t, feature_s, feature_t, imgs.size(0)) elif opt.KDstr == 5: soft_target = compute_lost_KD5( model, targets, pred, output_t, feature_s, feature_t, imgs.size(0), img_size) else: print("please select KD strategy!") loss += soft_target else: targets = targets.to(device) pred, feature_s = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results soft_target = 0 if t_cfg: _, output_t, feature_t = t_model(imgs) if opt.KDstr == 1: soft_target = compute_lost_KD(pred, output_t, model.nc, imgs.size(0)) elif opt.KDstr == 
2: soft_target, reg_ratio = compute_lost_KD2( model, targets, pred, output_t) elif opt.KDstr == 3: soft_target = compute_lost_KD3(model, targets, pred, output_t) elif opt.KDstr == 4: soft_target = compute_lost_KD4(model, targets, pred, output_t, feature_s, feature_t, imgs.size(0)) elif opt.KDstr == 5: soft_target = compute_lost_KD5(model, targets, pred, output_t, feature_s, feature_t, imgs.size(0), img_size) else: print("please select KD strategy!") loss += soft_target # Backward loss *= batch_size / 64 # scale loss if opt.mpt: scaler.scale(loss).backward() else: loss.backward() if opt.SWIFT: if hasattr(model, 'module'): for i, (mdef, module) in enumerate( zip(model.module.module_defs[:], model.module.module_list[:])): if mdef['type'] == 'convolutional': if mdef['activation'] != 'linear': conv_layer_weight = module[0].weight conv_layer_weight.grad[conv_layer_weight == 0] = 0 else: for i, (mdef, module) in enumerate( zip(model.module_defs[:], model.module_list[:])): if mdef['type'] == 'convolutional': if mdef['activation'] != 'linear': conv_layer_weight = module[0].weight conv_layer_weight.grad[conv_layer_weight == 0] = 0 # 对要剪枝层的γ参数稀疏化 if hasattr(model, 'module'): if opt.prune != -1: BNOptimizer.updateBN(sr_flag, model.module.module_list, opt.s, prune_idx) else: if opt.prune != -1: BNOptimizer.updateBN(sr_flag, model.module_list, opt.s, prune_idx) # Optimize if ni % accumulate == 0: if opt.mpt: scaler.step(optimizer) # optimizer.step scaler.update() else: optimizer.step() optimizer.zero_grad() if opt.ema: ema.update(model) # Print mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) pbar.set_description(s) # Plot if i == 0: if not os.path.isdir('train_sample/'): os.makedirs('train_sample/') f = 'train_sample/train_batch%g.jpg' % epoch # filename res = plot_images(images=imgs, targets=targets, paths=paths, fname=f, is_gray_scale=opt.gray_scale) if tb_writer: tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results if opt.ema: ema.update_attr(model) if hasattr(model, 'module'): module_defs, module_list = ema.eam.module.module_defs, ema.eam.module.module_list else: module_defs, module_list = ema.eam.module_defs, ema.eam.module_list for i, (mdef, module) in enumerate(zip(module_defs, module_list)): if mdef['type'] == 'yolo': yolo_layer = module yolo_layer.nx, yolo_layer.ny = 0, 0 if hasattr(model, 'module'): module_defs, module_list = model.module.module_defs, model.module.module_list else: module_defs, module_list = model.module_defs, model.module_list for i, (mdef, module) in enumerate(zip(module_defs, module_list)): if mdef['type'] == 'yolo': yolo_layer = module yolo_layer.nx, yolo_layer.ny = 0, 0 final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([ x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data'] ]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, imgsz=imgsz_test, model=ema.ema if opt.ema else model, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader, multi_label=ni > n_burn, quantized=opt.quantized, a_bit=opt.a_bit, 
w_bit=opt.w_bit, FPGA=opt.FPGA, rank=opt.local_rank, plot=False) # Write if opt.local_rank in [-1, 0]: with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system( 'gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) if opt.prune != -1: if hasattr(model, 'module'): bn_weights = gather_bn_weights(model.module.module_list, [idx]) else: bn_weights = gather_bn_weights(model.module_list, [idx]) tb_writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane') # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if opt.ema: if hasattr(model, 'module'): model_temp = ema.ema.module.state_dict() else: model_temp = ema.ema.state_dict() else: if hasattr(model, 'module'): model_temp = model.module.state_dict() else: model_temp = model.state_dict() if save and dist.get_rank() == 0: # DDP save model only once with open(results_file, 'r') as f: # create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': model_temp, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(chkpt, last) if (best_fitness == fi) and not final_epoch: torch.save(chkpt, best) del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png if opt.local_rank in [-1, 0]: print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
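# --- Illustrative sketch (not from the original scripts) --------------------
# The loop above delegates the channel-sparsity step to BNOptimizer.updateBN,
# whose implementation is not shown here. Below is a minimal, assumed sketch of
# the usual network-slimming update it refers to: adding an L1 subgradient
# (s * sign(gamma)) to the gradient of each prunable BatchNorm weight before
# optimizer.step(). The indexing convention module_list[idx][1] == BatchNorm2d
# is an assumption made only for illustration.
import torch


def update_bn_sparsity(sr_flag, module_list, s, prune_idx):
    """Assumed sketch of the L1 sparsity update applied to BN gamma weights."""
    if not sr_flag:
        return
    for idx in prune_idx:
        bn_module = module_list[idx][1]  # assumed position of the BN layer
        if bn_module.weight.grad is not None:
            # Subgradient of s * |gamma| pushes gamma toward zero during training.
            bn_module.weight.grad.add_(s * torch.sign(bn_module.weight.detach()))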
def entry_point(config: ConfigParser): ''' entry-point function for a single worker, distributed training ''' local_world_size = config['local_world_size'] # check distributed environment cfgs if config['distributed']: # distributed gpu mode # check gpu available if torch.cuda.is_available(): if torch.cuda.device_count() < local_world_size: raise RuntimeError( f'the number of GPUs ({torch.cuda.device_count()}) is less than ' f'the number of processes ({local_world_size}) running on each node' ) local_master = (config['local_rank'] == 0) else: raise RuntimeError( 'CUDA is not available, distributed training is not supported.' ) else: # one gpu or cpu mode if config['local_world_size'] != 1: raise RuntimeError( 'local_world_size must be set to 1 if distributed is set to false.' ) config.update_config('local_rank', 0) local_master = True config.update_config('global_rank', 0) logger = config.get_logger('train') if local_master else None if config['distributed']: logger.info('Distributed GPU training mode start...' ) if local_master else None else: logger.info( 'One GPU or CPU training mode start...') if local_master else None if config['distributed']: # these are the parameters used to initialize the process group env_dict = { key: os.environ[key] for key in ('MASTER_ADDR', 'MASTER_PORT', 'RANK', 'WORLD_SIZE') } logger.info( f'[Process {os.getpid()}] Initializing process group with: {env_dict}' ) if local_master else None # init process group dist.init_process_group(backend='nccl', init_method='env://') config.update_config('global_rank', dist.get_rank()) # info distributed training cfg logger.info( f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, ' + f'rank = {dist.get_rank()}, backend={dist.get_backend()}' ) if local_master else None # start train main(config, local_master, logger if local_master else None) # tear down the process group dist.destroy_process_group()
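# --- Illustrative sketch (not from the original scripts) --------------------
# entry_point() reads MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from the
# environment before calling dist.init_process_group(init_method='env://').
# A minimal, assumed single-node launcher that satisfies that contract (a real
# deployment would normally rely on torchrun / torch.distributed.launch instead):
import os

import torch.multiprocessing as mp


def _worker(local_rank, local_world_size):
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    os.environ['RANK'] = str(local_rank)              # single node: rank == local_rank
    os.environ['WORLD_SIZE'] = str(local_world_size)
    # entry_point(config) would be invoked here with config['local_rank'] = local_rank.


if __name__ == '__main__':
    nprocs = 2  # assumed number of GPUs on this node
    mp.spawn(_worker, args=(nprocs,), nprocs=nprocs)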
def ddp(args): if torch.cuda.is_available() is False: raise EnvironmentError("No GPU device found for training.") # initialize the environment for each process init_distributed_mode(args=args) device = torch.device(args.gpu) batch_size = args.batch_size num_classes = args.num_classes load_path = args.load_path ckp_path = args.ckp_path args.lr *= args.world_size # scale the learning rate with the number of parallel GPUs; a simple linear scaling is used here train_transform = torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), ]) test_transform = torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), ]) train_data_set = torchvision.datasets.CIFAR10(root='./datasets/cifar10/', train=True, transform=train_transform, download=True) test_data_set = torchvision.datasets.CIFAR10(root='./datasets/cifar10/', train=False, transform=test_transform, download=True) # DistributedSampler assigns training sample indices to the process of each rank train_sampler = torch.utils.data.distributed.DistributedSampler( train_data_set) val_sampler = torch.utils.data.distributed.DistributedSampler( test_data_set) # BatchSampler groups the sample indices into lists of batch_size elements train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, batch_size, drop_last=True) train_loader = torch.utils.data.DataLoader( train_data_set, batch_sampler=train_batch_sampler, pin_memory=True, # pin host memory to speed up host-to-GPU transfer num_workers=args.n_workers) val_loader = torch.utils.data.DataLoader(test_data_set, batch_size=batch_size, sampler=val_sampler, pin_memory=True, num_workers=args.n_workers) # instantiate the model model = torchvision.models.resnet34(num_classes=num_classes).to(device) # every GPU must load exactly the same initial weights if load_path is not None and os.path.exists(load_path): # load pre-trained weights if they exist weights_dict = torch.load(load_path, map_location=device) # simple check that the parameter count of each layer matches load_weights_dict = { k: v for k, v in weights_dict.items() if model.state_dict()[k].numel() == v.numel() } print("Load the initial weights in rank {} from {}".format( get_rank(), load_path)) model.load_state_dict(load_weights_dict, strict=False) else: # if no pre-trained weights exist, save the weights of the first process and let the other processes load them, so all ranks start from identical weights if not os.path.exists(ckp_path): os.makedirs(ckp_path) checkpoint_path = "{}/initial_weights.pth".format(ckp_path) if is_main_process(): print("Save the initial weights in rank {}".format(get_rank())) torch.save(model.state_dict(), checkpoint_path) dist.barrier() # wait until every GPU has reached this point before continuing print("Load the initial weights in rank {}".format(get_rank())) # note: map_location must be specified here, otherwise the first GPU ends up holding extra memory model.load_state_dict(torch.load(checkpoint_path, map_location=device)) # optionally freeze weights if args.freeze_layers: for name, param in model.named_parameters(): # freeze everything except the fully connected layer if "fc" not in name: param.requires_grad_(False) # SyncBatchNorm is only meaningful when the network being trained contains BN layers # without synchronized BN, i.e., each device computes the mean/variance of its own batch: accuracy matches single-GPU training and only speed improves; # with synchronized BN: accuracy improves somewhat, but some parallel speed is lost, i.e., training is slower if args.syncBN: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) # wrap as a DDP model model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) # optimizer: SGD with a cosine annealing schedule loss_function = torch.nn.CrossEntropyLoss() params_group = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params_group, lr=args.lr, momentum=0.9, weight_decay=0.005) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0.1 * args.lr) for epoch in range(args.epochs): # reseed the sampler at the start of every epoch so a different shuffling generator is used; # changing the epoch argument changes the shuffle order, so each GPU receives different data every epoch. # The rest of the loop is the same as single-GPU training.
train_sampler.set_epoch(epoch) mean_loss = train_one_epoch(model=model, optimizer=optimizer, loss_function=loss_function, data_loader=train_loader, device=device, epoch=epoch) if is_main_process(): print("[Training] epoch {} mean loss {}".format(epoch, mean_loss)) scheduler.step() sum_num = evaluate(model=model, data_loader=val_loader, device=device) acc = sum_num / val_sampler.total_size if is_main_process(): print("[Testing] epoch {} Acc {}".format(epoch, acc)) # model weights are saved from the main process only. if is_main_process() and epoch % 10 == 0: torch.save(model.module.state_dict(), "{}/ckp-{}.pth".format(ckp_path, epoch)) dist.destroy_process_group() # tear down the process group and release its resources
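# --- Illustrative sketch (not from the original scripts) --------------------
# In ddp() above, acc = sum_num / val_sampler.total_size only gives a global
# accuracy if evaluate() has already summed the correct-prediction counts over
# all ranks. A minimal assumed sketch of such an evaluation using all_reduce:
import torch
import torch.distributed as dist


@torch.no_grad()
def evaluate_sketch(model, data_loader, device):
    model.eval()
    correct = torch.zeros(1, device=device)
    for images, labels in data_loader:
        images, labels = images.to(device), labels.to(device)
        correct += (model(images).argmax(dim=1) == labels).sum()
    dist.all_reduce(correct, op=dist.ReduceOp.SUM)  # global number of hits
    return correct.item()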
def train(plane_args, yolo_args, midas_args, plane_weightage, yolo_weightage, midas_weightage, resume_train=False, model_path=''): options = plane_args config = InferenceConfig(options) ## Start yolo train setup opt = yolo_args cfg = opt.cfg data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size) # Image Sizes gs = 64 # (pixels) grid size assert math.fmod( imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = grid_min * gs, grid_max * gs img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.png') + glob.glob(results_file): os.remove(f) # Initialize model model = Model(yolo_cfg=cfg, midas_cfg=None, planercnn_cfg=config, path=midas_args.weights).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) del pg0, pg1, pg2 start_epoch = 0 best_loss = 1000 if resume_train: last_model = torch.load(model_path + 'model_last.pt', map_location=device) model.load_state_dict(last_model['state_dict']) optimizer.load_state_dict(last_model['optimizer']) best_loss = last_model['best_loss'] start_epoch = last_model['epoch'] + 1 del last_model else: attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. chkpt = torch.load(weights, map_location=device) # load model try: #chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. 
" \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if chkpt['optimizer'] is not None: print('loading Optimizer') optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. load_darknet_weights(model, weights) model.load_state_dict(torch.load(options.checkpoint_dir + '/checkpoint.pth'), strict=False) midas_parameters = torch.load(midas_args.weights) if "optimizer" in midas_parameters: midas_parameters = midas_parameters["model"] model.load_state_dict(midas_parameters, strict=False) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://github.com/ultralytics/yolov3/issues/238 lf = lambda x: ( ((1 + math.cos(x * math.pi / epochs)) / 2 )**1.0) * 0.95 + 0.05 # cosine https://arxiv.org/pdf/1812.01187.pdf scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1) # scheduler = lr_scheduler.MultiStepLR(optimizer, [round(epochs * x) for x in [0.8, 0.9]], 0.1, start_epoch - 1) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level yolo_params = dict( path=train_path, img_size=img_size, batch_size=batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) planercnn_params = dict(options=options, config=config, random=False) midas_params = None # Dataset train_dataset = create_data(yolo_params, planercnn_params, midas_params) # Dataloader batch_size = min(batch_size, len(train_dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers trainloader = DataLoader( train_dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt. 
rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=train_dataset.collate_fn) # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(train_dataset.labels, nc).to( device) # attach class weights # Model EMA ema = torch_utils.ModelEMA(model) # Start training nb = len(trainloader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' % epochs) loss_list = [] for epoch in range(start_epoch, start_epoch + epochs): model.train() print(('\n' + '%10s' * 6) % ('Epoch', 'Dp_loss', 'bbx_loss', 'pln_loss', 'All_loss', 'img_size')) pbar = tqdm(enumerate(trainloader)) mloss = torch.zeros(4).to(device) # mean losses for i, (plane_data, yolo_data, depth_data) in pbar: optimizer.zero_grad() imgs, targets, _, _ = yolo_data ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Burn-in if ni <= n_burn * 2: model.gr = np.interp( ni, [0, n_burn * 2], [0.0, 1.0 ]) # giou yolo_loss ratio (obj_loss = 1.0 or giou) if ni == n_burn: # burnin complete print_model_biases(model) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, [0, n_burn], [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, [0, n_burn], [0.9, hyp['momentum']]) # Multi-Scale training if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) yolo_ip = imgs depth_img_size, depth_img, depth_target = depth_data ####### depth_sample = torch.from_numpy(depth_img).to(device).unsqueeze( 0) ###### midas_ip = depth_sample #### data_pair, plane_img, plane_np = plane_data sample = data_pair plane_losses = [] input_pair = [] detection_pair = [] camera = sample[30][0].cuda() #for indexOffset in [0, ]: indexOffset = 0 images, image_metas, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks, gt_parameters, gt_depth, extrinsics, planes, gt_segmentation = sample[ indexOffset + 0].cuda(), sample[indexOffset + 1].numpy(), sample[ indexOffset + 2].cuda(), sample[indexOffset + 3].cuda(), sample[ indexOffset + 4].cuda(), sample[indexOffset + 5].cuda(), sample[ indexOffset + 6].cuda(), sample[indexOffset + 7].cuda(), sample[ indexOffset + 8].cuda(), sample[indexOffset + 9].cuda( ), sample[indexOffset + 10].cuda(), sample[indexOffset + 11].cuda() masks = (gt_segmentation == torch.arange(gt_segmentation.max() + 1).cuda().view( -1, 1, 1)).float() input_pair.append({ 'image': images, 'depth': gt_depth, 'bbox': gt_boxes, 'extrinsics': extrinsics, 'segmentation': gt_segmentation, 'camera': 
camera, 'plane': planes[0], 'masks': masks, 'mask': gt_masks }) plane_ip = dict(input=[ images, image_metas, gt_class_ids, gt_boxes, gt_masks, gt_parameters, camera ], mode='inference_detection', use_nms=2, use_refinement=True, return_feature_map=False) yolo_op, midas_op, plane_op = model.forward( yolo_ip, midas_ip, plane_ip) pred = yolo_op depth_prediction = midas_op rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask, target_parameters, mrcnn_parameters, detections, detection_masks, detection_gt_parameters, detection_gt_masks, rpn_rois, roi_features, roi_indices, depth_np_pred = plane_op rpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss, mrcnn_parameter_loss = compute_losses( config, rpn_match, rpn_bbox, rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask, target_parameters, mrcnn_parameters) plane_losses = [ rpn_class_loss + rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + mrcnn_mask_loss + mrcnn_parameter_loss ] if depth_np_pred.shape != gt_depth.shape: depth_np_pred = torch.nn.functional.interpolate( depth_np_pred.unsqueeze(1), size=(512, 512), mode='bilinear', align_corners=False).squeeze(1) pass if config.PREDICT_NORMAL_NP: normal_np_pred = depth_np_pred[0, 1:] depth_np_pred = depth_np_pred[:, 0] gt_normal = gt_depth[0, 1:] gt_depth = gt_depth[:, 0] depth_np_loss = l1LossMask( depth_np_pred[:, 80:560], gt_depth[:, 80:560], (gt_depth[:, 80:560] > 1e-4).float()) normal_np_loss = l2LossMask( normal_np_pred[:, 80:560], gt_normal[:, 80:560], (torch.norm(gt_normal[:, 80:560], dim=0) > 1e-4).float()) plane_losses.append(depth_np_loss) plane_losses.append(normal_np_loss) else: depth_np_loss = l1LossMask( depth_np_pred[:, 80:560], gt_depth[:, 80:560], (gt_depth[:, 80:560] > 1e-4).float()) plane_losses.append(depth_np_loss) normal_np_pred = None pass if len(detections) > 0: detections, detection_masks = unmoldDetections(config, camera, detections, detection_masks, depth_np_pred, normal_np_pred, debug=False) if 'refine_only' in options.suffix: detections, detection_masks = detections.detach( ), detection_masks.detach() pass XYZ_pred, detection_mask, plane_XYZ = calcXYZModule( config, camera, detections, detection_masks, depth_np_pred, return_individual=True) detection_mask = detection_mask.unsqueeze(0) else: XYZ_pred = torch.zeros( (3, config.IMAGE_MAX_DIM, config.IMAGE_MAX_DIM)).cuda() detection_mask = torch.zeros( (1, config.IMAGE_MAX_DIM, config.IMAGE_MAX_DIM)).cuda() plane_XYZ = torch.zeros( (1, 3, config.IMAGE_MAX_DIM, config.IMAGE_MAX_DIM)).cuda() detections = torch.zeros( (3, config.IMAGE_MAX_DIM, config.IMAGE_MAX_DIM)).cuda() detection_masks = torch.zeros( (3, config.IMAGE_MAX_DIM, config.IMAGE_MAX_DIM)).cuda() pass detection_pair.append({ 'XYZ': XYZ_pred, 'depth': XYZ_pred[1:2], 'mask': detection_mask, 'detection': detections, 'masks': detection_masks, 'plane_XYZ': plane_XYZ, 'depth_np': depth_np_pred }) loss_fn = nn.MSELoss() try: plane_parameters = torch.from_numpy( plane_np['plane_parameters']).cuda() plane_masks = torch.from_numpy(plane_np['plane_masks']).cuda() plane_parameters_pred = detection_pair[0]['detection'][:, 6:9] plane_masks_pred = detection_pair[0]['masks'][:, 80:560] if plane_parameters_pred.shape != plane_parameters.shape: plane_parameters_pred = torch.nn.functional.interpolate( plane_parameters_pred.unsqueeze(1).unsqueeze(0), size=plane_parameters.shape, mode='bilinear', 
align_corners=True).squeeze() pass if plane_masks_pred.shape != plane_masks.shape: plane_masks_pred = torch.nn.functional.interpolate( plane_masks_pred.unsqueeze(1).unsqueeze(0), size=plane_masks.shape, mode='trilinear', align_corners=True).squeeze() pass plane_params_loss = loss_fn(plane_parameters_pred, plane_parameters) + loss_fn( plane_masks_pred, plane_masks) except: plane_params_loss = 1 predicted_detection = visualizeBatchPair(options, config, input_pair, detection_pair, indexOffset=i) predicted_detection = torch.from_numpy(predicted_detection) if predicted_detection.shape != plane_img.shape: predicted_detection = torch.nn.functional.interpolate( predicted_detection.permute(2, 0, 1).unsqueeze(0).unsqueeze(1), size=plane_img.permute(2, 0, 1).shape).squeeze() pass plane_img = plane_img.permute(2, 0, 1) #https://github.com/Po-Hsun-Su/pytorch-ssim #https://github.com/jorge-pessoa/pytorch-msssim pln_ssim = torch.clamp(1 - SSIM( predicted_detection.unsqueeze(0).type(torch.cuda.FloatTensor), plane_img.unsqueeze(0).type(torch.cuda.FloatTensor)), min=0, max=1) plane_loss = sum(plane_losses) + pln_ssim plane_losses = [l.data.item() for l in plane_losses] #train_planercnn.py 331 depth_prediction = (torch.nn.functional.interpolate( depth_prediction.unsqueeze(1), size=tuple(depth_img_size[:2]), mode="bicubic", align_corners=False)) bits = 2 depth_min = depth_prediction.min() depth_max = depth_prediction.max() max_val = (2**(8 * bits)) - 1 if depth_max - depth_min > np.finfo("float").eps: depth_out = max_val * (depth_prediction - depth_min) / (depth_max - depth_min) else: depth_out = 0 depth_target = torch.from_numpy( np.asarray(depth_target)).to(device).type( torch.cuda.FloatTensor).unsqueeze(0) depth_target = (torch.nn.functional.interpolate( depth_target.unsqueeze(1), size=depth_img_size[:2], mode="bicubic", align_corners=False)) depth_pred = Variable(depth_out, requires_grad=True) depth_target = Variable(depth_target, requires_grad=False) ssim_out = torch.clamp(1 - SSIM(depth_pred, depth_target), min=0, max=1) RMSE_loss = torch.sqrt(loss_fn(depth_pred, depth_target)) depth_loss = (0.0001 * RMSE_loss) + ssim_out yolo_loss, yolo_loss_items = compute_loss(pred, targets, model) if not torch.isfinite(yolo_loss): print('WARNING: non-finite yolo_loss, ending training ', yolo_loss_items) return results total_loss = (plane_weightage * plane_loss) + ( yolo_weightage * yolo_loss) + (midas_weightage * depth_loss) # Compute gradient if mixed_precision: with amp.scale_loss(total_loss, optimizer) as scaled_loss: scaled_loss.backward() else: total_loss.backward() pass optimizer.step() gc.collect() ema.update(model) # Print batch results mloss = (mloss * i + yolo_loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' + '%10.3g' * 5) % ('%g/%g' % (epoch, (start_epoch + epochs) - 1), depth_loss.item(), yolo_loss.item(), plane_loss.item(), total_loss.item(), img_size) pbar.set_description(s) scheduler.step() ema.update_attr(model) final_epoch = epoch + 1 == epochs model_chkpt = { 'best_loss': best_loss, 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict() } torch.save(model_chkpt, model_path + 'model_last.pt') if total_loss < best_loss: best_loss = total_loss torch.save(model_chkpt, model_path + 'model_best.pt') loss_list.append(total_loss.item()) print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if 
torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return loss_list
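# --- Illustrative sketch (not from the original scripts) --------------------
# The multi-head train() above builds its depth term from a small RMSE
# contribution plus a clamped (1 - SSIM) term, then mixes the three task losses
# with fixed weights. A compact sketch of that combination; `ssim` stands in
# for the SSIM function imported by the script and is assumed to return a
# similarity score:
import torch
import torch.nn.functional as F


def combined_loss(plane_loss, yolo_loss, depth_pred, depth_target, ssim,
                  plane_w=1.0, yolo_w=1.0, midas_w=1.0):
    rmse = torch.sqrt(F.mse_loss(depth_pred, depth_target))
    ssim_term = torch.clamp(1 - ssim(depth_pred, depth_target), min=0, max=1)
    depth_loss = 1e-4 * rmse + ssim_term  # same weighting as the loop above
    return plane_w * plane_loss + yolo_w * yolo_loss + midas_w * depth_loss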
def finalize(self) -> None: dist.destroy_process_group() # restore backed-up env if self._env_backup is not None: os.environ.clear() os.environ.update(self._env_backup)
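# --- Illustrative sketch (not from the original scripts) --------------------
# finalize() above restores os.environ from self._env_backup after tearing the
# process group down. The matching setup is not shown in this excerpt; a
# minimal assumed counterpart that records the backup it later restores:
import os

import torch.distributed as dist


class DistContext:
    def __init__(self):
        self._env_backup = None

    def setup(self, rank, world_size, addr='127.0.0.1', port='29500'):
        self._env_backup = dict(os.environ)  # snapshot so finalize() can restore it
        os.environ.update({'MASTER_ADDR': addr, 'MASTER_PORT': port,
                           'RANK': str(rank), 'WORLD_SIZE': str(world_size)})
        dist.init_process_group(backend='gloo', init_method='env://')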
def train( cfg, data, img_size=416, epochs=100, # 500200 batches at bs 16, 117263 images = 273 epochs batch_size=16, accumulate=4): # effective bs = batch_size * accumulate = 16 * 4 = 64 # Initialize init_seeds() weights = 'weights' + os.sep last = weights + 'last.pt' best = weights + 'best.pt' device = torch_utils.select_device(apex=mixed_precision) multi_scale = opt.multi_scale if multi_scale: img_sz_min = round(img_size / 32 / 1.5) + 1 img_sz_max = round(img_size / 32 * 1.5) - 1 img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] nc = int(data_dict['classes']) # number of classes # Initialize model model = Darknet(cfg).to(device) # Optimizer optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay'], nesterov=True) # optimizer = AdaBound(model.parameters(), lr=hyp['lr0'], final_lr=0.1) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_fitness = 0. if opt.resume or opt.transfer: # Load previously saved model if opt.transfer: # Transfer learning nf = int( model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device) model.load_state_dict( { k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255 }, strict=False) for p in model.parameters(): p.requires_grad = True if p.shape[0] == nf else False else: # resume from last.pt if opt.bucket: os.system('gsutil cp gs://%s/last.pt %s' % (opt.bucket, last)) # download from bucket chkpt = torch.load(last, map_location=device) # load checkpoint model.load_state_dict(chkpt['model']) if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] if chkpt.get('training_results') is not None: with open('results.txt', 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt else: # Initialize model with backbone (optional) if '-tiny.cfg' in cfg: cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15') else: cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74') # Remove old results for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'): os.remove(f) # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training 
rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training image_weights=opt.img_weights, cache_images=opt.cache_images) # Dataloader dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=min(os.cpu_count(), batch_size), shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Start training model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model if dataset.image_weights: model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model_info(model, report='summary') # 'full' or 'summary' nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP, F1, test_loss t0 = time.time() for epoch in range(start_epoch, epochs): model.train() print( ('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'GIoU/xy', 'wh', 'obj', 'cls', 'total', 'targets', 'img_size')) # Update scheduler if epoch > 0: scheduler.step() # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) freeze_backbone = False if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(5).to(device) # mean losses pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, (imgs, targets, paths, _) in pbar: imgs = imgs.to(device) targets = targets.to(device) # Multi-Scale training ni = (i + nb * epoch ) # number integrated batches (since train start) if multi_scale: if ni / accumulate % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / 32.) 
* 32 for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Plot images with bounding boxes if epoch == 0 and i == 0: fname = 'train_batch%g.jpg' % i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) if tb_writer: tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC') # Hyperparameter burn-in # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches # if ni <= n_burn: # for m in model.named_modules(): # if m[0].endswith('BatchNorm2d'): # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 # g = (i / n_burn) ** 4 # gain rises from 0 - 1 # for x in optimizer.param_groups: # x['lr'] = hyp['lr0'] * g # x['weight_decay'] = hyp['weight_decay'] * g # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model, giou_loss=not opt.xywh) if torch.isnan(loss): print('WARNING: nan loss detected, ending training') return results # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if (i + 1) % accumulate == 0 or (i + 1) == nb: optimizer.step() optimizer.zero_grad() # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available( ) else 0 # (GB) s = ('%10s' * 2 + '%10.3g' * 7) % ('%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size) pbar.set_description(s) # Calculate mAP (always test final epoch, skip first 5 if opt.nosave) final_epoch = epoch + 1 == epochs if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch: with torch.no_grad(): results, maps = test.test( cfg, data, batch_size=batch_size, img_size=opt.img_size, model=model, conf_thres=0.001 if final_epoch else 0.1, # 0.1 for speed save_json=final_epoch and 'coco.data' in data) # Write epoch results with open('results.txt', 'a') as file: file.write(s + '%11.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) # Write Tensorboard results if tb_writer: x = list(mloss[:5]) + list(results[:7]) titles = [ 'GIoU/XY', 'Width/Height', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' ] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best map fitness = results[2] # mAP if fitness > best_fitness: best_fitness = fitness # Save training results save = (not opt.nosave) or ((not opt.evolve) and final_epoch) if save: with open('results.txt', 'r') as file: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': file.read(), 'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) if opt.bucket: os.system('gsutil cp %s gs://%s' % (last, opt.bucket)) # upload to bucket # Save best checkpoint if best_fitness == fitness: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, weights + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # Report time print('%g epochs completed in %.3f hours.' 
% (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
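# --- Illustrative sketch (not from the original scripts) --------------------
# The train() above only calls optimizer.step() when (i + 1) % accumulate == 0
# (or on the last batch of the epoch), emulating an effective batch size of
# batch_size * accumulate. The pattern in isolation, with assumed model/loss_fn
# names; dividing by `accumulate` is an added normalization, not taken from the
# original loop:
def accumulated_steps(model, optimizer, loss_fn, batches, accumulate=4):
    nb = len(batches)
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(batches):
        loss = loss_fn(model(inputs), targets) / accumulate
        loss.backward()  # gradients keep accumulating until the next step
        if (i + 1) % accumulate == 0 or (i + 1) == nb:
            optimizer.step()
            optimizer.zero_grad()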
def train(): cfg = opt.cfg data = opt.data img_size, img_size_test = opt.img_size if len(opt.img_size) == 2 else opt.img_size * 2 # train, test sizes 416 epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights # Initialize init_seeds() if opt.multi_scale: img_sz_min = round(img_size / 32 / 1.5) img_sz_max = round(img_size / 32 * 1.5) img_size = img_sz_max * 32 # initialize with the largest multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # read training-set and test-set paths data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int(data_dict['classes']) # number of classes # remove previous results for f in glob.glob('*_batch*.png') + glob.glob(results_file): os.remove(f) # initialize model model = Darknet(cfg).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0']) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) # stochastic gradient descent optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) del pg0, pg1, pg2 start_epoch = 0 best_fitness = 0.0 attempt_download(weights) if weights.endswith('.pt'): # pytorch format chkpt = torch.load(weights, map_location=device) # load model chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} model.load_state_dict(chkpt['model'], strict=False) # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format load_darknet_weights(model, weights) # mixed-precision training if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler lf = lambda x: (((1 + math.cos( x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05 # cosine https://arxiv.org/pdf/1812.01187.pdf scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1) # initialize distributed training if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): dist.init_process_group(backend='nccl', # 'distributed backend' init_method='tcp://127.0.0.1:9999', world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt.rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, img_size_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls),
batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights ema = torch_utils.ModelEMA(model) # Start training nb = len(dataloader) # number of batches n_burn = max(3 * nb, 300) # burn-in iterations maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' % epochs) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number of integrated batches (since train start) imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) if ni <= n_burn: model.gr = np.interp(ni, [0, n_burn], [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) if ni == n_burn: # burnin complete print_model_biases(model) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, [0, n_burn], [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, [0, n_burn], [0.9, hyp['momentum']]) # Multi-scale training if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / 32.)
* 32 for x in imgs.shape[2:]] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # scale loss by the nominal batch size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) pbar.set_description(s) # Plot images with bounding boxes if ni < 1: f = 'train_batch%g.png' % i # filename plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, img_size=img_size_test, model=ema.ema, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader) # Write epoch results with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Write Tensorboard results if tb_writer: x = list(mloss) + list(results) titles = ['GIoU', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save training results save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # Create checkpoint chkpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last checkpoint torch.save(chkpt, last) # Save best checkpoint if (best_fitness == fi) and not final_epoch: torch.save(chkpt, best) del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n os.rename('results.txt', fresults) os.rename(wdir + 'last.pt', wdir + flast) if os.path.exists(wdir + 'last.pt') else None os.rename(wdir + 'best.pt', wdir + fbest) if os.path.exists(wdir + 'best.pt') else None if opt.bucket: # save to cloud os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket)) os.system('gsutil cp %s gs://%s/weights' % (wdir + flast, opt.bucket)) os.system('gsutil cp %s gs://%s/weights' %
(wdir + fbest, opt.bucket)) if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
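# --- Illustrative sketch (not from the original scripts) --------------------
# The YOLO training scripts above share the same cosine LambdaLR schedule: the
# multiplier decays from 1.0 to 0.05 over `epochs`, so the learning rate ends
# at 5% of lr0. A small self-contained sketch that prints the resulting curve:
import math

import torch
from torch.optim import SGD, lr_scheduler


def show_cosine_schedule(epochs=100, lr0=0.01):
    optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=lr0)
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    for epoch in range(epochs):
        optimizer.step()   # a real epoch of training would run here
        scheduler.step()
        if epoch % 10 == 0:
            print('epoch %3d  lr %.5f' % (epoch, optimizer.param_groups[0]['lr']))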
def destroy_pg(self): if self.trainer_group: dist.destroy_process_group(self.trainer_group)
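# --- Illustrative sketch (not from the original scripts) --------------------
# destroy_pg() above passes a specific group handle to destroy_process_group,
# which implies self.trainer_group was created as a sub-group of the default
# world. A minimal assumed sketch of creating such a group:
import torch.distributed as dist


def make_trainer_group(trainer_ranks):
    # Every process in the default group must call this with the same rank list.
    return dist.new_group(ranks=trainer_ranks)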
def train(): global max_id_dict print('Task mode: {}'.format(opt.task)) last = wdir + opt.task + '_last.pt' cfg = opt.cfg data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = max(round(64 / batch_size), 1) # accumulate n times before optimizer update (bs 64) weights = opt.weights # initial training weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) # Image Sizes gs = 64 # (pixels) grid size assert math.fmod( imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = grid_min * gs, grid_max * gs img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int( data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Dataset if opt.task == 'pure_detect': dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyper parameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) else: dataset = LoadImgsAndLbsWithID( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyper parameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) # Initialize model if opt.task == 'pure_detect': model = Darknet( cfg=cfg, img_size=img_size, verbose=False, max_id_dict=max_id_dict, # after dataset's statistics emb_dim=128, mode=opt.task).to(device) else: max_id_dict = dataset.max_ids_dict model = Darknet( cfg=cfg, img_size=img_size, verbose=False, max_id_dict=max_id_dict, # using priori knowledge emb_dim=128, mode=opt.task).to(device) # print(model) print(max_id_dict) # Optimizer definition and model parameters registration pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else # do not succeed... if opt.auto_weight: if opt.task == 'pure_detect' or opt.task == 'detect': awl = AutomaticWeightedLoss(3) elif opt.task == 'track': awl = AutomaticWeightedLoss(4) if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. 
SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) if opt.auto_weight: optimizer.add_param_group({ 'params': awl.parameters(), 'weight_decay': 0 }) # auto weighted params del pg0, pg1, pg2 start_epoch = 0 best_fitness = 0.0 # attempt_download(weights) if weights.endswith('.pt'): # pytorch format chkpt = torch.load(weights, map_location=device) # load model try: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e if 'epoch' in chkpt.keys(): print('Checkpoint of epoch {} loaded.'.format(chkpt['epoch'])) # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt # load dark-net format weights elif len(weights) > 0: load_darknet_weights(model, weights) # freeze weights of some previous layers(for yolo detection only) for layer_i, (name, child) in enumerate(model.module_list.named_children()): if layer_i < 51: for param in child.parameters(): param.requires_grad = False else: print('Layer ', name, ' requires grad.') # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.95 + 0.05 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 # see link below # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 ## Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, '.-', label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9997', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) model.yolo_layer_inds = model.module.yolo_layer_inds # move yolo layer indices to top level # Data loader batch_size = min(batch_size, len(dataset)) nw = 0 # for debugging if not opt.isdebug: nw = 8 # min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers data_loader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt. 
rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Test loader if opt.task == 'pure_detect': test_loader = torch.utils.data.DataLoader( LoadImagesAndLabels( test_path, imgsz_test, batch_size, hyp=hyp, rect=True, # True cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) else: test_loader = torch.utils.data.DataLoader( LoadImgsAndLbsWithID(test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Define model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyper-parameters to model model.gr = 1.0 # g_iou loss_funcs ratio (obj_loss = 1.0 or g_iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # Model EMA: exponential moving average ema = torch_utils.ModelEMA(model) # Start training nb = len(data_loader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g data_loader workers' % nw) print('Starting training for %g epochs...' % epochs) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # train mode # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx if opt.task == 'pure_detect' or opt.task == 'detect': m_loss = torch.zeros(4).to(device) # mean losses elif opt.task == 'track': m_loss = torch.zeros(5).to(device) # mean losses else: print('[Err]: unrecognized task mode.') return if opt.task == 'track': print(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'reid', 'total', 'targets', 'img_size')) elif opt.task == 'detect' or opt.task == 'pure_detect': print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) else: print('[Err]: unrecognized task mode.') return p_bar = tqdm(enumerate(data_loader), total=nb) # progress bar if opt.task == 'pure_detect' or opt.task == 'detect': for batch_i, ( imgs, targets, paths, shape ) in p_bar: # batch ------------------------------------------------------------- ni = batch_i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Burn-in if ni <= n_burn * 2: model.gr = np.interp( ni, [0, n_burn * 2], [0.0, 1.0 ]) # giou loss_funcs ratio (obj_loss = 1.0 or giou) if ni == n_burn: # burn_in complete print_model_biases(model) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, [0, n_burn], [ 0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp(ni, [0, n_burn], [0.9, hyp['momentum']]) # Multi-Scale 
if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model.forward(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('[Warning]: infinite loss_funcs, ending training ', loss_items) return results # Backward loss *= batch_size / 64.0 # scale loss_funcs if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print m_loss = (m_loss * batch_i + loss_items) / ( batch_i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *m_loss, len(targets), img_size) p_bar.set_description(s) # Plot if ni < 1: f = 'train_batch%g.jpg' % batch_i # filename plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') # tb_writer.add_graph(model, imgs) # add model to tensorboard # Save model if ni != 0 and ni % 300 == 0: # save checkpoint every 100 batches save = (not opt.nosave) or (not opt.evolve) if save: chkpt = {'epoch': epoch, 'batch': ni, 'best_fitness': best_fitness, 'model': ema.ema.module.state_dict() \ if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': optimizer.state_dict()} # Save last, best and delete torch.save(chkpt, last) print('{:s} saved.'.format(last)) del chkpt # Save .weights file wei_f_path = wdir + opt.task + '_last.weights' save_weights(model, wei_f_path) print('{:s} saved.'.format(wei_f_path)) elif opt.task == 'track': for batch_i, ( imgs, targets, paths, shape, track_ids ) in p_bar: # batch ------------------------------------------------------------- ni = batch_i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) track_ids = track_ids.to(device) # Burn-in if ni <= n_burn * 2: model.gr = np.interp( ni, [0, n_burn * 2], [0.0, 1.0 ]) # giou loss_funcs ratio (obj_loss = 1.0 or giou) if ni == n_burn: # burnin complete print_model_biases(model) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, [0, n_burn], [ 0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp(ni, [0, n_burn], [0.9, hyp['momentum']]) # print('Lr {:.3f}'.format(x['lr'])) # Multi-Scale if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred, reid_feat_out = model.forward(imgs) # Loss loss, loss_items = compute_loss_no_upsample( pred, reid_feat_out, targets, track_ids, model) if opt.auto_weight: loss = awl.forward(loss_items[0], loss_items[1], 
loss_items[2], loss_items[3]) if not torch.isfinite(loss_items[3]): print('[Warning]: infinite reid loss.') loss_items[3:] = torch.zeros((1, 1), device=device) if not torch.isfinite(loss): for i in range(loss_items.shape[0]): loss_items[i] = torch.zeros((1, 1), device=device) print('[Warning] infinite loss_funcs', loss_items) # ending training return results # Backward loss *= batch_size / 64.0 # scale loss_funcs if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print m_loss = (m_loss * batch_i + loss_items) / ( batch_i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 7) % ( '%g/%g' % (epoch, epochs - 1), mem, *m_loss, len(targets), img_size) p_bar.set_description(s) # Plot if ni < 1: f = 'train_batch%g.jpg' % batch_i # filename plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC') # tb_writer.add_graph(model, imgs) # add model to tensorboard # Save model if ni != 0 and ni % 300 == 0: # save checkpoint every 100 batches save = (not opt.nosave) or (not opt.evolve) if save: chkpt = {'epoch': epoch, 'batch': ni, 'best_fitness': best_fitness, 'model': ema.ema.module.state_dict() \ if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': optimizer.state_dict()} # Save last, best and delete torch.save(chkpt, last) print('{:s} saved.'.format(last)) del chkpt # Save .weights file wei_f_path = wdir + opt.task + '_last.weights' save_weights(model, wei_f_path) print('{:s} saved.'.format(wei_f_path)) # end batch ------------------------------------------------------------------------------------------------ else: print('[Err]: unrecognized task mode.') return # Update scheduler scheduler.step() # Process epoch results ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([ x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data'] ]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, img_size=imgsz_test, model=ema.ema, save_json=final_epoch and is_coco, single_cls=opt.single_cls, data_loader=test_loader, task=opt.task) # Write with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'train/reid_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(m_loss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: # create checkpoint: whithin an epoch, no results yet, donot save results.txt chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete 
torch.save(chkpt, last) if (best_fitness == fi) and not final_epoch: torch.save(chkpt, best) del chkpt # # Save .weights file # wei_f_path = wdir + opt.task + '_last.weights' # save_weights(model, wei_f_path) # print('{:s} saved.'.format(wei_f_path)) # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
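The loop above scales each loss by batch_size / 64 and only steps the optimizer every `accumulate` batches to emulate a nominal batch size of 64. Below is a minimal sketch of that accumulation pattern, with a placeholder model, loader and MSE loss standing in for this repository's objects.

import torch

def train_with_accumulation(model, loader, optimizer, device, batch_size=16, nbs=64):
    # nbs is the nominal batch size: step the optimizer once every `accumulate` mini-batches
    accumulate = max(round(nbs / batch_size), 1)
    model.train()
    optimizer.zero_grad()
    for ni, (imgs, targets) in enumerate(loader):
        imgs, targets = imgs.to(device), targets.to(device)
        loss = torch.nn.functional.mse_loss(model(imgs), targets)  # placeholder loss
        loss = loss * batch_size / nbs  # scale the per-batch loss as the loop above does (batch_size / 64)
        loss.backward()                 # gradients from successive batches are summed
        if ni % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()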
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # 
https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp[ 'lrf'] # linear else: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = max(int(model.stride.max()), 32) # grid size (max stride) nl = model.model[ -1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, batch_size * 2, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr('val: '))[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['box'] *= 3. / nl # scale to layers hyp['cls'] *= nc / 80. * 3. 
/ nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3. / nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss = ComputeLoss(model) # init loss class logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled 
by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 10 and wandb: wandb.log( { "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') if x.exists() ] }, commit=False) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights' ]) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0, compute_loss=compute_loss) # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}, step=epoch, commit=tag == tags[-1]) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # 
final model for f in [last, best]: if f.exists(): strip_optimizer(f) # strip optimizers if opt.bucket: os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = [ 'results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')] ] wandb.log({ "Results": [ wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists() ] }) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) # Test best.pt logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith('coco.yaml') and nc == 80: # if COCO for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]): # speed, mAP tests results, _, _ = test.test(opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=attempt_load(final, device).half(), single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
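This trainer relies on torch.cuda.amp for mixed precision: the forward pass runs under autocast, and the GradScaler scales the loss before backward, then unscales and steps. Below is a minimal sketch of that scale / step / update cycle, with generic model and loss_fn arguments rather than this repository's objects.

import torch
from torch.cuda import amp

def amp_step(model, imgs, targets, loss_fn, optimizer, scaler, cuda=True):
    with amp.autocast(enabled=cuda):           # run forward and loss in mixed precision
        loss = loss_fn(model(imgs), targets)
    scaler.scale(loss).backward()              # backward on the scaled loss
    scaler.step(optimizer)                     # unscales gradients, skips the step on inf/NaN
    scaler.update()                            # adjust the loss scale for the next iteration
    optimizer.zero_grad()
    return loss.detach()

# usage sketch: scaler = amp.GradScaler(enabled=torch.cuda.is_available())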
def train(): cfg = opt.cfg data = opt.data img_size = opt.img_size epochs = 1 if opt.prebias else opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights if 'pw' not in opt.arc: # remove BCELoss positive weights hyp['cls_pw'] = 1. hyp['obj_pw'] = 1. # Initialize init_seeds() multi_scale = opt.multi_scale if multi_scale: img_sz_min = round(img_size / 32 / 1.5) + 1 img_sz_max = round(img_size / 32 * 1.5) - 1 img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] nc = int(data_dict['classes']) # number of classes # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg, arc=opt.arc).to(device) # Optimizer pg0, pg1 = [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if 'Conv2d.weight' in k: pg1 += [v] # parameter group 1 (apply weight_decay) else: pg0 += [v] # parameter group 0 if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay del pg0, pg1 cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_fitness = float('inf') attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. if opt.bucket: os.system('gsutil cp gs://%s/last.pt %s' % (opt.bucket, last)) # download from bucket chkpt = torch.load(weights, map_location=device) # load model # if opt.transfer: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) # else: # model.load_state_dict(chkpt['model']) # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. cutoff = load_darknet_weights(model, weights) if opt.transfer or opt.prebias: # transfer learning edge (yolo) layers nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) if opt.prebias: for p in optimizer.param_groups: # lower param count allows more aggressive training settings: i.e. 
SGD ~0.1 lr0, ~0.9 momentum p['lr'] *= 100 # lr gain if p.get('momentum') is not None: # for SGD but not Adam p['momentum'] *= 0.9 for p in model.parameters(): if opt.prebias and p.numel() == nf: # train (yolo biases) p.requires_grad = True elif opt.transfer and p.shape[ 0] == nf: # train (yolo biases+weights) p.requires_grad = True else: # freeze layer p.requires_grad = False # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8) # gradual fall to 0.1*lr0 scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group( backend='nccl', # 'distributed backend' init_method= 'tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training image_weights=opt.img_weights, cache_labels=True if epochs > 10 else False, cache_images=False if opt.prebias else opt.cache_images) # Dataloader dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers=min([os.cpu_count(), batch_size, 16]), shuffle=not opt. rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Start training model.nc = nc # attach number of classes to model model.arc = opt.arc # attach yolo architecture model.hyp = hyp # attach hyperparameters to model # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights torch_utils.model_info(model, report='summary') # 'full' or 'summary' nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Starting %s for %g epochs...' 
% ('prebias' if opt.prebias else 'training', epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) freeze_backbone = False if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device) targets = targets.to(device) # Multi-Scale training if multi_scale: if ni / accumulate % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Plot images with bounding boxes if ni == 0: fname = 'train_batch%g.jpg' % i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) if tb_writer: tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC') # Hyperparameter burn-in # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches # if ni <= n_burn: # for m in model.named_modules(): # if m[0].endswith('BatchNorm2d'): # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 # g = (i / n_burn) ** 4 # gain rises from 0 - 1 # for x in optimizer.param_groups: # x['lr'] = hyp['lr0'] * g # x['weight_decay'] = hyp['weight_decay'] * g # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Scale loss by nominal batch_size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() # Print batch results mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available( ) else 0 # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size) pbar.set_description(s) # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results final_epoch = epoch + 1 == epochs if opt.prebias: print_model_biases(model) else: # Calculate mAP (always test final epoch, skip first 10 if opt.nosave) if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch: with torch.no_grad(): results, maps = test.test( cfg, 
data, batch_size=batch_size, img_size=opt.img_size, model=model, conf_thres=0.001 if final_epoch and epoch > 0 else 0.1, # 0.1 for speed save_json=final_epoch and epoch > 0 and 'coco.data' in data) # Write epoch results with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) # Write Tensorboard results if tb_writer: x = list(mloss) + list(results) titles = [ 'GIoU', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' ] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best mAP fitness = sum(results[4:]) # total loss if fitness < best_fitness: best_fitness = fitness # Save training results save = (not opt.nosave) or (final_epoch and not opt.evolve) or opt.prebias if save: with open(results_file, 'r') as f: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) if opt.bucket and not opt.prebias: os.system('gsutil cp %s gs://%s' % (last, opt.bucket)) # upload to bucket # Save best checkpoint if best_fitness == fitness: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, wdir + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if len(opt.name) and not opt.prebias: os.rename('results.txt', 'results_%s.txt' % opt.name) os.rename( wdir + 'last.pt', wdir + 'last_%s.pt' % opt.name) if os.path.exists(wdir + 'last.pt') else None os.rename( wdir + 'best.pt', wdir + 'best_%s.pt' % opt.name) if os.path.exists(wdir + 'best.pt') else None plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() # save to cloud # os.system('gsutil cp results.txt gs://...') # os.system('gsutil cp weights/best.pt gs://...') return results
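The trainers above use either a cosine LambdaLR ramp (from lr0 down to roughly 10% of lr0) or a MultiStepLR that drops the learning rate tenfold at 80% and 90% of the run. Below is a small, self-contained sketch of both constructions on a stand-in model; in practice only one of the two schedulers would be stepped.

import math
import torch
from torch.optim import SGD, lr_scheduler

model = torch.nn.Linear(10, 2)  # stand-in model
epochs = 100
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# cosine ramp from lr0 down to 0.1 * lr0, as in the LambdaLR variants above
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * 0.9 + 0.1
cosine_sched = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

# alternatively, the step decay used here: drop the lr 10x at 80% and 90% of training
step_sched = lr_scheduler.MultiStepLR(optimizer, milestones=[int(epochs * x) for x in (0.8, 0.9)], gamma=0.1)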
def main_cli(): parser = argparse.ArgumentParser('CEDR model training and validation') model_init_utils.add_model_init_basic_args(parser, True) parser.add_argument('--datafiles', metavar='data files', help='data files: docs & queries', type=argparse.FileType('rt'), nargs='+', required=True) parser.add_argument('--qrels', metavar='QREL file', help='QREL file', type=argparse.FileType('rt'), required=True) parser.add_argument('--train_pairs', metavar='paired train data', help='paired train data', type=argparse.FileType('rt'), required=True) parser.add_argument('--valid_run', metavar='validation file', help='validation file', type=argparse.FileType('rt'), required=True) parser.add_argument('--model_out_dir', metavar='model out dir', help='an output directory for the trained model', required=True) parser.add_argument('--epoch_qty', metavar='# of epochs', help='# of epochs', type=int, default=10) parser.add_argument('--no_cuda', action='store_true') parser.add_argument('--warmup_pct', metavar='warm-up fraction', default=None, type=float, help='use a warm-up/cool-down learning-reate schedule') parser.add_argument('--device_qty', type=int, metavar='# of device for multi-GPU training', default=1, help='# of GPUs for multi-GPU training') parser.add_argument( '--batch_sync_qty', metavar='# of batches before model sync', type=int, default=4, help= 'Model syncronization frequency for multi-GPU trainig in the # of batche' ) parser.add_argument('--master_port', type=int, metavar='pytorch master port', default=None, help='pytorch master port for multi-GPU training') parser.add_argument('--print_grads', action='store_true', help='print gradient norms of parameters') parser.add_argument('--save_epoch_snapshots', action='store_true', help='save model after each epoch') parser.add_argument( '--save_last_snapshot_every_k_batch', metavar='debug: save latest snapshot every k batch', type=int, default=None, help='debug option: save latest snapshot every k batch') parser.add_argument('--seed', metavar='random seed', help='random seed', type=int, default=42) parser.add_argument('--optim', metavar='optimizer', choices=[OPT_SGD, OPT_ADAMW], default=OPT_ADAMW, help='Optimizer') parser.add_argument('--loss_margin', metavar='loss margin', help='Margin in the margin loss', type=float, default=1) parser.add_argument( '--init_lr', metavar='init learn. rate', type=float, default=0.001, help='Initial learning rate for BERT-unrelated parameters') parser.add_argument('--momentum', metavar='SGD momentum', type=float, default=0.9, help='SGD momentum') parser.add_argument('--init_bert_lr', metavar='init BERT learn. rate', type=float, default=0.00005, help='Initial learning rate for BERT parameters') parser.add_argument('--epoch_lr_decay', metavar='epoch LR decay', type=float, default=1.0, help='Per-epoch learning rate decay') parser.add_argument('--weight_decay', metavar='weight decay', type=float, default=0.0, help='optimizer weight decay') parser.add_argument('--batch_size', metavar='batch size', type=int, default=32, help='batch size') parser.add_argument('--batch_size_val', metavar='val batch size', type=int, default=32, help='validation batch size') parser.add_argument('--backprop_batch_size', metavar='backprop batch size', type=int, default=12, help='batch size for each backprop step') parser.add_argument( '--batches_per_train_epoch', metavar='# of rand. 
batches per epoch', type=int, default=0, help='# of random batches per epoch: 0 tells to use all data') parser.add_argument( '--max_query_val', metavar='max # of val queries', type=int, default=0, help='max # of validation queries: 0 tells to use all data') parser.add_argument('--no_shuffle_train', action='store_true', help='disabling shuffling of training data') parser.add_argument('--use_external_eval', action='store_true', help='use external eval tools: gdeval or trec_eval') parser.add_argument('--eval_metric', choices=METRIC_LIST, default=METRIC_LIST[0], help='Metric list: ' + ','.join(METRIC_LIST), metavar='eval metric') parser.add_argument('--loss_func', choices=LOSS_FUNC_LIST, default=PairwiseSoftmaxLoss.name(), help='Loss functions: ' + ','.join(LOSS_FUNC_LIST)) parser.add_argument( '--json_conf', metavar='JSON config', type=str, default=None, help= 'a JSON config (simple-dictionary): keys are the same as args, takes precedence over command line args' ) args = parser.parse_args() all_arg_names = vars(args).keys() if args.json_conf is not None: conf_file = args.json_conf print(f'Reading configuration variables from {conf_file}') add_conf = utils.read_json(conf_file) for arg_name, arg_val in add_conf.items(): if arg_name not in all_arg_names: print(f'Invalid option in the configuration file: {arg_name}') sys.exit(1) arg_default = getattr(args, arg_name) exp_type = type(arg_default) if arg_default is not None and type(arg_val) != exp_type: print( f'Invalid type in the configuration file: {arg_name} expected type: ' + str(type(exp_type)) + f' default {arg_default}') sys.exit(1) print(f'Using {arg_name} from the config') setattr(args, arg_name, arg_val) # This hack copies max query and document length parameters to the model space parameters # maybe some other approach is more elegant, but this one should at least work setattr(args, f'{MODEL_PARAM_PREF}max_query_len', args.max_query_len) setattr(args, f'{MODEL_PARAM_PREF}max_doc_len', args.max_doc_len) if args.save_last_snapshot_every_k_batch is not None and args.save_last_snapshot_every_k_batch < 2: print('--save_last_snapshot_every_k_batch should be > 1') sys.exit(1) utils.set_all_seeds(args.seed) loss_name = args.loss_func if loss_name == PairwiseSoftmaxLoss.name(): loss_obj = PairwiseSoftmaxLoss() elif loss_name == MarginRankingLossWrapper.name(): loss_obj = MarginRankingLossWrapper(margin=args.loss_margin) else: print('Unsupported loss: ' + loss_name) sys.exit(1) # If we have the complete model, we just load it, # otherwise we first create a model and load *SOME* of its weights. # For example, if we start from an original BERT model, which has # no extra heads, it we will load only the respective weights and # initialize the weights of the head randomly. 
if args.init_model is not None: print('Loading a complete model from:', args.init_model.name) model = torch.load(args.init_model.name, map_location='cpu') elif args.init_model_weights is not None: model = model_init_utils.create_model_from_args(args) print('Loading model weights from:', args.init_model_weights.name) model.load_state_dict(torch.load(args.init_model_weights.name, map_location='cpu'), strict=False) else: print('Creating the model from scratch!') model = model_init_utils.create_model_from_args(args) os.makedirs(args.model_out_dir, exist_ok=True) print(model) utils.sync_out_streams() model.set_grad_checkpoint_param(args.grad_checkpoint_param) dataset = data.read_datafiles(args.datafiles) qrelf = args.qrels.name qrels = readQrelsDict(qrelf) train_pairs_all = data.read_pairs_dict(args.train_pairs) valid_run = readRunDict(args.valid_run.name) max_query_val = args.max_query_val query_ids = list(valid_run.keys()) if max_query_val > 0: query_ids = query_ids[0:max_query_val] valid_run = {k: valid_run[k] for k in query_ids} print('# of eval. queries:', len(query_ids), ' in the file', args.valid_run.name) device_qty = args.device_qty master_port = args.master_port if device_qty > 1: if master_port is None: print('Specify a master port for distributed training!') sys.exit(1) processes = [] is_distr_train = device_qty > 1 qids = [] if is_distr_train: qids = list(train_pairs_all.keys()) sync_barrier = Barrier(device_qty) # We must go in the reverse direction, b/c # rank == 0 trainer is in the same process and # we call the function do_train in the same process, # i.e., this call is blocking processing and # prevents other processes from starting. for rank in range(device_qty - 1, -1, -1): if is_distr_train: device_name = f'cuda:{rank}' else: device_name = args.device_name if args.no_cuda: device_name = DEVICE_CPU # When we have only a single GPP, the main process is its own master is_master_proc = rank == 0 train_params = TrainParams( init_lr=args.init_lr, init_bert_lr=args.init_bert_lr, momentum=args.momentum, warmup_pct=args.warmup_pct, batch_sync_qty=args.batch_sync_qty, epoch_lr_decay=args.epoch_lr_decay, weight_decay=args.weight_decay, backprop_batch_size=args.backprop_batch_size, batches_per_train_epoch=args.batches_per_train_epoch, save_epoch_snapshots=args.save_epoch_snapshots, save_last_snapshot_every_k_batch=args. 
save_last_snapshot_every_k_batch, batch_size=args.batch_size, batch_size_val=args.batch_size_val, max_query_len=args.max_query_len, max_doc_len=args.max_doc_len, epoch_qty=args.epoch_qty, device_name=device_name, use_external_eval=args.use_external_eval, eval_metric=args.eval_metric.lower(), print_grads=args.print_grads, shuffle_train=not args.no_shuffle_train, optim=args.optim) train_pair_qty = len(train_pairs_all) if is_distr_train or train_pair_qty < device_qty: tpart_qty = int((train_pair_qty + device_qty - 1) / device_qty) train_start = rank * tpart_qty train_end = min(train_start + tpart_qty, len(qids)) train_pairs = { k: train_pairs_all[k] for k in qids[train_start:train_end] } else: train_pairs = train_pairs_all print('Process rank %d device %s using %d training pairs out of %d' % (rank, device_name, len(train_pairs), train_pair_qty)) param_dict = { 'sync_barrier': sync_barrier, 'device_qty': device_qty, 'master_port': master_port, 'rank': rank, 'is_master_proc': is_master_proc, 'dataset': dataset, 'qrels': qrels, 'qrel_file_name': qrelf, 'train_pairs': train_pairs, 'valid_run': valid_run, 'model_out_dir': args.model_out_dir, 'model': model, 'loss_obj': loss_obj, 'train_params': train_params } if is_distr_train and not is_master_proc: p = Process(target=do_train, kwargs=param_dict) p.start() processes.append(p) else: do_train(**param_dict) for p in processes: utils.join_and_check_stat(p) if device_qty > 1: dist.destroy_process_group()
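main_cli lets a JSON config override command-line defaults while rejecting unknown keys and type mismatches. Below is a minimal sketch of that override logic, assuming plain json.load in place of the utils.read_json helper used above.

import argparse
import json
import sys

def apply_json_conf(args: argparse.Namespace, conf_file: str) -> argparse.Namespace:
    # values from the JSON file override parsed arguments; unknown keys and
    # type mismatches are rejected, mirroring the checks in main_cli above
    with open(conf_file) as f:
        overrides = json.load(f)
    for name, val in overrides.items():
        if name not in vars(args):
            sys.exit(f'Invalid option in the configuration file: {name}')
        default = getattr(args, name)
        if default is not None and type(val) is not type(default):
            sys.exit(f'Invalid type for {name}: expected {type(default).__name__}')
        setattr(args, name, val)
    return args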
def train(hyp): epochs = opt.epochs # 300 batch_size = opt.batch_size # 64 weights = opt.weights # initial training weights # Configure init_seeds(1) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg).to(device) assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % ( opt.data, nc, opt.cfg, model.md['nc']) model.names = data_dict['names'] # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else optimizer = optim.Adam(pg0, lr=hyp['lr0']) if opt.adam else \ optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Load Model google_utils.attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if model.state_dict()[k].shape == v.shape } # to FP32, filter model.load_state_dict(ckpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scatch." 
\ % (opt.weights, opt.cfg, opt.weights, opt.weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt start_epoch = ckpt['epoch'] + 1 del ckpt # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 # do not move # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count( ) > 1 and torch.distributed.is_available(): dist.init_process_group( backend='nccl', # distributed backend init_method='tcp://127.0.0.1:9999', # init method world_size=1, # number of nodes rank=0) # node rank model = torch.nn.parallel.DistributedDataParallel(model) # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Correct your labels or your model.' % ( mlc, nc, opt.cfg) # Testloader testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights # Class frequency labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) if tb_writer: plot_labels(labels) tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Exponential moving average ema = torch_utils.ModelEMA(model) # Start training t0 = time.time() nb = len(dataloader) # number of batches n_burn = max(3 * nb, 1e3) # burn-in iterations, max(3 epochs, 1k iterations) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Burn-in if ni <= n_burn: xi = [0, n_burn] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = 'train_batch%g.jpg' % ni # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # mAP ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema, single_cls=opt.single_cls, dataloader=testloader) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, 
R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(model, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete if (best_fitness == fi) and not final_epoch: torch.save(ckpt, wdir + 'best' + str(epoch) + '.pt') else: torch.save(ckpt, wdir + 'last' + str(epoch) + '.pt') del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group( ) if device.type != 'cpu' and torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
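Several of the trainers above keep an exponential moving average of the weights (torch_utils.ModelEMA) and checkpoint or evaluate the EMA copy instead of the live model. Below is a minimal EMA sketch for a plain nn.Module; it is an assumption-level stand-in, not the ModelEMA class used here, which additionally ramps the decay with the update count.

import copy
import torch

class SimpleEMA:
    def __init__(self, model, decay=0.999):
        self.ema = copy.deepcopy(model).eval()  # shadow copy that is updated but never trained
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        msd = model.state_dict()
        for k, v in self.ema.state_dict().items():
            if v.dtype.is_floating_point:
                # blend the shadow weights toward the live weights after each optimizer step
                v.mul_(self.decay).add_(msd[k].detach(), alpha=1.0 - self.decay)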
def run_test_collect_shards(rank, world_size, reference_rank, tempfile_name):
    dist_init(rank, world_size, tempfile_name)
    device = torch.device(rank) if torch.cuda.device_count() > 1 else DEVICE
    torch.cuda.set_device(rank)

    # Run a dummy step so that the optimizer state dict exists
    batch, input_width, hidden, target_width = 3, 3, 3, 5
    target = torch.rand((batch, target_width), device=device)
    inputs = torch.rand((batch, input_width), device=device)

    model = torch.nn.Sequential(torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, target_width))
    model.to(device)

    loss_fn = torch.nn.L1Loss()
    loss_fn.to(device)

    # With SGD, momentum is required to get a state to shard
    optimizer = optim.OSS(model.parameters(), lr=0.1, momentum=0.99)

    def closure():
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        return loss

    _ = optimizer.step(closure=closure)

    # Update the optimizer state on the reference rank
    optimizer.consolidate_state_dict(recipient_rank=reference_rank)

    # Fetch the state on the reference rank
    # - check that it has the correct size
    # - load it again
    if rank == reference_rank:
        optimizer_state_dict = optimizer.state_dict()
        assert len(optimizer_state_dict["state"]) == len(list(model.parameters()))
    else:
        optimizer_state_dict = {}

    # Distribute to the other ranks
    optimizer_state_dict = sync_object_ranks(optimizer_state_dict, reference_rank, device)

    # Load the optimizer state dict
    optimizer.load_state_dict(optimizer_state_dict)

    # Check that the states are not None, but {}
    for state in optimizer.state.values():
        for _, _ in state.items():
            pass

    # Test the state dict materialization on all ranks
    _ = optimizer.step(closure=closure)
    optimizer_state_dict = optimizer.state_dict(all_ranks=True)  # one per rank
    optimizer.load_state_dict(optimizer_state_dict)
    _ = optimizer.step(closure=closure)
    check_same_models_across_ranks(model, dist.group.WORLD, params_should_be_equal=True, check_broadcast_buffers=False)

    # Check that if the model is moved to cpu, the optimizer consolidation still works
    model.cpu()
    optimizer = optim.OSS(model.parameters(), lr=0.1, momentum=0.99)
    optimizer.consolidate_state_dict(recipient_rank=reference_rank)

    dist.destroy_process_group()
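The test above drives the optimizer through a closure so that optimizer.step(closure=closure) can re-evaluate the loss internally. Below is a standalone sketch of that closure pattern with a plain torch.optim.SGD rather than the sharded OSS optimizer.

import torch

model = torch.nn.Linear(3, 5)
inputs, target = torch.rand(3, 3), torch.rand(3, 5)
loss_fn = torch.nn.L1Loss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.99)

def closure():
    # the optimizer may call this several times; it must recompute loss and gradients
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), target)
    loss.backward()
    return loss

last_loss = optimizer.step(closure=closure)  # returns the loss computed by the closure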
def evaluate(args): if args.local_rank == 0 and not os.path.exists(args.dir): # create 16 random image, mask paris for evaluation print(f"generating synthetic data to {args.dir} (this may take a while)") os.makedirs(args.dir) # set random seed to generate same random data for every node np.random.seed(seed=0) for i in range(16): im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) n = nib.Nifti1Image(im, np.eye(4)) nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) n = nib.Nifti1Image(seg, np.eye(4)) nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) # initialize the distributed evaluation process, every GPU runs in a process dist.init_process_group(backend="nccl", init_method="env://") images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) val_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)] # define transforms for image and segmentation val_transforms = Compose( [ LoadNiftid(keys=["image", "label"]), AsChannelFirstd(keys=["image", "label"], channel_dim=-1), ScaleIntensityd(keys="image"), ToTensord(keys=["image", "label"]), ] ) # create a evaluation data loader val_ds = Dataset(data=val_files, transform=val_transforms) # create a evaluation data sampler val_sampler = DistributedSampler(val_ds, shuffle=False) # sliding window inference need to input 1 image in every iteration val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, num_workers=2, pin_memory=True, sampler=val_sampler) # create UNet, DiceLoss and Adam optimizer device = torch.device(f"cuda:{args.local_rank}") net = monai.networks.nets.UNet( dimensions=3, in_channels=1, out_channels=1, channels=(16, 32, 64, 128, 256), strides=(2, 2, 2, 2), num_res_units=2, ).to(device) # wrap the model with DistributedDataParallel module net = DistributedDataParallel(net, device_ids=[args.local_rank]) val_post_transforms = Compose( [ Activationsd(keys="pred", sigmoid=True), AsDiscreted(keys="pred", threshold_values=True), KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]), ] ) val_handlers = [ CheckpointLoader( load_path="./runs/checkpoint_epoch=4.pth", load_dict={"net": net}, # config mapping to expected GPU device map_location={"cuda:0": f"cuda:{args.local_rank}"}, ), ] if dist.get_rank() == 0: logging.basicConfig(stream=sys.stdout, level=logging.INFO) val_handlers.extend( [ StatsHandler(output_transform=lambda x: None), SegmentationSaver( output_dir="./runs/", batch_transform=lambda batch: batch["image_meta_dict"], output_transform=lambda output: output["pred"], ), ] ) evaluator = SupervisedEvaluator( device=device, val_data_loader=val_loader, network=net, inferer=SlidingWindowInferer(roi_size=(96, 96, 96), sw_batch_size=4, overlap=0.5), post_transform=val_post_transforms, key_val_metric={ "val_mean_dice": MeanDice( include_background=True, output_transform=lambda x: (x["pred"], x["label"]), device=device, ) }, additional_metrics={"val_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]), device=device)}, val_handlers=val_handlers, # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation amp=True if monai.config.get_torch_version_tuple() >= (1, 6) else False, ) evaluator.run() dist.destroy_process_group()
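# evaluate() above expects one process per GPU and an "env://" rendezvous, i.e. it is meant
# to be started by a distributed launcher that sets MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE
# and passes --local_rank. A minimal entry-point sketch (script name and defaults are
# illustrative):
#   python -m torch.distributed.launch --nproc_per_node=NUM_GPUS evaluate_script.py -d DATA_DIR
import argparse

def evaluate_main_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dir", default="./testdata", type=str, help="directory of the evaluation data")
    parser.add_argument("--local_rank", type=int, default=0, help="filled in by torch.distributed.launch")
    args = parser.parse_args()
    evaluate(args=args)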
def run_gradient_clipping(rank, world_size, tempfile_name): dist_init(rank, world_size, tempfile_name, backend="gloo") device = torch.device(rank) torch.manual_seed( rank) # make sure that the different rank get different data # Run a dummy step so that the optimizer state dict exists batch, input_width, hidden, target_width = 3, 20, 10, 5 target = torch.rand((batch, target_width), device=device) inputs = torch.rand((batch, input_width), device=device) NORMS = [1.0, 2.0, 1, 2, inf] CLIP_NORM = 0.3 def check(norm): model_oss = torch.nn.Sequential( torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, hidden), torch.nn.Linear(hidden, target_width), ).to(device) model = copy.deepcopy(model_oss) # For this test the gradients are (all) reduced in the same way in between the torch reference and fairscale. # Normally OSS would use ShardedDDP and only reduce to the proper rank, but this does not change the # gradient norm computation from OSS and adds a dependency. # to keep the comparison apples-to-apples DDP is used in both cases model_oss = DDP( module=model_oss, device_ids=[rank], ) sharded_optimizer = optim.OSS(model_oss.parameters(), lr=0.1, momentum=0.99) model = DDP( model, device_ids=[rank], ) loss_fn = torch.nn.L1Loss() loss_fn.to(device) model.zero_grad() model_oss.zero_grad() outputs = model(inputs) outputs_oss = model_oss(inputs) loss = loss_fn(outputs, target) loss.backward() loss_oss = loss_fn(outputs_oss, target) loss_oss.backward() torch.testing.assert_allclose(loss_oss, loss) # Check the equivalence with the non-sharded optim oss_total_norm = sharded_optimizer.clip_grad_norm(CLIP_NORM, norm_type=norm) total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM, norm_type=norm) assert torch.allclose( oss_total_norm, total_norm), "torch and fairscale should return the same grad norm" # Check that the params have indeed been clipped for params in sharded_optimizer._per_device_params.values(): for param in filter(lambda x: x.grad is not None, params[rank]): assert torch.norm( param.grad, p=norm ) < CLIP_NORM, f"param grad norm above clip : {param.grad}" for norm in NORMS: print(f"Checking norm {norm}") check(norm) # Check twice, catch an hypothetic iterator dumb mistake check(norm) dist.destroy_process_group()
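# dist_init() used by the two tests above is an external helper. A minimal sketch of a
# file-store based initialization consistent with how it is called (rank, world size and a
# temporary file shared by all processes; the default backend here is an assumption):
import torch.distributed as dist

def dist_init_sketch(rank: int, world_size: int, tempfile_name: str, backend: str = "nccl") -> None:
    url = "file://" + tempfile_name  # filesystem rendezvous shared by every process
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)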
def main_worker(args): # disable logging for processes except 0 on every node if args.local_rank != 0: f = open(os.devnull, "w") sys.stdout = sys.stderr = f if not os.path.exists(args.dir): raise FileNotFoundError(f"Missing directory {args.dir}") # initialize the distributed training process, every GPU runs in a process dist.init_process_group(backend="nccl", init_method="env://") total_start = time.time() train_transforms = Compose([ # load 4 Nifti images and stack them together LoadImaged(keys=["image", "label"]), AsChannelFirstd(keys="image"), ConvertToMultiChannelBasedOnBratsClassesd(keys="label"), Spacingd(keys=["image", "label"], pixdim=(1.5, 1.5, 2.0), mode=("bilinear", "nearest")), Orientationd(keys=["image", "label"], axcodes="RAS"), RandSpatialCropd(keys=["image", "label"], roi_size=[128, 128, 64], random_size=False), NormalizeIntensityd(keys="image", nonzero=True, channel_wise=True), RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=0), RandScaleIntensityd(keys="image", factors=0.1, prob=0.5), RandShiftIntensityd(keys="image", offsets=0.1, prob=0.5), ToTensord(keys=["image", "label"]), ]) # create a training data loader train_ds = BratsCacheDataset( root_dir=args.dir, transform=train_transforms, section="training", num_workers=4, cache_rate=args.cache_rate, shuffle=True, ) train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) # validation transforms and dataset val_transforms = Compose([ LoadImaged(keys=["image", "label"]), AsChannelFirstd(keys="image"), ConvertToMultiChannelBasedOnBratsClassesd(keys="label"), Spacingd(keys=["image", "label"], pixdim=(1.5, 1.5, 2.0), mode=("bilinear", "nearest")), Orientationd(keys=["image", "label"], axcodes="RAS"), CenterSpatialCropd(keys=["image", "label"], roi_size=[128, 128, 64]), NormalizeIntensityd(keys="image", nonzero=True, channel_wise=True), ToTensord(keys=["image", "label"]), ]) val_ds = BratsCacheDataset( root_dir=args.dir, transform=val_transforms, section="validation", num_workers=4, cache_rate=args.cache_rate, shuffle=False, ) val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if dist.get_rank() == 0: # Logging for TensorBoard writer = SummaryWriter(log_dir=args.log_dir) # create UNet, DiceLoss and Adam optimizer device = torch.device(f"cuda:{args.local_rank}") torch.cuda.set_device(device) if args.network == "UNet": model = UNet( dimensions=3, in_channels=4, out_channels=3, channels=(16, 32, 64, 128, 256), strides=(2, 2, 2, 2), num_res_units=2, ).to(device) else: model = SegResNet(in_channels=4, out_channels=3, init_filters=16, dropout_prob=0.2).to(device) loss_function = DiceLoss(to_onehot_y=False, sigmoid=True, squared_pred=True) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5, amsgrad=True) # wrap the model with DistributedDataParallel module model = DistributedDataParallel(model, device_ids=[device]) # start a typical PyTorch training total_epoch = args.epochs best_metric = -1000000 best_metric_epoch = -1 epoch_time = AverageMeter("Time", ":6.3f") progress = ProgressMeter(total_epoch, [epoch_time], prefix="Epoch: ") end = time.time() print(f"Time elapsed before training: {end-total_start}") for epoch in range(total_epoch): train_loss = train(train_loader, model, loss_function, optimizer, epoch, args, device) epoch_time.update(time.time() - end) if epoch % args.print_freq == 0: progress.display(epoch) if dist.get_rank() == 0: 
writer.add_scalar("Loss/train", train_loss, epoch) if (epoch + 1) % args.val_interval == 0: metric, metric_tc, metric_wt, metric_et = evaluate( model, val_loader, device) if dist.get_rank() == 0: writer.add_scalar("Mean Dice/val", metric, epoch) writer.add_scalar("Mean Dice TC/val", metric_tc, epoch) writer.add_scalar("Mean Dice WT/val", metric_wt, epoch) writer.add_scalar("Mean Dice ET/val", metric_et, epoch) if metric > best_metric: best_metric = metric best_metric_epoch = epoch + 1 print( f"current epoch: {epoch + 1} current mean dice: {metric:.4f}" f" tc: {metric_tc:.4f} wt: {metric_wt:.4f} et: {metric_et:.4f}" f"\nbest mean dice: {best_metric:.4f} at epoch: {best_metric_epoch}" ) end = time.time() print(f"Time elapsed after epoch {epoch + 1} is {end - total_start}") if dist.get_rank() == 0: print( f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}" ) # all processes should see same parameters as they all start from same # random parameters and gradients are synchronized in backward passes, # therefore, saving it in one process is sufficient torch.save(model.state_dict(), "final_model.pth") writer.flush() dist.destroy_process_group()
def run_ddp_parity(rank, world_size, backend, temp_file_name, change_train_graph, broadcast_fp16): url = "file://" + temp_file_name dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size) device = torch.device("cuda") torch.cuda.set_device(rank) torch.manual_seed(rank) np.random.seed(rank) hidden = 5 in_channels = 3 out_channels = 3 batch = 64 def check_optimizer_equivalence(optimizer: Type[torch.optim.Optimizer], change_train_graph: bool = False): # Any model works. Add one different buffer per rank trunk = torch.nn.Sequential(torch.nn.Linear(in_channels, hidden), torch.nn.Linear(hidden, hidden), torch.nn.Linear(hidden, hidden)) trunk.register_buffer("test_buffer", torch.ones((1)) * rank) trunk.to(device) head = torch.nn.Linear(hidden, out_channels).to(device) # Define a model to be trained by OSS oss_module = torch.nn.Sequential(trunk, head) # Make sure that the param groups are interleaved, to catch an ordering bug in the state dict oss_trainable_params = [ { "params": list(trunk.parameters())[:-1] + list(head.parameters()), "lr": 1e-5 }, { "params": list(trunk.parameters())[-1], "lr": 1e-4 }, ] optimizer_settings: Dict[Any, Any] = {} if isinstance(optimizer, torch.optim.SGD): optimizer_settings["momentum"] = 0.9 sharded_optimizer = optim.OSS( params=oss_trainable_params, optim=optimizer, group=None, broadcast_buffer_size=2**10, **optimizer_settings, ) oss_ddp_model = DDP(module=oss_module, device_ids=[rank], broadcast_buffers=True, find_unused_parameters=True) # Define a model to be trained by normal pytorch + DDP ddp_trunk = copy.deepcopy(trunk) ddp_head = copy.deepcopy(head) ddp_module = torch.nn.Sequential(ddp_trunk, ddp_head) ddp_trainable_params = [ { "params": list(ddp_trunk.parameters())[:-1] + list(ddp_head.parameters()), "lr": 1e-5 }, { "params": list(ddp_trunk.parameters())[-1], "lr": 1e-4 }, ] ddp_optimizer = optimizer(ddp_trainable_params, **optimizer_settings) # type: ignore ddp_model = DDP(module=ddp_module, device_ids=[rank], broadcast_buffers=True, find_unused_parameters=True) def check_step(): input_tensor = torch.rand((batch, in_channels)).to(device) def closure_ddp(input_tensor=input_tensor): ddp_optimizer.zero_grad() ddp_loss = ddp_model(input_tensor).abs().sum() ddp_loss.backward() return ddp_loss def closure_sharded(input_tensor=input_tensor): sharded_optimizer.zero_grad() sharded_loss = oss_ddp_model(input_tensor).abs().sum() sharded_loss.backward() return sharded_loss loss_ddp = cast(torch.Tensor, ddp_optimizer.step(closure=closure_ddp)) loss_sharded_optim = cast( torch.Tensor, sharded_optimizer.step(closure=closure_sharded)) assert torch.allclose( loss_ddp, loss_sharded_optim, rtol=1e-3 ), f"Losses differ in between Pytorch optim and OSS\n {loss_ddp.item()} - {loss_sharded_optim.item()} - world size {world_size}" check_same_model_params(oss_ddp_model, ddp_model) # The model should be synchronized in between the ranks at construction time, check that check_same_model_params(oss_ddp_model, ddp_model) # The models should stay the same in between ddp and sharded optimizer for i in range(5): check_step() # Check that altering the trainable parameters does not cause DDP and OSS to diverge if change_train_graph: # Flip the first parameter from trainable to non-trainable and vice-versa next(ddp_module.parameters()).requires_grad = not next( ddp_module.parameters()).requires_grad next(oss_module.parameters()).requires_grad = not next( oss_module.parameters()).requires_grad # sharded_optimizer.refresh_trainable() # Check that the 
checkpoints are compatible (post pytorch 1.5) if torch_version()[1] > 5: # - get states ddp_state_dict = ddp_optimizer.state_dict() sharded_optimizer.consolidate_state_dict( recipient_rank=RECIPIENT_RANK) sharded_optim_state_dict = sharded_optimizer.state_dict( ) if rank == RECIPIENT_RANK else {} sharded_optim_state_dict = sync_object_ranks( sharded_optim_state_dict, RECIPIENT_RANK, device) # - cross load the states # run one step and check that the models are still the same ddp_state_dict_ref = copy.deepcopy( ddp_state_dict) # OSS will remove some states ddp_optimizer.load_state_dict( sharded_optim_state_dict) # mixup on purpose ! sharded_optimizer.load_state_dict(ddp_state_dict) check_step() # - self load, rewind, check no problem # run one step and check that the models are still the same ddp_optimizer.load_state_dict(ddp_state_dict_ref) sharded_optimizer.load_state_dict(sharded_optim_state_dict) check_step() for opt in [torch.optim.Adam, torch.optim.SGD]: check_optimizer_equivalence(opt, change_train_graph=change_train_graph) dist.destroy_process_group()
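# check_same_model_params() used in the parity test above is an external assertion helper.
# A minimal sketch of the element-wise comparison it is assumed to perform between the
# OSS-driven model and the reference DDP model:
import torch

def check_same_model_params_sketch(model_a: torch.nn.Module, model_b: torch.nn.Module, message: str = "") -> None:
    for p_a, p_b in zip(model_a.parameters(), model_b.parameters()):
        assert torch.allclose(p_a, p_b, atol=1e-6), f"Model parameters differ {message}"
    for b_a, b_b in zip(model_a.buffers(), model_b.buffers()):
        assert torch.allclose(b_a, b_b, atol=1e-6), f"Model buffers differ {message}"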
for key, value in config.items(): print("\t{}: {}".format(key, value)) print("\n") # create log directory on one node only if (not distributed) or (dist.get_rank() == 0): from datetime import datetime now = datetime.now().strftime("%Y%m%d-%H%M%S") gpu_conf = "-single-gpu" if distributed: ngpus_per_node = torch.cuda.device_count() nnodes = dist.get_world_size() // ngpus_per_node gpu_conf = "-distributed-{}nodes-{}gpus".format( nnodes, ngpus_per_node) output_path = Path(config['output_path']) / "{}{}".format( now, gpu_conf) if not output_path.exists(): output_path.mkdir(parents=True) output_path = output_path.as_posix() print("Output path: {}".format(output_path)) try: run(output_path, config) except KeyboardInterrupt: print("Caught KeyboardInterrupt -> exit") if distributed: dist.destroy_process_group()
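# In the snippet above only one process creates the output directory. If the other ranks
# also need that path, one common pattern (a hedged sketch, not a claim about the original
# script) is to create it on rank 0 and synchronize before any rank uses it:
import torch.distributed as dist
from pathlib import Path

def make_output_path_sketch(base: str, distributed: bool) -> Path:
    output_path = Path(base)
    if (not distributed) or dist.get_rank() == 0:
        output_path.mkdir(parents=True, exist_ok=True)
    if distributed:
        dist.barrier()  # ensure the directory exists before any rank writes into it
    return output_path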
def main(opt): set_logging(RANK) if RANK in [-1, 0]: print( colorstr('train: ') + ', '.join(f'{k}={v}' for k, v in vars(opt).items())) check_git_status() check_requirements(exclude=['thop']) # Resume wandb_run = check_wandb_resume(opt) if opt.resume and not wandb_run: # resume an interrupted run ckpt = opt.resume if isinstance( opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile( ckpt), 'ERROR: --resume checkpoint does not exist' with open(Path(ckpt).parent.parent / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file( opt.cfg), check_file(opt.hyp) # check files assert len(opt.cfg) or len( opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend( [opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) opt.name = 'evolve' if opt.evolve else opt.name opt.save_dir = str( increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok or opt.evolve)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if LOCAL_RANK != -1: from datetime import timedelta assert torch.cuda.device_count( ) > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group( backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60)) assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' assert not opt.image_weights, '--image-weights argument is not compatible with DDP training' # Train if not opt.evolve: train(opt.hyp, opt, device) if WORLD_SIZE > 1 and RANK == 0: _ = [ print('Destroying process group... 
', end=''), dist.destroy_process_group(), print('Done.') ] # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0), # image mixup (probability) 'copy_paste': (1, 0.0, 1.0) } # segment copy-paste (probability) with open(opt.hyp) as f: hyp = yaml.safe_load(f) # load hyps dict if 'anchors' not in hyp: # anchors commented in hyp.yaml hyp['anchors'] = 3 assert LOCAL_RANK == -1, 'DDP mode not implemented for --evolve' opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path( opt.save_dir) / 'hyp_evolved.yaml' # save best result here if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' 
% opt.bucket) # download evolve.txt if exists for _ in range(opt.evolve): # generations to evolve if Path('evolve.txt').exists( ): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0) if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape( n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all( v == 1 ): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print( f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}' )
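# The evolution loop above reads parent hyperparameters as x[i + 7], i.e. each evolve.txt row
# is assumed to hold the 7 result columns (P, R, mAP@0.5, mAP@0.5:0.95 and the three val
# losses) followed by the hyperparameter values in hyp-key order. A minimal sketch of a
# writer producing rows with that layout (the real print_mutation also handles yaml output
# and the optional gsutil upload):
def append_evolve_row_sketch(hyp: dict, results: tuple, path: str = "evolve.txt") -> None:
    row = tuple(results) + tuple(hyp[k] for k in hyp.keys())
    with open(path, "a") as f:
        f.write(("%10.4g" * len(row)) % row + "\n")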
def train(rank, world_size, args): dist.init_process_group("nccl", rank=rank, world_size=world_size) if rank == 0 and args.logging_dir: if not os.path.isdir(args.logging_dir): os.makedirs(args.logging_dir) filehandler = logging.FileHandler( os.path.join(args.logging_dir, 'train.log')) filehandler.setLevel(logging.INFO) logger.addHandler(filehandler) writer = SummaryWriter(log_dir=args.logging_dir) else: writer = None torch.manual_seed(0) torch.cuda.set_device(rank) symbols = get_symbol_list(args.text_preprocessor) model = Tacotron2( mask_padding=args.mask_padding, n_mels=args.n_mels, n_symbol=len(symbols), n_frames_per_step=args.n_frames_per_step, symbol_embedding_dim=args.symbols_embedding_dim, encoder_embedding_dim=args.encoder_embedding_dim, encoder_n_convolution=args.encoder_n_convolution, encoder_kernel_size=args.encoder_kernel_size, decoder_rnn_dim=args.decoder_rnn_dim, decoder_max_step=args.decoder_max_step, decoder_dropout=args.decoder_dropout, decoder_early_stopping=(not args.decoder_no_early_stopping), attention_rnn_dim=args.attention_rnn_dim, attention_hidden_dim=args.attention_hidden_dim, attention_location_n_filter=args.attention_location_n_filter, attention_location_kernel_size=args.attention_location_kernel_size, attention_dropout=args.attention_dropout, prenet_dim=args.prenet_dim, postnet_n_convolution=args.postnet_n_convolution, postnet_kernel_size=args.postnet_kernel_size, postnet_embedding_dim=args.postnet_embedding_dim, gate_threshold=args.gate_threshold, ).cuda(rank) model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank]) optimizer = Adam(model.parameters(), lr=args.learning_rate) best_loss = float("inf") start_epoch = 0 if args.checkpoint_path and os.path.isfile(args.checkpoint_path): logger.info(f"Checkpoint: loading '{args.checkpoint_path}'") map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} checkpoint = torch.load(args.checkpoint_path, map_location=map_location) start_epoch = checkpoint["epoch"] best_loss = checkpoint["best_loss"] model.load_state_dict(checkpoint["state_dict"]) optimizer.load_state_dict(checkpoint["optimizer"]) logger.info( f"Checkpoint: loaded '{args.checkpoint_path}' at epoch {checkpoint['epoch']}" ) trainset, valset = get_datasets(args) train_sampler = torch.utils.data.distributed.DistributedSampler( trainset, shuffle=True, num_replicas=world_size, rank=rank, ) val_sampler = torch.utils.data.distributed.DistributedSampler( valset, shuffle=False, num_replicas=world_size, rank=rank, ) loader_params = { "batch_size": args.batch_size, "num_workers": args.workers, "prefetch_factor": 1024, 'persistent_workers': True, "shuffle": False, "pin_memory": True, "drop_last": False, "collate_fn": partial(text_mel_collate_fn, n_frames_per_step=args.n_frames_per_step), } train_loader = DataLoader(trainset, sampler=train_sampler, **loader_params) val_loader = DataLoader(valset, sampler=val_sampler, **loader_params) dist.barrier() for epoch in range(start_epoch, args.epochs): start = time() model.train() trn_loss, counts = 0, 0 if rank == 0: iterator = tqdm(enumerate(train_loader), desc=f"Epoch {epoch}", total=len(train_loader)) else: iterator = enumerate(train_loader) for i, batch in iterator: adjust_learning_rate(epoch, optimizer, args.learning_rate, args.anneal_steps, args.anneal_factor) model.zero_grad() loss, losses = training_step(model, batch, i) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() if rank == 0 and writer: 
global_iters = epoch * len(train_loader) writer.add_scalar("trn/mel_loss", losses[0], global_iters) writer.add_scalar("trn/mel_postnet_loss", losses[1], global_iters) writer.add_scalar("trn/gate_loss", losses[2], global_iters) trn_loss += loss * len(batch[0]) counts += len(batch[0]) trn_loss = trn_loss / counts trn_loss = reduce_tensor(trn_loss, world_size) if rank == 0: logger.info( f"[Epoch: {epoch}] time: {time()-start}; trn_loss: {trn_loss}") if writer: writer.add_scalar("trn_loss", trn_loss, epoch) if ((epoch + 1) % args.validate_and_checkpoint_freq == 0) or (epoch == args.epochs - 1): val_start_time = time() model.eval() val_loss, counts = 0, 0 iterator = tqdm(enumerate(val_loader), desc=f"[Rank: {rank}; Epoch: {epoch}; Eval]", total=len(val_loader)) with torch.no_grad(): for val_batch_idx, val_batch in iterator: val_loss = val_loss + validation_step( model, val_batch, val_batch_idx)[0] * len(val_batch[0]) counts = counts + len(val_batch[0]) val_loss = val_loss / counts val_loss = reduce_tensor(val_loss, world_size) if rank == 0 and writer: writer.add_scalar("val_loss", val_loss, epoch) log_additional_info(writer, model, val_loader, epoch) if rank == 0: is_best = val_loss < best_loss best_loss = min(val_loss, best_loss) logger.info( f"[Rank: {rank}, Epoch: {epoch}; Eval] time: {time()-val_start_time}; val_loss: {val_loss}" ) logger.info( f"[Epoch: {epoch}] Saving checkpoint to {args.checkpoint_path}" ) save_checkpoint( { "epoch": epoch + 1, "state_dict": model.state_dict(), "best_loss": best_loss, "optimizer": optimizer.state_dict(), }, is_best, args.checkpoint_path, ) dist.destroy_process_group()
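# reduce_tensor() above is not defined in this snippet. A minimal sketch of the averaging it
# is assumed to perform: all-reduce the value across ranks and divide by the world size so
# every rank ends up with the global mean loss.
import torch.distributed as dist

def reduce_tensor_sketch(tensor, world_size: int):
    rt = tensor.clone().detach()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt / world_size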
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = ['', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume 
start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}') # save previous weights if epochs < start_epoch: logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test\n' 'Using %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # Backward 
scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / f'train_batch{ni}.jpg') # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) # if tb_writer and result is not None: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir, plots=epoch == 0 or final_epoch) # plot first and last # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2'] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) 
dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
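# The train() function above wraps the dataset checks and weight download in a
# torch_distributed_zero_first(rank) context manager so that rank 0 does the work once while
# the other ranks wait. A minimal sketch of such a context manager, assuming the default
# process group is already initialized:
from contextlib import contextmanager
import torch.distributed as dist

@contextmanager
def torch_distributed_zero_first_sketch(local_rank: int):
    if local_rank not in [-1, 0]:
        dist.barrier()  # everyone except rank 0 (or single-process mode) waits here
    yield
    if local_rank == 0:
        dist.barrier()  # rank 0 releases the others once its work in the with-block is done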
def dist_cleanup() -> None: # Clean processes dist.destroy_process_group()
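# dist_cleanup() only tears the default process group down. A minimal sketch of the matching
# setup it is assumed to pair with (env:// rendezvous, one process per GPU; the backend and
# the LOCAL_RANK environment variable are assumptions about the launcher being used):
import os
import torch
import torch.distributed as dist

def dist_setup_sketch(backend: str = "nccl") -> int:
    # MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are expected to be set by the launcher
    dist.init_process_group(backend=backend, init_method="env://")
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
    return local_rank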