def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name): url = "file://" + temp_file_name dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size) device = torch.device("cuda") torch.cuda.set_device(rank) torch.manual_seed(rank) np.random.seed(rank) # Any model works. Add one different buffer per rank model = Sequential(Linear(2, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3)) model.register_buffer("test_buffer", torch.ones((1)) * rank) model.to(device) n_half_params = len(list(model.parameters())) // 2 sharded_optimizer = OSS(params=list(model.parameters())[:n_half_params], optim=torch.optim.SGD, lr=1e-3, momentum=0.99) sharded_optimizer_2 = OSS(params=list(model.parameters())[n_half_params:], optim=torch.optim.SGD, lr=1e-3, momentum=0.99) sharded_ddp_model = ShardedDataParallel( module=model, sharded_optimizer=sharded_optimizer, broadcast_buffers=True) ddp_model_single = copy.deepcopy(model) ddp_optimizer = torch.optim.SGD(list( ddp_model_single.parameters())[:n_half_params], lr=1e-3, momentum=0.99) ddp_optimizer_2 = torch.optim.SGD(list( ddp_model_single.parameters())[n_half_params:], lr=1e-3, momentum=0.99) ddp_model = DDP(ddp_model_single, device_ids=[rank], broadcast_buffers=True) def check_same_model_params(): for pg, ddp_pg in zip(sharded_optimizer.param_groups, ddp_optimizer.param_groups): for p, ddp_p in zip(pg["params"], ddp_pg["params"]): assert torch.allclose( p, ddp_p, atol=1e-3 ), f"Model parameters differ in between DDP and ShardedDDP {p} {ddp_p}" for b, ddp_b in zip(sharded_ddp_model.buffers(), ddp_model.buffers()): assert torch.allclose( b, ddp_b, atol=1e-3 ), "Model buffers differ in between DDP and ShardedDDP" check_same_model_params( ) # The models should stay the same in between the ranks for i in range(20): input_tensor = torch.rand((64, 2)).to(device) # Run DDP ddp_optimizer.zero_grad() ddp_optimizer_2.zero_grad() ddp_loss = ddp_model(input_tensor).abs().sum() ddp_loss.backward() ddp_optimizer.step() ddp_optimizer_2.step() # Run Sharded sharded_optimizer.zero_grad() sharded_optimizer_2.zero_grad() sharded_loss = sharded_ddp_model(input_tensor).abs().sum() sharded_loss.backward() sharded_optimizer.step() sharded_optimizer_2.step() check_same_model_params() dist.destroy_process_group()
def run_training(rank, args, hp, port=None): if args.n_gpus > 1: init_distributed(rank, args.n_gpus, port) torch.cuda.set_device(f'cuda:{rank}') ## NOTE: variable model = TransformerWav2vec2( hp, pretrain_model='facebook/wav2vec2-large-lv60', freeze_feature_extractor=hp.freeze_feature_extractor) ## TODO: change init_weight (maybe initialize all networks) #model.apply(init_weight) model.train() if rank == 0: print(model) model = model.to(rank) if args.n_gpus > 1: model = DDP(torch.nn.SyncBatchNorm.convert_sync_batchnorm(model), device_ids=[rank]) max_lr = hp.init_lr if hp.optimizer_type == 'Noam': ## NOTE: scheduling? ## NOTE: learning rate? optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, betas=(0.9, 0.98), eps=1e-9) else: optimizer = torch.optim.Adam(model.parameters(), lr=max_lr) assert (hp.batch_size is None) != (hp.max_seqlen is None) if args.n_gpus > 1: dist.barrier() # configure map_location properly map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} if hp.loaded_epoch is not None: start_epoch = hp.loaded_epoch load_dir = hp.loaded_dir print('epoch {} loaded'.format(hp.loaded_epoch)) loaded_dict = load_model("{}".format( os.path.join(load_dir, 'network.epoch{}'.format(hp.loaded_epoch))), map_location=map_location) model.load_state_dict(loaded_dict) if hp.is_flat_start: step = 1 start_epoch = 0 print('flat_start') else: loaded_dict = torch.load("{}".format( os.path.join( load_dir, 'network.optimizer.epoch{}'.format(hp.loaded_epoch))), map_location=map_location) optimizer.load_state_dict(loaded_dict) step = loaded_dict['state'][0]['step'] #lr = get_learning_rate(step//hp.accum_grad+1, hp) lr = get_learning_rate_tristage(step // hp.accum_grad + 1) for param_group in optimizer.param_groups: param_group['lr'] = lr del loaded_dict torch.cuda.empty_cache() else: start_epoch = 0 step = 1 pytorch_total_params = sum(p.numel() for p in model.parameters()) print('params = {0:.2f}M'.format(pytorch_total_params / 1000 / 1000)) train_epoch(model, optimizer, args, hp, step=step, start_epoch=start_epoch, rank=rank)
def train(hyp): cfg = opt.cfg t_cfg = opt.t_cfg # teacher model cfg for knowledge distillation data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = max(round(64 / batch_size), 1) # accumulate n times before optimizer update (bs 64) weights = opt.weights # initial training weights t_weights = opt.t_weights # teacher model weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) # Image Sizes gs = 32 # (pixels) grid size start_epoch = 0 assert math.fmod(imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs) img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int(data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # DDP init if opt.local_rank != -1: if opt.local_rank == 0: print("--------------using ddp---------------") assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) dist.init_process_group(backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.batch_size // opt.world_size else: dist.init_process_group(backend='nccl', # 'distributed backend' init_method='tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank # Initialize model steps = math.ceil(len(open(train_path).readlines()) / batch_size) * epochs model = Darknet(cfg, quantized=opt.quantized, a_bit=opt.a_bit, w_bit=opt.w_bit, FPGA=opt.FPGA, steps=steps).to(device) if t_cfg: t_model = Darknet(t_cfg).to(device) # print('<.....................using gridmask.......................>') # gridmask = GridMask(d1=96, d2=224, rotate=360, ratio=0.6, mode=1, prob=0.8) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 best_fitness = 0.0 if weights != 'None': attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. 
chkpt = torch.load(weights, map_location=device) # load model try: chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()} model.load_state_dict(chkpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) if chkpt.get('best_fitness') is not None: best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt if chkpt.get('epoch') is not None: start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. load_darknet_weights(model, weights, pt=opt.pt, FPGA=opt.FPGA) if t_cfg: if t_weights.endswith('.pt'): t_model.load_state_dict(torch.load(t_weights, map_location=device)['model']) elif t_weights.endswith('.weights'): load_darknet_weights(t_model, t_weights) else: raise Exception('pls provide proper teacher weights for knowledge distillation') t_model.eval() print('<.....................using knowledge distillation.......................>') print('teacher model:', t_weights, '\n') # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 # see link below # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, '.-', label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Initialize distributed training if opt.local_rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank, find_unused_parameters=True) else: model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) testset = LoadImagesAndLabels(test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls) # 获得要剪枝的层 if hasattr(model, 'module'): print('muti-gpus sparse') if opt.prune == 0: print('normal sparse training ') _, _, prune_idx = parse_module_defs(model.module.module_defs) elif opt.prune == 1: print('shortcut sparse training') _, _, prune_idx, _, _ = parse_module_defs2(model.module.module_defs) elif opt.prune == 2: print('layer sparse training') _, _, prune_idx = parse_module_defs4(model.module.module_defs) else: print('single-gpu sparse') if opt.prune == 0: print('normal sparse training') _, _, prune_idx = parse_module_defs(model.module_defs) elif opt.prune == 1: print('shortcut sparse training') _, _, prune_idx, _, _ = parse_module_defs2(model.module_defs) elif opt.prune == 2: print('layer sparse training') _, _, prune_idx = 
parse_module_defs4(model.module_defs) train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) # ddp sampler test_sampler = torch.utils.data.distributed.DistributedSampler(testset) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader(dataset, batch_size=int(batch_size / opt.world_size), num_workers=nw, shuffle=False if (opt.local_rank != -1) else not opt.rect, pin_memory=True, collate_fn=dataset.collate_fn, sampler=train_sampler if (opt.local_rank != -1) else None ) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) if opt.prune != -1: for idx in prune_idx: if hasattr(model, 'module'): bn_weights = gather_bn_weights(model.module.module_list, [idx]) else: bn_weights = gather_bn_weights(model.module_list, [idx]) tb_writer.add_histogram('before_train_perlayer_bn_weights/hist', bn_weights.numpy(), idx, bins='doane') # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights # Model EMA if opt.ema: ema = torch_utils.ModelEMA(model) # Start training nb = len(dataloader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) if opt.mpt: cuda = device.type != 'cpu' scaler = amp.GradScaler(enabled=cuda) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ if opt.local_rank != -1: dataloader.sampler.set_epoch(epoch) # DDP set seed # gridmask.set_prob(epoch, max_epoch) model.train() # 稀疏化标志 if opt.prune == -1: sr_flag = False else: sr_flag = True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Burn-in if ni <= n_burn: xi = [0, n_burn] # x interp model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max(1, np.interp(ni, xi, [1, 64 / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) x['weight_decay'] = np.interp(ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-Scale if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward if opt.mpt: with amp.autocast(enabled=cuda): targets = targets.to(device) pred, feature_s = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results soft_target = 0 if t_cfg: _, output_t, feature_t = t_model(imgs) if opt.KDstr == 1: soft_target = compute_lost_KD(pred, output_t, model.nc, imgs.size(0)) elif opt.KDstr == 2: soft_target, reg_ratio = compute_lost_KD2(model, targets, pred, output_t) elif opt.KDstr == 3: soft_target = compute_lost_KD3(model, targets, pred, output_t) elif opt.KDstr == 4: soft_target = compute_lost_KD4(model, targets, pred, output_t, feature_s, feature_t, imgs.size(0)) elif opt.KDstr == 5: soft_target = compute_lost_KD5(model, targets, pred, output_t, feature_s, feature_t, imgs.size(0), img_size) else: print("please select KD strategy!") loss += soft_target else: targets = targets.to(device) pred, feature_s = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results soft_target = 0 if t_cfg: _, output_t, feature_t = t_model(imgs) if opt.KDstr == 1: soft_target = compute_lost_KD(pred, output_t, model.nc, imgs.size(0)) elif opt.KDstr == 2: soft_target, reg_ratio = compute_lost_KD2(model, targets, 
pred, output_t) elif opt.KDstr == 3: soft_target = compute_lost_KD3(model, targets, pred, output_t) elif opt.KDstr == 4: soft_target = compute_lost_KD4(model, targets, pred, output_t, feature_s, feature_t, imgs.size(0)) elif opt.KDstr == 5: soft_target = compute_lost_KD5(model, targets, pred, output_t, feature_s, feature_t, imgs.size(0), img_size) else: print("please select KD strategy!") loss += soft_target # Backward loss *= batch_size / 64 # scale loss if opt.mpt: scaler.scale(loss).backward() else: loss.backward() # 对要剪枝层的γ参数稀疏化 if hasattr(model, 'module'): if opt.prune != -1: BNOptimizer.updateBN(sr_flag, model.module.module_list, opt.s, prune_idx) else: if opt.prune != -1: BNOptimizer.updateBN(sr_flag, model.module_list, opt.s, prune_idx) # Optimize if ni % accumulate == 0: if opt.mpt: scaler.step(optimizer) # optimizer.step scaler.update() else: optimizer.step() optimizer.zero_grad() if opt.ema: ema.update(model) # Print mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) pbar.set_description(s) # Plot if i == 0: if not os.path.isdir('train_sample/'): os.makedirs('train_sample/') f = 'train_sample/train_batch%g.jpg' % epoch # filename res = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results if opt.ema: ema.update_attr(model) if hasattr(model, 'module'): module_defs, module_list = ema.eam.module.module_defs, ema.eam.module.module_list else: module_defs, module_list = ema.eam.module_defs, ema.eam.module_list for i, (mdef, module) in enumerate(zip(module_defs, module_list)): if mdef['type'] == 'yolo': yolo_layer = module yolo_layer.nx, yolo_layer.ny = 0, 0 if hasattr(model, 'module'): module_defs, module_list = model.module.module_defs, model.module.module_list else: module_defs, module_list = model.module_defs, model.module_list for i, (mdef, module) in enumerate(zip(module_defs, module_list)): if mdef['type'] == 'yolo': yolo_layer = module yolo_layer.nx, yolo_layer.ny = 0, 0 final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, imgsz=imgsz_test, model=ema.ema if opt.ema else model, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader, multi_label=ni > n_burn, quantized=opt.quantized, a_bit=opt.a_bit, w_bit=opt.w_bit, FPGA=opt.FPGA) # Write with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) if opt.prune != -1: if hasattr(model, 'module'): 
bn_weights = gather_bn_weights(model.module.module_list, [idx]) else: bn_weights = gather_bn_weights(model.module_list, [idx]) tb_writer.add_histogram('bn_weights/hist', bn_weights.numpy(), epoch, bins='doane') # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if opt.ema: if hasattr(model, 'module'): model_temp = ema.ema.module.state_dict() else: model_temp = ema.ema.state_dict() else: if hasattr(model, 'module'): model_temp = model.module.state_dict() else: model_temp = model.state_dict() if save and dist.get_rank() == 0: # DDP save model only once with open(results_file, 'r') as f: # create checkpoint chkpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': model_temp, 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last, best and delete torch.save(chkpt, last) if (best_fitness == fi) and not final_epoch: torch.save(chkpt, best) del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name, reduce_buffer_size): dist.init_process_group(init_method="file://" + temp_file_name, backend=backend, rank=rank, world_size=world_size) device = torch.device("cuda") torch.cuda.set_device(rank) torch.manual_seed(rank) np.random.seed(rank) # Any model works. Add one different buffer per rank BATCHS = 20 model = _get_mlp() model.register_buffer("test_buffer", torch.ones((1)) * rank) model.to(device) n_half_params = len(list(model.parameters())) // 2 optim_settings = {"lr": 1e-3, "momentum": 0.99} sharded_optimizer = OSS(params=list(model.parameters())[:n_half_params], optim=torch.optim.SGD, **optim_settings) sharded_optimizer_2 = OSS(params=list(model.parameters())[n_half_params:], optim=torch.optim.SGD, **optim_settings) sharded_ddp_model = ShardedDataParallel( module=model, sharded_optimizer=[sharded_optimizer, sharded_optimizer_2], broadcast_buffers=True, reduce_buffer_size=reduce_buffer_size, ) ddp_model_single = copy.deepcopy(model) ddp_optimizer = torch.optim.SGD( list(ddp_model_single.parameters())[:n_half_params], **optim_settings) ddp_optimizer_2 = torch.optim.SGD( list(ddp_model_single.parameters())[n_half_params:], **optim_settings) ddp_model = DDP(ddp_model_single, device_ids=[rank], broadcast_buffers=True) check_same_model_params( sharded_ddp_model, ddp_model, f"DDP parity two optim test failing. differing at startup, Buffers {reduce_buffer_size}", ) for i in range(BATCHS): input_tensor = torch.rand((64, 2)).to(device) # Run DDP ddp_optimizer.zero_grad() ddp_optimizer_2.zero_grad() ddp_loss = ddp_model(input_tensor).abs().sum() ddp_loss.backward() ddp_optimizer.step() ddp_optimizer_2.step() torch.cuda.synchronize(device) # Run Sharded sharded_optimizer.zero_grad() sharded_optimizer_2.zero_grad() sharded_loss = sharded_ddp_model(input_tensor).abs().sum() sharded_loss.backward() sharded_optimizer.step() sharded_optimizer_2.step() torch.cuda.synchronize(device) check_same_model_params( sharded_ddp_model, ddp_model, f"DDP parity two optim test failing, step {i}, buffers {reduce_buffer_size}", ) dist.destroy_process_group()
def initialize(self, training=True, force_load_plans=False): """ this is a copy of nnUNetTrainerV2's initialize. We only add the regions to the data augmentation :param training: :param force_load_plans: :return: """ if not self.was_initialized: maybe_mkdir_p(self.output_folder) if force_load_plans or (self.plans is None): self.load_plans_file() self.process_plans(self.plans) self.setup_DA_params() self.folder_with_preprocessed_data = join(self.dataset_directory, self.plans['data_identifier'] + "_stage%d" % self.stage) if training: self.dl_tr, self.dl_val = self.get_basic_generators() if self.unpack_data: if self.local_rank == 0: print("unpacking dataset") unpack_dataset(self.folder_with_preprocessed_data) print("done") else: # we need to wait until worker 0 has finished unpacking npz_files = subfiles(self.folder_with_preprocessed_data, suffix=".npz", join=False) case_ids = [i[:-4] for i in npz_files] all_present = all( [isfile(join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids]) while not all_present: print("worker", self.local_rank, "is waiting for unpacking") sleep(3) all_present = all( [isfile(join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids]) # there is some slight chance that there may arise some error because dataloader are loading a file # that is still being written by worker 0. We ignore this for now an address it only if it becomes # relevant # (this can occur because while worker 0 writes the file is technically present so the other workers # will proceed and eventually try to read it) else: print( "INFO: Not unpacking data! Training may be slow due to that. Pray you are not using 2d or you " "will wait all winter for your model to finish!") # setting weights for deep supervision losses net_numpool = len(self.net_num_pool_op_kernel_sizes) # we give each output a weight which decreases exponentially (division by 2) as the resolution decreases # this gives higher resolution outputs more weight in the loss weights = np.array([1 / (2 ** i) for i in range(net_numpool)]) # we don't use the lowest 2 outputs. Normalize weights so that they sum to 1 mask = np.array([True if i < net_numpool - 1 else False for i in range(net_numpool)]) weights[~mask] = 0 weights = weights / weights.sum() self.ds_loss_weights = weights seeds_train = np.random.random_integers(0, 99999, self.data_aug_params.get('num_threads')) seeds_val = np.random.random_integers(0, 99999, max(self.data_aug_params.get('num_threads') // 2, 1)) print("seeds train", seeds_train) print("seeds_val", seeds_val) self.tr_gen, self.val_gen = get_moreDA_augmentation(self.dl_tr, self.dl_val, self.data_aug_params[ 'patch_size_for_spatialtransform'], self.data_aug_params, deep_supervision_scales=self.deep_supervision_scales, seeds_train=seeds_train, seeds_val=seeds_val, pin_memory=self.pin_memory, regions=self.regions) self.print_to_log_file("TRAINING KEYS:\n %s" % (str(self.dataset_tr.keys())), also_print_to_console=False) self.print_to_log_file("VALIDATION KEYS:\n %s" % (str(self.dataset_val.keys())), also_print_to_console=False) else: pass self.initialize_network() self.initialize_optimizer_and_scheduler() self._maybe_init_amp() self.network = DDP(self.network, self.local_rank) else: self.print_to_log_file('self.was_initialized is True, not running self.initialize again') self.was_initialized = True
def main(): parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Example') parser.add_argument('--bs', '--batch_size', type=int, default=32, metavar='N', help='input batch size for training (default: 32)') parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--lr', '--learning_rate', type=float, default=1.0e-02, metavar='LR', help='learning rate (default: 1.0e-02)') args = parser.parse_args() master_addr = os.getenv("MASTER_ADDR", default="localhost") master_port = os.getenv('MASTER_POST', default='8888') method = "tcp://{}:{}".format(master_addr, master_port) rank = int(os.getenv('OMPI_COMM_WORLD_RANK', '0')) world_size = int(os.getenv('OMPI_COMM_WORLD_SIZE', '1')) dist.init_process_group("nccl", init_method=method, rank=rank, world_size=world_size) ngpus = torch.cuda.device_count() device = torch.device('cuda', rank % ngpus) if rank == 0: wandb.init() wandb.config.update(args) train_dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.ToTensor()) val_dataset = datasets.CIFAR10('./data', train=False, transform=transforms.ToTensor()) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank()) train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.bs, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=args.bs, shuffle=False) # model = VGG('VGG19') # model = ResNet18() # model = PreActResNet18() # model = GoogLeNet() # model = DenseNet121() # model = ResNeXt29_2x64d() # model = MobileNet() # model = MobileNetV2() # model = DPN92() # model = ShuffleNetG2() # model = SENet18() # model = ShuffleNetV2(1) # model = EfficientNetB0() # model = RegNetX_200MF() model = VGG('VGG19').to(device) if rank == 0: wandb.config.update({ "model": model.__class__.__name__, "dataset": "CIFAR10" }) model = DDP(model, device_ids=[rank % ngpus]) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) for epoch in range(args.epochs): model.train() train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, device) val_loss, val_acc = validate(val_loader, model, criterion, device) if rank == 0: wandb.log({ 'train_loss': train_loss, 'train_acc': train_acc, 'val_loss': val_loss, 'val_acc': val_acc }) dist.destroy_process_group()
def main( cfg: AAEModelConfig, encoder_gpu: int, generator_gpu: int, discriminator_gpu: int, distributed: bool, ): # Do some scaffolding for DDP comm_rank = 0 comm_size = 1 comm = None if distributed and dist.is_available(): import mpi4py mpi4py.rc.initialize = False from mpi4py import MPI # noqa: E402 MPI.Init_thread() # get communicator: duplicate from comm world comm = MPI.COMM_WORLD.Dup() # now match ranks between the mpi comm and the nccl comm os.environ["WORLD_SIZE"] = str(comm.Get_size()) os.environ["RANK"] = str(comm.Get_rank()) # init pytorch dist.init_process_group(backend="nccl", init_method="env://") comm_rank = dist.get_rank() comm_size = dist.get_world_size() model_hparams = AAE3dHyperparams( num_features=cfg.num_features, encoder_filters=cfg.encoder_filters, encoder_kernel_sizes=cfg.encoder_kernel_sizes, generator_filters=cfg.generator_filters, discriminator_filters=cfg.discriminator_filters, latent_dim=cfg.latent_dim, encoder_relu_slope=cfg.encoder_relu_slope, generator_relu_slope=cfg.generator_relu_slope, discriminator_relu_slope=cfg.discriminator_relu_slope, use_encoder_bias=cfg.use_encoder_bias, use_generator_bias=cfg.use_generator_bias, use_discriminator_bias=cfg.use_discriminator_bias, noise_mu=cfg.noise_mu, noise_std=cfg.noise_std, lambda_rec=cfg.lambda_rec, lambda_gp=cfg.lambda_gp, ) # optimizers optimizer_hparams = OptimizerHyperparams(name=cfg.optimizer_name, hparams={"lr": cfg.optimizer_lr}) # Save hparams to disk and load initial weights and create virtual h5 file if comm_rank == 0: cfg.output_path.mkdir(exist_ok=True) model_hparams.save(cfg.output_path.joinpath("model-hparams.json")) optimizer_hparams.save( cfg.output_path.joinpath("optimizer-hparams.json")) init_weights = get_init_weights(cfg) h5_file, h5_files = get_h5_training_file(cfg) with open(cfg.output_path.joinpath("virtual-h5-metadata.json"), "w") as f: json.dump(h5_files, f) else: init_weights, h5_file = None, None if comm_size > 1: init_weights = comm.bcast(init_weights, 0) h5_file = comm.bcast(h5_file, 0) # construct model aae = AAE3d( cfg.num_points, cfg.num_features, cfg.batch_size, model_hparams, optimizer_hparams, gpu=(encoder_gpu, generator_gpu, discriminator_gpu), init_weights=init_weights, ) enc_device = torch.device(f"cuda:{encoder_gpu}") if comm_size > 1: if (encoder_gpu == generator_gpu) and (encoder_gpu == discriminator_gpu): aae.model = DDP(aae.model, device_ids=[enc_device], output_device=enc_device) else: aae.model = DDP(aae.model, device_ids=None, output_device=None) # set global default device torch.cuda.set_device(enc_device.index) if comm_rank == 0: # Diplay model print(aae) assert isinstance(h5_file, Path) # set up dataloaders train_dataset = get_dataset( cfg.dataset_location, h5_file, cfg.dataset_name, cfg.rmsd_name, cfg.fnc_name, cfg.num_points, cfg.num_features, split="train", shard_id=comm_rank, num_shards=comm_size, normalize="box", cms_transform=False, ) train_loader = DataLoader( train_dataset, batch_size=cfg.batch_size, shuffle=True, drop_last=True, pin_memory=True, num_workers=cfg.num_data_workers, ) valid_dataset = get_dataset( cfg.dataset_location, h5_file, cfg.dataset_name, cfg.rmsd_name, cfg.fnc_name, cfg.num_points, cfg.num_features, split="valid", shard_id=comm_rank, num_shards=comm_size, normalize="box", cms_transform=False, ) valid_loader = DataLoader( valid_dataset, batch_size=cfg.batch_size, shuffle=True, drop_last=True, pin_memory=True, num_workers=cfg.num_data_workers, ) print( f"Having {len(train_dataset)} training and {len(valid_dataset)} validation 
samples." ) wandb_config = setup_wandb(cfg, aae.model, comm_rank) # Optional callbacks loss_callback = LossCallback(cfg.output_path.joinpath("loss.json"), wandb_config=wandb_config, mpi_comm=comm) checkpoint_callback = CheckpointCallback( out_dir=cfg.output_path.joinpath("checkpoint"), mpi_comm=comm) save_callback = SaveEmbeddingsCallback( out_dir=cfg.output_path.joinpath("embeddings"), interval=cfg.embed_interval, sample_interval=cfg.sample_interval, mpi_comm=comm, ) # TSNEPlotCallback requires SaveEmbeddingsCallback to run first tsne_callback = TSNEPlotCallback( out_dir=cfg.output_path.joinpath("embeddings"), projection_type="3d", target_perplexity=100, interval=cfg.tsne_interval, tsne_is_blocking=True, wandb_config=wandb_config, mpi_comm=comm, ) # Train model with callbacks callbacks = [ loss_callback, checkpoint_callback, save_callback, tsne_callback, ] # Optionaly train for a different number of # epochs on the first DDMD iterations if cfg.stage_idx == 0: epochs = cfg.initial_epochs else: epochs = cfg.epochs aae.train(train_loader, valid_loader, epochs, callbacks=callbacks) # Save loss history to disk. if comm_rank == 0: loss_callback.save(cfg.output_path.joinpath("loss.json")) # Save final model weights to disk aae.save_weights( cfg.output_path.joinpath("encoder-weights.pt"), cfg.output_path.joinpath("generator-weights.pt"), cfg.output_path.joinpath("discriminator-weights.pt"), )
def __init__(self, train_data, model, optimizer=None, loss=None, callbacks_all=None, callbacks_master=None, batch_size_per_gpu=8, n_epochs=1, num_workers=1, drop_last=False, dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, save_every=-1, save_path=None, device='auto', fp16='', backend=None, init_method=None): assert device in [ 'auto', 'cuda', 'cpu' ], "Please set correct device in [auto', 'cuda', 'cpu']" if device == 'auto': device = 'cuda' if torch.cuda.is_available() else 'cpu' if backend is None: backend = 'nccl' if device == 'cuda' else 'gloo' # init distributed if device == 'cuda': torch.cuda.set_device(get_local_rank()) self.device = torch.device("cuda", get_local_rank()) else: self.device = torch.device(device) dist.init_process_group(backend=backend, init_method=init_method) self.world_size = dist.get_world_size() self.rank = dist.get_rank() # unique id for each process self.model = model self.train_data = train_data self.batch_size_per_gpu = int(batch_size_per_gpu) self.n_epochs = int(n_epochs) self.num_data_workers = int(num_workers) self.drop_last = drop_last self.update_every = int(update_every) self.print_every = int(print_every) self.validate_every = int(validate_every) self.save_every = int(save_every) self.save_path = save_path self.losser = _prepare_losser(loss) self.fp16 = fp16 self.init_method = init_method self.backend = backend self.local_rank = get_local_rank() self._forward_func = model.forward self.callback_manager = DistCallbackManager( env={"trainer": self}, callbacks_all=callbacks_all, callbacks_master=callbacks_master) self.metric_key = metric_key model.to(self.device) optimizer = self._get_optimizer(optimizer) # init fp16, must before DataParallel init if len(self.fp16): assert isinstance( self.fp16, str ), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']" try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." assert device == 'cuda', "Amp requires cuda device" model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) # init DataParallel if parse_version(torch.__version__) >= parse_version('1.1'): self.model = DDP(model, device_ids=[self.local_rank], output_device=self.local_rank, find_unused_parameters=True) else: self.model = DDP(model, device_ids=[self.local_rank], output_device=self.local_rank) self.optimizer = optimizer self.sampler = DistributedSampler(self.train_data) self.data_iterator = self._get_data_iter(self.train_data) self.n_steps = self._get_n_steps() # for evaluation, only run eval on master proc if dev_data and metrics: cb = TesterCallback(dev_data, model, metrics, batch_size=batch_size_per_gpu, num_workers=num_workers) self.callback_manager.add_callback([cb], master=True) # Setup logging dist.barrier() self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M') if self.save_path: self.cp_save_path = os.path.join(self.save_path, 'checkpoints', self.start_time) else: self.cp_save_path = None # use INFO in the master, WARN for others logger.setLevel(logging.INFO if self.is_master else logging.WARNING) self.logger = logger self.logger.info("Setup Distributed Trainer") self.logger.warning( "Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}". 
format(os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False)) self.logger.info("Num of processes: {}".format(self.world_size)) self.logger.info("Use device: {}".format(device)) self.logger.info( "Training with fp16: {}, optimization level: {}".format( len(self.fp16) > 0, self.fp16 if self.fp16 else None))
def __init__(self, model, optimizer, config, data_loader, valid_data_loader=None, lr_scheduler=None, max_len_step=None): ''' :param model: :param optimizer: :param config: :param data_loader: :param valid_data_loader: :param lr_scheduler: :param max_len_step: controls number of batches(steps) in each epoch. ''' self.config = config self.local_master = config['local_rank'] == 0 self.logger = config.get_logger( 'trainer', config['trainer']['log_verbosity']) if self.local_master else None # setup GPU device if available, move model into configured device self.device, self.device_ids = self._prepare_device( config['local_rank'], config['local_world_size']) self.model = model.to(self.device) self.optimizer = optimizer cfg_trainer = config['trainer'] self.epochs = cfg_trainer['epochs'] self.save_period = cfg_trainer['save_period'] monitor_open = cfg_trainer['monitor_open'] if monitor_open: self.monitor = cfg_trainer.get('monitor', 'off') else: self.monitor = 'off' # configuration to monitor model performance and save best if self.monitor == 'off': self.monitor_mode = 'off' self.monitor_best = 0 else: self.monitor_mode, self.monitor_metric = self.monitor.split() assert self.monitor_mode in ['min', 'max'] self.monitor_best = inf if self.monitor_mode == 'min' else -inf self.early_stop = cfg_trainer.get('early_stop', inf) self.early_stop = inf if self.early_stop == -1 else self.early_stop self.start_epoch = 1 if self.local_master: self.checkpoint_dir = config.save_dir # setup visualization writer instance self.writer = TensorboardWriter(config.log_dir, self.logger, cfg_trainer['tensorboard']) # load checkpoint for resume training if config.resume is not None: self._resume_checkpoint(config.resume) # load checkpoint following load to multi-gpu, avoid 'module.' prefix if self.config['trainer']['sync_batch_norm']: self.model = torch.nn.SyncBatchNorm.convert_sync_batchnorm( self.model) self.mode = DDP(self.model, device_ids=self.device_ids, output_device=self.device_ids[0], find_unused_parameters=True) self.data_loader = data_loader if max_len_step is None: # max length of iteration step of every epoch # epoch-based training self.len_step = len(self.data_loader) else: # iteration-based training self.data_loader = inf_loop(data_loader) self.len_step = max_len_step self.valid_data_loader = valid_data_loader self.do_validation = self.valid_data_loader is not None self.lr_scheduler = lr_scheduler log_step = self.config['trainer']['log_step_interval'] self.log_step = log_step if log_step != -1 and 0 < log_step < self.len_step else int( np.sqrt(data_loader.batch_size)) val_step_interval = self.config['trainer']['val_step_interval'] # self.val_step_interval = val_step_interval if val_step_interval!= -1 and 0 < val_step_interval < self.len_step\ # else int(np.sqrt(data_loader.batch_size)) self.val_step_interval = val_step_interval self.gl_loss_lambda = self.config['trainer']['gl_loss_lambda'] self.train_loss_metrics = MetricTracker( 'loss', 'gl_loss', 'crf_loss', writer=self.writer if self.local_master else None) self.valid_f1_metrics = SpanBasedF1MetricTracker(iob_labels_vocab_cls)
def create_nerf(rank, args): ###### create network and wrap in ddp; each process should do this # fix random seed just to make sure the network is initialized with same weights at different processes torch.manual_seed(777) # very important!!! otherwise it might introduce extra memory in rank=0 gpu torch.cuda.set_device(rank) models = OrderedDict() models['cascade_level'] = args.cascade_level models['cascade_samples'] = [ int(x.strip()) for x in args.cascade_samples.split(',') ] for m in range(models['cascade_level']): img_names = None if args.optim_autoexpo: # load training image names for autoexposure f = os.path.join(args.basedir, args.expname, 'train_images.json') with open(f) as file: img_names = json.load(file) net = NerfNetWithAutoExpo(args, optim_autoexpo=args.optim_autoexpo, img_names=img_names).to(rank) net = DDP(net, device_ids=[rank], output_device=rank, find_unused_parameters=True) # net = DDP(net, device_ids=[rank], output_device=rank) optim = torch.optim.Adam(net.parameters(), lr=args.lrate) models['net_{}'.format(m)] = net models['optim_{}'.format(m)] = optim start = -1 ###### load pretrained weights; each process should do this if (args.ckpt_path is not None) and (os.path.isfile(args.ckpt_path)): ckpts = [args.ckpt_path] else: ckpts = [ os.path.join(args.basedir, args.expname, f) for f in sorted( os.listdir(os.path.join(args.basedir, args.expname))) if f.endswith('.pth') ] def path2iter(path): tmp = os.path.basename(path)[:-4] idx = tmp.rfind('_') return int(tmp[idx + 1:]) ckpts = sorted(ckpts, key=path2iter) logger.info('Found ckpts: {}'.format(ckpts)) if len(ckpts) > 0 and not args.no_reload: fpath = ckpts[-1] logger.info('Reloading from: {}'.format(fpath)) start = path2iter(fpath) # configure map_location properly for different processes map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} to_load = torch.load(fpath, map_location=map_location) for m in range(models['cascade_level']): for name in ['net_{}'.format(m), 'optim_{}'.format(m)]: models[name].load_state_dict(to_load[name]) return start, models
def run(rank, n_gpus, hps): global global_step if rank == 0: logger = utils.get_logger(hps.model_dir) logger.info(hps) utils.check_git_hash(hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir) writer_eval = SummaryWriter( log_dir=os.path.join(hps.model_dir, "eval")) dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) torch.manual_seed(hps.train.seed) torch.cuda.set_device(rank) train_dataset = TextAudioLoader(hps.data.training_files, hps.data) train_sampler = DistributedBucketSampler( train_dataset, hps.train.batch_size, [32, 300, 400, 500, 600, 700, 800, 900, 1000], num_replicas=n_gpus, rank=rank, shuffle=True) collate_fn = TextAudioCollate() train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, collate_fn=collate_fn, batch_sampler=train_sampler) if rank == 0: eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data) eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False, batch_size=hps.train.batch_size, pin_memory=True, drop_last=False, collate_fn=collate_fn) net_g = SynthesizerTrn(len(symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model).cuda(rank) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) optim_g = torch.optim.AdamW(net_g.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) optim_d = torch.optim.AdamW(net_d.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) net_g = DDP(net_g, device_ids=[rank]) net_d = DDP(net_d, device_ids=[rank]) try: _, _, _, epoch_str = utils.load_checkpoint( utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) _, _, _, epoch_str = utils.load_checkpoint( utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) global_step = (epoch_str - 1) * len(train_loader) except: epoch_str = 1 global_step = 0 scheduler_g = torch.optim.lr_scheduler.ExponentialLR( optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scheduler_d = torch.optim.lr_scheduler.ExponentialLR( optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scaler = GradScaler(enabled=hps.train.fp16_run) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) else: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) scheduler_g.step() scheduler_d.step()
def train(opt, train_dict, device, tb_writer=None): log_dir = Path(tb_writer.log_dir) if tb_writer else Path( train_dict['logdir']) / 'logs' wdir = str(log_dir / 'weights') + os.sep os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = 'results.txt' with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(train_dict, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' rank = opt.global_rank init_seeds(2 + rank) train_path = train_dict['train'] test_path = train_dict['val'] train_dict['weights'] = last if not train_dict['pretrain'] or ( train_dict['pretrain'] and not os.path.exists(train_dict['weights'])) else train_dict['weights'] model = RetinaFace(train_dict, phase='Train') pretrained = False if os.path.exists(train_dict['weights']): pretrained = True logger('Loading resume network from ====>{}'.format( train_dict['weights'])) state_dict = torch.load(train_dict['weights'], map_location=device) # create new OrderedDict that does not contain `module.` from collections import OrderedDict new_state_dict = OrderedDict() for k, v in state_dict['model'].items(): head = k[:7] if head == 'module.': name = k[7:] # remove `module.` else: name = k new_state_dict[name] = v model.load_state_dict(new_state_dict) pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if train_dict['adam']: optimizer = optim.Adam(pg0, lr=train_dict['lr0'], betas=(train_dict['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=train_dict['lr0'], momentum=train_dict['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': train_dict['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 epochs = train_dict['epoch'] lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if state_dict['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = state_dict['best_fitness'] # Results if state_dict.get('training_results') is not None: with open(results_file, 'w') as file: file.write(state_dict['training_results']) # write results.txt # Epochs start_epoch = state_dict['epoch'] + 1 if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, state_dict['epoch'], epochs)) epochs += state_dict['epoch'] # finetune additional epochs del ckpt, state_dict if train_dict['sync_bn'] and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # ddp if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank)) # Trainloader batch_size = train_dict['batch_size'] image_size = train_dict['image_size'] # dataloader, dataset = create_dataloader(train_path,image_size, batch_size, opt, hyp=train_dict, augment=True, # rect=opt.rect, rank=rank, # world_size=opt.world_size, workers=train_dict['workers']) rgb_mean = (104, 117, 123) # bgr order dataset = WiderFaceDetection(train_path, preproc(image_size, rgb_mean)) sampler = torch.utils.data.distributed.DistributedSampler(dataset) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=8, sampler=sampler, pin_memory=True, collate_fn=detection_collate) criterion = MultiBoxLoss(num_classes, 0.35, True, 0, True, 7, 0.35, False) priorbox = PriorBox(train_dict, image_size=(image_size, image_size)) with torch.no_grad(): priors = priorbox.forward() priors = priors.cuda() for epoch in range(start_epoch, epochs): if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: pbar = tqdm(pbar) # progress bar optimizer.zero_grad() for i, ( images, targets ) in pbar: # batch ------------------------------------------------------------- with amp.autocast(enabled=cuda): images = images.cuda() targets = [anno.cuda() for anno in targets] out = model(images) optimizer.zero_grad() loss_l, loss_c, loss_landm = criterion( out, priors, targets) * opt.world_size loss = cfg['loc_weight'] * loss_l + loss_c + loss_landm loss.backward() optimizer.step() load_t1 = time.time() batch_time = load_t1 - load_t0 eta = int(batch_time * (max_iter - iteration)) if rank in [-1, 0]: print( 'Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || Loc: {:.4f} Cla: {:.4f} Landm: {:.4f} || LR: {:.8f} || Batchtime: {:.4f} s || ETA: {}' .format(epoch, max_epoch, (iteration % epoch_size) + 1, epoch_size, iteration + 1, max_iter, loss_l.item(), loss_c.item(), loss_landm.item(), lr, batch_time, str(datetime.timedelta(seconds=eta)))) torch.save(net.state_dict(), wdir + os.sep + '{}_Final.pth'.format(i))
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = str(log_dir / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: 
optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank)) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # only runs on process 0 # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test)) logger.info('Using %g dataloader workers' % dataloader.num_workers) logger.info('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # if not torch.isfinite(loss): # logger.info('WARNING: non-finite loss, ending training ', loss_items) # return results # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add 
model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
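# A minimal sketch (illustrative names, not part of the function above) of the
# rank-0 broadcast used for the weighted-sampling indices: rank 0 draws the
# weighted sample and every other rank receives it. Assumes torch.distributed
# is already initialized; with the NCCL backend the buffer must live on the
# local GPU.
import torch
import torch.distributed as dist


def broadcast_sampled_indices(local_indices, n, rank, device):
    buf = torch.zeros(n, dtype=torch.int64, device=device)
    if rank == 0:
        buf[:] = torch.tensor(local_indices, dtype=torch.int64, device=device)
    dist.broadcast(buf, src=0)  # every rank now holds rank 0's sample
    return buf.cpu().numpy()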
def get_network(name, n_classes, in_channels=3, feature_scale=4, tensor_dim='2D',
                nonlocal_mode='embedded_gaussian', attention_dsample=(2, 2, 2),
                aggregation_mode='concat', dataDims=[4, 1, 96, 192, 112], rank=0):
    # NOTE: batchSize is only used for the STN based network. For the other networks, it is irrelevant
    model = _get_model_instance(name, tensor_dim)

    if name in ['unet', 'unet_ct_dsv']:
        model = model(n_classes=n_classes,
                      is_batchnorm=True,
                      in_channels=in_channels,
                      feature_scale=feature_scale,
                      is_deconv=False)
    elif name in ['unet_nonlocal']:
        model = model(n_classes=n_classes,
                      is_batchnorm=True,
                      in_channels=in_channels,
                      is_deconv=False,
                      nonlocal_mode=nonlocal_mode,
                      feature_scale=feature_scale)
    elif name in [
            'unet_grid_gating', 'unet_ct_single_att_dsv',
            'unet_ct_multi_att_dsv', 'unet_ct_dense_multi_att_dsv'
    ]:
        model = model(n_classes=n_classes,
                      is_batchnorm=True,
                      in_channels=in_channels,
                      nonlocal_mode=nonlocal_mode,
                      feature_scale=feature_scale,
                      attention_dsample=attention_dsample,
                      is_deconv=False)
    elif name in [
            'unet_ct_multi_att_dsv_stn', 'unet_ct_multi_att_dsv_stn_v2',
            'unet_ct_multi_att_dsv_stn_unreg_v2', 'unet_ct_multi_att_dsv_deform',
            'unet_ct_multi_att_dsv_deform_small', 'unet_CT_multi_att_dsv_deform_ax_cor'
    ]:
        model = model(n_classes=n_classes,
                      is_batchnorm=True,
                      in_channels=in_channels,
                      nonlocal_mode=nonlocal_mode,
                      feature_scale=feature_scale,
                      attention_dsample=attention_dsample,
                      is_deconv=False,
                      dataDims=dataDims).to(rank)
    elif name in ['unet_t2_ax_cor']:
        model = model(n_classes=n_classes,
                      is_batchnorm=True,
                      in_channels=in_channels,
                      nonlocal_mode=nonlocal_mode,
                      feature_scale=feature_scale,
                      attention_dsample=attention_dsample,
                      is_deconv=False,
                      dataDims=dataDims).to(rank)
        # ToDo: Uncomment the last line for DDP processing - need to do this more efficiently
        # Currently, we do this locally only for this model
        # model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = DDP(model, device_ids=[rank], find_unused_parameters=True)  # Uncomment this for DDP
    elif name in ['transformer_registration']:
        # TODO: Still needs work!
        model = model(vol_size=dataDims[2:],
                      enc_nf=[16, 32, 32, 32],
                      dec_nf=[32, 32, 32, 32, 32, 16, 16])
    else:
        raise ValueError('Model {} not available'.format(name))

    # Utilize multiple GPUs in parallel
    # model = nn.DataParallel(model)
    return model
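# A minimal sketch of the DDP wrapping hinted at in the commented-out lines
# above: optionally convert BatchNorm layers to SyncBatchNorm, move the module
# to its local rank, then wrap it. Illustrative helper, not part of this file.
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_for_ddp(model: nn.Module, rank: int, sync_bn: bool = False) -> nn.Module:
    if sync_bn:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = model.to(rank)
    return DDP(model, device_ids=[rank], find_unused_parameters=True)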
def __init__(self, args, train_loader=None, val_loader=None, logger=None, num_answers=0, train=True): self.args = args self.max_text_length = args.max_text_length self.train_loader = train_loader self.val_loader = val_loader self.num_answers = num_answers self.logger = logger # Model self.model = VQAModel.from_pretrained("bert-base-uncased", args=args, num_answers=self.num_answers) self.verbose = True if self.args.distributed: if self.args.gpu != 0: self.verbose = False # Load Checkpoint self.start_epoch = None if args.load is not None: path = args.load + '.pth' self.load(path, verbose=self.verbose) elif args.load_lxmert_qa is not None: path = args.load_lxmert_qa + '_LXRT.pth' load_lxmert_qa( args, path, self.model, label2ans=self.train_loader.dataset.raw_dataset.label2ans, verbose=self.verbose) # GPU Options print(f'Model Launching at GPU {self.args.gpu}') from time import time start = time() self.model.cuda(args.gpu) # Optimizer if train: self.optim, self.lr_scheduler = self.create_optimizer_and_scheduler( ) self.bce_loss = nn.BCEWithLogitsLoss() if args.multiGPU: assert args.distributed self.model = DDP(self.model, device_ids=[args.gpu], find_unused_parameters=True) if args.gpu == 0: print(f'It took {time() - start:.1f}s') # Output Directory self.output = args.output os.makedirs(self.output, exist_ok=True)
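# A minimal sketch (illustrative names) of the ordering used above: move the
# model to its GPU first, build the optimizer on the moved parameters, then
# wrap with DDP; find_unused_parameters=True tolerates submodules that are
# skipped in a given forward pass.
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def setup_distributed_model(model: nn.Module, gpu: int, lr: float = 1e-4):
    model = model.cuda(gpu)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model = DDP(model, device_ids=[gpu], find_unused_parameters=True)
    return model, optimizer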
USE_GPU = False
model = BatchProgramCC(EMBEDDING_DIM, HIDDEN_DIM, MAX_TOKENS + 1, ENCODE_DIM,
                       LABELS, BATCH_SIZE, USE_GPU, embeddings)
if USE_GPU:
    model.cuda()

# set up devices for this process: with the hardcoded process index 2 below,
# this process uses GPUs [2 * n, ..., 3 * n - 1], where n is the number of
# GPUs assigned to each process.
n = torch.cuda.device_count() // world_size
device_ids = list(range(2 * n, (2 + 1) * n))

# create model and move it to device_ids[0]
model = model.to(device_ids[0])
# output_device defaults to device_ids[0]
ddp_model = DDP(model, device_ids=device_ids)

parameters = ddp_model.parameters()
optimizer = torch.optim.Adamax(parameters)
loss_function = torch.nn.BCELoss()

# print(train_data)
precision, recall, f1 = 0, 0, 0
print('Start training...')
for t in range(1, categories + 1):
    if lang == 'java':
        train_data_t = train_data[train_data['label'].isin([t, 0])]
        train_data_t.loc[train_data_t['label'] > 0, 'label'] = 1
        test_data_t = test_data[test_data['label'].isin([t, 0])]
        test_data_t.loc[test_data_t['label'] > 0, 'label'] = 1
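# The block above assigns several GPUs to a single process; current PyTorch
# practice is usually one process per GPU. A minimal sketch of that layout,
# assuming MASTER_ADDR/MASTER_PORT are set in the environment (illustrative
# names, not part of the script above).
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def setup_one_process_per_gpu(model, rank: int, world_size: int):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    model = model.cuda(rank)
    return DDP(model, device_ids=[rank])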
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(f'Hyperparameters {hyp}') save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Darknet(opt.cfg).to(device) # create state_dict = { k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(state_dict, strict=False) print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Darknet(opt.cfg).to(device) # create # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2.append(v) # biases elif 'Conv2d.weight' in k: pg1.append(v) # apply weight_decay elif 'm.weight' in k: pg1.append(v) # apply weight_decay elif 'w.weight' in k: pg1.append(v) # apply weight_decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[ 'lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project='YOLOR' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) # Resume start_epoch, best_fitness = 0, 0.0 best_fitness_p, best_fitness_r, best_fitness_ap50, best_fitness_ap, best_fitness_f = 
0.0, 0.0, 0.0, 0.0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] best_fitness_p = ckpt['best_fitness_p'] best_fitness_r = ckpt['best_fitness_r'] best_fitness_ap50 = ckpt['best_fitness_ap50'] best_fitness_ap = ckpt['best_fitness_ap'] best_fitness_f = ckpt['best_fitness_f'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = 64 #int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, opt, hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir=save_dir) if tb_writer: tb_writer.add_histogram('classes', c, 0) if wandb: wandb.log({ "Labels": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob('*labels*.png') ] }) # Anchors # if not opt.noautoanchor: # check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test\n' 'Using %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs)) torch.save(model, wdir / 'init.pt') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged 
between devices in DDP mode # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename plot_images(images=imgs, targets=targets, paths=paths, fname=f) # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 3 and wandb: wandb.log({ "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') ] }) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP if epoch >= 3: results, maps, times = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0) # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] fi_p = fitness_p(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] fi_r = fitness_r(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] fi_ap50 = fitness_ap50(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] fi_ap = fitness_ap(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if (fi_p > 0.0) or (fi_r > 0.0): fi_f = fitness_f(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] else: fi_f = 0.0 if fi > best_fitness: best_fitness = fi if fi_p > best_fitness_p: best_fitness_p = fi_p if fi_r > best_fitness_r: best_fitness_r = fi_r if fi_ap50 > best_fitness_ap50: best_fitness_ap50 = fi_ap50 if fi_ap > best_fitness_ap: best_fitness_ap = fi_ap if fi_f > best_fitness_f: 
best_fitness_f = fi_f # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'best_fitness_p': best_fitness_p, 'best_fitness_r': best_fitness_r, 'best_fitness_ap50': best_fitness_ap50, 'best_fitness_ap': best_fitness_ap, 'best_fitness_f': best_fitness_f, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(ema, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if (best_fitness == fi) and (epoch >= 200): torch.save(ckpt, wdir / 'best_{:03d}.pt'.format(epoch)) if best_fitness == fi: torch.save(ckpt, wdir / 'best_overall.pt') if best_fitness_p == fi_p: torch.save(ckpt, wdir / 'best_p.pt') if best_fitness_r == fi_r: torch.save(ckpt, wdir / 'best_r.pt') if best_fitness_ap50 == fi_ap50: torch.save(ckpt, wdir / 'best_ap50.pt') if best_fitness_ap == fi_ap: torch.save(ckpt, wdir / 'best_ap.pt') if best_fitness_f == fi_f: torch.save(ckpt, wdir / 'best_f.pt') if epoch == 0: torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch)) if ((epoch + 1) % 25) == 0: torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch)) if epoch >= (epochs - 5): torch.save(ckpt, wdir / 'last_{:03d}.pt'.format(epoch)) elif epoch >= 420: torch.save(ckpt, wdir / 'last_{:03d}.pt'.format(epoch)) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = save_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if f1.exists(): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system( 'gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: wandb.log({ "Results": [ wandb.Image(str(save_dir / x), caption=x) for x in ['results.png', 'precision-recall_curve.png'] ] }) logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
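# A minimal sketch of the checkpointing pattern above: keep the EMA weights and
# the current optimizer state, and drop the optimizer state on the final epoch
# so the saved file stays small. Illustrative helper, not part of the function.
import torch


def save_checkpoint(path, epoch, ema_model, optimizer, best_fitness, final_epoch):
    ckpt = {
        'epoch': epoch,
        'best_fitness': best_fitness,
        'model': ema_model.state_dict(),
        'optimizer': None if final_epoch else optimizer.state_dict(),
    }
    torch.save(ckpt, path)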
def train( hyp, # path/to/hyp.yaml or hyp dictionary opt, device, callbacks): save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) # Save run settings if not evolve: with open(save_dir / 'hyp.yaml', 'w') as f: yaml.safe_dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.safe_dump(vars(opt), f, sort_keys=False) # Loggers data_dict = None if RANK in [-1, 0]: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance if loggers.wandb: data_dict = loggers.wandb.data_dict if resume: weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Config plots = not evolve # create plots cuda = device.type != 'cpu' init_seeds(1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = ['item'] if single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len( names ) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check is_coco = isinstance(val_path, str) and val_path.endswith( 'coco/val2017.txt') # COCO dataset # Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download( weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = [ 'anchor' ] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict( ) # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info( f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}' ) # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create # Freeze freeze = [ f'model.{x}.' 
for x in (freeze if len(freeze) > 1 else range(freeze[0])) ] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz) loggers.on_params_update({"batch_size": batch_size}) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}") g0, g1, g2 = [], [], [] # optimizer parameter groups for v in model.modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias g2.append(v.bias) if isinstance(v, nn.BatchNorm2d): # weight (no decay) g0.append(v.weight) elif hasattr(v, 'weight') and isinstance( v.weight, nn.Parameter): # weight (with decay) g1.append(v.weight) if opt.optimizer == 'Adam': optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum elif opt.optimizer == 'AdamW': optimizer = AdamW(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': g1, 'weight_decay': hyp['weight_decay'] }) # add g1 with weight_decay optimizer.add_param_group({'params': g2}) # add g2 (biases) LOGGER.info( f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups " f"{len(g0)} weight, {len(g1)} weight (no decay), {len(g2)} bias") del g0, g1, g2 # Scheduler if opt.linear_lr: lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp[ 'lrf'] # linear else: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR( optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Epochs start_epoch = ckpt['epoch'] + 1 if resume: assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.' if epochs < start_epoch: LOGGER.info( f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs." ) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning( 'WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.' 
) model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) LOGGER.info('Using SyncBatchNorm()') # Trainloader train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in [-1, 0]: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers, pad=0.5, prefix=colorstr('val: '))[0] if not resume: labels = np.concatenate(dataset.labels, 0) # c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end') # DDP mode if cuda and RANK != -1: model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK) # Model attributes nl = de_parallel( model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) stopper = EarlyStopping(patience=opt.patience) compute_loss = ComputeLoss(model) # init loss class LOGGER.info( f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional, single-GPU only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(3, device=device) # mean losses 
if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info( ('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size')) if RANK in [-1, 0]: pbar = tqdm( pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn) # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in [-1, 0]: # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights' ]) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = val.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if save ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'model': deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': 
ema.updates, 'optimizer': optimizer.state_dict(), 'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None, 'date': datetime.now().isoformat() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0): torch.save(ckpt, w / f'epoch{epoch}.pt') del ckpt callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi) # Stop Single-GPU if RANK == -1 and stopper(epoch=epoch, fitness=fi): break # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576 # stop = stopper(epoch=epoch, fitness=fi) # if RANK == 0: # dist.broadcast_object_list([stop], 0) # broadcast 'stop' to all ranks # Stop DPP # with torch_distributed_zero_first(RANK): # if stop: # break # must break all DDP ranks # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if RANK in [-1, 0]: LOGGER.info( f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.' ) for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if f is best: LOGGER.info(f'\nValidating {f}...') results, _, _ = val.run( data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=attempt_load(f, device).half(), iou_thres=0.65 if is_coco else 0.60, # best pycocotools results at 0.65 single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, save_json=is_coco, verbose=True, plots=True, callbacks=callbacks, compute_loss=compute_loss) # val best model with plots if is_coco: callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) callbacks.run('on_train_end', last, best, plots, epoch, results) LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}") torch.cuda.empty_cache() return results
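# A minimal sketch of the accumulation rule used in the loop above: scale and
# backpropagate every batch, but only step the optimizer (and update the EMA)
# once `accumulate` batches have passed since the last step. Illustrative names.
def accumulation_step(ni, last_opt_step, accumulate, scaler, optimizer, loss, ema=None, model=None):
    scaler.scale(loss).backward()
    if ni - last_opt_step >= accumulate:
        scaler.step(optimizer)  # unscales gradients and calls optimizer.step()
        scaler.update()
        optimizer.zero_grad()
        if ema is not None:
            ema.update(model)
        last_opt_step = ni
    return last_opt_step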
def run(self): ## init distributed self.cfg = init_distributed(self.cfg) cfg = self.cfg # cfg.print() ## parser_dict self.dictionary = self._parser_dict() ## parser_datasets datasets, dataloaders, data_samplers, dataset_sizes = self._parser_datasets( ) ## parser_model model_ft = self._parser_model() # Scale learning rate based on global batch size if cfg.SCALE_LR.ENABLED: cfg.INIT_LR = cfg.INIT_LR * float( self.batch_size) / cfg.SCALE_LR.VAL scaler = amp.GradScaler(enabled=True) if cfg.WARMUP.NAME is not None: logger.info('Start warm-up ... ') self.warm_up(scaler, model_ft, dataloaders['train'], cfg) logger.info('finish warm-up!') ## parser_optimizer optimizer_ft = build_optimizer(cfg, model_ft) ## parser_lr_scheduler lr_scheduler_ft = build_lr_scheduler(cfg, optimizer_ft) if cfg.distributed: model_ft = DDP(model_ft, device_ids=[cfg.local_rank], output_device=(cfg.local_rank)) # Freeze freeze_models(model_ft) if self.cfg.PRETRAIN_MODEL is not None: if self.cfg.RESUME: self.start_epoch = self.ckpts.resume_checkpoint( model_ft, optimizer_ft) else: self.start_epoch = self.ckpts.load_checkpoint( self.cfg.PRETRAIN_MODEL, model_ft, optimizer_ft) ## vis network graph if self.cfg.TENSORBOARD_MODEL and False: self.tb_writer.add_graph(model_ft, (model_ft.dummy_input.cuda(), )) self.steps_per_epoch = int(dataset_sizes['train'] // self.batch_size) best_acc = 0.0 best_perf_rst = None for epoch in range(self.start_epoch + 1, self.cfg.N_MAX_EPOCHS): if cfg.distributed: dataloaders['train'].sampler.set_epoch(epoch) self.train_epoch(scaler, epoch, model_ft, datasets['train'], dataloaders['train'], optimizer_ft) lr_scheduler_ft.step() if self.cfg.DATASET.VAL and ( not epoch % cfg.EVALUATOR.EVAL_INTERVALS or epoch == self.cfg.N_MAX_EPOCHS - 1): acc, perf_rst = self.val_epoch(epoch, model_ft, datasets['val'], dataloaders['val']) if cfg.local_rank == 0: # start to save best performance model after learning rate decay to 1e-6 if best_acc < acc: self.ckpts.autosave_checkpoint(model_ft, epoch, 'best', optimizer_ft) best_acc = acc best_perf_rst = perf_rst # continue if not epoch % cfg.N_EPOCHS_TO_SAVE_MODEL: if cfg.local_rank == 0: self.ckpts.autosave_checkpoint(model_ft, epoch, 'last', optimizer_ft) if best_perf_rst is not None: logger.info(best_perf_rst.replace("(val)", "(best)")) if cfg.local_rank == 0: self.tb_writer.close() dist.destroy_process_group() if cfg.local_rank != 0 else None torch.cuda.empty_cache()
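# A minimal sketch of the linear learning-rate scaling applied above: the base
# rate is multiplied by global_batch_size / reference_batch_size. The reference
# value of 256 is illustrative, not taken from the config.
def scale_lr(base_lr: float, global_batch_size: int, reference_batch_size: int = 256) -> float:
    return base_lr * float(global_batch_size) / reference_batch_size
    # e.g. scale_lr(0.1, 1024) -> 0.4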
def run(rank, size, outputfile):
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    """set randomseed"""
    torch.manual_seed(randomseed)
    print('manual_seed=', randomseed)
    """set up data"""
    train_set, bsz = partition_dataset(normalize)
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])
    test_set = datasets.CIFAR10(root="./data",
                                train=False,
                                download=True,
                                transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              num_workers=2,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    """set up model"""
    model = mdl.VGG11()
    model.to(device)
    model = DDP(model)
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=0.0001)
    num_batches = math.ceil(len(train_set.dataset) / float(bsz))
    """write output to file"""
    if os.path.exists(outputfile):
        os.remove(outputfile)
    fp = open(
        outputfile + "_r" + str(dist.get_rank()) + "_size" +
        str(dist.get_world_size()), "a")
    """start training"""
    total_epoch = 1
    for epoch in range(total_epoch):
        running_loss = 0.0
        # remember to exit the train loop at end of the epoch
        for batch_idx, (data, target) in enumerate(train_set):
            if batch_idx < 10:
                start = timeit.default_timer()
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()  # accumulate the loss once per batch
            if batch_idx % 20 == 19:  # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, batch_idx + 1, running_loss / 20))
                fp.write('[%d, %5d] loss: %.3f\n' %
                         (epoch + 1, batch_idx + 1, running_loss / 20))
                running_loss = 0.0
            if batch_idx == 0:
                fp.write("Batch\trunning time\n")
            if batch_idx < 10:
                end = timeit.default_timer() - start
                print("Batch " + str(batch_idx) + " running time:" + str(end))
                fp.write('%d\t%.5f\n' % (batch_idx, end))
    # # training stop
    fp.close()
    test_model(model, test_loader, criterion, outputfile)
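# A minimal sketch (illustrative names) of how partition_dataset above could
# shard CIFAR-10 across ranks with DistributedSampler; assumes the process
# group is already initialized. The per-rank batch size is the global batch
# size divided by the world size.
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import datasets, transforms


def make_partitioned_loader(global_batch_size: int, normalize):
    transform = transforms.Compose([transforms.ToTensor(), normalize])
    train_set = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
    sampler = DistributedSampler(train_set)  # each rank sees a disjoint shard
    bsz = global_batch_size // dist.get_world_size()
    loader = DataLoader(train_set, batch_size=bsz, sampler=sampler,
                        num_workers=2, pin_memory=True)
    return loader, bsz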
def train(lm_dataloader, model, criterion, optimizer, vocab_size, args): model.train() from functools import reduce import operator num_params = reduce(operator.add, (reduce(operator.mul, x.size()) for x in model.parameters())) if model.group: total = torch.Tensor([num_params]) if torch.cuda.is_available(): total = total.cuda() torch.distributed.all_reduce(total, group=model.group) logging.info( f"training model, #prams = {num_params}, group: {model.group.rank()}, grank:" f" {torch.distributed.get_rank()}, sizes {model.group.size()}") torch.distributed.barrier() if model.group.rank() == 0: logging.info(f"total #prams = {total.item()}") else: logging.info(f"training model, #prams = {num_params}") vocab_size = 10000 # FIXME total_loss = 0.0 start_time = time.time() word_counter = 0 optimizer = optimizer(model) def get_first_device(model): if isinstance(model, DDP): model = model.module if not torch.cuda.is_available(): return torch.device("cpu") if model.devices: return model.devices[0] else: return torch.cuda.current_device() def get_last_device(model): if isinstance(model, DDP): model = model.module if not torch.cuda.is_available(): return torch.device("cpu") if model.devices: return model.devices[-1] else: return torch.cuda.current_device() pipe_group = model.group if args.ddp_zero: model = DDP( model, device_ids=[torch.cuda.current_device()], process_group=get_data_parallel_group(), find_unused_parameters=False, ) if pipe_group and pipe_group.rank() != 0 and pipe_group.rank() != ( pipe_group.size() - 1): thing = {"input": torch.zeros(args.batch_size)} class FakeDataset: def __getitem__(self, index): return thing def __len__(self): return len(lm_dataloader) lm_dataloader = FakeDataset() for i, batch in enumerate(lm_dataloader): bi = batch["input"] if args.max_batch and i > args.max_batch: break optimizer.zero_grad() try: if (pipe_group is None or pipe_group.rank() == 0) and not args.ddp_zero: tmp = batch["input"].to(get_first_device(model)) output = model(tmp) else: output = model(batch["input"]) except Exception as e: raise RuntimeError( f"training failed on {torch.distributed.get_rank()}") from e if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: target = batch["target"].to(get_last_device(model)) output = output.to(target.device) loss = criterion(output.view(-1, vocab_size), target.view(-1)) if args.ddp_zero: ddp_group = get_data_parallel_group() torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM, group=ddp_group) loss /= ddp_group.size() loss.backward() del target else: if args.ddp_zero: model.module.back_helper(output) else: model.back_helper(output) del output torch.nn.utils.clip_grad_value_(model.parameters(), 0.05) optimizer.step() if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: total_loss += loss.item() log_interval = 1 word_counter += batch["ntokens"] if i % log_interval == 0 and i > 0: cur_loss = total_loss / log_interval elapsed = time.time() - start_time print( "| batch {:5d} | wps {:5.2f} | loss {:5.2f} | ppl {:8.2f}". format(i, word_counter / elapsed, cur_loss, math.exp(cur_loss))) word_counter = 0 total_loss = 0 start_time = time.time()
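# A minimal sketch of the loss averaging done above when args.ddp_zero is set:
# sum-reduce the scalar loss across the data-parallel group, then divide by the
# group size. Assumes an initialized process group; illustrative names.
import torch
import torch.distributed as dist


def average_loss_across_group(loss: torch.Tensor, group=None) -> torch.Tensor:
    dist.all_reduce(loss, op=dist.ReduceOp.SUM, group=group)
    loss /= dist.get_world_size(group)
    return loss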
def run_main(args, rank=None): # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if args.parallel == 'DDP': n = torch.cuda.device_count() // args.world_size device = list(range(rank * n, (rank + 1) * n)) else: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") vocab = torch.load(args.save_vocab) cls_id = vocab.stoi['<cls>'] pad_id = vocab.stoi['<pad>'] sep_id = vocab.stoi['<sep>'] if args.dataset == 'WikiText103': from torchtext.experimental.datasets import WikiText103 train_dataset, valid_dataset, test_dataset = WikiText103(vocab=vocab, single_line=False) elif args.dataset == 'BookCorpus': from data import BookCorpus train_dataset, test_dataset, valid_dataset = BookCorpus(vocab, min_sentence_len=60) if rank is not None: chunk_len = len(train_dataset.data) // args.world_size train_dataset.data = train_dataset.data[(rank * chunk_len):((rank + 1) * chunk_len)] if args.checkpoint != 'None': model = torch.load(args.checkpoint) else: pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout) pretrained_bert.load_state_dict(torch.load(args.bert_model)) model = NextSentenceTask(pretrained_bert) if args.parallel == 'DDP': model = model.to(device[0]) model = DDP(model, device_ids=device) else: model = model.to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available() else "gloo") # Changed model = adaptdl.torch.AdaptiveDataParallel(model, optimizer, scheduler) # Changed best_val_loss = None train_loss_log, val_loss_log = [], [] tensorboard_dir = os.path.join(os.getenv("ADAPTDL_TENSORBOARD_LOGDIR", "/tmp") if adaptdl.env.replica_rank() == 0 else "/tmp", adaptdl.env.job_id()) writer = SummaryWriter(tensorboard_dir)# Added for tensorboard # for epoch in range(1, args.epochs + 1): # original for epoch in adaptdl.torch.remaining_epochs_until(args.epochs): # Changed epoch_start_time = time.time() train(process_raw_data(train_dataset, args), model, train_loss_log, device, optimizer, criterion, epoch, scheduler, cls_id, sep_id, pad_id, args, rank, writer) val_loss = evaluate(process_raw_data(valid_dataset, args), model, device, criterion, cls_id, sep_id, pad_id, args, test = False, epoch = epoch, writer = writer) val_loss_log.append(val_loss) if (rank is None) or (rank == 0): print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s ' '| valid loss {:8.5f} | '.format(epoch, (time.time() - epoch_start_time), val_loss)) print('-' * 89) if not best_val_loss or val_loss < best_val_loss: if rank is None: with open(args.save, 'wb') as f: torch.save(model.module, f) elif rank == 0: with open(args.save, 'wb') as f: torch.save(model.module.state_dict(), f) best_val_loss = val_loss else: scheduler.step() if args.parallel == 'DDP': rank0_devices = [x - rank * len(device) for x in device] device_pairs = zip(rank0_devices, device) map_location = {'cuda:%d' % x: 'cuda:%d' % y for x, y in device_pairs} model.load_state_dict(torch.load(args.save, map_location=map_location)) model = adaptdl.torch.AdaptiveDataParallel(model, optimizer, scheduler) # Changed test_loss = evaluate(process_raw_data(test_dataset, args), model, device, criterion, cls_id, sep_id, pad_id, args) if rank == 0: wrap_up(train_loss_log, val_loss_log, test_loss, args, model.module, 'ns_loss.txt', 'ns_model.pt') else: with open(args.save, 'rb') as f: model = torch.load(f) 
test_loss = evaluate(process_raw_data(test_dataset, args), model, device, criterion, cls_id, sep_id, pad_id, args) wrap_up(train_loss_log, val_loss_log, test_loss, args, model, 'ns_loss.txt', 'ns_model.pt')
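# A minimal sketch of the map_location remapping used above: tensors saved from
# rank 0's devices are mapped onto this rank's devices when the checkpoint is
# reloaded. Illustrative helper, not part of the function above.
import torch


def load_remapped(path, rank0_devices, local_devices):
    map_location = {'cuda:%d' % s: 'cuda:%d' % d
                    for s, d in zip(rank0_devices, local_devices)}
    return torch.load(path, map_location=map_location)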
def check_parity(amp: bool, manual_reduction: bool): # The API should be the exact same in between the sharded and non-sharded variants, generic closure def closure(model, scaler, input_tensor, should_accumulate, _manual_reduction=False): accumulate_steps = 3 if should_accumulate else 1 model.zero_grad() def step(): if scaler is not None: with torch.cuda.amp.autocast(): loss = model(input_tensor).abs().sum() scaler.scale(loss).backward() else: loss = model(input_tensor).abs().sum() loss.backward() with model.no_sync() if should_accumulate else suppress(): for _ in range(accumulate_steps - 1): step() if not _manual_reduction: step() else: with model.no_sync(): step() model.reduce() # Any model works. Add one different buffer per rank model = _get_mlp() model.register_buffer("test_buffer", torch.ones((1)) * rank) model.to(device) # Make sure that the model starts with non-trainable, so that we check for the buckets to be # properly reassigned when/if this changes next(model.parameters()).requires_grad = False sharded_optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4, momentum=0.99) sharded_ddp_model = ShardedDataParallel( module=model, sharded_optimizer=sharded_optimizer, broadcast_buffers=True, reduce_buffer_size=reduce_buffer_size, reduce_fp16=fp16_reduction, ) ddp_model_single = copy.deepcopy(model) ddp_optimizer = torch.optim.SGD(ddp_model_single.parameters(), lr=1e-4, momentum=0.99) ddp_model = DDP(ddp_model_single, device_ids=[rank], broadcast_buffers=True, find_unused_parameters=True) if fp16_reduction: from dist.algorithms.ddp_com_hooks.default_hooks import fp16_compress_hook ddp_model.register_comm_hook( state=None, hook=fp16_compress_hook) # type: ignore ddp_scaler = TorchGradScaler() if amp else None sharded_ddp_scaler = ShardedGradScaler() if amp else None # The model should be synchronized in between the ranks at construction time, check that check_same_model_params(sharded_ddp_model, ddp_model) # Typical training loop, check that we get the exact same results as DDP for i in range(NUMBER_BATCHS): input_tensor = torch.rand((BATCH_SIZE, 2)).to(device) def closure_ddp(input_tensor=input_tensor): return closure(ddp_model, ddp_scaler, input_tensor, grad_accumulation) def closure_sharded(input_tensor=input_tensor): return closure( sharded_ddp_model, sharded_ddp_scaler, input_tensor, grad_accumulation, _manual_reduction=manual_reduction, ) # Step/scale both if ddp_scaler is not None: _ = closure_ddp(input_tensor) ddp_scaler.step(ddp_optimizer) ddp_scaler.update() else: ddp_optimizer.step(closure=closure_ddp) if sharded_ddp_scaler is not None: _ = closure_sharded(input_tensor) sharded_ddp_scaler.step(sharded_optimizer) sharded_ddp_scaler.update() else: sharded_optimizer.step(closure=closure_sharded) check_same_model_params(sharded_ddp_model, ddp_model, f"Rank: {rank} - Step {i} broke") # Flip the trainability of the first parameter back and forth if i == 0 and change_train_graph: next(sharded_ddp_model.parameters()).requires_grad = not next( sharded_ddp_model.parameters()).requires_grad next(ddp_model.parameters()).requires_grad = not next( ddp_model.parameters()).requires_grad check_same_model_params( sharded_ddp_model, ddp_model, f"Rank: {rank} - Trainability refresh {i} broke")
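# A minimal sketch of the no_sync() accumulation exercised by the closure
# above: gradients stay local for all but the last micro-batch, and the DDP
# all-reduce fires only on the final backward. Illustrative names.
from contextlib import suppress


def accumulate_then_step(ddp_model, optimizer, micro_batches, should_accumulate: bool):
    optimizer.zero_grad()
    ctx = ddp_model.no_sync() if should_accumulate else suppress()
    with ctx:
        for x in micro_batches[:-1]:
            ddp_model(x).abs().sum().backward()  # local gradients only
    ddp_model(micro_batches[-1]).abs().sum().backward()  # gradients all-reduced here
    optimizer.step()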
def check_optimizer_equivalence(optimizer: Type[torch.optim.Optimizer]): # Any model works. Add one different buffer per rank model = torch.nn.Sequential( torch.nn.Linear(2, 3), torch.nn.Linear(3, 3), torch.nn.Linear(3, 3), ) model.register_buffer("test_buffer", torch.ones((1)) * self.rank) model.to(self.device) sharded_optimizer = ZeroRedundancyOptimizer( params=model.parameters(), optimizer_class=optimizer, lr=1e-3 ) sharded_ddp_model = DDP( module=model, device_ids=[self.rank], broadcast_buffers=True, find_unused_parameters=True ) ddp_model_single = copy.deepcopy(model) ddp_model_single.to(self.device) ddp_optimizer = optimizer(ddp_model_single.parameters(), lr=1e-3) ddp_model = DDP( ddp_model_single, device_ids=[self.rank], broadcast_buffers=True, find_unused_parameters=True ) # The model should be synchronized in between the ranks at construction time, check that check_same_model_params(sharded_ddp_model, ddp_model, "Models differ from the start") def check_step(): input_tensor = torch.rand((64, 2)) def closure_ddp(input_tensor=input_tensor): ddp_optimizer.zero_grad() ddp_loss = ddp_model(input_tensor).abs().sum() ddp_loss.backward() return ddp_loss def closure_sharded(input_tensor=input_tensor): sharded_optimizer.zero_grad() sharded_loss = sharded_ddp_model(input_tensor).abs().sum() sharded_loss.backward() return sharded_loss loss_ddp = cast(torch.Tensor, ddp_optimizer.step(closure=closure_ddp)) loss_sharded_optim = cast(torch.Tensor, sharded_optimizer.step(closure=closure_sharded)) assert torch.allclose( loss_ddp, loss_sharded_optim ), "Losses differ in between Pytorch optim and ZeroRedundancyOptimizer" check_same_model_params(sharded_ddp_model, ddp_model, "Models differ after a step") # The models should stay the same in between the ranks for i in range(BATCHS): check_step() # Change the models trainability, check that parity is maintained # only check after a couple of constant batchs to go through both regimes if i > BATCHS // 2: next(ddp_model.parameters()).requires_grad = bool(i % 2) next(sharded_ddp_model.parameters()).requires_grad = bool(i % 2) # Check that the checkpoints are compatible reference_rank = 0 # - get states ddp_state_dict = ddp_optimizer.state_dict() sharded_optimizer.consolidate_state_dict(to=reference_rank) sharded_optim_state_dict = [sharded_optimizer.state_dict() if self.rank == reference_rank else {}] dist.broadcast_object_list(sharded_optim_state_dict, src=reference_rank, group=dist.group.WORLD) sharded_optim_state_dict = sharded_optim_state_dict[0] # - cross load the states # run one step and check that the models are still the same ddp_state_dict_ref = copy.deepcopy(ddp_state_dict) # OSS will remove some states ddp_optimizer.load_state_dict(sharded_optim_state_dict) # mixup on purpose ! sharded_optimizer.load_state_dict(ddp_state_dict) check_step() # - self load, rewind, check no problem # run one step and check that the models are still the same ddp_optimizer.load_state_dict(ddp_state_dict_ref) sharded_optimizer.load_state_dict(sharded_optim_state_dict) check_step()
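# A minimal sketch of checkpointing a ZeroRedundancyOptimizer, following the consolidation
# pattern exercised above: each rank holds only its own shard, so the full state must be
# gathered onto one rank before torch.save. The file path is an illustrative assumption.
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer

def save_zero_optimizer(zero_optim: ZeroRedundancyOptimizer, path: str, reference_rank: int = 0):
    # Gather the sharded optimizer state onto the reference rank.
    zero_optim.consolidate_state_dict(to=reference_rank)
    if dist.get_rank() == reference_rank:
        # state_dict() is only meaningful on the rank that received the consolidated state.
        torch.save(zero_optim.state_dict(), path)
    dist.barrier()  # keep other ranks from reading the file before it is written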
def train(local_rank, args): torch.backends.cudnn.benchmark = True import os # torch.multiprocessing.set_sharing_strategy('file_system') # too many barriers / one node data parallel and multiple node DDP os.environ['MASTER_ADDR'] = args["master_addr"] os.environ['MASTER_PORT'] = args["master_port"] os.environ["NCCL_DEBUG"] = "WARN" # os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank) # gpu_device = 0 gpu_device = local_rank os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if args["wandb_dryrun"]: os.environ["WANDB_MODE"] = "dryrun" os.environ["WANDB_SILENT"] = "true" os.environ['TOKENIZERS_PARALLELISM'] = "true" torch.backends.cudnn.benchmark = True rank = args["nr"] if args["cpu"] else (args["nr"] * args["gpus_per_node"] + local_rank) nr = args["nr"] if args["cpu"]: assert local_rank == 0 device = torch.device("cpu") args["dist_backend"] = "gloo" # init_method = "tcp://%s:%s" % ("127.0.0.1", "9999") else: device = torch.device(f'cuda:{gpu_device}') # Unique only on individual node. torch.cuda.set_device(device) if args["init_method"] == "tcp": if args["nr"] == 0: args["master_addr"] = "0.0.0.0" init_method="tcp://%s:%s" % (args["master_addr"], args["master_port"]) elif args["init_method"] == "file": init_method = 'file://%s/%s' % (args["master_addr"], args["master_port"]) else: raise ValueError rnd = torch.tensor(0.0, device="cpu") if args["world_size"] > 1: dist.init_process_group(args["dist_backend"], rank=rank, world_size=args["world_size"], init_method=init_method) rnd = torch.tensor(int(time.time())).to(device) dist.broadcast(rnd, 0) barrier = get_barrier(args["world_size"] > 1) format = "%Y-%m-%d %H-%M %Z" # + timedelta(hours=5, minutes=30) time_string = (datetime.fromtimestamp(time.mktime(time.gmtime(rnd.cpu().item())))).astimezone(timezone('Asia/Kolkata')).strftime(format) ds_name = list(filter(lambda x: len(x.strip()) > 0, args["dataset"].split("/")))[-1].replace("train_fastformer_resampled_", "") set_seeds(args["seed"]) batch_size = 8 optimizer_config.lr = args["lr"] optimizer_config.weight_decay = args["weight_decay"] optimizer_config.gradient_clipping = args["gradient_clipping"] optimizer_config.beta_1 = args["beta_1"] optimizer_config.beta_2 = args["beta_2"] eps = 1e-4 if args["no_autocast"]: optimizer_config.eps = 1e-7 eps = 1e-7 reinit = args["pretrained_model"] is None or "pretrained_model" not in args or args["pretrained_model"] == "" backbone, tokenizer = get_mtt_backbone(args["model_config"], args["cls_tokens"], args["enable_layer_normalizers"], args["sampling_alpha"], reinit, args["enable_layer_normalizers"], args["enable_layer_normalizers_statistics"], dropout_prob=0.01) teacher_backbone, _ = get_mtt_backbone(args["model_config"], args["cls_tokens"], args["enable_layer_normalizers"], None, reinit, args["enable_layer_normalizers"], args["enable_layer_normalizers_statistics"], dropout_prob=0.0) batch_size = args["batch_size"] if "batch_size" in args and isinstance(args["batch_size"], int) else batch_size generator_w = args["generator_w"] if "generator_w" in args else 0.0 discriminator_w = args["discriminator_w"] if "discriminator_w" in args else 0.0 dino_w = args["dino_w"] if "dino_w" in args else 0.0 sentence_order_prediction_w = args["sentence_order_prediction_w"] if "sentence_order_prediction_w" in args else 0.0 attention_penalty_w = args["attention_penalty_w"] if "attention_penalty_w" in args else 0.0 student = MTTModel(backbone, tokenizer, args["cls_tokens"], generator_w=generator_w, discriminator_w=discriminator_w, dino_w=dino_w, 
sentence_order_prediction_w=sentence_order_prediction_w, attention_penalty_w=attention_penalty_w, lm_layers=args["lm_layers"], electra_layers=args["electra_layers"], lm_layers_total=args["lm_layers_total"], electra_layers_total=args["electra_layers_total"], drop_unused_layers=args["drop_unused_layers"], approximate_unused_layers=args["consecutive_layers"], exclude_layers=args["exclude_layers"], keep_last_layer=args["keep_last_layer"], lm_temperature=args["lm_temperature"]) teacher = MTTModel(teacher_backbone, tokenizer, args["cls_tokens"], generator_w=generator_w, discriminator_w=discriminator_w, dino_w=1.0, sentence_order_prediction_w=sentence_order_prediction_w, attention_penalty_w=0.0, lm_layers=None, electra_layers=None, lm_layers_total=args["lm_layers_total"], electra_layers_total=args["electra_layers_total"], lm_temperature=args["lm_temperature"]) teacher = teacher.eval() model = MultiTaskHighwayCLSPretraining(student, teacher, eps, device if args["move_unused_to_cpu"] else None).to(device) trainable_model = student if dino_w == 0: model.teacher = None teacher = None clean_memory() del teacher if local_rank == 0 and rank == 0: print("[Train]: Time = %s, Trainable Params = %s" % (get_time_string(), numel(trainable_model) / 1_000_000)) if args["pretrained_model"] is not None and os.path.exists(args["pretrained_model"]): state_dict = torch.load(args["pretrained_model"], map_location='cpu' if args['cpu'] else 'cuda:%d' % gpu_device) try: trainable_model.load_state_dict(state_dict, strict=True) load_type = "strict" except: try: try: trainable_model.load_state_dict(state_dict, strict=False) load_type = "not_strict" except: state_dict = {k: v for k, v in state_dict.items() if k.startswith("backbone.")} trainable_model.load_state_dict(state_dict, strict=False) load_type = "not_strict_no_ffn" except: try: try: state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} trainable_model.load_state_dict(state_dict, strict=True) load_type = "strict-from-ddp" except: state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} state_dict = {k: v for k, v in state_dict.items() if not k.startswith("backbone.")} trainable_model.load_state_dict(state_dict, strict=True) load_type = "strict-from-ddp-no-ffn" except: try: state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} trainable_model.load_state_dict(state_dict, strict=False) load_type = "not_strict-from-ddp" except: state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} state_dict = {k: v for k, v in state_dict.items() if not k.startswith("backbone.")} trainable_model.load_state_dict(state_dict, strict=False) load_type = "not_strict-from-ddp-no-ffn" if dino_w > 0: student_teacher_param_update(model.student, model.teacher, 0.001, device if args["move_unused_to_cpu"] else None) print("[Train]: Time = %s, Loaded Pretrained model with Load type = %s, Torch Version = %s" % (get_time_string(), load_type, torch.__version__)) del state_dict model = model.train() # print("[Train]: Time = %s, Trainable Params = %s" % (get_time_string(), {k for k, v in model.named_parameters() if v.requires_grad})) if args["world_size"] > 1: # model = FSDP(model, **fsdp_params) # find_unused_parameters=True trainable_model = DDP(trainable_model, device_ids=None if args["cpu"] else [gpu_device], find_unused_parameters=True, bucket_cap_mb=50) # find_unused_parameters=True model.student = trainable_model if dino_w > 0: model.teacher = model.teacher.eval() student_teacher_param_update(model.student, model.teacher, 
0.01, device if args["move_unused_to_cpu"] else None) try: from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import fp16_compress_hook trainable_model.register_comm_hook(state=None, hook=fp16_compress_hook) except: print("[Train]: Time = %s, No fp16_compress_hook present, Torch Version = %s" % (get_time_string(), torch.__version__)) del backbone del teacher_backbone del student clean_memory() barrier() optc = optimizer_config.to_dict() trainable_params = list(filter(lambda p: p.requires_grad, trainable_model.parameters())) if args["optimizer"] == "adamw": optimizer = torch.optim.AdamW(trainable_params, lr=optc["lr"], eps=optc["eps"], weight_decay=optc["weight_decay"], betas=(optc["beta_1"], optc["beta_2"])) elif args["optimizer"] == "sgd": optimizer = torch.optim.SGD(trainable_params, lr=optc["lr"], momentum=0.9, weight_decay=optc["weight_decay"], nesterov=True) elif args["optimizer"] == "novograd": optimizer = Novograd(trainable_params, lr=optc["lr"], eps=optc["eps"], betas=(optc["beta_1"], optc["beta_2"]), weight_decay=optc["weight_decay"],) elif args["optimizer"] == "rangerlars": optimizer = RangerLars(trainable_params, lr=optc["lr"], eps=optc["eps"], betas=(optc["beta_1"], optc["beta_2"]), weight_decay=optc["weight_decay"],) else: raise ValueError # print("[Train]: Time = %s, Trainable Params = %s" % (get_time_string(), {k for k, v in trainable_model.named_parameters() if v.requires_grad})) del trainable_params optimizer.zero_grad(set_to_none=True) model_save_dir = args["model_save_dir"] model_save_name = args["model_save_name"] set_seeds(args["seed"] + rank) if local_rank == 0: if not os.path.exists(model_save_dir): os.makedirs(model_save_dir) assert os.path.exists(model_save_dir) try: dataloader = build_dataloader(os.path.join(args["dataset"], "all_512_only"), args["shuffle_dataset"], batch_size, tokenizer, args["cls_tokens"], world_size=args["world_size"], num_workers=args["num_workers"], max_length=512) dataloader128 = build_dataloader(os.path.join(args["dataset"], "all_128_only"), args["shuffle_dataset"], batch_size * 6, tokenizer, args["cls_tokens"], world_size=args["world_size"], num_workers=args["num_workers"], max_length=128) dataloader256 = build_dataloader(os.path.join(args["dataset"], "all_256_only"), args["shuffle_dataset"], batch_size * 3, tokenizer, args["cls_tokens"], world_size=args["world_size"], num_workers=args["num_workers"], max_length=256) except: print("[WARN] [Train]: Time = %s, All dataloaders and datasets are same = %s" % (get_time_string(), args["dataset"])) dataloader = build_dataloader(args["dataset"], args["shuffle_dataset"], batch_size, tokenizer, args["cls_tokens"], world_size=args["world_size"], num_workers=args["num_workers"], max_length=512) dataloader128 = build_dataloader(args["dataset"], args["shuffle_dataset"], batch_size * 4, tokenizer, args["cls_tokens"], world_size=args["world_size"], num_workers=args["num_workers"], max_length=128) dataloader256 = build_dataloader(args["dataset"], args["shuffle_dataset"], batch_size * 2, tokenizer, args["cls_tokens"], world_size=args["world_size"], num_workers=args["num_workers"], max_length=256) iter_size = max(args["accumulation_steps"], 1) no_sync = iter_size > 1 steps_per_epoch = int(np.ceil(len(dataloader.sampler) / (batch_size * iter_size)) if dataloader.sampler is not None else (len(dataloader) / iter_size)) if local_rank == 0: print("[Train]: Time = %s, Optimizer and Scheduler Initialised, max lr = %.5f, steps_per_epoch = %s, batch size = %s, dataloader length = %s, Sampler Present = %s, 
Sampler Length = %s" % (get_time_string(), optc["lr"], steps_per_epoch, batch_size, len(dataloader), dataloader.sampler is not None, len(dataloader.sampler) if dataloader.sampler is not None else -1)) dataloader = get_next(dataloader) dataloader128 = get_next(dataloader128) dataloader256 = get_next(dataloader256) log_every_steps = args["log_every_steps"] * iter_size save_every_steps = args["save_every_steps"] # scheduler = optimization.get_constant_schedule_with_warmup(optimizer, optc["warmup_steps"]) # scheduler = optimization.get_linear_schedule_with_warmup(optimizer, optc["warmup_steps"], args["epochs"] * len(dataloader)) div_factor = optc["lr"]/1e-6 scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, optc["lr"], total_steps=args["total_steps"], div_factor=div_factor, three_phase=False, pct_start=0.06, anneal_strategy="linear", cycle_momentum=False) # scheduler1 = optimization.get_constant_schedule_with_warmup(optimizer, optc["warmup_steps"]) # scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer, step_size=(steps_per_epoch * args["epochs"]) // args["lr_steps"], gamma=0.5) # scheduler = [scheduler1, scheduler2] barrier() gradient_clipping = optc["gradient_clipping"] group = "%s-%s-%s-%sN-%s" % (args["wandb_name"], ds_name, args["model_config"], args["nodes"], time_string) wandb_init_args = dict(project="de_lm", name="%s-%s-%s-%s" % (group, args["nr"], rank, local_rank), group=group, id=f"{group}-worker-{nr}-{rank}-{local_rank}", config={"args":args, "optimizer_config": optc}, settings=wandb.Settings(start_method="fork")) time.sleep(random.random()) wandb.init(**wandb_init_args) full_times = [] batch_times = [] model_times = [] model.zero_grad(set_to_none=True) samples_processed = 0 samples_processed_this_log_iter = 0 if args["detect_anomaly"]: torch.autograd.set_detect_anomaly(True) def hook(grad): is_nan_inf = torch.logical_not(torch.isfinite(grad)) if is_nan_inf.any(): # print("[GRAD-HOOK]: Time = %s, Param Name = %s, Detected Nan/Inf" % (get_time_string(), name_of_param)) grad[is_nan_inf] = 0.0 return grad return None if not args["no_autocast"] and args["backward_hook"]: for name, param in model.named_parameters(): param.register_hook(hook) dino_center = None discriminator_dino_center = None total_steps = args["total_steps"] steps_done = 0 step = 0 start_time = time.time() while steps_done < total_steps: random.seed(step) len_proba = random.random() if len_proba < 0.5: batch = dataloader128() elif len_proba < 0.6: batch = dataloader256() else: batch = dataloader() epoch_128 = dataloader128.epoch epoch_256 = dataloader256.epoch epoch_512 = dataloader.epoch # batch = None # if len_proba < 0.9: # batches = [dataloader128() for _ in range(4)] # elif len_proba < 0.97: # batches = [dataloader256() for _ in range(2)] # else: # batch = dataloader() # # if batch is None: # keys = batches[0].keys() # batch = dict() # for k in keys: # elems = [b[k] for b in batches] # if isinstance(elems[0], (list, tuple)): # new_elems = [i for e in elems for i in e] # elif isinstance(elems[0], torch.Tensor): # new_elems = torch.cat(elems, 0) # else: # raise TypeError("Expected List or Tensor") # batch[k] = new_elems key = list(batch.keys())[0] bs_size = list(batch[key].size()) batch = {k: v.to(device, non_blocking=True) if hasattr(v, "to") else v for k, v in batch.items()} gen_batch_time = time.time() - start_time teacher_update_w = np.interp(steps_done, [0, args["teacher_warmup_steps"]], [0.95, 0.999]) inner_model = getattr(trainable_model, "module", trainable_model) if hasattr(inner_model, 
"start_from_proba"): start_from_proba = np.interp(steps_done, [0, args["warmup_steps"], args["warmup_steps"] * 2], [0.0, 0.0, args["start_from_proba"]]) inner_model.start_from_proba = start_from_proba if hasattr(inner_model.backbone.encoder, "sampling_alpha") and args["sampling_alpha"] is not None and args["sampling_alpha"] != 1.0: sampling_alpha = np.interp(steps_done, [0, args["warmup_steps"], args["warmup_steps"] * 2], [1.0, 1.0, args["sampling_alpha"]]) inner_model.backbone.encoder.sampling_alpha = max(sampling_alpha, 0.01) inner_model.sampling_alpha = max(sampling_alpha, 0.01) if args["dino_w"] > 0: dino_w = np.interp(steps_done, [0, args["teacher_warmup_steps"], args["teacher_warmup_steps"] * 2], [0.0, 0.0, args["dino_w"]]) inner_model.dino_w = dino_w lm_temperature = np.interp(steps_done, [0, args["warmup_steps"], args["warmup_steps"] * 2], [args["lm_temperature"], args["lm_temperature"], args["lm_temperature"] + 1.0]) inner_model.lm_temperature = lm_temperature batch_times.append(gen_batch_time) if (steps_done + 1) % save_every_steps == 0 or (args["total_steps"] is not None and (steps_done + 1) >= args["total_steps"]): state_dict = trainable_model.state_dict() if not isinstance(trainable_model, DDP) else trainable_model.module.state_dict() if local_rank == 0: torch.save(state_dict, os.path.join(model_save_dir, model_save_name)) del state_dict clean_memory() barrier() if args["total_steps"] is not None and (steps_done + 1) >= args["total_steps"]: return samples_processed += int(batch[key].size(0)) samples_processed_this_log_iter += int(batch[key].size(0)) inner_args = dict(no_autocast=args["no_autocast"], cpu=args["cpu"]) validation_iter = (step + 1) % log_every_steps == 0 or step == 0 model_start = time.time() if no_sync and (step + 1) % iter_size != 0 and hasattr(trainable_model, "no_sync"): with trainable_model.no_sync(): output = train_inner_loop(inner_args, model, batch, optimizer, scheduler, gradient_clipping, iter_size=iter_size, no_sync=True, validation_iter=validation_iter, dino_center=dino_center, discriminator_dino_center=discriminator_dino_center, freeze_last_layer=steps_done < args["freeze_last_layer"], step=step + 1) model_times.append(time.time() - model_start) else: output = train_inner_loop(inner_args, model, batch, optimizer, scheduler, gradient_clipping, iter_size=iter_size, no_sync=False, validation_iter=validation_iter, dino_center=dino_center, discriminator_dino_center=discriminator_dino_center, freeze_last_layer=steps_done < args["freeze_last_layer"], step=step + 1) optimizer.zero_grad(set_to_none=True) steps_done += 1 model_times.append(time.time() - model_start) step += 1 del batch if dino_w > 0 and (step + 1) % iter_size: student_teacher_param_update(model.student, model.teacher, teacher_update_w, device if args["move_unused_to_cpu"] else None) dino_center = output.pop("dino_center", None) discriminator_dino_center = output.pop("discriminator_dino_center", None) # if dino_w > 0 and (step + 1) % (1 * iter_size) == 0 and args["world_size"] > 1: # if dino_center is not None: # dtype = dino_center.dtype # dino_center = dino_center.type(torch.float64) / args["world_size"] # torch.distributed.all_reduce(dino_center, torch.distributed.ReduceOp.SUM) # dino_center = dino_center.type(dtype) # if discriminator_dino_center is not None: # dtype = discriminator_dino_center.dtype # discriminator_dino_center = discriminator_dino_center.type(torch.float64) / args["world_size"] # torch.distributed.all_reduce(discriminator_dino_center, torch.distributed.ReduceOp.SUM) # 
discriminator_dino_center = discriminator_dino_center.type(dtype) if (step + 1) % (4 * iter_size) == 0 and hasattr(getattr(trainable_model, "module", trainable_model).backbone, "layer_normalizers") and args["world_size"] > 1: layer_normalizers = getattr(trainable_model, "module", trainable_model).backbone.layer_normalizers if layer_normalizers is not None: dtype = layer_normalizers.dtype layer_normalizers = layer_normalizers.type(torch.float64) torch.distributed.all_reduce(layer_normalizers, torch.distributed.ReduceOp.SUM) layer_normalizers = layer_normalizers / args["world_size"] getattr(trainable_model, "module", trainable_model).backbone.layer_normalizers = layer_normalizers.type(dtype) if (step + 1) % (4 * iter_size) == 0 and hasattr(getattr(trainable_model, "module", trainable_model).backbone, "layer_normalizers_small") and args["world_size"] > 1: layer_normalizers_small = getattr(trainable_model, "module", trainable_model).backbone.layer_normalizers_small if layer_normalizers_small is not None: dtype = layer_normalizers_small.dtype layer_normalizers_small = layer_normalizers_small.type(torch.float64) torch.distributed.all_reduce(layer_normalizers_small, torch.distributed.ReduceOp.SUM) layer_normalizers_small = layer_normalizers_small / args["world_size"] getattr(trainable_model, "module", trainable_model).backbone.layer_normalizers_small = layer_normalizers_small.type(dtype) full_time = time.time() - start_time full_times.append(full_time) if step == 0 and local_rank == 0: print("[Train]: Time = %s, First Batch Training for Rank = %s" % (get_time_string(), rank)) if validation_iter: steps_remaining = total_steps - steps_done # print({k for k, v in output.items() if isinstance(v, torch.Tensor)}) output = {k: float(v) if v else v for k, v in output.items()} samples_per_second = samples_processed_this_log_iter / np.sum(full_times) wandb_log = dict(lr=optimizer.param_groups[0]['lr'], step=step, samples_processed=samples_processed, samples_per_second=samples_per_second, batch_times=np.mean(batch_times), full_times=np.mean(full_times), model_times=np.mean(model_times), steps_remaining=steps_remaining, pct_complete=(100 * steps_done / total_steps), epoch_128=epoch_128, epoch_256=epoch_256, epoch_512=epoch_512, **{k: v for k, v in output.items() if v is not None}) wandb.log(wandb_log) if local_rank == 0: print("[Train]: Time = %s, Rank = %s, steps = %s, samples_processed=%s, batch_size = %s, Details = %s, LR = %s" % (get_time_string(), rank, step, samples_processed, bs_size, output, optimizer.param_groups[0]['lr'])) print("[Train-Timings]: Time = %s, Batch time = %.4f, Full Time = %.4f, Model Time = %.4f, samples_per_second = %s, steps_remaining = %s, pct_complete = %.4f" % ( get_time_string(), np.mean(batch_times), np.mean(full_times), np.mean(model_times), samples_per_second, steps_remaining, (100 * steps_done / total_steps),)) # print("Step = %s, Steps Done = %s, log_every_steps = %s, total_steps = %s, steps_remaining = %s, validation_iter = %s, %s" % (step, steps_done, log_every_steps, total_steps, steps_remaining, validation_iter, (step + 1) % log_every_steps == 0)) batch_times = [] full_times = [] model_times = [] samples_processed_this_log_iter = 0 if args["enable_layer_normalizers_statistics"] and local_rank == 0: backbone = getattr(model.student, "module", model.student).backbone stats = backbone.layer_normalizers inp_stats=backbone.encoder.layer_normalizers_statistics norms = stats[:, 2, 0].tolist() inp_norms = inp_stats[:, 2, 0].tolist() centers = stats[:, 0, 0:8].tolist() 
inp_centers = inp_stats[:, 0, 0:8].tolist() stds = stats[:, 1, 0:8].tolist() inp_stds = inp_stats[:, 1, 0:8].tolist() dist_stats = backbone.encoder.distance_statistics.tolist() print("Branch Norms = \n", tabulate(pd.DataFrame(norms), tablefmt="psql")) print("Skip Norms = \n", tabulate(pd.DataFrame({"norm": inp_norms, "dist": dist_stats}), tablefmt="psql")) print("Branch centers = \n", tabulate(pd.DataFrame(centers), tablefmt="psql")) print("Skip centers = \n", tabulate(pd.DataFrame(inp_centers), tablefmt="psql")) print("Branch stds = \n", tabulate(pd.DataFrame(stds), tablefmt="psql")) print("Skip stds = \n", tabulate(pd.DataFrame(inp_stds), tablefmt="psql")) # clean_memory() # barrier() del output del bs_size start_time = time.time() print("Time = %s, Finished Training for Rank = %s" % (get_time_string(), rank)) state_dict = trainable_model.state_dict() if not isinstance(trainable_model, DDP) else trainable_model.module.state_dict() if local_rank == 0: torch.save(state_dict, os.path.join(model_save_dir, model_save_name)) del model
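# A minimal sketch of the checkpoint-save pattern used in the loop above: unwrap the DDP
# `.module` if present, write the state_dict from local_rank 0 only, and hold the other ranks
# at a barrier so nobody races ahead of the write. Plain torch.distributed.barrier is assumed
# here in place of the get_barrier helper used above.
import os
import torch
import torch.distributed as dist

def save_checkpoint(model, save_dir, name, local_rank, distributed=True):
    to_save = getattr(model, "module", model)  # unwrap DDP if the model is wrapped
    if local_rank == 0:
        os.makedirs(save_dir, exist_ok=True)
        torch.save(to_save.state_dict(), os.path.join(save_dir, name))
    if distributed:
        dist.barrier()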
def _test_zero_join(self, device): r""" Check that the ZeRO join hook allows training with uneven inputs when using the given device. Arguments: device (torch.device): device used to store parameters and perform collective communications. """ NUM_INPUTS = 3 NUM_EPOCHS = 2 torch.manual_seed(0) torch.cuda.manual_seed(0) rank = self.rank world_size = self.world_size is_gpu = device.type == "cuda" backend = dist.Backend.NCCL if is_gpu else dist.Backend.GLOO self.dist_init(rank, world_size, backend) if BACKEND == dist.Backend.NCCL and is_gpu: torch.cuda.set_device(self.device) model = torch.nn.Sequential( torch.nn.Linear(2, 3), torch.nn.Linear(3, 3), torch.nn.Linear(3, 3), ) model.to(device) # DDP ensures correct gradients in data parallel training, so DDP with # local optimizers on uneven inputs should be equivalent to ZeRO on # uneven inputs with gradients being manually set ddp_model = DDP(model, device_ids=[rank]) if is_gpu else DDP(model) local_optim = torch.optim.Adam(ddp_model.parameters(), lr=0.01) zero_model = copy.deepcopy(model) zero_model.to(device) zero_optim = ZeroRedundancyOptimizer(zero_model.parameters(), torch.optim.Adam, lr=0.01) loss_fn = torch.nn.MSELoss() # Use uneven inputs: rank i has i extra inputs inputs = [torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank)] labels = torch.randn(20, 3).to(device) # Save the gradients and parameters from DDP as the ground truth; do # so on the last-joining rank (in this case, the largest rank) grads_at_each_iter = [] params_at_each_iter = [] with ddp_model.join(): for _ in range(NUM_EPOCHS): for input in inputs: output = ddp_model(input) loss_fn(output, labels).backward() if rank == world_size - 1: grads = [] for p in ddp_model.parameters(): grads.append(p.grad.detach().clone().to(device)) local_optim.step() if rank == world_size - 1: params = [] for p in ddp_model.parameters(): params.append(p.detach().clone().to(device)) grads_at_each_iter.append(grads) params_at_each_iter.append(params) # Broadcast the saved gradients and parameters to all of the other # ranks (which joined early) grads_and_params = [grads_at_each_iter, params_at_each_iter] grads_and_params = _broadcast_object(grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD, device=device) grads_at_each_iter = grads_and_params[0] params_at_each_iter = grads_and_params[1] # TODO: Replace this `_broadcast_object` with `broadcast_object_list` # once the latter supports loading to the destination device instead # of the source device # A process must still set the remaining gradients after joining, so we # define a join hook to do this before the ZeRO join hook class _JoinGradInfo(): def __init__(self, grads): self.grads = grads # remaining gradients to set (in order) self.index = 0 class _SetGradsJoinHook(_JoinHook): def __init__(self, zero_optim, grads): zero_optim._join_grad_info = _JoinGradInfo(grads) self.zero = zero_optim super().__init__() def main_hook(self): grads = self.zero._join_grad_info.grads[self.zero._join_grad_info.index] self.zero._join_grad_info.index += 1 for p, grad in zip(self.zero._all_params, grads): p.grad = grad.detach().clone().to(device) class _GradientSetter(_Joinable): def __init__(self): super().__init__() def _join_hook(self, **kwargs): assert "zero_optim" in kwargs assert "grads" in kwargs zero_optim = kwargs["zero_optim"] grads = kwargs["grads"] return _SetGradsJoinHook(zero_optim, grads) @property def _join_device(self): return device @property def _join_process_group(self): return dist.group.WORLD num_grads_after_joining 
= NUM_EPOCHS * (world_size - rank - 1) grads = grads_at_each_iter[-num_grads_after_joining:] gradient_setter = _GradientSetter() iter = 0 with _Join([gradient_setter, zero_optim], zero_optim=zero_optim, grads=grads): for _ in range(NUM_EPOCHS): for input in inputs: # Notify join context that this process has not joined _Join.notify_join_context(gradient_setter) # Set gradients manually for p, grad in zip(zero_model.parameters(), grads_at_each_iter[iter]): p.grad = grad.detach().clone().to(device) # Perform optimizer step and check parity zero_optim.step() for p, ddp_p in zip(zero_model.parameters(), params_at_each_iter[iter]): assert torch.allclose(p, ddp_p), \ "Parameters differ between using ZeRO and local optimizer" iter += 1
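# A minimal sketch of the uneven-input situation the test above checks: ranks may own
# different numbers of batches, and the join() context manager keeps the collectives of
# already-finished ranks matched while the stragglers keep training. Purely illustrative;
# it assumes the process group is already initialized and `rank` indexes a local GPU.
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def train_uneven(rank, base_inputs=3):
    model = DDP(torch.nn.Linear(2, 3).to(rank), device_ids=[rank])
    optim = torch.optim.SGD(model.parameters(), lr=0.01)
    # Rank i gets i extra inputs, mirroring the test above.
    inputs = [torch.randn(20, 2, device=rank) for _ in range(base_inputs + rank)]
    with model.join():
        for x in inputs:
            optim.zero_grad()
            model(x).abs().sum().backward()
            optim.step()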
def train(hyp, tb_writer, opt, device): print(f'Hyperparameters {hyp}') log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution' # run directory wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = log_dir + os.sep + 'results.txt' # Save run settings with open(Path(log_dir) / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(Path(log_dir) / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) epochs = opt.epochs # 300 batch_size = opt.batch_size # batch size per process. total_batch_size = opt.total_batch_size weights = opt.weights # initial training weights local_rank = opt.local_rank # TODO: Init DDP logging. Only the first process is allowed to log. # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs. # Configure init_seeds(2 + local_rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Remove previous results if local_rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg, nc=nc).to(device) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # the default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. # TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Load Model # Avoid multiple downloads. 
with torch_distributed_zero_first(local_rank): google_utils.attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and model.state_dict()[k].shape == v.shape } model.load_state_dict(ckpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) # DP mode if device.type != 'cpu' and local_rank == -1 and torch.cuda.device_count( ) > 1: model = torch.nn.DataParallel(model) # Exponential moving average # From https://github.com/rwightman/pytorch-image-models/blob/master/train.py: # "Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper" # chenyzsjtu: ema should be placed before after SyncBN. As SyncBN introduces new modules. if opt.sync_bn and device.type != 'cpu' and local_rank != -1: print("SyncBN activated!") model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) ema = torch_utils.ModelEMA(model) if local_rank in [-1, 0] else None # DDP mode if device.type != 'cpu' and local_rank != -1: model = DDP(model, device_ids=[local_rank], output_device=local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=local_rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if local_rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency # Only one check and log is needed. if local_rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move if local_rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' % epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) # When in DDP mode, the generated indices will be broadcasted to synchronize dataset. if dataset.image_weights: # Generate indices. if local_rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast. 
if local_rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if local_rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if local_rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if local_rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if local_rank in [-1, 0]: print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # loss is scaled with batch size in func compute_loss. But in DDP mode, gradient is averaged between devices. if local_rank != -1: loss *= opt.world_size if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if local_rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # Only the first process in DDP mode is allowed to log or save checkpoints. 
if local_rank in [-1, 0]: # mAP if ema is not None: ema.update_attr( model, include=['md', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Explicitly keep the shape. # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system( 'gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if local_rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if local_rank not in [-1, 0] else None torch.cuda.empty_cache() return results
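# torch_utils.ModelEMA is an external helper; as an assumption about what it provides, here is
# a minimal exponential-moving-average sketch: keep a frozen shadow copy of the model and blend
# its floating-point tensors toward the live weights after every optimizer step. The constant
# decay is illustrative, not the ramped schedule the real helper may use.
import copy
import torch

class SimpleEMA:
    def __init__(self, model, decay=0.999):
        self.ema = copy.deepcopy(getattr(model, "module", model)).eval()
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        msd = getattr(model, "module", model).state_dict()
        for k, v in self.ema.state_dict().items():
            if v.dtype.is_floating_point:
                v.mul_(self.decay).add_(msd[k].detach(), alpha=1.0 - self.decay)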
def _test_zero_model_parallel(self, parameters_as_bucket_view: bool): # Use two processes each with two GPUs assert self.rank < 2 NUM_EPOCHS = 3 NUM_INPUTS = 5 LR = 0.01 torch.manual_seed(0) torch.cuda.manual_seed(0) class ModelParallelModel(torch.nn.Module): def __init__(self, dev0, dev1): super().__init__() self.dev0 = dev0 self.dev1 = dev1 self.net0 = torch.nn.Linear(10, 10).to(dev0) self.relu = torch.nn.ReLU() self.net1 = torch.nn.Linear(10, 5).to(dev1) def forward(self, x): x = x.to(self.dev0) x = self.relu(self.net0(x)) x = x.to(self.dev1) return self.net1(x) class LocalModel(torch.nn.Module): def __init__(self): super().__init__() self.net0 = torch.nn.Linear(10, 10) self.relu = torch.nn.ReLU() self.net1 = torch.nn.Linear(10, 5) def forward(self, x): return self.net1(self.relu(self.net0(x))) dev0 = 2 * self.rank dev1 = 2 * self.rank + 1 mp_model = ModelParallelModel(dev0, dev1) ddp_model = DDP(mp_model) local_model = LocalModel() cpu_device = torch.device("cpu") # Ensure the parameters are the same across the two models local_model.net0.weight = torch.nn.Parameter(mp_model.net0.weight.detach().clone().to(cpu_device)) local_model.net0.bias = torch.nn.Parameter(mp_model.net0.bias.detach().clone().to(cpu_device)) local_model.net1.weight = torch.nn.Parameter(mp_model.net1.weight.detach().clone().to(cpu_device)) local_model.net1.bias = torch.nn.Parameter(mp_model.net1.bias.detach().clone().to(cpu_device)) # Compare parity between DDP with model parallelism using ZeRO and # a local model using a local optimizer zero_optim = ZeroRedundancyOptimizer( ddp_model.parameters(), optimizer_class=torch.optim.Adam, parameters_as_bucket_view=parameters_as_bucket_view, lr=LR ) local_optim = torch.optim.Adam(local_model.parameters(), lr=LR) inputs = [torch.randn(20, 10) for _ in range(NUM_INPUTS)] for _ in range(NUM_EPOCHS): for input in inputs: def closure_local(): local_optim.zero_grad() local_loss = local_model(input).abs().sum() local_loss.backward() return local_loss def closure_ddp(): zero_optim.zero_grad() ddp_loss = ddp_model(input).abs().sum() ddp_loss.backward() return ddp_loss local_loss = cast(torch.Tensor, local_optim.step(closure=closure_local)) ddp_loss = cast(torch.Tensor, zero_optim.step(closure=closure_ddp)).to(cpu_device) assert torch.allclose( local_loss, ddp_loss ), "Losses differ between local optim and ZeroRedundancyOptimizer" for local_p, ddp_p in zip(local_model.parameters(), ddp_model.parameters()): ddp_p = ddp_p.to(cpu_device) assert torch.allclose(local_p, ddp_p), "Models differ after a step"
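# A minimal sketch of the closure-based stepping the parity checks above rely on: the closure
# re-evaluates the loss and runs backward, and step() returns that loss so the two optimizers
# can be compared directly. Illustrative only; `model`, `optimizer`, and `x` are assumed inputs.
import torch

def step_with_closure(model, optimizer, x):
    def closure():
        optimizer.zero_grad()
        loss = model(x).abs().sum()
        loss.backward()
        return loss
    return optimizer.step(closure=closure)

# Usage:
# loss_a = step_with_closure(model_a, optim_a, x)
# loss_b = step_with_closure(model_b, optim_b, x)
# assert torch.allclose(loss_a, loss_b)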
def train(hyp, opt, device, tb_writer=None): logger.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict is_coco = opt.data.endswith('coco.yaml') # Logging- Doing this before checking the dataset. Might update data_dict loggers = {'wandb': None} # loggers dict if rank in [-1, 0]: opt.hyp = hyp # add hyperparameters run_id = torch.load(weights).get('wandb_id') if weights.endswith( '.pt') and os.path.isfile(weights) else None wandb_logger = WandbLogger(opt, Path(opt.save_dir).stem, run_id, data_dict) loggers['wandb'] = wandb_logger.wandb data_dict = wandb_logger.data_dict if wandb_logger.wandb: weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp # WandbLogger might update weights, epochs if resuming nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = [ 'anchor' ] if (opt.cfg or hyp.get('anchors')) and not opt.resume else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay 
if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp[ 'lrf'] # linear else: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Results if ckpt.get('training_results') is not None: results_file.write_text( ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = max(int(model.stride.max()), 32) # grid size (max stride) nl = model.model[ -1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: testloader = create_dataloader( test_path, imgsz_test, batch_size * 2, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr('val: '))[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision # DDP mode if cuda and rank != -1: model = DDP( model, device_ids=[opt.local_rank], output_device=opt.local_rank, # nn.MultiheadAttention incompatibility with DDP https://github.com/pytorch/pytorch/issues/26698 find_unused_parameters=any( isinstance(layer, nn.MultiheadAttention) for layer in model.modules())) # Model parameters hyp['box'] *= 3. / nl # scale to layers hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3. / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss = ComputeLoss(model) # init loss class logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'labels', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in 
enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(torch.jit.trace(model, imgs, strict=False), []) # add model graph elif plots and ni == 10 and wandb_logger.wandb: wandb_logger.log({ "Mosaics": [ wandb_logger.wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') if x.exists() ] }) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights' ]) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP wandb_logger.current_epoch = epoch + 1 results, maps, times = test.test(data_dict, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, wandb_logger=wandb_logger, compute_loss=compute_loss, is_coco=is_coco) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # append metrics, val_loss if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb_logger.wandb: wandb_logger.log({tag: x}) # W&B # Update best mAP fi = 
fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@0.5, mAP@0.5:0.95]
if fi > best_fitness:
    best_fitness = fi
wandb_logger.end_epoch(best_result=best_fitness == fi)

# Save model
if (not opt.nosave) or (final_epoch and not opt.evolve):  # if save
    ckpt = {
        'epoch': epoch,
        'best_fitness': best_fitness,
        'training_results': results_file.read_text(),
        'model': deepcopy(model.module if is_parallel(model) else model).half(),
        'ema': deepcopy(ema.ema).half(),
        'updates': ema.updates,
        'optimizer': optimizer.state_dict(),
        'wandb_id': wandb_logger.wandb_run.id if wandb_logger.wandb else None
    }

    # Save last, best and delete
    torch.save(ckpt, last)
    if best_fitness == fi:
        torch.save(ckpt, best)
    if wandb_logger.wandb:
        if ((epoch + 1) % opt.save_period == 0 and not final_epoch) and opt.save_period != -1:
            wandb_logger.log_model(last.parent, opt, epoch, fi, best_model=best_fitness == fi)
    del ckpt
# end epoch ----------------------------------------------------------------------------------------------------
# end training

if rank in [-1, 0]:
    # Plots
    if plots:
        plot_results(save_dir=save_dir)  # save as results.png
        if wandb_logger.wandb:
            files = ['results.png', 'confusion_matrix.png',
                     *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]]
            wandb_logger.log({"Results": [wandb_logger.wandb.Image(str(save_dir / f), caption=f)
                                          for f in files if (save_dir / f).exists()]})

    # Test best.pt
    logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
        for m in (last, best) if best.exists() else (last,):  # speed, mAP tests; (last,) so a lone Path stays iterable
            results, _, _ = test.test(opt.data,
                                      batch_size=batch_size * 2,
                                      imgsz=imgsz_test,
                                      conf_thres=0.001,
                                      iou_thres=0.7,
                                      model=attempt_load(m, device).half(),
                                      single_cls=opt.single_cls,
                                      dataloader=testloader,
                                      save_dir=save_dir,
                                      save_json=True,
                                      plots=False,
                                      is_coco=is_coco)

    # Strip optimizers
    final = best if best.exists() else last  # final model
    for f in last, best:
        if f.exists():
            strip_optimizer(f)  # strip optimizers
    if opt.bucket:
        os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload
    if wandb_logger.wandb and not opt.evolve:  # Log the stripped model
        wandb_logger.wandb.log_artifact(str(final),
                                        type='model',
                                        name='run_' + wandb_logger.wandb_run.id + '_model',
                                        aliases=['last', 'best', 'stripped'])
    wandb_logger.finish_run()
else:
    dist.destroy_process_group()
torch.cuda.empty_cache()
return results
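
# --- Hedged sketch (not part of the original train() above) -----------------
# A minimal, self-contained illustration of the scheduling pattern used in the
# training loop above: a cosine "one_cycle" LambdaLR combined with a linear
# per-iteration warmup that interpolates lr/momentum over the first `nw`
# iterations, plus gradient accumulation via `accumulate`. The hyperparameter
# values, the toy model and the loss below are illustrative assumptions, not
# the original configuration. The original `one_cycle` helper is not defined
# in this section; the version here is a plausible cosine ramp matching the
# "# cosine 1->hyp['lrf']" comment. The original also warms the bias group up
# from hyp['warmup_bias_lr'], which is omitted here (single param group).
import math

import numpy as np
import torch
from torch import nn, optim
from torch.optim import lr_scheduler


def one_cycle(y1=0.0, y2=1.0, steps=100):
    # cosine ramp from y1 to y2 over `steps` epochs
    return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1


hyp = {'lr0': 0.01, 'lrf': 0.2, 'momentum': 0.937,
       'warmup_momentum': 0.8, 'warmup_epochs': 3.0}   # assumed values
epochs, nb = 5, 200                    # epochs and batches per epoch (assumed)
nbs, total_batch_size = 64, 16         # nominal vs. actual batch size (assumed)

model = nn.Linear(8, 1)
optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'],
                      momentum=hyp['momentum'], nesterov=True)
lf = one_cycle(1, hyp['lrf'], epochs)                       # 1 -> hyp['lrf']
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # stores 'initial_lr' per group

nw = max(round(hyp['warmup_epochs'] * nb), 1000)       # warmup iterations
accumulate = max(round(nbs / total_batch_size), 1)     # batches per optimizer step
x_in, y_in = torch.randn(16, 8), torch.randn(16, 1)

for epoch in range(epochs):
    for i in range(nb):
        ni = i + nb * epoch                            # integrated batch count
        if ni <= nw:                                   # linear warmup, as in the loop above
            xi = [0, nw]
            accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
            for g in optimizer.param_groups:
                g['lr'] = np.interp(ni, xi, [0.0, g['initial_lr'] * lf(epoch)])
                g['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
        loss = nn.functional.mse_loss(model(x_in), y_in)
        loss.backward()                                # accumulate gradients
        if ni % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()
    scheduler.step()                                   # per-epoch cosine decay
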
def train_step(self, rank, start_time, return_dict, writer): device = torch.device("cuda:" + str(rank)) print('Running on device: ', device) torch.cuda.set_device(device) torch.set_default_tensor_type(torch.FloatTensor) self.setup(rank, self.args.num_processes) if self.cfg.MC_DQL: transition = namedtuple('Transition', ('episode')) else: transition = namedtuple( 'Transition', ('state', 'action', 'reward', 'next_state', 'done')) memory = TransitionData_ts(capacity=self.args.t_max, storage_object=transition) env = SpGcnEnv(self.args, device, writer=writer, writer_counter=self.global_writer_quality_count, win_event_counter=self.global_win_event_count) # Create shared network # model = GcnEdgeAC_1(self.cfg, self.args.n_raw_channels, self.args.n_embedding_features, 1, device, writer=writer) model = GcnEdgeAC_3(self.cfg, self.args.n_raw_channels, self.args.n_embedding_features, 1, device, writer=writer) # model = GcnEdgeAC(self.cfg, self.args.n_raw_channels, self.args.n_embedding_features, 1, device, writer=writer) model.cuda(device) shared_model = DDP(model, device_ids=[model.device], find_unused_parameters=True) # dloader = DataLoader(MultiDiscSpGraphDsetBalanced(no_suppix=False, create=False), batch_size=1, shuffle=True, pin_memory=True, # num_workers=0) dloader = DataLoader(SpgDset(), batch_size=self.cfg.batch_size, shuffle=True, pin_memory=True, num_workers=0) # Create optimizer for shared network parameters with shared statistics # optimizer = CstmAdam(shared_model.parameters(), lr=self.args.lr, betas=self.args.Adam_betas, # weight_decay=self.args.Adam_weight_decay) ###################### self.action_range = 1 self.device = torch.device(device) self.discount = 0.5 self.critic_tau = self.cfg.critic_tau self.actor_update_frequency = self.cfg.actor_update_frequency self.critic_target_update_frequency = self.cfg.critic_target_update_frequency self.batch_size = self.cfg.batch_size self.log_alpha = torch.tensor(np.log(self.cfg.init_temperature)).to( self.device) self.log_alpha.requires_grad = True # set target entropy to -|A| ###################### # optimizers OptimizerContainer = namedtuple('OptimizerContainer', ('actor', 'critic', 'temperature')) actor_optimizer = torch.optim.Adam( shared_model.module.actor.parameters(), lr=self.cfg.actor_lr, betas=self.cfg.actor_betas) critic_optimizer = torch.optim.Adam( shared_model.module.critic.parameters(), lr=self.cfg.critic_lr, betas=self.cfg.critic_betas) temp_optimizer = torch.optim.Adam([self.log_alpha], lr=self.cfg.alpha_lr, betas=self.cfg.alpha_betas) optimizers = OptimizerContainer(actor_optimizer, critic_optimizer, temp_optimizer) if self.args.fe_extr_warmup and rank == 0 and not self.args.test_score_only: fe_extr = shared_model.module.fe_ext fe_extr.cuda(device) self.fe_extr_warm_start_1(fe_extr, writer=writer) # self.fe_extr_warm_start(fe_extr, writer=writer) if self.args.model_name == "" and not self.args.no_save: torch.save(fe_extr.state_dict(), os.path.join(self.save_dir, 'agent_model_fe_extr')) elif not self.args.no_save: torch.save(fe_extr.state_dict(), os.path.join(self.save_dir, self.args.model_name)) dist.barrier() for param in model.fe_ext.parameters(): param.requires_grad = False if self.args.model_name != "": shared_model.load_state_dict( torch.load(os.path.join(self.save_dir, self.args.model_name))) elif self.args.model_fe_name != "": shared_model.module.fe_ext.load_state_dict( torch.load(os.path.join(self.save_dir, self.args.model_fe_name))) elif self.args.fe_extr_warmup: print('loaded fe extractor') 
shared_model.module.fe_ext.load_state_dict( torch.load(os.path.join(self.save_dir, 'agent_model_fe_extr'))) if not self.args.test_score_only: quality = self.args.stop_qual_scaling + self.args.stop_qual_offset best_quality = np.inf last_quals = [] while self.global_count.value() <= self.args.T_max: if self.global_count.value() == 78: a = 1 self.update_env_data(env, dloader, device) # waff_dis = torch.softmax(env.edge_features[:, 0].squeeze() + 1e-30, dim=0) # waff_dis = torch.softmax(env.gt_edge_weights + 0.5, dim=0) waff_dis = torch.softmax(torch.ones_like( env.b_gt_edge_weights), dim=0) loss_weight = torch.softmax(env.b_gt_edge_weights + 1, dim=0) env.reset() # self.target_entropy = - float(env.gt_edge_weights.shape[0]) self.target_entropy = -8.0 env.stop_quality = self.stop_qual_rule.apply( self.global_count.value(), quality) if self.cfg.temperature_regulation == 'follow_quality': self.alpha = self.eps_rule.apply(self.global_count.value(), quality) print(self.alpha.item()) with open(os.path.join(self.save_dir, 'runtime_cfg.yaml')) as info: args_dict = yaml.full_load(info) if args_dict is not None: if 'safe_model' in args_dict: self.args.safe_model = args_dict['safe_model'] args_dict['safe_model'] = False if 'add_noise' in args_dict: self.args.add_noise = args_dict['add_noise'] if 'critic_lr' in args_dict and args_dict[ 'critic_lr'] != self.cfg.critic_lr: self.cfg.critic_lr = args_dict['critic_lr'] adjust_learning_rate(critic_optimizer, self.cfg.critic_lr) if 'actor_lr' in args_dict and args_dict[ 'actor_lr'] != self.cfg.actor_lr: self.cfg.actor_lr = args_dict['actor_lr'] adjust_learning_rate(actor_optimizer, self.cfg.actor_lr) if 'alpha_lr' in args_dict and args_dict[ 'alpha_lr'] != self.cfg.alpha_lr: self.cfg.alpha_lr = args_dict['alpha_lr'] adjust_learning_rate(temp_optimizer, self.cfg.alpha_lr) with open(os.path.join(self.save_dir, 'runtime_cfg.yaml'), "w") as info: yaml.dump(args_dict, info) if self.args.safe_model: best_quality = quality if rank == 0: if self.args.model_name_dest != "": torch.save( shared_model.state_dict(), os.path.join(self.save_dir, self.args.model_name_dest)) else: torch.save( shared_model.state_dict(), os.path.join(self.save_dir, 'agent_model')) state = env.get_state() while not env.done: # Calculate policy and values post_input = True if ( self.global_count.value() + 1) % 15 == 0 and env.counter == 0 else False round_n = env.counter # sample action for data collection distr = None if self.global_count.value() < self.cfg.num_seed_steps: action = torch.rand_like(env.b_current_edge_weights) else: distr, _, _, action = self.agent_forward( env, shared_model, state=state, grad=False, post_input=post_input) logg_dict = {'temperature': self.alpha.item()} if distr is not None: logg_dict['mean_loc'] = distr.loc.mean().item() logg_dict['mean_scale'] = distr.scale.mean().item() if self.global_count.value( ) >= self.cfg.num_seed_steps and memory.is_full(): self._step(memory, optimizers, env, shared_model, self.global_count.value(), writer=writer) self.global_writer_loss_count.increment() next_state, reward, quality = env.execute_action( action, logg_dict) last_quals.append(quality) if len(last_quals) > 10: last_quals.pop(0) if self.args.add_noise: noise = torch.randn_like(reward) * self.alpha.item() reward = reward + noise memory.push(self.state_to_cpu(state), action, reward, self.state_to_cpu(next_state), env.done) # Train the network # self._step(memory, shared_model, env, optimizer, loss_weight, off_policy=True, writer=writer) # reward = self.args.reward_clip and 
# min(max(reward, -1), 1) or reward  # Optionally clamp rewards
# done = done or episode_length >= self.args.max_episode_length  # Stop episodes at a max length
state = next_state  # carry the new state into the next environment step
# end of the episode loop (while not env.done)
self.global_count.increment()
# end of the training loop (while self.global_count.value() <= self.args.T_max)

dist.barrier()
if rank == 0:
    if not self.args.cross_validate_hp and not self.args.test_score_only and not self.args.no_save:
        # pass
        if self.args.model_name_dest != "":
            torch.save(shared_model.state_dict(),
                       os.path.join(self.save_dir, self.args.model_name_dest))
            print('saved')
        else:
            torch.save(shared_model.state_dict(),
                       os.path.join(self.save_dir, 'agent_model'))
self.cleanup()
# return the mean of the most recent qualities (the list holds at most 10 entries)
return sum(last_quals) / max(len(last_quals), 1)
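
# --- Hedged sketch (not part of the original agent code above) --------------
# `self._step(...)` is not shown in this file, so the actual update rule used
# by train_step is unknown here. Below is a generic SAC-style temperature
# update that is merely consistent with the `log_alpha`, `target_entropy` and
# `temp_optimizer` setup above; every name, value and detail is an
# illustrative assumption, not the author's implementation.
import math

import torch


def update_temperature(log_alpha, temp_optimizer, log_prob, target_entropy):
    # One gradient step on the entropy temperature alpha = exp(log_alpha):
    # alpha is pushed up when the policy entropy (-log_prob) drops below the
    # target and pushed down otherwise.
    alpha = log_alpha.exp()
    alpha_loss = (alpha * (-log_prob - target_entropy).detach()).mean()
    temp_optimizer.zero_grad()
    alpha_loss.backward()
    temp_optimizer.step()
    return alpha_loss.item()


# Usage mirroring the construction in train_step (values are placeholders):
init_temperature = 0.1
log_alpha = torch.tensor(math.log(init_temperature), requires_grad=True)
temp_optimizer = torch.optim.Adam([log_alpha], lr=1e-4, betas=(0.9, 0.999))
log_prob = torch.randn(32)          # stand-in for distr.log_prob(action) terms
update_temperature(log_alpha, temp_optimizer, log_prob, target_entropy=-8.0)
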