def train_model(args):
    with open("tacotron/config.toml") as file:
        cfg = toml.load(file)

    tensorboard_path = Path("tensorboard") / args.checkpoint_dir
    checkpoint_dir = Path(args.checkpoint_dir)
    writer = SummaryWriter(tensorboard_path)

    tacotron = Tacotron(**cfg["model"]).cuda()
    optimizer = optim.Adam(tacotron.parameters(), lr=cfg["train"]["optimizer"]["lr"])
    scaler = amp.GradScaler()
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer,
        milestones=cfg["train"]["scheduler"]["milestones"],
        gamma=cfg["train"]["scheduler"]["gamma"],
    )

    if args.resume is not None:
        global_step = load_checkpoint(
            tacotron=tacotron,
            optimizer=optimizer,
            scaler=scaler,
            scheduler=scheduler,
            load_path=args.resume,
        )
    else:
        global_step = 0

    root_path = Path(args.dataset_dir)
    text_path = Path(args.text_path)

    dataset = TTSDataset(root_path, text_path)
    sampler = samplers.RandomSampler(dataset)
    batch_sampler = BucketBatchSampler(
        sampler=sampler,
        batch_size=cfg["train"]["batch_size"],
        drop_last=True,
        sort_key=dataset.sort_key,
        bucket_size_multiplier=cfg["train"]["bucket_size_multiplier"],
    )
    collate_fn = partial(
        pad_collate, reduction_factor=cfg["model"]["decoder"]["reduction_factor"])
    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=collate_fn,
        num_workers=cfg["train"]["n_workers"],
        pin_memory=True,
    )

    n_epochs = cfg["train"]["n_steps"] // len(loader) + 1
    start_epoch = global_step // len(loader) + 1

    for epoch in range(start_epoch, n_epochs + 1):
        average_loss = 0

        for i, (mels, texts, mel_lengths, text_lengths, attn_flag) in enumerate(tqdm(loader), 1):
            mels, texts = mels.cuda(), texts.cuda()

            optimizer.zero_grad()

            with amp.autocast():
                ys, alphas = tacotron(texts, mels)
                loss = F.l1_loss(ys, mels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(tacotron.parameters(), cfg["train"]["clip_grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            global_step += 1

            average_loss += (loss.item() - average_loss) / i

            if global_step % cfg["train"]["checkpoint_interval"] == 0:
                save_checkpoint(
                    tacotron=tacotron,
                    optimizer=optimizer,
                    scaler=scaler,
                    scheduler=scheduler,
                    step=global_step,
                    checkpoint_dir=checkpoint_dir,
                )

            if attn_flag:
                index = attn_flag[0]
                alpha = alphas[index, :text_lengths[index], :mel_lengths[index] // 2]
                alpha = alpha.detach().cpu().numpy()
                y = ys[index, :, :].detach().cpu().numpy()
                log_alignment(alpha, y, cfg["preprocess"], writer, global_step)

        writer.add_scalar("loss", average_loss, global_step)
        print(
            f"epoch {epoch} : loss {average_loss:.4f} : {scheduler.get_last_lr()}"
        )
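# The loop above follows the standard torch.cuda.amp recipe: run the forward
# pass under autocast, scale the loss before backward, unscale before gradient
# clipping, then let the scaler drive optimizer.step() and update itself.
# A minimal, self-contained sketch of that shared pattern; the linear model,
# the random data and `max_norm` below are illustrative placeholders, not
# taken from the function above.
import torch
from torch import nn, optim
from torch.cuda import amp


def amp_step_sketch(max_norm: float = 1.0, device: str = "cuda"):
    model = nn.Linear(16, 1).to(device)          # stand-in for any model
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scaler = amp.GradScaler()

    x = torch.randn(8, 16, device=device)
    target = torch.randn(8, 1, device=device)

    optimizer.zero_grad()
    with amp.autocast():                         # forward + loss in mixed precision
        loss = nn.functional.l1_loss(model(x), target)

    scaler.scale(loss).backward()                # scaled backward pass
    scaler.unscale_(optimizer)                   # unscale so clipping sees true grads
    nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)                       # skips the step if grads overflowed
    scaler.update()                              # adjust the loss scale
    return loss.item()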
def train( hyp, # path/to/hyp.yaml or hyp dictionary opt, device, callbacks): save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) # Save run settings if not evolve: with open(save_dir / 'hyp.yaml', 'w') as f: yaml.safe_dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.safe_dump(vars(opt), f, sort_keys=False) # Loggers data_dict = None if RANK in [-1, 0]: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance if loggers.wandb: data_dict = loggers.wandb.data_dict if resume: weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Config plots = not evolve # create plots cuda = device.type != 'cpu' init_seeds(1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = ['item'] if single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len( names ) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check is_coco = isinstance(val_path, str) and val_path.endswith( 'coco/val2017.txt') # COCO dataset # Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download( weights) # download if not found locally ckpt = torch.load(weights, map_location='cpu' ) # load checkpoint to CPU to avoid CUDA memory leak model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = [ 'anchor' ] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict( ) # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info( f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}' ) # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create # Freeze freeze = [ f'model.{x}.' 
for x in (freeze if len(freeze) > 1 else range(freeze[0])) ] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz) loggers.on_params_update({"batch_size": batch_size}) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}") g0, g1, g2 = [], [], [] # optimizer parameter groups for v in model.modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias g2.append(v.bias) if isinstance(v, nn.BatchNorm2d): # weight (no decay) g0.append(v.weight) elif hasattr(v, 'weight') and isinstance( v.weight, nn.Parameter): # weight (with decay) g1.append(v.weight) if opt.optimizer == 'Adam': optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum elif opt.optimizer == 'AdamW': optimizer = AdamW(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': g1, 'weight_decay': hyp['weight_decay'] }) # add g1 with weight_decay optimizer.add_param_group({'params': g2}) # add g2 (biases) LOGGER.info( f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups " f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias") del g0, g1, g2 # Scheduler if opt.cos_lr: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] else: lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf' ] # linear scheduler = lr_scheduler.LambdaLR( optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Epochs start_epoch = ckpt['epoch'] + 1 if resume: assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.' if epochs < start_epoch: LOGGER.info( f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs." ) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning( 'WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.' 
) model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) LOGGER.info('Using SyncBatchNorm()') # Trainloader train_loader, dataset = create_dataloader( train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=None if opt.cache == 'val' else opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in [-1, 0]: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers * 2, pad=0.5, prefix=colorstr('val: '))[0] if not resume: labels = np.concatenate(dataset.labels, 0) # c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end') # DDP mode if cuda and RANK != -1: model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK) # Model attributes nl = de_parallel( model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 100) # number of warmup iterations, max(3 epochs, 100 iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) stopper = EarlyStopping(patience=opt.patience) compute_loss = ComputeLoss(model) # init loss class LOGGER.info( f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional, single-GPU only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = 
torch.zeros(3, device=device) # mean losses if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info( ('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size')) if RANK in [-1, 0]: pbar = tqdm( pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in [-1, 0]: # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights' ]) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = val.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if save ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'model': 
deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': ema.updates, 'optimizer': optimizer.state_dict(), 'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None, 'date': datetime.now().isoformat() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0): torch.save(ckpt, w / f'epoch{epoch}.pt') del ckpt callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi) # Stop Single-GPU if RANK == -1 and stopper(epoch=epoch, fitness=fi): break # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576 # stop = stopper(epoch=epoch, fitness=fi) # if RANK == 0: # dist.broadcast_object_list([stop], 0) # broadcast 'stop' to all ranks # Stop DPP # with torch_distributed_zero_first(RANK): # if stop: # break # must break all DDP ranks # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if RANK in [-1, 0]: LOGGER.info( f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.' ) for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if f is best: LOGGER.info(f'\nValidating {f}...') results, _, _ = val.run( data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=attempt_load(f, device).half(), iou_thres=0.65 if is_coco else 0.60, # best pycocotools results at 0.65 single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, save_json=is_coco, verbose=True, plots=True, callbacks=callbacks, compute_loss=compute_loss) # val best model with plots if is_coco: callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) callbacks.run('on_train_end', last, best, plots, epoch, results) LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}") torch.cuda.empty_cache() return results
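# The YOLOv5-style loop above combines the GradScaler with gradient
# accumulation: every batch is backpropagated with a scaled loss, but
# optimizer.step()/scaler.update()/zero_grad() only run once every
# `accumulate` batches. A minimal sketch of that pattern under assumed
# placeholder names (`model`, `loader` and the MSE loss are illustrative,
# not part of the code above):
import torch
from torch import nn, optim
from torch.cuda import amp


def accumulation_sketch(model: nn.Module, loader, accumulate: int = 4,
                        device: str = "cuda"):
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    scaler = amp.GradScaler()
    last_opt_step = -1

    for ni, (imgs, targets) in enumerate(loader):
        imgs, targets = imgs.to(device), targets.to(device)

        with amp.autocast():
            loss = nn.functional.mse_loss(model(imgs), targets)

        scaler.scale(loss).backward()            # gradients keep accumulating

        if ni - last_opt_step >= accumulate:     # optimize only every few batches
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            last_opt_step = ni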
def train(n_gpus, rank, output_directory, epochs, optim_algo, learning_rate,
          weight_decay, sigma, iters_per_checkpoint, batch_size, seed,
          checkpoint_path, ignore_layers, include_layers, finetune_layers,
          warmstart_checkpoint_path, with_tensorboard, grad_clip_val, fp16_run):
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("Output directory", output_directory)

    if with_tensorboard and rank == 0:
        tboard_out_path = os.path.join(output_directory, 'logs')
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None

            with amp.autocast(enabled=fp16_run):
                z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                    mel, speaker_vecs, text, in_lens, out_lens, attn_prior)
                loss_nll, loss_gate = criterion(
                    (z, log_s_list, gate_pred, mean, log_var, prob),
                    gate_target, out_lens)
                loss = loss_nll + loss_gate

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(loss_gate.data, n_gpus).item()
                reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = loss_gate.item()
                reduced_nll_loss = loss_nll.item()

            scaler.scale(loss).backward()
            if grad_clip_val > 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)

            scaler.step(optimizer)
            scaler.update()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('training_loss_gate', reduced_gate_loss, iteration)
                logger.add_scalar('training_loss_nll', reduced_nll_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if iteration % iters_per_checkpoint == 0:
                val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = \
                    compute_validation_loss(model, criterion, valset, collate_fn,
                                            batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}".format(iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, val_loss_nll, val_loss_gate,
                                              attns, gate_pred, gate_target, iteration)
                    checkpoint_path = "{}/model_{}".format(output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def train(args):
    trainer = base_trainer.Trainer()
    args, device = get_args(args)
    args, log, tbx = trainer.setup(args)

    # Get BPE
    log.info("Loading BPE...")
    bpe = get_bpe(args)
    log.info("Loaded {} BPE tokens".format(len(bpe)))

    # Get data loader
    log.info("Building dataset...")
    train_dataset, train_loader = get_dataset(args, args.train_record_file,
                                              bpe, shuffle=True)
    dev_dataset, dev_loader = get_dataset(args, args.train_record_file,
                                          bpe, shuffle=False)
    args.epoch_size = len(train_dataset)
    log.info("Train has {} examples".format(args.epoch_size))

    # Get model
    log.info("Building model...")
    model = get_model(args, bpe)
    model = trainer.setup_model(model, device)

    # Get optimizer, scheduler, and scaler
    optimizer = optim.AdamW(
        model.parameters(),
        args.lr,
        betas=(args.beta_1, args.beta_2),
        eps=args.eps,
        weight_decay=args.l2_wd,
    )

    get_num_steps(args)
    log.info("Scheduler will decay over {} steps".format(args.num_steps))
    scheduler = sched.get_linear_warmup_power_decay_scheduler(
        optimizer, args.warmup_steps, args.num_steps, power=args.power_decay)

    scaler = amp.GradScaler()
    optimizer, scheduler, scaler = trainer.setup_optimizer(optimizer, scheduler, scaler)

    # Train
    log.info("Training...")
    model.train()
    sample_num = 0
    samples_till_eval = args.eval_per_n_samples
    epoch = 0
    step = 0

    trainer.setup_saver()
    trainer.setup_random()
    sample_num, samples_till_eval, epoch, step = trainer.setup_step(
        step_vars=(sample_num, samples_till_eval, epoch, step))
    trainer.setup_close()

    while epoch != args.num_epochs:
        trainer.save_checkpoint(step_vars=(sample_num, samples_till_eval, epoch, step))
        epoch += 1
        log.info(f"Starting epoch {epoch}...")

        # Print histogram of weights every epoch
        for tags, params in model.named_parameters():
            tbx.add_histogram(tags, params.data, epoch)

        with torch.enable_grad(), tqdm(total=len(train_loader.dataset)) as progress_bar:
            for x, y, _, _ in train_loader:
                batch_size = x.size(0)
                loss, loss_val, _ = forward(x, y, args, device, model)
                loss = loss / args.gradient_accumulation

                # Backward
                scaler.scale(loss).backward()
                if (step + 1) % args.gradient_accumulation == 0:
                    scaler.unscale_(optimizer)
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()

                # Log info
                step += 1
                sample_num += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar("train/NLL", loss_val, sample_num)
                tbx.add_scalar("train/LR", optimizer.param_groups[0]["lr"], sample_num)
                tbx.add_scalar("train/steps", step // args.gradient_accumulation,
                               sample_num)

    results, augs = augment(model, dev_loader, device, bpe, args)
    for k, v in results.items():
        tbx.add_scalar(f"dev/{k}", v, sample_num)
    save(args.train_aug_file, augs, "train aug")
def run_train(): fold = 3 out_dir = \ '/root/share1/kaggle/2021/bms-moleular-translation/result/try10/resnet26d-224-transformer/fold%d-1' % fold initial_checkpoint = \ out_dir + '/checkpoint/00098000_model.pth'# #'/root/share1/kaggle/2021/bms-moleular-translation/result/try02/resnet26d-224-lstm/fold3/checkpoint/00036000_model.pth' encoder_checkpoint = \ None #'/root/share1/kaggle/2021/bms-moleular-translation/result/try02/resnet26d-224/fold3-1/checkpoint/00204000_model.pth' debug = 0 start_lr = 0.001 # 1 batch_size = 32 # 24 ## setup ---------------------------------------- for f in ['checkpoint', 'train', 'valid', 'backup']: os.makedirs(out_dir + '/' + f, exist_ok=True) # backup_project_as_zip(PROJECT_PATH, out_dir +'/backup/code.train.%s.zip'%IDENTIFIER) log = Logger() log.open(out_dir + '/log.train.txt', mode='a') log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64)) log.write('\t%s\n' % COMMON_STRING) log.write('\t__file__ = %s\n' % __file__) log.write('\tout_dir = %s\n' % out_dir) log.write('\n') ## dataset ------------------------------------ df_train, df_valid = make_fold('train-%d' % fold) tokenizer = load_tokenizer() train_dataset = BmsDataset(df_train, tokenizer) valid_dataset = BmsDataset(df_valid, tokenizer) train_loader = DataLoader( train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size, drop_last=True, num_workers=0, pin_memory=True, collate_fn=null_collate, ) valid_loader = DataLoader( valid_dataset, #sampler=SequentialSampler(valid_dataset), sampler=FixNumSampler(valid_dataset, 5_000), #200_000 #5_000 batch_size=32, drop_last=False, num_workers=0, pin_memory=True, collate_fn=null_collate, #collate_fn=lambda batch: null_collate(batch,False), ) log.write('** dataset setting **\n') log.write('train_dataset : \n%s\n' % (train_dataset)) log.write('valid_dataset : \n%s\n' % (valid_dataset)) log.write('\n') ## net ---------------------------------------- scaler = amp.GradScaler() net = AmpNet().cuda() if encoder_checkpoint is not None: f = torch.load(encoder_checkpoint, map_location=lambda storage, loc: storage) encoder_state_dict = f['encoder_state_dict'] net.encoder.load_state_dict(encoder_state_dict, strict=False) # True if initial_checkpoint is not None: f = torch.load(initial_checkpoint, map_location=lambda storage, loc: storage) start_iteration = f['iteration'] start_epoch = f['epoch'] state_dict = f['state_dict'] #decoder_state_dict = f['decoder_state_dict'] #decoder_state_dict = { k.replace('image_pos','image_pos1'):v for k,v, in decoder_state_dict.items()} net.load_state_dict(state_dict, strict=True) # True else: start_iteration = 0 start_epoch = 0 log.write('** net setting **\n') log.write('encoder_checkpoint : %s\n' % (encoder_checkpoint)) log.write('initial_checkpoint : %s\n' % (initial_checkpoint)) log.write('\n') # ----------------------------------------------- if 0: ##freeze for p in net.encoder.parameters(): p.requires_grad = False optimizer = Lookahead(RAdam(filter(lambda p: p.requires_grad, net.parameters()), lr=start_lr), alpha=0.5, k=5) # optimizer = RAdam(filter(lambda p: p.requires_grad, net.parameters()),lr=start_lr) num_iteration = 80000 * 1000 iter_log = 1000 iter_valid = 1000 iter_save = list(range(0, num_iteration, 1000)) # 1*1000 log.write('optimizer\n %s\n' % (optimizer)) log.write('\n') ## start training here! ############################################## log.write('** start training here! 
**\n') log.write(' is_mixed_precision = %s \n' % str(is_mixed_precision)) log.write(' batch_size = %d\n' % (batch_size)) log.write(' experiment = %s\n' % str(__file__.split('/')[-2:])) log.write( ' |----- VALID ---|---- TRAIN/BATCH --------------\n' ) log.write( 'rate iter epoch | loss lb(lev) | loss0 loss1 | time \n' ) log.write( '----------------------------------------------------------------------\n' ) # 0.00000 0.00* 0.00 | 0.000 0.000 | 0.000 0.000 | 0 hr 00 min def message(mode='print'): if mode == ('print'): asterisk = ' ' loss = batch_loss if mode == ('log'): asterisk = '*' if iteration in iter_save else ' ' loss = train_loss text = \ '%0.5f %5.4f%s %4.2f | ' % (rate, iteration / 10000, asterisk, epoch,) + \ '%4.3f %4.3f | ' % (*valid_loss,) + \ '%4.3f %4.3f %4.3f | ' % (*loss,) + \ '%s' % (time_to_str(timer() - start_timer, 'min')) return text # ---- valid_loss = np.zeros(2, np.float32) train_loss = np.zeros(3, np.float32) batch_loss = np.zeros_like(train_loss) sum_train_loss = np.zeros_like(train_loss) sum_train = 0 loss0 = torch.FloatTensor([0]).cuda().sum() loss1 = torch.FloatTensor([0]).cuda().sum() loss2 = torch.FloatTensor([0]).cuda().sum() start_timer = timer() iteration = start_iteration epoch = start_epoch rate = 0 while iteration < num_iteration: for t, batch in enumerate(train_loader): if iteration in iter_save: if iteration != start_iteration: torch.save( { 'state_dict': net.state_dict(), 'iteration': iteration, 'epoch': epoch, }, out_dir + '/checkpoint/%08d_model.pth' % (iteration)) pass if (iteration % iter_valid == 0): #if iteration != start_iteration: valid_loss = do_valid(net, tokenizer, valid_loader) # pass if (iteration % iter_log == 0): print('\r', end='', flush=True) log.write(message(mode='log') + '\n') # learning rate schduler ------------ rate = get_learning_rate(optimizer) # one iteration update ------------- batch_size = len(batch['index']) image = batch['image'].cuda() token = batch['token'].cuda() length = batch['length'] # ---- net.train() optimizer.zero_grad() # encoder.eval() # with torch.no_grad(): # image_embed = encoder(image) if is_mixed_precision: with amp.autocast(): #assert(False) logit = data_parallel(net, (image, token, length)) loss0 = seq_cross_entropy_loss(logit, token, length) #teacher forcing, exposure bias if 0: #use wrong prediction as input probability = F.softmax(logit, -1) predict = logit.argmax(-1) a = (torch.rand(token[:, 1:].shape) > 0.8).float( ).cuda() #<todo> sample according to probability fake_token = token.clone() fake_token[:, 1:] = a * fake_token[:, 1:] + ( 1 - a) * predict[:, :-1] logit1 = data_parallel(net, (image, fake_token, length)) loss1 = seq_cross_entropy_loss(logit1, token, length) pass scaler.scale(loss0).backward() # scaler.scale(loss1).backward() # scaler.scale(loss0 + 0.1*loss1).backward() #scaler.unscale_(optimizer) #torch.nn.utils.clip_grad_norm_(net.parameters(), 2) scaler.step(optimizer) scaler.update() else: assert False # print('fp32') logit = net(image, token, length) loss0 = seq_cross_entropy_loss(logit, token, length) (loss0).backward() optimizer.step() # print statistics -------- epoch += 1 / len(train_loader) iteration += 1 batch_loss = np.array([loss0.item(), loss1.item(), loss2.item()]) sum_train_loss += batch_loss sum_train += 1 if iteration % 100 == 0: train_loss = sum_train_loss / (sum_train + 1e-12) sum_train_loss[...] = 0 sum_train = 0 print('\r', end='', flush=True) print(message(mode='print'), end='', flush=True) # debug-------------------------- if debug: pass log.write('\n')
def main(_A: argparse.Namespace): if _A.num_gpus_per_machine == 0: # Set device as CPU if num_gpus_per_machine = 0. device: Any = torch.device("cpu") else: # Get the current device as set for current distributed process. # Check `launch` function in `virtex.utils.distributed` module. device = torch.cuda.current_device() # Create a config object (this will be immutable) and perform common setup # such as logging and setting up serialization directory. _C = Config(_A.config, _A.config_override) common_setup(_C, _A) # ------------------------------------------------------------------------- # INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER # ------------------------------------------------------------------------- train_dataset = PretrainingDatasetFactory.from_config(_C, split="train") val_dataset = PretrainingDatasetFactory.from_config(_C, split="val") # Make `DistributedSampler`s to shard datasets across GPU processes. # Skip this if training on CPUs. train_sampler = ( DistributedSampler(train_dataset, shuffle=True) # type: ignore if _A.num_gpus_per_machine > 0 else None) val_sampler = ( DistributedSampler(val_dataset, shuffle=False) # type: ignore if _A.num_gpus_per_machine > 0 else None) train_dataloader = DataLoader( train_dataset, batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), sampler=train_sampler, shuffle=train_sampler is None, num_workers=_A.cpu_workers, pin_memory=True, drop_last=True, collate_fn=train_dataset.collate_fn, ) val_dataloader = DataLoader( val_dataset, batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), sampler=val_sampler, shuffle=False, num_workers=_A.cpu_workers, pin_memory=True, drop_last=False, collate_fn=val_dataset.collate_fn, ) model = PretrainingModelFactory.from_config(_C).to(device) optimizer = OptimizerFactory.from_config(_C, model.named_parameters()) scheduler = LRSchedulerFactory.from_config(_C, optimizer) # ------------------------------------------------------------------------- # BEFORE TRAINING STARTS # ------------------------------------------------------------------------- # Create a gradient scaler for automatic mixed precision. scaler = amp.GradScaler(enabled=_C.AMP) # Load checkpoint to resume training if specified. if _A.resume_from is not None: start_iteration = CheckpointManager( model=model, optimizer=optimizer, scheduler=scheduler, scaler=scaler, ).load(_A.resume_from) else: start_iteration = 0 # Create an iterator from dataloader to sample batches perpetually. train_dataloader_iter = cycle(train_dataloader, device, start_iteration) # Wrap model in DDP if using more than one processes. if dist.get_world_size() > 1: dist.synchronize() model = nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) # Keep track of time per iteration and ETA. timer = Timer(start_from=start_iteration + 1, total_iterations=_C.OPTIM.NUM_ITERATIONS) # Create tensorboard writer and checkpoint manager (only in master process). 
if dist.is_master_process(): tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) tensorboard_writer.add_text("config", f"```\n{_C}\n```") checkpoint_manager = CheckpointManager( _A.serialization_dir, model=model, optimizer=optimizer, scheduler=scheduler, scaler=scaler, ) # ------------------------------------------------------------------------- # TRAINING LOOP # ------------------------------------------------------------------------- for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1): timer.tic() optimizer.zero_grad() batch = next(train_dataloader_iter) with amp.autocast(enabled=_C.AMP): output_dict = model(batch) loss = output_dict["loss"] scaler.scale(loss).backward() # First clip norm of gradients, and then perform optimizer step. scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), _C.OPTIM.CLIP_GRAD_NORM) scaler.step(optimizer) scaler.update() scheduler.step() timer.toc() # --------------------------------------------------------------------- # LOGGING # --------------------------------------------------------------------- if iteration % _A.log_every == 0: logger.info( f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]" ) if dist.is_master_process(): tensorboard_writer.add_scalars( "learning_rate", { "visual": optimizer.param_groups[0]["lr"], "common": optimizer.param_groups[-1]["lr"], }, iteration, ) tensorboard_writer.add_scalars("train", output_dict["loss_components"], iteration) # --------------------------------------------------------------------- # VALIDATION # --------------------------------------------------------------------- if iteration % _A.checkpoint_every == 0: if dist.is_master_process(): checkpoint_manager.step(iteration) # All processes will wait till master process is done serializing. dist.synchronize() torch.set_grad_enabled(False) model.eval() # Accumulate different val loss components according to the type of # pretraining model. val_loss_counter: Counter = Counter() for val_iteration, val_batch in enumerate(val_dataloader, start=1): for key in val_batch: val_batch[key] = val_batch[key].to(device) output_dict = model(val_batch) val_loss_counter.update(output_dict["loss_components"]) # Divide each loss component by number of val batches per GPU. val_loss_dict = { k: v / val_iteration for k, v in dict(val_loss_counter).items() } dist.average_across_processes(val_loss_dict) torch.set_grad_enabled(True) model.train() logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]") if dist.is_master_process(): tensorboard_writer.add_scalars("val", val_loss_dict, iteration)
def train_one_epoch(model, train_loader, losses, optimizer, scheduler, epoch):
    global batch_step
    epoch_start_time = time.time()
    logger.info(
        'training',
        'Start training epoch-{}, lr={:.6}'.format(epoch, get_lr_from_optim(optimizer)))

    scaler = amp.GradScaler()
    model.train()
    history = collections.defaultdict(list)
    for i, (imgs, labels) in enumerate(train_loader):
        batch = i + 1
        batch_start_time = time.time()
        imgs, labels = imgs.cuda(), labels.cuda()

        with amp.autocast():
            f_bn, p, f_mask = model(imgs)
            ce_loss = losses['cross_entropy_loss'](p, labels)
            triplet_hard_loss, _ = losses['triplet_hard_loss'](f_bn, f_mask, labels)
            loss = Config.weight_ce * ce_loss
            loss += Config.weight_tri * triplet_hard_loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        sortmask = torch.sort(f_mask, dim=1)[0]
        bottom20mean = sortmask[:, :20].mean()
        top20mean = sortmask[:, -20:].mean()

        acc = accuracy(p, labels)[0]
        batch_end_time = time.time()
        time_spent = batch_end_time - batch_start_time
        dist_ap, dist_an = losses['triplet_hard_loss'].get_mean_hard_dist()

        perform = {
            'ce_loss': float(Config.weight_ce * ce_loss),
            'triplet_hard_loss_a': float(Config.weight_tri * triplet_hard_loss),
            'dist_ap_hard': float(dist_ap),
            'dist_an_hard': float(dist_an),
            'accuracy': float(acc),
            'mtop20': float(top20mean),
            'mbot20': float(bottom20mean),
            'time(s)': float(time_spent)
        }

        if i % Config.batch_per_log == 0:
            stage = (epoch, batch)
            text = ''
            for k, v in perform.items():
                text += '|{}:{:<8.4f} '.format(k, float(v))
            logger.info('training', text, stage=stage)

        for k, v in perform.items():
            history[k].append(float(v))
            if k != 'time(s)':
                logger.add_scalar('TRAIN_b/' + k, v, batch_step)
        batch_step += 1

    scheduler.step()
    epoch_end_time = time.time()
    time_spent = sec2min_sec(epoch_start_time, epoch_end_time)
    text = 'Finish training epoch {}, time spent: {}mins {}secs, performance:\n##'.format(
        epoch, time_spent[0], time_spent[1])
    for k, vlist in history.items():
        v = mean(vlist)
        text += '|{}:{:>5.4f} '.format(k, v)
        if k != 'time(s)':
            logger.add_scalar('TRAIN_e/' + k, v, epoch)
    logger.info('training', text)
def __init__(self, args):
    self.args = args
    # Set the random initialization seed for reproducibility.
    init_torch_seeds(args.manualSeed)

    logger.info("Load training dataset")
    # Build the train/test datasets and dataloaders.
    train_dataset = BaseTrainDataset(root=os.path.join(args.data, "train"))
    test_dataset = BaseTestDataset(root=os.path.join(args.data, "test"),
                                   image_size=args.image_size)
    self.train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=int(args.workers))
    self.test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=int(args.workers))

    logger.info(
        f"Train Dataset information:\n"
        f"\tTrain Dataset dir is `{os.getcwd()}/{args.data}/train`\n"
        f"\tBatch size is {args.batch_size}\n"
        f"\tWorkers is {int(args.workers)}\n"
        f"\tLoad dataset to CUDA")
    logger.info(
        f"Test Dataset information:\n"
        f"\tTest Dataset dir is `{os.getcwd()}/{args.data}/test`\n"
        f"\tBatch size is {args.batch_size}\n"
        f"\tWorkers is {int(args.workers)}\n"
        f"\tLoad dataset to CUDA")

    # Construct the generator and discriminator network architectures.
    self.device = select_device(args.device, batch_size=1)
    if args.pretrained:
        logger.info(f"Using pre-trained model `{args.arch}`")
        self.generator = models.__dict__[args.arch](pretrained=True).to(self.device)
    else:
        logger.info(f"Creating model `{args.arch}`")
        self.generator = models.__dict__[args.arch]().to(self.device)
    logger.info("Creating discriminator model")
    self.discriminator = discriminator().to(self.device)

    # Parameters of the PSNR pre-training stage.
    self.start_psnr_epoch = math.floor(args.start_psnr_iter / len(self.train_dataloader))
    self.psnr_epochs = math.ceil(args.psnr_iters / len(self.train_dataloader))
    self.psnr_optimizer = torch.optim.Adam(self.generator.parameters(),
                                           lr=args.lr,
                                           betas=(0.9, 0.999))
    logger.info(f"Pre-training model training parameters:\n"
                f"\tIters is {args.psnr_iters}\n"
                f"\tEpoch is {self.psnr_epochs}\n"
                f"\tOptimizer Adam\n"
                f"\tLearning rate {args.lr}\n"
                f"\tBetas (0.9, 0.999)")

    # Create a SummaryWriter at the beginning of training.
    self.psnr_writer = SummaryWriter(f"runs/SRResNet_{int(time.time())}_logs")
    self.gan_writer = SummaryWriter(f"runs/SRGAN_{int(time.time())}_logs")

    # Creates a GradScaler once at the beginning of training.
    self.scaler = amp.GradScaler()
    logger.info("Turn on mixed precision training.")

    # Parameters of the GAN training stage.
    self.start_epoch = math.floor(args.start_iter / len(self.train_dataloader))
    self.epochs = math.ceil(args.iters / len(self.train_dataloader))
    self.discriminator_optimizer = torch.optim.Adam(
        self.discriminator.parameters(), lr=args.lr, betas=(0.9, 0.999))
    self.generator_optimizer = torch.optim.Adam(
        self.generator.parameters(), lr=args.lr, betas=(0.9, 0.999))
    self.discriminator_scheduler = torch.optim.lr_scheduler.StepLR(
        self.discriminator_optimizer, step_size=self.epochs // 2, gamma=0.1)
    self.generator_scheduler = torch.optim.lr_scheduler.StepLR(
        self.generator_optimizer, step_size=self.epochs // 2, gamma=0.1)
    logger.info(f"All model training parameters:\n"
                f"\tIters is {args.iters}\n"
                f"\tEpoch is {self.epochs}\n"
                f"\tOptimizer is Adam\n"
                f"\tLearning rate is {args.lr}\n"
                f"\tBetas is (0.9, 0.999)\n"
                f"\tScheduler is StepLR")

    # Loss = pixel loss + 0.006 * perceptual loss + 0.001 * adversarial loss.
    self.pixel_criterion = nn.MSELoss().to(self.device)
    self.adversarial_criterion = nn.BCELoss().to(self.device)
    self.perceptual_criterion = VGGLoss().to(self.device)
    # LPIPS evaluation.
    self.lpips_criterion = lpips.LPIPS(net="vgg", verbose=False).to(self.device)
    logger.info(f"Loss function:\n"
                f"\tPixel loss is MSELoss\n"
                f"\tPerceptual loss is VGGLoss\n"
                f"\tAdversarial loss is BCELoss")
def fit(
    model: nn.Module,
    epochs: int,
    train_loader,
    val_loader,
    device: str,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    fp16: bool = False,
):
    """
    A fit function that performs training for a certain number of epochs.
    Args:
        model : A PyTorch Faster RCNN model.
        epochs : Number of epochs to train.
        train_loader : Train loader.
        val_loader : Validation loader.
        device : "cuda" or "cpu".
        optimizer : PyTorch optimizer.
        scheduler : (optional) Learning rate scheduler.
        num_batches : (optional) Integer to limit validation to a certain number of batches.
        log_interval : (optional) Default 100. Log after every `log_interval` batches in each epoch.
        fp16 : (optional) Use mixed precision training with the float16 dtype.
    """
    history = {}
    train_loss = []
    val_iou = []
    val_giou = []

    if fp16 is True:
        print("Training with Mixed precision fp16 scaler")
        scaler = amp.GradScaler()
    else:
        scaler = None

    for epoch in tqdm(range(epochs)):
        print()
        print(f"Training Epoch = {epoch}")
        train_metrics = train_step(model, train_loader, device, optimizer,
                                   scheduler, num_batches, log_interval, scaler)
        val_metrics = val_step(model, val_loader, device, num_batches, log_interval)

        # Possibly we can use individual losses
        train_loss.append(train_metrics["total_loss"])

        avg_iou = val_metrics["iou"]
        avg_giou = val_metrics["giou"]
        val_iou.append(avg_iou)
        val_giou.append(avg_giou)

        history = {
            "train": {
                "train_loss": train_loss
            },
            "val": {
                "val_iou": val_iou,
                "val_giou": val_giou
            }
        }

    return history
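# fit() above delegates each epoch to a train_step() helper and passes the
# optional GradScaler through; the body of train_step is not shown in this
# collection. A hypothetical sketch of how such a helper might branch on
# `scaler`; a detection model that returns a dict of losses in train mode
# (as in the Faster RCNN-style model named in the docstring) is assumed.
import torch
from torch.cuda import amp


def train_step_sketch(model, train_loader, device, optimizer, scaler=None):
    model.train()
    total_loss = 0.0
    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        if scaler is not None:                       # mixed precision path
            with amp.autocast():
                loss_dict = model(images, targets)
                loss = sum(loss_dict.values())
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:                                        # full precision path
            loss_dict = model(images, targets)
            loss = sum(loss_dict.values())
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
    return {"total_loss": total_loss / max(len(train_loader), 1)}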
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args.tensorboard_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=self.args.dataloader_num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [p for n, p in model.named_parameters() if n in params] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend( [ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] ) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps if args.optimizer == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) elif args.optimizer == "Adafactor": optimizer = Adafactor( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adafactor_eps, clip_threshold=args.adafactor_clip_threshold, decay_rate=args.adafactor_decay_rate, beta1=args.adafactor_beta1, weight_decay=args.weight_decay, scale_parameter=args.adafactor_scale_parameter, relative_step=args.adafactor_relative_step, warmup_init=args.adafactor_warmup_init, ) print("Using Adafactor for T5") else: raise ValueError( "{} is not a valid optimizer class. 
Please use one of ('AdamW', 'Adafactor') instead.".format( args.optimizer ) ) if args.scheduler == "constant_schedule": scheduler = get_constant_schedule(optimizer) elif args.scheduler == "constant_schedule_with_warmup": scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps) elif args.scheduler == "linear_schedule_with_warmup": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) elif args.scheduler == "cosine_schedule_with_warmup": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "polynomial_decay_schedule_with_warmup": scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, lr_end=args.polynomial_decay_schedule_lr_end, power=args.polynomial_decay_schedule_lr_end, ) else: raise ValueError("{} is not a valid scheduler.".format(args.scheduler)) if ( args.model_name and os.path.isfile(os.path.join(args.model_name, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name, "scheduler.pt"))) if args.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.model_name and os.path.exists(args.model_name): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name.split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores(**kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for current_epoch in train_iterator: model.train() if epochs_trained > 0: epochs_trained -= 1 continue train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = 
tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) if args.fp16: with amp.autocast(): outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: batch_iterator.set_description( f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}" ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) if args.optimizer == "AdamW": torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project or self.is_sweeping: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, } ) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self.save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model( 
args.best_model_dir, optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) epoch_number += 1 output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) self.save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} 
steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
def with_amp(self, **kwargs):
    """Enable native mixed-precision training and return self (builder style); kwargs go to amp.GradScaler."""
    self.use_amp = True
    self.amp_scaler = amp.GradScaler(**kwargs)
    return self
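# A hedged usage sketch for the builder-style with_amp() above. TinyAmpTrainer is a
# hypothetical stand-in for the surrounding trainer class (not from the source); the
# point is only that with_amp() flips the flag, builds the GradScaler, and returns
# self so it can be chained onto construction.
from torch import nn, optim
from torch.cuda import amp


class TinyAmpTrainer:
    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        self.use_amp = False
        self.amp_scaler = None

    def with_amp(self, **kwargs):
        self.use_amp = True
        self.amp_scaler = amp.GradScaler(**kwargs)
        return self

    def step(self, x, y):
        self.optimizer.zero_grad()
        with amp.autocast(enabled=self.use_amp):
            loss = nn.functional.mse_loss(self.model(x), y)
        if self.use_amp:
            self.amp_scaler.scale(loss).backward()
            self.amp_scaler.step(self.optimizer)
            self.amp_scaler.update()
        else:
            loss.backward()
            self.optimizer.step()
        return loss.item()


# Chained construction, which is what returning `self` enables (needs a CUDA device for AMP):
#   model = nn.Linear(8, 1).cuda()
#   trainer = TinyAmpTrainer(model, optim.SGD(model.parameters(), lr=0.1)).with_amp()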
def train(hyp, opt, device, tb_writer=None): print(f'Hyperparameters {hyp}') log_dir = tb_writer.log_dir if tb_writer else 'runs/evolve' # run directory wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = log_dir + os.sep + 'results.txt' epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank # Save run settings with open(Path(log_dir) / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(Path(log_dir) / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict # train_path = data_dict['train'] train_path = f'{opt.trainset_path}/images/train' # test_path = data_dict['val'] test_path = f'{opt.trainset_path}/images/test' nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Remove previous results if rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model if opt.not_use_SE: model = Model(opt.cfg, nc=nc).to(device) else: model = ModelSE(opt.cfg, nc=nc).to(device) print(model) exit() # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. 
# in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Load Model with torch_distributed_zero_first(rank): attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: exclude = ['anchor'] # exclude keys ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and not any(x in k for x in exclude) and model.state_dict()[k].shape == v.shape } model.load_state_dict(ckpt['model'], strict=False) print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights)) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: # model = torch.nn.DataParallel(model) model = torch.nn.DataParallel(model, device_ids=[0, 1]) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[rank], output_device=rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloaderc if rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred = model(imgs) # print([x.shape for x in pred]) # [1, 3, 76, 76, 25] [1, 3, 38, 38, 25] [1, 3, 19, 19, 25]) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # if not torch.isfinite(loss): # print('WARNING: non-finite loss, ending training ', loss_items) # return results # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: 
tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
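# The nominal-batch-size bookkeeping used by the loop above, evaluated in isolation:
# with a nominal batch of 64 and an actual batch of 16, gradients are accumulated over
# 4 batches per optimizer step and weight decay is rescaled so the effective
# regularisation matches the nominal setting. The numbers below are illustrative only.
nbs = 64                                             # nominal batch size
total_batch_size = 16                                # actual batch size across all GPUs
accumulate = max(round(nbs / total_batch_size), 1)   # -> 4 batches per optimizer step
weight_decay = 5e-4 * total_batch_size * accumulate / nbs   # -> 5e-4 again, since 16 * 4 == 64

step_batches = [ni for ni in range(12) if ni % accumulate == 0]
print(accumulate, weight_decay, step_batches)        # 4 0.0005 [0, 4, 8]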
def train_model(self, device=None): if not device: device = self.device logging.info("train_model self.device: " + str(device)) self.model.to(device) args = self.args no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [ p for n, p in self.model.named_parameters() if n in params ] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in self.model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend([ { "params": [ p for n, p in self.model.named_parameters() if n not in custom_parameter_names and not any( nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in self.model.named_parameters() if n not in custom_parameter_names and any( nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ]) iteration_in_total = len( self.train_dl) // args.gradient_accumulation_steps * args.epochs optimizer, scheduler = self.build_optimizer(self.model, iteration_in_total) # warmup_steps = math.ceil(t_total * args.warmup_ratio) # args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps if args.n_gpu > 1: self.model = torch.nn.DataParallel(self.model) global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 self.model.zero_grad() # train_iterator = trange(int(args.epochs), desc="Epoch", disable=args.silent, mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores() if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() if self.args.fl_algorithm == "FedProx": global_model = copy.deepcopy(self.model) # for current_epoch in train_iterator: # model.train() for epoch in range(0, args.epochs): self.model.train() for batch_idx, batch in enumerate(self.train_dl): # print(batch) # batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) if args.fp16: with amp.autocast(): outputs = self.model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] else: outputs = self.model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if self.args.fl_algorithm == "FedProx": fed_prox_reg = 0.0 mu = self.args.fedprox_mu for (p, g_p) in zip(self.model.parameters(), global_model.parameters()): fed_prox_reg += ((mu / 2) * torch.norm( (p - g_p.data))**2) loss += fed_prox_reg current_loss = loss.item() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() 
logging.info( "epoch = %d, batch_idx = %d/%d, loss = %s" % (epoch, batch_idx, len(self.train_dl), current_loss)) if (batch_idx + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.max_grad_norm) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule self.model.zero_grad() global_step += 1 return global_step, tr_loss / global_step
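# The FedProx regulariser added to the loss above, pulled out as a standalone helper
# for clarity. This is a sketch of the same formula, (mu / 2) * sum ||w - w_global||^2,
# not the trainer's own API; `local_model` / `global_model` are illustrative names.
import torch


def fedprox_penalty(local_model, global_model, mu):
    penalty = 0.0
    for p, g_p in zip(local_model.parameters(), global_model.parameters()):
        # g_p.data is detached, so the term only pulls the local weights toward the
        # frozen global weights received at the start of the round.
        penalty = penalty + torch.norm(p - g_p.data) ** 2
    return (mu / 2.0) * penalty


# Usage inside the local update: loss = task_loss + fedprox_penalty(model, global_model, mu=0.01)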
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(f"Hyperparameters {hyp}") save_dir, epochs, batch_size, total_batch_size, weights, rank = ( Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank, ) # Directories wdir = save_dir / "weights" wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / "last.pt" best = wdir / "best.pt" results_file = save_dir / "results.txt" # Save run settings with open(save_dir / "hyp.yaml", "w") as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / "opt.yaml", "w") as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != "cpu" init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict["train"] test_path = data_dict["val"] nc, names = ( (1, ["item"]) if opt.single_cls else (int(data_dict["nc"]), data_dict["names"]) ) # number classes, names assert len(names) == nc, "%g names found for nc=%g dataset in %s" % ( len(names), nc, opt.data, ) # check # Model pretrained = weights.endswith(".pt") if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get("anchors"): ckpt["model"].yaml["anchors"] = round(hyp["anchors"]) # force autoanchor model = Model(opt.cfg or ckpt["model"].yaml, ch=3, nc=nc).to(device) # create exclude = ["anchor"] if opt.cfg or hyp.get("anchors") else [] # exclude keys state_dict = ckpt["model"].float().state_dict() # to FP32 state_dict = intersect_dicts( state_dict, model.state_dict(), exclude=exclude ) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( "Transferred %g/%g items from %s" % (len(state_dict), len(model.state_dict()), weights) ) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print("freezing %s" % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max( round(nbs / total_batch_size), 1 ) # accumulate loss before optimizing hyp["weight_decay"] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam( pg0, lr=hyp["lr0"], betas=(hyp["momentum"], 0.999) ) # adjust beta1 to momentum else: optimizer = optim.SGD( pg0, lr=hyp["lr0"], momentum=hyp["momentum"], nesterov=True ) optimizer.add_param_group( {"params": pg1, "weight_decay": hyp["weight_decay"]} ) # add pg1 with weight_decay optimizer.add_param_group({"params": pg2}) # add pg2 (biases) logger.info( "Optimizer groups: %g .bias, %g conv.weight, %g other" % (len(pg2), len(pg1), len(pg0)) ) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = ( lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp["lrf"]) + hyp["lrf"] ) # cosine 
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project="YOLOv5" if opt.project == "runs/train" else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get("wandb_id") if "ckpt" in locals() else None, ) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt["optimizer"] is not None: optimizer.load_state_dict(ckpt["optimizer"]) best_fitness = ckpt["best_fitness"] # Results if ckpt.get("training_results") is not None: with open(results_file, "w") as file: file.write(ckpt["training_results"]) # write results.txt # Epochs start_epoch = ckpt["epoch"] + 1 if opt.resume: assert ( start_epoch > 0 ), "%s training to %g epochs is finished, nothing to resume." % ( weights, epochs, ) if epochs < start_epoch: logger.info( "%s has been trained for %g epochs. Fine-tuning for %g additional epochs." % (weights, ckpt["epoch"], epochs) ) epochs += ckpt["epoch"] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [ check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info("Using SyncBatchNorm()") # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader( train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, ) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert ( mlc < nc ), "Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g" % ( mlc, nc, opt.data, nc - 1, ) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, )[ 0 ] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir=save_dir) if tb_writer: tb_writer.add_histogram("classes", c, 0) if wandb: wandb.log( { "Labels": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob("*labels*.png") ] } ) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz) # Model parameters hyp["cls"] *= nc / 80.0 # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device ) # attach class weights model.names = names # Start training t0 = time.time() nw = max( round(hyp["warmup_epochs"] * nb), 1000 ) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info( "Image sizes %g train, %g test\n" "Using %g dataloader workers\nLogging results to %s\n" "Starting training for %g epochs..." % (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs) ) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = ( model.class_weights.cpu().numpy() * (1 - maps) ** 2 ) # class weights iw = labels_to_image_weights( dataset.labels, nc=nc, class_weights=cw ) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n ) # rand weighted idx # Broadcast if DDP if rank != -1: indices = ( torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n) ).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ("\n" + "%10s" * 8) % ("Epoch", "gpu_mem", "box", "obj", "cls", "total", "targets", "img_size") ) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _, ) in ( pbar ): # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = ( imgs.to(device, non_blocking=True).float() / 255.0 ) # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round() ) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x["lr"] = np.interp( ni, xi, [ hyp["warmup_bias_lr"] if j == 2 else 0.0, x["initial_lr"] * lf(epoch), ], ) if "momentum" in x: x["momentum"] = np.interp( ni, xi, [hyp["warmup_momentum"], hyp["momentum"]] ) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale 
factor if sf != 1: ns = [ math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate( imgs, size=ns, mode="bilinear", align_corners=False ) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device), model ) # loss scaled by batch_size if rank != -1: loss *= ( opt.world_size ) # gradient averaged between devices in DDP mode # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = "%.3gG" % ( torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0 ) # (GB) s = ("%10s" * 2 + "%10.4g" * 6) % ( "%g/%g" % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1], ) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f"train_batch{ni}.jpg" # filename plot_images(images=imgs, targets=targets, paths=paths, fname=f) # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 3 and wandb: wandb.log( { "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob("train*.jpg") ] } ) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x["lr"] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=["yaml", "nc", "hyp", "gr", "names", "stride"] ) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0, ) # Write with open(results_file, "a") as f: f.write( s + "%10.4g" * 7 % results + "\n" ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system( "gsutil cp %s gs://%s/results/results%s.txt" % (results_file, opt.bucket, opt.name) ) # Log tags = [ "train/box_loss", "train/obj_loss", "train/cls_loss", # train loss "metrics/precision", "metrics/recall", "metrics/mAP_0.5", "metrics/mAP_0.5:0.95", "val/box_loss", "val/obj_loss", "val/cls_loss", # val loss "x/lr0", "x/lr1", "x/lr2", ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}) # W&B # Update best mAP fi = fitness( np.array(results).reshape(1, -1) ) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, "r") as f: # create checkpoint ckpt = { "epoch": epoch, "best_fitness": best_fitness, "training_results": f.read(), "model": ema.ema, "optimizer": None if final_epoch else optimizer.state_dict(), "wandb_id": wandb_run.id if wandb else None, } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch 
---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else "" fresults, flast, fbest = ( save_dir / f"results{n}.txt", wdir / f"last{n}.pt", wdir / f"best{n}.pt", ) for f1, f2 in zip( [wdir / "last.pt", wdir / "best.pt", results_file], [flast, fbest, fresults] ): if f1.exists(): os.rename(f1, f2) # rename if str(f2).endswith(".pt"): # is *.pt strip_optimizer(f2) # strip optimizer os.system( "gsutil cp %s gs://%s/weights" % (f2, opt.bucket) ) if opt.bucket else None # upload # Finish if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = [ "results.png", "precision_recall_curve.png", "confusion_matrix.png", ] wandb.log( { "Results": [ wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists() ] } ) logger.info( "%g epochs completed in %.3f hours.\n" % (epoch - start_epoch + 1, (time.time() - t0) / 3600) ) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
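# Several of the loops above maintain an exponential moving average of the weights
# (ModelEMA) and evaluate/checkpoint the averaged copy. A minimal sketch of the core
# update follows; the real helper also ramps the decay and handles non-float buffers,
# so this is an illustration of the idea, not the actual ModelEMA implementation.
import copy

import torch


class SimpleEMA:
    def __init__(self, model, decay=0.999):
        self.ema = copy.deepcopy(model).eval()   # averaged copy, kept out of autograd
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        msd = model.state_dict()
        for k, v in self.ema.state_dict().items():
            if v.dtype.is_floating_point:
                # new_ema = decay * old_ema + (1 - decay) * current_weight
                v.mul_(self.decay).add_(msd[k].detach(), alpha=1.0 - self.decay)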
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = str(log_dir / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) f = open(opt.data) data_config = json.load(f) trainset_paths = data_config['train'] test_paths = data_config['test'] dataset_root = data_config['root'] f.close() nc, names = (1, ['person']) # TODO: Use DDP logging. Only the first process is allowed to log. # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg else [] # exclude keys if type(ckpt['model']).__name__ == "OrderedDict": state_dict = ckpt['model'] else: state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Image sizes gs = int(max(model.stride)) # grid size (max stride) #imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples imgsz = opt.img_size # Trainloader dataloader, dataset = create_dataloader(dataset_root, trainset_paths, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=False, rank=rank, world_size=opt.world_size, workers=opt.workers, state="train") mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches #assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. 
testloader = cstrack.create_dataloader(dataset_root, test_paths, imgsz, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, state="test")[0] #MOT_loss mot_loss = MotLoss(dataset.nID, opt.emb_dim) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: base_optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum optimizer = Lookahead(optimizer=base_optimizer, k=5, alpha=0.5) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) optimizer.add_param_group({'params': mot_loss.parameters()}) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict start_epoch = opt.start_epoch # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank)) # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# model._initialize_biases(cf.to(device)) #plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors #if not opt.noautoanchor: # check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes (%g , %g)' % (imgsz[0], imgsz[1])) logger.info('Using %g dataloader workers' % dataloader.num_workers) logger.info('Starting training for %g epochs...' % epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(5, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'GIoU', 'idloss', 'dmloss', 'obj', 'total', 'targets', 'lr')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _, dense_mask ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 if opt.fineturn: x['lr'] = np.interp(ni, xi, [ 0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch) * 0.1 ]) else: x['lr'] = np.interp(ni, xi, [ 0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) for j, x in enumerate(optimizer.param_groups): if ni > nw and ni <= 20 * nb: if opt.fineturn: x['lr'] = x['initial_lr'] * 0.1 else: x['lr'] = x['initial_lr'] if ni > 20 * nb and ni <= 40 * nb: x['lr'] = x['initial_lr'] * 0.1 if ni > 40 * nb: x['lr'] = x['initial_lr'] * 0.1 lr_now = x['lr'] # Multi-scale if opt.multi_scale and ni > nw and epoch % 2 == 0: sz = random.randrange(imgsz[0] * 0.5, imgsz[0] * 1.0) // gs * 
gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred = model(imgs) # Loss loss, loss_items = mot_loss(pred, targets.to(device), dense_mask.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # if not torch.isfinite(loss): # logger.info('WARNING: non-finite loss, ending training ', loss_items) # return results # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 7) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], lr_now) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = eval.test( opt.data, batch_size=total_batch_size, imgsz=imgsz, model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 4 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 
'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish #if not opt.evolve: #plot_results(save_dir=log_dir) # save as results.png logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
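# The cosine multiplier passed to LambdaLR in the functions above, evaluated in
# isolation: it starts at 1.0 and decays along a half cosine to the configured floor
# (0.2, 0.1, or hyp['lrf'] depending on the variant). Example values are illustrative.
import math

epochs, lrf = 100, 0.2
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - lrf) + lrf

print([round(lf(e), 3) for e in (0, 25, 50, 75, 100)])   # [1.0, 0.883, 0.6, 0.317, 0.2]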
def train_sanity_fit(
    model: nn.Module,
    train_loader,
    device: str,
    num_batches: int = None,
    log_interval: int = 100,
    fp16: bool = False,
):
    """
    Performs a sanity fit over the train loader.
    Use this to dummy-check your train_step function. It does not calculate
    metrics or timing, and it does not checkpoint. It iterates over the
    train_loader for the given number of batches.
    Note:
        - It does not call loss.backward().
    Args:
        model : A PyTorch Faster RCNN model.
        train_loader : Train loader.
        device : "cuda" or "cpu"
        num_batches : (optional) Integer to limit the sanity fit to a certain
            number of batches. Useful if the data is too big even for a sanity check.
        log_interval : (optional) Default 100. Log after this many batches.
        fp16 : (optional) If True, uses PyTorch native mixed precision training.
    """
    model = model.to(device)
    model.train()
    cnt = 0
    last_idx = len(train_loader) - 1
    train_sanity_start = time.time()
    if fp16 is True:
        scaler = amp.GradScaler()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        images = list(image.to(device) for image in inputs)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        if fp16 is True:
            with amp.autocast():
                loss_dict = model(images, targets)
        else:
            loss_dict = model(images, targets)
        cnt += 1
        if last_batch or batch_idx % log_interval == 0:
            print(f"Train sanity check passed for batch till {batch_idx} batches")
        if num_batches is not None:
            if cnt >= num_batches:
                print(f"Done till {num_batches} train batches")
                print("All specified batches done")
                train_sanity_end = time.time()
                print(f"Train sanity fit check passed in time {train_sanity_end - train_sanity_start}")
                return True
    train_sanity_end = time.time()
    print("All specified batches done")
    print(f"Train sanity fit check passed in time {train_sanity_end - train_sanity_start}")
    return True
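# A hedged usage sketch for train_sanity_fit above. The detection model and the tiny
# synthetic loader are illustrative stand-ins (torchvision's Faster R-CNN and random
# tensors), not the repository's own data pipeline.
import torch
import torchvision


def _fake_detection_batch():
    images = [torch.rand(3, 64, 64) for _ in range(2)]
    targets = [{"boxes": torch.tensor([[4.0, 4.0, 40.0, 40.0]]),
                "labels": torch.tensor([1])} for _ in images]
    return images, targets


model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=2)
loader = [_fake_detection_batch() for _ in range(3)]   # any iterable of (inputs, targets) works
train_sanity_fit(model, loader, device="cpu", num_batches=2, log_interval=1)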
def run(self): ## init distributed self.cfg = init_distributed(self.cfg) cfg = self.cfg # cfg.print() ## parser_dict self.dictionary = self._parser_dict() ## parser_datasets datasets, dataloaders, data_samplers, dataset_sizes = self._parser_datasets( ) ## parser_model model_ft = self._parser_model() # Scale learning rate based on global batch size if cfg.SCALE_LR.ENABLED: cfg.INIT_LR = cfg.INIT_LR * float( self.batch_size) / cfg.SCALE_LR.VAL scaler = amp.GradScaler(enabled=True) if cfg.WARMUP.NAME is not None: logger.info('Start warm-up ... ') self.warm_up(scaler, model_ft, dataloaders['train'], cfg) logger.info('finish warm-up!') ## parser_optimizer optimizer_ft = build_optimizer(cfg, model_ft) ## parser_lr_scheduler lr_scheduler_ft = build_lr_scheduler(cfg, optimizer_ft) if cfg.distributed: model_ft = DDP(model_ft, device_ids=[cfg.local_rank], output_device=(cfg.local_rank)) # Freeze freeze_models(model_ft) if self.cfg.PRETRAIN_MODEL is not None: if self.cfg.RESUME: self.start_epoch = self.ckpts.resume_checkpoint( model_ft, optimizer_ft) else: self.start_epoch = self.ckpts.load_checkpoint( self.cfg.PRETRAIN_MODEL, model_ft, optimizer_ft) ## vis network graph if self.cfg.TENSORBOARD_MODEL and False: self.tb_writer.add_graph(model_ft, (model_ft.dummy_input.cuda(), )) self.steps_per_epoch = int(dataset_sizes['train'] // self.batch_size) best_acc = 0.0 best_perf_rst = None for epoch in range(self.start_epoch + 1, self.cfg.N_MAX_EPOCHS): if cfg.distributed: dataloaders['train'].sampler.set_epoch(epoch) self.train_epoch(scaler, epoch, model_ft, datasets['train'], dataloaders['train'], optimizer_ft) lr_scheduler_ft.step() if self.cfg.DATASET.VAL and ( not epoch % cfg.EVALUATOR.EVAL_INTERVALS or epoch == self.cfg.N_MAX_EPOCHS - 1): acc, perf_rst = self.val_epoch(epoch, model_ft, datasets['val'], dataloaders['val']) if cfg.local_rank == 0: # start to save best performance model after learning rate decay to 1e-6 if best_acc < acc: self.ckpts.autosave_checkpoint(model_ft, epoch, 'best', optimizer_ft) best_acc = acc best_perf_rst = perf_rst # continue if not epoch % cfg.N_EPOCHS_TO_SAVE_MODEL: if cfg.local_rank == 0: self.ckpts.autosave_checkpoint(model_ft, epoch, 'last', optimizer_ft) if best_perf_rst is not None: logger.info(best_perf_rst.replace("(val)", "(best)")) if cfg.local_rank == 0: self.tb_writer.close() dist.destroy_process_group() if cfg.local_rank != 0 else None torch.cuda.empty_cache()
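# The SCALE_LR branch in run() above is the standard linear scaling rule: the base
# learning rate is multiplied by global_batch_size / reference_batch_size. The numbers
# below are illustrative, not the project's actual config values.
INIT_LR, batch_size, reference_batch_size = 0.01, 256, 64
scaled_lr = INIT_LR * float(batch_size) / reference_batch_size
print(scaled_lr)   # 0.04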
def __init__( self, model: Model, optimizer: torch.optim.Optimizer, data_loader: DataLoader, patience: Optional[int] = None, validation_metric: str = "-loss", validation_data_loader: DataLoader = None, num_epochs: int = 20, serialization_dir: Optional[str] = None, checkpointer: Checkpointer = None, cuda_device: Optional[Union[int, torch.device]] = None, grad_norm: Optional[float] = None, grad_clipping: Optional[float] = None, learning_rate_scheduler: Optional[LearningRateScheduler] = None, momentum_scheduler: Optional[MomentumScheduler] = None, tensorboard_writer: TensorboardWriter = None, moving_average: Optional[MovingAverage] = None, batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, distributed: bool = False, local_rank: int = 0, world_size: int = 1, num_gradient_accumulation_steps: int = 1, use_amp: bool = False, ) -> None: super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size) # I am not calling move_to_gpu here, because if the model is # not already on the GPU then the optimizer is going to be wrong. self.model = model self.data_loader = data_loader self._validation_data_loader = validation_data_loader self.optimizer = optimizer if patience is None: # no early stopping if validation_data_loader is not None: logger.warning( "You provided a validation dataset but patience was set to None, " "meaning that early stopping is disabled" ) elif (not isinstance(patience, int)) or patience <= 0: raise ConfigurationError( '{} is an invalid value for "patience": it must be a positive integer ' "or None (if you want to disable early stopping)".format(patience) ) # For tracking is_best_so_far and should_stop_early self._metric_tracker = MetricTracker(patience, validation_metric) # Get rid of + or - self._validation_metric = validation_metric[1:] self._num_epochs = num_epochs if checkpointer is not None: self._checkpointer = checkpointer else: self._checkpointer = Checkpointer(serialization_dir) self._grad_norm = grad_norm self._grad_clipping = grad_clipping self._learning_rate_scheduler = learning_rate_scheduler self._momentum_scheduler = momentum_scheduler self._moving_average = moving_average self._batch_callbacks = batch_callbacks or [] self._epoch_callbacks = epoch_callbacks or [] # We keep the total batch number as an instance variable because it # is used inside a closure for the hook which logs activations in # `_enable_activation_logging`. self._batch_num_total = 0 self._tensorboard = tensorboard_writer or TensorboardWriter(serialization_dir) self._tensorboard.get_batch_num_total = lambda: self._batch_num_total self._tensorboard.enable_activation_logging(self.model) self._last_log = 0.0 # time of last logging self._num_gradient_accumulation_steps = num_gradient_accumulation_steps # Enable automatic mixed precision training. self._scaler: Optional[amp.GradScaler] = None self._use_amp = use_amp if self._use_amp: if self.cuda_device == torch.device("cpu"): raise ValueError("Using AMP requires a cuda device") self._scaler = amp.GradScaler() # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model` # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc. # # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the # normal case, reference to `Model` is retained. 
        # This reference is only used in these places: `model.__call__`,
        # `model.train` and `model.eval`.
        if self._distributed:
            self._pytorch_model = DistributedDataParallel(
                self.model,
                device_ids=None
                if self.cuda_device == torch.device("cpu")
                else [self.cuda_device],
                find_unused_parameters=True,
            )
        else:
            self._pytorch_model = self.model
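# The trainer above keeps an optional amp.GradScaler (self._scaler) alongside a
# grad_norm setting. A generic sketch of how that pair is usually combined in a single
# training step (the standard PyTorch pattern, not AllenNLP's exact implementation):
import torch
from torch import nn, optim
from torch.cuda import amp

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(16, 1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scaler = amp.GradScaler(enabled=(device == "cuda"))
grad_norm = 1.0

x, y = torch.randn(8, 16, device=device), torch.randn(8, 1, device=device)
with amp.autocast(enabled=(device == "cuda")):
    loss = nn.functional.mse_loss(model(x), y)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)                       # unscale before measuring/clipping gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm)
scaler.step(optimizer)                           # skips the step if any gradient overflowed
scaler.update()
optimizer.zero_grad()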
train_loader = Data.DataLoader(dataset=train_data,
                               batch_size=128 * 50,
                               shuffle=True)
test_loader = Data.DataLoader(dataset=test_data,
                              batch_size=128 * 50,
                              shuffle=False)
train_batch_num = len(train_loader)
test_batch_num = len(test_loader)

net = Network()
if torch.cuda.is_available():
    # net = nn.DataParallel(net)
    net.cuda()

# +++++++++++++++++++++++++++++++
scaler = amp.GradScaler()
# +++++++++++++++++++++++++++++++

opt = torch.optim.Adam(net.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

for epoch_index in range(10):
    st = time.time()
    torch.set_grad_enabled(True)
    net.train()
    for train_batch_index, (img_batch, label_batch) in enumerate(train_loader):
        if torch.cuda.is_available():
            img_batch = img_batch.cuda()
            label_batch = label_batch.cuda()
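        # Assumed continuation of the loop above (the original snippet ends here),
        # following the autocast/GradScaler pattern used throughout this collection;
        # `output` is an illustrative name for the network's prediction.
        opt.zero_grad()
        with amp.autocast():
            output = net(img_batch)
            loss = loss_func(output, label_batch)
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()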
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=opt.batch_size,
    shuffle=True,
    num_workers=opt.n_cpu,
    pin_memory=True,
    # pin_memory=True allocates the returned tensors in page-locked (pinned) host
    # memory, which makes copying them to GPU memory faster.
    collate_fn=dataset.collate_fn,  # controls how samples are batched; a custom function can implement exactly what we need
)
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                             lr=lr)  # Adam optimizer
scheduler = lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.95)
scheduler.last_epoch = opt.start_epoch - 1
scaler = amp.GradScaler(enabled=True)
Loss = YOLOLoss_total(opt.model_def)
metrics = [
    "grid_size",
    "loss",
    "x",
    "y",
    "w",
    "h",
    "conf",
    "cls",
    "cls_acc",
    "recall50",
    "recall75",
def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args["weight_decay"], }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total ) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["epochs"]), desc="Epoch", disable=args["silent"]) if args["fp16"]: from torch.cuda import amp scaler = amp.GradScaler() for _ in train_iterator: model.train() # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(tqdm(train_dataloader, desc=f"Running Training", disable=args["silent"])): batch = tuple(t.to(self.device) for t in batch) inputs = self._get_inputs_dict(batch) if args["fp16"]: with amp.autocast(): if self.sliding_window: outputs = model(inputs) else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) else: if self.sliding_window: outputs = model(inputs) else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["n_gpu"] > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) if args["fp16"]: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics if args["evaluate_during_training"]: # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint output_dir_current = 
os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        os.makedirs(output_dir_current, exist_ok=True)

                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(output_dir_current)
                        self.tokenizer.save_pretrained(output_dir_current)

        return (
            global_step,
            tr_loss / global_step if not self.args["evaluate_during_training"] else training_progress_scores,
        )
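# The checkpoint above only stores the model weights and the tokenizer. If a run should be
# resumable mid-training (including the AMP loss scale), a common pattern is to also persist
# the optimizer, scheduler and GradScaler states. This is a generic sketch, not part of the
# class above; `save_training_state` / `load_training_state` are hypothetical helpers.
import torch


def save_training_state(path, model, optimizer, scheduler, scaler, global_step):
    torch.save(
        {
            "global_step": global_step,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "scaler": scaler.state_dict() if scaler is not None else None,
        },
        path,
    )


def load_training_state(path, model, optimizer, scheduler, scaler):
    state = torch.load(path, map_location="cpu")
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    scheduler.load_state_dict(state["scheduler"])
    if scaler is not None and state.get("scaler") is not None:
        scaler.load_state_dict(state["scaler"])
    return state["global_step"]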
def run(gpu_id, options, distributed=False): if distributed: dist.init_process_group( backend="nccl", rank=gpu_id, world_size=options.num_gpus, init_method="env://", ) torch.cuda.set_device(gpu_id) torch.manual_seed(options.seed) use_cuda = torch.cuda.is_available() and not options.no_cuda device = torch.device("cuda" if use_cuda else "cpu") logger = lavd.Logger(options.name, disabled=gpu_id != 0) # Parser needs to be rebuilt, since it can't be serialised and it is needed to even # detect the number of GPUs, but here it's only used to log it. parser = build_parser() if gpu_id == 0 else None spinner = logger.spinner("Initialising") spinner.start() checkpoint = (default_checkpoint if options.checkpoint is None else load_checkpoint( os.path.join(options.checkpoint, "stats.pt"))) # Either use the checkpoint directory as the configuration or use one of the # available pre-trained models. pre_trained = options.checkpoint or options.pre_trained # All but the primary GPU wait here, so that only the primary process loads the # pre-trained model and the rest uses the cached version. if distributed and gpu_id != 0: torch.distributed.barrier() model_kind = checkpoint["model"].get("kind") or options.model_kind use_special = True masked_lm = True if model_kind == "bert": if pre_trained is None: pre_trained = "bert-base-german-cased" config = BertConfig.from_pretrained(pre_trained) model = BertForMaskedLM.from_pretrained(pre_trained, config=config) tokeniser = BertTokenizer.from_pretrained(pre_trained) elif model_kind == "bert-scratch": # The pre_trained here is only for the configuartion (num layers etc.) # But the weights are not loaded if pre_trained is None: pre_trained = "bert-base-german-cased" # Use either the provided vocabulary or the pre_trained one. vocab = options.vocab or pre_trained tokeniser = BertTokenizer.from_pretrained(vocab) config = BertConfig.from_pretrained(pre_trained) config.vocab_size = tokeniser.vocab_size model = BertForMaskedLM(config) elif model_kind == "gpt2": if pre_trained is None: pre_trained = "gpt2" config = GPT2Config.from_pretrained(pre_trained) model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config) tokeniser = GPT2Tokenizer.from_pretrained(pre_trained) masked_lm = False use_special = False elif model_kind == "gpt2-german": assert pre_trained is not None, "--pre-trained must be given for gpt2-german" config = GPT2Config.from_pretrained(pre_trained) model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config) # Using the XLNetTokenizer because the pre-trained German GPT-2 model uses # SentencePiece and that's easiest way to use it. # That also means that the automatic tokenisation cannot be done, because XLNet # uses different placing of the special tokens. tokeniser = XLNetTokenizer.from_pretrained( pre_trained, keep_accents=True, unk_token="<unk>", # start and end of sequence use the same token bos_token="<endoftext>", eos_token="<endoftext>", ) masked_lm = False use_special = False elif model_kind == "gpt2-scratch": # The pre_trained here is only for the configuartion (num layers etc.) # But the weights are not loaded if pre_trained is None: pre_trained = "gpt2" # Use either the provided vocabulary or the pre_trained one. 
vocab = options.vocab or pre_trained tokeniser = GPT2Tokenizer.from_pretrained(vocab) config = GPT2Config.from_pretrained(pre_trained) config.vocab_size = tokeniser.vocab_size model = GPT2LMHeadModel(config) masked_lm = False use_special = False else: raise Exception("No model available for {}".format(model_kind)) model = model.to(device) # Primary process has loaded the model and the other can now load the cached # version. if distributed and gpu_id == 0: torch.distributed.barrier() train_dataset = TextDataset( options.train_text, tokeniser, use_special=use_special, manual_special=model_kind == "gpt2-german", ) train_sampler = (DistributedSampler(train_dataset, num_replicas=options.num_gpus, rank=gpu_id) if distributed else None) train_data_loader = DataLoader( train_dataset, batch_size=options.batch_size, # Only shuffle when not using a sampler shuffle=train_sampler is None, num_workers=options.actual_num_workers, sampler=train_sampler, pin_memory=True, ) validation_data_loaders = [] for val_file in options.validation_text: vals = val_file.split("=", 1) if len(vals) > 1: # Remove whitespace around the name name = vals[0].strip() # Expand the ~ to the full path as it won't be done automatically since it's # not at the beginning of the word. file_path = os.path.expanduser(vals[1]) else: name = None file_path = vals[0] validation_dataset = TextDataset( file_path, tokeniser, name=name, use_special=use_special, manual_special=model_kind == "gpt2-german", ) validation_sampler = (DistributedSampler( validation_dataset, num_replicas=options.num_gpus, rank=gpu_id) if distributed else None) validation_data_loader = DataLoader( validation_dataset, batch_size=options.batch_size, # Only shuffle when not using a sampler shuffle=validation_sampler is None, num_workers=options.actual_num_workers, sampler=validation_sampler, pin_memory=True, ) validation_data_loaders.append(validation_data_loader) initial_lr = options.lr # Only restore the learning rate if resuming from a checkpoint and not manually # resetting the learning rate. if len(checkpoint["train"]["lr"]) > 0 and not options.reset_lr: initial_lr = checkpoint["train"]["lr"][-1] no_decay = ["bias", "LayerNorm.weight"] optimiser_grouped_parameters = [ { "params": [ param for name, param in model.named_parameters() if not any(nd in name for nd in no_decay) ], "weight_decay": options.weight_decay, }, { "params": [ param for name, param in model.named_parameters() if any(nd in name for nd in no_decay) ], "weight_decay": 0.0, }, ] optimiser = AdamW(optimiser_grouped_parameters, lr=initial_lr, eps=options.adam_eps) lr_scheduler = get_linear_schedule_with_warmup( optimiser, num_warmup_steps=options.lr_warmup, num_training_steps=options.num_epochs, ) amp_scaler = amp.GradScaler() if use_cuda and options.fp16 else None if distributed: model = DistributedDataParallel(model, device_ids=[gpu_id], find_unused_parameters=True) validation_details = [ OrderedDict( name=data_loader.dataset.name, path=data_loader.dataset.path, size=len(data_loader.dataset), ) for data_loader in validation_data_loaders ] experiment = OrderedDict( model_kind=model_kind, train=OrderedDict(path=train_dataset.path, size=len(train_dataset)), validation=validation_details, options=options, ) log_experiment(logger, experiment) logger.log_command(parser, options) # Wait for all processes to load eveything before starting training. # Not strictly necessary, since they will wait once the actual model is run, but # this makes it nicer to show the spinner until all of them are ready. 
if distributed: torch.distributed.barrier() spinner.stop() if options.checkpoint is not None: resume_text = "Resuming from - Epoch {epoch}".format( epoch=checkpoint["epoch"]) logger.set_prefix(resume_text) epoch_results = [ OrderedDict( name="Train", stats=OrderedDict( loss=checkpoint["train"]["stats"]["loss"][-1], perplexity=checkpoint["train"]["stats"]["perplexity"][-1], ), ) ] + [ OrderedDict( name=val_name, stats=OrderedDict( loss=val_result["stats"]["loss"][-1], perplexity=val_result["stats"]["perplexity"][-1], ), ) for val_name, val_result in checkpoint["validation"].items() ] log_epoch_stats(logger, epoch_results, metrics) train( logger, model, optimiser, train_data_loader, validation_data_loaders, lr_scheduler=lr_scheduler, device=device, num_epochs=options.num_epochs, checkpoint=checkpoint, model_kind=model_kind, amp_scaler=amp_scaler, masked_lm=masked_lm, )
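# Validation does not need the GradScaler at all: the scaler only matters for the backward
# pass and optimizer steps, while evaluation can still benefit from autocast. A generic
# sketch of that idea follows (not this project's actual validation routine; it assumes the
# loader yields HuggingFace-style dicts of tensors that include labels).
import torch
from torch.cuda import amp


@torch.no_grad()
def evaluate_perplexity(model, data_loader, device, use_amp=True):
    model.eval()
    total_loss, total_batches = 0.0, 0
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with amp.autocast(enabled=use_amp and device.type == "cuda"):
            loss = model(**batch)[0]  # loss is the first output when labels are passed
        total_loss += loss.item()
        total_batches += 1
    mean_loss = total_loss / max(total_batches, 1)
    return torch.exp(torch.tensor(mean_loss)).item()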
def __init__(self, cfg_path): with open(cfg_path, 'r') as rf: self.cfg = yaml.safe_load(rf) self.data_cfg = self.cfg['data'] self.model_cfg = self.cfg['model'] self.optim_cfg = self.cfg['optim'] self.val_cfg = self.cfg['val'] print(self.data_cfg) print(self.model_cfg) print(self.optim_cfg) print(self.val_cfg) self.tdata = MSCOCO( img_root=self.data_cfg['train_img_root'], ann_path=self.data_cfg['train_ann_path'], debug=self.data_cfg['debug'], augment=True, ) self.tloader = DataLoader(dataset=self.tdata, batch_size=self.data_cfg['batch_size'], num_workers=self.data_cfg['num_workers'], collate_fn=self.tdata.collate_fn, shuffle=True) self.vdata = MSCOCO( img_root=self.data_cfg['val_img_root'], ann_path=self.data_cfg['val_ann_path'], debug=False, augment=False, ) self.vloader = DataLoader(dataset=self.vdata, batch_size=self.data_cfg['batch_size'], num_workers=self.data_cfg['num_workers'], collate_fn=self.vdata.collate_fn, shuffle=False) print("train_data: ", len(self.tdata), " | ", "val_data: ", len(self.vdata)) print("train_iter: ", len(self.tloader), " | ", "val_iter: ", len(self.vloader)) model: torch.nn.Module = getattr( eval(self.model_cfg['type']), self.model_cfg['name'])(pretrained=self.model_cfg['pretrained'], num_classes=self.model_cfg['num_joints'], reduction=self.model_cfg['reduction']) self.scaler = amp.GradScaler( enabled=True) if self.optim_cfg['amp'] else None self.optimizer = Adam(model.parameters(), lr=self.optim_cfg['lr']) self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( self.optimizer, milestones=self.optim_cfg['milestones'], gamma=self.optim_cfg['gamma']) # self.lr_scheduler = IterWarmUpCosineDecayMultiStepLRAdjust( # init_lr=self.optim_cfg['lr'], # milestones=self.optim_cfg['milestones'], # warm_up_epoch=1, # iter_per_epoch=len(self.tloader), # epochs=self.optim_cfg['epochs'] # ) assert torch.cuda.is_available(), "training only support cuda" assert torch.cuda.device_count() >= len( self.cfg['gpus']), "not have enough gpus" self.inp_device = torch.device("cuda:{:d}".format(self.cfg['gpus'][0])) self.out_device = torch.device("cuda:{:d}".format( self.cfg['gpus'][-1])) model.to(self.inp_device) self.model = nn.DataParallel(model, device_ids=self.cfg['gpus'], output_device=self.out_device) # self.ema = ModelEMA(self.model) self.creterion = nn.MSELoss() self.acc_func = HeatMapAcc() self.best_ap = 0. self.loss_logger = AverageLogger() self.acc_logger = AverageLogger() self.decoder = BasicKeyPointDecoder()
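# In the trainer above, self.scaler is either a GradScaler or None depending on
# optim_cfg['amp']. A training step that supports both modes typically branches as sketched
# below (generic illustration; `amp_or_fp32_step` is not a method of the class above).
import torch
from torch.cuda import amp


def amp_or_fp32_step(model, criterion, optimizer, scaler, inputs, targets):
    optimizer.zero_grad()
    if scaler is not None:
        # mixed-precision path: scaled backward, step skipped on inf/NaN gradients
        with amp.autocast():
            loss = criterion(model(inputs), targets)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        # plain FP32 path
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
    return loss.item()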
def train(hyp, # path/to/hyp.yaml or hyp dictionary opt, device, ): save_dir, epochs, batch_size, total_batch_size, weights, rank, single_cls = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank, \ opt.single_cls # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Hyperparameters if isinstance(hyp, str): with open(hyp) as f: hyp = yaml.safe_load(f) # load hyps dict logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.safe_dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.safe_dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.safe_load(f) # data dict # Loggers loggers = {'wandb': None, 'tb': None} # loggers dict if rank in [-1, 0]: # TensorBoard if not opt.evolve: prefix = colorstr('tensorboard: ') logger.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/") loggers['tb'] = SummaryWriter(opt.save_dir) # W&B opt.hyp = hyp # add hyperparameters run_id = torch.load(weights).get('wandb_id') if weights.endswith('.pt') and os.path.isfile(weights) else None wandb_logger = WandbLogger(opt, save_dir.stem, run_id, data_dict) loggers['wandb'] = wandb_logger.wandb data_dict = wandb_logger.data_dict if wandb_logger.wandb: weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp # may update weights, epochs if resuming nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check is_coco = opt.data.endswith('coco.yaml') and nc == 80 # COCO dataset # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): weights = attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = ['anchor'] if (opt.cfg or hyp.get('anchors')) and not opt.resume else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # 
optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear else: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Results if ckpt.get('training_results') is not None: results_file.write_text(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) if epochs < start_epoch: logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = max(int(model.stride.max()), 32) # grid size (max stride) nl = model.model[-1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, single_cls, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. 
Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, single_cls, hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr('val: '))[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir, loggers) if loggers['tb']: loggers['tb'].add_histogram('classes', c, 0) # TensorBoard # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank, # nn.MultiheadAttention incompatibility with DDP https://github.com/pytorch/pytorch/issues/26698 find_unused_parameters=any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules())) # Model parameters hyp['box'] *= 3. / nl # scale to layers hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss = ComputeLoss(model) # init loss class logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'labels', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, 
(imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() if loggers['tb'] and ni == 0: # TensorBoard with warnings.catch_warnings(): warnings.simplefilter('ignore') # suppress jit trace warning loggers['tb'].add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), []) elif plots and ni == 10 and wandb_logger.wandb: wandb_logger.log({'Mosaics': [wandb_logger.wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') if x.exists()]}) # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP wandb_logger.current_epoch = epoch + 1 results, maps, _ = test.test(data_dict, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=single_cls, dataloader=testloader, save_dir=save_dir, save_json=is_coco and final_epoch, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, wandb_logger=wandb_logger, compute_loss=compute_loss) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # append metrics, val_loss # Log tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 
'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2'] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if loggers['tb']: loggers['tb'].add_scalar(tag, x, epoch) # TensorBoard if wandb_logger.wandb: wandb_logger.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi wandb_logger.end_epoch(best_result=best_fitness == fi) # Save model if (not opt.nosave) or (final_epoch and not opt.evolve): # if save ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': results_file.read_text(), 'model': deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': ema.updates, 'optimizer': optimizer.state_dict(), 'wandb_id': wandb_logger.wandb_run.id if wandb_logger.wandb else None} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if wandb_logger.wandb: if ((epoch + 1) % opt.save_period == 0 and not final_epoch) and opt.save_period != -1: wandb_logger.log_model(last.parent, opt, epoch, fi, best_model=best_fitness == fi) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if rank in [-1, 0]: logger.info(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.\n') if plots: plot_results(save_dir=save_dir) # save as results.png if wandb_logger.wandb: files = ['results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]] wandb_logger.log({"Results": [wandb_logger.wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists()]}) if not opt.evolve: if is_coco: # COCO dataset for m in [last, best] if best.exists() else [last]: # speed, mAP tests results, _, _ = test.test(opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, conf_thres=0.001, iou_thres=0.7, model=attempt_load(m, device).half(), single_cls=single_cls, dataloader=testloader, save_dir=save_dir, save_json=True, plots=False) # Strip optimizers for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if wandb_logger.wandb: # Log the stripped model wandb_logger.wandb.log_artifact(str(best if best.exists() else last), type='model', name='run_' + wandb_logger.wandb_run.id + '_model', aliases=['latest', 'best', 'stripped']) wandb_logger.finish_run() else: dist.destroy_process_group() torch.cuda.empty_cache() return results
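# The loop above only calls scaler.step()/scaler.update() every `accumulate` batches. The
# essential shape of that gradient-accumulation pattern, stripped of the YOLO specifics, is
# sketched below (toy model and data, assumes a CUDA device; illustrative only).
import torch
from torch import nn
from torch.cuda import amp

model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = amp.GradScaler()
accumulate = 4  # batches whose gradients are accumulated before one optimizer step

optimizer.zero_grad()
for ni in range(32):  # ni = number of integrated batches
    x = torch.randn(8, 16, device="cuda")
    y = torch.randn(8, 1, device="cuda")
    with amp.autocast():
        loss = nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()      # gradients keep accumulating across batches
    if (ni + 1) % accumulate == 0:
        scaler.step(optimizer)         # one optimizer step per `accumulate` batches
        scaler.update()
        optimizer.zero_grad()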
def main_worker(gpu, ngpus_per_node, args): global best_acc1 args.gpu = gpu if args.is_distributed: print( "INFO:PyTorch: Initialize process group for distributed training") if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) if args.gpu is not None: if not args.evaluate: print( "INFO:PyTorch: Use GPU: {} for training, the rank of this GPU is {}" .format(args.gpu, args.rank)) else: print( "INFO:PyTorch: Use GPU: {} for evaluating, the rank of this GPU is {}" .format(args.gpu, args.rank)) # set the name of the process setproctitle.setproctitle(args.proc_name + '_rank{}'.format(args.rank)) if not args.multiprocessing_distributed or \ (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): # define tensorboard summary val_writer = SummaryWriter(log_dir=os.path.join(args.model_dir, 'val')) # define loss function (criterion) and optimizer if args.is_label_smoothing: criterion = label_smoothing.label_smoothing_CE(reduction='mean') else: criterion = nn.CrossEntropyLoss() # create model if args.pretrained: model_info = "INFO:PyTorch: using pre-trained model '{}'".format( args.arch) else: model_info = "INFO:PyTorch: creating model '{}'".format(args.arch) print(model_info) model = splitnet.SplitNet(args, norm_layer=norm.norm(args.norm_mode), criterion=criterion) # print the number of parameters in the model print("INFO:PyTorch: The number of parameters in the model is {}".format( metric.get_the_number_of_params(model))) if args.is_summary: summary_choice = 0 if summary_choice == 0: summary.summary(model, torch.rand((1, 3, args.crop_size, args.crop_size)), target=torch.ones(1, dtype=torch.long)) else: flops, params = profile(model, inputs=(torch.rand((1, 3, args.crop_size, args.crop_size)), torch.ones(1, dtype=torch.long), 'summary')) print(clever_format([flops, params], "%.4f")) return None if args.is_distributed: if args.world_size > 1 and args.is_syncbn: print( "INFO:PyTorch: convert torch.nn.BatchNormND layer in the model to torch.nn.SyncBatchNorm layer" ) # only single gpu per process is currently supported model = nn.SyncBatchNorm.convert_sync_batchnorm(model) # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. 
if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int( (args.workers + ngpus_per_node - 1) / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu], find_unused_parameters=True) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() # optimizer param_groups = model.parameters( ) if args.is_wd_all else lr_scheduler.get_parameter_groups(model) if args.is_wd_all: print( "INFO:PyTorch: Applying weight decay to all learnable parameters in the model." ) if args.optimizer == 'SGD': print("INFO:PyTorch: using SGD optimizer.") optimizer = torch.optim.SGD( param_groups, args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True if args.is_nesterov else False) elif args.optimizer == "AdamW": print("INFO:PyTorch: using AdamW optimizer.") optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.999), eps=1e-4, weight_decay=args.weight_decay) elif args.optimizer == "RMSprop": # See efficientNet at https://github.com/tensorflow/tpu/ print("INFO:PyTorch: using RMSprop optimizer.") optimizer = torch.optim.RMSprop(param_groups, lr=args.lr, alpha=0.9, weight_decay=args.weight_decay, momentum=0.9) elif args.optimizer == "RMSpropTF": # https://github.com/rwightman/pytorch-image-models/blob/fcb6258877/timm/optim/rmsprop_tf.py print("INFO:PyTorch: using RMSpropTF optimizer.") optimizer = rmsprop_tf.RMSpropTF(param_groups, lr=args.lr, alpha=0.9, eps=0.001, weight_decay=args.weight_decay, momentum=0.9, decoupled_decay=False) else: raise NotImplementedError # PyTorch AMP loss scaler scaler = None if not args.is_amp else amp.GradScaler() # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("INFO:PyTorch: => loading checkpoint '{}'".format( args.resume)) if args.gpu is None: checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. 
loc = 'cuda:{}'.format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] """ if args.gpu is not None: # best_acc1 may be from a checkpoint from a different GPU best_acc1 = best_acc1.to(args.gpu) """ model.load_state_dict(checkpoint['state_dict']) print("INFO:PyTorch: Loading state_dict of optimizer") optimizer.load_state_dict(checkpoint['optimizer']) if "scaler" in checkpoint: print("INFO:PyTorch: Loading state_dict of AMP loss scaler") scaler.load_state_dict(checkpoint['scaler']) print("INFO:PyTorch: => loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("INFO:PyTorch: => no checkpoint found at '{}'".format( args.resume)) # accelarate the training torch.backends.cudnn.benchmark = True # Data loading code data_split_factor = args.loop_factor if args.is_diff_data_train else 1 print("INFO:PyTorch: => The number of views of train data is '{}'".format( data_split_factor)) train_loader, train_sampler = factory.get_data_loader( args.data, split_factor=data_split_factor, batch_size=args.batch_size, crop_size=args.crop_size, dataset=args.dataset, split="train", is_distributed=args.is_distributed, is_autoaugment=args.is_autoaugment, randaa=args.randaa, is_cutout=args.is_cutout, erase_p=args.erase_p, num_workers=args.workers) val_loader = factory.get_data_loader(args.data, batch_size=args.eval_batch_size, crop_size=args.crop_size, dataset=args.dataset, split="val", num_workers=args.workers) # learning rate scheduler scheduler = lr_scheduler.lr_scheduler( mode=args.lr_mode, init_lr=args.lr, num_epochs=args.epochs, iters_per_epoch=len(train_loader), lr_milestones=args.lr_milestones, lr_step_multiplier=args.lr_step_multiplier, slow_start_epochs=args.slow_start_epochs, slow_start_lr=args.slow_start_lr, end_lr=args.end_lr, multiplier=args.lr_multiplier, decay_factor=args.decay_factor, decay_epochs=args.decay_epochs, staircase=True) if args.evaluate: validate(val_loader, model, args) return None saved_ckpt_filenames = [] streams = None # streams = [torch.cuda.Stream() for i in range(args.loop_factor)] for epoch in range(args.start_epoch, args.epochs + 1): if args.is_distributed: train_sampler.set_epoch(epoch) # train for one epoch train(train_loader, model, optimizer, scheduler, epoch, args, streams, scaler=scaler) if (epoch + 1) % args.eval_per_epoch == 0: # evaluate on validation set acc_all = validate(val_loader, model, args) # remember best acc@1 and save checkpoint is_best = acc_all[0] > best_acc1 best_acc1 = max(acc_all[0], best_acc1) # save checkpoint if not args.multiprocessing_distributed or \ (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): # summary per epoch val_writer.add_scalar('avg_acc1', acc_all[0], global_step=epoch) if args.dataset == 'imagenet': val_writer.add_scalar('avg_acc5', acc_all[1], global_step=epoch) for i in range(2, args.loop_factor + 2): val_writer.add_scalar('{}_acc1'.format(i - 1), acc_all[i], global_step=epoch) val_writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step=epoch) val_writer.add_scalar('best_acc1', best_acc1, global_step=epoch) # save checkpoints filename = "checkpoint_{0}.pth.tar".format(epoch) saved_ckpt_filenames.append(filename) # remove the oldest file if the number of saved ckpts is greater than args.max_ckpt_nums if len(saved_ckpt_filenames) > args.max_ckpt_nums: os.remove( os.path.join(args.model_dir, saved_ckpt_filenames.pop(0))) ckpt_dict = { 'epoch': epoch + 1, 'arch': 
                args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            if args.is_amp:
                ckpt_dict['scaler'] = scaler.state_dict()

            metric.save_checkpoint(ckpt_dict,
                                   is_best,
                                   args.model_dir,
                                   filename=filename)

    # clean GPU cache
    torch.cuda.empty_cache()

    sys.exit(0)
def __init__( self, model: Model, optimizer: torch.optim.Optimizer, data_loader: DataLoader, patience: Optional[int] = None, validation_metric: Union[str, List[str]] = "-loss", validation_data_loader: DataLoader = None, num_epochs: int = 20, serialization_dir: Optional[str] = None, checkpointer: Checkpointer = None, cuda_device: Optional[Union[int, torch.device]] = None, grad_norm: Optional[float] = None, grad_clipping: Optional[float] = None, learning_rate_scheduler: Optional[LearningRateScheduler] = None, momentum_scheduler: Optional[MomentumScheduler] = None, moving_average: Optional[MovingAverage] = None, callbacks: List[TrainerCallback] = None, distributed: bool = False, local_rank: int = 0, world_size: int = 1, num_gradient_accumulation_steps: int = 1, use_amp: bool = False, enable_default_callbacks: bool = True, run_sanity_checks: bool = True, ) -> None: super().__init__( serialization_dir=serialization_dir, cuda_device=cuda_device, distributed=distributed, local_rank=local_rank, world_size=world_size, ) # I am not calling move_to_gpu here, because if the model is # not already on the GPU then the optimizer is going to be wrong. self.model = model self.data_loader = data_loader self.data_loader.set_target_device(self.cuda_device) self._validation_data_loader = validation_data_loader if self._validation_data_loader is not None: self._validation_data_loader.set_target_device(self.cuda_device) self.optimizer = optimizer if patience is None: # no early stopping if validation_data_loader is not None: logger.warning( "You provided a validation dataset but patience was set to None, " "meaning that early stopping is disabled" ) elif (not isinstance(patience, int)) or patience <= 0: raise ConfigurationError( '{} is an invalid value for "patience": it must be a positive integer ' "or None (if you want to disable early stopping)".format(patience) ) # For tracking is_best_so_far and should_stop_early self._metric_tracker = MetricTracker(validation_metric, patience) self._num_epochs = num_epochs self._checkpointer: Optional[Checkpointer] = checkpointer if checkpointer is None and serialization_dir is not None: self._checkpointer = Checkpointer(serialization_dir) self._grad_norm = grad_norm self._grad_clipping = grad_clipping self._learning_rate_scheduler = learning_rate_scheduler self._momentum_scheduler = momentum_scheduler self._moving_average = moving_average self._callbacks = callbacks or [] default_callbacks = list(DEFAULT_CALLBACKS) if enable_default_callbacks else [] if run_sanity_checks: default_callbacks.append(SanityChecksCallback) for callback_cls in default_callbacks: for callback in self._callbacks: if callback.__class__ == callback_cls: break else: self._callbacks.append(callback_cls(self._serialization_dir)) self._batch_num_total = 0 self._last_log = 0.0 # time of last logging self._num_gradient_accumulation_steps = num_gradient_accumulation_steps # Enable automatic mixed precision training. self._scaler: Optional[amp.GradScaler] = None self._use_amp = use_amp if self._use_amp: if self.cuda_device == torch.device("cpu"): raise ValueError("Using AMP requires a cuda device") self._scaler = amp.GradScaler() # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model` # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc. 
# # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the # normal case, reference to `Model` is retained. This reference is only used in # these places: `model.__call__`, `model.train` and `model.eval`. if self._distributed: self._pytorch_model = DistributedDataParallel( self.model, device_ids=None if self.cuda_device == torch.device("cpu") else [self.cuda_device], find_unused_parameters=True, ) else: self._pytorch_model = self.model
def main(): fold = 0 epoch = 3 mode = 1 batch = 2 num_workers = 1 SEED = 13 init_lr = 3e-4 warmup_factor = 10 #how long warmup_epo = 1 log = True seed_everything(SEED) model = HUB_MODELS['efficientnet-b0']('efficientnet-b0') model.to(DEVICE) df = pd.read_csv(os.path.join(path_data, 'train_folds.csv')) kernel = type(model).__name__ tr_idx = np.where(df.fold != fold)[0] vl_idx = np.where(df.fold == fold)[0] transforms_train = A.Compose([ # A.OneOf([ # A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15), # A.OpticalDistortion(distort_limit=0.11, shift_limit=0.15), # A.NoOp() # ]), # A.OneOf([ # A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2), # A.RandomGamma(gamma_limit=(50, 150)), # A.NoOp() # ]), # A.OneOf([ # A.RGBShift(r_shift_limit=20, b_shift_limit=15, g_shift_limit=15), # A.FancyPCA(3), # A.HueSaturationValue(hue_shift_limit=5, sat_shift_limit=5), # A.NoOp() # ]), # A.OneOf([ # A.CLAHE(), # A.NoOp() # ]), A.Transpose(p=0.5), A.VerticalFlip(p=0.5), A.HorizontalFlip(p=0.5), ]) # transforms_val = albumentations.Compose([]) dataset = { 'npy': [trainDataset_npy, 16], 'pkl': [trainDataset_pkl, 25], 'insta': [trainDataset_insta, None] } trainDataset, num = dataset['pkl'] td = trainDataset(df.iloc[tr_idx], df.iloc[tr_idx].isup_grade, num, rand=True, transform=transforms_train) vd = trainDataset(df.iloc[vl_idx], df.iloc[vl_idx].isup_grade, num, rand=False, transform=transforms_train) train_dl = DataLoader(td, batch_size=batch, sampler=RandomSampler(td), num_workers=num_workers) val_dl = DataLoader(vd, batch_size=batch, sampler=SequentialSampler(vd), num_workers=num_workers) optimizer = Adam(model.parameters(), lr=init_lr / warmup_factor) scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, epoch - warmup_epo) scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, total_epoch=warmup_epo, after_scheduler=scheduler_cosine) criterion = nn.BCEWithLogitsLoss() scaler = amp.GradScaler() qwk_max = 0 for i in range(1, epoch + 1): print(f'Epoch: {i}') scheduler.step(i - 1) model.train() loss = train_epoch(model, train_dl, criterion, scaler, optimizer) model.eval() with torch.no_grad(): val_loss, pred, val_lab = train_epoch(model, val_dl, criterion, None, None) p = torch.cat(pred).cpu().numpy() t = torch.cat(val_lab).cpu().numpy() acc = (p == t).mean() * 100. qwk = cohen_kappa_score(p, t, weights='quadratic') #sch.step(val_loss) # Plateau if log: print('Log.....') lg = time.ctime( ) + ' ' + f'Epoch {i}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}, fold: {fold+1}' print(lg) with open(os.path.join(path_log, f'log_{kernel}_kaggle.txt'), 'a') as appender: appender.write(lg + '\n') if qwk > qwk_max: print('Best ({:.6f} --> {:.6f}). Saving model ...'.format( qwk_max, qwk)) torch.save( model.state_dict(), os.path.join( path_model, f'{kernel}_kaggle_best_fold{fold+1}_epoch_{i}.pth')) qwk_max = qwk #make checkpoint #problem in win # name_check = '_'.join(time.ctime().split(':')) + '_model.pt' # torch.save({ # 'epoch': i, # 'model_state_dict': model.state_dict(), # 'optimizer_state_dict': optimizer.state_dict() # }, os.path.join(path_checkpoint, name_check)) torch.save( model.state_dict(), os.path.join(path_model, '{kernel}_kaggle_final_fold{fold+1}.pth'))
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.scaler = amp.GradScaler()
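# The __init__ above only creates the scaler; presumably the surrounding class uses it in its
# training step. A hypothetical method showing one way such a `self.scaler` could be used
# (the attributes `self.model` and `self.optimizer` and the name `training_step` are
# assumptions, not taken from the source):
import torch
from torch.cuda import amp


class AmpStepMixin:
    def training_step(self, batch, target):
        self.optimizer.zero_grad()
        with amp.autocast():
            loss = torch.nn.functional.cross_entropy(self.model(batch), target)
        self.scaler.scale(loss).backward()
        self.scaler.step(self.optimizer)
        self.scaler.update()
        return loss.detach()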
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(f'Hyperparameters {hyp}') save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - 
hyp['lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(config=opt, resume="allow",
                               project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
                               name=save_dir.stem,
                               id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
    loggers = {'wandb': wandb}  # loggers dict

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # EMA
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect,
                                            rank=rank, world_size=opt.world_size, workers=opt.workers,
                                            image_weights=opt.image_weights)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,  # testloader
                                       hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1,
                                       world_size=opt.world_size, workers=opt.workers, pad=0.5)[0]

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                Thread(target=plot_labels, args=(labels, save_dir, loggers), daemon=True).start()
                if tb_writer:
                    tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5:.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\n'
                'Using %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch --------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if plots and ni < 3:
                    f = save_dir / f'train_batch{ni}.jpg'  # filename
                    Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 3 and wandb:
                    wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]})

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=save_dir,
                                                 plots=plots and final_epoch,
                                                 log_imgs=opt.log_imgs if wandb else 0)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP@.5, mAP@.5:.95, val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Log
            tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                    'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                    'x/lr0', 'x/lr1', 'x/lr2']  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
                if wandb:
                    wandb.log({tag: x})  # W&B

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5:.95]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict(),
                            'wandb_id': wandb_run.id if wandb else None}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        final = best if best.exists() else last  # final model
        for f in [last, best]:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
        if opt.bucket:
            os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

        # Plots
        if plots:
            plot_results(save_dir=save_dir)  # save as results.png
            if wandb:
                files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png']
                wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files
                                       if (save_dir / f).exists()]})
                if opt.log_artifacts:
                    wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem)

        # Test best.pt
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
            for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]):  # speed, mAP tests
                results, _, _ = test.test(opt.data,
                                          batch_size=total_batch_size,
                                          imgsz=imgsz_test,
                                          conf_thres=conf,
                                          iou_thres=iou,
                                          model=attempt_load(final, device).half(),
                                          single_cls=opt.single_cls,
                                          dataloader=testloader,
                                          save_dir=save_dir,
                                          save_json=save_json,
                                          plots=False)

    else:
        dist.destroy_process_group()

    wandb.run.finish() if wandb and wandb.run else None
    torch.cuda.empty_cache()
    return results
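
# --- Illustration (not part of the original script) --------------------------------------------------
# The checkpoint logic above writes last.pt every save and overwrites best.pt whenever fitness()
# improves. fitness() is not defined in this excerpt; the sketch below shows the kind of weighted
# metric combination it typically computes in YOLOv5-style code. The function name and the exact
# weights (which favour mAP@0.5:0.95) are assumptions made for illustration only.
import numpy as np


def fitness_sketch(x):
    """Reduce an (n, 4) array of [P, R, mAP@0.5, mAP@0.5:0.95] rows to one scalar score per row."""
    w = np.array([0.0, 0.0, 0.1, 0.9])  # assumed weights: mAP@0.5:0.95 dominates model selection
    return (np.asarray(x)[:, :4] * w).sum(1)  # higher is better


# Example: the second row wins on mAP@0.5:0.95 and therefore gets the higher fitness score.
# fitness_sketch(np.array([[0.70, 0.60, 0.55, 0.35],
#                          [0.70, 0.60, 0.50, 0.40]]))
# ------------------------------------------------------------------------------------------------------
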
def train(args):
    # Set up logging and devices
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.seed}...")
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(
        word_vectors=word_vectors,
        hidden_size=args.hidden_size,
        drop_prob=args.drop_prob,
        use_glove=args.use_glove,
    )
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = stats.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(
        args.save_dir,
        max_checkpoints=args.max_checkpoints,
        metric_name=args.metric_name,
        maximize_metric=args.maximize_metric,
        log=log,
    )

    # Get optimizer and scheduler
    optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.0)  # Constant LR

    # Get data loader
    log.info("Building dataset...")
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(
        dev_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Train
    log.info("Training...")
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    scaler = amp.GradScaler()
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f"Starting epoch {epoch}...")
        with torch.enable_grad(), tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                with amp.autocast():
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar("train/NLL", loss_val, step)
                tbx.add_scalar("train/LR", optimizer.param_groups[0]["lr"], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f"Evaluating at step {step}...")
                    ema.assign(model)
                    results, pred_dict = evaluate(
                        model,
                        dev_loader,
                        device,
                        args.dev_eval_file,
                        args.max_ans_len,
                        args.use_squad_v2,
                    )
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
                    log.info(f"Dev {results_str}")

                    # Log to TensorBoard
                    log.info("Visualizing in TensorBoard...")
                    for k, v in results.items():
                        tbx.add_scalar(f"dev/{k}", v, step)
                    util.visualize(
                        tbx,
                        pred_dict=pred_dict,
                        eval_path=args.dev_eval_file,
                        step=step,
                        split="dev",
                        num_visuals=args.num_visuals,
                    )
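
# --- Illustration (not part of the original script) --------------------------------------------------
# The loop above keeps an exponential moving average of the model weights through stats.EMA:
# ema(model, step) updates the shadow weights after each optimizer step, ema.assign(model) swaps
# the averaged weights in before evaluation, and ema.resume(model) restores the live training
# weights afterwards. stats.EMA itself is not shown in this excerpt; the class below is one
# plausible implementation of that interface (the class name, internal attribute names, and the
# bias-correction ramp on the decay are assumptions made for illustration).
import torch


class EMASketch:
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {n: p.detach().clone()
                       for n, p in model.named_parameters() if p.requires_grad}
        self.backup = {}

    def __call__(self, model, num_updates):
        # Ramp the effective decay up early in training, then blend shadow <- decay * shadow + (1 - decay) * param.
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        with torch.no_grad():
            for name, param in model.named_parameters():
                if param.requires_grad:
                    self.shadow[name].mul_(decay).add_(param.detach(), alpha=1.0 - decay)

    def assign(self, model):
        # Swap the averaged weights in (e.g., for evaluation), keeping a backup of the live weights.
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def resume(self, model):
        # Restore the live training weights saved by assign().
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.backup[name])
        self.backup = {}
# ------------------------------------------------------------------------------------------------------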