def validate(model, epoch, total_iter, criterion, valset, batch_size,
             collate_fn, distributed_run, batch_to_gpu, ema=False):
    """Score ``model`` on ``valset``, log the results and return them.

    The model is switched to eval mode for the duration of the pass and
    put back into training mode afterwards if it was training on entry.

    Args:
        model: Module called as ``model(x)`` on each batch.
        epoch: Epoch number used as the log step; ``None`` logs with an
            empty step tuple.
        total_iter: Passed to ``log`` as ``tb_total_steps``.
        criterion: Called as ``criterion(y_pred, y, is_training=False,
            meta_agg='sum')``; must return ``(loss, meta)`` where ``meta``
            is a dict containing at least ``'loss'`` and ``'mel_loss'``.
        valset: Validation dataset; its ``len`` normalizes the summed meta.
        batch_size: Batch size of the validation ``DataLoader``.
        collate_fn: Collate function of the validation ``DataLoader``.
        distributed_run: When true, a ``DistributedSampler`` is used and
            per-batch values go through ``reduce_tensor``.
        batch_to_gpu: Callable turning a batch into ``(x, y, num_frames)``.
        ema: When true, results are logged under the ``'val_ema'`` subset
            instead of ``'val'``.

    Returns:
        dict: every key of the criterion's ``meta`` summed over all batches
        and divided by ``len(valset)``, plus ``'took'`` (elapsed seconds).
    """
    was_training = model.training
    model.eval()

    tik = time.perf_counter()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, num_workers=4, shuffle=False,
                                sampler=val_sampler,
                                batch_size=batch_size, pin_memory=False,
                                collate_fn=collate_fn)
        val_meta = defaultdict(float)
        val_num_frames = 0
        for batch in val_loader:
            x, y, num_frames = batch_to_gpu(batch)
            y_pred = model(x)
            _, meta = criterion(y_pred, y, is_training=False, meta_agg='sum')

            if distributed_run:
                for k, v in meta.items():
                    val_meta[k] += reduce_tensor(v, 1)
                val_num_frames += reduce_tensor(num_frames.data, 1).item()
            else:
                for k, v in meta.items():
                    val_meta[k] += v
                # Accumulate (not overwrite) so that the total covers every
                # batch, matching the distributed branch above.
                val_num_frames += num_frames.item()

        val_meta = {k: v / len(valset) for k, v in val_meta.items()}

    val_meta['took'] = time.perf_counter() - tik

    log(
        (epoch, ) if epoch is not None else (),
        tb_total_steps=total_iter,
        subset='val_ema' if ema else 'val',
        data=OrderedDict([
            ('loss', val_meta['loss'].item()),
            ('mel_loss', val_meta['mel_loss'].item()),
            # Throughput over the whole pass: all frames / total time.
            ('frames/s', val_num_frames / val_meta['took']),
            ('took', val_meta['took'])]),
    )

    if was_training:
        model.train()
    return val_meta
def main():
    """Train FastPitch from command-line arguments.

    Steps, in order:
      1. Parse general and model-specific arguments; unknown options raise
         ``ValueError``.
      2. Seed torch and numpy with ``seed + local_rank``, create the output
         directory on rank 0 and initialize the logger.
      3. Build the model, the fused optimizer (``adam`` or ``lamb``), the AMP
         grad scaler, an optional EMA copy of the model and, for distributed
         runs, the ``DistributedDataParallel`` wrapper.
      4. Optionally restore state from ``--checkpoint-path`` or the last
         checkpoint in the output directory (``--resume``).
      5. Run the epoch loop with gradient accumulation, logging every
         optimizer step, validating after every epoch (also the EMA model
         when enabled) and calling ``maybe_save_checkpoint``.
      6. Log benchmark statistics and run a final validation pass.

    Raises:
        ValueError: on unknown command-line options, an unsupported
            optimizer name, or when the train loader yields fewer batches
            than ``grad_accumulation`` (no optimizer step is possible).
        Exception: when the (reduced) training loss becomes NaN.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Training',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    distributed_run = args.world_size > 1

    # Offset the seed by rank so each process draws a different stream.
    torch.manual_seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)

    if args.local_rank == 0:
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(args.output, exist_ok=True)

    log_fpath = args.log_file or os.path.join(args.output, 'nvlog.json')
    tb_subsets = ['train', 'val']
    if args.ema_decay > 0.0:
        tb_subsets.append('val_ema')

    logger.init(log_fpath, args.output, enabled=(args.local_rank == 0),
                tb_subsets=tb_subsets)
    logger.parameters(vars(args), tb_subset='train')

    # Second parsing pass: model-specific options are now registered, so
    # anything still unknown is a genuine error.
    parser = models.parse_model_args('FastPitch', parser)
    args, unk_args = parser.parse_known_args()
    if unk_args:
        raise ValueError(f'Invalid options {unk_args}')

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, args.world_size, args.local_rank)

    device = torch.device('cuda' if args.cuda else 'cpu')
    model_config = models.get_model_config('FastPitch', args)
    model = models.get_model('FastPitch', model_config, device)

    attention_kl_loss = AttentionBinarizationLoss()

    # Store pitch mean/std as params to translate from Hz during inference
    model.pitch_mean[0] = args.pitch_mean
    model.pitch_std[0] = args.pitch_std

    kw = dict(lr=args.learning_rate, betas=(0.9, 0.98), eps=1e-9,
              weight_decay=args.weight_decay)
    if args.optimizer == 'adam':
        optimizer = FusedAdam(model.parameters(), **kw)
    elif args.optimizer == 'lamb':
        optimizer = FusedLAMB(model.parameters(), **kw)
    else:
        raise ValueError(
            f"Unsupported optimizer {args.optimizer!r}; "
            "expected 'adam' or 'lamb'")

    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    # The EMA copy is taken before the DDP wrap, so it stays a plain module.
    if args.ema_decay > 0:
        ema_model = copy.deepcopy(model)
    else:
        ema_model = None

    if distributed_run:
        model = DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank, find_unused_parameters=True)

    # One-element lists act as out-parameters for load_checkpoint.
    start_epoch = [1]
    start_iter = [0]

    assert args.checkpoint_path is None or args.resume is False, (
        "Specify a single checkpoint source")
    if args.checkpoint_path is not None:
        ch_fpath = args.checkpoint_path
    elif args.resume:
        ch_fpath = last_checkpoint(args.output)
    else:
        ch_fpath = None

    if ch_fpath is not None:
        load_checkpoint(args, model, ema_model, optimizer, scaler,
                        start_epoch, start_iter, model_config, ch_fpath)

    start_epoch = start_epoch[0]
    total_iter = start_iter[0]

    criterion = FastPitchLoss(
        dur_predictor_loss_scale=args.dur_predictor_loss_scale,
        pitch_predictor_loss_scale=args.pitch_predictor_loss_scale,
        attn_loss_scale=args.attn_loss_scale)

    collate_fn = TTSCollate()

    if args.local_rank == 0:
        prepare_tmp(args.pitch_online_dir)

    trainset = TTSDataset(audiopaths_and_text=args.training_files,
                          **vars(args))
    valset = TTSDataset(audiopaths_and_text=args.validation_files,
                        **vars(args))

    if distributed_run:
        train_sampler, shuffle = DistributedSampler(trainset), False
    else:
        train_sampler, shuffle = None, True

    # 4 workers are optimal on DGX-1 (from epoch 2 onwards)
    train_loader = DataLoader(trainset, num_workers=4, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=True, persistent_workers=True,
                              drop_last=True, collate_fn=collate_fn)

    # Same `> 0` test as everywhere else EMA is gated, so ema_model is
    # guaranteed to exist whenever the EMA parameters are initialized.
    if args.ema_decay > 0:
        mt_ema_params = init_multi_tensor_ema(model, ema_model)

    model.train()

    bmark_stats = BenchmarkStats()

    torch.cuda.synchronize()
    for epoch in range(start_epoch, args.epochs + 1):
        epoch_start_time = time.perf_counter()

        epoch_loss = 0.0
        epoch_mel_loss = 0.0
        epoch_num_frames = 0

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        accumulated_steps = 0
        iter_loss = 0
        iter_num_frames = 0
        iter_meta = {}
        iter_start_time = time.perf_counter()

        epoch_iter = 0
        # Number of optimizer steps per epoch; trailing batches that do not
        # fill a whole accumulation group are skipped.
        num_iters = len(train_loader) // args.grad_accumulation
        if num_iters == 0:
            # Without this check the epoch averages below would divide by
            # zero after a pass that trained nothing.
            raise ValueError(
                f'The train loader yields {len(train_loader)} batch(es) per '
                f'epoch, fewer than grad_accumulation='
                f'{args.grad_accumulation}; no optimizer step can be taken')

        for batch in train_loader:

            # Start of a new accumulation group (= one optimizer step).
            if accumulated_steps == 0:
                if epoch_iter == num_iters:
                    break
                total_iter += 1
                epoch_iter += 1

                adjust_learning_rate(total_iter, optimizer,
                                     args.learning_rate, args.warmup_steps)

                model.zero_grad(set_to_none=True)

            x, y, num_frames = batch_to_gpu(batch)

            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x)
                loss, meta = criterion(y_pred, y)

                if (args.kl_loss_start_epoch is not None
                        and epoch >= args.kl_loss_start_epoch):

                    if args.kl_loss_start_epoch == epoch and epoch_iter == 1:
                        print('Begin hard_attn loss')

                    _, _, _, _, _, _, _, _, attn_soft, attn_hard, _, _ = y_pred
                    binarization_loss = attention_kl_loss(attn_hard, attn_soft)
                    # Linear ramp from 0 to kl_loss_weight over
                    # kl_loss_warmup_epochs epochs.
                    kl_weight = min(
                        (epoch - args.kl_loss_start_epoch)
                        / args.kl_loss_warmup_epochs,
                        1.0) * args.kl_loss_weight
                    meta['kl_loss'] = (
                        binarization_loss.clone().detach() * kl_weight)
                    loss += kl_weight * binarization_loss

                else:
                    meta['kl_loss'] = torch.zeros_like(loss)
                    kl_weight = 0

                loss /= args.grad_accumulation

            meta = {k: v / args.grad_accumulation
                    for k, v in meta.items()}

            if args.amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data,
                                             args.world_size).item()
                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
                meta = {k: reduce_tensor(v, args.world_size)
                        for k, v in meta.items()}
            else:
                reduced_loss = loss.item()
                reduced_num_frames = num_frames.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            accumulated_steps += 1
            iter_loss += reduced_loss
            iter_num_frames += reduced_num_frames
            iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0)
                         for k in meta}

            # Accumulation group complete: clip, step, update EMA and log.
            if accumulated_steps % args.grad_accumulation == 0:

                logger.log_grads_tb(total_iter, model)
                if args.amp:
                    # Unscale first so clipping sees the true gradients.
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), args.grad_clip_thresh)
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), args.grad_clip_thresh)
                    optimizer.step()

                if args.ema_decay > 0.0:
                    apply_multi_tensor_ema(args.ema_decay, *mt_ema_params)

                iter_mel_loss = iter_meta['mel_loss'].item()
                iter_kl_loss = iter_meta['kl_loss'].item()
                iter_time = time.perf_counter() - iter_start_time
                epoch_loss += iter_loss
                epoch_num_frames += iter_num_frames
                epoch_mel_loss += iter_mel_loss

                log(
                    (epoch, epoch_iter, num_iters),
                    tb_total_steps=total_iter,
                    subset='train',
                    data=OrderedDict([
                        ('loss', iter_loss),
                        ('mel_loss', iter_mel_loss),
                        ('kl_loss', iter_kl_loss),
                        ('kl_weight', kl_weight),
                        ('frames/s', iter_num_frames / iter_time),
                        ('took', iter_time),
                        ('lrate', optimizer.param_groups[0]['lr'])
                    ]),
                )

                accumulated_steps = 0
                iter_loss = 0
                iter_num_frames = 0
                iter_meta = {}
                iter_start_time = time.perf_counter()

        # Finished epoch
        epoch_loss /= epoch_iter
        epoch_mel_loss /= epoch_iter
        epoch_time = time.perf_counter() - epoch_start_time

        log(
            (epoch, ),
            tb_total_steps=None,
            subset='train_avg',
            data=OrderedDict([
                ('loss', epoch_loss),
                ('mel_loss', epoch_mel_loss),
                ('frames/s', epoch_num_frames / epoch_time),
                ('took', epoch_time)]),
        )
        bmark_stats.update(epoch_num_frames, epoch_loss, epoch_mel_loss,
                           epoch_time)

        validate(model, epoch, total_iter, criterion, valset,
                 args.batch_size, collate_fn, distributed_run, batch_to_gpu)

        if args.ema_decay > 0:
            validate(ema_model, epoch, total_iter, criterion, valset,
                     args.batch_size, collate_fn, distributed_run,
                     batch_to_gpu, ema=True)

        maybe_save_checkpoint(args, model, ema_model, optimizer, scaler,
                              epoch, total_iter, model_config)
        logger.flush()

    # Finished training
    if len(bmark_stats) > 0:
        log((), tb_total_steps=None, subset='train_avg',
            data=bmark_stats.get(args.benchmark_epochs_num))

    validate(model, None, total_iter, criterion, valset, args.batch_size,
             collate_fn, distributed_run, batch_to_gpu)