def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 train_values = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, 'avg_stop_loss': 0, 'avg_align_score': 0, 'avg_step_time': 0, 'avg_loader_time': 0, 'avg_alignment_score': 0 } if c.bidirectional_decoder: train_values['avg_decoder_b_loss'] = 0 # decoder backward loss train_values['avg_decoder_c_loss'] = 0 # decoder consistency loss keep_avg = KeepAverage() keep_avg.add_values(train_values) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) if use_cuda: batch_n_iter = int( len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() for num_iter, data in enumerate(data_loader): start_time = time.time() # format data text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data( data) loader_time = time.time() - end_time global_step += 1 # setup lr if c.noam_schedule: scheduler.step() optimizer.zero_grad() if optimizer_st: optimizer_st.zero_grad() # forward pass model if c.bidirectional_decoder: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss if not c.separate_stopnet and c.stopnet: loss += stop_loss # backward decoder if c.bidirectional_decoder: if c.loss_masking: decoder_backward_loss = criterion( torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) else: decoder_backward_loss = criterion( torch.flip(decoder_backward_output, dims=(1, )), mel_input) decoder_c_loss = torch.nn.functional.l1_loss( torch.flip(decoder_backward_output, dims=(1, )), decoder_output) loss += decoder_backward_loss + decoder_c_loss keep_avg.update_values({ 'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item() }) loss.backward() optimizer, current_lr = adam_weight_decay(optimizer) grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True) optimizer.step() # compute alignment score align_score = alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_score', align_score) # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: grad_norm_st = 0 step_time = time.time() - start_time epoch_time += step_time if global_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score, grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss if args.rank == 0: update_train_values = { 'avg_postnet_loss': float(postnet_loss.item()), 'avg_decoder_loss': float(decoder_loss.item()), 'avg_stop_loss': stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()), 'avg_step_time': step_time, 'avg_loader_time': loader_time } keep_avg.update_values(update_train_values) # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: iter_stats = { "loss_posnet": postnet_loss.item(), "loss_decoder": decoder_loss.item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time } tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, postnet_loss.item(), OUT_PATH, global_step, epoch) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img), } if c.bidirectional_decoder: figures["alignment_backward"] = plot_alignment( alignments_backward[0].data.cpu().numpy()) tb_logger.tb_train_figures(global_step, figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) end_time = time.time() # print epoch stats print(" | > EPOCH END -- GlobalStep:{} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} AvgAlignScore:{:3f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format( global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stop_loss": keep_avg['avg_stop_loss'], "alignment_score": keep_avg['avg_align_score'], "epoch_time": epoch_time } tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) return keep_avg['avg_postnet_loss'], global_step
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) model.train() epoch_time = 0 train_values = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, 'avg_stop_loss': 0, 'avg_align_score': 0, 'avg_step_time': 0, 'avg_loader_time': 0, 'avg_alignment_score': 0 } keep_avg = KeepAverage() keep_avg.add_values(train_values) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) if use_cuda: batch_n_iter = int( len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() for num_iter, data in enumerate(data_loader): start_time = time.time() # setup input data text_input = data[0] text_lengths = data[1] speaker_names = data[2] linear_input = data[3] if c.model in ["Tacotron", "TacotronGST" ] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) loader_time = time.time() - end_time if c.use_speaker_embedding: speaker_ids = [ speaker_mapping[speaker_name] for speaker_name in speaker_names ] speaker_ids = torch.LongTensor(speaker_ids) else: speaker_ids = None # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) global_step += 1 # setup lr if c.lr_decay: scheduler.step() optimizer.zero_grad() if optimizer_st: optimizer_st.zero_grad() # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) linear_input = linear_input.cuda( non_blocking=True) if c.model in ["Tacotron", "TacotronGST" ] else None stop_targets = stop_targets.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) # forward pass model decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss if not c.separate_stopnet and c.stopnet: loss += stop_loss loss.backward() optimizer, current_lr = adam_weight_decay(optimizer) grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() # compute alignment score align_score = alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_score', align_score) # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: grad_norm_st = 0 step_time = time.time() - start_time epoch_time += step_time if global_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score, grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss if args.rank == 0: update_train_values = { 'avg_postnet_loss': float(postnet_loss.item()), 'avg_decoder_loss': float(decoder_loss.item()), 'avg_stop_loss': stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()), 'avg_step_time': step_time, 'avg_loader_time': loader_time } keep_avg.update_values(update_train_values) # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: iter_stats = { "loss_posnet": postnet_loss.item(), "loss_decoder": decoder_loss.item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time } tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, postnet_loss.item(), OUT_PATH, global_step, epoch) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_train_figures(global_step, figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) end_time = time.time() # print(start_time-end_time) # print epoch stats print(" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format( global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stop_loss": keep_avg['avg_stop_loss'], "alignment_score": keep_avg['avg_align_score'], "epoch_time": epoch_time } tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) return keep_avg['avg_postnet_loss'], global_step