def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.2f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
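# `data_parallel_workaround` is imported from this repo's utility module. For
# reference, here is a minimal sketch of what such a helper can look like, built
# only on the public torch.nn.parallel primitives (an assumption; the repo's
# actual implementation may differ, e.g. by caching replicas between calls):

def _data_parallel_sketch(model, *inputs):
    """Replicate the model on all visible GPUs, scatter the inputs along the
    batch dimension, run the replicas in parallel and gather the outputs on
    the first device. Illustrative only; not used by the training loops."""
    device_ids = list(range(torch.cuda.device_count()))
    replicas = torch.nn.parallel.replicate(model, device_ids)
    scattered = torch.nn.parallel.scatter(inputs, device_ids)
    replicas = replicas[:len(scattered)]
    outputs = torch.nn.parallel.parallel_apply(replicas, scattered)
    return torch.nn.parallel.gather(outputs, device_ids[0])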
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer, train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()
            running_loss += loss.item()

            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
                            hp.voc_target, hp.voc_overlap, paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer, name=ckpt_name, is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer, train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # set learning rate
    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1
    total_number_of_batches = len(train_set)

    # `model_name_prefix` and `EPOCH` (the epoch to resume from) are assumed to be
    # defined at module level.
    writer = SummaryWriter("runs/{0}-{1}".format(model_name_prefix,
                                                 datetime.now().strftime("%Y%m%d-%H%M%S")))
    scheduler = StepLR(optimizer, step_size=1, gamma=0.983)

    for e in range(EPOCH, epochs + 1):

        start = time.time()
        running_loss = 0.
        avg_loss = 0

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.voc_clip_grad_norm)

            optimizer.step()
            running_loss += loss.item()

            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            # Write to tensorboard per batch
            writer.add_scalar('Epoch loss', loss.item(), e * total_number_of_batches + i)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        """
        ####################### Testing ############################
        torch.cuda.empty_cache()
        loss_test = 0

        for _, (x_test, y_test, m_test) in enumerate(test_set, 1):
            x_test, m_test, y_test = x_test.to(device), m_test.to(device), y_test.to(device)

            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                raise RuntimeError("Unsupported")
            else:
                y_test_hat = model(x_test, m_test)

            if model.mode == 'RAW':
                y_test_hat = y_test_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y_test = y_test.float()

            y_test = y_test.unsqueeze(-1)

            loss_test += loss_func(y_test_hat, y_test).item()

        avg_loss_test = loss_test / len(test_set)

        msg = f'| Epoch: {e}/{epochs} | Test-Loss: {loss_test:.4f} | Test-AvgLoss: {avg_loss_test:.4f} | '
        stream("\n")
        stream(msg)

        writer.add_scalar('Test loss', loss_test, e)
        writer.add_scalar('Average test loss', avg_loss_test, e)
        ############################################################
        """

        # Write to tensorboard per epoch
        writer.add_scalar('Running loss', running_loss, e)
        writer.add_scalar('Average loss', avg_loss, e)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer,
                        name="{0}-epoch-{1}-loss-{2}".format(model_name_prefix, e, avg_loss),
                        is_silent=True)

        model.log(paths.voc_log, msg)
        print(' ')

        scheduler.step()
        print('Epoch:', e, 'LR:', scheduler.get_lr())
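# The StepLR(step_size=1, gamma=0.983) scheduler above is stepped once per epoch,
# so the learning rate decays geometrically: after e epochs it equals lr * 0.983**e.
# A small standalone check of that behaviour (illustrative only, not called by the
# training loop; the dummy parameter and optimizer are hypothetical):

def _lr_schedule_sanity_check(base_lr=1e-3, gamma=0.983, n_epochs=3):
    p = torch.zeros(1, requires_grad=True)
    opt = torch.optim.Adam([p], lr=base_lr)
    sched = StepLR(opt, step_size=1, gamma=gamma)
    for _ in range(n_epochs):
        opt.step()      # no gradients, so the parameter is untouched
        sched.step()    # multiplies the learning rate by gamma
    assert abs(opt.param_groups[0]['lr'] - base_lr * gamma ** n_epochs) < 1e-12
    return opt.param_groups[0]['lr']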
def train_loop(paths: Paths, model, optimizer, train_set, lr, train_steps, mel_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0
        dur_running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, mel_len, dur) in enumerate(train_set, 1):

            x, m, dur = x.to(device), m.to(device), dur.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m_hat, m_post_hat, dur_hat = data_parallel_workaround(model, x, m, dur)
            else:
                m_hat, m_post_hat, dur_hat = model(x, m, dur)

            lin_loss = F.l1_loss(m_hat, m)
            post_loss = F.l1_loss(m_post_hat, m)
            dur_loss = F.l1_loss(dur_hat, dur)

            loss = lin_loss + post_loss + dur_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.forward_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.forward_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += post_loss.item()
            avg_loss = running_loss / i
            dur_running_loss += dur_loss.item()
            dur_avg_loss = dur_running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'fast_speech_step{k}K'
                save_checkpoint('forward', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if mel_example in ids:
                idx = ids.index(mel_example)
                try:
                    seq = x[idx].tolist()
                    m_gen = model.generate(seq)
                    save_spectrogram(m_gen, paths.forward_mel_plot / f'{step}_gen', 600)
                except Exception:
                    traceback.print_exc()
                save_spectrogram(np_now(m_post_hat[idx]), paths.forward_mel_plot / f'{step}_gta', 600)
                save_spectrogram(np_now(m[idx]), paths.forward_mel_plot / f'{step}_target', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {avg_loss:#.4} ' \
                  f'| Duration Loss: {dur_avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        model.log(paths.forward_log, msg)
        save_checkpoint('forward', paths, model, optimizer, is_silent=True)
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            orig_attention = attention
            n = int(len(att_guides[0]) / r)

            # Reduce the guide matrices by the reduction factor r
            ga = [a[t] for a in att_guides for t in range(0, len(a), r)]
            assert n == len(attention[0])
            guided_attention = [ga[k:k + n] for k in range(0, len(ga), n)]

            # NOTE: converting the attention to numpy and back detaches it from the
            # autograd graph, so the attention term below contributes to the reported
            # loss value but not to the gradients.
            attention = np_now(attention)
            attention = [pad2d_nonzero(a, n, len(att_guides[0][0])) for a in attention]

            guided_attention = torch.tensor(guided_attention)
            guided_attention = guided_attention.to(device)

            attention = torch.tensor(attention)
            attention = attention.to(device)

            # Create attention mask (padded positions are marked with -1)
            attention_masks = torch.ne(attention, -1).type(torch.FloatTensor)
            attention_masks = attention_masks.to(device)

            multiply = torch.abs(attention * guided_attention) * attention_masks
            attention_loss = torch.sum(multiply)
            mask_sum = torch.sum(attention_masks)
            attention_loss /= mask_sum

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss + attention_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(orig_attention[idx][:, :160]), paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
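# The `att_guides` matrices consumed above are precomputed and shipped with each
# batch. A plausible generator for such guides (an assumption, following the
# standard guided-attention penalty of Tachibana et al. 2017; not necessarily how
# this repo builds them) is:

def _make_attention_guide(n_frames, n_phones, g=0.2):
    """Return an (n_frames, n_phones) weight matrix that is near 0 along the
    diagonal and approaches 1 away from it, so that multiplying it with the
    attention matrix penalises non-monotonic alignments."""
    t = np.arange(n_frames)[:, None] / n_frames   # decoder-frame axis, scaled to [0, 1)
    n = np.arange(n_phones)[None, :] / n_phones   # phoneme axis, scaled to [0, 1)
    return 1.0 - np.exp(-((n - t) ** 2) / (2.0 * g ** 2))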
def tts_train_loop_af_offline(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example, hp=None):
    # setattr(model, 'mode', 'attention_forcing')

    def smooth(d, eps=float(1e-10)):
        u = 1.0 / float(d.size()[2])
        return eps * u + (1 - eps) * d

    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss_out, running_loss_attn = 0, 0

        # Perform 1 epoch
        for i, (x, m, ids, _, attn_ref) in enumerate(train_set, 1):

            x, m, attn_ref = x.to(device), m.to(device), attn_ref.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m, False, attn_ref)
            else:
                m1_hat, m2_hat, attention = model(x, m, generate_gta=False, attn_ref=attn_ref)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            # attn_loss = F.kl_div(torch.log(smooth(attention)), smooth(attn_ref), reduction='mean')  # 'batchmean'
            attn_loss = F.l1_loss(smooth(attention), smooth(attn_ref))

            loss_out = m1_loss + m2_loss
            loss_attn = attn_loss * hp.attn_loss_coeff
            loss = loss_out + loss_attn

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss_out += loss_out.item()
            avg_loss_out = running_loss_out / i
            running_loss_attn += loss_attn.item()
            avg_loss_attn = running_loss_attn / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attn_ref[idx][:, :160]), paths.tts_attention / f'{step}_tf')
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}_af')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss_out: {avg_loss_out:#.4}; Output_attn: {avg_loss_attn:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
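# The epsilon flooring in `smooth` is what makes the commented-out KL variant safe:
# without it, torch.log would see exact zeros in the attention matrix. A standalone
# illustration of the two attention losses (assumed shape (batch, decoder_steps,
# encoder_steps); the random tensors here are purely illustrative):

def _attn_loss_demo(batch=2, dec_steps=5, enc_steps=7, eps=1e-10):
    attn = torch.rand(batch, dec_steps, enc_steps)
    attn = attn / attn.sum(dim=2, keepdim=True)        # rows sum to 1, like attention weights
    attn_ref = torch.rand(batch, dec_steps, enc_steps)
    attn_ref = attn_ref / attn_ref.sum(dim=2, keepdim=True)

    u = 1.0 / enc_steps

    def floor(d):
        return eps * u + (1 - eps) * d                 # same flooring as `smooth` above

    l1 = F.l1_loss(floor(attn), floor(attn_ref))
    kl = F.kl_div(torch.log(floor(attn)), floor(attn_ref), reduction='batchmean')
    return l1, kl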
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example, max_y, max_x):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            # Load the precomputed attention guides and reduce them by the factor r
            reduced_guides = []
            att_guide_path = hp.attention_path
            for j, item_id in enumerate(ids):
                att = np.load(f'{att_guide_path}/{item_id}.npy')
                reduced = att[0::r]

                pred_attention = attention[j]
                n_frames = pred_attention.shape[0]
                n_phones = pred_attention.shape[-1]

                padded_guides = pad2d_nonzero(reduced, n_frames, n_phones)
                reduced_guides.append(padded_guides)

            reduced_guides = torch.tensor(reduced_guides)

            # Mask out padded positions (marked with -1)
            mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor)

            padded_guides = [pad2d_zero(guide, n_frames, n_phones) for guide in reduced_guides]
            padded_guides = torch.tensor(padded_guides)
            padded_guides = padded_guides.to(device)

            attention = attention.to(device)
            mask = mask.to(device)

            attention = attention * mask

            print("guide att shape", att.shape)
            print("reduced guide", padded_guides.shape)
            print("mask", mask.shape)

            multiply = torch.pow((attention - padded_guides), 2)

            attention_loss = torch.sum(multiply)
            mask_sum1 = torch.sum(mask)
            attention_loss /= mask_sum1
            print("attention loss", attention_loss)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            prev_loss = m1_loss + m2_loss
            print("prev loss", prev_loss)
            loss = m1_loss + m2_loss + attention_loss
            print("loss + att", loss)

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')