def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool,
          save_every: int, backup_every: int, force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode
    ).cuda()

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr
    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss

    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta")
    wav_dir = syn_dir.joinpath("audio")
    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
    test_loader = DataLoader(dataset,
                             batch_size=1,
                             shuffle=True,
                             pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, 350):
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(data_loader, 1):
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            # Forward pass
            y_hat = model(x, m)
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)

            # Backward pass
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(weights_fpath, optimizer)

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                  f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                  f"steps/s | Step: {k}k | "
            stream(msg)

        gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
                    hp.voc_target, hp.voc_overlap, model_dir)
        print("")
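
# A minimal standalone sketch of the hop-length check above, assuming the values
# noted in the second variant's inline comments (hop_length=300). The product of
# the upsample factors must equal the hop length, because the conditioning mels
# are upsampled by that total factor to reach one frame per audio sample. The
# factor tuple here is illustrative, not taken from this repo's hparams.
import numpy as np

upsample_factors = (5, 5, 12)   # hypothetical: 5 * 5 * 12 == 300
hop_length = 300
assert np.cumprod(upsample_factors)[-1] == hop_length
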
def train(run_id='', syn_dir=None, voc_dirs=(), mel_dir_name='', models_dir=None, log_dir='',
          ground_truth=False, save_every=1000, backup_every=1000, log_every=1000,
          force_restart=False, total_epochs=10000, logger=None):
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,                   # 512
        fc_dims=hp.voc_fc_dims,                     # 512
        bits=hp.bits,                               # 9
        pad=hp.voc_pad,                             # 2
        upsample_factors=hp.voc_upsample_factors,   # (3, 4, 5, 5) -> 300, (5, 5, 12)?
        feat_dims=hp.num_mels,                      # 80
        compute_dims=hp.voc_compute_dims,           # 128
        res_out_dims=hp.voc_res_out_dims,           # 128
        res_blocks=hp.voc_res_blocks,               # 10
        hop_length=hp.hop_length,                   # 300
        sample_rate=hp.sample_rate,                 # 24000
        mode=hp.voc_mode                            # RAW (or MOL)
    ).cuda()
    # hp.apply_preemphasis in VocoderDataset
    # hp.mu_law in VocoderDataset
    # hp.voc_seq_len in VocoderDataset
    # hp.voc_lr in optimizer
    # hp.voc_batch_size for train

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr  # 0.0001
    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss

    # Load the weights
    model_dir = models_dir.joinpath(run_id)  # gta_model/gtaxxxx
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")  # gta_model/gtaxxx/gtaxxx.pt
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(str(weights_fpath), optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(str(weights_fpath), optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)

    # Initialize the dataset
    # metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
    #     voc_dir.joinpath("synthesized.txt")
    # mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta")
    # wav_dir = syn_dir.joinpath("audio")
    # dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
    # dataset = VocoderDataset(str(voc_dir), 'mels-gta-1099579078086', 'audio')
    dataset = VocoderDataset([str(voc_dir) for voc_dir in voc_dirs], mel_dir_name, 'audio')
    # test_loader = DataLoader(dataset,
    #                          batch_size=1,
    #                          shuffle=True,
    #                          pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, total_epochs):
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=30,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        # start from 1
        for i, (x, y, m) in enumerate(data_loader, 1):
            # cur [B, L], future [B, L] bit label, mels [B, D, T]
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            # Forward pass: [B, L], [B, D, T] -> [B, L, C]
            y_hat = model(x, m)
            if model.mode == 'RAW':
                # [B, L, C] -> [B, C, L, 1]
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            # [B, L, 1]
            y = y.unsqueeze(-1)

            # Backward pass: [B, C, L, 1], [B, L, 1]
            # cross_entropy for RAW
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(str(model_dir), optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(str(weights_fpath), optimizer)

            if log_every != 0 and step % log_every == 0:
                logger.scalar_summary("loss", loss.item(), step)

            msg = ("| Epoch: {epoch} ({i}/{total_data}) | "
                   "Loss: {avg_loss:.4f} | {speed:.1f} "
                   "steps/s | Step: {k}k | ").format(epoch=epoch, i=i,
                                                     total_data=len(data_loader),
                                                     avg_loss=avg_loss, speed=speed, k=k)
            stream(msg)

        # gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
        #             hp.voc_target, hp.voc_overlap, model_dir)
        print("")
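
# The variant above accepts logger=None and calls logger.scalar_summary("loss",
# loss.item(), step), but the logger class is not defined in this section. A
# minimal compatible sketch, assuming a TensorBoard backend; the class name and
# method signature are inferred from the call site, not taken from this repo.
from torch.utils.tensorboard import SummaryWriter

class TensorBoardLogger:
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir=log_dir)

    def scalar_summary(self, tag, value, step):
        # One named scalar per logged training step, as the call site expects
        self.writer.add_scalar(tag, value, step)

# e.g. train(run_id='gta0001', ..., logger=TensorBoardLogger('logs/gta0001'))
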
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool,
          save_every: int, backup_every: int, force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    # assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    # model = WaveRNN(
    #     rnn_dims=hp.voc_rnn_dims,
    #     fc_dims=hp.voc_fc_dims,
    #     bits=hp.bits,
    #     pad=hp.voc_pad,
    #     upsample_factors=hp.voc_upsample_factors,
    #     feat_dims=hp.num_mels,
    #     compute_dims=hp.voc_compute_dims,
    #     res_out_dims=hp.voc_res_out_dims,
    #     res_blocks=hp.voc_res_blocks,
    #     hop_length=hp.hop_length,
    #     sample_rate=hp.sample_rate,
    #     mode=hp.voc_mode
    # ).cuda()
    model = model_VC(32, 256, 512, 32).cuda()

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr
    loss_recon = nn.MSELoss()
    loss_content = nn.L1Loss()

    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of AutoVC from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("AutoVC weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta")
    wav_dir = syn_dir.joinpath("audio")
    # 2019.11.26
    embed_dir = syn_dir.joinpath("embeds")
    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir, embed_dir)
    test_loader = DataLoader(dataset,
                             batch_size=1,
                             shuffle=True,
                             pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, 350):
        model.train()
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (m, e, _) in enumerate(data_loader, 1):
            model.train()
            m, e = m.cuda(), e.cuda()

            # Forward pass. Observed shapes:
            #   c_org:           [100, 256, 1]
            #   x:               [100, 80, 544]
            #   c_org_expand:    [100, 256, 544]
            #   encoder_outputs: [100, 544, 320]
            #   C:               [100, 544, 64]
            #   X:               [100, 1, 544, 80]
            C, X_C, X_before, X_after, _ = model(m, e, e)
            X_after = X_after.squeeze(1).permute(0, 2, 1)
            X_before = X_before.squeeze(1).permute(0, 2, 1)

            # Backward pass: reconstruction losses before/after the postnet,
            # plus the content-code consistency loss
            loss_rec_before = loss_recon(X_before, m)
            loss_rec_after = loss_recon(X_after, m)
            loss_c = loss_content(C, X_C)
            loss = loss_rec_before + loss_rec_after + loss_c
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            if hp.decay_learning_rate:
                for p in optimizer.param_groups:
                    p["lr"] = _learning_rate_decay(p["lr"], step)
            k = step // 1000

            if step % 100 == 0 and step != 0:
                model.eval()
                plt.figure(1)
                C, X_C, X_before, X_after, _ = model(m, e, e)
                X_after = X_after.squeeze(1).permute(0, 2, 1)
                mel_out = X_after.detach().cpu().numpy()
                from synthesizer import audio
                from synthesizer.hparams import hparams
                wav = audio.inv_mel_spectrogram(mel_out[0, :, :], hparams)
                librosa.output.write_wav("out.wav", np.float32(wav), hparams.sample_rate)
                mel_out = mel_out[0, :, :].transpose(1, 0)
                plt.imshow(mel_out.T, interpolation='nearest', aspect='auto')
                plt.title("Generated Spectrogram")
                save_path = model_dir
                p_path = save_path.joinpath("generate.png")
                plt.savefig(p_path)

                plt.figure(2)
                m_out = m.detach().cpu().numpy()
                m_out = m_out[0, :, :].transpose(1, 0)
                plt.imshow(m_out.T, interpolation='nearest', aspect='auto')
                plt.title("Original Spectrogram")
                o_path = save_path.joinpath("original.png")
                plt.savefig(o_path)

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(weights_fpath, optimizer)
                torch.save(model, "model_ttsdb_48_48.pkl")

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                  f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                  f"steps/s | Step: {k}k | "
            stream(msg)

        # gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
        #             hp.voc_target, model_dir)
        print("")
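
# _learning_rate_decay is called above with the *current* learning rate but is
# not defined in this section. A minimal sketch consistent with that call site:
# a per-call exponential decay with a floor. The decay constant and floor are
# illustrative assumptions, not values from this repo; step is kept only to
# match the call signature.
def _learning_rate_decay(current_lr, step, decay=0.9999, min_lr=1e-5):
    # Repeated calls compound, so the rate shrinks geometrically per step
    return max(current_lr * decay, min_lr)
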
def gen_display(self, i, seq_len, b_size, gen_rate):
    pbar = progbar(i, seq_len)
    # msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
    msg = '| %s %s/%s | Batch Size: %s | Gen Rate: %.1fkHz | ' % (
        pbar, i * b_size, seq_len * b_size, b_size, gen_rate)
    stream(msg)
def gen_display(self, i, seq_len, b_size, gen_rate):
    pbar = progbar(i, seq_len)
    msg = '| %s %d/%d | Batch Size: %d | Gen Rate: %.1fkHz | ' % (
        pbar, i * b_size, seq_len * b_size, b_size, gen_rate)
    stream(msg)
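
# A standalone illustration of the message format above (the progress-bar string
# is a stand-in for progbar's output): with i=50, seq_len=100, b_size=8 and
# gen_rate=12.3, the %d variant prints the line shown in the trailing comment.
i, seq_len, b_size, gen_rate = 50, 100, 8, 12.3
print('| %s %d/%d | Batch Size: %d | Gen Rate: %.1fkHz | ' % (
    '[=====     ]', i * b_size, seq_len * b_size, b_size, gen_rate))
# -> | [=====     ] 400/800 | Batch Size: 8 | Gen Rate: 12.3kHz |
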