def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100, check=True):
    """Synthesize a wav for *text_seq* via the model + Griffin-Lim, save it,
    and optionally score it with WERCER.

    Args:
        text_seq: input text; the trailing character is dropped before cleaning.
        model: acoustic model returning (mel, mel_postnet) for (text, pos).
        alpha: speed factor forwarded to the model.
        mode: suffix tag appended to the output file name.
        num: numeric tag appended to the output file name.
        check: when True, run WERCER on the saved wav; otherwise scores are 0.

    Returns:
        (base_name, x, y) where x/y come from WERCER (presumably WER/CER —
        TODO confirm against WERCER's definition), or 0 when check is False.
    """
    text_seq = text_seq[:-1]  # drop trailing char (presumably punctuation/EOS marker)
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]  # pad with terminal token
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)
    # NOTE: removed a dead `sequence = ...[None, 1]` line — it was never used
    # and its indexing ([None, 1] rather than [None, :]) was wrong anyway.

    # 1-based positional indices for the encoder.
    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    # makedirs(exist_ok=True) avoids the check-then-create race of exists()+mkdir().
    os.makedirs("results_kor_0730_nam_95000", exist_ok=True)

    new_name = text_seq.replace(" ", "_").replace("?", "_")
    new_name = new_name[:-1]
    new_name2 = new_name + str(num) + mode + ".wav"
    new_name3 = "results_kor_0730_nam_95000/" + new_name2

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet], file_name=new_name)

    # Only the Griffin-Lim inversion step is timed here, not the model pass.
    start = int(round(time.time() * 1000))
    wav = audio.inv_mel_spectrogram(mel_postnet)
    end = int(round(time.time() * 1000))

    audio.save_wav(wav, os.path.join("results_kor_0730_nam_95000", new_name2))

    clean_text = new_name.replace("_", " ")
    if check:
        x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)])
    else:
        x = 0
        y = 0
    print("Total time : ", end - start)
    print()
    return new_name, x, y
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100):
    """Run the model on *text_seq*, invert the postnet mel with Griffin-Lim,
    and save the wav under results/; returns the base file name."""
    seq = text_to_sequence(text_seq, hp.hparams.text_cleaners) + [0]
    text = torch.from_numpy(np.stack([np.array(seq)])).long().to(device)
    pos = torch.stack([torch.Tensor(list(range(1, text.size(1) + 1)))])
    pos = pos.long().to(device)

    # Time the full mel-generation pass (eval switch + inference).
    start = int(round(time.time() * 1000))
    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)
    end = int(round(time.time() * 1000))
    tt = end - start
    print("Total - making mel : %d ms\n" % tt)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    #plot_data([mel, mel_postnet])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav Have Been Synthesized.\n")

    if not os.path.exists("results"):
        os.mkdir("results")
    new_name = text_seq.replace(" ", "_")
    audio.save_wav(
        wav, os.path.join("results", new_name + str(num) + mode + ".wav"))
    return new_name
def get_tacotron2_alignment_test(text_seq):
    """Load a pretrained Tacotron2 checkpoint, synthesize *text_seq*, save the
    wav, register the alignment via get_D, and return the alignment matrix."""
    hparams = hp_tacotron2.create_hparams()
    hparams.sampling_rate = hp.sample_rate

    ckpt = os.path.join("Tacotron2", os.path.join("outdir", "checkpoint_51000"))
    tacotron2 = train_tacotron2.load_model(hparams)
    tacotron2.load_state_dict(torch.load(ckpt)["state_dict"])
    _ = tacotron2.cuda().eval().half()  # half precision for inference

    sequence = np.array(
        text_to_sequence(text_seq, hp.hparams.text_cleaners))[None, :]
    print("sequence size", np.shape(sequence))
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel, mel_postnet, _, alignment = tacotron2.inference(sequence)
    wav = audio.inv_mel_spectrogram(mel_postnet.float().data.cpu().numpy()[0])
    file_name = text_seq.replace(" ", "_")
    audio.save_wav(wav, "%s.wav" % file_name)

    alignment = alignment.float().data.cpu().numpy()[0]
    print("alignment size", np.shape(alignment))
    get_D(alignment)
    return alignment
def __getitem__(self, index):
    """Return (encoded_text, (mel, linear)) for the metadata row at *index*."""
    row = self.meta_data.iloc[index]
    sample_id = row['id']  # renamed: original local shadowed builtin `id`
    input_seq = text_to_sequence(row['text'])
    mels = np.load(f'{self.path}/mels/{sample_id}')
    linears = np.load(f'{self.path}/linears/{sample_id}')
    return input_seq, (mels, linears)
def synthesis_waveglow(text_seq, model, waveglow, alpha=1.0, mode=""):
    """Synthesize *text_seq* with the acoustic model and vocode the postnet
    mel with WaveGlow; the wav is saved under results/."""
    seq = text_to_sequence(text_seq, hp.hparams.text_cleaners) + [0]
    text = torch.from_numpy(np.stack([np.array(seq)])).long().to(device)
    pos = torch.stack(
        [torch.Tensor([i + 1 for i in range(text.size(1))])]).long().to(device)

    model.eval()
    with torch.no_grad():
        _, mel_postnet = model(text, pos, alpha=alpha)
    with torch.no_grad():
        wav = waveglow.infer(mel_postnet, sigma=0.666)
    print("Wav Have Been Synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav[0].data.cpu().numpy(),
                   os.path.join("results", text_seq + mode + ".wav"))
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode=""):
    """Synthesize *text_seq*, plot both mels, invert the postnet mel with
    Griffin-Lim, and save the wav under results/."""
    encoded = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    encoded = encoded + [0]
    text = torch.from_numpy(np.stack([np.array(encoded)])).long().to(device)
    positions = [i + 1 for i in range(text.size(1))]
    pos = torch.stack([torch.Tensor(positions)]).long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav Have Been Synthesized.")
    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav, os.path.join("results", text_seq + mode + ".wav"))
def __getitem__(self, idx):
    """Return one training sample.

    Without precomputed targets (hp.pre_target falsy):
        {"text": int array, "mel": mel array}
    With precomputed targets:
        {"text": ..., "mel": ..., "alignment": alignment array}

    Cleanup: removed the unused local `index = idx + 1` and the large body of
    commented-out debug/naming-scheme code that referenced it.
    """
    mel_name = os.path.join(self.dataset_path, self.paths[idx])
    mel_np = np.load(mel_name)

    character = text_to_sequence(self.text[idx], hp.hparams.text_cleaners)
    character = np.array(character)

    if not hp.pre_target:
        return {"text": character, "mel": mel_np}

    # Precomputed alignment targets are stored in parallel with the mels.
    align_path = os.path.join(hp.alignment_target_path, self.path_as[idx])
    alignment = np.load(align_path)
    return {"text": character, "mel": mel_np, "alignment": alignment}
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100, check=True, cute=False):
    """Synthesize *text_seq* via Griffin-Lim and optionally score it with WERCER.

    Args:
        text_seq: raw input text.
        model: acoustic model returning (mel, mel_postnet) for (text, pos).
        alpha: speed factor forwarded to the model.
        mode: accepted for signature compatibility; not used in the file name here.
        num: numeric tag appended to the output file name.
        check: when True, run WERCER on the saved wav; otherwise scores are 0.
        cute: when True, invert the raw postnet mel directly ("high-pitched"
              variant); otherwise use the taco STFT Griffin-Lim path.

    Returns:
        (base_name, x, y) where x/y come from WERCER (presumably WER/CER —
        TODO confirm), or 0 when check is False.
    """
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]  # pad with terminal token
    text = np.stack([np.array(text)])
    # Bug fix: cast to long before moving to the device — every sibling
    # variant does `.long().to(device)`; this one skipped the cast.
    text = torch.from_numpy(text).long().to(device)
    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    # makedirs(exist_ok=True) avoids the check-then-create race.
    os.makedirs("results_kor_0730_indiv", exist_ok=True)

    new_name = text_seq.replace(" ", "_").replace("?", "_")
    # Bug fix: the "_cute" tag used to be overwritten by the very next
    # assignment, so cute runs silently reused the normal file name.
    if cute:
        new_name2 = new_name + "_cute" + str(num) + ".wav"
    else:
        new_name2 = new_name + str(num) + ".wav"
    new_name3 = "results_kor_0730_indiv/" + new_name2

    if cute:  # high-pitched sound: invert the raw postnet mel
        mel_postnet = mel_postnet[0].cpu().numpy().T
    else:
        mel_postnet = mel_postnet.data.cpu().numpy()[0].T
        # Drop the final frame. NOTE(review): the appended block has ZERO
        # columns, so nothing is added back — possibly (80, 1) of -4.0
        # (silence) was intended; kept as-is to preserve behavior.
        mel_postnet = mel_postnet[:, :-1]
        mel_postnet = np.append(mel_postnet,
                                np.ones((80, 0), dtype=np.float32) * -4.0,
                                axis=1)

    mel = mel[0].cpu().numpy().T
    plot_data([mel, mel_postnet], file_name=new_name)

    if cute:
        wav = audio.inv_mel_spectrogram(mel_postnet)
    else:
        stft = audio.taco_stft()
        wav = mels_to_wavs_GL([mel_postnet], stft)

    # Bug fix: save under new_name2 so the file matches the new_name3 path
    # that WERCER reads (they diverged when cute=True).
    audio.save_wav(wav, os.path.join("results_kor_0730_indiv", new_name2))

    clean_text = new_name.replace("_", " ")
    if check:
        x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)])
    else:
        x = 0
        y = 0
    return new_name, x, y
def get_text(self, text):
    """Encode *text* with the configured cleaners and return an IntTensor."""
    sequence = text_to_sequence(text, self.text_cleaners)
    return torch.IntTensor(sequence)
        break
    # Tail of a function whose definition is outside this chunk — indentation
    # reconstructed; the `break` presumably exits a line-reading loop.
    return txt


if __name__ == "__main__":
    # Test: dump one Tacotron2 alignment target per training utterance.
    tacotron2 = get_tacotron2()
    text_path = os.path.join('dataset/nam', "train.txt")
    text = process_text(text_path)
    i = 0
    for i in range(len(text)):
        # Encode the utterance and add a batch dimension for inference.
        text_seq = np.array(text_to_sequence(text[i], ['korean_cleaners']))[None, :]
        text_seq = torch.from_numpy(text_seq)
        alignment = get_one_alignment(text_seq, tacotron2)
        file_name = "%d.npy" % i
        dir_path = "./alignment_targets_nam"
        file_path = os.path.join(dir_path, file_name)
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        if not os.path.exists(file_path):
            # Touch the file before np.save (np.save would create it anyway).
            f = open(file_path, 'a+')
            f.close()
        np.save(file_path, alignment)
        if i % 100 == 0:
            print("current step : %d\n" % i)