def __getitem__(self, idx):
    """Build the training sample stored at row ``idx`` of ``self.metas``.

    Column 0 of ``self.metas`` is read as the utterance name, column 1 as
    the raw text and column 2 as the normalized text.

    Args:
        idx: Integer row position into ``self.metas``.

    Returns:
        dict with the keys ``name``, ``text``, ``text_norm``,
        ``text_encoded``, ``text_pos``, ``text_len`` and ``sr``.  Unless
        ``self.exclude_mels`` is set, ``wav``, ``mel`` and ``mel_len`` are
        added; when ``self.aligns_path`` is set, ``align`` is added too.

    Raises:
        ValueError: if the sample rate reported by ``librosa.load`` differs
            from ``self.sr``.
    """
    name = self.metas.iloc[idx, 0]
    path = "{}/wavs/{}.wav".format(self.root_path, name)

    # Text normalization
    text = self.metas.iloc[idx, 1]
    text_norm = self.metas.iloc[idx, 2]
    text_encoded = np.array(text_to_sequence(text_norm, self.text_cleaner))
    # 1-based position of every encoded symbol.
    text_pos = np.array(list(range(1, len(text_encoded) + 1)))

    data = {
        "name": name,
        "text": text,
        "text_norm": text_norm,
        "text_encoded": text_encoded,
        "text_pos": text_pos,
        "text_len": text_encoded.shape[-1],
        "sr": self.sr,
    }

    if not self.exclude_mels:
        wav, sr = librosa.load(path, sr=self.sr)  # wav is [-1.0, 1.0]
        if sr != self.sr:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sr, self.sr))

        # Audio processing
        wav, _ = librosa.effects.trim(wav,
                                      frame_length=self.win_len,
                                      hop_length=self.hop_len)

        if self.mels_path:
            # Precomputed mel is used as stored on disk
            # (presumably already compressed -- TODO confirm).
            mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
        else:
            # The waveform must be passed as ``y=``: recent librosa releases
            # reject positional arguments to ``melspectrogram``.
            mel = librosa.feature.melspectrogram(y=wav,
                                                 sr=sr,
                                                 n_fft=self.n_fft,
                                                 win_length=self.win_len,
                                                 hop_length=self.hop_len,
                                                 n_mels=self.n_mels,
                                                 fmin=self.mel_fmin,
                                                 fmax=self.mel_fmax,
                                                 power=1.0)
            mel = audio.dynamic_range_compression(mel)

        data.update({
            "wav": wav,
            "mel": mel,
            "mel_len": mel.shape[-1],
        })

    if self.aligns_path:
        data['align'] = np.load(
            os.path.join(self.aligns_path, name + ".align.npy"))

    return data
def __getitem__(self, idx):
    """Return the encoded text sample stored at position ``idx``.

    The entry of ``self.texts`` is encoded with ``text_to_sequence`` using
    ``self.text_cleaner``.  The same string is reported under both
    ``text`` and ``text_norm``.

    Returns:
        dict with the keys ``text``, ``text_norm``, ``text_encoded`` and
        ``text_pos`` (1-based positions of the encoded symbols).
    """
    raw_text = self.texts[idx]

    # Text normalization
    symbol_ids = text_to_sequence(raw_text, self.text_cleaner)
    encoded = np.array(symbol_ids)
    positions = np.array(list(range(1, len(encoded) + 1)))

    sample = {}
    sample["text"] = raw_text
    sample["text_norm"] = raw_text
    sample["text_encoded"] = encoded
    sample["text_pos"] = positions
    return sample
# Maps every supported voice name to its Tacotron 2 checkpoint file name.
_VOICE_CHECKPOINTS = {
    "papaito": "nvidia_tacotron2_papaito_300",
    "constantino": "tacotron2_Constantino_600",
    "orador": "checkpoint_tacotron2_29000_es",
}


def synthesize(text, voice, sigma=0.6, denoiser_strength=0.1, is_fp16=False):
    """Synthesize ``text`` with the Tacotron 2 checkpoint chosen by ``voice``.

    The text is encoded with the ``english_cleaners`` pipeline, turned into
    a mel spectrogram by Tacotron 2, vocoded by WaveGlow and optionally
    denoised.  Both networks are loaded from disk on every call and moved
    to CUDA in half precision.

    Args:
        text: Text to synthesize.
        voice: One of ``"papaito"``, ``"constantino"`` or ``"orador"``.
        sigma: Value forwarded to ``waveglow.infer``.
        denoiser_strength: Strength forwarded to the denoiser; the denoiser
            is skipped when this is not greater than 0.
        is_fp16: When true, the mel spectrogram is explicitly cast to half
            precision before vocoding.  NOTE(review): both models are
            always cast with ``.half()`` here, so this flag probably has
            no practical effect -- confirm.

    Returns:
        Tuple ``(audio, sampling_rate)`` where ``audio`` is an ``int16``
        NumPy array and ``sampling_rate`` is 22050.

    Raises:
        ValueError: if ``voice`` is not a supported voice name.
    """
    # Validate first so an unknown voice fails with a clear message before
    # any model is loaded (previously this surfaced as UnboundLocalError).
    if voice not in _VOICE_CHECKPOINTS:
        raise ValueError("Unknown voice {!r}; expected one of: {}".format(
            voice, ", ".join(sorted(_VOICE_CHECKPOINTS))))
    voice_model = _VOICE_CHECKPOINTS[voice]

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    checkpoint_path = "/home/debian/workspace/models/" + voice_model
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.cuda().eval().half()

    waveglow_path = '/home/debian/workspace/models/waveglow_256channels_ljs_v2.pt'
    waveglow = torch.load(waveglow_path, map_location='cuda')['model']
    waveglow.cuda().eval().half()
    denoiser = Denoiser(waveglow)

    # Add a leading batch axis of size 1 to the encoded symbol ids.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Pure inference: keep autograd off for both networks so no graph is
    # recorded and no activations are kept alive.
    with torch.no_grad():
        mel_outputs, _, _, _ = model.inference(sequence)
        mel = mel_outputs.half() if is_fp16 else mel_outputs

        audio = waveglow.infer(mel, sigma=sigma)
        if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * hparams.max_wav_value

    audio = audio.squeeze().cpu().numpy().astype('int16')
    return audio, hparams.sampling_rate
#checkpoint_path = "output/checkpoint_29000" checkpoint_path = args.checkpoint_path model = load_model(hparams) model.load_state_dict(torch.load(checkpoint_path)['state_dict']) _ = model.cuda().eval().half() #waveglow_path = '/media/debian/SSD_USB/models/waveglow_256channels_ljs_v2.pt' waveglow = torch.load(args.waveglow_path)['model'] _ = waveglow.cuda().eval().half() denoiser = Denoiser(waveglow) #text="¡Cágate lorito!" with open(args.filelist_path, encoding='utf-8', mode='r') as f: text = f.read() sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) #mel = torch.unsqueeze(mel, 0) mel = mel_outputs.half() if args.is_fp16 else mel_outputs with torch.no_grad(): audio = waveglow.infer(mel, sigma=args.sigma) if args.denoiser_strength > 0: audio = denoiser(audio, args.denoiser_strength) audio = audio * MAX_WAV_VALUE audio = audio.squeeze() audio = audio.cpu().numpy() audio = audio.astype('int16') file_name = "audio" audio_path = os.path.join(args.output_dir,