def say(self, text, output):
    # build the model
    model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq,
                     self.CONFIG.num_mels, self.CONFIG.r)
    # load the audio processor
    ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                        self.CONFIG.min_level_db, self.CONFIG.frame_shift_ms,
                        self.CONFIG.frame_length_ms, self.CONFIG.ref_level_db,
                        self.CONFIG.num_freq, self.CONFIG.power,
                        self.CONFIG.preemphasis, 60)  # 60 griffin-lim iterations
    # load model state
    if self.use_cuda:
        cp = torch.load(self.MODEL_PATH)
    else:
        cp = torch.load(self.MODEL_PATH,
                        map_location=lambda storage, loc: storage)
    # load the model weights
    model.load_state_dict(cp['model'])
    if self.use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 400
    wavs = self.text2audio(text, model, self.CONFIG, self.use_cuda, ap)
    audio = np.concatenate(wavs)
    ap.save_wav(audio, output)
    return
def tts(text, model_path='model/best_model.pth.tar',
        config_path='model/config.json', use_cuda=False):
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)
    # load model state
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels,
                        CONFIG.min_level_db, CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms, CONFIG.ref_level_db,
                        CONFIG.num_freq, CONFIG.power, CONFIG.preemphasis,
                        griffin_lim_iters=50)
    t_1 = time.time()
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    linear_out = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)
    # print(" > Run-time: {}".format(time.time() - t_1))
    return file_name
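A minimal usage sketch for the tts() helper above; the checkpoint and config paths are illustrative assumptions and must point at a real Tacotron checkpoint trained with a matching config.

# hypothetical call; writes the wav under static/samples/ and returns the file name
file_name = tts("Hello world.",
                model_path='model/best_model.pth.tar',
                config_path='model/config.json',
                use_cuda=False)
print("wav written to static/samples/" + file_name)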
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs):
        items = ljspeech(c.data_path, "metadata.csv")
        dataset = TTSDataset.MyDataset(
            r,
            c.text_cleaner,
            compute_linear_spec=True,
            ap=self.ap,
            meta_data=items,
            tp=c.characters,
            batch_group_size=bgs,
            min_seq_len=c.min_seq_len,
            max_seq_len=float("inf"),
            use_phonemes=False,
        )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)
                assert check_count == 0, \
                    " !! Negative values in text_input: {}".format(check_count)
                # TODO: more assertions here
                assert isinstance(speaker_name[0], str)
                assert linear_input.shape[0] == c.batch_size
                assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
                assert mel_input.shape[0] == c.batch_size
                assert mel_input.shape[2] == c.audio["num_mels"]
                # check normalization ranges
                if self.ap.symmetric_norm:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                    assert mel_input.min() < 0
                else:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= 0

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.items
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]
                avg_length = mel_lengths.numpy().mean()
                assert avg_length >= last_length
            dataloader.dataset.sort_items()
            is_items_reordered = False
            for idx, item in enumerate(dataloader.dataset.items):
                if item != frames[idx]:
                    is_items_reordered = True
                    break
            assert is_items_reordered

    def test_padding_and_spec(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                # check mel_spec consistency
                wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
                mel = self.ap.melspectrogram(wav).astype("float32")
                mel = torch.FloatTensor(mel).contiguous()
                mel_dl = mel_input[0]
                # NOTE: Below needs to check == 0 but due to an unknown reason
                # there is a slight difference between the two matrices.
                # TODO: Check this assert cond more in detail.
                assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max()

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_melspectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")

                # check that the last time step is not zero padded (batch of 1)
                assert linear_input[0, -1].sum() != 0
                assert linear_input[0, -2].sum() != 0
                assert mel_input[0, -1].sum() != 0
                assert mel_input[0, -2].sum() != 0
                assert stop_target[0, -1] == 1
                assert stop_target[0, -2] == 0
                assert stop_target.sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[0] == linear_input[0].shape[0]
                assert mel_lengths[0] == mel_input[0].shape[0]

            # Test for batch size 2
            dataloader, dataset = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                # set idx to the longest sequence in the batch
                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the first (longer) item in the batch
                assert linear_input[idx, -1].sum() != 0
                assert linear_input[idx, -2].sum() != 0, linear_input
                assert mel_input[idx, -1].sum() != 0
                assert mel_input[idx, -2].sum() != 0, mel_input
                assert stop_target[idx, -1] == 1
                assert stop_target[idx, -2] == 0
                assert stop_target[idx].sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[idx] == mel_input[idx].shape[0]
                assert mel_lengths[idx] == linear_input[idx].shape[0]

                # check the second item in the batch
                assert linear_input[1 - idx, -1].sum() == 0
                assert mel_input[1 - idx, -1].sum() == 0
                assert stop_target[1, mel_lengths[1] - 1] == 1
                assert stop_target[1, mel_lengths[1]:].sum() == 0
                assert len(mel_lengths.shape) == 1
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        self.vocoder_model = None  # set by load_vocoder(); None means no native vocoder
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config, self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file,
                            self.config.pwgan_config, self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model weights
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it.
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
        if lib_path:
            # set this if ParallelWaveGAN is not installed globally
            sys.path.append(lib_path)
        try:
            #pylint: disable=import-outside-toplevel
            from parallel_wavegan.models import ParallelWaveGANGenerator
        except ImportError as e:
            raise RuntimeError(
                f"cannot import parallel-wavegan, either install it or set its "
                f"directory using the --pwgan_lib_path command line argument: {e}")
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    @staticmethod
    def split_into_sentences(text):
        text = " " + text + " <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(None, [s.strip() for s in sentences]))  # remove empty sentences
        return sentences

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
                self.tts_model, inputs, self.tts_config, False, speaker_id, None)
            # convert outputs to numpy
            if self.vocoder_model:
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input.cuda()
                wav = self.wavernn.generate(
                    vocoder_input,
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
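A minimal driver sketch for the config-based Synthesizer above, assuming a config object (such as the AttrDict returned by load_config) that carries the attributes read in __init__ (use_cuda, tts_checkpoint, tts_config, vocoder_checkpoint, wavernn_lib_path, pwgan_file, and friends); the config path is hypothetical.

config = load_config("server/conf.json")  # assumed to expose the fields listed above
synthesizer = Synthesizer(config)
out = synthesizer.tts("Hello, this is a test.")  # returns an io.BytesIO holding wav data
with open("output.wav", "wb") as f:
    f.write(out.getvalue())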
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf)

    def test_audio_synthesis(self):
        """1. load wav
        2. set normalization parameters
        3. extract mel-spec
        4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(WAV_FILE)
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_melspectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format(
                max_norm, signal_norm, symmetric_norm, clip_norm)
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUT_PATH + file_name)

        # maxnorm = 1.0
        _test(1.0, False, False, False)
        _test(1.0, True, False, False)
        _test(1.0, True, True, False)
        _test(1.0, True, False, True)
        _test(1.0, True, True, True)
        # maxnorm = 4.0
        _test(4.0, False, False, False)
        _test(4.0, True, False, False)
        _test(4.0, True, True, False)
        _test(4.0, True, False, True)
        _test(4.0, True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and consistency."""
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(WAV_FILE)
        # normalize audio to get a better normalization range below
        wav = self.ap.sound_norm(wav)
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

    def test_scaler(self):
        scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
        conf.stats_path = scaler_stats_path
        conf.preemphasis = 0.0
        conf.do_trim_silence = True
        conf.signal_norm = True

        ap = AudioProcessor(**conf)
        mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
        ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)

        self.ap.signal_norm = False
        self.ap.preemphasis = 0.0

        # test scaler forward and backward transforms
        wav = self.ap.load_wav(WAV_FILE)
        mel_reference = self.ap.melspectrogram(wav)
        mel_norm = ap.melspectrogram(wav)
        mel_denorm = ap.denormalize(mel_norm)
        assert abs(mel_reference - mel_denorm).max() < 1e-4

    def test_compute_f0(self):  # pylint: disable=no-self-use
        ap = AudioProcessor(**conf)
        wav = ap.load_wav(WAV_FILE)
        pitch = ap.compute_f0(wav)
        mel = ap.melspectrogram(wav)
        assert pitch.shape[0] == mel.shape[1]
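The normalize/denormalize contract these tests exercise condenses into a small round-trip check; a sketch under the same assumed fixtures (conf, WAV_FILE):

ap = AudioProcessor(**conf)
ap.signal_norm = False
mel = ap.melspectrogram(ap.load_wav(WAV_FILE))  # raw, unnormalized mel
ap.signal_norm = True
ap.symmetric_norm = True
ap.clip_norm = True
ap.max_norm = 4.0
mel_norm = ap.normalize(mel)         # maps values into [-max_norm, max_norm]
mel_back = ap.denormalize(mel_norm)  # inverse transform
assert (mel - mel_back).sum() < 1e-3  # same tolerance the tests above use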
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
        # load dataset
        meta_data_train, meta_data_eval = load_tts_samples(dataset_config,
                                                           eval_split=True,
                                                           eval_split_size=0.2)
        items = meta_data_train + meta_data_eval
        tokenizer, _ = TTSTokenizer.init_from_config(c)
        dataset = TTSDataset(
            outputs_per_step=r,
            compute_linear_spec=True,
            return_wav=True,
            tokenizer=tokenizer,
            ap=self.ap,
            samples=items,
            batch_group_size=bgs,
            min_text_len=c.min_text_len,
            max_text_len=c.max_text_len,
            min_audio_len=c.min_audio_len,
            max_audio_len=c.max_audio_len,
            start_by_longest=start_by_longest,
        )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data["token_id"]
                _ = data["token_id_lengths"]
                speaker_name = data["speaker_names"]
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                _ = data["stop_targets"]
                _ = data["item_idxs"]
                wavs = data["waveform"]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)

                # check basic conditions
                self.assertEqual(check_count, 0)
                self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size)
                self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1)
                self.assertEqual(mel_input.shape[2], c.audio["num_mels"])
                self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length)
                self.assertIsInstance(speaker_name[0], str)

                # make sure that the computed mels and the waveform match and are correctly computed
                mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
                # remove padding in mel-spectrogram
                mel_dataloader = mel_input[0].T.numpy()[:, :mel_lengths[0]]
                # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
                mel_new = mel_new[:, :mel_lengths[0]]
                ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
                mel_diff = (mel_new[:, :mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
                self.assertLess(abs(mel_diff.sum()), 1e-5)

                # check normalization ranges
                if self.ap.symmetric_norm:
                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
                    self.assertGreaterEqual(
                        mel_input.min(), -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                    )
                    self.assertLess(mel_input.min(), 0)
                else:
                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
                    self.assertGreaterEqual(mel_input.min(), 0)

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.samples
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data["mel_lengths"]
                avg_length = mel_lengths.numpy().mean()
            dataloader.dataset.preprocess_samples()
            is_items_reordered = False
            for idx, item in enumerate(dataloader.dataset.samples):
                if item != frames[idx]:
                    is_items_reordered = True
                    break
            self.assertGreaterEqual(avg_length, last_length)
            self.assertTrue(is_items_reordered)

    def test_start_by_longest(self):
        """Test the start_by_longest option.

        The first item of the first batch must be longer than all the other items.
        """
        if ok_ljspeech:
            dataloader, _ = self._create_dataloader(2, c.r, 0, True)
            dataloader.dataset.preprocess_samples()
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data["mel_lengths"]
                if i == 0:
                    max_len = mel_lengths[0]
                print(mel_lengths)
                self.assertTrue(all(max_len >= mel_lengths))

    def test_padding_and_spectrograms(self):
        def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
            self.assertNotEqual(linear_input[idx, -1].sum(), 0)  # check padding
            self.assertNotEqual(linear_input[idx, -2].sum(), 0)
            self.assertNotEqual(mel_input[idx, -1].sum(), 0)
            self.assertNotEqual(mel_input[idx, -2].sum(), 0)
            self.assertEqual(stop_target[idx, -1], 1)
            self.assertEqual(stop_target[idx, -2], 0)
            self.assertEqual(stop_target[idx].sum(), 1)
            self.assertEqual(len(mel_lengths.shape), 1)
            self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
            self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])

        if ok_ljspeech:
            dataloader, _ = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                stop_target = data["stop_targets"]
                item_idx = data["item_idxs"]

                # check mel_spec consistency
                wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
                mel = self.ap.melspectrogram(wav).astype("float32")
                mel = torch.FloatTensor(mel).contiguous()
                mel_dl = mel_input[0]
                # NOTE: Below needs to check == 0 but due to an unknown reason
                # there is a slight difference between the two matrices.
                # TODO: Check this assert cond more in detail.
                self.assertLess(abs(mel.T - mel_dl).max(), 1e-5)

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_melspectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")

                # check the outputs
                check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)

            # Test for batch size 2
            dataloader, _ = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                stop_target = data["stop_targets"]
                item_idx = data["item_idxs"]

                # set id to the longest sequence in the batch
                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the longer item in the batch
                check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths)

                # check the other item in the batch
                self.assertEqual(linear_input[1 - idx, -1].sum(), 0)
                self.assertEqual(mel_input[1 - idx, -1].sum(), 0)
                self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1)
                self.assertEqual(stop_target[1, mel_lengths[1]:].sum(),
                                 stop_target.shape[1] - mel_lengths[1])
                self.assertEqual(len(mel_lengths.shape), 1)
class Synthesizer(object):
    def __init__(self,
                 tts_checkpoint,
                 tts_config,
                 vocoder_checkpoint=None,
                 vocoder_config=None,
                 use_cuda=False):
        """General 🐸 TTS interface for inference.

        It takes a tts and a vocoder model and synthesizes speech from the
        provided text. The text is divided into a list of sentences using
        `pysbd` and speech is synthesized on each sentence separately. If you
        have certain special characters in your text, you need to handle them
        before providing the text to Synthesizer.

        TODO: handle multi-speaker and GST inference.

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config = tts_config
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.use_cuda = use_cuda
        self.wavernn = None
        self.vocoder_model = None
        self.num_speakers = 0
        self.tts_speakers = None
        self.speaker_embedding_dim = None
        self.seg = self.get_segmenter("en")
        self.use_cuda = use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(tts_checkpoint, tts_config, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self.load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_speakers(self):
        # load speakers
        if self.model_config.use_speaker_embedding is not None:
            self.tts_speakers = load_speaker_mapping(self.tts_config.tts_speakers_json)
            self.num_speakers = len(self.tts_speakers)
        else:
            self.num_speakers = 0
        # set external speaker embedding
        if self.tts_config.use_external_speaker_embedding_file:
            speaker_embedding = self.tts_speakers[list(self.tts_speakers.keys())[0]]["embedding"]
            self.speaker_embedding_dim = len(speaker_embedding)

    def init_speaker(self, speaker_idx):
        # load speakers
        speaker_embedding = None
        if hasattr(self, "tts_speakers") and speaker_idx is not None:
            assert speaker_idx < len(self.tts_speakers), \
                f" [!] speaker_idx is out of the range. {speaker_idx} vs {len(self.tts_speakers)}"
            if self.tts_config.use_external_speaker_embedding_file:
                speaker_embedding = self.tts_speakers[speaker_idx]["embedding"]
        return speaker_embedding

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        if "characters" in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        self.tts_model = setup_model(self.input_size,
                                     num_speakers=self.num_speakers,
                                     c=self.tts_config)
        self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config["audio"])
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def save_wav(self, wav, path):
        wav = np.array(wav)
        self.ap.save_wav(wav, path, self.output_sample_rate)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_idx=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text split into sentences.")
        print(sens)

        speaker_embedding = self.init_speaker(speaker_idx)
        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            waveform, _, _, mel_postnet_spec, _, _ = synthesis(
                self.tts_model,
                sen,
                self.tts_config,
                self.use_cuda,
                self.ap,
                speaker_idx,
                None,
                False,
                self.tts_config.enable_eos_bos_chars,
                use_gl,
                speaker_embedding=speaker_embedding,
            )
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] / self.ap.sample_rate
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
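Hypothetical usage of the checkpoint-based Synthesizer above; all four paths are illustrative. tts() returns a flat list of samples, which save_wav() writes at the output sample rate.

synthesizer = Synthesizer(
    tts_checkpoint="tts_model.pth.tar",      # assumed checkpoint path
    tts_config="tts_config.json",            # assumed config path
    vocoder_checkpoint="vocoder_model.pth.tar",
    vocoder_config="vocoder_config.json",
    use_cuda=False,
)
wav = synthesizer.tts("Hello world.")
synthesizer.save_wav(wav, "output.wav")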
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf.audio)

    def test_audio_synthesis(self):
        """1. load wav
        2. set normalization parameters
        3. extract mel-spec
        4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(WAV_FILE)
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_mel_spectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
                .format(max_norm, signal_norm, symmetric_norm, clip_norm)
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUT_PATH + file_name)

        # maxnorm = 1.0
        _test(1., False, False, False)
        _test(1., True, False, False)
        _test(1., True, True, False)
        _test(1., True, False, True)
        _test(1., True, True, True)
        # maxnorm = 4.0
        _test(4., False, False, False)
        _test(4., True, False, False)
        _test(4., True, True, False)
        _test(4., True, False, True)
        _test(4., True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and consistency."""
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(WAV_FILE)
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.vocoder_model = None
        self.config = config
        print(config)
        self.seg = self.get_segmenter("en")
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_checkpoint,
                              self.config.wavernn_config, self.config.use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model weights
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it.
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            _, postnet_output, _, _ = run_model_torch(self.tts_model, inputs,
                                                      self.tts_config, False,
                                                      speaker_id, None)
            if self.vocoder_model:
                # use native vocoder model
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                # use 3rd party wavernn
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input.cuda()
                wav = self.wavernn.generate(
                    vocoder_input,
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                # use GL
                if self.use_cuda:
                    postnet_output = postnet_output[0].cpu()
                else:
                    postnet_output = postnet_output[0]
                postnet_output = postnet_output.numpy()
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)

            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > model file: ", model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(self.model_file)
        # load the model weights
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)

            if self.wavernn:
                postnet_output = postnet_output[0].data.cpu().numpy()
                wav = self.wavernn.generate(
                    torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(),
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class Synthesizer:
    def __init__(
        self,
        config_path,
        model_path,
        use_cuda=False,
        vocoder_path="",
        vocoder_config_path="",
        batched_vocoder=True,
        speakers_json="",
        speaker_fileid=None,
        gst_style=None,
        wavegrad_iters=50,
    ):
        self.config_path = config_path
        self.model_path = model_path
        self.use_cuda = use_cuda
        self.vocoder_path = vocoder_path
        self.vocoder_config_path = vocoder_config_path
        self.batched_vocoder = batched_vocoder
        self.speakers_json = speakers_json
        self.speaker_fileid = speaker_fileid
        self.gst_style = gst_style
        self.wavegrad_iters = wavegrad_iters

        self.model = None

    def load(self):
        # load the config
        C = load_config(self.config_path)
        self.config = C

        # Resolve scale_stats path
        stats_path = C.audio.get("stats_path")
        if stats_path and not os.path.isfile(stats_path):
            # Look for stats next to config
            model_stats_path = os.path.join(os.path.dirname(self.config_path),
                                            "scale_stats.npy")
            if os.path.isfile(model_stats_path):
                # Patch config
                C.audio["stats_path"] = model_stats_path
            else:
                _LOGGER.warning("No scale stats found at %s", C.audio["stats_path"])
                C.audio["stats_path"] = ""

        C.forward_attn_mask = True

        if "gst" not in C.keys():
            # Patch config
            gst = {
                "gst_use_speaker_embedding": False,
                "gst_style_input": None,
                "gst_embedding_dim": 512,
                "gst_num_heads": 4,
                "gst_style_tokens": 10,
            }
            C["gst"] = gst
            setattr(C, "gst", gst)

        if "use_external_speaker_embedding_file" not in C.keys():
            C["use_external_speaker_embedding_file"] = False
            setattr(C, "use_external_speaker_embedding_file", False)

        if "gst_use_speaker_embedding" not in C.gst:
            C.gst["gst_use_speaker_embedding"] = False

        # load the audio processor
        ap = AudioProcessor(**C.audio)
        self.ap = ap

        # if the vocabulary was passed, replace the default
        if "characters" in C.keys():
            symbols, phonemes = make_symbols(**C.characters)
        else:
            from TTS.tts.utils.text.symbols import phonemes, symbols

        speaker_embedding = None
        speaker_embedding_dim = None
        num_speakers = 0

        # load speakers
        if self.speakers_json != "":
            speaker_mapping = json.load(open(self.speakers_json, "r"))
            num_speakers = len(speaker_mapping)
            if C.use_external_speaker_embedding_file:
                if self.speaker_fileid is not None:
                    speaker_embedding = speaker_mapping[self.speaker_fileid]["embedding"]
                else:
                    # if speaker_fileid is not specified, use the first sample in speakers.json
                    speaker_embedding = speaker_mapping[list(
                        speaker_mapping.keys())[0]]["embedding"]
                speaker_embedding_dim = len(speaker_embedding)

        self.speaker_embedding = speaker_embedding

        # load the model
        num_chars = len(phonemes) if C.use_phonemes else len(symbols)
        model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
        cp = torch.load(self.model_path, map_location=torch.device("cpu"))
        model.load_state_dict(cp["model"])
        model.eval()
        if self.use_cuda:
            model.cuda()

        if hasattr(model.decoder, "set_r"):
            model.decoder.set_r(cp["r"])

        self.model = model

        # load vocoder model
        if self.vocoder_path:
            VC = load_config(self.vocoder_config_path)

            # Resolve scale_stats path
            stats_path = VC.audio.get("stats_path")
            if stats_path and not os.path.isfile(stats_path):
                # Look for stats next to config
                vocoder_stats_path = os.path.join(
                    os.path.dirname(self.vocoder_config_path), "scale_stats.npy")
                if os.path.isfile(vocoder_stats_path):
                    # Patch config
                    VC.audio["stats_path"] = vocoder_stats_path
                else:
                    # Try next to TTS config
                    vocoder_stats_path = os.path.join(
                        os.path.dirname(self.config_path), "scale_stats.npy")
                    if os.path.isfile(vocoder_stats_path):
                        # Patch config
                        VC.audio["stats_path"] = vocoder_stats_path
                    else:
                        _LOGGER.warning("No vocoder scale stats found at %s",
                                        VC.audio["stats_path"])
                        VC.audio["stats_path"] = ""

            self.ap_vocoder = AudioProcessor(**VC.audio)

            vocoder_model = setup_generator(VC)
            vocoder_model.load_state_dict(
                torch.load(self.vocoder_path, map_location="cpu")["model"])
            vocoder_model.remove_weight_norm()
            vocoder_model.inference_padding = 0
            if self.use_cuda:
                vocoder_model.cuda()
            vocoder_model.eval()

            if hasattr(vocoder_model, "compute_noise_level"):
                noise_schedule_path = os.path.join(
                    os.path.dirname(self.vocoder_path), "noise_schedule.npy")
                if os.path.isfile(noise_schedule_path):
                    _LOGGER.debug("Loading noise schedule from %s", noise_schedule_path)
                    beta = np.load(noise_schedule_path,
                                   allow_pickle=True).tolist()["beta"]
                else:
                    # Used if the noise schedule was not computed with tune_wavegrad
                    _LOGGER.debug("Using default noise schedule")
                    beta = np.linspace(1e-6, 0.01, self.wavegrad_iters)
                vocoder_model.compute_noise_level(beta)
        else:
            vocoder_model = None
            VC = None
            self.ap_vocoder = None

        self.vocoder_model = vocoder_model
        self.vocoder_config = VC

        # synthesize voice
        self.use_griffin_lim = self.vocoder_model is None

        if not C.use_external_speaker_embedding_file:
            if self.speaker_fileid and self.speaker_fileid.isdigit():
                self.speaker_fileid = int(self.speaker_fileid)
            else:
                self.speaker_fileid = None
        else:
            self.speaker_fileid = None

        if (self.gst_style is None) and ("gst" in C.keys()):
            gst_style = C.gst.get("gst_style_input", None)
        else:
            # check if the gst_style string is a dict; if it is, convert it, else use the string
            try:
                gst_style = json.loads(self.gst_style)
                if max(map(int, gst_style.keys())) >= C.gst["gst_style_tokens"]:
                    raise RuntimeError(
                        "The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}"
                        .format(max(map(int, gst_style.keys())),
                                C.gst["gst_style_tokens"]))
            except ValueError:
                gst_style = self.gst_style

        self.gst_style = gst_style

        # Pre-load language
        if C.get("phoneme_backend") == "gruut":
            load_gruut_language(C["phoneme_language"])

        # Compute scale factors in case TTS/vocoder sample rates differ
        # See: https://github.com/mozilla/TTS/issues/520
        self.scale_factors = None
        if self.ap_vocoder and (self.ap.sample_rate != self.ap_vocoder.sample_rate):
            self.scale_factors = (1, self.ap_vocoder.sample_rate / self.ap.sample_rate)

    @property
    def sample_rate(self) -> int:
        """Get output sample rate"""
        if self.ap_vocoder:
            return self.ap_vocoder.sample_rate
        return self.ap.sample_rate

    # -------------------------------------------------------------------------

    def synthesize(self, text: str, text_is_phonemes: bool = False) -> bytes:
        """Synthesize WAV bytes from text"""
        if not self.model:
            self.load()

        wav = tts(
            self.model,
            self.vocoder_model,
            text,
            self.config,
            self.use_cuda,
            self.ap,
            self.use_griffin_lim,
            self.speaker_fileid,
            speaker_embedding=self.speaker_embedding,
            gst_style=self.gst_style,
            text_is_phonemes=text_is_phonemes,
            ap_vocoder=self.ap_vocoder,
            scale_factors=self.scale_factors,
        )

        with io.BytesIO() as wav_io:
            if self.ap_vocoder:
                # Use vocoder sample rate
                self.ap_vocoder.save_wav(wav, wav_io)
            else:
                # Use original sample rate
                self.ap.save_wav(wav, wav_io)
            return wav_io.getvalue()
def main(**kwargs):
    global symbols, phonemes  # pylint: disable=global-statement
    current_date = date.today()
    current_date = current_date.strftime("%B %d %Y")
    start_time = time.time()

    # read passed variables from gui
    text = kwargs['text']                    # text to generate speech from
    use_cuda = kwargs['use_cuda']            # if gpu exists default is true
    project = kwargs['project']              # path to project folder
    vocoder_type = kwargs['vocoder']         # vocoder type, default is GL
    use_gst = kwargs['use_gst']              # use style_wave for prosody
    style_dict = kwargs['style_input']       # style_wave for prosody
    speaker_id = kwargs['speaker_id']        # name of the selected speaker
    sentence_file = kwargs['sentence_file']  # path to file if generating from a file
    out_path = kwargs['out_path']            # path to save the output wav

    batched_vocoder = True

    # load speakers
    speakers_file_path = Path(project, "speakers.json")
    if speakers_file_path.is_file():
        with open(speakers_file_path, 'r') as f:
            speaker_data = json.load(f)
        num_speakers = len(speaker_data)
        # get the speaker name for the selected speaker id
        if speaker_id >= num_speakers:
            print('Speaker ID outside of number of speakers range. Using default 0.')
            speaker_id = 0
        speaker_name = [speaker for speaker, id in speaker_data.items()
                        if speaker_id == id][0]
    else:
        speaker_name = 'Default'
        num_speakers = 0
        speaker_id = None

    # load the config
    config_path = Path(project, "config.json")
    C = load_config(config_path)

    # only use a style input when GST is enabled; defining it unconditionally
    # avoids a NameError in the print below when use_gst is False
    style_input = style_dict if use_gst else None

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # find the tts model file in project folder
    try:
        tts_model_file = glob(str(Path(project, '*.pth.tar')))
        if not tts_model_file:
            raise FileNotFoundError
        model_path = tts_model_file[0]
    except FileNotFoundError:
        print('[!] TTS Model not found in path: "{}"'.format(project))
        sys.exit(1)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)

    # if gpu is not available use cpu
    model, state = load_checkpoint(model, model_path, use_cuda=use_cuda)
    model.decoder.max_decoder_steps = 2000
    model.eval()
    print(' > Model step:', state['step'])
    print(' > Model r: ', state['r'])

    # load vocoder ("is" checks identity, so strings must be compared with "==")
    if vocoder_type == 'MelGAN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pth.tar')))
            vocoder, ap_vocoder = load_vocoder(
                str(Path('TTS')), str(model_file[0]),
                str(Path(project, 'vocoder/config.json')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    elif vocoder_type == 'WaveRNN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pkl')))
            vocoder, ap_vocoder = load_vocoder(
                str(Path('TTS')), str(model_file[0]),
                str(Path(project, 'config.yml')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    else:
        vocoder, ap_vocoder = None, None

    print(" > Vocoder: {}".format(vocoder_type))
    print(' > Using style input: {}\n'.format(style_input))

    if sentence_file != '':
        with open(sentence_file, "r", encoding='utf8') as f:
            list_of_sentences = [s.strip() for s in f.readlines()]
    else:
        list_of_sentences = [text.strip()]

    # iterate over every passed sentence and synthesize
    for _, tts_sentence in enumerate(list_of_sentences):
        wav_list = []
        # remove characters that are not alphanumeric or in ',.'
        tts_sentence = clean_sentence(tts_sentence)
        print(" > Text: {}".format(tts_sentence))
        # build filename
        current_time = datetime.now().strftime("%H%M%S")
        file_name = ' '.join(tts_sentence.split(" ")[:10])

        # if multiple sentences in one line -> split them
        tts_sentence = split_into_sentences(tts_sentence)

        # if sentence was split in sub-sentences -> iterate over them
        for sentence in tts_sentence:
            # synthesize voice
            _, _, _, wav = tts(model,
                               vocoder,
                               C,
                               None,
                               sentence,
                               ap,
                               ap_vocoder,
                               use_cuda,
                               batched_vocoder,
                               speaker_id=speaker_id,
                               style_input=style_input,
                               figures=False)

            # join sub-sentences back together and add a filler between them
            wav_list += list(wav)
            wav_list += [0] * 10000

        wav = np.array(wav_list)

        # finalize filename
        file_name = "_".join([str(current_time), file_name])
        file_name = file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'

        # compute the output path per sentence instead of mutating out_path,
        # so later sentences do not overwrite the first file
        if out_path == "":
            out_dir = str(Path(project, 'output', current_date, speaker_name))
            wav_path = os.path.join(out_dir, file_name)
        else:
            out_dir = os.path.dirname(out_path)
            wav_path = out_path

        # create output directory if it doesn't exist
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # save generated wav to disk
        ap.save_wav(wav, wav_path)
        end_time = time.time()
        print(" > Run-time: {}".format(end_time - start_time))
        print(" > Saving output to {}\n".format(wav_path))
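# Example invocation of main(); every keyword mirrors a kwargs key read above,
# and the project path and flag values are illustrative placeholders.
main(
    text="Hello world.",
    use_cuda=False,
    project="projects/ljspeech",  # folder holding config.json and a *.pth.tar checkpoint
    vocoder="MelGAN",             # 'MelGAN' or 'WaveRNN'; anything else falls back to Griffin-Lim
    use_gst=False,
    style_input=None,
    speaker_id=0,
    sentence_file="",             # empty -> synthesize `text` instead of reading a file
    out_path="",                  # empty -> project/output/<date>/<speaker>/<file>.wav
)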
class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        tts_languages_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesizes speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and speech is
        synthesized for each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: set the segmenter based on the source language

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`.
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.tts_languages_file = tts_languages_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.language_manager = None
        self.num_languages = 0
        self.tts_languages = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")

        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."

        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]
        else:
            print(" > Using Griffin-Lim as no vocoder model defined")

    @staticmethod
    def _get_segmenter(lang: str):
        """Get the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            [type]: [description]
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        1. Load the model config.
        2. Init the AudioProcessor.
        3. Init the model from the config.
        4. Move the model to the GPU if CUDA is enabled.
        5. Init the speaker manager for the model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
""" # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) speaker_manager = self._init_speaker_manager() language_manager = self._init_language_manager() if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() speaker_manager = self._init_speaker_encoder(speaker_manager) if language_manager is not None: self.tts_model = setup_tts_model( config=self.tts_config, speaker_manager=speaker_manager, language_manager=language_manager, ) else: self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" if hasattr(self.tts_config, "model_args") and hasattr( self.tts_config.model_args, "speaker_encoder_config_path"): self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path def _is_use_speaker_embedding(self): """Check if the speaker embedding is used in the model""" # we handle here the case that some models use model_args some don't use_speaker_embedding = False if hasattr(self.tts_config, "model_args"): use_speaker_embedding = self.tts_config["model_args"].get( "use_speaker_embedding", False) use_speaker_embedding = use_speaker_embedding or self.tts_config.get( "use_speaker_embedding", False) return use_speaker_embedding def _is_use_d_vector_file(self): """Check if the d-vector file is used in the model""" # we handle here the case that some models use model_args some don't use_d_vector_file = False if hasattr(self.tts_config, "model_args"): config = self.tts_config.model_args use_d_vector_file = config.get("use_d_vector_file", False) config = self.tts_config use_d_vector_file = use_d_vector_file or config.get( "use_d_vector_file", False) return use_d_vector_file def _init_speaker_manager(self): """Initialize the SpeakerManager""" # setup if multi-speaker settings are in the global model config speaker_manager = None speakers_file = get_from_config_or_model_args_with_default( self.tts_config, "speakers_file", None) if self._is_use_speaker_embedding(): if self.tts_speakers_file: speaker_manager = SpeakerManager( speaker_id_file_path=self.tts_speakers_file) elif speakers_file: speaker_manager = SpeakerManager( speaker_id_file_path=speakers_file) if self._is_use_d_vector_file(): d_vector_file = get_from_config_or_model_args_with_default( self.tts_config, "d_vector_file", None) if self.tts_speakers_file: speaker_manager = SpeakerManager( d_vectors_file_path=self.tts_speakers_file) elif d_vector_file: speaker_manager = SpeakerManager( d_vectors_file_path=d_vector_file) return speaker_manager def _init_speaker_encoder(self, speaker_manager): """Initialize the SpeakerEncoder""" if self.encoder_checkpoint: if speaker_manager is None: speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config) else: speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) return speaker_manager def _init_language_manager(self): """Initialize the LanguageManager""" # setup if multi-lingual settings are in the global model config language_manager = None if check_config_and_model_args(self.tts_config, 
"use_language_embedding", True): if self.tts_languages_file: language_manager = LanguageManager( language_ids_file_path=self.tts_languages_file) elif self.tts_config.get("language_ids_file", None): language_manager = LanguageManager( language_ids_file_path=self.tts_config.language_ids_file) else: language_manager = LanguageManager(config=self.tts_config) return language_manager def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. 1. Load the vocoder config. 2. Init the AudioProcessor for the vocoder. 3. Init the vocoder model from the config. 4. Move the model to the GPU if CUDA is enabled. Args: model_file (str): path to the model checkpoint. model_config (str): path to the model config file. use_cuda (bool): enable/disable CUDA use. """ self.vocoder_config = load_config(model_config) self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() def split_into_sentences(self, text) -> List[str]: """Split give text into sentences. Args: text (str): input text in string format. Returns: List[str]: list of sentences. """ return self.seg.segment(text) def save_wav(self, wav: List[int], path: str) -> None: """Save the waveform as a file. Args: wav (List[int]): waveform as a list of values. path (str): output path to save the waveform. """ wav = np.array(wav) self.ap.save_wav(wav, path, self.output_sample_rate) def tts( self, text: str, speaker_name: str = "", language_name: str = "", speaker_wav: Union[str, List[str]] = None, style_wav=None, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. Returns: List[int]: [description] """ start_time = time.time() wavs = [] sens = self.split_into_sentences(text) print(" > Text splitted to sentences.") print(sens) # handle multi-speaker speaker_embedding = None speaker_id = None if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( speaker_name)[0] speaker_embedding = np.array(speaker_embedding)[ None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name speaker_id = self.tts_model.speaker_manager.speaker_ids[ speaker_name] elif not speaker_name and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model." ) else: speaker_embedding = None else: if speaker_name: raise ValueError( f" [!] Missing speakers.json file path for selecting speaker {speaker_name}." "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. 
" ) # handle multi-lingaul language_id = None if self.tts_languages_file or ( hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None): if language_name and isinstance(language_name, str): language_id = self.tts_model.language_manager.language_id_mapping[ language_name] elif not language_name: raise ValueError( " [!] Look like you use a multi-lingual model. " "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model." ) else: raise ValueError( f" [!] Missing language_ids.json file path for selecting language {language_name}." "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. " ) # compute a new d_vector from the given clip. if speaker_wav is not None: speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( speaker_wav) use_gl = self.vocoder_model is None for sen in sens: # synthesize voice outputs = synthesis( model=self.tts_model, text=sen, CONFIG=self.tts_config, use_cuda=self.use_cuda, ap=self.ap, speaker_id=speaker_id, language_id=language_id, language_name=language_name, style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, d_vector=speaker_embedding, ) waveform = outputs["wav"] mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach( ).cpu().numpy() if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T device_type = "cuda" if self.use_cuda else "cpu" # renormalize spectrogram based on vocoder config vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) # compute scale factor for possible sample rate mismatch scale_factor = [ 1, self.vocoder_config["audio"]["sample_rate"] / self.ap.sample_rate, ] if scale_factor[1] != 1: print(" > interpolating tts model output.") vocoder_input = interpolate_vocoder_input( scale_factor, vocoder_input) else: vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable # run vocoder model # [1, T, C] waveform = self.vocoder_model.inference( vocoder_input.to(device_type)) if self.use_cuda and not use_gl: waveform = waveform.cpu() if not use_gl: waveform = waveform.numpy() waveform = waveform.squeeze() # trim silence if self.tts_config.audio["do_trim_silence"] is True: waveform = trim_silence(waveform, self.ap) wavs += list(waveform) wavs += [0] * 10000 # compute stats process_time = time.time() - start_time audio_time = len(wavs) / self.tts_config.audio["sample_rate"] print(f" > Processing time: {process_time}") print(f" > Real-time factor: {process_time / audio_time}") return wavs
class Synthesizer(object): def __init__(self, config): self.wavernn = None self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: assert torch.cuda.is_available( ), "CUDA is not availabe on this machine." self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, self.config.wavernn_config, self.config.use_cuda) if self.config.pwgan_file: self.load_pwgan(self.config.pwgan_file, self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) if self.use_phonemes: self.input_size = len(phonemes) else: self.input_size = len(symbols) # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) # load model state cp = torch.load(tts_checkpoint, map_location=torch.device('cpu')) # load the model self.tts_model.load_state_dict(cp['model']) if use_cuda: self.tts_model.cuda() self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 3000 if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. sys.path.append( lib_path) # set this if WaveRNN is not installed globally #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) # This is the default architecture we use for our models. 
        # You might need to update it.
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )
        # map_location is an argument of torch.load, not of load_state_dict,
        # and the model is only moved to the GPU when CUDA is requested
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, model_file, model_config, use_cuda):
        #pylint: disable=import-outside-toplevel
        from parallel_wavegan.models import ParallelWaveGANGenerator
        from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        self.pwgan_ap = AudioProcessorVocoder(**self.pwgan_config["audio"])
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(None, [s.strip() for s in sentences]))
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)

            if self.pwgan:
                input_tensor = torch.FloatTensor(postnet_output.T).unsqueeze(0)
                if self.use_cuda:
                    # .cuda() returns a copy, so the result must be reassigned
                    input_tensor = input_tensor.cuda()
                wav = self.pwgan.inference(
                    input_tensor,
                    hop_size=self.pwgan_ap.hop_length).data.cpu().numpy()
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
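# Usage sketch for the Synthesizer above; it reads its settings from a config
# object, so a SimpleNamespace with the attributes accessed in __init__ stands
# in for it here. Paths are placeholders.
from types import SimpleNamespace

config = SimpleNamespace(
    tts_checkpoint="checkpoint.pth.tar",
    tts_config="config.json",
    tts_speakers=None,
    use_cuda=False,
    wavernn_lib_path=None,  # falsy -> WaveRNN is not loaded
    pwgan_file=None,        # falsy -> PWGAN is not loaded; Griffin-Lim is used
)
synth = Synthesizer(config)
wav_io = synth.tts("Hello world. This is a second sentence.")
with open("output.wav", "wb") as f:
    f.write(wav_io.getvalue())  # tts() returns an in-memory WAV (io.BytesIO)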
class Synthesizer(object): def __init__( self, tts_checkpoint: str, tts_config_path: str, tts_speakers_file: str = "", vocoder_checkpoint: str = "", vocoder_config: str = "", encoder_checkpoint: str = "", encoder_config: str = "", use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. It takes a tts and a vocoder model and synthesize speech from the provided text. The text is divided into a list of sentences using `pysbd` and synthesize speech on each sentence separately. If you have certain special characters in your text, you need to handle them before providing the text to Synthesizer. TODO: set the segmenter based on the source language Args: tts_checkpoint (str): path to the tts model file. tts_config_path (str): path to the tts config file. vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None. vocoder_config (str, optional): path to the vocoder config file. Defaults to None. encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`, encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`, use_cuda (bool, optional): enable/disable cuda. Defaults to False. """ self.tts_checkpoint = tts_checkpoint self.tts_config_path = tts_config_path self.tts_speakers_file = tts_speakers_file self.vocoder_checkpoint = vocoder_checkpoint self.vocoder_config = vocoder_config self.encoder_checkpoint = encoder_checkpoint self.encoder_config = encoder_config self.use_cuda = use_cuda self.tts_model = None self.vocoder_model = None self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda if self.use_cuda: assert torch.cuda.is_available( ), "CUDA is not availabe on this machine." self._load_tts(tts_checkpoint, tts_config_path, use_cuda) self.output_sample_rate = self.tts_config.audio["sample_rate"] if vocoder_checkpoint: self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) self.output_sample_rate = self.vocoder_config.audio["sample_rate"] @staticmethod def _get_segmenter(lang: str): """get the sentence segmenter for the given language. Args: lang (str): target language code. Returns: [type]: [description] """ return pysbd.Segmenter(language=lang, clean=True) def _load_speakers(self, speaker_file: str) -> None: """Load the SpeakerManager to organize multi-speaker TTS. It loads the speakers meta-data and the speaker encoder if it is defined. Args: speaker_file (str): path to the speakers meta-data file. 
""" print("Loading speakers ...") self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config, ) self.speaker_manager.load_d_vectors_file( self.tts_config.get("d_vector_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers self.d_vector_dim = self.speaker_manager.d_vector_dim def _set_tts_speaker_file(self): """Set the TTS speaker file used by a multi-speaker model.""" # setup if multi-speaker settings are in the global model config if (hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True): if self.tts_config.use_d_vector_file: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"]) self.tts_config["d_vector_file"] = self.tts_speakers_file else: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else self.tts_config["speakers_file"]) # setup if multi-speaker settings are in the model args config if (self.tts_speakers_file is None and hasattr(self.tts_config, "model_args") and hasattr( self.tts_config.model_args, "use_speaker_embedding") and self.tts_config.model_args.use_speaker_embedding): _args = self.tts_config.model_args if _args.use_d_vector_file: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else _args["d_vector_file"]) _args["d_vector_file"] = self.tts_speakers_file else: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else _args["speakers_file"]) def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: """Load the TTS model. Args: tts_checkpoint (str): path to the model checkpoint. tts_config_path (str): path to the model config file. use_cuda (bool): enable/disable CUDA use. """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) # Patch stats_path stats_path = self.tts_config["audio"].get("stats_path", "") if stats_path and (not os.path.isfile(stats_path)): stats_path = os.path.join(os.path.dirname(tts_checkpoint), os.path.split(stats_path)[1]) self.tts_config["audio"]["stats_path"] = stats_path # Patch speakers file speakers_file = self.tts_config.get("model_args", {}).get("speakers_file", "") if speakers_file and (not os.path.isfile(speakers_file)): speakers_file = os.path.join(os.path.dirname(tts_checkpoint), os.path.split(speakers_file)[1]) self.tts_config["model_args"]["speakers_file"] = speakers_file self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() self._set_tts_speaker_file() def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. Args: model_file (str): path to the model checkpoint. model_config (str): path to the model config file. use_cuda (bool): enable/disable CUDA use. 
""" self.vocoder_config = load_config(model_config) # Patch stats_path stats_path = self.vocoder_config["audio"].get("stats_path", "") if stats_path and (not os.path.isfile(stats_path)): stats_path = os.path.join(os.path.dirname(model_file), os.path.split(stats_path)[1]) self.vocoder_config["audio"]["stats_path"] = stats_path self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() def split_into_sentences(self, text) -> List[str]: """Split give text into sentences. Args: text (str): input text in string format. Returns: List[str]: list of sentences. """ return self.seg.segment(text) def save_wav(self, wav: List[int], path: str) -> None: """Save the waveform as a file. Args: wav (List[int]): waveform as a list of values. path (str): output path to save the waveform. """ wav = np.array(wav) self.ap.save_wav(wav, path, self.output_sample_rate) def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. speaker_idx (str, optional): spekaer id for multi-speaker models. Defaults to "". speaker_wav (): style_wav ([type], optional): style waveform for GST. Defaults to None. Returns: List[int]: [description] """ start_time = time.time() wavs = [] sens = self.split_into_sentences(text) print(" > Text splitted to sentences.") print(sens) # handle multi-speaker speaker_embedding = None speaker_id = None if isinstance(speaker_idx, int): speaker_id = speaker_idx elif self.tts_speakers_file: if speaker_idx and isinstance(speaker_idx, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( speaker_idx)[0] else: # get speaker idx from the speaker name try: speaker_id = self.tts_model.speaker_manager.speaker_ids[ speaker_idx] except KeyError: # Interpet as int speaker_id = int(speaker_idx) elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " "You need to define either a `speaker_idx` or a `style_wav` to use a multi-speaker model." ) else: speaker_embedding = None else: if speaker_idx: raise ValueError( f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}." "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) # compute a new d_vector from the given clip. 
if speaker_wav is not None: speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( speaker_wav) use_gl = self.vocoder_model is None for sen in sens: # synthesize voice outputs = synthesis( model=self.tts_model, text=sen, CONFIG=self.tts_config, use_cuda=self.use_cuda, ap=self.ap, speaker_id=speaker_id, style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, d_vector=speaker_embedding, ) waveform = outputs["wav"] mel_postnet_spec = ( outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()) if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T device_type = "cuda" if self.use_cuda else "cpu" # renormalize spectrogram based on vocoder config vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) # compute scale factor for possible sample rate mismatch scale_factor = [ 1, self.vocoder_config["audio"]["sample_rate"] / self.ap.sample_rate, ] if scale_factor[1] != 1: print(" > interpolating tts model output.") vocoder_input = interpolate_vocoder_input( scale_factor, vocoder_input) else: vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable # run vocoder model # [1, T, C] waveform = self.vocoder_model.inference( vocoder_input.to(device_type)) if self.use_cuda and not use_gl: waveform = waveform.cpu() if not use_gl: waveform = waveform.numpy() waveform = waveform.squeeze() # trim silence waveform = trim_silence(waveform, self.ap) wavs += list(waveform) wavs += [0] * 10000 # compute stats process_time = time.time() - start_time audio_time = len(wavs) / self.tts_config.audio["sample_rate"] print(f" > Processing time: {process_time}") print(f" > Real-time factor: {process_time / audio_time}") return wavs
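# Usage sketch for the multi-speaker path of the Synthesizer above; paths and
# speaker ids are placeholders. A string speaker_idx is resolved through the
# speaker manager (falling back to int() on KeyError), while an int is used
# directly as the speaker id.
synth = Synthesizer(
    tts_checkpoint="tts_model.pth.tar",
    tts_config_path="config.json",
    tts_speakers_file="speakers.json",
    use_cuda=False,
)
wav = synth.tts("Hello world.", speaker_idx="p225")  # by speaker name
wav = synth.tts("Hello world.", speaker_idx=3)       # by numeric speaker id
synth.save_wav(wav, "output.wav")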
class tts_class: def __init__(self): # Set constants ROOT_PATH = 'TTS/tts_model/' MODEL_PATH = ROOT_PATH + '/best_model.pth.tar' # MODEL_PATH_TMP = ROOT_PATH + '/best_model.pth.tar' CONFIG_PATH = ROOT_PATH + '/config.json' OUT_FOLDER = ROOT_PATH + '/test' self.CONFIG = load_config(CONFIG_PATH) self.use_cuda = True # True # load the model self.model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq, self.CONFIG.num_mels, self.CONFIG.r) # load the audio processor self.ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels, self.CONFIG.min_level_db, self.CONFIG.frame_shift_ms, self.CONFIG.frame_length_ms, self.CONFIG.ref_level_db, self.CONFIG.num_freq, self.CONFIG.power, self.CONFIG.preemphasis, 60) # load model state if self.use_cuda: cp = torch.load(MODEL_PATH) else: cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage) # load the model self.model.load_state_dict(cp['model']) if self.use_cuda: self.model.cuda() self.model.eval() self.model.decoder.max_decoder_steps = 500 self.nlp = spacy.load("en") def process(self, text): self.model.decoder.max_decoder_steps = 500 wavefiles = self.text2audio(text, self.model, self.CONFIG, self.use_cuda, self.ap) return wavefiles def tts(self, model, text, CONFIG, use_cuda, ap, wavefile, figures=True): waveform, alignment, spectrogram, stop_tokens = create_speech( model, text, CONFIG, use_cuda, ap) self.ap.save_wav(waveform, wavefile) def text2audio(self, text, model, CONFIG, use_cuda, ap): wavefiles = [] base_name = "gen_{}.wav" doc = self.nlp(text) for i, sent in enumerate(doc.sents): text = sent.text.strip() wavefile = base_name.format(i) self.tts(model, text, CONFIG, use_cuda, ap, wavefile) wavefiles.append(wavefile) return wavefiles def play(self, wavefiles): voice = AudioSegment.empty() for wavefile in wavefiles: voice += AudioSegment.from_wav(wavefile) play(voice) for w in wavefiles: os.remove(w)
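# Usage sketch for tts_class above; it expects the checkpoint and config under
# TTS/tts_model/ and the spaCy English model to be installed. play() joins the
# per-sentence wav files, plays the result, and removes the temporary files.
engine = tts_class()
wavefiles = engine.process("Hello world. This is a second sentence.")
engine.play(wavefiles)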
else: # check if gst_style string is a dict, if is dict convert else use string try: gst_style = json.loads(args.gst_style) if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']: raise RuntimeError( "The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}" .format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens'])) except ValueError: gst_style = args.gst_style wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style) # save the results file_name = args.text.replace(" ", "_")[0:10] file_name = file_name.translate( str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' out_path = os.path.join(args.out_path, file_name) print(" > Saving output to {}".format(out_path)) ap.save_wav(wav, out_path)
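# Self-contained illustration of the filename sanitization above: keep the
# first ten characters of the underscored text, then strip every punctuation
# character except the underscore.
import string

text = "Hello, world! This is a test."
file_name = text.replace(" ", "_")[0:10]  # 'Hello,_wor'
table = str.maketrans('', '', string.punctuation.replace('_', ''))
print(file_name.translate(table) + '.wav')  # -> 'Hello_wor.wav'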
class MozillaTTS: """ Wrapper for Mozilla TTS Related repositories: - Mozilla TTS: - https://github.com/mozilla/TTS - commit 824c091 - data: https://drive.google.com/drive/folders/1FJRjGDAqWIyZRX4CsppaIPEW8UWXCWzF?usp=drive_open - WaveRNN(optional): - https://github.com/erogol/WaveRNN - commit 8a1c152 - data: https://drive.google.com/drive/folders/1wpPn3a0KQc6EYtKL0qOi4NqEmhML71Ve """ def __init__(self, tts_model, tts_config, wavernn_model=None, wavernn_config=None, device="cpu"): from TTS.utils.generic_utils import load_config self.tts_config = load_config(tts_config) self.tts_config.windowing = True if not torch.cuda.is_available(): device = "cpu" self.use_cuda = device != "cpu" self.device = torch.device(device) self.tts_model_path = tts_model self._load_tts() if wavernn_model and wavernn_config: self.use_gl = False self.batched_wavernn = True self.wavernn_model_path = wavernn_model self.wavernn_config = load_config(wavernn_config) self._load_wavernn() else: self.use_gl = True def _load_tts(self): # LOAD TTS MODEL from TTS.utils.text.symbols import symbols, phonemes from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import setup_model # load the model num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols) self.tts_model = setup_model(num_chars, self.tts_config) # load the audio processor self._ap = AudioProcessor(**self.tts_config.audio) # load model state cp = torch.load(self.tts_model_path, map_location=lambda storage, loc: storage) # load the model self.tts_model.load_state_dict(cp['model']) self.tts_model.to(self.device) self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 2000 def _load_wavernn(self): from WaveRNN.models.wavernn import Model self.wavernn = Model( rnn_dims=512, fc_dims=512, mode="mold", pad=2, upsample_factors=self.wavernn_config.upsample_factors, # set this depending on dataset feat_dims=self.wavernn_config.audio["num_mels"], compute_dims=128, res_out_dims=128, res_blocks=10, hop_length=self._ap.hop_length, sample_rate=self._ap.sample_rate, ).to(self.device) check = torch.load(self.wavernn_model_path) self.wavernn.load_state_dict(check['model']) self.wavernn.eval() def __call__(self, text, out_path): waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(self.tts_model, text, self.tts_config, self.use_cuda, self._ap, False, self.tts_config.enable_eos_bos_chars) if not self.use_gl: waveform = self.wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(self.device), batched=self.batched_wavernn, target=11000, overlap=550) self._ap.save_wav(waveform, out_path)
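# Usage sketch for MozillaTTS above; the model and config paths are
# placeholders for the Google Drive artifacts referenced in the docstring.
# Without wavernn_model/wavernn_config the wrapper falls back to Griffin-Lim.
mozilla_tts = MozillaTTS(
    tts_model="tts_model/checkpoint.pth.tar",
    tts_config="tts_model/config.json",
    device="cpu",  # automatically falls back to CPU when CUDA is unavailable
)
mozilla_tts("Hello world.", "output.wav")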