Example No. 1
 def load_data(self, idx):
     if self.cached:
         wav_name = self.items[idx][1]
         mel_name = self.items[idx][2]
         linear_name = self.items[idx][3]
         text = self.items[idx][0]
         text = np.asarray(text_to_sequence(text, [self.cleaners]),
                           dtype=np.int32)
         if wav_name.split('.')[-1] == 'npy':
             wav = self.load_np(wav_name)
         else:
             wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
         mel = self.load_np(mel_name)
         linear = self.load_np(linear_name)
         sample = {
             'text': text,
             'wav': wav,
             'item_idx': self.items[idx][1],
             'mel': mel,
             'linear': linear
         }
     else:
         text, wav_file = self.items[idx]
         text = np.asarray(text_to_sequence(text, [self.cleaners]),
                           dtype=np.int32)
         wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
         sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
     return sample
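
Note: every example on this page runs raw text through text_to_sequence before tensorizing it. A minimal standalone sketch, assuming the Tacotron-style text module layout these repos share:

import numpy as np
from text import text_to_sequence  # module path is an assumption

seq = text_to_sequence('Hello, world.', ['english_cleaners'])
seq = np.asarray(seq, dtype=np.int32)  # 1-D array of symbol IDs, as in load_data above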
Example No. 2
def tsau(input_text, save_path):
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        simple_table([
            ('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r),
            ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
            ('Generation Mode', 'Batched' if batched else 'Unbatched'),
            ('Target Samples', target if batched else 'N/A'),
            ('Overlap Samples', overlap if batched else 'N/A')
        ])

    elif args.vocoder == 'griffinlim':
        tts_k = tts_model.get_step() // 1000
        simple_table([('Tacotron', str(tts_k) + 'k'), ('r', tts_model.r),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
        # Fix mel spectrogram scaling to be from 0 to 1
        m = (m + 4) / 8
        np.clip(m, 0, 1, out=m)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if save_attn:
            save_attention(attention, save_path)

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, target, overlap,
                               hp.mu_law)
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
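
Note: the m = (m + 4) / 8 step assumes the model emits mels normalized to roughly [-4, 4] and rescales them into [0, 1] for the vocoder. The same rescale-and-clip in isolation:

import numpy as np

m = np.random.uniform(-4.5, 4.5, (80, 200)).astype(np.float32)  # stand-in mel
m = (m + 4) / 8          # map [-4, 4] -> [0, 1]
np.clip(m, 0, 1, out=m)  # clamp anything outside the assumed range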
Example No. 3
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
         self.input_adapter = lambda sen: phoneme_to_sequence(
             sen, [self.tts_config.text_cleaner],
             self.tts_config.phoneme_language,
             self.tts_config.enable_eos_bos_chars)
     else:
         self.input_size = len(symbols)
         self.input_adapter = lambda sen: text_to_sequence(
             sen, [self.tts_config.text_cleaner])
     self.tts_model = setup_model(self.input_size, self.tts_config)
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file,
                         map_location=lambda storage, loc: storage)
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
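
Note: the map_location=lambda storage, loc: storage idiom keeps CUDA-saved tensors on the CPU. On current PyTorch the same load can be spelled more directly (checkpoint path is illustrative):

import torch

cp = torch.load('checkpoint.pth', map_location='cpu')  # CPU-safe load of a CUDA checkpoint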
Example No. 4
def tts(model, text):
    """Convert text to speech waveform given a Tacotron model.
	"""
    if USE_CUDA:
        model = model.cuda()

    # NOTE: dropout in the decoder should be activated for generalization!
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if USE_CUDA:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, gate_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
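
Note: torch.autograd.Variable is a no-op wrapper on modern PyTorch, so the sequence preparation above reduces to a plain tensor conversion; a sketch with a stand-in sequence:

import numpy as np
import torch

seq = np.array([12, 5, 9], dtype=np.int64)  # stand-in output of text_to_sequence
seq = torch.from_numpy(seq).unsqueeze(0)    # add batch dim; no Variable wrapper needed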
Example No. 5
    def load_data(self, idx):
        text, wav_file, speaker_name = self.items[idx]

        # print(" | >  load_data idx: {}".format(self.items[idx]))
        #  print(" | >  load_data wav_file: {}".format(wav_file))
        #  print(" | >  load_data text: {}".format(text))
        #  print(" | >  load_data speaker_name: {}".format(speaker_name))

        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

        if self.use_phonemes:
            text = self.load_phoneme_sequence(wav_file, text)
        else:
            text = np.asarray(text_to_sequence(text, [self.cleaners]),
                              dtype=np.int32)

        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]

        sample = {
            'text': text,
            'wav': wav,
            'item_idx': self.items[idx][1],
            'speaker_name': speaker_name
        }
        return sample
Example No. 6
def tts(model, text):
	"""Convert text to speech waveform given a Tacotron model.
	"""
	if USE_CUDA:
		model = model.cuda()
	# TODO: Turning off dropout of decoder's prenet causes serious performance regression, not sure why.
	# model.decoder.eval()
	model.encoder.eval()
	model.postnet.eval()

	sequence = np.array(text_to_sequence(text))
	sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
	if USE_CUDA:
		sequence = sequence.cuda()

	# Greedy decoding
	mel_outputs, linear_outputs, alignments = model(sequence)

	linear_output = linear_outputs[0].cpu().data.numpy()
	spectrogram = audio._denormalize(linear_output)
	alignment = alignments[0].cpu().data.numpy()

	# Predicted audio signal
	waveform = audio.inv_spectrogram(linear_output.T)

	return waveform, alignment, spectrogram
Example No. 7
    def synthesis(self,
                  text,
                  speaker_embedding,
                  noise_embedding,
                  wave_path="log/synthesis/wave/",
                  plot_path="log/synthesis/plot/"):
        """
        TODO: Provide Batch Synthesis
        :param text: "hello, world"
        :param speaker_embedding: Any[Speaker]
        :param noise_embedding: Any [Noise]
        :param wave_path: "log/synthesis/wave"
        :param plot_path: "log/synthesis/plot"
        :return: FloatTensor [Time] for wave, FloatTensor [Encoder, Decoder] for attention
        """
        with torch.no_grad():
            makedirs(str(wave_path), exist_ok=True)
            makedirs(str(plot_path), exist_ok=True)
            phone = text_to_sequence(text.strip(), hp.cleaner_names)
            mel, linear, attention = self.tacotron.generate(
                phone, speaker_embedding, noise_embedding)

            audio_path = f'{wave_path}_GL_input_{text[:10]}_{self.tts_k}k.wav'
            atten_path = f"{plot_path}_Attention_input_{text[:10]}_{self.tts_k}k"

            save_attention(attention, atten_path)

            print(list(linear.shape))
            wave = self.stft.inverse_linear(linear, iteration=40)[0]
            save_from_torch(wave, audio_path, hp.sampling_rate)

            return wave, attention
Example No. 8
def create_attention_guides(fpath):
    dataset_ids = []
    mel_lengths = []
    text_lengths = []

    with open(f'{fpath}/dataset.pkl', 'rb') as f:
        dataset = pickle.load(f)

    for (item_id, l) in dataset:
        dataset_ids += [item_id]
        mel_lengths += [l]

    with open(f'{fpath}/text_dict.pkl', 'rb') as f:
        text_dict = pickle.load(f)

    for item_id in dataset_ids:
        x = text_to_sequence(text_dict[item_id], ['blizz_cleaners'])

        text_lengths += [len(x)]

    for i, id in enumerate(dataset_ids):

        attfile = os.path.join(fpath, 'diagonal_attention_guides', id + '.npy')
        att = get_attention_guide(text_lengths[i], mel_lengths[i], g=0.2)
        np.save(attfile, att)
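
Note: get_attention_guide is not shown here. A common formulation of a diagonal guide with sharpness g (Tachibana et al.'s guided attention; the repo's version may differ in detail):

import numpy as np

def diagonal_attention_guide(text_len, mel_len, g=0.2):
    # near 0 on the diagonal, approaching 1 far from it
    n = np.arange(text_len)[:, None] / text_len
    t = np.arange(mel_len)[None, :] / mel_len
    return 1.0 - np.exp(-((n - t) ** 2) / (2 * g ** 2))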
Example No. 9
 def load_data(self, idx):
     if self.cached:
         # use one variable name so the use_phonemes branch below works in both paths
         wav_file = self.items[idx][1]
         mel_name = self.items[idx][2]
         linear_name = self.items[idx][3]
         text = self.items[idx][0]

         if wav_file.split('.')[-1] == 'npy':
             wav = self.load_np(wav_file)
         else:
             wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
         mel = self.load_np(mel_name)
         linear = self.load_np(linear_name)
     else:
         text, wav_file = self.items[idx]
         wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
         mel = None
         linear = None
     
     if self.use_phonemes:
         text = self.load_phoneme_sequence(wav_file, text)
     else: 
         text = np.asarray(
             text_to_sequence(text, [self.cleaners]), dtype=np.int32)
     sample = {
         'text': text,
         'wav': wav,
         'item_idx': os.path.basename(self.items[idx][1]).split('.')[0],
         'mel': mel,
         'linear': linear
     }
     return sample
Example No. 10
File: eval.py Project: geneing/TTS
 def load_model(self, model_path, model_config, wavernn_path, use_cuda):
     
     self.model_file = model_path
     print(" > Loading model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", self.model_file)
     config = load_config(model_config)
     self.config = config
     self.use_cuda = use_cuda
     self.use_phonemes = config.use_phonemes
     self.ap = AudioProcessor(**config.audio)
     
     if self.use_phonemes:
         self.input_size = len(phonemes)
         self.input_adapter = lambda sen: phoneme_to_sequence(
             sen, [self.config.text_cleaner], self.config.phoneme_language)
     else:
         self.input_size = len(symbols)
         self.input_adapter = lambda sen: text_to_sequence(
             sen, [self.config.text_cleaner])
     
     self.model = Tacotron(self.input_size, config.embedding_size,
                           self.ap.num_freq, self.ap.num_mels, config.r,
                           attn_windowing=True)
     self.model.decoder.max_decoder_steps = 8000
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
     cp = torch.load(self.model_file,
                     map_location=lambda storage, loc: storage)
     # load the model
     self.model.load_state_dict(cp['model'])
     if use_cuda:
         self.model.cuda()
     self.model.eval()
     self.vocoder = WaveRNNVocoder.Vocoder()
     self.vocoder.loadWeights(wavernn_path)
     self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
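
Note: the last line designs a 65-7600 Hz band-pass FIR at 16 kHz; it is presumably applied to the vocoder output elsewhere in the class, along the lines of:

import numpy as np
from scipy import signal

fir = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
wav = np.random.randn(16000).astype(np.float32)  # stand-in 1 s of 16 kHz audio
filtered = signal.lfilter(fir, 1.0, wav)         # zero-state FIR filtering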
Example No. 11
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.config.text_cleaner])

        self.model = Tacotron(self.input_size, config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()
Example No. 12
 def __getitem__(self, index):
     id = self.metadata[index]
     x = text_to_sequence(self.text_dict[id], hp.tts_cleaner_names)
     mel = np.load(f'{self.path}mel/{id}.npy')
     spk_embed = np.load(f'{self.path}spk_embeds/{id}.npy')
     mel_len = mel.shape[-1]
     return x, mel, id, mel_len, spk_embed
Example No. 13
    def __getitem__(self, index):
        item_id = self.metadata[index]
        text = self.text_dict[item_id]
        x = text_to_sequence(text)
        x = np.array(x)

        pad_idx = 10
        punc_level = np.full_like(x, pad_idx)
        new_x = []
        in_quote = False
        for i, ph_idx in enumerate(x[::-1]):
            if ph_idx in PUNCTUATION_INDICES:
                punc_level[:len(x) - i] = ph_idx
            if ph_idx == 3:  # closing bracket
                punc_level[:len(x) - i] = pad_idx
            if ph_idx == 2:
                if in_quote:
                    punc_level[:len(x) - i] = pad_idx
                else:
                    in_quote = True
            # if ph_idx not in PUNCTUATION_INDICES:
            else:
                new_x.append(ph_idx)
        new_x = np.array(new_x[::-1])
        x = np.stack([new_x, punc_level])

        # print("!" * 100)
        # print("LENS", len(punc_level), len(new_x))
        # print(new_x)
        # print(punc_level)
        mel = np.load(str(self.path / 'mel' / f'{item_id}.npy'))
        mel_len = mel.shape[-1]
        dur = np.load(str(self.path / 'alg' / f'{item_id}.npy'))
        pitch = np.load(str(self.path / 'phon_pitch' / f'{item_id}.npy'))
        return x, mel, item_id, mel_len, dur, pitch
Example No. 14
 def __getitem__(self, index):
     item_id = self.metadata[index]
     text = self.text_dict[item_id]
     x = text_to_sequence(text)
     mel = np.load(self.path / 'mel' / f'{item_id}.npy')
     mel_len = mel.shape[-1]
     return x, mel, item_id, mel_len
Example No. 15
 def __getitem__(self, idx):
     text, wav_file = self.items[idx]
     text = np.asarray(text_to_sequence(text, [self.cleaners]),
                       dtype=np.int32)
     wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
     sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
     return sample
Example No. 16
    def synthesize(self, inputs):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq_input = [text_to_sequence(j, cleaner_names) for j in inputs]
        seq_length = [len(j) for j in seq_input]
        max_len = max(seq_length)
        inputs = [_pad_input(j, max_len) for j in seq_input]
        seq = np.stack(inputs)  # np.stack needs a real sequence; a generator raises on newer NumPy

        # seq = text_to_sequence(text, cleaner_names)
        if not self.model_filename.endswith('.pb'):
            feed_dict = {
                self.model.inputs: np.asarray(seq, dtype=np.int32),
                self.model.input_lengths: np.asarray(seq_length, dtype=np.int32)
            }
        else:
            feed_dict = {
                self.inputs: np.asarray(seq, dtype=np.int32),
                self.input_lengths: np.asarray(seq_length, dtype=np.int32)
            }

        wav = self.session.run(self.wav_output, feed_dict=feed_dict)

        output = []
        print('wav.shape:', wav.shape)
        for wav_index in range(wav.shape[0]):
            wav_index_temp = audio.inv_preemphasis(wav[wav_index])

            wav_index_temp = wav_index_temp[:audio.find_endpoint(wav_index_temp)]
            # wav_index_temp = vad_check(wav_index_temp, hparams.sample_rate)

            out = io.BytesIO()
            audio.save_wav(wav_index_temp, out)
            output.append(out)
        return output
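
Note: _pad_input is not shown. A hypothetical helper consistent with how it is called above, right-padding each ID list to the batch maximum:

def _pad_input(seq, max_len, pad=0):
    # hypothetical: right-pad a list of symbol IDs to max_len
    return seq + [pad] * (max_len - len(seq))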
Example No. 17
 def __getitem__(self, idx):
     wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
     text = self.frames[idx][1]
     text = np.asarray(text_to_sequence(text, [self.cleaners]),
                       dtype=np.int32)
     wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
     sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
     return sample
Example No. 18
 def __getitem__(self, index):
     item_id = self.metadata[index]
     text = self.text_dict[item_id]
     x = text_to_sequence(text)
     mel = np.load(str(self.path / 'mel' / f'{item_id}.npy'))
     mel_len = mel.shape[-1]
     dur = np.load(str(self.path / 'alg' / f'{item_id}.npy'))
     pitch = np.load(str(self.path / 'phon_pitch' / f'{item_id}.npy'))
     return x, mel, item_id, mel_len, dur, pitch
Example No. 19
 def loading_thread(self):
     while True:
         try:
             text, wave, speaker, male, augmented = self.loader.sample()
             phoneme = text_to_sequence(text, hp.cleaner_names)
             phoneme = torch.from_numpy(np.int64(phoneme))
             self.loading_queue.put(
                 (phoneme.to(self.device, non_blocking=True),
                  wave.to(self.device, non_blocking=True), speaker, augmented))
         except Exception as e:
             print("Loading Thread Error", str(e))
Example No. 20
def synthesis(model, ap, text, use_cuda, text_cleaner):
    text_cleaner = [text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda().long()
    _, linear_out, alignments, _ = model.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    wav = ap.inv_spectrogram(linear_out.T)
    return wav, linear_out, alignments
Example No. 21
 def __getitem__(self, index):
     item_id = self.metadata[index]
     x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
     mel = np.load(self.path / 'mel' / f'{item_id}.npy')
     mel_len = mel.shape[-1]
     if hp.mode in ['teacher_forcing', 'attention_forcing_online']:
         return x, mel, item_id, mel_len
     elif hp.mode == 'attention_forcing_offline':
         attn_ref = np.load(self.path / hp.attn_ref_path / f'{item_id}.npy')
         return x, mel, item_id, mel_len, attn_ref
Example No. 22
def getTTS(input_text, batched, voc_model, tts_model, hp):

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    r = tts_model.r

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', 11_000 if batched else 'N/A'),
                  ('Overlap Samples', 550 if batched else 'N/A')])

    wav_list = []

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)

        save_path = './sound/' + str(uuid.uuid4()) + '.wav'

        m = torch.tensor(m).unsqueeze(0)
        m = (m + 4) / 8

        wav_file = voc_model.generate(m, save_path, batched, 3000, 550,
                                      hp.mu_law)

        wav_list.append(wav_file)
        wav_list.append(save_path)

    print('\n\nDone.\n')

    return wav_list
Example No. 23
 def __getitem__(self, index):
     item_id = self.metadata[index]
     x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
     mel = np.load(self.path / 'mel' / f'{item_id}.npy')
     mel_len = mel.shape[-1]
     if self.alignments:
         dur = np.load(self.path / 'alg' / f'{item_id}.npy')
     else:
         # dummy durations to simplify collate func
         dur = np.zeros((mel.shape[0], 1))
     return x, mel, item_id, mel_len, dur
Example No. 24
    def __getitem__(self, index):
        item_id = self.metadata[index]
        text = self.text_dict[item_id]
        x = text_to_sequence(text)
        x = np.array([ch for ch in x if ch not in PUNCTUATION_INDICES])

        x = np.stack([x, np.zeros_like(x)])

        mel = np.load(str(self.path / 'mel' / f'{item_id}.npy'))
        mel_len = mel.shape[-1]
        return x, mel, item_id, mel_len
Example No. 25
 def __getitem__(self, idx):
     sidx = self.frames[idx][0]
     sidx_files = self.wav_files_dict[sidx]
     file_name = random.choice(sidx_files)
     wav_name = os.path.join(self.wav_dir, file_name)
     text = self.frames[idx][2]
     text = np.asarray(text_to_sequence(text, [self.cleaners]),
                       dtype=np.int32)
     wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
     sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
     return sample
Example No. 26
    def generate(self, 華, input_text):
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs)

        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs)

        else:
            print(f"Wrong tts model type {{{tts_model_type}}}")

        print('\n\nDone.\n')
Example No. 27
	def _get_next_example(self):
		"""
		Gets a single example (input, mel_target, target_length) from disk
		"""
		if self._offset >= len(self._metadata):
			self._offset = 0
			np.random.shuffle(self._metadata)
		meta = self._metadata[self._offset]
		self._offset += 1
		text = meta[2]
		input_data = np.asarray(text_to_sequence(text, Config.Cleaners), dtype=np.int32)
		mel_target = np.load(os.path.join(self._datadir, meta[0]))
		return (input_data, mel_target, len(mel_target))
Example No. 28
    def synthesize(self, text, index, out_dir):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        mels = self.session.run(self.mel_outputs, feed_dict=feed_dict)

        # Write the spectrogram to disk
        mel_filename = 'ljspeech-mel-eval-{:05d}.npy'.format(index)
        np.save(os.path.join(out_dir, mel_filename), mels, allow_pickle=False)

        print('mel spectrograms saved under {}'.format(out_dir))
Example No. 29
def synthesize(input_text, tts_model, voc_model, alpha=1.0,
               device=torch.device('cuda')):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target, hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).to(device)
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
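
Note: this example dispatches on the type of voc_model: the string 'griffinlim', a WaveRNN instance, or any module exposing .inference (a HiFi-GAN-style vocoder, for instance). An illustrative call, with tts_model assumed to be a loaded ForwardTacotron-style model:

wav = synthesize('Hello there.', tts_model, 'griffinlim', alpha=1.0)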
Example No. 30
def synthesize(input_text, tts_model, voc_model, alpha=1.0):
    x = text_to_sequence(input_text.strip(), ['english_cleaners'])
    m = tts_model.generate(x, alpha=alpha)
    # Fix mel spectrogram scaling to be from 0 to 1
    m = (m + 4) / 8
    np.clip(m, 0, 1, out=m)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    else:
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
        print()
    return wav