Example #1
    def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        x, m, ids, x_lens, m_lens = session.val_sample
        x, m = x.to(device), m.to(device)

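        # Teacher-forced forward pass: the decoder consumes the ground-truth
        # mels, so these outputs are "ground-truth aligned" (GTA); only the
        # first sample is plotted, truncated to 600 frames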
        m1_hat, m2_hat, att = model(x, m)
        att = np_now(att)[0]
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_fig = plot_mel(m)

        self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

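        # Free-running generation from the first validation text, allowing
        # 20 extra decoder steps beyond the reference mel length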
        m1_hat, m2_hat, att = model.generate(x[0].tolist(),
                                             steps=m_lens[0] + 20)
        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        self.writer.add_figure('Generated/attention', att_fig, model.step)
        self.writer.add_figure('Generated/target', m_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = reconstruct_waveform(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
Example #2
    def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
        model.eval()
        device = next(model.parameters()).device
        batch = session.val_sample
        batch = to_device(batch, device=device)
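        # Teacher-forced forward pass: predictions stay aligned with the
        # ground-truth mels in the batch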
        m1_hat, m2_hat, att = model(batch['x'], batch['mel'])
        att = np_now(att)[0]
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m_target = np_now(batch['mel'])[0, :600, :]

        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        m_target_fig = plot_mel(m_target)

        self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig,
                               model.step)
        self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig,
                               model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)
        target_wav = self.dsp.griffinlim(m_target)

        self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)

        m1_hat, m2_hat, att = model.generate(batch['x'][0:1],
                                             steps=batch['mel_len'][0] + 20)
        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)

        self.writer.add_figure('Generated/attention', att_fig, model.step)
        self.writer.add_figure('Generated/target', m_target_fig, model.step)
        self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
        self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

        m2_hat_wav = self.dsp.griffinlim(m2_hat)

        self.writer.add_audio(tag='Generated/target_wav',
                              snd_tensor=target_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag='Generated/postnet_wav',
                              snd_tensor=m2_hat_wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
Example #3
    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    r = tts_model.r

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', 11_000 if batched else 'N/A'),
                  ('Overlap Samples', 550 if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)

        if input_text:
            save_path = f'quick_start/__input_{input_text[:10]}_{tts_k}k.wav'
        else:
            save_path = f'quick_start/{i}_batched{str(batched)}_{tts_k}k.wav'

        # save_attention(attention, save_path)

        m = torch.tensor(m).unsqueeze(0)  # add a batch dimension
        m = (m + 4) / 8  # rescale mels from roughly [-4, 4] to [0, 1] for WaveRNN
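        # 11_000 and 550 are the batched-generation target and overlap sample
        # counts shown in the table above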
        voc_model.generate(m, save_path, batched, 11_000, 550, hp.mu_law)

    print('\n\nDone.\n')
Example #4
class TaiwaneseTacotron():
    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.iters = 32  # Griffin-Lim iterations, used when vocoder == 'griffinlim'
        self.args.force_cpu = False
        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            raise ValueError('Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)

            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitialising Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Information === #
        if hp.tts_model == 'tacotron':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

        elif hp.tts_model == 'tacotron2':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron2', str(tts_k) + 'k'),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

    def generate(self, 華, input_text):
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs, input_text)

        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs, input_text)

        else:
            print(f'Wrong tts model type: {hp.tts_model}')

        print('\n\nDone.\n')

    # custom function
    def gen_tacotron2(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)

            x = np.array(x)[None, :]
            x = torch.from_numpy(x).long().to(next(self.tts_model.parameters()).device)

            self.tts_model.eval()
            mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(
                x)
            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'

            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##

            if self.args.vocoder == 'wavernn':
                m = mel_outputs_postnet
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)

    # custom function
    def gen_tacotron(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)
            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)

            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'
            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##
            if self.args.vocoder == 'wavernn':
                m = torch.tensor(m).unsqueeze(0)
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
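
A minimal usage sketch for the class above, assuming hparams.py points at trained
weights; the example strings are hypothetical, with 華 carrying the
Chinese-character sentence (used only to build the output file name) and
input_text carrying the romanized text fed to text_to_sequence:

    tts = TaiwaneseTacotron()
    # 華 names the output wav; input_text drives synthesis
    tts.generate(華='你好。', input_text='li2-ho2')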
Example #5
    def TTS_Wave(self):
        os.makedirs('quick_start/tts_weights/', exist_ok=True)
        os.makedirs('quick_start/voc_weights/', exist_ok=True)

        with zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r') as zip_ref:
            zip_ref.extractall('quick_start/voc_weights/')

        with zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r') as zip_ref:
            zip_ref.extractall('quick_start/tts_weights/')

        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS Generator')
        parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
        parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
        parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
        parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
        parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
        parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
        parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
        parser.set_defaults(batched=hp.voc_gen_batched)
        parser.set_defaults(target=hp.voc_target)
        parser.set_defaults(overlap=hp.voc_overlap)
        parser.set_defaults(input_text=None)
        parser.set_defaults(weights_path=None)
        args = parser.parse_args()

        batched = args.batched
        target = args.target
        overlap = args.overlap
        input_text = args.input_text
        weights_path = args.weights_path

        if not args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        print('\nInitialising WaveRNN Model...\n')

        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode='MOL').to(device)

        voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                             num_chars=len(symbols),
                             encoder_dims=hp.tts_encoder_dims,
                             decoder_dims=hp.tts_decoder_dims,
                             n_mels=hp.num_mels,
                             fft_bins=hp.num_mels,
                             postnet_dims=hp.tts_postnet_dims,
                             encoder_K=hp.tts_encoder_K,
                             lstm_dims=hp.tts_lstm_dims,
                             postnet_K=hp.tts_postnet_K,
                             num_highways=hp.tts_num_highways,
                             dropout=hp.tts_dropout).to(device)


        tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

        if input_text:
            inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
        else:
            with open('final.txt') as f:
                inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([('WaveRNN', str(voc_k) + 'k'),
                      (f'Tacotron(r={r})', str(tts_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

        for i, x in enumerate(inputs, 1):

            print("f'\n| Generating {i}/{len(inputs)}'")
            _, m, attention = tts_model.generate(x)

            save_path = './output_audio/' + str(i) + '.wav'

            # save_attention(attention, save_path)

            m = torch.tensor(m).unsqueeze(0)
            m = (m + 4) / 8

            voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)


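            # Stitch successive clips into one rolling output file with pydub:
            # the first two clips are merged, then each later clip is appended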
            if i == 2:

                temp1 = AudioSegment.from_wav("./output_audio/"+str(i-1)+".wav")
                temp2 = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = temp1 + temp2

                os.remove("./output_audio/"+str(i-1)+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")

            elif i > 2:

                preTemp = AudioSegment.from_wav("./output_audio/"+self.path[:-4]+".wav")

                newTemp = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = preTemp + newTemp

                os.remove("./output_audio/"+self.path[:-4]+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")


        print("Done")
Example #6
    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  ('Tacotron', str(tts_k) + 'k'),
                  ('r', tts_model.r.item()),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):
  
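        # get_spk_embed (defined in the enclosing script) is assumed to return
        # a speaker embedding and a file name; the embedding conditions both
        # tts_model.generate and voc_model.generate below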
        spk_embds, file_name = get_spk_embed(files, enc_path)

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x, spk_embds)

        if input_text:
            # save_path = f'{paths.tts_output}__input_{input_text[:10]}_{tts_k}k.wav'
            save_path = f'{out}{i}_{file_name}_batched{str(batched)}_{tts_k}k.wav'
        else:
            save_path = f'{out}{i}_{file_name}_batched{str(batched)}_{tts_k}k.wav'

        if save_attn:
            save_attention(attention, save_path)

        m = torch.tensor(m).unsqueeze(0)
        m = (m + 4) / 8

        voc_model.generate(m, spk_embds, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)

    print('\n\nDone.\n')