Example #1
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout).cuda()

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    model.restore(paths.tts_latest_weights)

    # model.reset_step()
    # model.set_r(hp.tts_r)

    optimiser = optim.Adam(model.parameters())  # lr is set per schedule session below

    current_step = model.get_step()

    if not force_gta:
        for session in hp.tts_schedule:
            r, lr, max_step, batch_size = session
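
Example #1 is cut off inside the schedule loop. As a rough sketch only (the set_r call mirrors the commented-out line above; the rest is assumed, not taken from this project), a session tuple would typically be consumed like this:

    if current_step < max_step:
        model.set_r(r)                    # reduction factor for this session
        for g in optimiser.param_groups:  # apply the session's learning rate
            g['lr'] = lr
        # ...then build a DataLoader with this batch_size and train
        # until model.get_step() reaches max_step (omitted here).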
Example #2
    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).cuda()

    tts_restore_path = weights_path if weights_path else paths.tts_latest_weights
    tts_model.restore(tts_restore_path)

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]
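
    # text_to_sequence returns a list of integer symbol IDs (one per cleaned
    # symbol), so inputs holds one ID sequence per line of input text.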

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  ('Tacotron', str(tts_k) + 'k'),
                  ('r', tts_model.r.item()),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])
Example #3
    # Instantiate Tacotron Model
    tts_model = Tacotron(r=hp.tts_r,
                         embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).cuda()

    tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  ('Tacotron', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])
Example #4
    def TTS_Wave(self):
        os.makedirs('quick_start/tts_weights/', exist_ok=True)
        os.makedirs('quick_start/voc_weights/', exist_ok=True)

        with zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r') as zip_ref:
            zip_ref.extractall('quick_start/voc_weights/')

        with zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r') as zip_ref:
            zip_ref.extractall('quick_start/tts_weights/')

        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS Generator')
        parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
        parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
        parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
        parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
        parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
        parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
        parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
        parser.set_defaults(batched=hp.voc_gen_batched)
        parser.set_defaults(target=hp.voc_target)
        parser.set_defaults(overlap=hp.voc_overlap)
        parser.set_defaults(input_text=None)
        parser.set_defaults(weights_path=None)
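        # Note: set_defaults also creates args.weights_path below even though
        # no --weights_path argument is registered on this parser.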
        args = parser.parse_args()

        batched = args.batched
        target = args.target
        overlap = args.overlap
        input_text = args.input_text
        weights_path = args.weights_path

        if not args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        print('\nInitialising WaveRNN Model...\n')

        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode='MOL').to(device)  # mixture-of-logistics output head

        voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                             num_chars=len(symbols),
                             encoder_dims=hp.tts_encoder_dims,
                             decoder_dims=hp.tts_decoder_dims,
                             n_mels=hp.num_mels,
                             fft_bins=hp.num_mels,
                             postnet_dims=hp.tts_postnet_dims,
                             encoder_K=hp.tts_encoder_K,
                             lstm_dims=hp.tts_lstm_dims,
                             postnet_K=hp.tts_postnet_K,
                             num_highways=hp.tts_num_highways,
                             dropout=hp.tts_dropout).to(device)

        tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

        if input_text:
            inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
        else:
            with open('final.txt') as f:
                inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([('WaveRNN', str(voc_k) + 'k'),
                      (f'Tacotron(r={r})', str(tts_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

        for i, x in enumerate(inputs, 1):

            print("f'\n| Generating {i}/{len(inputs)}'")
            _, m, attention = tts_model.generate(x)

            save_path = './output_audio/' + str(i) + '.wav'

            # save_attention(attention, save_path)

            m = torch.tensor(m).unsqueeze(0)  # add a batch dimension
            m = (m + 4) / 8                   # rescale mels from [-4, 4] to [0, 1]

            voc_model.generate(m, save_path, batched, target, overlap, hp.mu_law)


            # Stitch successive sentence wavs into one running file named after
            # the source document (self.path), deleting partials as they merge.
            if i == 2:

                temp1 = AudioSegment.from_wav("./output_audio/"+str(i-1)+".wav")
                temp2 = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = temp1 + temp2

                os.remove("./output_audio/"+str(i-1)+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")

            elif i > 2:

                preTemp = AudioSegment.from_wav("./output_audio/"+self.path[:-4]+".wav")

                newTemp = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = preTemp + newTemp

                os.remove("./output_audio/"+self.path[:-4]+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")


        print("Done")