Пример #1
0
def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesise audio for the text ``s`` using ``m`` and ``wavernn``.

    The text is encoded as phoneme ids or plain character ids depending on
    ``CONFIG.use_phonemes``, pushed through ``m.forward`` as a batch of one,
    and the predicted mel spectrogram is passed to ``wavernn.generate``
    (``wavernn`` is a name defined outside this function).

    Args:
        m: model whose ``forward`` returns the 4-tuple
            ``(mel_spec, linear_spec, alignments, stop_tokens)``.
        s: input text.
        CONFIG: configuration exposing ``phoneme_language``,
            ``text_cleaner`` and ``use_phonemes``.
        use_cuda: when true, the encoded text is moved to the GPU before
            the forward pass.
        ap: not used by this variant; kept so existing callers keep working.
        language: phoneme language; falls back to
            ``CONFIG.phoneme_language`` when ``None``.

    Returns:
        The value returned by ``wavernn.generate`` for the predicted mel
        spectrogram.
    """
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]
    if CONFIG.use_phonemes:
        seq = np.asarray(phoneme_to_sequence(s, text_cleaner, language),
                         dtype=np.int32)
    else:
        seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
    # Add a leading batch dimension of size one.
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # Inference only: skip autograd bookkeeping during the forward pass.
    # All four outputs are still unpacked so a model returning a different
    # arity fails here, exactly as before.
    with torch.no_grad():
        mel_spec, _linear_spec, _alignments, _stop_tokens = m.forward(
            chars_var.long())
    mel_spec = mel_spec[0].detach().cpu().numpy()
    mel_tensor = torch.FloatTensor(mel_spec.T).unsqueeze(0)
    # NOTE(review): device placement here follows CUDA availability rather
    # than `use_cuda`; presumably this mirrors where `wavernn` lives --
    # confirm against the code that constructs `wavernn`.
    if torch.cuda.is_available():
        mel_tensor = mel_tensor.cuda()
    wav = wavernn.generate(mel_tensor, batched=True, target=11000, overlap=550)
    return wav
Пример #2
0
def visualize(alignment,
              spectrogram_postnet,
              stop_tokens,
              text,
              hop_length,
              CONFIG,
              spectrogram=None,
              output_path=None):
    """Draw alignment, stop-token curve and spectrogram(s) in one figure.

    The figure has three stacked panels (alignment image, stop-token curve,
    ``spectrogram_postnet``) and a fourth one when ``spectrogram`` is given.
    When ``CONFIG.use_phonemes`` is set, ``text`` is converted to phonemes
    and back, printed, and used for the alignment's y tick labels.

    Args:
        alignment: 2-D array-like; its transpose is shown as an image.
        spectrogram_postnet: array whose transpose is plotted in panel 3.
        stop_tokens: object supporting ``squeeze().detach().to('cpu').numpy()``
            (presumably a torch tensor -- confirm with callers).
        text: input text used for the y tick labels.
        hop_length: hop length forwarded to ``librosa.display.specshow``.
        CONFIG: configuration exposing ``use_phonemes``, ``text_cleaner``,
            ``phoneme_language``, ``enable_eos_bos_chars`` and
            ``audio['sample_rate']``.
        spectrogram: optional second spectrogram for panel 4.
        output_path: when truthy, the path is printed, the figure is saved
            there and then closed; otherwise the figure is left open.
    """
    has_second_spec = spectrogram is not None
    num_plot = 4 if has_second_spec else 3
    label_fontsize = 16
    fig = plt.figure(figsize=(8, 24))

    def _draw_spectrogram(spec, row):
        # One spectrogram panel; the call order matches the inline version.
        plt.subplot(num_plot, 1, row)
        librosa.display.specshow(spec.T,
                                 sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length,
                                 x_axis="time",
                                 y_axis="linear")
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    # Panel 1: attention alignment.
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
        phoneme_ids = phoneme_to_sequence(text, [CONFIG.text_cleaner],
                                          CONFIG.phoneme_language,
                                          CONFIG.enable_eos_bos_chars)
        text = sequence_to_phoneme(phoneme_ids)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # Panel 2: stop-token predictions.
    stop_values = stop_tokens.squeeze().detach().to('cpu').numpy()
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_values)), list(stop_values))

    # Panel 3 (and 4 when a second spectrogram was supplied).
    _draw_spectrogram(spectrogram_postnet, 3)
    if has_second_spec:
        _draw_spectrogram(spectrogram, 4)

    if not output_path:
        return
    print(output_path)
    fig.savefig(output_path)
    plt.close()
Пример #3
0
    def _generate_and_cache_phoneme_sequence(self, text, cache_path):
        """Convert ``text`` to a phoneme id array and save it to ``cache_path``.

        BOS/EOS characters are never added here because the result is
        cached; they are added dynamically later, based on the config
        option.

        Args:
            text: text to convert.
            cache_path: destination handed to ``np.save``.

        Returns:
            The phoneme ids as an ``np.int32`` array (the same array that
            was saved).
        """
        token_ids = phoneme_to_sequence(
            text,
            [self.cleaners],
            language=self.phoneme_language,
            enable_eos_bos=False,
        )
        encoded = np.asarray(token_ids, dtype=np.int32)
        np.save(cache_path, encoded)
        return encoded
def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesise audio for the text ``s`` using ``m`` and ``ap``.

    The text is encoded as phoneme ids or plain character ids depending on
    ``CONFIG.use_phonemes``, pushed through ``m.forward`` as a batch of one,
    and the predicted linear spectrogram is inverted to a waveform with
    ``ap.inv_spectrogram``. The waveform is then cut at the index returned
    by ``ap.find_endpoint``.

    Args:
        m: model whose ``forward`` returns the 4-tuple
            ``(mel_spec, linear_spec, alignments, stop_tokens)``.
        s: input text.
        CONFIG: configuration exposing ``phoneme_language``,
            ``text_cleaner`` and ``use_phonemes``.
        use_cuda: when true, the encoded text is moved to the GPU before
            the forward pass.
        ap: object providing ``inv_spectrogram`` and ``find_endpoint``.
        language: phoneme language; falls back to
            ``CONFIG.phoneme_language`` when ``None``.

    Returns:
        The waveform produced by ``ap.inv_spectrogram``, truncated at
        ``ap.find_endpoint(wav)``.
    """
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]
    if CONFIG.use_phonemes:
        seq = np.asarray(phoneme_to_sequence(s, text_cleaner, language),
                         dtype=np.int32)
    else:
        seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
    # Add a leading batch dimension of size one.
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # Inference only: skip autograd bookkeeping during the forward pass.
    # All four outputs are still unpacked so a model returning a different
    # arity fails here, exactly as before.
    with torch.no_grad():
        _mel_spec, linear_spec, _alignments, _stop_tokens = m.forward(
            chars_var.long())
    linear_spec = linear_spec[0].detach().cpu().numpy()
    wav = ap.inv_spectrogram(linear_spec.T)
    wav = wav[:ap.find_endpoint(wav)]
    return wav
Пример #5
0
def visualize(alignment,
              postnet_output,
              stop_tokens,
              text,
              hop_length,
              CONFIG,
              decoder_output=None,
              output_path=None,
              figsize=(8, 24)):
    """Draw alignment, stop-token curve and spectrogram(s) in one figure.

    The figure has three stacked panels (alignment image, stop-token curve,
    ``postnet_output``) and a fourth one when ``decoder_output`` is given.
    When ``CONFIG.use_phonemes`` is set, ``text`` is converted to phonemes
    and back, printed, and used for the alignment's y tick labels.

    Args:
        alignment: 2-D array-like; its transpose is shown as an image.
        postnet_output: array whose transpose is plotted in panel 3.
        stop_tokens: sized iterable of values plotted in panel 2.
        text: input text used for the y tick labels.
        hop_length: hop length forwarded to ``librosa.display.specshow``.
        CONFIG: configuration exposing ``use_phonemes``, ``text_cleaner``,
            ``phoneme_language``, ``enable_eos_bos_chars``, ``keys()``,
            optionally ``characters``, and ``audio`` with ``sample_rate``,
            ``mel_fmin`` and ``mel_fmax``.
        decoder_output: optional second spectrogram for panel 4.
        output_path: when truthy, the path is printed, the figure is saved
            there and then closed; otherwise the figure is left open.
        figsize: figure size passed to ``plt.figure``.
    """
    has_decoder_output = decoder_output is not None
    num_plot = 4 if has_decoder_output else 3
    label_fontsize = 16
    fig = plt.figure(figsize=figsize)

    def _draw_spectrogram(spec, row):
        # One spectrogram panel; the call order matches the inline version.
        plt.subplot(num_plot, 1, row)
        librosa.display.specshow(spec.T,
                                 sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length,
                                 x_axis="time",
                                 y_axis="linear",
                                 fmin=CONFIG.audio['mel_fmin'],
                                 fmax=CONFIG.audio['mel_fmax'])
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    # Panel 1: attention alignment.
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
        # Round-trip through the phoneme representation so the tick labels
        # show what the model actually consumed.
        if 'characters' in CONFIG.keys():
            char_table = CONFIG.characters
        else:
            char_table = None
        phoneme_ids = phoneme_to_sequence(text, [CONFIG.text_cleaner],
                                          CONFIG.phoneme_language,
                                          CONFIG.enable_eos_bos_chars,
                                          tp=char_table)
        text = sequence_to_phoneme(phoneme_ids, tp=char_table)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # Panel 2: stopnet predictions.
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # Panel 3: postnet spectrogram; panel 4: decoder output when supplied.
    _draw_spectrogram(postnet_output, 3)
    if has_decoder_output:
        _draw_spectrogram(decoder_output, 4)

    if not output_path:
        return
    print(output_path)
    fig.savefig(output_path)
    plt.close()