Example #1
import torch

# waveglow, melgan and mellotron are project-local helper modules (from the
# zhrtvc repository, per the commented paths below).
def hello():
    waveglow.load_waveglow_torch('../models/waveglow/waveglow_v5_model.pt')
    # melgan.load_melgan_model(r'E:\githup\zhrtvc\models\vocoder\saved_models\melgan\melgan_multi_speaker.pt',
    #                          args_path=r'E:\githup\zhrtvc\models\vocoder\saved_models\melgan\args.yml')
    melgan.load_melgan_torch('../models/melgan/melgan_multi_speaker_model.pt')

    # mellotron.load_mellotron_model(r'E:\githup\zhrtvc\models\mellotron\samples\checkpoint\checkpoint-000000.pt',
    #                                hparams_path=r'E:\githup\zhrtvc\models\mellotron\samples\metadata\hparams.yml')
    #
    # torch.save(mellotron._model, '../models/mellotron/mellotron_samples_model.pt')
    mellotron.load_mellotron_torch(
        '../models/mellotron/mellotron_samples_model.pt')

    # Random stand-in inputs for generate_mel: text (token ids), style mel, speaker ids, f0
    text = torch.randint(0, 100, [4, 50]).cuda()
    style = 0  # torch.rand(4, 80, 400).cuda()
    speaker = torch.randint(0, 10, [4]).cuda()
    f0 = None  # torch.rand(4, 400)

    mels = mellotron.generate_mel(text=text,
                                  style=style,
                                  speaker=speaker,
                                  f0=f0)

    for mel in mels:
        print(mel.shape)

    # Vocode a random mel spectrogram with WaveGlow as a smoke test.
    mel = torch.rand(4, 80, 400).cuda()

    wav = waveglow.generate_wave(mel)
    print(wav.shape)
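For reference, a quick shape sanity check on the vocoder output; this is a minimal sketch that assumes the common 22.05 kHz configuration with a hop length of 256 samples per mel frame (the hop length is an assumption, not something this snippet confirms):

import torch

mel = torch.rand(1, 80, 400).cuda()
wav = waveglow.generate_wave(mel)
# Under the assumed hop length, 400 mel frames should map to roughly
# 400 * 256 samples of audio per batch item.
print(wav.shape)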
Example #2
def tts_sdk_base(text, speaker='biaobei', audio='24', output='', **kwargs):
    """语音合成函数式SDK接口。
    text为待合成的文本。
    speaker可设置为内置的发音人名称,可选名称见_reference_audio_dict;默认的发音人名称列表见resource/reference_audio/__init__.py。
    audio如果是数字,则调用内置的语音作为发音人参考音频;如果是语音路径,则调用audio路径的语音作为发音人参考音频。
    output如果以.wav结尾,则为保存语音文件的路径;如果以play开头,则合成语音后自动播放语音。
    """
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)
        load_audio(**kwargs)

    if str(audio).isdigit():
        # A digit selects a built-in reference audio (1-based, wraps around).
        audio = _reference_audio_list[(int(audio) - 1) %
                                      len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # Raw wav bytes: spill them to a named temporary file so downstream
        # code can read the audio by path. (tempfile.TemporaryFile has no
        # usable .name on POSIX, hence NamedTemporaryFile.)
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 256:
        # A long string is assumed to be base64-encoded wav data.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError('unrecognized audio argument: {!r}'.format(audio))
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader,
        text=text,
        speaker=speaker,
        audio=audio,
        device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    # End frame: first gate value above the threshold; np.argmax returns 0
    # (falsy) when none crosses, so fall back to the peak-gate frame, then to
    # the full length.
    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > kwargs.get('gate_threshold', 0.2)
                        ) or np.argmax(out_gate) or out_gate.shape[0]

    mels_postnet = mels_postnet[:, :, :end_idx]
    vocoder_name = kwargs.get('vocoder', 'waveglow')
    if vocoder_name == 'waveglow':
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=10)

    wav_output = wavs.squeeze(0).cpu().numpy()

    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
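A minimal usage sketch of this interface (the values are illustrative, and it assumes the model files sit where load_models expects them):

# Built-in speaker, built-in reference audio #24, played after synthesis.
wav_bytes = tts_sdk_base('你好，世界。', speaker='biaobei', audio='24',
                         output='play')

# Reference audio from disk, saved to a wav file, Griffin-Lim vocoder
# (any vocoder value other than 'waveglow' selects Griffin-Lim).
wav_bytes = tts_sdk_base('你好，世界。', speaker='biaobei',
                         audio='reference.wav', output='out.wav',
                         vocoder='griffinlim')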
Example #3
def tts_sdk(text, speaker='biaobei', audio='0', **kwargs):
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)

    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) %
                                      len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # Raw wav bytes: spill to a named temp file so .name is a real path
        # (tempfile.TemporaryFile has no usable .name on POSIX).
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 100:
        # A long string is assumed to be base64-encoded wav data.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError('unrecognized audio argument: {!r}'.format(audio))
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader,
        text=text,
        speaker=speaker,
        audio=audio,
        device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    # Same gate-based end-frame heuristic as tts_sdk_base (see sketch below).
    end_idx = np.argmax(
        out_gate > 0.2) or np.argmax(out_gate) or out_gate.shape[0]

    mels_postnet = mels_postnet[:, :, :end_idx]
    if _use_waveglow:
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    wav_output = wavs.squeeze(0).cpu().numpy()

    output = kwargs.get('output', '')
    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
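Both SDK variants trim the mel with the same chained-or heuristic: the first frame where the stop gate crosses the threshold, else the peak-gate frame, else the full length (np.argmax returns 0, which is falsy, when no frame crosses, so each failed candidate falls through to the next). A self-contained sketch of that logic:

import numpy as np

def gate_end_index(out_gate, threshold=0.2):
    # First frame above the threshold; falls back to the peak frame, then to
    # the full length. (A crossing at frame 0 is indistinguishable from no
    # crossing, which is a quirk of this idiom.)
    return int(np.argmax(out_gate > threshold) or np.argmax(out_gate)
               or out_gate.shape[0])

gate = np.array([0.0, 0.05, 0.1, 0.6, 0.9])
print(gate_end_index(gate))         # 3: first frame above 0.2
print(gate_end_index(gate * 0.2))   # 4: no crossing, falls back to the peak
print(gate_end_index(np.zeros(5)))  # 5: flat gate, keep everything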
Example #4
        pydub.AudioSegment.silent(3000, frame_rate=args.sampling_rate).export(
            audio, format='wav')

        text = '这是个试水的例子。'  # "This is a trial example."
        speaker = 'speaker'
        text_data, style_data, speaker_data, f0_data = transform_mellotron_input_data(
            dataloader=dataloader,
            text=text,
            speaker=speaker,
            audio=audio,
            device=_device)

        mels, mels_postnet, gates, alignments = mellotron.generate_mel(
            text_data, style_data, speaker_data, f0_data)

        wavs = waveglow.generate_wave(mel=mels, **waveglow_kwargs)

        wav_output = wavs.squeeze().cpu().numpy()
        aukit.save_wav(wav_output,
                       os.path.join(tmpdir, 'demo_example.wav'),
                       sr=args.sampling_rate)

    print('Test completed successfully.')

    # Model inference

    if os.path.isfile(texts_path):
        text_inputs = [w.strip() for w in open(texts_path, encoding='utf8')]
        if args.is_simple:
            text_inputs = np.random.choice(text_inputs,
                                           min(len(text_inputs), 10),
Example #5
    else:
        _device = args.device

    # Load the models
    load_models(args)

    # Smoke-test the models
    text_test = '这是个试水的例子。'  # "This is a trial example."

    text_data, style_data, speaker_data, f0_data = transform_mellotron_input_data(
        text=text_test, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    wavs = waveglow.generate_wave(mel=mels)

    with tempfile.TemporaryDirectory() as tmpdir:
        wav_output = wavs.squeeze().cpu().numpy()
        aukit.save_wav(wav_output,
                       os.path.join(tmpdir, 'demo_example.wav'),
                       sr=args.sampling_rate)

    # Model inference

    if os.path.isfile(args.input):
        text_inputs = [w.strip() for w in open(args.input, encoding='utf8')]
    else:
        text_inputs = [args.input]

    output_dir = args.output
Example #6
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        # if not vocoder.is_loaded():
        if not waveglow.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            # gen_rate is in kHz; the factor 1000 converts it to Hz before
            # dividing by the sample rate to get the real-time factor.
            real_time_factor = (gen_rate / aukit._sr) * 1000  # Synthesizer.sample_rate
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        wav = None
        vocname = ""
        if self.ui.current_vocoder_fpath is not None:
            model_fpath = self.ui.current_vocoder_fpath
            vocname = Path(model_fpath).parent.stem
            wav = waveglow.generate_wave(spec)
            # if Path(model_fpath).parent.stem == "melgan":
            #     self.ui.log("Waveform generation with MelGAN... ")
            #     wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath)
            #
            # elif Path(model_fpath).parent.stem == "wavernn":
            #     self.ui.log("Waveform generation with WaveRNN... ")
            #     wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
            # elif Path(model_fpath).parent.stem == "waveglow":
            #     wav = waveglow.generate_wave(spec)
        if wav is None:
            vocname = "griffinlim"
            self.ui.log("Waveform generation with Griffin-Lim... ")
            # wav = Synthesizer.griffin_lim(spec)
            wav = aukit.inv_mel_spectrogram(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, aukit._sr)

        fref = self.ui.selected_utterance.name
        ftime = '{}'.format(time_formatter())
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / aukit._sr)
        fvoc = vocname
        fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext))
        audio.save_wav(wav, self._out_wav_dir.joinpath(fname), aukit._sr)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_{}".format(time_formatter())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #7
        pydub.AudioSegment.silent(3000, frame_rate=args.sampling_rate).export(
            audio, format='wav')

        text = '这是个试水的例子。'  # "This is a trial example."
        speaker = 'speaker'
        text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
            dataloader=dataloader,
            text=text,
            speaker=speaker,
            audio=audio,
            device=_device)

        mels, mels_postnet, gates, alignments = mellotron.generate_mel(
            text_data, style_data, speaker_data, f0_data)

        if _use_waveglow:
            wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
        else:
            wavs = _stft.griffin_lim(mels_postnet)

        # Save the result regardless of the vocoder branch taken.
        wav_output = wavs.squeeze().cpu().numpy()
        aukit.save_wav(wav_output,
                       os.path.join(tmpdir, 'demo_example.wav'),
                       sr=args.sampling_rate)

    print('Test completed successfully.')

    # Model inference

    if os.path.isfile(texts_path):
        text_inputs = [w.strip() for w in open(texts_path, encoding='utf8')]
        if args.is_simple:
            text_inputs = np.random.choice(text_inputs,
Example #8
def voice_clone_interface(audio: str, text: str, speaker: str) -> str:
    denoise.noisy_processing(audio, audio)  # denoise the input audio
    #    for text_input in tqdm(zip(audio_lst, text_lst, speaker_lst), 'TTS', total=len(audio_lst), ncols=100):
    # for text_input in tqdm(text_inputs, 'TTS', ncols=100):
    # print('Running: {}'.format(text_input))

    # # audio, text, speaker = text_input  # .split('\t')  # iterate one by one
    # print("audio:", audio)  # '/home/project/zhrtvc/data/samples/aishell3/wav/SSB00110401.wav'
    # print("text:", text)  # '三百零五千三百三十四。'
    # print("speaker:", speaker)  # 'SSB0011'

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=dataloader,
        text=text,
        speaker=speaker,
        audio=audio,
        device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    # First frame whose gate value exceeds 0.2; np.argmax returns 0 (falsy)
    # when nothing crosses, so the `or` falls back to the full length.
    end_idx = np.argmax(out_gate > 0.2) or out_gate.shape[0]

    mels_postnet = mels_postnet[:, :, :end_idx]
    if _use_waveglow:
        print("use waveglow:")
        wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
    else:
        print("use griffin_lim:")
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    # Save the outputs
    cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15]
    cur_time = time.strftime('%Y%m%d-%H%M%S')
    outpath = os.path.join(output_dir,
                           "demo_{}_{}_out.wav".format(cur_time, cur_text))
    # print("outpath的路径:",outpath)

    wav_output = wavs.squeeze(0).cpu().numpy()
    aukit.save_wav(wav_output, outpath,
                   sr=args.sampling_rate)  # sampling_rate=22050

    if isinstance(audio, (Path, str)) and Path(audio).is_file():
        # # Original reference audio
        # refpath_raw = os.path.join(output_dir, "demo_{}_{}_ref_copy.wav".format(cur_time, cur_text))
        # shutil.copyfile(audio, refpath_raw)

        # Resample the reference audio to the target sampling rate
        wav_input, sr = aukit.load_wav(audio, with_sr=True)
        wav_input = librosa.resample(wav_input, orig_sr=sr,
                                     target_sr=args.sampling_rate)
        refpath = os.path.join(output_dir,
                               "demo_{}_{}_ref.wav".format(cur_time, cur_text))
        aukit.save_wav(wav_input, refpath, sr=args.sampling_rate)

        # # Vocoder
        # wavs_ref = waveglow.generate_wave(mel=mel_data, **waveglow_kwargs)
        # outpath_ref = os.path.join(output_dir, "demo_{}_{}_ref_waveglow.wav".format(cur_time, cur_text))
        # wav_output_ref = wavs_ref.squeeze(0).cpu().numpy()
        # aukit.save_wav(wav_output_ref, outpath_ref, sr=args.sampling_rate)

    fig_path = os.path.join(output_dir,
                            "demo_{}_{}_fig.jpg".format(cur_time, cur_text))

    plot_mel_alignment_gate_audio(
        mel=mels_postnet.squeeze(0).cpu().numpy(),
        alignment=alignments.squeeze(0).cpu().numpy(),
        gate=gates.squeeze(0).cpu().numpy(),
        audio=wav_output[::args.sampling_rate // 1000])
    plt.savefig(fig_path)
    plt.close()
    # Info dump disabled for now
    # yml_path = os.path.join(output_dir, "demo_{}_{}_info.yml".format(cur_time, cur_text))
    # info_dict = locals2dict(locals())
    # with open(yml_path, 'wt', encoding='utf8') as fout:
    #     yaml.dump(info_dict, fout, encoding='utf-8', allow_unicode=True)

    # log_path = os.path.join(output_dir, "info_dict.txt".format(cur_time))
    # with open(log_path, 'at', encoding='utf8') as fout:
    #     fout.write('{}\n'.format(json.dumps(info_dict, ensure_ascii=False)))

    print('Test completed successfully. Cloned audio saved to:', outpath)
    denoise.noisy_processing(outpath, outpath)  # denoise the output audio
    return outpath
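A hypothetical call to this interface (the reference path, text and speaker id are placeholders; output_dir and args come from the surrounding module):

out_path = voice_clone_interface(audio='reference.wav',
                                 text='欢迎使用语音克隆。',
                                 speaker='SSB0011')
print(out_path)  # e.g. <output_dir>/demo_<time>_<text>_out.wav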