def hello():
    """Smoke-test the bundled synthesizer + vocoder checkpoints end to end.

    Loads the serialized waveglow / melgan / mellotron models, synthesizes
    mels from dummy inputs, and runs a random mel through waveglow,
    printing tensor shapes along the way.
    """
    # Load pre-serialized model files. (Raw-checkpoint alternatives exist:
    # melgan.load_melgan_model(<ckpt>, args_path=<yml>) and
    # mellotron.load_mellotron_model(<ckpt>, hparams_path=<yml>),
    # after which torch.save(mellotron._model, ...) produces these files.)
    waveglow.load_waveglow_torch('../models/waveglow/waveglow_v5_model.pt')
    melgan.load_melgan_torch('../models/melgan/melgan_multi_speaker_model.pt')
    mellotron.load_mellotron_torch(
        '../models/mellotron/mellotron_samples_model.pt')

    # Dummy synthesizer inputs: token ids, style, speaker ids, f0.
    dummy_text = torch.randint(0, 100, [4, 50]).cuda()
    dummy_style = 0  # alternatively: torch.rand(4, 80, 400).cuda()
    dummy_speaker = torch.randint(0, 10, [4]).cuda()
    dummy_f0 = None  # alternatively: torch.rand(4, 400)

    generated = mellotron.generate_mel(text=dummy_text, style=dummy_style,
                                       speaker=dummy_speaker, f0=dummy_f0)
    for out_tensor in generated:
        print(out_tensor.shape)

    # Vocoder check on a random mel spectrogram.
    random_mel = torch.rand(4, 80, 400).cuda()
    synthesized = waveglow.generate_wave(random_mel)
    print(synthesized.shape)
def tts_sdk_base(text, speaker='biaobei', audio='24', output='', **kwargs):
    """Functional SDK entry point for speech synthesis.

    Args:
        text: Text to synthesize.
        speaker: Built-in voice name; used as a lookup into
            _reference_audio_dict when ``audio`` does not resolve. The default
            name list lives in resource/reference_audio/__init__.py.
        audio: Reference-audio selector. A digit string indexes the built-in
            reference list; an existing file path is used directly; raw
            ``bytes`` or a long (>=256 chars) base64 string are written to a
            temporary wav file first.
        output: If it ends with '.wav' the result is saved to that path; if it
            starts with 'play' the result is played after synthesis.
        **kwargs: Forwarded to model/audio loading and the vocoder. Recognized
            keys seen here: 'gate_threshold' (default 0.2) and 'vocoder'
            (default 'waveglow').

    Returns:
        The synthesized waveform encoded as bytes (via aukit.anything2bytes).

    Raises:
        AssertionError: If ``audio`` cannot be resolved and ``speaker`` is not
            a known built-in voice.
    """
    global _dataloader
    if _dataloader is None:
        # Lazy one-time initialization of models and reference audio.
        load_models(**kwargs)
        load_audio(**kwargs)
    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) % len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # BUGFIX: tempfile.TemporaryFile has no usable filesystem name on
        # POSIX (.name is a file-descriptor number), so downstream loaders
        # could never open it. NamedTemporaryFile(delete=False) provides a
        # real path that survives closing; closing also flushes the payload
        # to disk before the path is consumed.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.close()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 256:
        # Long strings are assumed to be base64-encoded wav payloads.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.close()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader, text=text, speaker=speaker, audio=audio, device=_device)
    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)
    # Trim the mel at the first frame where the stop-gate fires; fall back to
    # the gate argmax, then to the full length (np.argmax returns 0 when the
    # threshold is never exceeded, making the `or` chain work).
    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > kwargs.get('gate_threshold', 0.2)
                        ) or np.argmax(out_gate) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]
    vocoder_name = kwargs.get('vocoder', 'waveglow')
    if vocoder_name == 'waveglow':
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=10)
    wav_output = wavs.squeeze(0).cpu().numpy()
    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
def tts_sdk(text, speaker='biaobei', audio='0', **kwargs):
    """Synthesize ``text`` and return the waveform as bytes.

    Args:
        text: Text to synthesize.
        speaker: Built-in voice name used as a fallback lookup into
            _reference_audio_dict when ``audio`` does not resolve.
        audio: Reference-audio selector: digit string -> built-in reference
            list index; existing file path -> used directly; raw ``bytes`` or
            a long (>=100 chars) base64 string -> written to a temp wav file.
        **kwargs: Forwarded to load_models and the vocoder; 'output' may be a
            '.wav' save path or a string starting with 'play' to auto-play.

    Returns:
        The synthesized waveform encoded as bytes (via aukit.anything2bytes).

    Raises:
        AssertionError: If ``audio`` cannot be resolved and ``speaker`` is not
            a known built-in voice.
    """
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)  # lazy one-time model initialization
    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) % len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # BUGFIX: tempfile.TemporaryFile has no usable filesystem name on
        # POSIX (.name is a file-descriptor number), so the path was never
        # readable downstream. NamedTemporaryFile(delete=False) yields a real
        # path; closing flushes the payload to disk before the path is used.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.close()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 100:
        # Long strings are assumed to be base64-encoded wav payloads.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.close()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader, text=text, speaker=speaker, audio=audio, device=_device)
    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)
    # Trim at the first stop-gate frame above 0.2; fall back to the gate
    # argmax, then to full length (np.argmax is 0 when nothing exceeds the
    # threshold, so the `or` chain cascades).
    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(
        out_gate > 0.2) or np.argmax(out_gate) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]
    if _use_waveglow:
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)
    wav_output = wavs.squeeze(0).cpu().numpy()
    output = kwargs.get('output', '')
    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
# 模型测试 with tempfile.TemporaryDirectory() as tmpdir: audio = os.path.join(tmpdir, 'audio_example.wav') pydub.AudioSegment.silent(3000, frame_rate=args.sampling_rate).export( audio, format='wav') text = '这是个试水的例子。' speaker = 'speaker' text_data, style_data, speaker_data, f0_data = transform_mellotron_input_data( dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device) mels, mels_postnet, gates, alignments = mellotron.generate_mel( text_data, style_data, speaker_data, f0_data) wavs = waveglow.generate_wave(mel=mels, **waveglow_kwargs) wav_output = wavs.squeeze().cpu().numpy() aukit.save_wav(wav_output, os.path.join(tmpdir, 'demo_example.wav'), sr=args.sampling_rate) print('Test success done.') # 模型推理 if os.path.isfile(texts_path): text_inputs = [w.strip() for w in open(texts_path, encoding='utf8')] if args.is_simple:
def voice_clone_interface(audio: str, text: str, speaker: str) -> str: denoise.noisy_processing(audio, audio) # 对输入音频进行降噪处理 # for text_input in tqdm(zip(audio_lst, text_lst, speaker_lst), 'TTS', total=len(audio_lst), ncols=100): # for text_input in tqdm(text_inputs, 'TTS', ncols=100): # print('Running: {}'.format(text_input)) # # audio, text, speaker = text_input # .split('\t') # 遍历一个一个 # print("audio的内容:",audio) # '/home/project/zhrtvc/data/samples/aishell3/wav/SSB00110401.wav' # print("text的内容:",text) # '三百零五千三百三十四。' # print("speaker的内容:",speaker) # 'SSB0011' text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data( dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device) mels, mels_postnet, gates, alignments = mellotron.generate_mel( text_data, style_data, speaker_data, f0_data) out_gate = gates.cpu().numpy()[0] end_idx = np.argmax(out_gate > 0.2) or out_gate.shape[0] mels_postnet = mels_postnet[:, :, :end_idx] if _use_waveglow: print("use waveglow:") wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs) else: print("use waveglow:") wavs = _stft.griffin_lim(mels_postnet, n_iters=5) # 保存数据 cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15] cur_time = time.strftime('%Y%m%d-%H%M%S') outpath = os.path.join(output_dir, "demo_{}_{}_out.wav".format(cur_time, cur_text)) # print("outpath的路径:",outpath) wav_output = wavs.squeeze(0).cpu().numpy() aukit.save_wav(wav_output, outpath, sr=args.sampling_rate) # sampling_rate=22050 if isinstance(audio, (Path, str)) and Path(audio).is_file(): # # 原声 # refpath_raw = os.path.join(output_dir, "demo_{}_{}_ref_copy.wav".format(cur_time, cur_text)) # shutil.copyfile(audio, refpath_raw) # 重采样 wav_input, sr = aukit.load_wav(audio, with_sr=True) wav_input = librosa.resample(wav_input, sr, args.sampling_rate) refpath = os.path.join(output_dir, "demo_{}_{}_ref.wav".format(cur_time, cur_text)) aukit.save_wav(wav_input, refpath, sr=args.sampling_rate) # # 
声码器 # wavs_ref = waveglow.generate_wave(mel=mel_data, **waveglow_kwargs) # outpath_ref = os.path.join(output_dir, "demo_{}_{}_ref_waveglow.wav".format(cur_time, cur_text)) # wav_output_ref = wavs_ref.squeeze(0).cpu().numpy() # aukit.save_wav(wav_output_ref, outpath_ref, sr=args.sampling_rate) fig_path = os.path.join(output_dir, "demo_{}_{}_fig.jpg".format(cur_time, cur_text)) plot_mel_alignment_gate_audio( mel=mels_postnet.squeeze(0).cpu().numpy(), alignment=alignments.squeeze(0).cpu().numpy(), gate=gates.squeeze(0).cpu().numpy(), audio=wav_output[::args.sampling_rate // 1000]) plt.savefig(fig_path) plt.close() # 先屏蔽掉信息 # yml_path = os.path.join(output_dir, "demo_{}_{}_info.yml".format(cur_time, cur_text)) # info_dict = locals2dict(locals()) # with open(yml_path, 'wt', encoding='utf8') as fout: # yaml.dump(info_dict, fout, encoding='utf-8', allow_unicode=True) # log_path = os.path.join(output_dir, "info_dict.txt".format(cur_time)) # with open(log_path, 'at', encoding='utf8') as fout: # fout.write('{}\n'.format(json.dumps(info_dict, ensure_ascii=False))) print('Test success done.返回克隆的音频为:', outpath) denoise.noisy_processing(outpath, outpath) # 对输出音频进行降噪处理 return outpath