def run_joint(fpath, sr=_sr, outdir=Path("")):
    """Concatenate short audio clips and their texts into longer joint samples.

    :param fpath: metadata file listing per-speaker (audio path, text) pairs.
    :param sr: target sampling rate.
    :param outdir: output directory for the joint wavs and metadata.csv.
    :return: None
    """
    curdir = Path(fpath).parent
    outdir = Path(outdir)
    outdir.mkdir(exist_ok=True, parents=True)
    with open(outdir.joinpath("metadata.csv"), "wt", encoding="utf8") as fout:
        load_pair = load_pairs(fpath)
        for spk, ptpairs_raw in tqdm(load_pair, desc="speaker", ncols=100):
            gen_pair = choice_pairs(ptpairs_raw, n_choice=100)
            for num, ptpairs_joint in enumerate(tqdm(gen_pair, desc="choice", ncols=100), 1):
                wtpairs_joint = [(aukit.load_wav(p, sr=sr), t) for p, t in ptpairs_joint]
                wav, text = joint_audio_and_text(wtpairs_joint)
                # Mirror the source directory layout, replacing the file name
                # with "<speaker>_<index>.wav".
                parts = list(Path(ptpairs_joint[0][0]).relative_to(curdir).parts)[:-1]
                parts.append("{}_{:06d}.wav".format(spk, num))
                outname = "/".join(parts)
                outpath = outdir.joinpath(outname)
                outpath.parent.mkdir(exist_ok=True, parents=True)
                aukit.save_wav(wav, sr=sr, path=outpath)
                fout.write("{}\t{}\n".format(outname, text))
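# A minimal usage sketch for run_joint (hedged: "data/samples/metadata.csv" and
# "data/joint" are hypothetical paths, and the metadata file must be in whatever
# format load_pairs parses into per-speaker (audio path, text) pairs).
run_joint("data/samples/metadata.csv", sr=16000, outdir=Path("data/joint"))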
def tts_sdk_base(text, speaker='biaobei', audio='24', output='', **kwargs):
    """Function-style SDK interface for speech synthesis.

    text: the text to synthesize.
    speaker: name of a built-in speaker; the available names are the keys of
        _reference_audio_dict (the default speaker list is in
        resource/reference_audio/__init__.py).
    audio: if a digit string, use the corresponding built-in reference audio as
        the speaker reference; if a file path, use that file as the reference.
    output: if it ends with .wav, save the synthesized audio to that path;
        if it starts with 'play', play the audio automatically after synthesis.
    """
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)
        load_audio(**kwargs)

    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) % len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # Raw wav bytes: dump to a named temporary file so downstream code can
        # open it by path (TemporaryFile has no usable file name on POSIX).
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 256:
        # Long strings are treated as base64-encoded wav data.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError('Invalid audio argument: {}'.format(audio))

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader, text=text, speaker=speaker, audio=audio, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    # Cut the mel at the first frame where the gate fires; fall back to the
    # gate argmax, then the full length, if the threshold is never crossed.
    end_idx = (np.argmax(out_gate > kwargs.get('gate_threshold', 0.2))
               or np.argmax(out_gate) or out_gate.shape[0])
    mels_postnet = mels_postnet[:, :, :end_idx]

    vocoder_name = kwargs.get('vocoder', 'waveglow')
    if vocoder_name == 'waveglow':
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=10)

    wav_output = wavs.squeeze(0).cpu().numpy()

    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)

    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
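# A usage sketch for tts_sdk_base, assuming models and reference audio have
# been set up; 'output/hello.wav' is a hypothetical path. The call returns wav
# bytes and, because output ends with .wav, also writes the file to disk.
wav_bytes = tts_sdk_base('你好,世界。', speaker='biaobei', audio='24',
                         output='output/hello.wav')
# Passing output='play' instead plays the audio after synthesis, and a wav file
# path (or raw/base64 wav data) can be given as `audio` to clone that voice.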
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, outdir=Path()):
    """Handles all the validation scoring and printing"""
    save_flag = True
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        # DataLoader forbids combining an explicit sampler with shuffle=True,
        # so only shuffle when no sampler is set.
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=(val_sampler is None), batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.parse_batch(batch)
            # x: (text_padded, input_lengths, mel_padded, max_len,
            #     output_lengths, speaker_ids, f0_padded)
            # y: 2 parts, (mel_padded, gate_padded), e.g.
            #     torch.Size([4, 401, 439]), torch.Size([4, 439])
            y_pred = model(x)
            # y_pred: 4 parts, e.g.
            #     torch.Size([4, 401, 439]), torch.Size([4, 401, 439]),
            #     torch.Size([4, 439]), torch.Size([4, 439, 114])
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments = y_pred
            loss = criterion(y_pred, y)
            if outdir and save_flag:
                curdir = outdir.joinpath('validation', f'{iteration:06d}-{loss.data.cpu().numpy():.4f}')
                curdir.mkdir(exist_ok=True, parents=True)
                plt.imsave(curdir.joinpath('spectrogram_pred.png'), mel_outputs[0].cpu().numpy())
                plt.imsave(curdir.joinpath('spectrogram_true.png'), y[0][0].cpu().numpy())
                plt.imsave(curdir.joinpath('alignment_pred.png'), alignments[0].cpu().numpy().T)
                wav_output = inv_linearspectrogram(mel_outputs[0].cpu().numpy())
                aukit.save_wav(wav_output, curdir.joinpath('griffinlim_pred.wav'), sr=hparams.sampling_rate)
                wav_output = inv_linearspectrogram(y[0][0].cpu().numpy())
                aukit.save_wav(wav_output, curdir.joinpath('griffinlim_true.wav'), sr=hparams.sampling_rate)
                save_flag = False
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    model.train()
    if rank == 0:
        # Report the averaged validation loss, not the last batch's loss.
        print("Validation loss {}: {:9f}".format(iteration, val_loss))
        logger.log_validation(val_loss, model, y, y_pred, iteration, x)
def singing_voice_v2():
    # Singing Voice from Music Score
    data = get_data_from_musicxml('data/sinsy/csongdb_f00002_000_en.musicxml', 132, convert_stress=True)
    panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}

    n_speakers_per_part = 4
    frequency_scaling = 0.4
    n_seconds = 90
    audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)

    for i, (part, v) in enumerate(data.items()):
        rhythm = data[part]['rhythm'].cuda()
        pitch_contour = data[part]['pitch_contour'].cuda()
        text_encoded = data[part]['text_encoded'].cuda()

        for k in range(n_speakers_per_part):
            pan = k  # pan = np.random.randint(panning[part][0], panning[part][1])
            if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
                speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
            else:
                speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
            print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

            with torch.no_grad():
                # `mel` is the style reference mel-spectrogram, assumed to be
                # defined at module level by the surrounding script.
                mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * frequency_scaling, rhythm))

            plot_mel_f0_alignment(mel_outputs_postnet.data.cpu().numpy()[0],
                                  mel_outputs_postnet.data.cpu().numpy()[0],
                                  pitch_contour.data.cpu().numpy()[0, 0],
                                  rhythm.data.cpu().numpy()[:, 0].T)
            plt.show()

            out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

            # Vocode with MelGAN, then save and play the result.
            t0 = time.time()
            out_wav = infer_waveform_melgan(out_mel)
            print(time.time() - t0)
            aukit.save_wav(out_wav, "logs/musicxml_melgan_{}.wav".format(time.strftime("%Y%m%d-%H%M%S")), sr=22050)
            aukit.play_audio(out_wav, sr=22050)

            # Vocode with WaveGlow, denoise, pan, and mix into the stereo track.
            t0 = time.time()
            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            print(time.time() - t0)
            audio_stereo[:audio.shape[0]] += audio
            write("logs/{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)

            out_wav = audio
            aukit.play_audio(out_wav, sr=22050)
def tts_sdk(text, speaker='biaobei', audio='0', **kwargs):
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)

    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) % len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # Raw wav bytes: dump to a named temporary file so it can be opened by
        # path (TemporaryFile has no usable file name on POSIX).
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 100:
        # Long strings are treated as base64-encoded wav data.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError('Invalid audio argument: {}'.format(audio))

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader, text=text, speaker=speaker, audio=audio, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > 0.2) or np.argmax(out_gate) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]

    if _use_waveglow:
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    wav_output = wavs.squeeze(0).cpu().numpy()

    output = kwargs.get('output', '')
    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)

    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
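# A standalone sketch of the gate-trimming idiom used above: np.argmax returns
# 0 (falsy) when no element crosses the threshold, so each `or` falls through
# to the next candidate end index. Note the edge case: if the very first frame
# fires, argmax is also 0 and the fallbacks take over.
import numpy as np

gate = np.array([0.0, 0.05, 0.1, 0.6, 0.9])
end_idx = np.argmax(gate > 0.2) or np.argmax(gate) or gate.shape[0]
assert end_idx == 3  # first frame whose gate value exceeds 0.2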
def change_speed_one(kwargs: dict):
    inpath = kwargs.get("inpath")
    outpath = kwargs.get("outpath")
    rate = kwargs.get("rate")
    # Skip files that already have a non-trivial output.
    if Path(outpath).exists() and os.path.getsize(outpath) > 8000:
        return
    Path(outpath).parent.mkdir(exist_ok=True, parents=True)

    # Scaling the hop size changes the effective playback speed when the mel is
    # re-synthesized with MelGAN.
    hp = Dict2Obj()
    hp.update(melgan_hparams)
    hp.update({"hop_size": int(melgan_hparams["hop_size"] * rate)})

    try:
        wav = aukit.load_wav(inpath, sr=_sr)
        mel = wav2mel(wav, hparams=hp)
        out = infer_waveform_melgan(mel, load_path=_melgan_load_path)
        aukit.save_wav(out, outpath, sr=_sr)
    except Exception as e:
        print(e)
        print(kwargs)
    return kwargs
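# A usage sketch for change_speed_one with hypothetical paths; rate > 1 speeds
# the audio up, rate < 1 slows it down. The single-dict signature also suits
# multiprocessing.Pool.map for batch conversion.
tasks = [dict(inpath="data/in/a.wav", outpath="data/out/a.wav", rate=1.2)]
for task in tasks:
    change_speed_one(task)
# from multiprocessing import Pool
# with Pool(4) as pool:
#     pool.map(change_speed_one, tasks)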
## Generating the waveform
# print("Synthesizing the waveform ...")
wav_outputs = msyner.stft.griffin_lim(torch.from_numpy(spec[None]), n_iters=5)
wav_output = wav_outputs[0].cpu().numpy()
# print("Waveform shape: {}".format(wav_output.shape))

# Save it on the disk
cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15]
cur_time = time.strftime('%Y%m%d-%H%M%S')
out_path = out_dir.joinpath("demo_{}_{}_out.wav".format(cur_time, cur_text))
aukit.save_wav(wav_output, out_path, sr=msyner.stft.sampling_rate)  # save

if isinstance(audio, (Path, str)) and Path(audio).is_file():
    ref_path = out_dir.joinpath("demo_{}_{}_ref.wav".format(cur_time, cur_text))
    shutil.copyfile(audio, ref_path)

fig_path = out_dir.joinpath("demo_{}_{}_fig.jpg".format(cur_time, cur_text))
# Subsample the waveform to roughly 1000 samples per second for plotting.
plot_mel_alignment_gate_audio(spec, align, gate,
                              wav_output[::msyner.stft.sampling_rate // 1000])
plt.savefig(fig_path)
plt.close()

yml_path = out_dir.joinpath("demo_{}_{}_info.yml".format(cur_time, cur_text))
speaker = 'speaker'
# transform_mellotron_input_data returns five values at the other call sites
# in this codebase, so unpack mel_data here as well.
text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
    dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device)

mels, mels_postnet, gates, alignments = mellotron.generate_mel(
    text_data, style_data, speaker_data, f0_data)

wavs = waveglow.generate_wave(mel=mels, **waveglow_kwargs)

wav_output = wavs.squeeze().cpu().numpy()
aukit.save_wav(wav_output, os.path.join(tmpdir, 'demo_example.wav'), sr=args.sampling_rate)
print('Test success done.')

# Model inference
if os.path.isfile(texts_path):
    text_inputs = [w.strip() for w in open(texts_path, encoding='utf8')]
    if args.is_simple:
        text_inputs = np.random.choice(text_inputs, min(len(text_inputs), 10), replace=False)
else:
    text_inputs = [texts_path]
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, outdir=Path(), hparams=None):
    """Handles all the validation scoring and printing"""
    save_flag = True
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        # DataLoader forbids combining an explicit sampler with shuffle=True,
        # so only shuffle when no sampler is set.
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=(val_sampler is None), batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(tqdm(val_loader, 'validate', ncols=100)):
            x, y = model.parse_batch(batch)
            # x: (text_padded, input_lengths, mel_padded, max_len,
            #     output_lengths, speaker_ids, f0_padded)
            # y: 2 parts, (mel_padded, gate_padded), e.g.
            #     torch.Size([4, 401, 439]), torch.Size([4, 439])
            y_pred = model(x)
            # y_pred: 4 parts, e.g.
            #     torch.Size([4, 401, 439]), torch.Size([4, 401, 439]),
            #     torch.Size([4, 439]), torch.Size([4, 439, 114])
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments = y_pred
            loss = criterion(y_pred, y)
            if outdir and save_flag:
                curdir = outdir.joinpath('validation', f'{iteration:06d}-{loss.data.cpu().numpy():.4f}')
                curdir.mkdir(exist_ok=True, parents=True)

                idx = random.randint(0, alignments.size(0) - 1)

                # Trim the predicted mel at the first frame where the gate fires,
                # then vocode it with Griffin-Lim.
                gate_output = gate_outputs[idx].data.cpu().numpy()
                end_idx = np.argmax(gate_output > 0.5) or gate_output.shape[0]
                mel = mel_outputs_postnet[idx][:, :end_idx].unsqueeze(0)
                wav_outputs = valset.stft.griffin_lim(mel)
                wav_output = wav_outputs[0].cpu().numpy()
                aukit.save_wav(wav_output, curdir.joinpath('griffinlim_pred.wav'), sr=hparams.sampling_rate)

                # Do the same for the ground-truth mel.
                mel_targets, gate_targets = y
                gate_target = gate_targets[idx].data.cpu().numpy()
                end_idx = np.argmax(gate_target > 0.5) or gate_target.shape[0]
                mel = mel_targets[idx][:, :end_idx].unsqueeze(0)
                wav_inputs = valset.stft.griffin_lim(mel)
                wav_input = wav_inputs[0].cpu().numpy()
                aukit.save_wav(wav_input, curdir.joinpath('griffinlim_true.wav'), sr=hparams.sampling_rate)

                plot_mel_alignment_gate_audio(target=mel_targets[idx].cpu().numpy(),
                                              mel=mel_outputs[idx].cpu().numpy(),
                                              alignment=alignments[idx].cpu().numpy().T,
                                              gate=torch.sigmoid(gate_outputs[idx]).cpu().numpy(),
                                              audio=wav_output[::hparams.sampling_rate // 100])
                plt.savefig(curdir.joinpath('figure.png'))
                plt.close()

                save_flag = False
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    model.train()
    if rank == 0:
        # Report the averaged validation loss, not the last batch's loss.
        print("Validation loss {}: {:9f}".format(iteration, val_loss))
        logger.log_validation(val_loss, model, y, y_pred, iteration, x)
print("Spectrogram shape: {}".format(spec.shape)) print("Alignment shape: {}".format(align.shape)) ## Generating the waveform print("Synthesizing the waveform ...") wav = melgan_inference(model=melgan_model, spec=spec) wav_tf = aukit.inv_linear_spectrogram(spec) print("Waveform shape: {}".format(wav.shape)) # Save it on the disk cur_time = time.strftime('%Y%m%d_%H%M%S') fpath = args.out_dir.joinpath( "demo_out_{}_melgan.wav".format(cur_time)) outpath = fpath # librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate) aukit.save_wav(wav, fpath, sr=16000) # save fpath = args.out_dir.joinpath( "demo_out_{}_griffinlim.wav".format(cur_time)) aukit.save_wav(wav, fpath, sr=16000) # save fpath = args.out_dir.joinpath( "demo_out_{}_spectrogram.jpg".format(cur_time)) # plt.imsave(fpath, spec) plt.pcolor(spec) plt.colorbar() plt.savefig(fpath) plt.close() fpath = args.out_dir.joinpath( "demo_out_{}_alignment.jpg".format(cur_time))
print("Creating the spectrogram ...") spec = msyner.synthesize(text=text, speaker=speaker) # spec, align = synthesize_one(text, speaker=speaker, with_alignment=True, # hparams=_hparams, encoder_fpath=args.encoder_model_fpath) print("Spectrogram shape: {}".format(spec.shape)) # print("Alignment shape: {}".format(align.shape)) ## Generating the waveform print("Synthesizing the waveform ...") wav = griffinlim_vocoder(spec) print("Waveform shape: {}".format(wav.shape)) # Save it on the disk cur_time = time.strftime('%Y%m%d_%H%M%S') fpath = args.out_dir.joinpath("demo_out_{}.wav".format(cur_time)) # librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate) aukit.save_wav(wav, fpath, sr=_hparams.sampling_rate) # save txt_path = args.out_dir.joinpath("info_dict.txt".format(cur_time)) with open(txt_path, 'at', encoding='utf8') as fout: dt = dict(text=text, audio_path=str(fpath), speaker=speaker, time=cur_time) out = json.dumps(dt, ensure_ascii=False) fout.write('{}\n'.format(out)) num_generated += 1 print("\nSaved output as %s\n\n" % fpath) if args.play: aukit.play_audio(fpath, sr=_hparams.sampling_rate) except Exception as e: print("Caught exception: %s" % repr(e)) print("Restarting\n") traceback.print_exc()
def remove_noise_audio(inpath, outpath):
    """Denoise an audio file."""
    import aukit
    wav = aukit.load_wav(inpath, sr=16000)
    out = aukit.remove_noise(wav, sr=16000)
    aukit.save_wav(out, outpath, sr=16000)
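# A batch-denoising sketch over a hypothetical directory of wav files; inputs
# are resampled to 16 kHz on load by remove_noise_audio itself.
out_dir = Path("data/denoised")
out_dir.mkdir(exist_ok=True, parents=True)
for wav_path in Path("data/noisy").glob("*.wav"):
    remove_noise_audio(str(wav_path), str(out_dir / wav_path.name))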
def voice_clone_interface(audio: str, text: str, speaker: str) -> str:
    denoise.noisy_processing(audio, audio)  # denoise the input audio in place

    # Example inputs (from the batch loop this function was extracted from):
    # audio: '/home/project/zhrtvc/data/samples/aishell3/wav/SSB00110401.wav'
    # text: '三百零五千三百三十四。'
    # speaker: 'SSB0011'
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > 0.2) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]

    if _use_waveglow:
        print("use waveglow:")
        wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
    else:
        print("use griffin-lim:")
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    # Save outputs.
    cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15]
    cur_time = time.strftime('%Y%m%d-%H%M%S')
    outpath = os.path.join(output_dir, "demo_{}_{}_out.wav".format(cur_time, cur_text))
    wav_output = wavs.squeeze(0).cpu().numpy()
    aukit.save_wav(wav_output, outpath, sr=args.sampling_rate)  # sampling_rate=22050

    if isinstance(audio, (Path, str)) and Path(audio).is_file():
        # Resample the reference audio and save a copy next to the output.
        wav_input, sr = aukit.load_wav(audio, with_sr=True)
        wav_input = librosa.resample(wav_input, orig_sr=sr, target_sr=args.sampling_rate)
        refpath = os.path.join(output_dir, "demo_{}_{}_ref.wav".format(cur_time, cur_text))
        aukit.save_wav(wav_input, refpath, sr=args.sampling_rate)

        # # Vocode the reference mel as well:
        # wavs_ref = waveglow.generate_wave(mel=mel_data, **waveglow_kwargs)
        # outpath_ref = os.path.join(output_dir, "demo_{}_{}_ref_waveglow.wav".format(cur_time, cur_text))
        # wav_output_ref = wavs_ref.squeeze(0).cpu().numpy()
        # aukit.save_wav(wav_output_ref, outpath_ref, sr=args.sampling_rate)

    fig_path = os.path.join(output_dir, "demo_{}_{}_fig.jpg".format(cur_time, cur_text))
    plot_mel_alignment_gate_audio(
        mel=mels_postnet.squeeze(0).cpu().numpy(),
        alignment=alignments.squeeze(0).cpu().numpy(),
        gate=gates.squeeze(0).cpu().numpy(),
        audio=wav_output[::args.sampling_rate // 1000])
    plt.savefig(fig_path)
    plt.close()

    # Info dump disabled for now:
    # yml_path = os.path.join(output_dir, "demo_{}_{}_info.yml".format(cur_time, cur_text))
    # info_dict = locals2dict(locals())
    # with open(yml_path, 'wt', encoding='utf8') as fout:
    #     yaml.dump(info_dict, fout, encoding='utf-8', allow_unicode=True)
    # log_path = os.path.join(output_dir, "info_dict.txt")
    # with open(log_path, 'at', encoding='utf8') as fout:
    #     fout.write('{}\n'.format(json.dumps(info_dict, ensure_ascii=False)))

    print('Test success done. Cloned audio saved to:', outpath)
    denoise.noisy_processing(outpath, outpath)  # denoise the output audio in place
    return outpath
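# A usage sketch for voice_clone_interface; the reference wav and speaker id
# below are the sample values from the comments above and may not exist locally.
cloned_path = voice_clone_interface(
    audio='/home/project/zhrtvc/data/samples/aishell3/wav/SSB00110401.wav',
    text='三百零五千三百三十四。',
    speaker='SSB0011')
print('cloned:', cloned_path)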