def synthesize1(filename, bpm=80, speaker_id=14, outname="sample.wav"):
    """Synthesize a single singing voice from a MusicXML score and write it to a WAV file.

    Takes the first part found in the score, runs it through the Tacotron-style
    model with a flat (no-attention) alignment, vocodes with WaveGlow, denoises,
    pans, and writes the result.

    Args:
        filename: Path to the MusicXML score.
        bpm: Tempo passed to ``get_data_from_musicxml``.
        speaker_id: Integer speaker identity for the multi-speaker model.
        outname: Output WAV path.

    NOTE(review): relies on module-level ``model``, ``get_data_from_musicxml``,
    ``panner`` and ``write`` being defined elsewhere in this file.
    """
    tacotron, waveglow, denoiser = model
    data = get_data_from_musicxml(filename, bpm)
    sampling_rate = 22050
    frequency_scaling = 0.4  # scales the extracted pitch contour before inference

    # Use the first (and presumably only) part in the score.
    data_v = list(data.values())[0]
    rhythm = data_v['rhythm'].cpu()
    pitch_contour = data_v['pitch_contour'].cpu()
    text_encoded = data_v['text_encoded'].cpu()
    speaker_id = torch.LongTensor([speaker_id]).cpu()

    with torch.no_grad():
        # Placeholder for the mel/style slot of inference_noattention.
        # Original comment said "Seems to be a number from 0 to 10" — semantics
        # unconfirmed; 0 is kept as the original value.
        mel_placeholder = 0
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = tacotron.inference_noattention(
            (text_encoded, mel_placeholder, speaker_id, pitch_contour * frequency_scaling, rhythm))
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.66), 0.01)[0, 0]
        audio = audio.cpu().numpy()
        pan = 0  # centered; panner presumably returns a stereo signal
        audio = panner(audio, pan)
        # NOTE(review): the original also accumulated ``audio`` into an unused
        # 90-second stereo buffer that was never written out; that dead code
        # has been removed — the file is written directly from ``audio``.
        write(outname, sampling_rate, audio)
def singing_voice_v2():
    """Synthesize a multi-part choir ("Singing Voice from Music Score").

    For each part in the MusicXML score, renders ``n_speakers_per_part`` voices,
    alternating between two vocoders (MelGAN and WaveGlow), and writes one WAV
    per voice under ``logs/``.

    NOTE(review): depends on many module-level names defined elsewhere in this
    file: ``get_data_from_musicxml``, ``hparams``, ``female_speakers``,
    ``male_speakers`` (iterators), ``mellotron``, ``mel``, ``waveglow``,
    ``denoiser``, ``panner``, ``write``, ``plot_mel_f0_alignment``,
    ``infer_waveform_melgan`` and ``aukit``.
    """
    # Singing Voice from Music Score
    data = get_data_from_musicxml('data/sinsy/csongdb_f00002_000_en.musicxml', 132, convert_stress=True)
    # Stereo pan ranges per choir part (degrees); currently unused — pan is
    # taken from the speaker index k instead (see commented line below).
    panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
    n_speakers_per_part = 4
    frequency_scaling = 0.4  # scales the pitch contour before inference
    n_seconds = 90
    # Mix-down buffer for all voices.
    # NOTE(review): audio_stereo is accumulated into but never written out or
    # returned in this function — looks like dead code; confirm before removing.
    audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)
    for i, (part, v) in enumerate(data.items()):
        rhythm = data[part]['rhythm'].cuda()
        pitch_contour = data[part]['pitch_contour'].cuda()
        text_encoded = data[part]['text_encoded'].cuda()
        for k in range(n_speakers_per_part):
            pan = k
            # pan = np.random.randint(panning[part][0], panning[part][1])
            # Pick a female voice for Soprano/Alto parts, male otherwise.
            if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
                speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
            else:
                speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
            print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))
            with torch.no_grad():
                # NOTE(review): ``mel`` is not defined in this function — it is
                # presumably a module-level reference mel-spectrogram (the same
                # slot synthesize1 fills with a placeholder). Verify it exists
                # at call time, otherwise this raises NameError.
                mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * frequency_scaling, rhythm))
                # NOTE(review): the first two arguments are both the postnet
                # mel — the first was probably meant to be the pre-postnet
                # ``mel_outputs``; confirm against plot_mel_f0_alignment.
                plot_mel_f0_alignment(mel_outputs_postnet.data.cpu().numpy()[0],
                                      mel_outputs_postnet.data.cpu().numpy()[0],
                                      pitch_contour.data.cpu().numpy()[0, 0],
                                      rhythm.data.cpu().numpy()[:, 0].T)
                plt.show()
                # --- Vocoder 1: MelGAN ---
                out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
                t0 = time.time()
                # wav = aukit.inv_mel_spectrogram()
                out_wav = infer_waveform_melgan(out_mel)
                print(time.time() - t0)  # MelGAN inference time
                aukit.save_wav(out_wav, "logs/musicxml_melgan_{}.wav".format(time.strftime("%Y%m%d-%H%M%S")), sr=22050)
                aukit.play_audio(out_wav, sr=22050)
                # --- Vocoder 2: WaveGlow + denoiser ---
                t0 = time.time()
                audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
                audio = audio.cpu().numpy()
                audio = panner(audio, pan)  # mono -> panned stereo
                print(time.time() - t0)  # WaveGlow inference time
                audio_stereo[:audio.shape[0]] += audio
                write("logs/{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)
                out_wav = audio
                aukit.play_audio(out_wav,
sr=22050)
# Top-level script fragment: write out the current mel prediction via
# Griffin-Lim and via WaveGlow, plus the raw mel as .npy.
# NOTE(review): relies on names defined earlier in the file / outer scope:
# GLconfig (a path at this point), mel_outputs_postnet, iteration, timeStr,
# mel_to_wav, sf, waveglow, denoiser, hparams.
with open(GLconfig) as f:
    # Rebinds GLconfig from a file path to the parsed config dict.
    # NOTE(review): yaml.Loader executes arbitrary tags — fine for a trusted
    # local config, but unsafe on untrusted input (prefer yaml.safe_load).
    GLconfig = yaml.load(f, Loader=yaml.Loader)
# First mel spectrogram in the batch, as (n_mels, frames).
spec0 = mel_outputs_postnet.cpu().detach().numpy()[0,:,:]
wavGL = mel_to_wav(spec0.T,GLconfig)  # Griffin-Lim reconstruction
# GL-write audio
filename = 'iteration-gl-'+str(iteration)+timeStr+'.wav'
# NOTE(review): sample rate hard-coded to 16000 here, but the plot below uses
# hparams.sampling_rate — confirm these agree.
sf.write(filename, wavGL,16000)
# mel save
from feats.TTS_Feat import np_save
mel_filename = 'iteration-mel-'+str(iteration)+timeStr+'.npy'
np_save(mel_filename,spec0)
# waveglow
# audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)
if 1:
    # NOTE(review): ``data`` is loaded but never used below — dead code?
    data = get_data_from_musicxml('data/haendel_hallelujah.musicxml', 132, convert_stress=True)
    # print('data=',data)
    # print('data.shape=',data.shape)
    part = 'Soprano'
    # WaveGlow vocoding of the same mel, followed by denoising.
    audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
    audio = audio.cpu().numpy()
    # panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
    # pan = np.random.randint(panning[part][0], panning[part][1])
    # audio = panner(audio, pan)
    # audio_stereo[:audio.shape[0]] += audio
    # audio_stereo = audio_stereo / np.max(np.abs(audio_stereo))
    filename = 'iteration-waveglow-'+str(iteration)+timeStr+'.wav'
    sf.write(filename, audio, 16000)
    # plot
    # Time axis (seconds) for plotting the waveform.
    t0 = np.linspace(0,len(audio)-1,len(audio))/hparams.sampling_rate