Example #1
import numpy as np
import torch
from scipy.io.wavfile import write


def synthesize1(filename, bpm=80, speaker_id=14, outname="sample.wav"):
    # `model` (a (tacotron, waveglow, denoiser) tuple), `panner`, and
    # `get_data_from_musicxml` are assumed to be defined in the enclosing scope.
    tacotron, waveglow, denoiser = model
    data = get_data_from_musicxml(filename, bpm)

    sampling_rate = 22050
    frequency_scaling = 0.4
    n_seconds = 90
    audio_stereo = np.zeros((sampling_rate * n_seconds, 2), dtype=np.float32)

    data_v = list(data.values())[0]

    rhythm = data_v['rhythm'].cpu()
    pitch_contour = data_v['pitch_contour'].cpu()
    text_encoded = data_v['text_encoded'].cpu()

    speaker_id = torch.LongTensor([speaker_id]).cpu()

    with torch.no_grad():
        # The second element of the input tuple is the style input; an integer
        # here selects one of the model's GST style tokens (the original
        # comment suggests valid values are roughly 0 to 10).
        style_input = 0
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = tacotron.inference_noattention(
            (text_encoded, style_input, speaker_id,
             pitch_contour * frequency_scaling, rhythm))
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.66),
                         0.01)[0, 0]
        audio = audio.cpu().numpy()
        pan = 0
        audio = panner(audio, pan)
        audio_stereo[:audio.shape[0]] += audio  # accumulated mix (unused below)
        write(outname, sampling_rate, audio)
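Examples #1 and #2 both call a `panner` helper that is not shown. A minimal constant-power stereo panner in the spirit of Mellotron's demo notebook, assuming `angle` is given in degrees (negative = left, positive = right):

import numpy as np

def panner(signal, angle):
    # constant-power pan: derive left/right gains from the pan angle
    angle = np.radians(angle)
    left = np.sqrt(2) / 2.0 * (np.cos(angle) - np.sin(angle)) * signal
    right = np.sqrt(2) / 2.0 * (np.cos(angle) + np.sin(angle)) * signal
    # stack into an (n_samples, 2) stereo array, as the examples expect
    return np.dstack((left, right))[0]

With `panner` defined, Example #1 can be driven with, e.g., `synthesize1('data/haendel_hallelujah.musicxml', bpm=132)`.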
Example #2
def singing_voice_v2():
    # Singing voice from a music score: render each part of a MusicXML file at 132 BPM.
    # `get_data_from_musicxml`, `mellotron`, `waveglow`, `denoiser`, `hparams`,
    # `panner`, and the speaker iterators are assumed to be defined elsewhere.
    data = get_data_from_musicxml('data/sinsy/csongdb_f00002_000_en.musicxml', 132, convert_stress=True)
    panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
    n_speakers_per_part = 4
    frequency_scaling = 0.4
    n_seconds = 90
    audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)
    for part, v in data.items():
        rhythm = v['rhythm'].cuda()
        pitch_contour = v['pitch_contour'].cuda()
        text_encoded = v['text_encoded'].cuda()

        for k in range(n_speakers_per_part):
            pan = k  # fixed near-center pan per take; the random per-part pan is commented out below
            # pan = np.random.randint(panning[part][0], panning[part][1])
            # `female_speakers` / `male_speakers` are assumed to be cycling
            # iterators over Mellotron speaker IDs (see the sketch after this example)
            if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
                speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
            else:
                speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
            print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

            with torch.no_grad():
                # `mel` is the GST style input (a reference mel spectrogram or an
                # integer style-token index) and must come from the enclosing scope.
                mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * frequency_scaling, rhythm))

            # The output mel is plotted twice because no input/reference mel is
            # available in this snippet.
            plot_mel_f0_alignment(mel_outputs_postnet.data.cpu().numpy()[0],
                                  mel_outputs_postnet.data.cpu().numpy()[0],
                                  pitch_contour.data.cpu().numpy()[0, 0],
                                  rhythm.data.cpu().numpy()[:, 0].T)
            plt.show()

            # vocode the mel with MelGAN and time it
            out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
            t0 = time.time()
            # alternative: wav = aukit.inv_mel_spectrogram(out_mel)
            out_wav = infer_waveform_melgan(out_mel)
            print("MelGAN vocoding: {:.2f}s".format(time.time() - t0))

            aukit.save_wav(out_wav, "logs/musicxml_melgan_{}.wav".format(time.strftime("%Y%m%d-%H%M%S")), sr=22050)
            aukit.play_audio(out_wav, sr=22050)

            # vocode the same mel with WaveGlow + denoiser and time it
            t0 = time.time()
            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            print("WaveGlow vocoding: {:.2f}s".format(time.time() - t0))

            audio_stereo[:audio.shape[0]] += audio
            write("logs/{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)
            out_wav = audio

            aukit.play_audio(out_wav, sr=22050)
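`female_speakers` and `male_speakers` above are not defined in the snippet (and `infer_waveform_melgan` is likewise assumed to wrap a trained MelGAN vocoder). A minimal sketch using `itertools.cycle` with placeholder IDs; the real IDs depend on the speaker table the Mellotron checkpoint was trained with:

from itertools import cycle

# hypothetical speaker IDs; substitute IDs from your model's speaker list
female_speakers = cycle([14, 21, 38, 57])
male_speakers = cycle([3, 9, 17, 28])

Cycling lets the `n_speakers_per_part` successive takes of the same part get different voices.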
Example #3
        # (fragment) assumes: import yaml, numpy as np, soundfile as sf;
        # `GLconfig`, `mel_outputs_postnet`, `iteration`, `timeStr` defined above
        with open(GLconfig) as f:
            GLconfig = yaml.load(f, Loader=yaml.Loader)
        # Griffin-Lim reconstruction from the predicted mel spectrogram
        spec0 = mel_outputs_postnet.cpu().detach().numpy()[0, :, :]
        wavGL = mel_to_wav(spec0.T, GLconfig)
        # write the Griffin-Lim audio
        filename = 'iteration-gl-' + str(iteration) + timeStr + '.wav'
        sf.write(filename, wavGL, 16000)
    # save the mel spectrogram
    from feats.TTS_Feat import np_save
    mel_filename = 'iteration-mel-' + str(iteration) + timeStr + '.npy'
    np_save(mel_filename, spec0)

    # WaveGlow vocoding
    # audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)
    if True:
        data = get_data_from_musicxml('data/haendel_hallelujah.musicxml', 132, convert_stress=True)
        # print('data=',data)
        # print('data.shape=',data.shape)
        part = 'Soprano'
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
        audio = audio.cpu().numpy()
        # panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
        # pan = np.random.randint(panning[part][0], panning[part][1])
        # audio = panner(audio, pan)
        # audio_stereo[:audio.shape[0]] += audio            
        # audio_stereo = audio_stereo / np.max(np.abs(audio_stereo))
        filename = 'iteration-waveglow-' + str(iteration) + timeStr + '.wav'
        sf.write(filename, audio, 16000)

    # plot: time axis in seconds for the generated audio
    t0 = np.linspace(0, len(audio) - 1, len(audio)) / hparams.sampling_rate
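`mel_to_wav` (the Griffin-Lim inverter configured by `GLconfig`) is not shown. One way to sketch it is with librosa's built-in Griffin-Lim mel inversion; the config keys below are hypothetical stand-ins for whatever the YAML actually contains, and a Tacotron-style log mel would need to be exponentiated back to linear magnitude first:

import librosa

def mel_to_wav(mel, config):
    # `mel` arrives as (n_frames, n_mels); librosa expects (n_mels, n_frames)
    return librosa.feature.inverse.mel_to_audio(
        mel.T,
        sr=config.get('sample_rate', 16000),
        n_fft=config.get('n_fft', 1024),
        hop_length=config.get('hop_length', 256),
        n_iter=config.get('gl_iters', 32))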