Example #1
def pre_process_batch(self, batch):
    # Lengths of the raw waveforms before padding.
    len_batch = np.asarray([len(item) for item in batch], dtype=np.int32)
    max_len = np.max(len_batch)
    # Zero-pad every waveform on the right up to the longest item in the batch.
    wav_batch = np.asarray([
        np.pad(item, (0, max_len - item_len),
               mode='constant',
               constant_values=0.)
        for item, item_len in zip(batch, len_batch)
    ])
    # Add a trailing channel axis: (batch, time) -> (batch, time, 1).
    wav_batch = np.expand_dims(wav_batch, axis=-1)
    # Mu-law companding followed by quantization to discrete levels.
    wav_batch = audio.quantize(audio.miu_law(wav_batch))
    return wav_batch, len_batch
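
A minimal standalone sketch of the padding step above, assuming plain NumPy (the `audio` module is omitted and all names here are illustrative):

import numpy as np

batch = [np.ones(5, dtype=np.float32), np.ones(3, dtype=np.float32)]
lens = np.asarray([len(x) for x in batch], dtype=np.int32)
# Right-pad each waveform with zeros up to the longest item in the batch.
padded = np.asarray([np.pad(x, (0, lens.max() - n)) for x, n in zip(batch, lens)])
print(padded.shape)  # (2, 5) -- the second row is [1, 1, 1, 0, 0]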
Example #2
    def __getitem__(self, index):
        entry = self.metadata[index]
        # Each metadata entry holds file paths: entry[2] -> features, entry[1] -> waveform.
        m = np.load(entry[2].strip())
        wav = np.load(entry[1].strip())

        if hp.input_type == 'raw' or hp.input_type == 'mixture':
            wav = wav.astype(np.float32)
        elif hp.input_type == 'mulaw':
            # np.int was removed in NumPy 1.24; use an explicit integer dtype.
            wav = mulaw_quantize(wav, hp.mulaw_quantize_channels).astype(np.int64)
        elif hp.input_type == 'bits':
            wav = quantize(wav).astype(np.int64)
        else:
            raise ValueError("hp.input_type {} not recognized".format(hp.input_type))
        return m, wav
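
The repository's `mulaw_quantize` is not shown in this snippet; as a reference, a minimal sketch of the standard mu-law quantization formula (an assumption about its behavior, for input in [-1, 1]) could look like this:

import numpy as np

def mulaw_quantize_sketch(x, channels=256):
    # Mu-law companding compresses the dynamic range, then maps to integer bins.
    mu = channels - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)  # y in [-1, 1]
    return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)  # ints in [0, mu]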
Example #3
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pyworld
import tensorflow as tf

# get_features, pitch_conversion, quantize, mfe2sp, preemph_transform,
# SAMPLE_RATE and FRAME_PERIOD come from the surrounding repository.


def convert_voice(model, wav_s, wav_t, emb_s, emb_t):
    """Arguments:
    model - ACVAE model
    wav_s - source voice waveform
    wav_t - target voice waveform
    emb_s - source speaker embedding (e.g. from a DeepSpeakerModel)
    emb_t - target speaker embedding
    Returns:
    waveform with the words of the source voice spoken in the target voice
    """
    pic_dir = "../figure/"
    os.makedirs(pic_dir, exist_ok=True)
    # Extract normalized features plus WORLD parameters
    # (spectral envelope sp, fundamental frequency f0, aperiodicity ap).
    feat_t, mean_t, std_t, sp_t, f0_t, ap_t = get_features(wav_t,
                                                           training=False)
    feat_s, mean_s, std_s, sp_s, f0_s, ap_s = get_features(wav_s,
                                                           training=False)

    # Log-F0 statistics; np.ma.log masks unvoiced (zero-valued) frames.
    logf0s_mean_s = np.mean(np.ma.log(f0_s))
    logf0s_std_s = np.std(np.ma.log(f0_s))
    logf0s_mean_t = np.mean(np.ma.log(f0_t))
    logf0s_std_t = np.std(np.ma.log(f0_t))

    # Map the source F0 contour onto the target speaker's log-F0 distribution
    # (see the sketch after this example).
    f0_converted = pitch_conversion(f0=f0_s,
                                    mean_log_src=logf0s_mean_s,
                                    std_log_src=logf0s_std_s,
                                    mean_log_target=logf0s_mean_t,
                                    std_log_target=logf0s_std_t)

    # Encode the source features, conditioned on the source speaker embedding.
    mu_enc, logvar_enc = model.encoder([
        np.expand_dims(feat_s.astype(np.float32), [0, -1]),
        emb_s.astype(np.float32)
    ])
    z_enc = model.reparameterize(mu_enc, logvar_enc)
    # Quantize the source F0 contour for decoder conditioning.
    norm_f0 = quantize(f0_s)

    # Decode the latent twice: with the target embedding (conversion)
    # and with the source embedding (reconstruction sanity check).
    nmfe_converted = model.decoder([
        z_enc,
        tf.reshape(emb_t, (1, -1)),
        np.expand_dims(norm_f0.astype(np.float32), [0, 1])
    ])
    nmfe_recon = model.decoder([
        z_enc,
        tf.reshape(emb_s, (1, -1)),
        np.expand_dims(norm_f0.astype(np.float32), [0, 1])
    ])
    nmfe_converted = np.squeeze(nmfe_converted.numpy())
    nmfe_recon = np.squeeze(nmfe_recon.numpy())

    # Undo normalization and the log transform to get mel-filterbank energies.
    mfe_converted = np.exp(nmfe_converted.T * std_t + mean_t)
    mfe_recon = np.exp(nmfe_recon.T * std_s + mean_s)

    # Diagnostic plots: converted, reconstructed, and original normalized features.
    plt.imshow(nmfe_converted[:100])
    plt.colorbar()
    plt.savefig(pic_dir + "convert.png")
    plt.close("all")
    plt.imshow(nmfe_recon[:100])
    plt.colorbar()
    plt.savefig(pic_dir + "recon.png")
    plt.close("all")
    plt.imshow(feat_s[:100])
    plt.colorbar()
    plt.savefig(pic_dir + "orig.png")
    plt.close("all")

    # Convert (denormalized) mel-filterbank energies back to spectral envelopes.
    sp_converted_s = mfe2sp(mfe_recon)
    sp_converted_t = mfe2sp(mfe_converted)

    plt.imshow(sp_converted_s)
    plt.colorbar()
    plt.savefig(pic_dir + "sp_recovered.png")
    plt.close("all")
    plt.imshow(sp_converted_t)
    plt.colorbar()
    plt.savefig(pic_dir + "sp_converted.png")
    plt.close("all")

    # Spectral gain: per-bin ratio of converted to reconstructed envelopes,
    # applied to the source's original spectral envelope.
    factor = np.divide(sp_converted_t, sp_converted_s)
    sp_gained = np.multiply(sp_s[:len(factor)], factor[:len(sp_s)])
    # Optional: clamp overly large peaks (disabled):
    # sp_gained = np.minimum(sp_gained, sp_s.max(axis=1, keepdims=True)[:len(sp_gained)])

    # Frame-level comparison of mel-filterbank energies (frame 50).
    plt.plot(mfe_recon[50], color="green")
    plt.plot(mfe_converted[50], color="red")
    plt.plot(feat_s[50], color="black")
    plt.savefig(pic_dir + "mfe.png")
    plt.close("all")
    # Frame-level comparison of spectral envelopes (frames 50 and 10).
    plt.figure(figsize=(5, 10))
    plt.plot(sp_converted_t[50], color="red")
    plt.plot(sp_converted_s[50], color="green")
    plt.plot(sp_s[50], color="black")
    plt.plot(sp_gained[50], color="blue")
    plt.savefig(pic_dir + "sp.png")
    plt.close("all")
    plt.plot(sp_converted_t[10], color="red")
    plt.plot(sp_converted_s[10], color="green")
    plt.plot(sp_s[10], color="black")
    plt.plot(sp_gained[10], color="blue")
    plt.savefig(pic_dir + "sp2.png")
    plt.close("all")
    plt.imshow(sp_t)
    plt.savefig(pic_dir + "target_sp.png")
    plt.close("all")
    plt.imshow(sp_s)
    plt.savefig(pic_dir + "sp_orig.png")
    plt.close("all")
    plt.imshow(sp_gained)
    plt.savefig(pic_dir + "sp_gained.png")
    plt.close("all")

    # De-emphasis: undo the pre-emphasis applied during feature extraction.
    sp_gained = sp_gained / preemph_transform.reshape(1, -1)

    # Resynthesize speech with WORLD from the converted F0, the gained
    # spectral envelope, and the source aperiodicity.
    wav_transformed = pyworld.synthesize(
        f0_converted[:len(sp_gained)],
        sp_gained,
        ap_s[:len(sp_gained)],
        SAMPLE_RATE,
        FRAME_PERIOD)
    # Normalize the amplitude to [-1, 1].
    wav_result = librosa.util.normalize(wav_transformed)
    wav_result = wav_result.astype(np.float32)
    return wav_result
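
`pitch_conversion` is also defined elsewhere in the repository; a minimal sketch of the standard log-Gaussian normalized F0 transformation it presumably implements (an assumption, not the repository's exact code):

import numpy as np

def pitch_conversion_sketch(f0, mean_log_src, std_log_src,
                            mean_log_target, std_log_target):
    # Shift the source log-F0 distribution onto the target's mean and variance;
    # unvoiced frames (f0 == 0) are left at zero.
    f0 = np.asarray(f0, dtype=np.float64)
    out = np.zeros_like(f0)
    voiced = f0 > 0
    out[voiced] = np.exp((np.log(f0[voiced]) - mean_log_src) / std_log_src
                         * std_log_target + mean_log_target)
    return out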