Example #1
0
    def compute_mel(wav, f0_clean, duration=None, basename=None, snr=None):
        if duration is not None:
            # Compute mel-scale spectrogram from preprocessed wav with duration
            mel_spectrogram, energy, clipt = get_mel_and_energy(wav,
                                                                duration,
                                                                norm=False)

            # Compute rescaled energy (0 to 1)
            min_, max_ = hp.energy_min, hp.energy_max
            energy_rescaled = (energy - min_) / (max_ - min_)
            energy_rescaled = np.clip(energy_rescaled, 0, 1)

            # Compute normalized pitch
            # Escalate warnings to errors so a degenerate f0 track makes
            # normalization raise instead of silently returning NaNs.
            warnings.filterwarnings('error')
            # Get f0 from the noisy dataset.
            f0 = get_f0_noisy(wav, duration)
            if (f0 == 0.).all() or (energy_rescaled == 0.).all():
                if (f0 == 0.).all():
                    print("all zero f0! basename:{} SNR: {}".format(
                        basename, snr))
                else:
                    print("all zero energy! basename:{} SNR: {}".format(
                        basename, snr))
            try:
                f0_norm = utils.speaker_normalization(f0)
            except Warning:
                index_nonzero = (f0 > -1e10)
                mean_f0, std_f0 = np.mean(f0[index_nonzero]), np.std(
                    f0[index_nonzero])
                print(
                    'Warning was raised as an exception! basename: {}, mean_f0={} std_f0={}'
                    .format(basename, mean_f0, std_f0))
                # Fall back to normalizing the clean-reference f0.
                f0_norm = utils.speaker_normalization(f0_clean)
            warnings.resetwarnings()  # stop escalating warnings to errors

            mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
            f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
            energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)

            # Sanity check
            mel_clean = np.load(
                os.path.join(out_dir, 'mel_clean', mel_filename))
            assert mel_clean.shape == mel_spectrogram.T.shape, "Computed mel should be the same size as the pre-calculated one."
            return (mel_spectrogram.T, f0, f0_norm, energy,
                    energy_rescaled), (mel_filename, f0_filename,
                                       energy_filename), clipt
        else:
            # Get mel without any prior duration info
            mel_spectrogram, _, _ = Audio.tools.get_mel_from_wav(
                torch.FloatTensor(wav), norm=False)
            return mel_spectrogram.T
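
A note on the `try`/`except Warning` above: it only works because `warnings.filterwarnings('error')` escalates warnings to exceptions, so a degenerate f0 track makes `utils.speaker_normalization` raise instead of returning NaNs. A minimal sketch of a single-argument normalizer consistent with that behavior (hypothetical; the repo's actual helper may differ):

import numpy as np

def speaker_normalization(f0):
    # Hypothetical sketch: scale voiced f0 to [0, 1] per utterance.
    f0 = f0.astype(float).copy()
    voiced = f0 != 0.0
    # With warnings escalated to errors, an all-unvoiced track raises here
    # ("Mean of empty slice"), as does std_f0 == 0 in the division below.
    mean_f0, std_f0 = np.mean(f0[voiced]), np.std(f0[voiced])
    f0[voiced] = (f0[voiced] - mean_f0) / (4.0 * std_f0)
    f0[voiced] = np.clip(f0[voiced], -1.0, 1.0)
    f0[voiced] = (f0[voiced] + 1.0) / 2.0
    return f0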
Example #2
0
def _processing_data(hparams, full_path, spk_label, spk_emb, gender, npz_name,
                     pbar, i):
    if gender == 'M':
        lo, hi = 50, 250  # male pitch search range (Hz)
    elif gender == 'F':
        lo, hi = 100, 600  # female pitch search range (Hz)
    else:
        raise ValueError("gender must be 'M' or 'F', got {!r}".format(gender))

    prng = RandomState(random.randint(0, 2**32 - 1))  # integer seed for the dither noise
    x, fs = librosa.load(full_path, sr=hparams.sample_rate)
    assert fs == hparams.sample_rate
    if x.shape[0] % hparams.hop_size == 0:
        # Nudge the length off an exact hop-size multiple so the spectrogram
        # and f0 tracks end up with matching frame counts.
        x = np.concatenate((x, np.array([1e-06])), axis=0)
    y = signal.filtfilt(b, a, x)  # zero-phase high-pass filtering
    wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06  # add tiny dither

    # compute spectrogram
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - hparams.ref_level_db
    S = (D_db + 100) / 100

    # extract f0
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                        fs,
                        hparams.hop_size,
                        min=lo,
                        max=hi,
                        otype=2)
    index_nonzero = (f0_rapt != -1e10)
    mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(
        f0_rapt[index_nonzero])
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    assert len(S) == len(f0_rapt)

    data = {
        'mel': S.astype(np.float32),
        'f0': f0_norm.astype(np.float32),
        'spk_label': spk_label
    }
    if spk_emb is not None:
        data['spk_emb'] = spk_emb

    np.savez(npz_name, **data)
    pbar.update(1)  # advance the progress bar by one processed file
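
This function also assumes several module-level objects (`b`, `a`, `mel_basis`, `min_level`, `speaker_normalization`) that are defined elsewhere. A sketch of plausible definitions, modeled on the SpeechSplit-style preprocessing this code mirrors; the filter cutoff, mel parameters, and sample rate are assumptions:

import numpy as np
from librosa.filters import mel
from scipy import signal

def butter_highpass(cutoff, fs, order=5):
    # High-pass Butterworth filter, e.g. to remove DC offset and rumble.
    normal_cutoff = cutoff / (0.5 * fs)
    return signal.butter(order, normal_cutoff, btype='high', analog=False)

b, a = butter_highpass(30, 16000, order=5)  # cutoff and rate are assumptions
mel_basis = mel(sr=16000, n_fft=1024, n_mels=80, fmin=90, fmax=7600).T
min_level = np.exp(-100 / 20 * np.log(10))  # -100 dB floor

def speaker_normalization(f0, index_nonzero, mean_f0, std_f0):
    # Map voiced log-f0 to [0, 1]; unvoiced sentinel values are left
    # untouched for a downstream quantizer to handle.
    f0 = f0.astype(float).copy()
    f0[index_nonzero] = (f0[index_nonzero] - mean_f0) / (4.0 * (std_f0 + 1e-6))
    f0[index_nonzero] = np.clip(f0[index_nonzero], -1.0, 1.0)
    f0[index_nonzero] = (f0[index_nonzero] + 1.0) / 2.0
    return f0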
Example #3
0
def extract_f0(wav, fs):
    # `lo`/`hi` (pitch search bounds, Hz) and `device` are assumed to be
    # module-level globals.
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                        fs,
                        256,
                        min=lo,
                        max=hi,
                        otype=2)
    index_nonzero = (f0_rapt != -1e10)
    mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(
        f0_rapt[index_nonzero])
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    # One-hot quantize the normalized f0 and add a batch dimension.
    f0_quantized = quantize_f0_numpy(f0_norm)[0]
    f0_onehot = f0_quantized[np.newaxis, :, :]

    # Zero-pad along the time axis up to the model's expected length.
    if f0_onehot.shape[1] <= 192:
        f0_onehot, _ = pad_seq_to_2(f0_onehot, 192)

    return torch.from_numpy(f0_onehot).to(device)
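
`extract_f0` leans on two helpers that are not shown. A sketch of what they plausibly do, modeled on the SpeechSplit utilities this snippet resembles; the bin count and padding semantics are assumptions:

import numpy as np

def quantize_f0_numpy(x, num_bins=256):
    # One-hot quantize normalized f0 in [0, 1]; bin 0 is reserved for unvoiced.
    assert x.ndim == 1
    x = x.astype(float).copy()
    uv = (x <= 0)
    x[uv] = 0.0
    x = np.round(x * (num_bins - 1)) + 1  # voiced frames land in bins 1..num_bins
    x[uv] = 0.0
    enc = np.zeros((len(x), num_bins + 1), dtype=np.float32)
    enc[np.arange(len(x)), x.astype(np.int32)] = 1.0
    return enc, x.astype(np.int64)

def pad_seq_to_2(x, len_out=192):
    # Zero-pad a (batch, time, dim) array along the time axis to len_out.
    len_pad = len_out - x.shape[1]
    assert len_pad >= 0
    return np.pad(x, ((0, 0), (0, len_pad), (0, 0)), 'constant'), len_pad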
Example #4
0
            y = signal.filtfilt(b, a, x)
            wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

            # compute spectrogram
            D = pySTFT(wav).T
            D_mel = np.dot(D, mel_basis)
            D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
            S = (D_db + 100) / 100

            # extract f0
            f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768,
                                fs,
                                256,
                                min=lo,
                                max=hi,
                                otype=2)
            index_nonzero = (f0_rapt != -1e10)
            mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std(
                f0_rapt[index_nonzero])
            f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0,
                                            std_f0)

            assert len(S) == len(f0_rapt)

            np.save(os.path.join(targetDir, basename),
                    S.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(targetDir_f0, basename),
                    f0_norm.astype(np.float32),
                    allow_pickle=False)
    print("All done in {:.3f}s".format(time.perf_counter() - start_time_total))
Example #5
0
def synthesize_with_reference(idx_info, name, noisy_input, audio_path, tg_path,
                              speaker_id, inspection):
    global model, vocoder, step
    start_time = time.perf_counter()

    # Prepare Reference Data
    if speaker_id is not None:
        spker_embed_path = os.path.join(
            hp.preprocessed_path, "spker_embed",
            "{}-spker_embed-{}.npy".format(hp.dataset, speaker_id))
        speaker_embed = torch.from_numpy(np.load(spker_embed_path)).to(device)
    else:
        try:
            # VCTK fileformat
            speaker_id = name.split("_")[0]
            spker_embed_path = os.path.join(
                hp.preprocessed_path, "spker_embed",
                "{}-spker_embed-{}.npy".format(hp.dataset, speaker_id))
            speaker_embed = torch.from_numpy(
                np.load(spker_embed_path)).to(device)
        except Exception:
            # General case: no precomputed embedding, so predict one
            # directly from the reference audio.
            speaker_id = None
            speaker_embed = torch.from_numpy(
                embedding.predict_embedding(speaker_embedder, audio_path))

    # Output directory
    outdir = os.path.join(hp.test_path(),
                          "{}_by_{}_{}".format(name, speaker_id, step))
    os.makedirs(outdir, exist_ok=True)

    text = utils.get_transcript(audio_path.replace(".wav", ".txt"))
    if not os.path.isfile(tg_path):
        tg_path = "NO TextGrid"
        _, wav = read(audio_path)
        if noisy_input:
            f0 = sptk.rapt(wav.astype(np.float32) * hp.max_wav_value,
                           hp.sampling_rate,
                           hp.hop_length,  # frame shift in samples
                           min=hp.f0_min,
                           max=hp.f0_max,
                           otype=2)  # log f0
            f0 = np.exp(f0)  # back to linear-scale f0 in Hz
        else:
            f0, _ = pw.dio(wav.astype(np.float64),
                           hp.sampling_rate,
                           frame_period=hp.hop_length / hp.sampling_rate *
                           1000)
        mel, energy, _ = Audio.tools.get_mel_from_wav(
            torch.FloatTensor(wav))
        mel = mel.T.numpy().astype(np.float32)
        energy = energy.numpy().astype(np.float32)
    else:
        f0, energy, mel = get_processed_data_from_wav(audio_path, tg_path,
                                                      noisy_input)
    # Plot the reference spectrogram (identical for both branches above).
    utils.plot_data([(mel.T, f0, energy)], ['Reference Spectrogram'],
                    filename=os.path.join(
                        outdir,
                        '{}_{}_{}.png'.format("Reference", name,
                                              text[:100])))

    # Prepare Audio Inputs
    energy = (energy - hp.energy_min) / (hp.energy_max - hp.energy_min)
    f0_norm = utils.speaker_normalization(f0)
    mel, mel_len, energy, f0, f0_norm = preprocess_audio(
        mel, energy, f0, f0_norm)

    print("\n\n---------------- [{}/{}]: {} ----------------".format(
        idx_info[0] + 1, idx_info[1],
        audio_path.split('/')[-1]))
    print('Audio Path:', audio_path)
    print('TextGrid Path:', tg_path)
    print('Speaker ID:', speaker_id)

    # Synthesize each sentence; count successes so one failure does not
    # abort the whole batch.
    success = 0
    for sentence in sentences:
        text = preprocess_text(sentence)
        try:
            synthesize(outdir, model, vocoder, text, sentence, speaker_embed,
                       speaker_id, inspection, mel, mel_len, f0, f0_norm,
                       energy, args.duration_control, args.pitch_control,
                       args.energy_control)
            success += 1
        except Exception as e:
            print("Failed to synthesize '{}': {}".format(sentence, e))
    print("Synthesized {} out of {} in {:.3f}s".format(
        success, len(sentences),
        time.perf_counter() - start_time))
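
`preprocess_audio` is referenced above but not defined here. A plausible sketch, assuming it simply batches the reference features into tensors on `device`; the exact layout expected by `synthesize` is an assumption:

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def preprocess_audio(mel, energy, f0, f0_norm):
    # Hypothetical helper: add a batch dimension and move inputs to `device`.
    mel = torch.from_numpy(mel).float().unsqueeze(0).to(device)
    mel_len = torch.LongTensor([mel.shape[1]]).to(device)
    energy = torch.from_numpy(energy.astype(np.float32)).unsqueeze(0).to(device)
    f0 = torch.from_numpy(f0.astype(np.float32)).unsqueeze(0).to(device)
    f0_norm = torch.from_numpy(f0_norm.astype(np.float32)).unsqueeze(0).to(device)
    return mel, mel_len, energy, f0, f0_norm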