Пример #1
0
def _get_mfcc_and_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    """Compute MFCCs and normalized linear / mel spectrograms of a waveform.

    Args:
        wav: Waveform samples; passed to ``preemphasis`` before the STFT.
        preemphasis_coeff: Coefficient forwarded to ``preemphasis``.
        n_fft: FFT size used for the STFT and for the mel filter bank.
        win_length: STFT window length.
        hop_length: STFT hop length.

    Returns:
        A tuple ``(mfccs, mag_db, mel_db)``, each transposed so that time is
        the first axis: ``(t, n_mfcc)``, ``(t, 1 + n_fft // 2)`` and
        ``(t, n_mels)``. ``mag_db`` and ``mel_db`` have been passed through
        ``normalize_0_1``; ``mfccs`` is computed from the un-normalized
        dB-scale mel spectrogram.
    """
    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Get spectrogram
    D = librosa.stft(y=y_preem,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length)
    mag = np.abs(D)

    # Get mel-spectrogram.
    # The filter bank must be built with the same FFT size as the STFT above,
    # otherwise its column count does not match the rows of ``mag`` and the
    # dot product fails. Keyword arguments are used because recent librosa
    # releases no longer accept these positionally.
    mel_basis = librosa.filters.mel(sr=hp.default.sr,
                                    n_fft=n_fft,
                                    n_mels=hp.default.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t) # mel spectrogram

    # Get mfccs, amp to db
    mag_db = amp_to_db(mag)
    mel_db = amp_to_db(mel)
    # NOTE(review): librosa.filters.dct appears to be missing from newer
    # librosa releases -- confirm against the pinned librosa version.
    mfccs = np.dot(librosa.filters.dct(hp.default.n_mfcc, mel_db.shape[0]),
                   mel_db)

    # Normalization (0 ~ 1); done after the MFCC step so the MFCCs are derived
    # from the raw dB values.
    mag_db = normalize_0_1(mag_db, hp.default.max_db, hp.default.min_db)
    mel_db = normalize_0_1(mel_db, hp.default.max_db, hp.default.min_db)

    return mfccs.T, mag_db.T, mel_db.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
Пример #2
0
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-10,
                      sample_rate=24000,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False,
                      vocoder='pwg',
                      change_loud=False,
                      loud_range_min=0.9,
                      loud_range_max=1.1):
    """Load a waveform and compute its (compressed) mel spectrogram.

    Args:
        wav_path: Path to an audio file (``str``), or an already loaded
            waveform array, which is used as-is.
        fft_size: FFT size for the STFT and the mel filter bank.
        hop_size: STFT hop length in samples.
        win_length: STFT window length in samples.
        window: Window name forwarded to ``librosa.stft``.
        num_mels: Number of mel bands.
        fmin: Lowest mel filter frequency; ``-1`` means ``0``.
        fmax: Highest mel filter frequency; ``-1`` means ``sample_rate / 2``.
        eps: Floor applied to the mel spectrogram before ``log10``
            (``'pwg'`` vocoder only).
        sample_rate: Sampling rate used for loading and for the filter bank.
        loud_norm: If true, normalize integrated loudness to -22.0 with
            ``pyln``. Must not be combined with ``change_loud``.
        min_level_db: Passed to ``audio.normalize`` for the linear
            spectrogram when ``return_linear`` is true.
        return_linear: If true, also return the linear spectrogram.
        trim_long_sil: If true and ``wav_path`` is a path, load through
            ``trim_long_silences`` instead of ``librosa.core.load``.
        vocoder: ``'pwg'`` (log10 compression) or ``'waveglow'``
            (``audio.dynamic_range_compression``).
        change_loud: If true, multiply the waveform by a random,
            piecewise-linear gain curve and clip to ``[-1, 1]``.
        loud_range_min: Bound used when drawing the attenuating gains.
        loud_range_max: Bound used when drawing the amplifying gains.

    Returns:
        ``(wav, mel)`` or, when ``return_linear`` is true,
        ``(wav, mel, spc)``. ``wav`` is zero-padded and then truncated to
        ``mel.shape[1] * hop_size`` samples; ``mel`` is ``(n_mel_bins, T)``.

    Raises:
        AssertionError: If ``vocoder`` is not ``'pwg'`` or ``'waveglow'``,
            or if ``loud_norm`` and ``change_loud`` are both set.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if change_loud:
        # ``sample_num`` anchors are drawn for each direction (up and down),
        # i.e. roughly one gain anchor per second of audio in total.
        sample_num = wav.shape[0] // (sample_rate * 2) + 1
        random_point = np.random.permutation(wav.shape[0])
        sample_up = random_point[:sample_num]
        sample_down = random_point[sample_num:2 * sample_num]
        # NOTE(review): with the default loud_range_max=1.1 and
        # loud_range_min=0.9 the low bound exceeds the high bound in both
        # draws below, which numpy documents as undefined -- confirm the
        # intended gain ranges.
        fp_up = np.random.uniform(2, loud_range_max, sample_num)
        fp_down = np.random.uniform(loud_range_min, 0.5, sample_num)
        fp = np.concatenate([fp_up, fp_down])
        xp = np.concatenate([sample_up, sample_down])
        # np.interp needs increasing x-coordinates, so sort anchors by position.
        index = np.argsort(xp)
        xp = xp[index]
        fp = fp[index]
        change_curve = np.interp(np.arange(wav.shape[0]), xp, fp)

        wav = wav * change_curve
        # Warn when more than 0.5% of the samples will be clipped.
        if (np.abs(wav) > 1.0).sum() / wav.shape[0] > 1 / 200:
            print("too much wav out of 1", wav_path)
        wav = np.clip(wav, -1.0, 1.0)

    if loud_norm:
        assert not change_loud
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        # Rescale only if loudness normalization pushed the peak above 1.
        peak = np.abs(wav).max()
        if peak > 1:
            wav = wav / peak

    # get amplitude spectrogram
    x_stft = librosa.stft(y=wav,
                          n_fft=fft_size,
                          hop_length=hop_size,
                          win_length=win_length,
                          window=window,
                          pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    # Compare by value: ``is -1`` relies on int caching, emits a
    # SyntaxWarning on Python 3.8+, and misses -1.0 or numpy integers.
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments: recent librosa releases reject positional use here.
    mel_basis = librosa.filters.mel(sr=sample_rate,
                                    n_fft=fft_size,
                                    n_mels=num_mels,
                                    fmin=fmin,
                                    fmax=fmax)
    mel = mel_basis @ spc

    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    elif vocoder == 'waveglow':
        mel = audio.dynamic_range_compression(mel)
    else:
        # Raised explicitly (rather than ``assert False``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(f'"{vocoder}" is not in ["pwg", "waveglow"].')

    # Pad the waveform, then cut it to exactly hop_size samples per mel frame.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel

    spc = audio.amp_to_db(spc)
    spc = audio.normalize(spc, {'min_level_db': min_level_db})
    return wav, mel, spc