def _get_mfcc_and_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    """Compute MFCCs, a linear spectrogram and a mel spectrogram for ``wav``.

    The waveform is pre-emphasised, transformed with an STFT, and the
    magnitude spectrogram is projected onto a mel filterbank.  Both
    spectrograms are converted with ``amp_to_db``; the MFCCs are a DCT of
    the dB mel spectrogram, taken before normalisation.  Finally both dB
    spectrograms are rescaled with ``normalize_0_1`` using
    ``hp.default.max_db`` / ``hp.default.min_db``.

    Args:
        wav: Waveform passed to ``preemphasis``.
        preemphasis_coeff: Coefficient forwarded to ``preemphasis``.
        n_fft: FFT size for the STFT and the mel filterbank.
        win_length: STFT window length.
        hop_length: STFT hop length.

    Returns:
        Tuple ``(mfccs, mag_db, mel_db)``, each transposed to time-major:
        ``(t, n_mfccs)``, ``(t, 1 + n_fft // 2)``, ``(t, n_mels)``.

    NOTE(review): this block was syntactically broken in the source (several
    call expressions had lost their leading tokens).  The sample-rate
    argument (``hp.default.sr``), the MFCC count (``hp.default.n_mfcc``) and
    the ``np.dot`` / ``librosa.filters.dct`` calls are reconstructed from the
    surviving arguments and shape comments -- confirm against the hparams
    definition and the installed librosa version.
    """
    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Magnitude spectrogram, (1 + n_fft // 2, t)
    D = librosa.stft(y=y_preem, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length)
    mag = np.abs(D)

    # Mel spectrogram.  The filterbank is built from the same ``n_fft`` as
    # the STFT above; using a separately configured FFT size here would make
    # the matrix product fail whenever the two values differ.
    mel_basis = librosa.filters.mel(sr=hp.default.sr, n_fft=n_fft,
                                    n_mels=hp.default.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t)

    # Amplitude -> dB, then MFCCs as a DCT of the dB mel spectrogram
    mag_db = amp_to_db(mag)
    mel_db = amp_to_db(mel)
    mfccs = np.dot(librosa.filters.dct(hp.default.n_mfcc, mel_db.shape[0]),
                   mel_db)

    # Normalization (0 ~ 1)
    mag_db = normalize_0_1(mag_db, hp.default.max_db, hp.default.min_db)
    mel_db = normalize_0_1(mel_db, hp.default.max_db, hp.default.min_db)

    # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
    return mfccs.T, mag_db.T, mel_db.T
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-10,
                      sample_rate=24000,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False,
                      vocoder='pwg',
                      change_loud=False,
                      loud_range_min=0.9,
                      loud_range_max=1.1):
    """Load (or accept) a waveform and compute its mel spectrogram.

    Args:
        wav_path: Path of an audio file (``str``) or an already loaded
            waveform array; any non-``str`` value is used as the waveform.
        fft_size: FFT size for the STFT and the mel filterbank.
        hop_size: STFT hop length in samples.
        win_length: STFT window length in samples.
        window: Window name forwarded to ``librosa.stft``.
        num_mels: Number of mel bands.
        fmin: Lowest mel frequency; ``-1`` selects ``0``.
        fmax: Highest mel frequency; ``-1`` selects ``sample_rate / 2``.
        eps: Floor applied to the mel energies before ``log10`` (``'pwg'``).
        sample_rate: Sample rate used for loading and for the filterbank.
        loud_norm: If true, loudness-normalise the waveform to -22.0 with
            ``pyln`` and rescale it if the peak exceeds 1.  Must not be
            combined with ``change_loud``.
        min_level_db: Passed to ``audio.normalize`` for the linear
            spectrogram; only used when ``return_linear`` is true.
        return_linear: If true, additionally return the normalised dB
            linear spectrogram.
        trim_long_sil: If true (and ``wav_path`` is a ``str``), load the
            audio through ``trim_long_silences`` instead of ``librosa``.
        vocoder: ``'pwg'`` (``log10`` mel) or ``'waveglow'``
            (``audio.dynamic_range_compression``).
        change_loud: If true, multiply the waveform by a random,
            piecewise-linear gain curve and clip it to ``[-1, 1]``.
        loud_range_min: One bound of the random "down" gain anchors.
        loud_range_max: One bound of the random "up" gain anchors.

    Returns:
        ``(wav, mel)``, or ``(wav, mel, spc)`` when ``return_linear`` is
        true.  ``mel`` has shape ``(num_mels, T)``; ``wav`` is zero-padded
        and then truncated to ``T * hop_size`` samples.

    Raises:
        AssertionError: If ``vocoder`` is not supported, or if ``loud_norm``
            and ``change_loud`` are both requested.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if change_loud:
        # One "up" and one "down" anchor per started 2 seconds of audio.
        sample_num = wav.shape[0] // (sample_rate * 2) + 1
        random_point = np.random.permutation(wav.shape[0])
        sample_up = random_point[:sample_num]
        sample_down = random_point[sample_num:2 * sample_num]
        fp_up = np.random.uniform(2, loud_range_max, sample_num)
        fp_down = np.random.uniform(loud_range_min, 0.5, sample_num)
        fp = np.concatenate([fp_up, fp_down])
        xp = np.concatenate([sample_up, sample_down])
        # np.interp needs increasing x coordinates.
        index = np.argsort(xp)
        xp = xp[index]
        fp = fp[index]
        change_curve = np.interp(np.arange(wav.shape[0]), xp, fp)
        wav = wav * change_curve
        if (np.abs(wav) > 1.0).sum() / wav.shape[0] > 1 / 200:
            print("too much wav out of 1", wav_path)
        wav = np.clip(wav, -1.0, 1.0)

    if loud_norm:
        assert not change_loud
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # Amplitude spectrogram.  Keyword arguments keep the call valid on
    # librosa releases where these parameters are keyword-only.
    x_stft = librosa.stft(y=wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window,
                          pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # Mel basis; -1 is the "use the default" sentinel.  Compare by value:
    # identity comparison with an int literal is implementation-dependent.
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size,
                                    n_mels=num_mels, fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc

    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    elif vocoder == 'waveglow':
        mel = audio.dynamic_range_compression(mel)
    else:
        # Explicit raise (not ``assert False``) so the check survives
        # ``python -O`` instead of silently returning an uncompressed mel.
        raise AssertionError(f'"{vocoder}" is not in ["pwg", "waveglow"].')

    # Pad, then truncate, so the waveform spans exactly T * hop_size samples.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel

    spc = audio.amp_to_db(spc)
    spc = audio.normalize(spc, {'min_level_db': min_level_db})
    return wav, mel, spc