def preprocess(file_path, sr=32000, mono=True, n_fft=1024, hop_length=192,
               n_mels=128, fmax=None, log_spec=False):
    """Load an audio file and convert it to a weighted mel spectrogram.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.
    sr : int
        Target sampling rate passed to ``librosa.load``.
    mono : bool
        If True, downmix to a single channel; otherwise keep all channels.
    n_fft, hop_length : int
        STFT analysis parameters.
    n_mels : int
        Number of mel bands.
    fmax : float or None
        Upper frequency bound of the mel filterbank.
    log_spec : bool
        If True, weight the magnitude STFT as log10(1 + |S|); otherwise
        perceptually (A-)weight the power spectrogram.

    Returns
    -------
    np.ndarray
        (n_mels, frames) for mono input, (channels, n_mels, frames)
        for multi-channel input.
    """
    if mono:
        sig, sr = librosa.load(file_path, sr=sr, mono=True)
        sig = sig[np.newaxis]  # uniform (channels, samples) layout
    else:
        sig, sr = librosa.load(file_path, sr=sr, mono=False)

    spectrograms = []
    for y in sig:
        # Magnitude STFT of this channel.
        stft = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                                   win_length=None, window='hann',
                                   center=True, pad_mode='reflect'))
        # Spectrogram weighting.
        if log_spec:
            stft = np.log10(stft + 1)
        else:
            freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
            stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0,
                                                amin=1e-10, top_db=99.0)
        # Apply the mel filterbank.
        spectrograms.append(librosa.feature.melspectrogram(
            S=stft, sr=sr, n_mels=n_mels, fmax=fmax))

    # BUG FIX: the original loop overwrote `spectrogram` each iteration and
    # returned only the LAST channel for multi-channel input. Mono output
    # is unchanged (a single 2-D array).
    if mono:
        return spectrograms[0]
    return np.asarray(spectrograms)
def __spectrogram_V1(self, signal, fft_window_size, hop_length,
                     log_spectrogram, n_mels, fmax):
    """Turn a 1-D signal into a mel spectrogram.

    Depending on ``log_spectrogram``, the magnitude STFT is either
    log-compressed (log10(1 + |S|)) or perceptually (A-)weighted as a
    power spectrogram before the mel filterbank is applied.
    """
    # Magnitude STFT of the input signal.
    magnitudes = np.abs(librosa.stft(signal,
                                     n_fft=fft_window_size,
                                     hop_length=hop_length,
                                     win_length=None,
                                     window='hann',
                                     center=True,
                                     pad_mode='reflect'))

    if log_spectrogram:
        # Simple log compression of the magnitudes.
        weighted = np.log10(magnitudes + 1)
    else:
        # Perceptual (A-)weighting of the power spectrogram.
        bin_freqs = librosa.core.fft_frequencies(sr=self.sample_rate,
                                                 n_fft=fft_window_size)
        weighted = librosa.perceptual_weighting(magnitudes**2, bin_freqs,
                                                ref=1.0, amin=1e-10,
                                                top_db=99.0)

    # Collapse frequency bins onto the mel scale.
    mel = librosa.feature.melspectrogram(S=weighted, sr=self.sample_rate,
                                         n_mels=n_mels, fmax=fmax)
    return np.asarray(mel)
def processor_d18(file_path):
    """Load an audio file and return perceptually weighted mel spectrograms
    in the d18 configuration (22.05 kHz, n_fft=2048, 256 mel bands).

    Returns a float32 array of shape (channels, n_mels, frames); with
    ``mono`` fixed to True there is exactly one channel.
    """
    n_fft = 2048
    sr = 22050  # target rate; resampling is the slowest part of loading
    mono = True
    # BUG FIX: `log_spec` was commented out in the original but is still
    # referenced below, raising NameError at runtime. Restore it.
    log_spec = False
    n_mels = 256
    hop_length = 512
    fmax = None

    if mono:
        sig, sr = librosa.load(file_path, sr=sr, mono=True)
        sig = sig[np.newaxis]  # uniform (channels, samples) layout
    else:
        sig, sr = librosa.load(file_path, sr=sr, mono=False)

    spectrograms = []
    for y in sig:
        # librosa.stft prefers contiguous input; force Fortran order.
        stft = librosa.stft(np.asfortranarray(y), n_fft=n_fft,
                            hop_length=hop_length, win_length=None,
                            window='hann', center=True, pad_mode='reflect')
        # Keep only magnitudes.
        stft = np.abs(stft)
        # Spectrogram weighting.
        if log_spec:
            stft = np.log10(stft + 1)
        else:
            freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
            stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0,
                                                amin=1e-10, top_db=80.0)
        # Apply the mel filterbank and keep the result.
        spectrogram = librosa.feature.melspectrogram(S=stft, sr=sr,
                                                     n_mels=n_mels,
                                                     fmax=fmax)
        spectrograms.append(np.asarray(spectrogram))

    return np.asarray(spectrograms, dtype=np.float32)
def pre_process(pathname):
    """Load an audio file and return its perceptually weighted log-mel
    spectrogram as a numpy array of shape (n_mels, frames)."""
    sampling_rate = 32000
    hop_length = 192
    fmax = None
    n_mels = 128
    n_fft = 1024

    # NOTE(review): the file is loaded at its native rate (sr=None) while
    # the FFT bin frequencies below use sampling_rate=32000 — these
    # disagree whenever the file is not 32 kHz; confirm intent.
    y, sr = librosa.load(pathname, sr=None)

    # Magnitudes of the STFT.
    stft = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                               window='hann', center=True,
                               pad_mode='reflect'))
    print('stft shape:', stft.shape)

    freqs = librosa.core.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
    # BUG FIX: the original passed `stft * 2` (twice the magnitude) where
    # perceptual_weighting expects the power spectrogram `stft**2`, as in
    # every sibling routine in this file.
    stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0,
                                        amin=1e-10, top_db=99.0)
    print('stft shape:', stft.shape)

    # Apply the mel filterbank.
    mel_spect = librosa.feature.melspectrogram(S=stft, sr=sampling_rate,
                                               n_mels=n_mels, fmax=fmax)
    print('mel shape:', mel_spect.shape)

    log_mel_spect = librosa.core.power_to_db(mel_spect)
    print('log mel shape:', log_mel_spect.shape)
    return np.asarray(log_mel_spect)
def process(self, file_path, **kwargs):
    """Load an audio file and return perceptually weighted mel
    spectrograms, one per channel: shape (channels, n_mels, frames)."""
    n_fft = 1024
    sr = 32000
    mono = True
    log_spec = False
    n_mels = 128
    hop_length = 192
    fmax = None

    if mono:
        sig, sr = librosa.load(file_path, sr=sr, mono=True)
        sig = sig[np.newaxis]  # treat mono as a one-channel batch
    else:
        sig, sr = librosa.load(file_path, sr=sr, mono=False)

    specs = []
    for channel in sig:
        # Magnitude STFT of this channel.
        mags = np.abs(librosa.stft(channel, n_fft=n_fft,
                                   hop_length=hop_length, win_length=None,
                                   window='hann', center=True,
                                   pad_mode='reflect'))
        if log_spec:
            # Log compression of the magnitudes.
            mags = np.log10(mags + 1)
        else:
            # Perceptual (A-)weighting of the power spectrogram.
            bin_freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
            mags = librosa.perceptual_weighting(mags**2, bin_freqs,
                                                ref=1.0, amin=1e-10,
                                                top_db=99.0)
        # Mel filterbank, then keep the spectrogram.
        specs.append(np.asarray(librosa.feature.melspectrogram(
            S=mags, sr=sr, n_mels=n_mels, fmax=fmax)))

    return np.asarray(specs)
def mel_weight(S, power):
    """Perceptually weight a mel spectrogram and normalize it.

    ``S`` is raised to ``power`` after taking magnitudes, A-weighted per
    mel-band center frequency, then shifted by the reference level and
    normalized with the module's ``_normalize``.
    """
    global _mel_freqs
    # Lazily compute and cache the mel-band center frequencies; they only
    # depend on the number of bands (S.shape[0]) and hparams.fmin.
    if _mel_freqs is None:
        _mel_freqs = librosa.mel_frequencies(S.shape[0], fmin=hparams.fmin)
    weighted = librosa.perceptual_weighting(np.abs(S)**power, _mel_freqs,
                                            ref=hparams.ref_level_db)
    return _normalize(weighted - hparams.ref_level_db)
def compute_loudness(self, n_fft=256):
    """Estimate a per-frame loudness curve of ``self.audio_raw``.

    Stores:
      self.loudness  -- perceptually weighted log-power summed over
                        frequency bins, one value per STFT frame
      self.n_points  -- samples per analysis window (default hop, n_fft/4)
      self.n_windows -- number of windows covering the signal
    """
    fourier = librosa.stft(self.audio_raw, n_fft=n_fft)
    # X * conj(X) is the power spectrogram |X|^2 (real up to rounding).
    S = np.abs(fourier * np.conj(fourier))
    # BUG FIX: S is already a power spectrogram; the original passed S**2
    # (i.e. |X|^4) to perceptual_weighting, which expects power.
    log_S = librosa.perceptual_weighting(
        S, librosa.fft_frequencies(n_fft=n_fft))
    self.loudness = log_S.sum(axis=0, keepdims=True)[0]
    self.n_points = int(n_fft / 4)
    self.n_windows = int(np.ceil(len(self.audio_raw) / self.n_points))
    print('processing windows: {}'.format(self.n_windows))
    print('points per windows: {}'.format(self.n_points))
def extractor(self, wav, sr, feature_type, full_file_path):
    """Feature extractors (and post-augmenters) for wav files.

    TODO: the extractor takes a file; ``librosa`` expects wav data as
    ``float`` rather than ``int`` -- extra conversion is needed.

    Adding a new extractor amounts to adding a new ``elif`` branch.

    :param wav: ``numpy.ndarray``, the loaded wav file
    :param sr: sample rate of the wav file
    :param feature_type: ``str``, name of the extractor
    :param full_file_path: ``str``, full path to the wav file
    """
    if feature_type == 'raw':
        # Raw samples, unchanged.
        features = wav
    elif feature_type == 'fft':
        # Fourier coefficients (spectrogram).
        _, _, spectr = signal.spectrogram(wav, nperseg=200, nfft=200,
                                          fs=8000, noverlap=128)
        features = spectr
    elif feature_type == 'mel':
        features = librosa.feature.melspectrogram(wav, n_mels=128, sr=sr,
                                                  n_fft=2048,
                                                  hop_length=1024)
    elif feature_type == 'mfcc':
        features = librosa.feature.mfcc(wav, n_mfcc=40, sr=sr)
    elif feature_type == 'percep_spec':
        # Perceptually (A-)weighted mel spectrogram.
        n_fft = 512
        stft = librosa.stft(wav, n_fft=n_fft, hop_length=n_fft // 4,
                            win_length=None, window='hann', center=True,
                            pad_mode='reflect')
        stft = np.abs(stft)
        freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
        stft = librosa.perceptual_weighting(stft ** 2, freqs, ref=1.0,
                                            amin=1e-10, top_db=99.0)
        features = librosa.feature.melspectrogram(S=stft, sr=sr,
                                                  n_mels=128, fmax=sr//2)
    elif feature_type == 'vggish':
        # Embeddings from the VGGish model.
        from models.vggish import vggish_gen_embeddings, \
            VGGISH_CHECKPOINT_PATH, VGGISH_PCA_PARAMS_PATH
        features = vggish_gen_embeddings.get_embeddings(
            VGGISH_CHECKPOINT_PATH, VGGISH_PCA_PARAMS_PATH, full_file_path)
    else:
        # BUG FIX: the original fell through with ``pass`` and then hit
        # UnboundLocalError on ``return features``; fail with a clear
        # error for unknown feature types instead.
        raise ValueError('unknown feature_type: {!r}'.format(feature_type))
    return features
def spectrogram(y, power, pcen=False):
    """Return (normalized perceptually weighted spectrogram, raw STFT).

    The first element is the A-weighted, level-shifted, normalized
    spectrogram of the (optionally preemphasized) signal; the second is
    the plain STFT of the untouched input.
    NOTE: ``pcen`` is currently unused by this implementation.
    """
    global _mel_freqs
    # STFT of the raw, non-preemphasized signal (returned as-is).
    raw_stft = librosa.stft(y, n_fft=hparams.fft_size,
                            hop_length=hparams.hop_size)
    if hparams.use_preemphasis:
        y = preemphasis(y)
    S = librosa.stft(y, n_fft=hparams.fft_size,
                     hop_length=hparams.hop_size)
    if _mel_freqs is None:
        # Cache the frequency axis; it depends only on the STFT size.
        _mel_freqs = librosa.mel_frequencies(S.shape[0], fmin=hparams.fmin)
    weighted = librosa.perceptual_weighting(np.abs(S)**power, _mel_freqs,
                                            ref=hparams.ref_level_db)
    return _normalize(weighted - hparams.ref_level_db), raw_stft
def cqtgram(y, sr, hop_length=512, octave_bins=24, n_octaves=8, fmin=40,
            perceptual_weighting=False):
    """Constant-Q spectrogram in dB, optionally perceptually weighted.

    Computes ``octave_bins * n_octaves`` CQT bins starting at ``fmin``;
    either A-weights the power spectrogram per bin frequency or converts
    magnitudes to dB relative to the maximum.
    """
    total_bins = octave_bins * n_octaves
    magnitude = np.abs(librosa.cqt(y, sr=sr, hop_length=hop_length,
                                   bins_per_octave=octave_bins,
                                   n_bins=total_bins, fmin=fmin))
    if perceptual_weighting:
        # A-weight the power spectrogram according to each bin's frequency.
        bin_freqs = librosa.cqt_frequencies(magnitude.shape[0], fmin=fmin,
                                            bins_per_octave=octave_bins)
        return librosa.perceptual_weighting(magnitude**2, bin_freqs,
                                            ref=np.max)
    return librosa.amplitude_to_db(magnitude, ref=np.max)
def predictOne(self, samples: Signal):
    """Compute the constant-Q transform of the audio with librosa.

    Args:
        samples (Signal): The samples of the audio.

    Returns:
        tuple of Signal: one Signal holding the CQT (time on the first
        axis), sampled at ``sampleRate / hopLength``.
    """
    sr = samples.sampleRate
    hop_length = self.parameters["hopLength"].value
    n_bins = self.parameters["binNumber"].value
    cqt_sr = sr / hop_length
    linear_cqt = np.abs(librosa.cqt(samples.values, sr=sr,
                                    hop_length=hop_length, n_bins=n_bins))

    scale = self.parameters["scale"].value
    if scale == "Amplitude":
        result = linear_cqt
    elif scale == "Power":
        result = linear_cqt**2
    elif scale == "MSAF":
        result = librosa.amplitude_to_db(linear_cqt**2, ref=np.max)
        # Shift so the minimum maps to 0 (the original author was unsure
        # this "inversion" of the dB scale is correct).
        result += np.min(result) * -1
    elif scale == "Power dB":
        # Standard power spectrum in dB, per librosa.
        result = librosa.amplitude_to_db(linear_cqt, ref=np.max)
        result += np.min(result) * -1
    elif scale == "Perceived dB":
        freqs = librosa.cqt_frequencies(linear_cqt.shape[0],
                                        fmin=librosa.note_to_hz('C1'))
        result = librosa.perceptual_weighting(linear_cqt**2, freqs,
                                              ref=np.max)
        result += np.min(result) * -1
    else:
        raise ValueError("parameterScale is not a correct value")
    return (Signal(result.T, sampleRate=cqt_sr), )
def _filename_to_spec(file_path, n_fft=1024, sr=44100, mono=True,
                      log_spec=False, n_mels=64, hop_length=512, fmax=None):
    """Load a file and return its mel spectrogram transposed to
    (frames, n_mels)."""
    samples, sr = librosa.load(file_path, sr=sr, mono=mono)

    # Magnitude STFT (phases discarded).
    magnitudes = np.abs(librosa.stft(samples, n_fft=n_fft,
                                     hop_length=hop_length, win_length=None,
                                     window='hann', center=True,
                                     pad_mode='reflect'))

    # Select the spectrogram weighting.
    if log_spec:
        weighted = np.log10(magnitudes + 1)
    else:
        bin_freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
        weighted = librosa.perceptual_weighting(magnitudes**2, bin_freqs,
                                                ref=1.0, amin=1e-10,
                                                top_db=99.0)

    # Mel filterbank, then put time on the first axis.
    spectrogram = librosa.feature.melspectrogram(S=weighted, sr=sr,
                                                 n_mels=n_mels,
                                                 fmax=fmax).T
    # Sanity check: mel bands end up on the second axis after transposing.
    assert spectrogram.shape[1] == n_mels
    return spectrogram
def cqtgram(self, y, hop_length=512, octave_bins=24, n_octaves=8, fmin=40,
            perceptual_weighting=False):
    """Constant-Q spectrogram in dB, optionally perceptually weighted.

    FIX: updated from removed librosa APIs to the modern ones already used
    elsewhere in this file: ``librosa.cqt`` no longer accepts ``real=``,
    ``logamplitude`` was replaced by ``amplitude_to_db``, and
    ``perceptual_weighting`` takes ``ref`` instead of ``ref_power``.
    """
    S = np.abs(librosa.cqt(y, sr=self.sr, hop_length=hop_length,
                           bins_per_octave=octave_bins,
                           n_bins=octave_bins * n_octaves, fmin=fmin))
    if perceptual_weighting:
        # A-weight the power spectrogram per bin frequency.
        freqs = librosa.cqt_frequencies(S.shape[0], fmin=fmin,
                                        bins_per_octave=octave_bins)
        S = librosa.perceptual_weighting(S**2, freqs, ref=np.max)
    else:
        # amplitude_to_db(S, ref=max) == old logamplitude(S**2,
        # ref_power=max): both give 20*log10(S / S.max()).
        S = librosa.amplitude_to_db(S, ref=np.max)
    return S
def cqtgram(self, y, hop_length=512, octave_bins=24, n_octaves=8, fmin=40,
            perceptual_weighting=False):
    """Constant-Q spectrogram in dB, optionally perceptually weighted.

    FIX: updated from removed librosa APIs to the modern ones already used
    elsewhere in this file: ``librosa.cqt`` no longer accepts ``real=``,
    ``logamplitude`` was replaced by ``amplitude_to_db``, and
    ``perceptual_weighting`` takes ``ref`` instead of ``ref_power``.
    """
    S = np.abs(librosa.cqt(y, sr=self.sr, hop_length=hop_length,
                           bins_per_octave=octave_bins,
                           n_bins=octave_bins * n_octaves, fmin=fmin))
    if perceptual_weighting:
        # A-weight the power spectrogram per bin frequency.
        freqs = librosa.cqt_frequencies(S.shape[0], fmin=fmin,
                                        bins_per_octave=octave_bins)
        S = librosa.perceptual_weighting(S**2, freqs, ref=np.max)
    else:
        # amplitude_to_db(S, ref=max) == old logamplitude(S**2,
        # ref_power=max): both give 20*log10(S / S.max()).
        S = librosa.amplitude_to_db(S, ref=np.max)
    return S
#np.angle(D[f, t]) is the phase of frequency bin f at frame t #Rmq: ft = magnitude * phase Magnitude_l = np.abs(ft_left) Magnitude_r = np.abs(ft_left) #Phase = np.angle(ft_left) Power_l = Magnitude_l**2 Power_r = Magnitude_r**2 print(Power_l.shape) #Remove of the boucle ? fft_frequencies = librosa.fft_frequencies(sr=sr, n_fft=n_fft) print(fft_frequencies.shape) #Perceptual weighting of a power spectrogram pw_l = librosa.perceptual_weighting(S=Magnitude_l**2, frequencies=fft_frequencies) pw_r = librosa.perceptual_weighting(S=Magnitude_r**2, frequencies=fft_frequencies) #more option as power_to_db: ref=1.0, amin=1e-10, top_db=80.0 print(pw_l.shape) ms_l = librosa.feature.melspectrogram(S=pw_l, n_mels=256) ms_r = librosa.feature.melspectrogram(S=pw_r, n_mels=256) #by default n_mels=128 print(ms_l.shape) tranform = np.empty((2, 256, 431)) tranform[0] = ms_l tranform[1] = ms_r path_save = data_path + "\\" + save1_path + "\\"
def perceptual_cqt(y, sr):
    """A-weighted constant-Q power spectrogram starting at note A1.

    The A1 lower bound keeps the analysis range adapted to music.
    """
    fmin = librosa.note_to_hz('A1')
    magnitudes = np.abs(librosa.cqt(y, sr=sr, fmin=fmin))
    bin_freqs = librosa.cqt_frequencies(magnitudes.shape[0], fmin=fmin)
    # Weight the power spectrogram relative to its maximum.
    return librosa.perceptual_weighting(magnitudes**2, bin_freqs,
                                        ref=np.max)
def weighted_spectro(audio, sr):
    """Return the perceptually weighted CQT of ``audio``.

    Returns a pair: the A-weighted spectrogram in dB, and the same
    spectrogram converted back to the power domain.
    """
    magnitudes = np.abs(librosa.cqt(audio, sr=sr, fmin=cqt_fmin))
    bin_freqs = librosa.cqt_frequencies(magnitudes.shape[0], fmin=cqt_fmin)
    db_spec = librosa.perceptual_weighting(magnitudes**2, bin_freqs,
                                           ref=np.max)
    return db_spec, librosa.db_to_power(db_spec)
print(x.shape, sr) #x.shape = (276480,) sr = 22050 plt.figure(figsize=(14, 5)) librosa.display.waveplot(x, sr=sr) plt.show() X = librosa.stft(x) Xdb = librosa.amplitude_to_db(abs(X)) plt.figure(figsize=(14, 5)) librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log') plt.colorbar() plt.show() #Perceptual Weighting: freq = librosa.core.fft_frequencies(sr=sr) mag = librosa.perceptual_weighting(abs(X)**2, freq) librosa.display.specshow(mag, sr=sr, x_axis='time', y_axis='log') plt.colorbar() plt.show() r = librosa.autocorrelate(x, max_size=6000) sample = r[:300] plt.figure(figsize=(14, 5)) plt.plot(sample) plt.show() #Chroma Features sound_len = 400 chrom = librosa.feature.chroma_stft(x, sr=sr, hop_length=sound_len) plt.figure(figsize=(14, 5)) librosa.display.specshow(chrom,