def test_spectral_bandwidth_synthetic():
    """Yield checks that a single-bin spectrogram always has zero bandwidth.

    A spectrogram whose energy sits in exactly one frequency bin is fed to
    ``librosa.feature.spectral_bandwidth`` for every combination of
    ``norm`` in (False, True) and ``p`` in (1, 2), using four frequency
    layouts: implicit, explicit FFT frequencies, FFT frequencies scaled by 3,
    and random per-frame frequencies.
    """
    k = 5

    def __test(S, freq, sr, n_fft, norm, p):
        # sr and n_fft are accepted for a uniform call shape but not used.
        bandwidth = librosa.feature.spectral_bandwidth(S=S, freq=freq,
                                                       norm=norm, p=p)
        assert not np.any(bandwidth)

    srand()

    # Fake spectrogram: every frame has all of its energy in bin k.
    sr, n_fft = 22050, 1024
    S = np.zeros((1 + n_fft // 2, 10))
    S[k, :] = 1.0

    for norm in (False, True):
        for p in (1, 2):
            # Implicit (default) frequencies
            yield __test, S, None, sr, n_fft, norm, p

            # Explicit FFT bin frequencies
            explicit = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
            yield __test, S, explicit, sr, n_fft, norm, p

            # Rescaled frequencies
            rescaled = 3 * librosa.fft_frequencies(sr=sr, n_fft=n_fft)
            yield __test, S, rescaled, sr, n_fft, norm, p

            # Arbitrary frequencies, different for every frame
            per_frame = np.random.randn(*S.shape)
            yield __test, S, per_frame, sr, n_fft, norm, p
def test_spectral_bandwidth_onecol():
    """Yield regression checks for spectrograms with a single column.

    Guards against https://github.com/librosa/librosa/issues/552: the
    bandwidth of a one-frame spectrogram must come back with shape (1, 1)
    for implicit, explicit, rescaled and random frequency layouts.
    """
    def __test(S, freq):
        result = librosa.feature.spectral_bandwidth(S=S, freq=freq)
        assert result.shape == (1, 1)

    srand()

    # Fake one-frame spectrogram with all energy in bin k.
    sr, n_fft, k = 22050, 1024, 5
    S = np.zeros((1 + n_fft // 2, 1))
    S[k, :] = 1.0

    # Implicit (default) frequencies
    yield __test, S, None

    # Explicit FFT bin frequencies
    yield __test, S, librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # Rescaled frequencies
    yield __test, S, 3 * librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # Arbitrary frequencies for the single frame
    yield __test, S, np.random.randn(*S.shape)
def plot_stft(freq_domain_data, sr):
    """Interactively plot STFT magnitudes against their bin frequencies.

    Prints the bin frequencies (for ``n_fft=FRAME_SIZE``) and the shape of
    ``freq_domain_data``, then repeatedly plots the magnitude of the whole
    matrix and asks whether to continue. The loop stops when the user answers
    anything other than ``y``/``Y``, or after one pass per frequency bin.

    Args:
        freq_domain_data: 2-D array of STFT coefficients; its first axis is
            plotted against the bin frequencies.
        sr: sampling rate used to compute the bin frequencies.
    """
    # The frequency grid and the magnitudes do not change between passes, so
    # compute them once instead of on every iteration.
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=FRAME_SIZE)
    print('Frequencies: ', bin_freqs)
    print(freq_domain_data.shape)
    magnitudes = np.abs(freq_domain_data[:, :])

    # NOTE(review): the loop index is never used, so every pass draws the
    # same full matrix. Presumably one frame per pass was intended - confirm
    # before changing the plotted slice.
    for _ in range(bin_freqs.shape[0]):
        plt.plot(bin_freqs, magnitudes)
        plt.show()
        choice = input('Continue?(Y/N):')
        if choice not in ('y', 'Y'):
            break
def plot_melfilters(sr=16000, n_fft=512, n_mels=10, fmin=0, fmax=None):
    """Plot a mel filter bank with and without ``norm=1`` normalization.

    Two stacked axes are drawn: the top one shows the filters built with
    ``norm=None``, the bottom one the filters built with ``norm=1``. Both are
    plotted against the FFT bin frequencies.

    Args:
        sr: sampling rate passed to the filter bank and frequency grid.
        n_fft: FFT size passed to the filter bank and frequency grid.
        n_mels: number of mel filters.
        fmin: lower frequency bound passed to ``librosa.filters.mel``.
        fmax: upper frequency bound passed to ``librosa.filters.mel``.

    Returns:
        The matplotlib figure holding both axes.
    """
    bank_args = dict(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
    raw_bank = librosa.filters.mel(norm=None, **bank_args)
    normalized_bank = librosa.filters.mel(norm=1, **bank_args)
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    fig, axes = plt.subplots(2, figsize=(16, 8))
    base_ax, norm_ax = axes

    # Top panel: un-normalized filters, x tick marks and labels hidden.
    base_ax.plot(bin_freqs, raw_bank.T)
    base_ax.set_title('Mel filters')
    base_ax.tick_params(axis='x', which='both', bottom=False, top=False,
                        labelbottom=False)

    # Bottom panel: normalized filters, carries the x-axis label.
    norm_ax.plot(bin_freqs, normalized_bank.T)
    norm_ax.set_title('Mel filters normalized (area=1)')
    norm_ax.set_xlabel('Frequency (Hz)')

    return fig
def test_salience_multi_static(s_multi, filter_peaks):
    """Check that multi-channel salience equals per-channel salience.

    ``librosa.salience`` is run once on the whole input and once on each of
    its first two channels; the stacked result must match the individual
    results, and the two channels must differ from each other.
    """
    S, sr = s_multi
    freqs = librosa.fft_frequencies(sr=sr)

    def _salience(spec):
        # Identical settings for every call so only the input differs.
        return librosa.salience(
            spec,
            freqs=freqs,
            harmonics=[0.5, 1, 2, 3],
            kind="slinear",
            filter_peaks=filter_peaks,
            fill_value=0,
        )

    sal_all = _salience(S)
    sal_0 = _salience(S[0])
    sal_1 = _salience(S[1])

    assert np.allclose(sal_all[0], sal_0)
    assert np.allclose(sal_all[1], sal_1)
    assert not np.allclose(sal_0, sal_1)
def get_spectrogram(self):
    """
    Build the processed spectrogram derived from ``self.stft``.

    Steps, in order:
      1. ``self.stft`` is converted to dB relative to its maximum.
      2. The dB spectrogram is smoothed with ``librosa.decompose.nn_filter``
         using a mean aggregate.
      3. The frequency axis is resampled with linear 2-D interpolation at
         ``self.tempo_final.shape[0]`` log-spaced positions; the time axis is
         left as is.
      4. The result is min-max normalized and multiplied by
         ``self.spectr_mult``.

    :return: Manipulated Spectrogram
    """
    db_spec = librosa.amplitude_to_db(self.stft, ref=np.max)
    smoothed = librosa.decompose.nn_filter(db_spec, aggregate=np.mean)

    # Log-spaced target frequencies from 10**0 up to the highest FFT bin.
    bin_freqs = librosa.fft_frequencies(sr=self.sample_rate)
    top_exponent = np.log(bin_freqs[-1]) / np.log(10)
    target_freqs = np.logspace(start=0, stop=top_exponent,
                               num=self.tempo_final.shape[0], endpoint=True)

    # Map the target frequencies linearly onto row positions of the
    # spectrogram (0 .. number of rows).
    lowest = np.min(target_freqs)
    highest = np.max(target_freqs)
    row_positions = (target_freqs - lowest) / (highest - lowest) \
        * smoothed.shape[0]

    rows = np.arange(0, smoothed.shape[0])
    cols = np.arange(0, smoothed.shape[1])
    # NOTE(review): scipy's interp2d is deprecated in recent SciPy releases;
    # confirm the pinned SciPy version before upgrading.
    resampler = interpolate.interp2d(cols, rows, smoothed, kind='linear')
    resampled = resampler(cols, row_positions)

    floor = np.min(resampled)
    normalized = (resampled - floor) / (np.max(resampled) - floor)
    return normalized * self.spectr_mult
def test_spectral_rolloff_synthetic():
    """Yield roll-off checks on a flat (all-ones) spectrogram.

    For a flat spectrum the roll-off frequency is expected at the bin whose
    index is ``floor(pct * n_bins)``. The check is run for three roll
    percentages with implicit, explicit and time-varying frequencies.
    """
    srand()

    sr = 22050
    n_fft = 2048

    def __test(S, freq, pct):
        measured = librosa.feature.spectral_rolloff(S=S, sr=sr, freq=freq,
                                                    roll_percent=pct)
        # Fall back to the default FFT grid when no frequencies were given.
        grid = librosa.fft_frequencies(sr=sr, n_fft=n_fft) if freq is None else freq
        idx = np.floor(pct * grid.shape[0]).astype(int)
        assert np.allclose(measured, grid[idx])

    S = np.ones((1 + n_fft // 2, 10))

    for pct in (0.25, 0.5, 0.95):
        # Implicit frequencies
        yield __test, S, None, pct

        # Explicit frequencies
        yield __test, S, librosa.fft_frequencies(sr=sr, n_fft=n_fft), pct

        # Time-varying, monotonically increasing frequencies
        yield __test, S, np.cumsum(np.abs(np.random.randn(*S.shape)), axis=0), pct
def phase_angle_inc_generator(_fftFrameSize, _hop_length, sampleRate):
    """Return the phase increment per hop for every FFT bin.

    For each bin centre frequency the period in samples is computed, the hop
    length is expressed as a fraction of that period, and the fraction is
    scaled by 2*pi. The first bin (0 Hz) is given an increment of 0.

    Args:
        _fftFrameSize: FFT frame size used to derive the bin frequencies.
        _hop_length: hop length in samples.
        sampleRate: sampling rate.

    Returns:
        Array with one phase increment per FFT bin.
    """
    bin_freqs = librosa.fft_frequencies(sr=sampleRate, n_fft=_fftFrameSize)

    # The first bin is 0 Hz; use a placeholder so the division below does not
    # divide by zero. Its final value is overwritten at the end.
    bin_freqs[0] = 1
    period_in_samples = sampleRate / bin_freqs
    period_in_samples[0] = _fftFrameSize

    # Fraction of a period covered by one hop, expressed as an angle.
    hop_phase_inc = (_hop_length / period_in_samples) * 2 * np.pi
    hop_phase_inc[0] = 0.0

    return hop_phase_inc
def test_spectral_centroid_synthetic():
    """Yield checks that a single-bin spectrum has its centroid at that bin.

    A spectrogram with all energy in bin ``k`` is tested with implicit
    frequencies, explicit FFT frequencies, those frequencies scaled by 3, and
    random per-frame frequencies. In each case the centroid must equal the
    frequency assigned to bin ``k``.
    """
    k = 5

    def __test(S, freq, sr, n_fft):
        cent = librosa.feature.spectral_centroid(S=S, freq=freq)

        if freq is None:
            freq = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

        assert np.allclose(cent, freq[k])

    srand()

    # construct a fake spectrogram
    sr = 22050
    n_fft = 1024
    S = np.zeros((1 + n_fft // 2, 10))
    S[k, :] = 1.0

    yield __test, S, None, sr, n_fft

    freq = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    yield __test, S, freq, sr, n_fft

    # And if we modify the frequencies.
    # Build a new array rather than scaling in place: the previous case was
    # yielded with the same array object, and a runner that collects every
    # case before executing would otherwise see it silently rescaled.
    freq = 3 * freq
    yield __test, S, freq, sr, n_fft

    # Or if we make up random frequencies for each frame
    freq = np.random.randn(*S.shape)
    yield __test, S, freq, sr, n_fft
def rms_energy_infra(filenames, *, frame_length: int = 500, hop_length: int = None, threshold: int = 20):
    """Compute per-frame RMS energy of the low-frequency band of audio files.

    Each file is loaded at its native sampling rate, transformed with an STFT
    and every bin whose frequency is above ``threshold`` is zeroed before the
    RMS is computed from the remaining magnitudes.

    Args:
        filenames: iterable of audio file paths.
        frame_length: analysis window length in milliseconds.
        hop_length: hop between windows in milliseconds; defaults to a
            quarter of ``frame_length``.
        threshold: bins with a frequency above this value are discarded.

    Returns:
        A list with one ``float32`` array of shape ``(n_frames, 1)`` per file.
    """
    import librosa

    if hop_length is None:
        hop_length = frame_length / 4

    res = []
    for fn in filenames:
        # ``sr`` is passed by keyword: it is keyword-only in recent librosa
        # releases, and the keyword form also works on older ones.
        y, fs = librosa.load(fn, sr=None)

        # Convert the millisecond settings to sample counts for this file.
        frame_length_used = int(fs * frame_length / 1000)
        hop_length_used = int(fs * hop_length / 1000)

        S, _ = librosa.magphase(
            librosa.stft(y, n_fft=frame_length_used, hop_length=hop_length_used))

        # Keep only the bins at or below the threshold frequency.
        freqs = librosa.fft_frequencies(sr=fs, n_fft=frame_length_used)
        S[freqs > threshold, :] = 0

        rms = librosa.feature.rms(S=S, frame_length=frame_length_used,
                                  hop_length=hop_length_used)
        res.append(rms.reshape(-1, 1).astype('float32'))
    return res
def __test(S, freq, sr, n_fft):
    """Assert the spectral centroid of ``S`` equals the frequency of bin ``k``.

    ``k`` is taken from the enclosing scope. When ``freq`` is None the
    default FFT frequencies for ``sr``/``n_fft`` are used as the reference.
    """
    centroid = librosa.feature.spectral_centroid(S=S, freq=freq)
    reference = librosa.fft_frequencies(sr=sr, n_fft=n_fft) if freq is None else freq
    assert np.allclose(centroid, reference[k])
def __call__(self, sample):
    """Resynthesize the audio of ``sample`` from bins near its harmonics.

    The waveform in ``sample['wav']`` is resampled to 22050 Hz and
    transformed with an STFT. For each frame the reference frequency from
    ``sample['gt']`` is interpolated, and for each of ``self.n_harmonics``
    multiples of it the closest FFT bin is copied into an otherwise empty
    spectrum if it lies within a quarter tone of the target. The filtered
    spectrum is inverted and divided by its maximum sample.

    Args:
        sample: dict with ``'wav'`` -> ``(y, sr)`` and ``'gt'`` ->
            ``(ref_times, ref_freqs)``.

    Returns:
        The same dict, with the result stored under ``'wav'`` or, when set,
        under ``self.new_key``.
    """
    y, sr = sample['wav']
    target_sr = 22050
    # librosa arguments are passed by keyword throughout: recent releases
    # made them keyword-only, and the keyword names work on older ones too.
    y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    sr = target_sr

    ref_times, ref_freqs = sample['gt']
    fft_f = librosa.fft_frequencies(sr=sr, n_fft=self.n_fft)
    ref_frames = librosa.time_to_frames(ref_times, sr=sr,
                                        hop_length=self.hop_length,
                                        n_fft=self.n_fft)
    # Frames outside the annotated range get a frequency of 0.
    f_interp = interp1d(ref_frames, ref_freqs, fill_value=0.0, bounds_error=False)

    fft = librosa.stft(y, n_fft=self.n_fft, hop_length=self.hop_length)
    harmonic_fft = np.zeros(fft.shape, dtype=fft.dtype)

    # Relative width of a quarter tone; constant, so computed once.
    tolerance = 2 ** (1 / 24) - 1

    for frame in range(fft.shape[1]):
        freq = f_interp(frame)
        for i in range(self.n_harmonics):
            target = freq * (i + 1)
            idx = np.argmin(np.abs(fft_f - target))
            if np.abs(fft_f[idx] - target) < fft_f[idx] * tolerance:
                harmonic_fft[idx, frame] = fft[idx, frame]
            else:
                # NOTE(review): this clears the *source* frame, so later
                # harmonics of the same frame copy zeros while earlier copies
                # are kept. Kept as in the original - confirm the intent.
                fft[:, frame] = 0

    y = librosa.istft(harmonic_fft, hop_length=self.hop_length)
    # np.max gives the same value as the builtin without a Python-level loop.
    y = y / np.max(y)

    if self.new_key is None:
        sample['wav'] = y, sr
    else:
        sample[self.new_key] = y, sr
    return sample
def freq_slice(fmin, fmax, sr, n_fft):
    '''Calculate the slice needed to select a frequency band.

    The slice covers every FFT bin whose frequency ``f`` satisfies
    ``fmin <= f < fmax``. A falsy bound (``None`` or ``0``) leaves that side
    of the slice open.

    Arguments:
        fmin, fmax (int): the frequency bounds
        sr (int): the sample rate
        n_fft (int): the fft size

    Returns:
        slice(i[fmin], i[fmax])

    Raises:
        ParameterError: if ``sr`` or ``n_fft`` is not set, ``fmin`` is
            negative, or ``fmax`` is above the Nyquist frequency.
    '''
    if not sr or not n_fft:
        raise ParameterError("You must set a sr=({}) and n_fft=({})".format(
            sr, n_fft))

    if fmin and fmin < 0:
        raise ParameterError("fmin={} must be nonnegative".format(fmin))

    nyquist = sr / 2
    if fmax and fmax > nyquist:
        # Report the limit that was actually checked (sr / 2, not sr).
        raise ParameterError(
            "fmax={} must be smaller than nyquist, f={}".format(fmax, nyquist))

    fft_frequencies = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # The bin frequencies are ascending, so searchsorted(side='left') returns
    # the index of the first bin >= the bound. For fmax this is the exclusive
    # stop that keeps every bin below fmax; using the index of the last bin
    # below fmax as the stop would drop that bin from the slice.
    bin_start = int(np.searchsorted(fft_frequencies, fmin, side='left')) if fmin else None
    bin_stop = int(np.searchsorted(fft_frequencies, fmax, side='left')) if fmax else None
    return slice(bin_start, bin_stop)
def old_stuff():
    """backup: old lingering stuff

    Loads an F0 track with ``get_f0_series``, draws the log-frequency dB
    spectrogram of the audio, overlays the F0 curve together with the curves
    scaled by ``FIFTY_CENTS_BWD`` and ``FIFTY_CENTS_FWD``, and saves the
    figure to ``plot.png``.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import librosa.display

    y, f0 = get_f0_series('/home/rafa/dev/sound/440-10-partials/440-10-partials.wav')

    # Overlay F0 over a spectrogram
    amplitude = np.abs(librosa.stft(y))
    spectrum = librosa.amplitude_to_db(amplitude, ref=np.max)

    fig, ax = plt.subplots()
    img = librosa.display.specshow(spectrum, x_axis='time', y_axis='log', ax=ax)
    ax.set(title='pYIN fundamental frequency estimation')
    fig.colorbar(img, ax=ax, format="%+2.f dB")

    times = librosa.times_like(f0)
    ax.plot(times, f0*FIFTY_CENTS_BWD, label='bwd', color='red', linewidth=1)
    ax.plot(times, f0, label='f0', color='cyan', linewidth=1)
    ax.plot(times, f0*FIFTY_CENTS_FWD, label='fwd', color='red', linewidth=1)
    ax.legend(loc='upper right')

    fig.savefig('plot.png')
def compute_loudness(audio, sample_rate=16000, frame_rate=50, n_fft=2048, range_db=120.0, ref_db=20.7):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    Args:
        audio: tensor. Shape [batch_size, audio_length] or [audio_length].
        sample_rate: Audio sample rate in Hz.
        frame_rate: Rate of loudness frames in Hz.
        n_fft: Fft window size.
        range_db: Sets the dynamic range of loudness in decibels. The minimum
            loudness (per a frequency bin) corresponds to -range_db.
        ref_db: Sets the reference maximum perceptual loudness as given by
            (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default
            value corresponds to white noise with amplitude=1.0 and
            n_fft=2048. There is a slight dependence on fft_size due to
            different granularity of perceptual weighting.

    Returns:
        Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    # Work on a batch; remember whether the batch axis has to be dropped again.
    is_1d = (len(audio.shape) == 1)
    if is_1d:
        audio = audio[None, :]

    # STFT, laid out as (batch, frequency_bins, n_frames, ...).
    hop_length = sample_rate // frame_rate
    s = torch.stft(audio, n_fft=n_fft, hop_length=hop_length)

    # Level of each bin in dB. The epsilon inside the sqrt avoids the nan
    # gradient of sqrt(0).
    amplitude = torch.sqrt(amp(s) + 1e-5)
    power_db = torch.log10(amplitude + 1e-5) * 20.0

    # Add the A-weighting curve, broadcast over batch and frames.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[None, :, None]
    weighting = torch.from_numpy(a_weighting.astype(np.float32)).to(audio.device)
    loudness = power_db + weighting

    # Shift to the reference level and limit the dynamic range.
    loudness = loudness - ref_db
    loudness = torch.clamp(loudness, min=-range_db)

    # Average over frequency bins.
    loudness = torch.mean(loudness, dim=1)

    # Drop the temporary batch axis.
    if is_1d:
        loudness = loudness[0]

    # Expected number of loudness frames; `n_secs` can be fractional.
    n_secs = audio.shape[-1] / float(sample_rate)
    expected_len = int(n_secs * frame_rate)

    # Pad with the `-range_db` noise floor or trim to the expected length.
    loudness = pad_or_trim_to_expected_length(loudness, expected_len, -range_db)
    return loudness
def filterbank_log(sr, n_freq, n_bins=84, bins_per_octave=12, fmin=None, spread=0.125):  # pragma: no cover
    """[np] Approximate a constant-Q filter bank for a fixed-window STFT.

    Each filter is a log-normal window centered at the corresponding frequency.

    Note: `logfrequency` in librosa 0.4 (deprecated), so copy-and-pasted,
        `tuning` was removed, `n_freq` instead of `n_fft`.

    Parameters
    ----------
    sr : number > 0 [scalar]
        audio sampling rate

    n_freq : int > 0 [scalar]
        number of frequency bins

    n_bins : int > 0 [scalar]
        Number of bins. Defaults to 84 (7 octaves).

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave. Defaults to 12 (semitones).

    fmin : float > 0 [scalar]
        Minimum frequency bin. Defaults to `C1 ~= 32.70`

    spread : float > 0 [scalar]
        Spread of each filter, as a fraction of a bin.

    Returns
    -------
    C : np.ndarray [shape=(n_bins, n_freq)]
        log-frequency filter bank, cast to ``K.floatx()``.
    """
    if fmin is None:
        fmin = 32.70319566

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave

    # Construct the output matrix
    basis = np.zeros((n_bins, n_freq))

    # Get log frequencies of bins. The 0 Hz bin is skipped because its log is
    # undefined; column 0 of the basis therefore stays zero.
    # Arguments are passed by keyword: they are keyword-only in recent
    # librosa releases.
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=(n_freq - 1) * 2)
    log_freqs = np.log2(bin_freqs[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = fmin * (2.0 ** (float(i) / bins_per_octave))

        # Place a log-normal window around c_freq
        basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(c_freq)) / sigma) ** 2 -
                              np.log2(sigma) - log_freqs)

    # Normalize the filters
    basis = librosa.util.normalize(basis, norm=1, axis=1)

    return basis.astype(K.floatx())
def test_spectral_centroid_synthetic(S_ideal, freq):
    """Check that the centroid of ``S_ideal`` sits at the frequency of bin 5.

    When ``freq`` is None, the reference grid is the FFT frequencies at
    22050 Hz for the FFT size implied by the number of rows of ``S_ideal``.
    """
    n_bins = S_ideal.shape[0]
    implied_n_fft = 2 * (n_bins - 1)

    centroid = librosa.feature.spectral_centroid(S=S_ideal, freq=freq)

    reference = freq
    if reference is None:
        reference = librosa.fft_frequencies(sr=22050, n_fft=implied_n_fft)

    assert np.allclose(centroid, reference[5])
def ambix_power_map_freq(audio, audio_rate, freq_lims):
    """Compute an ambisonics power map restricted to a frequency band.

    Every channel of ``audio`` is band-limited by zeroing, in the STFT
    domain, all bins whose frequency is not strictly between
    ``freq_lims[0]`` and ``freq_lims[1]``. The filtered channels are then
    passed to ``ambix_power_map``.

    Args:
        audio: iterable of 1-D channel signals.
        audio_rate: sampling rate of ``audio``.
        freq_lims: pair ``(low, high)`` of exclusive band limits.

    Returns:
        The result of ``ambix_power_map`` on the band-limited audio.
    """
    import librosa

    # The band mask depends only on the sampling rate and the limits, so it
    # is built once instead of once per channel. ``sr`` is passed by keyword
    # because it is keyword-only in recent librosa releases.
    bin_freqs = librosa.fft_frequencies(sr=audio_rate)
    mask = (bin_freqs > freq_lims[0]).astype(float) * \
           (bin_freqs < freq_lims[1]).astype(float)
    mask = mask[:, np.newaxis]

    # Filter out frequencies
    audio_masked = []
    for a in audio:
        spect = librosa.core.stft(a)
        audio_masked.append(librosa.core.istft(spect * mask))
    audio_masked = np.stack(audio_masked, 0)

    # Compute source map
    audio_maps = ambix_power_map(audio_masked, audio_rate=audio_rate, outp_rate=5, angular_res=5.)
    return audio_maps
def __init__(self, sr, n_fft, min_db):
    """Store ``min_db`` and precompute the A-weighting curve.

    The A-weighting is evaluated at the FFT bin frequencies for ``sr`` and
    ``n_fft`` and kept as a non-trainable parameter ``self.a_weighting``.
    """
    super().__init__()
    self.min_db = min_db

    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    # A tiny offset is added to the frequencies before weighting, so the
    # first bin is not evaluated at exactly 0.
    weights = librosa.A_weighting(bin_freqs + 1e-10)

    self.a_weighting = torch.nn.Parameter(
        data=torch.from_numpy(weights),
        requires_grad=False,
    )
def relacion_fundamental_harmonicos(file):
    """Return a ratio of harmonic energies for an audio file.

    The magnitude STFT of the file is interpolated at harmonics 1 to 10 of
    every bin frequency. The result is the summed energy of entry 1 of that
    stack divided by the summed energy of entries 2 and above.

    NOTE(review): entry 0 of the stack corresponds to harmonic 1 and is not
    used here; presumably the fundamental was meant as the numerator -
    confirm before changing the indices.
    """
    y, sr = librosa.load(file)

    harmonics = list(range(1, 11))
    magnitude = np.abs(librosa.stft(y))
    bin_freqs = librosa.fft_frequencies(sr=sr)
    harmonic_stack = librosa.interp_harmonics(magnitude, bin_freqs, harmonics, axis=0)

    numerator = np.sum(harmonic_stack[1])
    denominator = np.sum(harmonic_stack[2:])
    return numerator / denominator
def filterbank_log(sample_rate, n_freq, n_bins=84, bins_per_octave=12, f_min=None, spread=0.125):
    """A function that returns a approximation of constant-Q filter banks for a fixed-window STFT.
    Each filter is a log-normal window centered at the corresponding frequency.

    Args:
        sample_rate (`int`): audio sampling rate
        n_freq (`int`): number of the input frequency bins. E.g., `n_fft / 2 + 1`
        n_bins (`int`): number of the resulting log-frequency bins. Defaults to 84 (7 octaves).
        bins_per_octave (`int`): number of bins per octave. Defaults to 12 (semitones).
        f_min (`float`): lowest frequency that is going to be included in the log filterbank. Defaults to `C1 ~= 32.70`
        spread (`float`): spread of each filter, as a fraction of a bin.

    Returns:
        (`Tensor`): log-frequency filterbanks. Shape=`(n_freq, n_bins)`

    Raises:
        RuntimeError: if the top of the filter bank,
            `f_min * 2 ** (n_bins / bins_per_octave)`, exceeds `sample_rate // 2`.

    Note:
        The code is originally from `logfrequency` in librosa 0.4 (deprecated) and copy-and-pasted.
        `tuning` parameter was removed and we use `n_freq` instead of `n_fft`.
    """
    if f_min is None:
        f_min = 32.70319566

    f_max = f_min * 2 ** (n_bins / bins_per_octave)
    max_input_freq = sample_rate // 2
    if f_max > max_input_freq:
        # The message reports the limit that was actually compared against,
        # and the adjacent literals carry the spaces they used to be missing.
        raise RuntimeError(
            'Maximum frequency of log filterbank should be lower or equal to the maximum '
            'frequency of the input (defined by its sample rate), '
            'but f_max=%f and maximum frequency is %f. \n'
            'Fix it by reducing n_bins, increasing bins_per_octave and/or reducing f_min.\n'
            'You can also do it by increasing sample_rate but it means you need to upsample '
            'the input audio data, too.' % (f_max, max_input_freq)
        )

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave

    # Construct the output matrix
    basis = np.zeros((n_bins, n_freq))

    # Get log frequencies of bins. The 0 Hz bin is skipped (its log is
    # undefined), so column 0 of the basis stays zero. Arguments are passed
    # by keyword: they are keyword-only in recent librosa releases.
    bin_freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=(n_freq - 1) * 2)
    log_freqs = np.log2(bin_freqs[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = f_min * (2.0 ** (float(i) / bins_per_octave))

        # Place a log-normal window around c_freq
        basis[i, 1:] = np.exp(
            -0.5 * ((log_freqs - np.log2(c_freq)) / sigma) ** 2 - np.log2(sigma) - log_freqs
        )

    # Normalize the filters
    basis = librosa.util.normalize(basis, norm=1, axis=1)
    basis = basis.astype(K.floatx())

    return tf.convert_to_tensor(basis.T)
def extract_n_freq(N, audio, sample_rate=44100, n_fft=2048, use_stft=False):
    """Return the strongest frequency components of ``audio`` above 40 Hz.

    The audio is passed through ``_normalize`` and ``_clean_input``, its
    spectrum is obtained with ``_get_fft_data``, and ``_get_n_top`` selects
    ``N`` (frequency, amplitude) pairs using the FFT bin frequencies. Pairs
    at or below 40 Hz are dropped and the rest go through
    ``_normalize_N_top``.

    Args:
        N: number of components requested from ``_get_n_top``.
        audio: input signal.
        sample_rate: sampling rate used for the bin-to-frequency mapping.
        n_fft: FFT size used for the bin-to-frequency mapping.
        use_stft: accepted for interface compatibility; not used here.

    Returns:
        The output of ``_normalize_N_top`` for the retained pairs.
    """
    audio = _normalize(audio)
    audio = _clean_input(audio)
    fft_data = _get_fft_data(audio)

    # Keyword arguments: sr/n_fft are keyword-only in recent librosa releases.
    bin_to_freq = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)

    top_N = _get_n_top(fft_data, N, bin_to_freq)
    top_N = [(freq, amp) for freq, amp in top_N if freq > 40]
    return _normalize_N_top(top_N)
def poly_S(poly_coeffs, poly_freq):
    """Evaluate a polynomial over a frequency grid.

    Computes ``sum(c_i * poly_freq ** i)`` for the coefficients in
    ``poly_coeffs`` (lowest order first) and returns it with shape
    ``(poly_freq.shape[0], -1)``.

    Args:
        poly_coeffs: polynomial coefficients, constant term first.
        poly_freq: frequency array, or None to use the default
            ``librosa.fft_frequencies()`` grid.

    Returns:
        2-D array of polynomial values.
    """
    grid = librosa.fft_frequencies() if poly_freq is None else poly_freq

    # Accumulate in place so the result keeps the dtype of the grid.
    values = np.zeros_like(grid)
    for power, coeff in enumerate(poly_coeffs):
        values += coeff * grid**power

    n_rows = grid.shape[0]
    return values.reshape((n_rows, -1))
def __test(S, freq, pct):
    """Assert the roll-off of ``S`` lands on bin ``floor(pct * n_bins)``.

    ``sr`` and ``n_fft`` come from the enclosing scope. When ``freq`` is
    None the default FFT frequencies are used as the reference grid.
    """
    measured = librosa.feature.spectral_rolloff(S=S, sr=sr, freq=freq,
                                                roll_percent=pct)
    grid = librosa.fft_frequencies(sr=sr, n_fft=n_fft) if freq is None else freq
    idx = np.floor(pct * grid.shape[0]).astype(int)
    assert np.allclose(measured, grid[idx])
def perceptual_weights():
    """A-weighted frequency-dependent perceptual loudness weights

    The A-weighting is evaluated at the FFT bin frequencies for
    ``torchcrepe.SAMPLE_RATE`` and ``torchcrepe.WINDOW_SIZE``, returned as a
    column (one row per bin) with ``REF_DB`` subtracted.
    """
    frequencies = librosa.fft_frequencies(sr=torchcrepe.SAMPLE_RATE,
                                          n_fft=torchcrepe.WINDOW_SIZE)

    # librosa emits a RuntimeWarning for nearly inaudible frequencies. Per
    # the original author's note the affected values fall back to a floor
    # that is acceptable here, so the warning is silenced.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        weights = librosa.A_weighting(frequencies)
        return weights[:, None] - REF_DB
def test_freq_ticks():
    """Yield checks for ``librosa.display.frequency_ticks``.

    For several sampling rates the tick helper is called with and without
    explicit frequencies, for different tick counts and both axes. Each case
    checks the number of ticks and labels, that the reported unit matches the
    largest frequency, and that the ticks are of the axis' tick class. An
    invalid axis value must raise ``librosa.ParameterError``.
    """
    def __test(locs, freqs, n_ticks, axis):
        use_locs_only = freqs is None
        args = [locs] if use_locs_only else [locs, freqs]
        fmax = max(locs) if use_locs_only else max(freqs)

        fig = plt.figure()
        (ticks, labels), fmt = librosa.display.frequency_ticks(*args,
                                                               axis=axis,
                                                               n_ticks=n_ticks)
        plt.close(fig)

        expected_count = len(locs) if n_ticks is None else n_ticks
        eq_(len(ticks), expected_count)
        eq_(len(labels), expected_count)

        # Largest frequency allowed for each unit below GHz.
        unit_ceiling = {'mHz': 1e1, 'Hz': 1e4, 'kHz': 1e7, 'MHz': 1e10}
        if fmt in unit_ceiling:
            assert fmax <= unit_ceiling[fmt]
        elif fmt == 'GHz':
            assert fmax > 1e10
        else:
            raise ValueError('Incorrect fmt={}'.format(fmt))

        if axis == 'x':
            cls = matplotlib.axis.XTick
        elif axis == 'y':
            cls = matplotlib.axis.YTick
        else:
            raise ValueError('Incorrect axis={}'.format(axis))

        assert all(isinstance(tick, cls) for tick in ticks)

    for sr in (1e-3, 1e1, 1e3, 1e5, 1e8, 1e12):
        locs = librosa.fft_frequencies(sr=sr, n_fft=32)

        for freqs in (None, locs):
            for n_ticks in (3, 5, None):
                for axis in ('x', 'y'):
                    yield __test, locs, freqs, n_ticks, axis

                # An axis that is neither 'x' nor 'y' must be rejected.
                yield raises(librosa.ParameterError)(__test), locs, freqs, n_ticks, 23
def compute_loudness(self, n_fft=256):
    """Compute a per-frame perceptual loudness curve of ``self.audio_raw``.

    The power spectrogram of the audio is perceptually weighted with
    ``librosa.perceptual_weighting`` and summed over frequency. The result is
    stored in ``self.loudness``; ``self.n_points`` (a quarter of ``n_fft``)
    and ``self.n_windows`` (audio length divided by ``n_points``, rounded
    up) are stored and printed as well.

    Args:
        n_fft: FFT size used for the STFT and the bin frequencies.
    """
    fourier = librosa.stft(self.audio_raw, n_fft=n_fft)

    # perceptual_weighting expects a *power* spectrogram, i.e. |X|**2.
    # The spectrum must be squared only once; squaring the power again
    # would hand over |X|**4 and double every dB value.
    power = np.abs(fourier) ** 2

    # NOTE(review): the bin frequencies use librosa's default sampling rate
    # because none is passed - confirm it matches self.audio_raw.
    log_S = librosa.perceptual_weighting(
        power, librosa.fft_frequencies(n_fft=n_fft))

    self.loudness = log_S.sum(axis=0)

    self.n_points = int(n_fft / 4)
    self.n_windows = int(np.ceil(len(self.audio_raw) / self.n_points))

    print('processing windows: {}'.format(self.n_windows))
    print('points per windows: {}'.format(self.n_points))
def __extract_frame_features_single_file(wav_file):
    """Extract a stacked matrix of frame-level features from one audio file.

    The rows are, in order: MFCCs, MFCC deltas, zero-crossing rate, RMS
    energy, spectral centroid, spectral bandwidth (p=2), spectral flatness
    and spectral roll-off (85%).

    Args:
        wav_file: path of the audio file; loaded at ``SR`` as float32.

    Returns:
        Array of shape (num_features, t), with t the number of frames.
    """
    y, sr = librosa.load(str(wav_file), sr=SR, dtype=np.float32)
    stft_matrix = librosa.stft(y, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH,
                               window='hann')
    mag, phase = librosa.magphase(stft_matrix, power=1)
    freq = librosa.fft_frequencies(sr=SR, n_fft=FRAME_LENGTH)

    # NOTE(review): fmax=SR lies above SR / 2 - confirm this is intended.
    mel_s = librosa.feature.melspectrogram(S=mag**2, n_mels=40, fmax=SR)

    # MFCC, shape: (N_MFCC, t)
    mfcc_feat = librosa.feature.mfcc(S=librosa.power_to_db(mel_s), n_mfcc=N_MFCC)

    features = [
        mfcc_feat,
        # MFCC delta1, shape: (N_MFCC, t)
        librosa.feature.delta(mfcc_feat),
        # zero crossing rate, shape: (1, t)
        librosa.feature.zero_crossing_rate(y, FRAME_LENGTH, HOP_LENGTH),
        # RMS energy, shape: (1, t)
        # NOTE(review): `rmse` only exists in older librosa releases.
        librosa.feature.rmse(S=mag),
        # spectral centroid, shape: (1, t)
        librosa.feature.spectral_centroid(S=mag, freq=freq),
        # spectral bandwidth, shape: (1, t)
        librosa.feature.spectral_bandwidth(S=mag, freq=freq, p=2),
        # spectral flatness, shape: (1, t)
        librosa.feature.spectral_flatness(S=mag),
        # spectral rolloff, shape: (1, t)
        librosa.feature.spectral_rolloff(S=mag, freq=freq, roll_percent=0.85),
    ]

    # (num_features, t)
    return np.concatenate(features, axis=0)
def test_freq_ticks():
    """Yield checks for ``librosa.display.frequency_ticks``.

    For several sampling rates the tick helper is called with and without
    explicit frequencies, for different tick counts and both axes. Each case
    checks the number of ticks and labels, that the reported unit matches the
    largest frequency, and that the ticks are of the axis' tick class. An
    invalid axis value must raise ``librosa.ParameterError``.
    """
    def __test(locs, freqs, n_ticks, axis):
        if freqs is None:
            args = [locs]
            fmax = max(locs)
        else:
            args = [locs, freqs]
            fmax = max(freqs)

        # Every case opens a figure; close it even when frequency_ticks
        # raises (the invalid-axis case), otherwise figures pile up over the
        # many generated cases.
        fig = plt.figure()
        try:
            (ticks, labels), fmt = librosa.display.frequency_ticks(*args,
                                                                   axis=axis,
                                                                   n_ticks=n_ticks)
        finally:
            plt.close(fig)

        if n_ticks is None:
            n_ticks = len(locs)

        eq_(len(ticks), n_ticks)
        eq_(len(labels), n_ticks)

        if fmt == 'mHz':
            assert fmax <= 1e1
        elif fmt == 'Hz':
            assert fmax <= 1e4
        elif fmt == 'kHz':
            assert fmax <= 1e7
        elif fmt == 'MHz':
            assert fmax <= 1e10
        elif fmt == 'GHz':
            assert fmax > 1e10
        else:
            raise ValueError('Incorrect fmt={}'.format(fmt))

        if axis == 'x':
            cls = matplotlib.axis.XTick
        elif axis == 'y':
            cls = matplotlib.axis.YTick
        else:
            raise ValueError('Incorrect axis={}'.format(axis))

        assert all([isinstance(_, cls) for _ in ticks])

    for sr in [1e-3, 1e1, 1e3, 1e5, 1e8, 1e12]:
        locs = librosa.fft_frequencies(sr=sr, n_fft=32)

        for freqs in [None, locs]:
            for n_ticks in [3, 5, None]:
                for axis in ['x', 'y']:
                    yield __test, locs, freqs, n_ticks, axis

                # An axis that is neither 'x' nor 'y' must be rejected.
                yield raises(librosa.ParameterError)(__test), locs, freqs, n_ticks, 23
def test_spectral_rolloff_synthetic(S, freq, pct):
    """Check the roll-off of ``S`` against bin ``floor(pct * n_bins)``.

    When ``freq`` is None the reference grid is the FFT frequencies at
    22050 Hz for the FFT size implied by the number of rows of ``S``.
    """
    sr = 22050
    measured = librosa.feature.spectral_rolloff(S=S, sr=sr, freq=freq,
                                                roll_percent=pct)

    implied_n_fft = 2 * (S.shape[0] - 1)
    grid = freq
    if grid is None:
        grid = librosa.fft_frequencies(sr=sr, n_fft=implied_n_fft)

    idx = np.floor(pct * grid.shape[0]).astype(int)
    assert np.allclose(measured, grid[idx])
def __test(sr, n_fft):
    """Check the end points and spacing of ``librosa.fft_frequencies``."""
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    first_bin, last_bin = freqs[0], freqs[-1]

    # The first bin is DC.
    assert first_bin == 0

    # The last bin is Nyquist, reported as a positive frequency.
    assert last_bin == sr / 2.0

    # All neighbouring bins are the same distance apart.
    spacing = np.diff(freqs)
    assert np.allclose(spacing, spacing[0])
def __test(sr, n_fft):
    """Check the end points and spacing of ``librosa.fft_frequencies``."""
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    first_bin, last_bin = freqs[0], freqs[-1]

    # The first bin is DC.
    eq_(first_bin, 0)

    # The last bin is Nyquist, reported as a positive frequency.
    eq_(last_bin, sr / 2.0)

    # All neighbouring bins are the same distance apart.
    spacing = np.diff(freqs)
    assert np.allclose(spacing, spacing[0])
def test_poly_features_synthetic():
    """Yield checks that ``poly_features`` recovers known coefficients.

    Spectrograms are synthesized as polynomials of the frequency grid with
    known coefficients; the fitted coefficients (returned highest order
    first) must match them for implicit, explicit, squared and
    multi-column frequency grids.
    """
    srand()

    sr = 22050
    n_fft = 2048

    def __test(S, coeffs, freq):
        order = coeffs.shape[0] - 1
        fitted = librosa.feature.poly_features(S=S, sr=sr, n_fft=n_fft,
                                               order=order, freq=freq)

        # Compare column by column, reversing the fitted order.
        for col in range(S.shape[-1]):
            assert np.allclose(coeffs, fitted[::-1, col].squeeze())

    def __make_data(coeffs, freq):
        # Evaluate the polynomial, constant term first.
        terms = (c * freq**i for i, c in enumerate(coeffs))
        S = sum(terms, np.zeros_like(freq))
        return S.reshape((freq.shape[0], -1))

    for order in range(1, 3):
        freq = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
        coeffs = np.atleast_1d(np.arange(1, 1+order))

        S = __make_data(coeffs, freq)
        # Implicit frequencies
        yield __test, S, coeffs, None
        # Explicit frequencies
        yield __test, S, coeffs, freq

        # Alternate frequencies
        freq = freq**2.0
        yield __test, __make_data(coeffs, freq), coeffs, freq

        # Multi-dimensional frequencies
        freq = np.cumsum(np.abs(np.random.randn(1 + n_fft//2, 2)), axis=0)
        yield __test, __make_data(coeffs, freq), coeffs, freq
def logfrequency(sr, n_fft, bins_per_octave=12, tuning=0.0, fmin=None, fmax=None, spread=0.125):
    '''Approximate a constant-Q filterbank for a fixed-window STFT.

    Each filter is a log-normal window centered at the corresponding pitch frequency.

    :usage:
        >>> # Simple log frequency filters
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096)

        >>> # Use a narrower frequency range
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, fmin=110, fmax=880)

        >>> # Use narrower filters for sparser response: 5% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.05)

        >>> # Or wider: 50% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.5)

    :parameters:
      - sr : int > 0
          audio sampling rate

      - n_fft : int > 0
          FFT window size

      - bins_per_octave : int > 0
          Number of bins per octave. Defaults to 12 (semitones).

      - tuning : None or float in [-0.5, +0.5]
          Tuning correction parameter, in fractions of a bin.

      - fmin : float > 0
          Minimum frequency bin. Defaults to the frequency of note ``C1``.

      - fmax : float > 0
          Maximum frequency bin. Defaults to the frequency of note ``C9``.

      - spread : float > 0
          Spread of each filter, as a fraction of a bin.

    :returns:
      - C : np.ndarray, shape=(ceil(log2(fmax/fmin) * bins_per_octave), 1 + n_fft/2)
          CQT filter bank. Each non-empty row has unit Euclidean norm.
    '''
    if fmin is None:
        fmin = librosa.midi_to_hz(librosa.note_to_midi('C1'))

    if fmax is None:
        fmax = librosa.midi_to_hz(librosa.note_to_midi('C9'))

    # Apply tuning correction
    correction = 2.0**(float(tuning) / bins_per_octave)

    # How many bins can we get?
    n_filters = int(np.ceil(bins_per_octave * np.log2(float(fmax) / fmin)))

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave

    # Construct the output matrix. Integer division keeps the shape integral;
    # true division would hand np.zeros a float dimension under Python 3.
    basis = np.zeros((n_filters, n_fft // 2 + 1))

    # Get log frequencies of bins, skipping the 0 Hz bin (its log is
    # undefined). Keyword arguments: sr/n_fft are keyword-only in recent
    # librosa releases.
    log_freqs = np.log2(librosa.fft_frequencies(sr=sr, n_fft=n_fft)[1:])

    for i in range(n_filters):
        # What's the center (median) frequency of this filter?
        center_freq = correction * fmin * (2.0**(float(i)/bins_per_octave))

        # Place a log-normal window around center_freq
        # We skip the sqrt(2*pi) normalization because it will wash out below anyway
        basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(center_freq)) / sigma)**2
                              - np.log2(sigma) - log_freqs)

        # Normalize each filter
        c_norm = np.sqrt(np.sum(basis[i]**2))
        if c_norm > 0:
            basis[i] = basis[i] / c_norm

    return basis
def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False):
    """Create a Filterbank matrix to combine FFT bins into Mel-frequency bins

    :usage:
        >>> mel_fb = librosa.filters.mel(22050, 2048)

        >>> # Or clip the maximum frequency to 8KHz
        >>> mel_fb = librosa.filters.mel(22050, 2048, fmax=8000)

    :parameters:
      - sr : int > 0
          sampling rate of the incoming signal

      - n_fft : int > 0
          number of FFT components

      - n_mels : int > 0
          number of Mel bands to generate

      - fmin : float >= 0
          lowest frequency (in Hz)

      - fmax : float >= 0
          highest frequency (in Hz).
          If ``None``, use ``fmax = sr / 2.0``

      - htk : bool
          use HTK formula instead of Slaney

    :returns:
      - M : np.ndarray, shape=(n_mels, 1+ n_fft/2)
          Mel transform matrix
    """
    if fmax is None:
        fmax = sr / 2.0

    # Initialize the weights
    n_mels = int(n_mels)
    weights = np.zeros((n_mels, 1 + n_fft // 2))

    # Center freqs of each FFT bin
    fftfreqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # 'Center freqs' of mel bands - uniformly spaced between limits.
    # With extra=True two additional band edges are returned, so band i is
    # described by freqs[i], freqs[i+1] and freqs[i+2].
    freqs = librosa.mel_frequencies(n_mels,
                                    fmin=fmin,
                                    fmax=fmax,
                                    htk=htk,
                                    extra=True)

    # Slaney-style mel is scaled to be approx constant energy per channel
    enorm = 2.0 / (freqs[2:n_mels+2] - freqs[:n_mels])

    # range instead of xrange: xrange does not exist on Python 3.
    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = (fftfreqs - freqs[i]) / (freqs[i+1] - freqs[i])
        upper = (freqs[i+2] - fftfreqs) / (freqs[i+2] - freqs[i+1])

        # .. then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper)) * enorm[i]

    return weights
def logfrequency(sr, n_fft, n_bins=84, bins_per_octave=12, tuning=0.0, fmin=None, spread=0.125):
    '''Approximate a constant-Q filterbank for a fixed-window STFT.

    Each filter is a log-normal window centered at the corresponding frequency.

    :usage:
        >>> # Simple log frequency filters
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096)

        >>> # Use a narrower frequency range
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, n_bins=48, fmin=110)

        >>> # Use narrower filters for sparser response: 5% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.05)

        >>> # Or wider: 50% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.5)

    :parameters:
      - sr : int > 0
          audio sampling rate

      - n_fft : int > 0
          FFT window size

      - n_bins : int > 0
          Number of bins. Defaults to 84 (7 octaves).

      - bins_per_octave : int > 0
          Number of bins per octave. Defaults to 12 (semitones).

      - tuning : None or float in [-0.5, +0.5]
          Tuning correction parameter, in fractions of a bin.

      - fmin : float > 0
          Minimum frequency bin. Defaults to ``C2 ~= 32.70``

      - spread : float > 0
          Spread of each filter, as a fraction of a bin.

    :returns:
      - C : np.ndarray, shape=(n_bins, 1 + n_fft/2)
          log-frequency filter bank. Each non-empty row has unit Euclidean
          norm.
    '''
    if fmin is None:
        fmin = librosa.midi_to_hz(librosa.note_to_midi('C2'))

    # Apply tuning correction
    correction = 2.0**(float(tuning) / bins_per_octave)

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave

    # Construct the output matrix. Integer division keeps the shape integral;
    # true division would hand np.zeros a float dimension under Python 3.
    basis = np.zeros((n_bins, 1 + n_fft // 2))

    # Get log frequencies of bins, skipping the 0 Hz bin (its log is
    # undefined). Keyword arguments: sr/n_fft are keyword-only in recent
    # librosa releases.
    log_freqs = np.log2(librosa.fft_frequencies(sr=sr, n_fft=n_fft)[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = correction * fmin * (2.0**(float(i)/bins_per_octave))

        # Place a log-normal window around c_freq
        basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(c_freq)) / sigma)**2
                              - np.log2(sigma) - log_freqs)

        # Normalize each filter
        c_norm = np.sqrt(np.sum(basis[i]**2))
        if c_norm > 0:
            basis[i] = basis[i] / c_norm

    return basis