def valid_int(x, cast=None): '''Ensure that an input value is integer-typed. This is primarily useful for ensuring integrable-valued array indices. Parameters ---------- x : number A scalar value to be cast to int cast : function [optional] A function to modify `x` before casting. Default: `np.floor` Returns ------- x_int : int `x_int = int(cast(x))` Raises ------ ParameterError If `cast` is provided and is not callable. ''' if cast is None: cast = np.floor if not six.callable(cast): raise ParameterError('cast parameter must be callable') return int(cast(x))
def __early_downsample(y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale): '''Perform early downsampling on an audio signal, if it applies.''' downsample_count = __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves) if downsample_count > 0 and res_type == 'kaiser_fast': downsample_factor = 2**(downsample_count) hop_length //= downsample_factor if len(y) < downsample_factor: raise ParameterError('Input signal length={:d} is too short for ' '{:d}-octave CQT'.format(len(y), n_octaves)) new_sr = sr / float(downsample_factor) y = resample(y, sr, new_sr, res_type=res_type, scale=True) # If we're not going to length-scale after CQT, we # need to compensate for the downsampling factor here if not scale: y *= np.sqrt(downsample_factor) sr = new_sr return y, sr, hop_length
def time_stretch(y, rate, **kwargs): '''Time-stretch an audio series by a fixed rate. Parameters ---------- y : np.ndarray [shape=(n,)] audio time series rate : float > 0 [scalar] Stretch factor. If `rate > 1`, then the signal is sped up. If `rate < 1`, then the signal is slowed down. kwargs : additional keyword arguments. See `librosa.decompose.stft` for details. Returns ------- y_stretch : np.ndarray [shape=(round(n/rate),)] audio time series stretched by the specified rate See Also -------- pitch_shift : pitch shifting librosa.core.phase_vocoder : spectrogram phase vocoder pyrubberband.pyrb.time_stretch : high-quality time stretching using RubberBand Examples -------- Compress to be twice as fast >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> y_fast = librosa.effects.time_stretch(y, 2.0) Or half the original speed >>> y_slow = librosa.effects.time_stretch(y, 0.5) ''' if rate <= 0: raise ParameterError('rate must be a positive number') # Construct the short-term Fourier transform (STFT) stft = stft(y, **kwargs) # Stretch by phase vocoding stft_stretch = phase_vocoder(stft, rate) # Predict the length of y_stretch len_stretch = int(round(len(y) / rate)) # Invert the STFT y_stretch = istft(stft_stretch, dtype=y.dtype, length=len_stretch, **kwargs) return y_stretch
def valid_intervals(intervals): '''Ensure that an array is a valid representation of time intervals: - intervals.ndim == 2 - intervals.shape[1] == 2 Parameters ---------- intervals : np.ndarray [shape=(n, 2)] set of time intervals Returns ------- valid : bool True if `intervals` passes validation. ''' if intervals.ndim != 2 or intervals.shape[-1] != 2: raise ParameterError('intervals must have shape (n, 2)') return True
def onset_detect(y=None, sr=48000, onset_envelope=None, hop_length=512, backtrack=False, energy=None, units='frames', **kwargs): """Basic onset detector. Locate note onset events by picking peaks in an onset strength envelope. The `peak_pick` parameters were chosen by large-scale hyper-parameter optimization over the dataset provided by [1]_. .. [1] https://github.com/CPJKU/onset_db Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 [scalar] sampling rate of `y` onset_envelope : np.ndarray [shape=(m,)] (optional) pre-computed onset strength envelope hop_length : int > 0 [scalar] hop length (in samples) units : {'frames', 'samples', 'time'} The units to encode detected onset events in. By default, 'frames' are used. backtrack : bool If `True`, detected onset events are backtracked to the nearest preceding minimum of `energy`. This is primarily useful when using onsets as slice points for segmentation. energy : np.ndarray [shape=(m,)] (optional) An energy function to use for backtracking detected onset events. If none is provided, then `onset_envelope` is used. kwargs : additional keyword arguments Additional parameters for peak picking. See `librosa.util.peak_pick` for details. Returns ------- onsets : np.ndarray [shape=(n_onsets,)] estimated positions of detected onsets, in whichever units are specified. By default, frame indices. .. note:: If no onset strength could be detected, onset_detect returns an empty list. Raises ------ ParameterError if neither `y` nor `onsets` are provided or if `units` is not one of 'frames', 'samples', or 'time' See Also -------- onset_strength : compute onset strength per-frame onset_backtrack : backtracking onset events librosa.util.peak_pick : pick peaks from a time series Examples -------- Get onset times from a signal >>> y, sr = librosa.load(librosa.util.example_audio_file(), ... offset=30, duration=2.0) >>> onset_frames = librosa.onset.onset_detect(y=y, sr=sr) >>> librosa.frames_to_time(onset_frames, sr=sr) array([ 0.07 , 0.395, 0.511, 0.627, 0.766, 0.975, 1.207, 1.324, 1.44 , 1.788, 1.881]) Or use a pre-computed onset envelope >>> o_env = librosa.onset.onset_strength(y, sr=sr) >>> times = librosa.frames_to_time(np.arange(len(o_env)), sr=sr) >>> onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr) >>> import matplotlib.pyplot as plt >>> D = np.abs(librosa.stft(y)) >>> plt.figure() >>> ax1 = plt.subplot(2, 1, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max), ... x_axis='time', y_axis='log') >>> plt.title('Power spectrogram') >>> plt.subplot(2, 1, 2, sharex=ax1) >>> plt.plot(times, o_env, label='Onset strength') >>> plt.vlines(times[onset_frames], 0, o_env.max(), color='r', alpha=0.9, ... linestyle='--', label='Onsets') >>> plt.axis('tight') >>> plt.legend(frameon=True, framealpha=0.75) >>> plt.show() """ # First, get the frame->beat strength profile if we don't already have one if onset_envelope is None: if y is None: raise ParameterError('y or onset_envelope must be provided') onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length) # Shift onset envelope up to be non-negative # (a common normalization step to make the threshold more consistent) onset_envelope -= onset_envelope.min() # Do we have any onsets to grab? if not onset_envelope.any(): return np.array([], dtype=np.int) # Normalize onset strength function to [0, 1] range onset_envelope /= onset_envelope.max() # These parameter settings found by large-scale search kwargs.setdefault('pre_max', 0.03 * sr // hop_length) # 30ms kwargs.setdefault('post_max', 0.00 * sr // hop_length + 1) # 0ms kwargs.setdefault('pre_avg', 0.10 * sr // hop_length) # 100ms kwargs.setdefault('post_avg', 0.10 * sr // hop_length + 1) # 100ms kwargs.setdefault('wait', 0.03 * sr // hop_length) # 30ms kwargs.setdefault('delta', 0.07) # Peak pick the onset envelope onsets = peak_pick(onset_envelope, **kwargs) # Optionally backtrack the events if backtrack: if energy is None: energy = onset_envelope onsets = onset_backtrack(onsets, energy) if units == 'frames': pass elif units == 'samples': onsets = frames_to_samples(onsets, hop_length=hop_length) elif units == 'time': onsets = frames_to_time(onsets, hop_length=hop_length, sr=sr) else: raise ParameterError('Invalid unit type: {}'.format(units)) return onsets
def match_events(events_from, events_to, left=True, right=True): '''Match one set of events to another. This is useful for tasks such as matching beats to the nearest detected onsets, or frame-aligned events to the nearest zero-crossing. .. note:: A target event may be matched to multiple source events. Examples -------- >>> # Sources are multiples of 7 >>> s_from = np.arange(0, 100, 7) >>> s_from array([ 0, 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98]) >>> # Targets are multiples of 10 >>> s_to = np.arange(0, 100, 10) >>> s_to array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]) >>> # Find the matching >>> idx = librosa.util.match_events(s_from, s_to) >>> idx array([0, 1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 9]) >>> # Print each source value to its matching target >>> zip(s_from, s_to[idx]) [(0, 0), (7, 10), (14, 10), (21, 20), (28, 30), (35, 30), (42, 40), (49, 50), (56, 60), (63, 60), (70, 70), (77, 80), (84, 80), (91, 90), (98, 90)] Parameters ---------- events_from : ndarray [shape=(n,)] Array of events (eg, times, sample or frame indices) to match from. events_to : ndarray [shape=(m,)] Array of events (eg, times, sample or frame indices) to match against. left : bool right : bool If `False`, then matched events cannot be to the left (or right) of source events. Returns ------- event_mapping : np.ndarray [shape=(n,)] For each event in `events_from`, the corresponding event index in `events_to`. `event_mapping[i] == arg min |events_from[i] - events_to[:]|` See Also -------- match_intervals Raises ------ ParameterError If either array of input events is not the correct shape ''' if len(events_from) == 0 or len(events_to) == 0: raise ParameterError('Attempting to match empty event list') # If we can't match left or right, then only strict equivalence # counts as a match. if not (left or right) and not np.all(np.in1d(events_from, events_to)): raise ParameterError('Cannot match events with left=right=False ' 'and events_from is not contained ' 'in events_to') # If we can't match to the left, then there should be at least one # target event greater-equal to every source event if (not left) and max(events_to) < max(events_from): raise ParameterError('Cannot match events with left=False ' 'and max(events_to) < max(events_from)') # If we can't match to the right, then there should be at least one # target event less-equal to every source event if (not right) and min(events_to) > min(events_from): raise ParameterError('Cannot match events with right=False ' 'and min(events_to) > min(events_from)') # Pre-allocate the output array output = np.empty_like(events_from, dtype=np.int) # Compute how many rows we can process at once within the memory block n_rows = int(spectralUtil.MAX_MEM_BLOCK / (np.prod(output.shape[1:]) * len(events_to) * events_from.itemsize)) # Make sure we can at least make some progress n_rows = max(1, n_rows) # Iterate over blocks of the data for bl_s in range(0, len(events_from), n_rows): bl_t = min(bl_s + n_rows, len(events_from)) event_block = events_from[bl_s:bl_t] # distance[i, j] = |events_from - events_to[j]| distance = np.abs(np.subtract.outer(event_block, events_to)).astype(np.float) # If we can't match to the right, squash all comparisons where # events_to[j] > events_from[i] if not right: distance[np.less.outer(event_block, events_to)] = np.nan # If we can't match to the left, squash all comparisons where # events_to[j] < events_from[i] if not left: distance[np.greater.outer(event_block, events_to)] = np.nan # Find the minimum distance point from whatever's left after squashing output[bl_s:bl_t] = np.nanargmin(distance, axis=-1) return output
def softmask(X, X_ref, power=1, split_zeros=False): '''Robustly compute a softmask operation. `M = X**power / (X**power + X_ref**power)` Parameters ---------- X : np.ndarray The (non-negative) input array corresponding to the positive mask elements X_ref : np.ndarray The (non-negative) array of reference or background elements. Must have the same shape as `X`. power : number > 0 or np.inf If finite, returns the soft mask computed in a numerically stable way If infinite, returns a hard (binary) mask equivalent to `X > X_ref`. Note: for hard masks, ties are always broken in favor of `X_ref` (`mask=0`). split_zeros : bool If `True`, entries where `X` and X`_ref` are both small (close to 0) will receive mask values of 0.5. Otherwise, the mask is set to 0 for these entries. Returns ------- mask : np.ndarray, shape=`X.shape` The output mask array Raises ------ ParameterError If `X` and `X_ref` have different shapes. If `X` or `X_ref` are negative anywhere If `power <= 0` Examples -------- >>> X = 2 * np.ones((3, 3)) >>> X_ref = np.vander(np.arange(3.0)) >>> X array([[ 2., 2., 2.], [ 2., 2., 2.], [ 2., 2., 2.]]) >>> X_ref array([[ 0., 0., 1.], [ 1., 1., 1.], [ 4., 2., 1.]]) >>> librosa.util.softmask(X, X_ref, power=1) array([[ 1. , 1. , 0.667], [ 0.667, 0.667, 0.667], [ 0.333, 0.5 , 0.667]]) >>> librosa.util.softmask(X_ref, X, power=1) array([[ 0. , 0. , 0.333], [ 0.333, 0.333, 0.333], [ 0.667, 0.5 , 0.333]]) >>> librosa.util.softmask(X, X_ref, power=2) array([[ 1. , 1. , 0.8], [ 0.8, 0.8, 0.8], [ 0.2, 0.5, 0.8]]) >>> librosa.util.softmask(X, X_ref, power=4) array([[ 1. , 1. , 0.941], [ 0.941, 0.941, 0.941], [ 0.059, 0.5 , 0.941]]) >>> librosa.util.softmask(X, X_ref, power=100) array([[ 1.000e+00, 1.000e+00, 1.000e+00], [ 1.000e+00, 1.000e+00, 1.000e+00], [ 7.889e-31, 5.000e-01, 1.000e+00]]) >>> librosa.util.softmask(X, X_ref, power=np.inf) array([[ True, True, True], [ True, True, True], [False, False, True]], dtype=bool) ''' if X.shape != X_ref.shape: raise ParameterError('Shape mismatch: {}!={}'.format(X.shape, X_ref.shape)) if np.any(X < 0) or np.any(X_ref < 0): raise ParameterError('X and X_ref must be non-negative') if power <= 0: raise ParameterError('power must be strictly positive') # We're working with ints, cast to float. dtype = X.dtype if not np.issubdtype(dtype, float): dtype = np.float32 # Re-scale the input arrays relative to the larger value Z = np.maximum(X, X_ref).astype(dtype) bad_idx = (Z < np.finfo(dtype).tiny) Z[bad_idx] = 1 # For finite power, compute the softmask if np.isfinite(power): mask = (X / Z)**power ref_mask = (X_ref / Z)**power good_idx = ~bad_idx mask[good_idx] /= mask[good_idx] + ref_mask[good_idx] # Wherever energy is below energy in both inputs, split the mask if split_zeros: mask[bad_idx] = 0.5 else: mask[bad_idx] = 0.0 else: # Otherwise, compute the hard mask mask = X > X_ref return mask
def spectral_rolloff(y=None, sr=48000, S=None, n_fft=2048, hop_length=512, freq=None, roll_percent=0.85): '''Compute roll-off frequency Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, .. note:: `freq` is assumed to be sorted in increasing order roll_percent : float [0 < roll_percent < 1] Roll-off percentage. Returns ------- rolloff : np.ndarray [shape=(1, t)] roll-off frequency for each frame Examples -------- From time-series input >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr) >>> rolloff array([[ 8376.416, 968.994, ..., 8925.513, 9108.545]]) From spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y)) >>> librosa.feature.spectral_rolloff(S=S, sr=sr) array([[ 8376.416, 968.994, ..., 8925.513, 9108.545]]) >>> # With a higher roll percentage: >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.95) array([[ 10012.939, 3003.882, ..., 10034.473, 10077.539]]) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(rolloff.T, label='Roll-off frequency') >>> plt.ylabel('Hz') >>> plt.xticks([]) >>> plt.xlim([0, rolloff.shape[-1]]) >>> plt.legend() >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() ''' if not 0.0 < roll_percent < 1.0: raise ParameterError('roll_percent must lie in the range (0, 1)') S, n_fft = mag_spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) if not np.isrealobj(S): raise ParameterError('Spectral rolloff is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral rolloff is only defined ' 'with non-negative energies') # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) # Make sure that frequency can be broadcast if freq.ndim == 1: freq = freq.reshape((-1, 1)) total_energy = np.cumsum(S, axis=0) threshold = roll_percent * total_energy[-1] ind = np.where(total_energy < threshold, np.nan, 1) return np.nanmin(ind * freq, axis=0, keepdims=True)
def hpss(S, kernel_size=31, power=2.0, mask=False, margin=1.0): """Median-filtering harmonic percussive source separation (HPSS). If `margin = 1.0`, decomposes an input spectrogram `S = H + P` where `H` contains the harmonic components, and `P` contains the percussive components. If `margin > 1.0`, decomposes an input spectrogram `S = H + P + R` where `R` contains residual components not included in `H` or `P`. This implementation is based upon the algorithm described by [1]_ and [2]_. .. [1] Fitzgerald, Derry. "Harmonic/percussive separation using median filtering." 13th International Conference on Digital Audio Effects (DAFX10), Graz, Austria, 2010. .. [2] Driedger, Müller, Disch. "Extending harmonic-percussive separation of audio." 15th International Society for Music Information Retrieval Conference (ISMIR 2014), Taipei, Taiwan, 2014. Parameters ---------- S : np.ndarray [shape=(d, n)] input spectrogram. May be real (magnitude) or complex. kernel_size : int or tuple (kernel_harmonic, kernel_percussive) kernel size(s) for the median filters. - If scalar, the same size is used for both harmonic and percussive. - If tuple, the first value specifies the width of the harmonic filter, and the second value specifies the width of the percussive filter. power : float > 0 [scalar] Exponent for the Wiener filter when constructing soft mask matrices. mask : bool Return the masking matrices instead of components. Masking matrices contain non-negative real values that can be used to measure the assignment of energy from `S` into harmonic or percussive components. Components can be recovered by multiplying `S * mask_H` or `S * mask_P`. margin : float or tuple (margin_harmonic, margin_percussive) margin size(s) for the masks (as described in [2]_) - If scalar, the same size is used for both harmonic and percussive. - If tuple, the first value specifies the margin of the harmonic mask, and the second value specifies the margin of the percussive mask. Returns ------- harmonic : np.ndarray [shape=(d, n)] harmonic component (or mask) percussive : np.ndarray [shape=(d, n)] percussive component (or mask) See Also -------- util.softmask Notes ----- This function caches at level 30. Examples -------- Separate into harmonic and percussive >>> y, sr = librosa.load(librosa.util.example_audio_file(), duration=15) >>> D = librosa.stft(y) >>> H, P = librosa.decompose.hpss(D) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(3, 1, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ... ref=np.max), ... y_axis='log') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Full power spectrogram') >>> plt.subplot(3, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(H), ... ref=np.max), ... y_axis='log') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Harmonic power spectrogram') >>> plt.subplot(3, 1, 3) >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(P), ... ref=np.max), ... y_axis='log') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Percussive power spectrogram') >>> plt.tight_layout() >>> plt.show() Or with a narrower horizontal filter >>> H, P = librosa.decompose.hpss(D, kernel_size=(13, 31)) Just get harmonic/percussive masks, not the spectra >>> mask_H, mask_P = librosa.decompose.hpss(D, mask=True) >>> mask_H array([[ 1.000e+00, 1.469e-01, ..., 2.648e-03, 2.164e-03], [ 1.000e+00, 2.368e-01, ..., 9.413e-03, 7.703e-03], ..., [ 8.869e-01, 5.673e-02, ..., 4.603e-02, 1.247e-05], [ 7.068e-01, 2.194e-02, ..., 4.453e-02, 1.205e-05]], dtype=float32) >>> mask_P array([[ 2.858e-05, 8.531e-01, ..., 9.974e-01, 9.978e-01], [ 1.586e-05, 7.632e-01, ..., 9.906e-01, 9.923e-01], ..., [ 1.131e-01, 9.433e-01, ..., 9.540e-01, 1.000e+00], [ 2.932e-01, 9.781e-01, ..., 9.555e-01, 1.000e+00]], dtype=float32) Separate into harmonic/percussive/residual components by using a margin > 1.0 >>> H, P = librosa.decompose.hpss(D, margin=3.0) >>> R = D - (H+P) >>> y_harm = librosa.core.istft(H) >>> y_perc = librosa.core.istft(P) >>> y_resi = librosa.core.istft(R) Get a more isolated percussive component by widening its margin >>> H, P = librosa.decompose.hpss(D, margin=(1.0,5.0)) """ if np.iscomplexobj(S): S, phase = magphase(S) else: phase = 1 if np.isscalar(kernel_size): win_harm = kernel_size win_perc = kernel_size else: win_harm = kernel_size[0] win_perc = kernel_size[1] if np.isscalar(margin): margin_harm = margin margin_perc = margin else: margin_harm = margin[0] margin_perc = margin[1] # margin minimum is 1.0 if margin_harm < 1 or margin_perc < 1: raise ParameterError("Margins must be >= 1.0. " "A typical range is between 1 and 10.") # Compute median filters. Pre-allocation here preserves memory layout. harm = np.empty_like(S) harm[:] = median_filter(S, size=(1, win_harm), mode='reflect') perc = np.empty_like(S) perc[:] = median_filter(S, size=(win_perc, 1), mode='reflect') split_zeros = (margin_harm == 1 and margin_perc == 1) mask_harm = softmask(harm, perc * margin_harm, power=power, split_zeros=split_zeros) mask_perc = softmask(perc, harm * margin_perc, power=power, split_zeros=split_zeros) if mask: return mask_harm, mask_perc return ((S * mask_harm) * phase, (S * mask_perc) * phase)
def pitch_shift(y, sr, n_steps, bins_per_octave=12, res_type='kaiser_best', **kwargs): '''Shift the pitch of a waveform by `n_steps` semitones. Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 [scalar] audio sampling rate of `y` n_steps : float [scalar] how many (fractional) half-steps to shift `y` bins_per_octave : float > 0 [scalar] how many steps per octave res_type : string Resample type. Possible options: 'kaiser_best', 'kaiser_fast', and 'scipy', 'polyphase', 'fft'. By default, 'kaiser_best' is used. See `core.resample` for more information. kwargs: additional keyword arguments. See `librosa.decompose.stft` for details. Returns ------- y_shift : np.ndarray [shape=(n,)] The pitch-shifted audio time-series See Also -------- time_stretch : time stretching librosa.core.phase_vocoder : spectrogram phase vocoder pyrubberband.pyrb.pitch_shift : high-quality pitch shifting using RubberBand Examples -------- Shift up by a major third (four half-steps) >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> y_third = librosa.effects.pitch_shift(y, sr, n_steps=4) Shift down by a tritone (six half-steps) >>> y_tritone = librosa.effects.pitch_shift(y, sr, n_steps=-6) Shift up by 3 quarter-tones >>> y_three_qt = librosa.effects.pitch_shift(y, sr, n_steps=3, ... bins_per_octave=24) ''' if bins_per_octave < 1 or not np.issubdtype(type(bins_per_octave), np.integer): raise ParameterError('bins_per_octave must be a positive integer.') rate = 2.0**(-float(n_steps) / bins_per_octave) # Stretch in time, then resample y_shift = resample(time_stretch(y, rate, **kwargs), float(sr) / rate, sr, res_type=res_type) # Crop to the same dimension as the input return fix_length(y_shift, len(y))
def spectral_bandwidth(y=None, sr=48000, S=None, n_fft=2048, hop_length=512, freq=None, centroid=None, norm=True, p=2): '''Compute p'th-order spectral bandwidth: (sum_k S[k] * (freq[k] - centroid)**p)**(1/p) Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, or a matrix of center frequencies as constructed by `librosa.core.ifgram` centroid : None or np.ndarray [shape=(1, t)] pre-computed centroid frequencies norm : bool Normalize per-frame spectral energy (sum to one) p : float > 0 Power to raise deviation from spectral centroid. Returns ------- bandwidth : np.ndarray [shape=(1, t)] frequency bandwidth for each frame Examples -------- From time-series input >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr) >>> spec_bw array([[ 3379.878, 1429.486, ..., 3235.214, 3080.148]]) From spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y=y)) >>> librosa.feature.spectral_bandwidth(S=S) array([[ 3379.878, 1429.486, ..., 3235.214, 3080.148]]) Using variable bin center frequencies >>> if_gram, D = librosa.ifgram(y) >>> librosa.feature.spectral_bandwidth(S=np.abs(D), freq=if_gram) array([[ 3380.011, 1429.11 , ..., 3235.22 , 3080.148]]) Plot the result >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(spec_bw.T, label='Spectral bandwidth') >>> plt.ylabel('Hz') >>> plt.xticks([]) >>> plt.xlim([0, spec_bw.shape[-1]]) >>> plt.legend() >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() ''' S, n_fft = mag_spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) if not np.isrealobj(S): raise ParameterError('Spectral bandwidth is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral bandwidth is only defined ' 'with non-negative energies') if centroid is None: centroid = spectral_centroid(y=y, sr=sr, S=S, n_fft=n_fft, hop_length=hop_length, freq=freq) # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) if freq.ndim == 1: deviation = np.abs(np.subtract.outer(freq, centroid[0])) else: deviation = np.abs(freq - centroid[0]) # Column-normalize S if norm: S = normalize(S, norm=1, axis=0) return np.sum(S * deviation**p, axis=0, keepdims=True)**(1./p)
def fix_frames(frames, x_min=0, x_max=None, pad=True): '''Fix a list of frames to lie within [x_min, x_max] Examples -------- >>> # Generate a list of frame indices >>> frames = np.arange(0, 1000.0, 50) >>> frames array([ 0., 50., 100., 150., 200., 250., 300., 350., 400., 450., 500., 550., 600., 650., 700., 750., 800., 850., 900., 950.]) >>> # Clip to span at most 250 >>> librosa.util.fix_frames(frames, x_max=250) array([ 0, 50, 100, 150, 200, 250]) >>> # Or pad to span up to 2500 >>> librosa.util.fix_frames(frames, x_max=2500) array([ 0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 2500]) >>> librosa.util.fix_frames(frames, x_max=2500, pad=False) array([ 0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950]) >>> # Or starting away from zero >>> frames = np.arange(200, 500, 33) >>> frames array([200, 233, 266, 299, 332, 365, 398, 431, 464, 497]) >>> librosa.util.fix_frames(frames) array([ 0, 200, 233, 266, 299, 332, 365, 398, 431, 464, 497]) >>> librosa.util.fix_frames(frames, x_max=500) array([ 0, 200, 233, 266, 299, 332, 365, 398, 431, 464, 497, 500]) Parameters ---------- frames : np.ndarray [shape=(n_frames,)] List of non-negative frame indices x_min : int >= 0 or None Minimum allowed frame index x_max : int >= 0 or None Maximum allowed frame index pad : boolean If `True`, then `frames` is expanded to span the full range `[x_min, x_max]` Returns ------- fixed_frames : np.ndarray [shape=(n_fixed_frames,), dtype=int] Fixed frame indices, flattened and sorted Raises ------ ParameterError If `frames` contains negative values ''' frames = np.asarray(frames) if np.any(frames < 0): raise ParameterError('Negative frame index detected') if pad and (x_min is not None or x_max is not None): frames = np.clip(frames, x_min, x_max) if pad: pad_data = [] if x_min is not None: pad_data.append(x_min) if x_max is not None: pad_data.append(x_max) frames = np.concatenate((pad_data, frames)) if x_min is not None: frames = frames[frames >= x_min] if x_max is not None: frames = frames[frames <= x_max] return np.unique(frames).astype(int)
def axis_sort(S, axis=-1, index=False, value=None): '''Sort an array along its rows or columns. Examples -------- Visualize NMF output for a spectrogram S >>> # Sort the columns of W by peak frequency bin >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> S = np.abs(librosa.stft(y)) >>> W, H = librosa.decompose.decompose(S, n_components=32) >>> W_sort = librosa.util.axis_sort(W) Or sort by the lowest frequency bin >>> W_sort = librosa.util.axis_sort(W, value=np.argmin) Or sort the rows instead of the columns >>> W_sort_rows = librosa.util.axis_sort(W, axis=0) Get the sorting index also, and use it to permute the rows of H >>> W_sort, idx = librosa.util.axis_sort(W, index=True) >>> H_sort = H[idx, :] >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 2, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(W, ref=np.max), ... y_axis='log') >>> plt.title('W') >>> plt.subplot(2, 2, 2) >>> librosa.display.specshow(H, x_axis='time') >>> plt.title('H') >>> plt.subplot(2, 2, 3) >>> librosa.display.specshow(librosa.amplitude_to_db(W_sort, ... ref=np.max), ... y_axis='log') >>> plt.title('W sorted') >>> plt.subplot(2, 2, 4) >>> librosa.display.specshow(H_sort, x_axis='time') >>> plt.title('H sorted') >>> plt.tight_layout() Parameters ---------- S : np.ndarray [shape=(d, n)] Array to be sorted axis : int [scalar] The axis along which to compute the sorting values - `axis=0` to sort rows by peak column index - `axis=1` to sort columns by peak row index index : boolean [scalar] If true, returns the index array as well as the permuted data. value : function function to return the index corresponding to the sort order. Default: `np.argmax`. Returns ------- S_sort : np.ndarray [shape=(d, n)] `S` with the columns or rows permuted in sorting order idx : np.ndarray (optional) [shape=(d,) or (n,)] If `index == True`, the sorting index used to permute `S`. Length of `idx` corresponds to the selected `axis`. Raises ------ ParameterError If `S` does not have exactly 2 dimensions (`S.ndim != 2`) ''' if value is None: value = np.argmax if S.ndim != 2: raise ParameterError('axis_sort is only defined for 2D arrays') bin_idx = value(S, axis=np.mod(1-axis, S.ndim)) idx = np.argsort(bin_idx) sort_slice = [slice(None)] * S.ndim sort_slice[axis] = idx if index: return S[sort_slice], idx else: return S[sort_slice]
def roll_sparse(x, shift, axis=0): '''Sparse matrix roll This operation is equivalent to ``numpy.roll``, but operates on sparse matrices. Parameters ---------- x : scipy.sparse.spmatrix or np.ndarray The sparse matrix input shift : int The number of positions to roll the specified axis axis : (0, 1, -1) The axis along which to roll. Returns ------- x_rolled : same type as `x` The rolled matrix, with the same format as `x` See Also -------- numpy.roll Examples -------- >>> # Generate a random sparse binary matrix >>> X = scipy.sparse.lil_matrix(np.random.randint(0, 2, size=(5,5))) >>> X_roll = roll_sparse(X, 2, axis=0) # Roll by 2 on the first axis >>> X_dense_r = roll_sparse(X.toarray(), 2, axis=0) # Equivalent dense roll >>> np.allclose(X_roll, X_dense_r.toarray()) True ''' if not scipy.sparse.isspmatrix(x): return np.roll(x, shift, axis=axis) # shift-mod-length lets us have shift > x.shape[axis] if axis not in [0, 1, -1]: raise ParameterError('axis must be one of (0, 1, -1)') shift = np.mod(shift, x.shape[axis]) if shift == 0: return x.copy() fmt = x.format if axis == 0: x = x.tocsc() elif axis in (-1, 1): x = x.tocsr() # lil matrix to start x_r = scipy.sparse.lil_matrix(x.shape, dtype=x.dtype) idx_in = [slice(None)] * x.ndim idx_out = [slice(None)] * x_r.ndim idx_in[axis] = slice(0, -shift) idx_out[axis] = slice(shift, None) x_r[tuple(idx_out)] = x[tuple(idx_in)] idx_out[axis] = slice(0, shift) idx_in[axis] = slice(-shift, None) x_r[tuple(idx_out)] = x[tuple(idx_in)] return x_r.asformat(fmt)
def sync(data, idx, aggregate=None, pad=True, axis=-1): """Synchronous aggregation of a multi-dimensional array between boundaries .. note:: In order to ensure total coverage, boundary points may be added to `idx`. If synchronizing a feature matrix against beat tracker output, ensure that frame index numbers are properly aligned and use the same hop length. Parameters ---------- data : np.ndarray multi-dimensional array of features idx : iterable of ints or slices Either an ordered array of boundary indices, or an iterable collection of slice objects. aggregate : function aggregation function (default: `np.mean`) pad : boolean If `True`, `idx` is padded to span the full range `[0, data.shape[axis]]` axis : int The axis along which to aggregate data Returns ------- data_sync : ndarray `data_sync` will have the same dimension as `data`, except that the `axis` coordinate will be reduced according to `idx`. For example, a 2-dimensional `data` with `axis=-1` should satisfy `data_sync[:, i] = aggregate(data[:, idx[i-1]:idx[i]], axis=-1)` Raises ------ ParameterError If the index set is not of consistent type (all slices or all integers) Notes ----- This function caches at level 40. Examples -------- Beat-synchronous CQT spectra >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False) >>> C = np.abs(librosa.cqt(y=y, sr=sr)) >>> beats = librosa.util.fix_frames(beats, x_max=C.shape[1]) By default, use mean aggregation >>> C_avg = librosa.util.sync(C, beats) Use median-aggregation instead of mean >>> C_med = librosa.util.sync(C, beats, ... aggregate=np.median) Or sub-beat synchronization >>> sub_beats = librosa.segment.subsegment(C, beats) >>> sub_beats = librosa.util.fix_frames(sub_beats, x_max=C.shape[1]) >>> C_med_sub = librosa.util.sync(C, sub_beats, aggregate=np.median) Plot the results >>> import matplotlib.pyplot as plt >>> beat_t = librosa.frames_to_time(beats, sr=sr) >>> subbeat_t = librosa.frames_to_time(sub_beats, sr=sr) >>> plt.figure() >>> plt.subplot(3, 1, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(C, ... ref=np.max), ... x_axis='time') >>> plt.title('CQT power, shape={}'.format(C.shape)) >>> plt.subplot(3, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(C_med, ... ref=np.max), ... x_coords=beat_t, x_axis='time') >>> plt.title('Beat synchronous CQT power, ' ... 'shape={}'.format(C_med.shape)) >>> plt.subplot(3, 1, 3) >>> librosa.display.specshow(librosa.amplitude_to_db(C_med_sub, ... ref=np.max), ... x_coords=subbeat_t, x_axis='time') >>> plt.title('Sub-beat synchronous CQT power, ' ... 'shape={}'.format(C_med_sub.shape)) >>> plt.tight_layout() >>> plt.show() """ if aggregate is None: aggregate = np.mean shape = list(data.shape) if np.all([isinstance(_, slice) for _ in idx]): slices = idx elif np.all([np.issubdtype(type(_), np.integer) for _ in idx]): slices = index_to_slice(np.asarray(idx), 0, shape[axis], pad=pad) else: raise ParameterError('Invalid index set: {}'.format(idx)) agg_shape = list(shape) agg_shape[axis] = len(slices) data_agg = np.empty(agg_shape, order='F' if np.isfortran(data) else 'C', dtype=data.dtype) idx_in = [slice(None)] * data.ndim idx_agg = [slice(None)] * data_agg.ndim for (i, segment) in enumerate(slices): idx_in[axis] = segment idx_agg[axis] = i data_agg[tuple(idx_agg)] = aggregate(data[tuple(idx_in)], axis=axis) return data_agg
def sparsify_rows(x, quantile=0.01): ''' Return a row-sparse matrix approximating the input `x`. Parameters ---------- x : np.ndarray [ndim <= 2] The input matrix to sparsify. quantile : float in [0, 1.0) Percentage of magnitude to discard in each row of `x` Returns ------- x_sparse : `scipy.sparse.csr_matrix` [shape=x.shape] Row-sparsified approximation of `x` If `x.ndim == 1`, then `x` is interpreted as a row vector, and `x_sparse.shape == (1, len(x))`. Raises ------ ParameterError If `x.ndim > 2` If `quantile` lies outside `[0, 1.0)` Notes ----- This function caches at level 40. Examples -------- >>> # Construct a Hann window to sparsify >>> x = scipy.signal.hann(32) >>> x array([ 0. , 0.01 , 0.041, 0.09 , 0.156, 0.236, 0.326, 0.424, 0.525, 0.625, 0.72 , 0.806, 0.879, 0.937, 0.977, 0.997, 0.997, 0.977, 0.937, 0.879, 0.806, 0.72 , 0.625, 0.525, 0.424, 0.326, 0.236, 0.156, 0.09 , 0.041, 0.01 , 0. ]) >>> # Discard the bottom percentile >>> x_sparse = librosa.util.sparsify_rows(x, quantile=0.01) >>> x_sparse <1x32 sparse matrix of type '<type 'numpy.float64'>' with 26 stored elements in Compressed Sparse Row format> >>> x_sparse.todense() matrix([[ 0. , 0. , 0. , 0.09 , 0.156, 0.236, 0.326, 0.424, 0.525, 0.625, 0.72 , 0.806, 0.879, 0.937, 0.977, 0.997, 0.997, 0.977, 0.937, 0.879, 0.806, 0.72 , 0.625, 0.525, 0.424, 0.326, 0.236, 0.156, 0.09 , 0. , 0. , 0. ]]) >>> # Discard up to the bottom 10th percentile >>> x_sparse = librosa.util.sparsify_rows(x, quantile=0.1) >>> x_sparse <1x32 sparse matrix of type '<type 'numpy.float64'>' with 20 stored elements in Compressed Sparse Row format> >>> x_sparse.todense() matrix([[ 0. , 0. , 0. , 0. , 0. , 0. , 0.326, 0.424, 0.525, 0.625, 0.72 , 0.806, 0.879, 0.937, 0.977, 0.997, 0.997, 0.977, 0.937, 0.879, 0.806, 0.72 , 0.625, 0.525, 0.424, 0.326, 0. , 0. , 0. , 0. , 0. , 0. ]]) ''' if x.ndim == 1: x = x.reshape((1, -1)) elif x.ndim > 2: raise ParameterError('Input must have 2 or fewer dimensions. ' 'Provided x.shape={}.'.format(x.shape)) if not 0.0 <= quantile < 1: raise ParameterError('Invalid quantile {:.2f}'.format(quantile)) x_sparse = scipy.sparse.lil_matrix(x.shape, dtype=x.dtype) mags = np.abs(x) norms = np.sum(mags, axis=1, keepdims=True) mag_sort = np.sort(mags, axis=1) cumulative_mag = np.cumsum(mag_sort / norms, axis=1) threshold_idx = np.argmin(cumulative_mag < quantile, axis=1) for i, j in enumerate(threshold_idx): idx = np.where(mags[i] >= mag_sort[i, j]) x_sparse[i, idx] = x[i, idx] return x_sparse.tocsr()
def decompose(S, n_components=None, transformer=None, sort=False, fit=True, **kwargs): """Decompose a feature matrix. Given a spectrogram `S`, produce a decomposition into `components` and `activations` such that `S ~= components.dot(activations)`. By default, this is done with with non-negative matrix factorization (NMF), but any `sklearn.decomposition`-type object will work. Parameters ---------- S : np.ndarray [shape=(n_features, n_samples), dtype=float] The input feature matrix (e.g., magnitude spectrogram) n_components : int > 0 [scalar] or None number of desired components if None, then `n_features` components are used transformer : None or object If None, use `sklearn.decomposition.NMF` Otherwise, any object with a similar interface to NMF should work. `transformer` must follow the scikit-learn convention, where input data is `(n_samples, n_features)`. `transformer.fit_transform()` will be run on `S.T` (not `S`), the return value of which is stored (transposed) as `activations` The components will be retrieved as `transformer.components_.T` `S ~= np.dot(activations, transformer.components_).T` or equivalently: `S ~= np.dot(transformer.components_.T, activations.T)` sort : bool If `True`, components are sorted by ascending peak frequency. .. note:: If used with `transformer`, sorting is applied to copies of the decomposition parameters, and not to `transformer`'s internal parameters. fit : bool If `True`, components are estimated from the input ``S``. If `False`, components are assumed to be pre-computed and stored in ``transformer``, and are not changed. kwargs : Additional keyword arguments to the default transformer `sklearn.decomposition.NMF` Returns ------- components: np.ndarray [shape=(n_features, n_components)] matrix of components (basis elements). activations: np.ndarray [shape=(n_components, n_samples)] transformed matrix/activation matrix Raises ------ ParameterError if `fit` is False and no `transformer` object is provided. See Also -------- sklearn.decomposition : SciKit-Learn matrix decomposition modules Examples -------- Decompose a magnitude spectrogram into 32 components with NMF >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> S = np.abs(librosa.stft(y)) >>> comps, acts = librosa.decompose.decompose(S, n_components=8) >>> comps array([[ 1.876e-01, 5.559e-02, ..., 1.687e-01, 4.907e-02], [ 3.148e-01, 1.719e-01, ..., 2.314e-01, 9.493e-02], ..., [ 1.561e-07, 8.564e-08, ..., 7.167e-08, 4.997e-08], [ 1.531e-07, 7.880e-08, ..., 5.632e-08, 4.028e-08]]) >>> acts array([[ 4.197e-05, 8.512e-03, ..., 3.056e-05, 9.159e-06], [ 9.568e-06, 1.718e-02, ..., 3.322e-05, 7.869e-06], ..., [ 5.982e-05, 1.311e-02, ..., -0.000e+00, 6.323e-06], [ 3.782e-05, 7.056e-03, ..., 3.290e-05, -0.000e+00]]) Sort components by ascending peak frequency >>> comps, acts = librosa.decompose.decompose(S, n_components=16, ... sort=True) Or with sparse dictionary learning >>> import sklearn.decomposition >>> T = sklearn.decomposition.MiniBatchDictionaryLearning(n_components=16) >>> scomps, sacts = librosa.decompose.decompose(S, transformer=T, sort=True) >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(10,8)) >>> plt.subplot(3, 1, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ... ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('Input spectrogram') >>> plt.colorbar(format='%+2.0f dB') >>> plt.subplot(3, 2, 3) >>> librosa.display.specshow(librosa.amplitude_to_db(comps, ... ref=np.max), ... y_axis='log') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Components') >>> plt.subplot(3, 2, 4) >>> librosa.display.specshow(acts, x_axis='time') >>> plt.ylabel('Components') >>> plt.title('Activations') >>> plt.colorbar() >>> plt.subplot(3, 1, 3) >>> S_approx = comps.dot(acts) >>> librosa.display.specshow(librosa.amplitude_to_db(S_approx, ... ref=np.max), ... y_axis='log', x_axis='time') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Reconstructed spectrogram') >>> plt.tight_layout() >>> plt.show() """ if transformer is None: if fit is False: raise ParameterError('fit must be True if transformer is None') transformer = sklearn.decomposition.NMF(n_components=n_components, **kwargs) if n_components is None: n_components = S.shape[0] if fit: activations = transformer.fit_transform(S.T).T else: activations = transformer.transform(S.T).T components = transformer.components_.T if sort: components, idx = axis_sort(components, index=True) activations = activations[idx] return components, activations
def nn_filter(S, rec=None, aggregate=None, axis=-1, **kwargs): '''Filtering by nearest-neighbors. Each data point (e.g, spectrogram column) is replaced by aggregating its nearest neighbors in feature space. This can be useful for de-noising a spectrogram or feature matrix. The non-local means method [1]_ can be recovered by providing a weighted recurrence matrix as input and specifying `aggregate=np.average`. Similarly, setting `aggregate=np.median` produces sparse de-noising as in REPET-SIM [2]_. .. [1] Buades, A., Coll, B., & Morel, J. M. (2005, June). A non-local algorithm for image denoising. In Computer Vision and Pattern Recognition, 2005. CVPR 2005. IEEE Computer Society Conference on (Vol. 2, pp. 60-65). IEEE. .. [2] Rafii, Z., & Pardo, B. (2012, October). "Music/Voice Separation Using the Similarity Matrix." International Society for Music Information Retrieval Conference, 2012. Parameters ---------- S : np.ndarray The input data (spectrogram) to filter rec : (optional) scipy.sparse.spmatrix or np.ndarray Optionally, a pre-computed nearest-neighbor matrix as provided by `librosa.segment.recurrence_matrix` aggregate : function aggregation function (default: `np.mean`) If `aggregate=np.average`, then a weighted average is computed according to the (per-row) weights in `rec`. For all other aggregation functions, all neighbors are treated equally. axis : int The axis along which to filter (by default, columns) kwargs Additional keyword arguments provided to `librosa.segment.recurrence_matrix` if `rec` is not provided Returns ------- S_filtered : np.ndarray The filtered data Raises ------ ParameterError if `rec` is provided and its shape is incompatible with `S`. See also -------- decompose hpss librosa.segment.recurrence_matrix Notes ----- This function caches at level 30. Examples -------- De-noise a chromagram by non-local median filtering. By default this would use euclidean distance to select neighbors, but this can be overridden directly by setting the `metric` parameter. >>> y, sr = librosa.load(librosa.util.example_audio_file(), ... offset=30, duration=10) >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr) >>> chroma_med = librosa.decompose.nn_filter(chroma, ... aggregate=np.median, ... metric='cosine') To use non-local means, provide an affinity matrix and `aggregate=np.average`. >>> rec = librosa.segment.recurrence_matrix(chroma, mode='affinity', ... metric='cosine', sparse=True) >>> chroma_nlm = librosa.decompose.nn_filter(chroma, rec=rec, ... aggregate=np.average) >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(10, 8)) >>> plt.subplot(5, 1, 1) >>> librosa.display.specshow(chroma, y_axis='chroma') >>> plt.colorbar() >>> plt.title('Unfiltered') >>> plt.subplot(5, 1, 2) >>> librosa.display.specshow(chroma_med, y_axis='chroma') >>> plt.colorbar() >>> plt.title('Median-filtered') >>> plt.subplot(5, 1, 3) >>> librosa.display.specshow(chroma_nlm, y_axis='chroma') >>> plt.colorbar() >>> plt.title('Non-local means') >>> plt.subplot(5, 1, 4) >>> librosa.display.specshow(chroma - chroma_med, ... y_axis='chroma') >>> plt.colorbar() >>> plt.title('Original - median') >>> plt.subplot(5, 1, 5) >>> librosa.display.specshow(chroma - chroma_nlm, ... y_axis='chroma', x_axis='time') >>> plt.colorbar() >>> plt.title('Original - NLM') >>> plt.tight_layout() >>> plt.show() ''' if aggregate is None: aggregate = np.mean if rec is None: kwargs = dict(kwargs) kwargs['sparse'] = True rec = recurrence_matrix(S, axis=axis, **kwargs) elif not scipy.sparse.issparse(rec): rec = scipy.sparse.csc_matrix(rec) if rec.shape[0] != S.shape[axis] or rec.shape[0] != rec.shape[1]: raise ParameterError('Invalid self-similarity matrix shape ' 'rec.shape={} for S.shape={}'.format( rec.shape, S.shape)) return __nn_filter_helper(rec.data, rec.indices, rec.indptr, S.swapaxes(0, axis), aggregate).swapaxes(0, axis)
def onset_strength_multi(y=None, sr=48000, S=None, lag=1, max_size=1, detrend=False, center=True, feature=None, aggregate=None, channels=None, **kwargs): """Compute a spectral flux onset strength envelope across multiple channels. Onset strength for channel `i` at time `t` is determined by: `mean_{f in channels[i]} max(0, S[f, t+1] - S[f, t])` Parameters ---------- y : np.ndarray [shape=(n,)] audio time-series sr : number > 0 [scalar] sampling rate of `y` S : np.ndarray [shape=(d, m)] pre-computed (log-power) spectrogram lag : int > 0 time lag for computing differences max_size : int > 0 size (in frequency bins) of the local max filter. set to `1` to disable filtering. detrend : bool [scalar] Filter the onset strength to remove the DC component center : bool [scalar] Shift the onset function by `n_fft / (2 * hop_length)` frames feature : function Function for computing time-series features, eg, scaled spectrograms. By default, uses `librosa.feature.melspectrogram` with `fmax=11025.0` aggregate : function Aggregation function to use when combining onsets at different frequency bins. Default: `np.mean` channels : list or None Array of channel boundaries or slice objects. If `None`, then a single channel is generated to span all bands. kwargs : additional keyword arguments Additional parameters to `feature()`, if `S` is not provided. Returns ------- onset_envelope : np.ndarray [shape=(n_channels, m)] array containing the onset strength envelope for each specified channel Raises ------ ParameterError if neither `(y, sr)` nor `S` are provided See Also -------- onset_strength Notes ----- This function caches at level 30. Examples -------- First, load some audio and plot the spectrogram >>> import matplotlib.pyplot as plt >>> y, sr = librosa.load(librosa.util.example_audio_file(), ... duration=10.0) >>> D = np.abs(librosa.stft(y)) >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max), ... y_axis='log') >>> plt.title('Power spectrogram') Construct a standard onset function over four sub-bands >>> onset_subbands = librosa.onset.onset_strength_multi(y=y, sr=sr, ... channels=[0, 32, 64, 96, 128]) >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(onset_subbands, x_axis='time') >>> plt.ylabel('Sub-bands') >>> plt.title('Sub-band onset strength') """ if feature is None: feature = melspectrogram kwargs.setdefault('fmax', 11025.0) if aggregate is None: aggregate = np.mean if lag < 1 or not isinstance(lag, int): raise ParameterError('lag must be a positive integer') if max_size < 1 or not isinstance(max_size, int): raise ParameterError('max_size must be a positive integer') # First, compute mel spectrogram if S is None: S = np.abs(feature(y=y, sr=sr, **kwargs)) # Convert to dBs S = power_to_db(S) # Retrieve the n_fft and hop_length, # or default values for onsets if not provided n_fft = kwargs.get('n_fft', 2048) hop_length = kwargs.get('hop_length', 512) # Ensure that S is at least 2-d S = np.atleast_2d(S) # Compute the reference spectrogram. # Efficiency hack: skip filtering step and pass by reference # if max_size will produce a no-op. if max_size == 1: ref_spec = S else: ref_spec = scipy.ndimage.maximum_filter1d(S, max_size, axis=0) # Compute difference to the reference, spaced by lag onset_env = S[:, lag:] - ref_spec[:, :-lag] # Discard negatives (decreasing amplitude) onset_env = np.maximum(0.0, onset_env) # Aggregate within channels pad = True if channels is None: channels = [slice(None)] else: pad = False onset_env = sync(onset_env, channels, aggregate=aggregate, pad=pad, axis=0) # compensate for lag pad_width = lag if center: # Counter-act framing effects. Shift the onsets by n_fft / hop_length pad_width += n_fft // (2 * hop_length) onset_env = np.pad(onset_env, ([0, 0], [int(pad_width), 0]), mode='constant') # remove the DC component if detrend: onset_env = scipy.signal.lfilter([1.0, -1.0], [1.0, -0.99], onset_env, axis=-1) # Trim to match the input duration if center: onset_env = onset_env[:, :S.shape[1]] return onset_env
def cross_similarity(data, data_ref, k=None, metric='euclidean', sparse=False, mode='connectivity', bandwidth=None): '''Compute cross-similarity from one data sequence to a reference sequence. The output is a matrix `xsim`: `xsim[i, j]` is non-zero if `data_ref[:, i]` is a k-nearest neighbor of `data[:, j]`. Parameters ---------- data : np.ndarray [shape=(d, n)] A feature matrix for the comparison sequence data_ref : np.ndarray [shape=(d, n_ref)] A feature matrix for the reference sequence k : int > 0 [scalar] or None the number of nearest-neighbors for each sample Default: `k = 2 * ceil(sqrt(n_ref))`, or `k = 2` if `n_ref <= 3` metric : str Distance metric to use for nearest-neighbor calculation. See `sklearn.neighbors.NearestNeighbors` for details. sparse : bool [scalar] if False, returns a dense type (ndarray) if True, returns a sparse type (scipy.sparse.csc_matrix) mode : str, {'connectivity', 'distance', 'affinity'} If 'connectivity', a binary connectivity matrix is produced. If 'distance', then a non-zero entry contains the distance between points. If 'affinity', then non-zero entries are mapped to `exp( - distance(i, j) / bandwidth)` where `bandwidth` is as specified below. bandwidth : None or float > 0 If using ``mode='affinity'``, this can be used to set the bandwidth on the affinity kernel. If no value is provided, it is set automatically to the median distance to the k'th nearest neighbor of each `data[:, i]`. Returns ------- xsim : np.ndarray or scipy.sparse.csc_matrix, [shape=(n_ref, n)] Cross-similarity matrix See Also -------- recurrence_matrix recurrence_to_lag feature.stack_memory sklearn.neighbors.NearestNeighbors scipy.spatial.distance.cdist Notes ----- This function caches at level 30. Examples -------- Find nearest neighbors in MFCC space between two sequences >>> hop_length = 1024 >>> y_ref, sr = librosa.load(librosa.util.example_audio_file()) >>> y_comp, sr = librosa.load(librosa.util.example_audio_file(), offset=10) >>> mfcc_ref = librosa.feature.mfcc(y=y_ref, sr=sr, hop_length=hop_length) >>> mfcc_comp = librosa.feature.mfcc(y=y_comp, sr=sr, hop_length=hop_length) >>> xsim = librosa.segment.cross_similarity(mfcc_comp, mfcc_ref) Or fix the number of nearest neighbors to 5 >>> xsim = librosa.segment.cross_similarity(mfcc_comp, mfcc_ref, k=5) Use cosine similarity instead of Euclidean distance >>> xsim = librosa.segment.cross_similarity(mfcc_comp, mfcc_ref, metric='cosine') Use an affinity matrix instead of binary connectivity >>> xsim_aff = librosa.segment.cross_similarity(mfcc_comp, mfcc_ref, mode='affinity') Plot the feature and recurrence matrices >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(8, 4)) >>> plt.subplot(1, 2, 1) >>> librosa.display.specshow(xsim, x_axis='time', y_axis='time', hop_length=hop_length) >>> plt.title('Binary recurrence (symmetric)') >>> plt.subplot(1, 2, 2) >>> librosa.display.specshow(xsim_aff, x_axis='time', y_axis='time', ... cmap='magma_r', hop_length=hop_length) >>> plt.title('Affinity recurrence') >>> plt.tight_layout() ''' data_ref = np.atleast_2d(data_ref) data = np.atleast_2d(data) if data_ref.shape[0] != data.shape[0]: raise ValueError("data_ref and data must have the same first dimension") # swap data axes so the feature axis is last data_ref = np.swapaxes(data_ref, -1, 0) n_ref = data_ref.shape[0] data_ref = data_ref.reshape((n_ref, -1)) data = np.swapaxes(data, -1, 0) n = data.shape[0] data = data.reshape((n, -1)) if mode not in ['connectivity', 'distance', 'affinity']: raise ParameterError(("Invalid mode='{}'. Must be one of " "['connectivity', 'distance', " "'affinity']").format(mode)) if k is None: k = min(n_ref, 2 * np.ceil(np.sqrt(n_ref))) k = int(k) if bandwidth is not None: if bandwidth <= 0: raise ParameterError('Invalid bandwidth={}. ' 'Must be strictly positive.'.format(bandwidth)) # Build the neighbor search object # `auto` mode does not work with some choices of metric. Rather than special-case # those here, we instead use a fall-back to brute force if auto fails. try: knn = sklearn.neighbors.NearestNeighbors(n_neighbors=min(n_ref, k), metric=metric, algorithm='auto') except ValueError: knn = sklearn.neighbors.NearestNeighbors(n_neighbors=min(n_ref, k), metric=metric, algorithm='brute') knn.fit(data_ref) # Get the knn graph if mode == 'affinity': # sklearn's nearest neighbor doesn't support affinity, # so we use distance here and then do the conversion post-hoc kng_mode = 'distance' else: kng_mode = mode xsim = knn.kneighbors_graph(X=data, mode=kng_mode).tolil() # Retain only the top-k links per point for i in range(n): # Get the links from point i links = xsim[i].nonzero()[1] # Order them ascending idx = links[np.argsort(xsim[i, links].toarray())][0] # Everything past the kth closest gets squashed xsim[i, idx[k:]] = 0 # Convert a compressed sparse row (CSR) format xsim = xsim.tocsr() xsim.eliminate_zeros() if mode == 'connectivity': xsim = xsim.astype(np.bool) elif mode == 'affinity': if bandwidth is None: bandwidth = np.nanmedian(xsim.max(axis=1).data) xsim.data[:] = np.exp(xsim.data / (-1 * bandwidth)) # Transpose to n_ref by n xsim = xsim.T if not sparse: xsim = xsim.toarray() return xsim
def subsegment(data, frames, n_segments=4, axis=-1): '''Sub-divide a segmentation by feature clustering. Given a set of frame boundaries (`frames`), and a data matrix (`data`), each successive interval defined by `frames` is partitioned into `n_segments` by constrained agglomerative clustering. .. note:: If an interval spans fewer than `n_segments` frames, then each frame becomes a sub-segment. Parameters ---------- data : np.ndarray Data matrix to use in clustering frames : np.ndarray [shape=(n_boundaries,)], dtype=int, non-negative] Array of beat or segment boundaries, as provided by `librosa.beat.beat_track`, `librosa.onset.onset_detect`, or `agglomerative`. n_segments : int > 0 Maximum number of frames to sub-divide each interval. axis : int Axis along which to apply the segmentation. By default, the last index (-1) is taken. Returns ------- boundaries : np.ndarray [shape=(n_subboundaries,)] List of sub-divided segment boundaries See Also -------- agglomerative : Temporal segmentation librosa.onset.onset_detect : Onset detection librosa.beat.beat_track : Beat tracking Notes ----- This function caches at level 30. Examples -------- Load audio, detect beat frames, and subdivide in twos by CQT >>> y, sr = librosa.load(librosa.util.example_audio_file(), duration=8) >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=512) >>> beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=512) >>> cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=512)) >>> subseg = librosa.segment.subsegment(cqt, beats, n_segments=2) >>> subseg_t = librosa.frames_to_time(subseg, sr=sr, hop_length=512) >>> subseg array([ 0, 2, 4, 21, 23, 26, 43, 55, 63, 72, 83, 97, 102, 111, 122, 137, 142, 153, 162, 180, 182, 185, 202, 210, 221, 231, 241, 256, 261, 271, 281, 296, 301, 310, 320, 339, 341, 344, 361, 368, 382, 389, 401, 416, 420, 430, 436, 451, 456, 465, 476, 489, 496, 503, 515, 527, 535, 544, 553, 558, 571, 578, 590, 607, 609, 638]) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> librosa.display.specshow(librosa.amplitude_to_db(cqt, ... ref=np.max), ... y_axis='cqt_hz', x_axis='time') >>> lims = plt.gca().get_ylim() >>> plt.vlines(beat_times, lims[0], lims[1], color='lime', alpha=0.9, ... linewidth=2, label='Beats') >>> plt.vlines(subseg_t, lims[0], lims[1], color='linen', linestyle='--', ... linewidth=1.5, alpha=0.5, label='Sub-beats') >>> plt.legend(frameon=True, shadow=True) >>> plt.title('CQT + Beat and sub-beat markers') >>> plt.tight_layout() >>> plt.show() ''' frames = fix_frames(frames, x_min=0, x_max=data.shape[axis], pad=True) if n_segments < 1: raise ParameterError('n_segments must be a positive integer') boundaries = [] idx_slices = [slice(None)] * data.ndim for seg_start, seg_end in zip(frames[:-1], frames[1:]): idx_slices[axis] = slice(seg_start, seg_end) boundaries.extend(seg_start + agglomerative(data[tuple(idx_slices)], min(seg_end - seg_start, n_segments), axis=axis)) return np.ascontiguousarray(boundaries)
def cqt( y, sr=48000, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, #real=util.Deprecated(), pad_mode='reflect'): '''Compute the constant-Q transform of an audio signal. This implementation is based on the recursive sub-sampling method described by [1]_. .. [1] Schoerkhuber, Christian, and Anssi Klapuri. "Constant-Q transform toolbox for music processing." 7th Sound and Music Computing Conference, Barcelona, Spain. 2010. Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 [scalar] sampling rate of `y` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to C1 ~= 32.70 Hz n_bins : int > 0 [scalar] Number of frequency bins, starting at `fmin` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float in `[-0.5, 0.5)` Tuning offset in fractions of a bin (cents). If `None`, tuning will be automatically estimated from the signal. filter_scale : float > 0 Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to `sparsity` fraction of the energy in each basis. Set `sparsity=0` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If `True`, scale the CQT response by square-root the length of each channel's filter. This is analogous to `norm='ortho'` in FFT. If `False`, do not scale the CQT. This is analogous to `norm=None` in FFT. real : bool [DEPRECATED] If `False`, return a complex-valued constant-Q transform (default). If `True`, return the CQT magnitude. .. warning:: This parameter is deprecated in librosa 0.5.0 It will be removed in librosa 0.6.0. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.core.stft` and `np.pad`. Returns ------- CQT : np.ndarray [shape=(n_bins, t), dtype=np.complex or np.float] Constant-Q value each frequency at each time. Raises ------ ParameterError If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)` Or if `y` is too short to support the frequency range of the CQT. See Also -------- librosa.core.resample librosa.util.normalize Notes ----- This function caches at level 20. Examples -------- Generate and plot a constant-Q power spectrum >>> import matplotlib.pyplot as plt >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> C = librosa.cqt(y, sr=sr) >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max), ... sr=sr, x_axis='time', y_axis='cqt_note') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Constant-Q power spectrum') >>> plt.tight_layout() Limit the frequency range >>> C = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'), ... n_bins=60) >>> C array([[ 8.827e-04, 9.293e-04, ..., 3.133e-07, 2.942e-07], [ 1.076e-03, 1.068e-03, ..., 1.153e-06, 1.148e-06], ..., [ 1.042e-07, 4.087e-07, ..., 1.612e-07, 1.928e-07], [ 2.363e-07, 5.329e-07, ..., 1.294e-07, 1.611e-07]]) Using a higher frequency resolution >>> C = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'), ... n_bins=60 * 2, bins_per_octave=12 * 2) >>> C array([[ 1.536e-05, 5.848e-05, ..., 3.241e-07, 2.453e-07], [ 1.856e-03, 1.854e-03, ..., 2.397e-08, 3.549e-08], ..., [ 2.034e-07, 4.245e-07, ..., 6.213e-08, 1.463e-07], [ 4.896e-08, 5.407e-07, ..., 9.176e-08, 1.051e-07]]) ''' # How many octaves are we dealing with? n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) n_filters = min(bins_per_octave, n_bins) len_orig = len(y) if fmin is None: # C1 by default fmin = note_to_hz('C1') if tuning is None: tuning = estimate_tuning(y=y, sr=sr) # First thing, get the freqs of the top octave freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)[-bins_per_octave:] fmin_t = np.min(freqs) fmax_t = np.max(freqs) # Determine required resampling quality Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1) filter_cutoff = fmax_t * (1 + 0.5 * window_bandwidth(window) / Q) nyquist = sr / 2.0 if filter_cutoff < BW_FASTEST * nyquist: res_type = 'kaiser_fast' else: res_type = 'kaiser_best' y, sr, hop_length = __early_downsample(y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale) cqt_resp = [] if res_type != 'kaiser_fast': # Do the top octave before resampling to allow for fast resampling fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) # Compute the CQT filter response and append it to the stack cqt_resp.append( __cqt_response(y, n_fft, hop_length, fft_basis, pad_mode)) fmin_t /= 2 fmax_t /= 2 n_octaves -= 1 filter_cutoff = fmax_t * (1 + 0.5 * window_bandwidth(window) / Q) res_type = 'kaiser_fast' # Make sure our hop is long enough to support the bottom octave num_twos = __num_two_factors(hop_length) if num_twos < n_octaves - 1: raise ParameterError('hop_length must be a positive integer ' 'multiple of 2^{0:d} for {1:d}-octave CQT'.format( n_octaves - 1, n_octaves)) # Now do the recursive bit fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) my_y, my_sr, my_hop = y, sr, hop_length # Iterate down the octaves for i in range(n_octaves): # Resample (except first time) if i > 0: if len(my_y) < 2: raise ParameterError('Input signal length={} is too short for ' '{:d}-octave CQT'.format( len_orig, n_octaves)) # The additional scaling of sqrt(2) here is to implicitly rescale # the filters my_y = np.sqrt(2) * resample( my_y, my_sr, my_sr / 2.0, res_type=res_type, scale=True) my_sr /= 2.0 my_hop //= 2 # Compute the cqt filter response and append to the stack cqt_resp.append( __cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode)) C = __trim_stack(cqt_resp, n_bins) if scale: lengths = constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, window=window, filter_scale=filter_scale) C /= np.sqrt(lengths[:, np.newaxis]) # if not isinstance(real, util.Deprecated): # warn('Real-valued CQT (real=True) is deprecated in 0.4.2. ' # 'The `real` parameter will be removed in 0.6.0.' # 'Use np.abs(librosa.cqt(...)) ' # 'instead of real=True to maintain forward compatibility.', # DeprecationWarning) # if real: # C = np.abs(C) return C
def tonnetz(y=None, sr=48000, chroma=None): '''Computes the tonal centroid features (tonnetz), following the method of [1]_. .. [1] Harte, C., Sandler, M., & Gasser, M. (2006). "Detecting Harmonic Change in Musical Audio." In Proceedings of the 1st ACM Workshop on Audio and Music Computing Multimedia (pp. 21-26). Santa Barbara, CA, USA: ACM Press. doi:10.1145/1178723.1178727. Parameters ---------- y : np.ndarray [shape=(n,)] or None Audio time series. sr : number > 0 [scalar] sampling rate of `y` chroma : np.ndarray [shape=(n_chroma, t)] or None Normalized energy for each chroma bin at each frame. If `None`, a cqt chromagram is performed. Returns ------- tonnetz : np.ndarray [shape(6, t)] Tonal centroid features for each frame. Tonnetz dimensions: - 0: Fifth x-axis - 1: Fifth y-axis - 2: Minor x-axis - 3: Minor y-axis - 4: Major x-axis - 5: Major y-axis See Also -------- chroma_cqt Compute a chromagram from a constant-Q transform. chroma_stft Compute a chromagram from an STFT spectrogram or waveform. Examples -------- Compute tonnetz features from the harmonic component of a song >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> y = librosa.effects.harmonic(y) >>> tonnetz = librosa.feature.tonnetz(y=y, sr=sr) >>> tonnetz array([[-0.073, -0.053, ..., -0.054, -0.073], [ 0.001, 0.001, ..., -0.054, -0.062], ..., [ 0.039, 0.034, ..., 0.044, 0.064], [ 0.005, 0.002, ..., 0.011, 0.017]]) Compare the tonnetz features to `chroma_cqt` >>> import matplotlib.pyplot as plt >>> plt.subplot(2, 1, 1) >>> librosa.display.specshow(tonnetz, y_axis='tonnetz') >>> plt.colorbar() >>> plt.title('Tonal Centroids (Tonnetz)') >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.feature.chroma_cqt(y, sr=sr), ... y_axis='chroma', x_axis='time') >>> plt.colorbar() >>> plt.title('Chroma') >>> plt.tight_layout() ''' if y is None and chroma is None: raise ParameterError('Either the audio samples or the chromagram must be ' 'passed as an argument.') if chroma is None: chroma = chroma_cqt(y=y, sr=sr) # Generate Transformation matrix dim_map = np.linspace(0, 12, num=chroma.shape[0], endpoint=False) scale = np.asarray([7. / 6, 7. / 6, 3. / 2, 3. / 2, 2. / 3, 2. / 3]) V = np.multiply.outer(scale, dim_map) # Even rows compute sin() V[::2] -= 0.5 R = np.array([1, 1, # Fifths 1, 1, # Minor 0.5, 0.5]) # Major phi = R[:, np.newaxis] * np.cos(np.pi * V) # Do the transform to tonnetz return phi.dot(normalize(chroma, norm=1, axis=0))
def lag_to_recurrence(lag, axis=-1): '''Convert a lag matrix into a recurrence matrix. Parameters ---------- lag : np.ndarray or scipy.sparse.spmatrix A lag matrix, as produced by `recurrence_to_lag` axis : int The axis corresponding to the time dimension. The alternate axis will be interpreted in lag coordinates. Returns ------- rec : np.ndarray or scipy.sparse.spmatrix [shape=(n, n)] A recurrence matrix in (time, time) coordinates For sparse matrices, format will match that of `lag`. Raises ------ ParameterError : if `lag` does not have the correct shape See Also -------- recurrence_to_lag Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> hop_length = 1024 >>> mfccs = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length) >>> recurrence = librosa.segment.recurrence_matrix(mfccs) >>> lag_pad = librosa.segment.recurrence_to_lag(recurrence, pad=True) >>> lag_nopad = librosa.segment.recurrence_to_lag(recurrence, pad=False) >>> rec_pad = librosa.segment.lag_to_recurrence(lag_pad) >>> rec_nopad = librosa.segment.lag_to_recurrence(lag_nopad) >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(8, 4)) >>> plt.subplot(2, 2, 1) >>> librosa.display.specshow(lag_pad, x_axis='time', y_axis='lag', ... hop_length=hop_length) >>> plt.title('Lag (zero-padded)') >>> plt.subplot(2, 2, 2) >>> librosa.display.specshow(lag_nopad, x_axis='time', y_axis='time', ... hop_length=hop_length) >>> plt.title('Lag (no padding)') >>> plt.subplot(2, 2, 3) >>> librosa.display.specshow(rec_pad, x_axis='time', y_axis='time', ... hop_length=hop_length) >>> plt.title('Recurrence (with padding)') >>> plt.subplot(2, 2, 4) >>> librosa.display.specshow(rec_nopad, x_axis='time', y_axis='time', ... hop_length=hop_length) >>> plt.title('Recurrence (without padding)') >>> plt.tight_layout() >>> plt.show() ''' if axis not in [0, 1, -1]: raise ParameterError('Invalid target axis: {}'.format(axis)) axis = np.abs(axis) if lag.ndim != 2 or (lag.shape[0] != lag.shape[1] and lag.shape[1 - axis] != 2 * lag.shape[axis]): raise ParameterError('Invalid lag matrix shape: {}'.format(lag.shape)) # Since lag must be 2-dimensional, abs(axis) = axis t = lag.shape[axis] sparse = scipy.sparse.issparse(lag) if sparse: rec = scipy.sparse.lil_matrix(lag) roll_ax = 1 - axis else: rec = lag.copy() roll_ax = None idx_slice = [slice(None)] * lag.ndim for i in range(1, t): idx_slice[axis] = i rec[tuple(idx_slice)] = roll_sparse(lag[tuple(idx_slice)], i, axis=roll_ax) sub_slice = [slice(None)] * rec.ndim sub_slice[1 - axis] = slice(t) rec = rec[tuple(sub_slice)] if sparse: return rec.asformat(lag.format) return np.ascontiguousarray(rec.T).T
def spectral_contrast(y=None, sr=48000, S=None, n_fft=2048, hop_length=512, freq=None, fmin=200.0, n_bands=6, quantile=0.02, linear=False): '''Compute spectral contrast [1]_ .. [1] Jiang, Dan-Ning, Lie Lu, Hong-Jiang Zhang, Jian-Hua Tao, and Lian-Hong Cai. "Music type classification by spectral contrast feature." In Multimedia and Expo, 2002. ICME'02. Proceedings. 2002 IEEE International Conference on, vol. 1, pp. 113-116. IEEE, 2002. Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies. fmin : float > 0 Frequency cutoff for the first bin `[0, fmin]` Subsequent bins will cover `[fmin, 2*fmin]`, `[2*fmin, 4*fmin]`, etc. n_bands : int > 1 number of frequency bands quantile : float in (0, 1) quantile for determining peaks and valleys linear : bool If `True`, return the linear difference of magnitudes: `peaks - valleys`. If `False`, return the logarithmic difference: `log(peaks) - log(valleys)`. Returns ------- contrast : np.ndarray [shape=(n_bands + 1, t)] each row of spectral contrast values corresponds to a given octave-based frequency Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> S = np.abs(librosa.stft(y)) >>> contrast = librosa.feature.spectral_contrast(S=S, sr=sr) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ... ref=np.max), ... y_axis='log') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Power spectrogram') >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(contrast, x_axis='time') >>> plt.colorbar() >>> plt.ylabel('Frequency bands') >>> plt.title('Spectral contrast') >>> plt.tight_layout() ''' S, n_fft = mag_spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) freq = np.atleast_1d(freq) if freq.ndim != 1 or len(freq) != S.shape[0]: raise ParameterError('freq.shape mismatch: expected ' '({:d},)'.format(S.shape[0])) if n_bands < 1 or not isinstance(n_bands, int): raise ParameterError('n_bands must be a positive integer') if not 0.0 < quantile < 1.0: raise ParameterError('quantile must lie in the range (0, 1)') if fmin <= 0: raise ParameterError('fmin must be a positive number') octa = np.zeros(n_bands + 2) octa[1:] = fmin * (2.0**np.arange(0, n_bands + 1)) if np.any(octa[:-1] >= 0.5 * sr): raise ParameterError('Frequency band exceeds Nyquist. ' 'Reduce either fmin or n_bands.') valley = np.zeros((n_bands + 1, S.shape[1])) peak = np.zeros_like(valley) for k, (f_low, f_high) in enumerate(zip(octa[:-1], octa[1:])): current_band = np.logical_and(freq >= f_low, freq <= f_high) idx = np.flatnonzero(current_band) if k > 0: current_band[idx[0] - 1] = True if k == n_bands: current_band[idx[-1] + 1:] = True sub_band = S[current_band] if k < n_bands: sub_band = sub_band[:-1] # Always take at least one bin from each side idx = np.rint(quantile * np.sum(current_band)) idx = int(np.maximum(idx, 1)) sortedr = np.sort(sub_band, axis=0) valley[k] = np.mean(sortedr[:idx], axis=0) peak[k] = np.mean(sortedr[-idx:], axis=0) if linear: return peak - valley else: return power_to_db(peak) - power_to_db(valley)
def match_intervals(intervals_from, intervals_to): '''Match one set of time intervals to another. This can be useful for tasks such as mapping beat timings to segments. .. note:: A target interval may be matched to multiple source intervals. Parameters ---------- intervals_from : np.ndarray [shape=(n, 2)] The time range for source intervals. The `i` th interval spans time `intervals_from[i, 0]` to `intervals_from[i, 1]`. `intervals_from[0, 0]` should be 0, `intervals_from[-1, 1]` should be the track duration. intervals_to : np.ndarray [shape=(m, 2)] Analogous to `intervals_from`. Returns ------- interval_mapping : np.ndarray [shape=(n,)] For each interval in `intervals_from`, the corresponding interval in `intervals_to`. See Also -------- match_events Raises ------ ParameterError If either array of input intervals is not the correct shape ''' if len(intervals_from) == 0 or len(intervals_to) == 0: raise ParameterError('Attempting to match empty interval list') # Verify that the input intervals has correct shape and size valid_intervals(intervals_from) valid_intervals(intervals_to) # The overlap score of a beat with a segment is defined as # max(0, min(beat_end, segment_end) - max(beat_start, segment_start)) output = np.empty(len(intervals_from), dtype=np.int) n_rows = int(spectralUtil.MAX_MEM_BLOCK / (len(intervals_to) * intervals_to.itemsize)) n_rows = max(1, n_rows) for bl_s in range(0, len(intervals_from), n_rows): bl_t = min(bl_s + n_rows, len(intervals_from)) tmp_from = intervals_from[bl_s:bl_t] starts = np.maximum.outer(tmp_from[:, 0], intervals_to[:, 0]) ends = np.minimum.outer(tmp_from[:, 1], intervals_to[:, 1]) score = np.maximum(0, ends - starts) output[bl_s:bl_t] = np.argmax(score, axis=-1) return output
def spectral_centroid(y=None, sr=48000, S=None, n_fft=2048, hop_length=512, freq=None): '''Compute the spectral centroid. Each frame of a magnitude spectrogram is normalized and treated as a distribution over frequency bins, from which the mean (centroid) is extracted per frame. Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, or a matrix of center frequencies as constructed by `librosa.core.ifgram` Returns ------- centroid : np.ndarray [shape=(1, t)] centroid frequencies See Also -------- librosa.core.stft Short-time Fourier Transform librosa.core.ifgram Instantaneous-frequency spectrogram Examples -------- From time-series input: >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> cent = librosa.feature.spectral_centroid(y=y, sr=sr) >>> cent array([[ 4382.894, 626.588, ..., 5037.07 , 5413.398]]) From spectrogram input: >>> S, phase = librosa.magphase(librosa.stft(y=y)) >>> librosa.feature.spectral_centroid(S=S) array([[ 4382.894, 626.588, ..., 5037.07 , 5413.398]]) Using variable bin center frequencies: >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> if_gram, D = librosa.ifgram(y) >>> librosa.feature.spectral_centroid(S=np.abs(D), freq=if_gram) array([[ 4420.719, 625.769, ..., 5011.86 , 5221.492]]) Plot the result >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(cent.T, label='Spectral centroid') >>> plt.ylabel('Hz') >>> plt.xticks([]) >>> plt.xlim([0, cent.shape[-1]]) >>> plt.legend() >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() ''' S, n_fft = mag_spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) if not np.isrealobj(S): raise ParameterError('Spectral centroid is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral centroid is only defined ' 'with non-negative energies') # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) if freq.ndim == 1: freq = freq.reshape((-1, 1)) # Column-normalize S return np.sum(freq * normalize(S, norm=1, axis=0), axis=0, keepdims=True)
def path_enhance(R, n, window='hann', max_ratio=2.0, min_ratio=None, n_filters=7, zero_mean=False, clip=True, **kwargs): '''Multi-angle path enhancement for self- and cross-similarity matrices. This function convolves multiple diagonal smoothing filters with a self-similarity (or recurrence) matrix R, and aggregates the result by an element-wise maximum. Technically, the output is a matrix R_smooth such that `R_smooth[i, j] = max_theta (R * filter_theta)[i, j]` where `*` denotes 2-dimensional convolution, and `filter_theta` is a smoothing filter at orientation theta. This is intended to provide coherent temporal smoothing of self-similarity matrices when there are changes in tempo. Smoothing filters are generated at evenly spaced orientations between min_ratio and max_ratio. This function is inspired by the multi-angle path enhancement of [1]_, but differs by modeling tempo differences in the space of similarity matrices rather than re-sampling the underlying features prior to generating the self-similarity matrix. .. [1] Müller, Meinard and Frank Kurth. "Enhancing similarity matrices for music audio analysis." 2006 IEEE International Conference on Acoustics Speech and Signal Processing Proceedings. Vol. 5. IEEE, 2006. .. note:: if using recurrence_matrix to construct the input similarity matrix, be sure to include the main diagonal by setting `self=True`. Otherwise, the diagonal will be suppressed, and this is likely to produce discontinuities which will pollute the smoothing filter response. Parameters ---------- R : np.ndarray The self- or cross-similarity matrix to be smoothed. Note: sparse inputs are not supported. n : int > 0 The length of the smoothing filter window : window specification The type of smoothing filter to use. See `filters.get_window` for more information on window specification formats. max_ratio : float > 0 The maximum tempo ratio to support min_ratio : float > 0 The minimum tempo ratio to support. If not provided, it will default to `1/max_ratio` n_filters : int >= 1 The number of different smoothing filters to use, evenly spaced between `min_ratio` and `max_ratio`. If `min_ratio = 1/max_ratio` (the default), using an odd number of filters will ensure that the main diagonal (ratio=1) is included. zero_mean : bool By default, the smoothing filters are non-negative and sum to one (i.e. are averaging filters). If `zero_mean=True`, then the smoothing filters are made to sum to zero by subtracting a constant value from the non-diagonal coordinates of the filter. This is primarily useful for suppressing blocks while enhancing diagonals. clip : bool If True, the smoothed similarity matrix will be thresholded at 0, and will not contain negative entries. kwargs : additional keyword arguments Additional arguments to pass to `scipy.ndimage.convolve` Returns ------- R_smooth : np.ndarray, shape=R.shape The smoothed self- or cross-similarity matrix See Also -------- filters.diagonal_filter recurrence_matrix Examples -------- Use a 51-frame diagonal smoothing filter to enhance paths in a recurrence matrix >>> y, sr = librosa.load(librosa.util.example_audio_file(), duration=30) >>> hop_length = 1024 >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length) >>> rec = librosa.segment.recurrence_matrix(chroma, mode='affinity', self=True) >>> rec_smooth = librosa.segment.path_enhance(rec, 51, window='hann', n_filters=7) Plot the recurrence matrix before and after smoothing >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(8, 4)) >>> plt.subplot(1,2,1) >>> librosa.display.specshow(rec, x_axis='time', y_axis='time', ... hop_length=hop_length) >>> plt.title('Unfiltered recurrence') >>> plt.subplot(1,2,2) >>> librosa.display.specshow(rec_smooth, x_axis='time', y_axis='time', ... hop_length=hop_length) >>> plt.title('Multi-angle enhanced recurrence') >>> plt.tight_layout() >>> plt.show() ''' if min_ratio is None: min_ratio = 1./max_ratio elif min_ratio > max_ratio: raise ParameterError('min_ratio={} cannot exceed max_ratio={}'.format(min_ratio, max_ratio)) R_smooth = None for ratio in np.logspace(np.log2(min_ratio), np.log2(max_ratio), num=n_filters, base=2): kernel = diagonal_filter(window, n, slope=ratio, zero_mean=zero_mean) if R_smooth is None: R_smooth = scipy.ndimage.convolve(R, kernel, **kwargs) else: # Compute the point-wise maximum in-place np.maximum(R_smooth, scipy.ndimage.convolve(R, kernel, **kwargs), out=R_smooth) if clip: # Clip the output in-place np.clip(R_smooth, 0, None, out=R_smooth) return R_smooth
def pad_center(data, size, axis=-1, **kwargs): '''Wrapper for np.pad to automatically center an array prior to padding. This is analogous to `str.center()` Examples -------- >>> # Generate a vector >>> data = np.ones(5) >>> librosa.util.pad_center(data, 10, mode='constant') array([ 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.]) >>> # Pad a matrix along its first dimension >>> data = np.ones((3, 5)) >>> librosa.util.pad_center(data, 7, axis=0) array([[ 0., 0., 0., 0., 0.], [ 0., 0., 0., 0., 0.], [ 1., 1., 1., 1., 1.], [ 1., 1., 1., 1., 1.], [ 1., 1., 1., 1., 1.], [ 0., 0., 0., 0., 0.], [ 0., 0., 0., 0., 0.]]) >>> # Or its second dimension >>> librosa.util.pad_center(data, 7, axis=1) array([[ 0., 1., 1., 1., 1., 1., 0.], [ 0., 1., 1., 1., 1., 1., 0.], [ 0., 1., 1., 1., 1., 1., 0.]]) Parameters ---------- data : np.ndarray Vector to be padded and centered size : int >= len(data) [scalar] Length to pad `data` axis : int Axis along which to pad and center the data kwargs : additional keyword arguments arguments passed to `np.pad()` Returns ------- data_padded : np.ndarray `data` centered and padded to length `size` along the specified axis Raises ------ ParameterError If `size < data.shape[axis]` See Also -------- numpy.pad ''' kwargs.setdefault('mode', 'constant') n = data.shape[axis] lpad = int((size - n) // 2) lengths = [(0, 0)] * data.ndim lengths[axis] = (lpad, int(size - n - lpad)) if lpad < 0: raise ParameterError(('Target size ({:d}) must be ' 'at least input size ({:d})').format(size, n)) return np.pad(data, lengths, **kwargs)
def recurrence_to_lag(rec, pad=True, axis=-1): '''Convert a recurrence matrix into a lag matrix. `lag[i, j] == rec[i+j, j]` Parameters ---------- rec : np.ndarray, or scipy.sparse.spmatrix [shape=(n, n)] A (binary) recurrence matrix, as returned by `recurrence_matrix` pad : bool If False, `lag` matrix is square, which is equivalent to assuming that the signal repeats itself indefinitely. If True, `lag` is padded with `n` zeros, which eliminates the assumption of repetition. axis : int The axis to keep as the `time` axis. The alternate axis will be converted to lag coordinates. Returns ------- lag : np.ndarray The recurrence matrix in (lag, time) (if `axis=1`) or (time, lag) (if `axis=0`) coordinates Raises ------ ParameterError : if `rec` is non-square See Also -------- recurrence_matrix lag_to_recurrence Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> hop_length = 1024 >>> mfccs = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length) >>> recurrence = librosa.segment.recurrence_matrix(mfccs) >>> lag_pad = librosa.segment.recurrence_to_lag(recurrence, pad=True) >>> lag_nopad = librosa.segment.recurrence_to_lag(recurrence, pad=False) >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(8, 4)) >>> plt.subplot(1, 2, 1) >>> librosa.display.specshow(lag_pad, x_axis='time', y_axis='lag', ... hop_length=hop_length) >>> plt.title('Lag (zero-padded)') >>> plt.subplot(1, 2, 2) >>> librosa.display.specshow(lag_nopad, x_axis='time', hop_length=hop_length) >>> plt.title('Lag (no padding)') >>> plt.tight_layout() >>> plt.show() ''' axis = np.abs(axis) if rec.ndim != 2 or rec.shape[0] != rec.shape[1]: raise ParameterError('non-square recurrence matrix shape: ' '{}'.format(rec.shape)) sparse = scipy.sparse.issparse(rec) roll_ax = None if sparse: roll_ax = 1 - axis lag_format = rec.format if axis == 0: rec = rec.tocsc() elif axis in (-1, 1): rec = rec.tocsr() t = rec.shape[axis] if sparse: if pad: kron = np.asarray([[1, 0]]).swapaxes(axis, 0) lag = scipy.sparse.kron(kron.astype(rec.dtype), rec, format='lil') else: lag = scipy.sparse.lil_matrix(rec) else: if pad: padding = [(0, 0), (0, 0)] padding[(1-axis)] = (0, t) lag = np.pad(rec, padding, mode='constant') else: lag = rec.copy() idx_slice = [slice(None)] * lag.ndim for i in range(1, t): idx_slice[axis] = i lag[tuple(idx_slice)] = roll_sparse(lag[tuple(idx_slice)], -i, axis=roll_ax) if sparse: return lag.asformat(lag_format) return np.ascontiguousarray(lag.T).T