def __init__(self, input_blocksize=256, input_stepsize=256, input_samplerate=22050, fmin=32.7, bins_per_octave=60, n_octaves=6, harmonics=(0.5, 1, 2, 3, 4, 5), buffer_size=1000): super(NYUHCQT, self).__init__() self.input_blocksize = input_blocksize self.input_stepsize = input_stepsize self.input_samplerate = input_samplerate self.fmin = fmin self.bins_per_octave = bins_per_octave self.n_octaves = n_octaves self.harmonics = harmonics self.buffer_size = buffer_size lengths = filters.constant_q_lengths(self.input_samplerate, fmin=self.fmin, n_bins=self.n_octaves * self.bins_per_octave, bins_per_octave=self.bins_per_octave, tuning=0.0, window='hann', filter_scale=1) self.buffer_margin_size = int(round(lengths[0] / self.input_blocksize)) self.buffer = np.zeros(self.input_blocksize * self.buffer_size) self.idx = self.buffer_margin_size * self.input_blocksize self.cleanup = False self.output_idx = 0 self.values = None
def get_variables(y, ndft_else, cqt_name="x", sr=22050, n_hop=512, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, pad_mode='reflect'): if fmin is None: # C1 by default fmin = librosa.time_frequency.note_to_hz('C1') if tuning is None: tuning = estimate_tuning(y=y, sr=sr) fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin, n_bins, bins_per_octave, tuning, filter_scale, norm, sparsity, hop_length=n_hop, window=window) fft_basis = np.abs(fft_basis).astype('float32').todense() fft_basis_tf = tf.constant(fft_basis, name="fft_basis_"+cqt_name, dtype='float32') if n_fft == ndft_else: dft_real_kernels_cqt_tf, dft_imag_kernels_cqt_tf = None, None else: dft_real_kernels_cqt, dft_imag_kernels_cqt = get_stft_kernels(n_fft) dft_real_kernels_cqt_tf = tf.constant(dft_real_kernels_cqt, name="dft_real_kernels_cqt_"+cqt_name, dtype='float32') dft_imag_kernels_cqt_tf = tf.constant(dft_imag_kernels_cqt, name="dft_imag_kernels_cqt_"+cqt_name, dtype='float32') if not scale: lengths = filters.constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, window=window, filter_scale=filter_scale) lengths = np.sqrt(lengths[:, np.newaxis] / n_fft).astype('float32') lengths_tf = tf.constant(lengths, name="lengths_"+cqt_name, dtype='float32') else: lengths_tf = None return dft_real_kernels_cqt_tf, dft_imag_kernels_cqt_tf, fft_basis_tf, lengths_tf
def icqt_tf(C, y, added_samples, sr=22050, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, pad_mode='reflect', use_smoothing=True, n_samples_total=None): tuning = 0.0 # How many octaves are we dealing with? n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) n_filters = min(bins_per_octave, n_bins) if scale: lengths = filters.constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, window=window, filter_scale=filter_scale) lengths_tf = tf.constant(lengths.astype('complex64'), dtype=tf.complex64) C *= tf.sqrt(lengths_tf[:, tf.newaxis]) if fmin is None: # C1 by default fmin = note_to_hz('C1') # First thing, get the freqs of the top octave freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)[-bins_per_octave:] fmin_t = np.min(freqs) fmax_t = np.max(freqs) # Determine required resampling quality Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1) filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) nyquist = sr / 2.0 if filter_cutoff < audio.BW_FASTEST * nyquist: res_type = 'kaiser_fast' else: res_type = 'kaiser_best' y, sr, hop_length = __early_downsample(y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale) cqt_resp = [] for i in range(n_octaves): cqt_resp += [ C[:, i * bins_per_octave:i * bins_per_octave + bins_per_octave, :] ] cqt_resp = cqt_resp[::-1] if n_samples_total == None: n_bins = cqt_resp[0].get_shape().as_list()[-1] n_samples_total = hop_length * n_bins print('n_samples_total:', n_samples_total) if res_type != 'kaiser_fast': # Do the top octave before resampling to allow for fast resampling fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) fft_basis = np.linalg.pinv(fft_basis) fft_basis_tf = tf.transpose(tf.constant(fft_basis.astype( np.complex64))) # Compute the CQT filter response and append it to the stack y = __icqt_response_tf(cqt_resp[0], n_fft, hop_length, fft_basis_tf, pad_mode, added_samples[0]) y = tf.image.resize_images(y[:, :, tf.newaxis, tf.newaxis], [n_samples_total, 1])[:, :, 0, 0] fmin_t /= 2 fmax_t /= 2 n_octaves -= 1 filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) res_type = 'kaiser_fast' # Make sure our hop is long enough to support the bottom octave num_twos = __num_two_factors(hop_length) if num_twos < n_octaves - 1: raise ParameterError('hop_length must be a positive integer ' 'multiple of 2^{0:d} for {1:d}-octave CQT'.format( n_octaves - 1, n_octaves)) # Now do the recursive bit fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) fft_basis_tf = tf.transpose( tf.constant(np.linalg.pinv(fft_basis.astype(np.complex64)))) my_y, my_sr, my_hop = y, sr, hop_length # Iterate down the octaves for i in range(n_octaves): # Resample (except first time) if i > 0: #my_y = audio_resample_tf(my_y, my_sr, my_sr/2.0, # res_type=res_type, # scale=True, use_smoothing=use_smoothing) # The re-scale the filters to compensate for downsampling my_sr /= 2.0 my_hop //= 2 ratio = float(sr) / my_sr # Compute the cqt filter response and append to the stack my_y = __icqt_response_tf(cqt_resp[i + 1], n_fft, my_hop, fft_basis_tf / np.sqrt(ratio), pad_mode, added_samples[i + 1]) my_y = tf.image.resize_images( my_y[:, :, tf.newaxis, tf.newaxis], [n_samples_total, 1])[:, :, 0, 0] / np.sqrt(ratio) y += my_y else: my_y = __icqt_response_tf(cqt_resp[i + 1], n_fft, my_hop, fft_basis_tf, pad_mode, added_samples[i + 1]) my_y = tf.image.resize_images(my_y[:, :, tf.newaxis, tf.newaxis], [n_samples_total, 1])[:, :, 0, 0] y += my_y #print('Octave:',i) #print('y.size:', my_y.get_shape().as_list()) #print('SR:', my_sr) #print('Hop:', my_hop) #print('New SR:',sr) return y
def cqt_tf(y, sr=22050, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, pad_mode='reflect', use_smoothing=True, return_added_samples=False, debug=False): tuning = 0.0 # How many octaves are we dealing with? n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) n_filters = min(bins_per_octave, n_bins) len_orig = y.get_shape().as_list()[1] added_samples = [] if fmin is None: # C1 by default fmin = note_to_hz('C1') # First thing, get the freqs of the top octave freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)[-bins_per_octave:] fmin_t = np.min(freqs) fmax_t = np.max(freqs) # Determine required resampling quality Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1) filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) nyquist = sr / 2.0 if filter_cutoff < audio.BW_FASTEST * nyquist: res_type = 'kaiser_fast' else: res_type = 'kaiser_best' y, sr, hop_length = __early_downsample_tf(y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale, use_smoothing) #print('y after early downsaple:', y.get_shape().as_list()[1]) cqt_resp = [] if res_type != 'kaiser_fast': # Do the top octave before resampling to allow for fast resampling fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) fft_basis = fft_basis.astype('complex64') fft_basis_tf = tf.constant(fft_basis, dtype=tf.complex64) fft_basis_tf = tf.transpose(fft_basis_tf) # Compute the CQT filter response and append it to the stack cqt_res, add_samples = __cqt_response_tf(y, n_fft, hop_length, fft_basis_tf, pad_mode, debug) cqt_resp.append(cqt_res) added_samples += [add_samples] fmin_t /= 2 fmax_t /= 2 n_octaves -= 1 filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) res_type = 'kaiser_fast' # Make sure our hop is long enough to support the bottom octave num_twos = __num_two_factors(hop_length) if num_twos < n_octaves - 1: raise ParameterError('hop_length must be a positive integer ' 'multiple of 2^{0:d} for {1:d}-octave CQT'.format( n_octaves - 1, n_octaves)) # Now do the recursive bit fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) fft_basis = fft_basis.astype('complex64') fft_basis_tf = tf.constant(fft_basis, dtype=tf.complex64) fft_basis_tf = tf.transpose(fft_basis_tf) my_y, my_sr, my_hop = y, sr, hop_length # Iterate down the octaves for i in range(n_octaves): # Resample (except first time) if i > 0: if my_y.get_shape().as_list()[1] < 2: raise ParameterError('Input signal length={} is too short for ' '{:d}-octave CQT'.format( len_orig, n_octaves)) #print('Resample from ', my_sr, 'to', my_sr/2.0) my_y = audio_resample_tf(my_y, my_sr, my_sr / 2.0, res_type=res_type, scale=True, use_smoothing=use_smoothing) # The re-scale the filters to compensate for downsampling fft_basis_tf *= np.sqrt(2) my_sr /= 2.0 my_hop //= 2 #print('y after early downsaple:', my_y.get_shape().as_list()[1]) # Compute the cqt filter response and append to the stack cqt_res, add_samples = __cqt_response_tf(my_y, n_fft, my_hop, fft_basis_tf, pad_mode, debug) cqt_resp.append(cqt_res) added_samples += [add_samples] C = __trim_stack_tf(cqt_resp, n_bins) if scale: lengths = filters.constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, window=window, filter_scale=filter_scale) lengths_tf = tf.constant(lengths.astype('complex64'), dtype=tf.complex64) C /= tf.sqrt(lengths_tf[:, tf.newaxis]) if return_added_samples: return C, added_samples else: return C
def icqt(C, sr=22050, hop_length=512, fmin=None, bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, amin=1e-6): '''Compute the inverse constant-Q transform. Given a constant-Q transform representation `C` of an audio signal `y`, this function produces an approximation `y_hat`. .. warning:: This implementation is unstable, and subject to change in future versions of librosa. We recommend that its use be limited to sonification and diagnostic applications. Parameters ---------- C : np.ndarray, [shape=(n_bins, n_frames)] Constant-Q representation as produced by `core.cqt` hop_length : int > 0 [scalar] number of samples between successive frames fmin : float > 0 [scalar] Minimum frequency. Defaults to C1 ~= 32.70 Hz tuning : float in `[-0.5, 0.5)` [scalar] Tuning offset in fractions of a bin (cents). filter_scale : float > 0 [scalar] Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to `sparsity` fraction of the energy in each basis. Set `sparsity=0` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If `True`, scale the CQT response by square-root the length of each channel's filter. This is analogous to `norm='ortho'` in FFT. If `False`, do not scale the CQT. This is analogous to `norm=None` in FFT. amin : float or None When applying squared window normalization, sample positions with coefficients below `amin` will left as is. If `None`, then `amin` is inferred as the smallest valid floating point value. Returns ------- y : np.ndarray, [shape=(n_samples), dtype=np.float] Audio time-series reconstructed from the CQT representation. See Also -------- cqt Notes ----- This function caches at level 40. Examples -------- Using default parameters >>> y, sr = librosa.load(librosa.util.example_audio_file(), duration=15) >>> C = librosa.cqt(y=y, sr=sr) >>> y_hat = librosa.icqt(C=C, sr=sr) Or with a different hop length and frequency resolution: >>> hop_length = 256 >>> bins_per_octave = 12 * 3 >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave, ... bins_per_octave=bins_per_octave) >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length, ... bins_per_octave=bins_per_octave) ''' warnings.warn( 'librosa.icqt is unstable, and subject to change in future versions. ' 'Please use with caution.') n_bins, n_frames = C.shape n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) if amin is None: amin = util.tiny(C) if fmin is None: fmin = note_to_hz('C1') freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave, tuning=tuning)[-bins_per_octave:] fmin_t = np.min(freqs) # Make the filter bank basis, lengths = filters.constant_q(sr=sr, fmin=fmin_t, n_bins=bins_per_octave, bins_per_octave=bins_per_octave, filter_scale=filter_scale, tuning=tuning, norm=norm, window=window, pad_fft=True) n_fft = basis.shape[1] # The extra factor of lengths**0.5 corrects for within-octave tapering basis = basis * np.sqrt(lengths[:, np.newaxis]) # Estimate the gain per filter bdot = basis.conj().dot(basis.T) bscale = np.sum(np.abs(bdot), axis=1) n_trim = basis.shape[1] // 2 if scale: Cnorm = np.ones(n_bins)[:, np.newaxis] else: Cnorm = filters.constant_q_lengths(sr=sr, fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, filter_scale=filter_scale, tuning=tuning, window=window)[:, np.newaxis]**0.5 y = None # Revised algorithm: # for each octave # upsample old octave # @--numba accelerate this loop? # for each basis # convolve with activation (valid-mode) # divide by window sumsquare # trim and add to total for octave in range(n_octaves - 1, -1, -1): # Compute the slice index for the current octave slice_ = slice(-(octave + 1) * bins_per_octave - 1, -(octave) * bins_per_octave - 1) # Project onto the basis C_oct = C[slice_] / Cnorm[slice_] basis_oct = basis[-C_oct.shape[0]:] y_oct = None # Make a dummy activation oct_hop = hop_length // 2**octave n = n_fft + (C_oct.shape[1] - 1) * oct_hop for i in range(basis_oct.shape[0] - 1, -1, -1): wss = filters.window_sumsquare(window, n_frames, hop_length=oct_hop, win_length=int(lengths[i]), n_fft=n_fft, norm=norm) wss *= lengths[i]**2 # Construct the response for this filter y_oct_i = np.zeros(n, dtype=C_oct.dtype) __activation_fill(y_oct_i, basis_oct[i], C_oct[i], oct_hop) # Retain only the real part # Only do window normalization for sufficiently large window # coefficients y_oct_i = y_oct_i.real / np.maximum(amin, wss) if y_oct is None: y_oct = y_oct_i else: y_oct += y_oct_i # Remove the effects of zero-padding y_oct = y_oct[n_trim:-n_trim] * bscale[i] if y is None: y = y_oct else: # Up-sample the previous buffer and add in the new one # Scipy-resampling is fast here, since it's a power-of-two relation y = audio.resample(y, 1, 2, scale=True, res_type='scipy') + y_oct return y
def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, pad_mode='reflect'): '''Compute the pseudo constant-Q transform of an audio signal. This uses a single fft size that is the smallest power of 2 that is greater than or equal to the max of: 1. The longest CQT filter 2. 2x the hop_length Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 [scalar] sampling rate of `y` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to C1 ~= 32.70 Hz n_bins : int > 0 [scalar] Number of frequency bins, starting at `fmin` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float in `[-0.5, 0.5)` Tuning offset in fractions of a bin (cents). If `None`, tuning will be automatically estimated from the signal. filter_scale : float > 0 Filter filter_scale factor. Larger values use longer windows. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to `sparsity` fraction of the energy in each basis. Set `sparsity=0` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.core.stft` and `np.pad`. Returns ------- CQT : np.ndarray [shape=(n_bins, t), dtype=np.float] Pseudo Constant-Q energy for each frequency at each time. Raises ------ ParameterError If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)` Or if `y` is too short to support the frequency range of the CQT. Notes ----- This function caches at level 20. ''' if fmin is None: # C1 by default fmin = note_to_hz('C1') if tuning is None: tuning = estimate_tuning(y=y, sr=sr) fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin, n_bins, bins_per_octave, tuning, filter_scale, norm, sparsity, hop_length=hop_length, window=window) fft_basis = np.abs(fft_basis) # Compute the magnitude STFT with Hann window D = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length, pad_mode=pad_mode)) # Project onto the pseudo-cqt basis C = fft_basis.dot(D) if scale: C /= np.sqrt(n_fft) else: lengths = filters.constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, window=window, filter_scale=filter_scale) C *= np.sqrt(lengths[:, np.newaxis] / n_fft) return C
def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, pad_mode='reflect', res_type='scipy'): '''Compute the constant-Q transform of an audio signal. This implementation is based on the recursive sub-sampling method described by [1]_. .. [1] Schoerkhuber, Christian, and Anssi Klapuri. "Constant-Q transform toolbox for music processing." 7th Sound and Music Computing Conference, Barcelona, Spain. 2010. Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 [scalar] sampling rate of `y` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to C1 ~= 32.70 Hz n_bins : int > 0 [scalar] Number of frequency bins, starting at `fmin` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float in `[-0.5, 0.5)` Tuning offset in fractions of a bin (cents). If `None`, tuning will be automatically estimated from the signal. filter_scale : float > 0 Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to `sparsity` fraction of the energy in each basis. Set `sparsity=0` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If `True`, scale the CQT response by square-root the length of each channel's filter. This is analogous to `norm='ortho'` in FFT. If `False`, do not scale the CQT. This is analogous to `norm=None` in FFT. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.core.stft` and `np.pad`. Returns ------- CQT : np.ndarray [shape=(n_bins, t), dtype=np.complex or np.float] Constant-Q value each frequency at each time. Raises ------ ParameterError If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)` Or if `y` is too short to support the frequency range of the CQT. See Also -------- librosa.core.resample librosa.util.normalize Notes ----- This function caches at level 20. Examples -------- Generate and plot a constant-Q power spectrum >>> import matplotlib.pyplot as plt >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> C = np.abs(librosa.cqt(y, sr=sr)) >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max), ... sr=sr, x_axis='time', y_axis='cqt_note') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Constant-Q power spectrum') >>> plt.tight_layout() Limit the frequency range >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'), ... n_bins=60)) >>> C array([[ 8.827e-04, 9.293e-04, ..., 3.133e-07, 2.942e-07], [ 1.076e-03, 1.068e-03, ..., 1.153e-06, 1.148e-06], ..., [ 1.042e-07, 4.087e-07, ..., 1.612e-07, 1.928e-07], [ 2.363e-07, 5.329e-07, ..., 1.294e-07, 1.611e-07]]) Using a higher frequency resolution >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'), ... n_bins=60 * 2, bins_per_octave=12 * 2)) >>> C array([[ 1.536e-05, 5.848e-05, ..., 3.241e-07, 2.453e-07], [ 1.856e-03, 1.854e-03, ..., 2.397e-08, 3.549e-08], ..., [ 2.034e-07, 4.245e-07, ..., 6.213e-08, 1.463e-07], [ 4.896e-08, 5.407e-07, ..., 9.176e-08, 1.051e-07]]) ''' # How many octaves are we dealing with? n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) n_filters = min(bins_per_octave, n_bins) len_orig = len(y) if fmin is None: # C1 by default fmin = note_to_hz('C1') if tuning is None: tuning = estimate_tuning(y=y, sr=sr) # First thing, get the freqs of the top octave freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)[-bins_per_octave:] fmin_t = np.min(freqs) fmax_t = np.max(freqs) # Determine required resampling quality Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1) filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) nyquist = sr / 2.0 y, sr, hop_length = __early_downsample(y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale) cqt_resp = [] if res_type != 'kaiser_fast': # Do the top octave before resampling to allow for fast resampling fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) # Compute the CQT filter response and append it to the stack cqt_resp.append( __cqt_response(y, n_fft, hop_length, fft_basis, pad_mode)) fmin_t /= 2 fmax_t /= 2 n_octaves -= 1 filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) res_type = 'kaiser_fast' # Make sure our hop is long enough to support the bottom octave num_twos = __num_two_factors(hop_length) if num_twos < n_octaves - 1: raise ParameterError('hop_length must be a positive integer ' 'multiple of 2^{0:d} for {1:d}-octave CQT'.format( n_octaves - 1, n_octaves)) # Now do the recursive bit fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin_t, n_filters, bins_per_octave, tuning, filter_scale, norm, sparsity, window=window) my_y, my_sr, my_hop = y, sr, hop_length # Iterate down the octaves for i in range(n_octaves): # Resample (except first time) if i > 0: if len(my_y) < 2: raise ParameterError('Input signal length={} is too short for ' '{:d}-octave CQT'.format( len_orig, n_octaves)) my_y = audio.resample(my_y, my_sr, my_sr / 2.0, res_type=res_type, scale=True) # The re-scale the filters to compensate for downsampling fft_basis[:] *= np.sqrt(2) my_sr /= 2.0 my_hop //= 2 # Compute the cqt filter response and append to the stack cqt_resp.append( __cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode)) C = __trim_stack(cqt_resp, n_bins) if scale: lengths = filters.constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, window=window, filter_scale=filter_scale) C /= np.sqrt(lengths[:, np.newaxis]) return C
def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1, sparsity=0.01, window='hann', scale=True, pad_mode='reflect'): '''Compute the hybrid constant-Q transform of an audio signal. Here, the hybrid CQT uses the pseudo CQT for higher frequencies where the hop_length is longer than half the filter length and the full CQT for lower frequencies. Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 [scalar] sampling rate of `y` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to C1 ~= 32.70 Hz n_bins : int > 0 [scalar] Number of frequency bins, starting at `fmin` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float in `[-0.5, 0.5)` Tuning offset in fractions of a bin (cents). If `None`, tuning will be automatically estimated from the signal. filter_scale : float > 0 Filter filter_scale factor. Larger values use longer windows. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to `sparsity` fraction of the energy in each basis. Set `sparsity=0` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.core.stft` and `np.pad`. Returns ------- CQT : np.ndarray [shape=(n_bins, t), dtype=np.float] Constant-Q energy for each frequency at each time. Raises ------ ParameterError If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)` Or if `y` is too short to support the frequency range of the CQT. See Also -------- cqt pseudo_cqt Notes ----- This function caches at level 20. ''' if fmin is None: # C1 by default fmin = note_to_hz('C1') if tuning is None: tuning = estimate_tuning(y=y, sr=sr) # Get all CQT frequencies freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave, tuning=tuning) # Compute the length of each constant-Q basis function lengths = filters.constant_q_lengths(sr, fmin, n_bins=n_bins, bins_per_octave=bins_per_octave, tuning=tuning, filter_scale=filter_scale, window=window) # Determine which filters to use with Pseudo CQT # These are the ones that fit within 2 hop lengths after padding pseudo_filters = 2.0**np.ceil(np.log2(lengths)) < 2 * hop_length n_bins_pseudo = int(np.sum(pseudo_filters)) n_bins_full = n_bins - n_bins_pseudo cqt_resp = [] if n_bins_pseudo > 0: fmin_pseudo = np.min(freqs[pseudo_filters]) cqt_resp.append( pseudo_cqt(y, sr, hop_length=hop_length, fmin=fmin_pseudo, n_bins=n_bins_pseudo, bins_per_octave=bins_per_octave, tuning=tuning, filter_scale=filter_scale, norm=norm, sparsity=sparsity, window=window, scale=scale, pad_mode=pad_mode)) if n_bins_full > 0: cqt_resp.append( np.abs( cqt(y, sr, hop_length=hop_length, fmin=fmin, n_bins=n_bins_full, bins_per_octave=bins_per_octave, tuning=tuning, filter_scale=filter_scale, norm=norm, sparsity=sparsity, window=window, scale=scale, pad_mode=pad_mode))) return __trim_stack(cqt_resp, n_bins)