예제 #1
0
    def __init__(self,
                 input_blocksize=256,
                 input_stepsize=256,
                 input_samplerate=22050,
                 fmin=32.7,
                 bins_per_octave=60,
                 n_octaves=6,
                 harmonics=(0.5, 1, 2, 3, 4, 5),
                 buffer_size=1000):
        """Record the analysis configuration and allocate the sample buffer.

        Parameters
        ----------
        input_blocksize : int
            Block size; the buffer holds ``input_blocksize * buffer_size``
            values.
        input_stepsize : int
            Stored on the instance; not otherwise used here.
        input_samplerate : number
            Sampling rate handed to ``filters.constant_q_lengths``.
        fmin : float
            Minimum frequency handed to ``filters.constant_q_lengths``.
        bins_per_octave : int
            Number of bins per octave.
        n_octaves : int
            Number of octaves; the filter bank has
            ``n_octaves * bins_per_octave`` bins.
        harmonics : sequence of numbers
            Stored on the instance; not otherwise used here.
        buffer_size : int
            Buffer capacity, expressed in blocks.
        """
        super(NYUHCQT, self).__init__()

        # Configuration is kept verbatim on the instance.
        self.input_blocksize = input_blocksize
        self.input_stepsize = input_stepsize
        self.input_samplerate = input_samplerate
        self.fmin = fmin
        self.bins_per_octave = bins_per_octave
        self.n_octaves = n_octaves
        self.harmonics = harmonics
        self.buffer_size = buffer_size

        # Filter lengths for the whole bank (Hann window, no tuning offset).
        total_bins = self.n_octaves * self.bins_per_octave
        filter_lengths = filters.constant_q_lengths(
            self.input_samplerate,
            fmin=self.fmin,
            n_bins=total_bins,
            bins_per_octave=self.bins_per_octave,
            tuning=0.0,
            window='hann',
            filter_scale=1)

        # Margin = length of the first filter, rounded to a whole number of
        # blocks (presumably the longest filter, as it sits at `fmin`).
        margin_blocks = int(round(filter_lengths[0] / self.input_blocksize))
        self.buffer_margin_size = margin_blocks

        # Zero-filled buffer; the write index starts just after the margin.
        self.buffer = np.zeros(self.input_blocksize * self.buffer_size)
        self.idx = self.buffer_margin_size * self.input_blocksize

        # Runtime state.
        self.cleanup = False
        self.output_idx = 0
        self.values = None
예제 #2
0
def get_variables(y, ndft_else,
                  cqt_name="x", sr=22050, n_hop=512, fmin=None, n_bins=84,
                  bins_per_octave=12, tuning=0.0, filter_scale=1,
                  norm=1, sparsity=0.01, window='hann', scale=True,
                  pad_mode='reflect'):
    """Build the TensorFlow constants needed for a pseudo-CQT layer.

    Parameters
    ----------
    y : audio signal
        Only used to estimate the tuning when ``tuning is None``.
    ndft_else : int
        FFT size for which DFT kernels already exist elsewhere; when the CQT
        FFT size equals it, no new DFT kernels are created.
    cqt_name : str
        Suffix appended to the names of the created constants.
    sr, n_hop, fmin, n_bins, bins_per_octave, tuning, filter_scale, norm,
    sparsity, window
        Forwarded to ``__cqt_filter_fft`` (``n_hop`` as ``hop_length``).
        ``fmin`` defaults to the frequency of note C1.
    scale : bool
        If false, a per-bin ``sqrt(length / n_fft)`` constant is also built.
    pad_mode : str
        Accepted for signature compatibility; unused in this function.

    Returns
    -------
    tuple
        ``(dft_real_kernels, dft_imag_kernels, fft_basis, lengths)``. The two
        kernels are ``None`` when ``n_fft == ndft_else``; ``lengths`` is
        ``None`` when ``scale`` is true.
    """
    # Resolve the optional arguments.
    if fmin is None:
        fmin = librosa.time_frequency.note_to_hz('C1')
    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr)

    basis, n_fft, _ = __cqt_filter_fft(sr, fmin, n_bins,
                                       bins_per_octave,
                                       tuning, filter_scale,
                                       norm, sparsity,
                                       hop_length=n_hop,
                                       window=window)

    # Magnitude of the basis as a dense float32 matrix.
    basis_dense = np.abs(basis).astype('float32').todense()
    fft_basis_tf = tf.constant(basis_dense,
                               name="fft_basis_" + cqt_name,
                               dtype='float32')

    # DFT kernels are only required when the FFT size differs from the one
    # the caller already has.
    dft_real_kernels_cqt_tf = None
    dft_imag_kernels_cqt_tf = None
    if n_fft != ndft_else:
        real_kernels, imag_kernels = get_stft_kernels(n_fft)
        dft_real_kernels_cqt_tf = tf.constant(
            real_kernels,
            name="dft_real_kernels_cqt_" + cqt_name,
            dtype='float32')
        dft_imag_kernels_cqt_tf = tf.constant(
            imag_kernels,
            name="dft_imag_kernels_cqt_" + cqt_name,
            dtype='float32')

    # Per-bin normalisation, needed only for the unscaled variant.
    lengths_tf = None
    if not scale:
        filt_lengths = filters.constant_q_lengths(
            sr, fmin,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            tuning=tuning,
            window=window,
            filter_scale=filter_scale)
        norm_factors = np.sqrt(
            filt_lengths[:, np.newaxis] / n_fft).astype('float32')
        lengths_tf = tf.constant(norm_factors,
                                 name="lengths_" + cqt_name,
                                 dtype='float32')

    return (dft_real_kernels_cqt_tf, dft_imag_kernels_cqt_tf,
            fft_basis_tf, lengths_tf)
예제 #3
0
def icqt_tf(C,
            y,
            added_samples,
            sr=22050,
            hop_length=512,
            fmin=None,
            n_bins=84,
            bins_per_octave=12,
            filter_scale=1,
            norm=1,
            sparsity=0.01,
            window='hann',
            scale=True,
            pad_mode='reflect',
            use_smoothing=True,
            n_samples_total=None):
    """Approximately invert a TensorFlow constant-Q transform, octave by octave.

    Each octave of `C` is projected back through the pseudo-inverse of the
    CQT filter basis, stretched to a common length, and the per-octave
    signals are summed.

    Parameters
    ----------
    C : tensor
        CQT coefficients. Sliced as ``C[:, bins, :]``, so presumably laid out
        as (batch, n_bins, n_frames) -- TODO confirm against the caller.
    y : signal
        Passed through ``__early_downsample`` to obtain the working sampling
        rate and hop length. On the top-octave path it is then replaced by
        the reconstruction.
    added_samples : sequence
        Per-octave padding information, indexed in the same order as the
        octaves are processed (highest octave first). Presumably the list
        returned by ``cqt_tf(..., return_added_samples=True)`` -- verify.
    sr, hop_length, fmin, n_bins, bins_per_octave, filter_scale, norm,
    sparsity, window
        CQT parameters; they should match the forward transform.
        ``fmin`` defaults to the frequency of note C1.
    scale : bool
        If true, multiply `C` by the square root of the filter lengths first.
    pad_mode : str
        Forwarded to ``__icqt_response_tf``.
    use_smoothing : bool
        Currently unused (only referenced by disabled resampling code).
    n_samples_total : int or None
        Output length. Defaults to ``hop_length`` times the size of the last
        axis of the top-octave slice.

    Returns
    -------
    tensor
        Sum of the per-octave reconstructions.

    Raises
    ------
    ParameterError
        If `hop_length` does not have enough factors of two for the number
        of octaves.
    """
    tuning = 0.0

    # Resolve the default before anything consumes `fmin`. The original
    # passed `fmin=None` straight into `constant_q_lengths` when `scale`
    # was true, which made the default call fail.
    if fmin is None:
        # C1 by default
        fmin = note_to_hz('C1')

    # How many octaves are we dealing with?
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
    n_filters = min(bins_per_octave, n_bins)

    if scale:
        lengths = filters.constant_q_lengths(sr,
                                             fmin,
                                             n_bins=n_bins,
                                             bins_per_octave=bins_per_octave,
                                             tuning=tuning,
                                             window=window,
                                             filter_scale=filter_scale)
        lengths_tf = tf.constant(lengths.astype('complex64'),
                                 dtype=tf.complex64)
        C *= tf.sqrt(lengths_tf[:, tf.newaxis])

    # First thing, get the freqs of the top octave
    freqs = cqt_frequencies(n_bins, fmin,
                            bins_per_octave=bins_per_octave)[-bins_per_octave:]

    fmin_t = np.min(freqs)
    fmax_t = np.max(freqs)

    # Determine required resampling quality
    Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1)
    filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q)
    nyquist = sr / 2.0
    if filter_cutoff < audio.BW_FASTEST * nyquist:
        res_type = 'kaiser_fast'
    else:
        res_type = 'kaiser_best'

    y, sr, hop_length = __early_downsample(y, sr, hop_length, res_type,
                                           n_octaves, nyquist, filter_cutoff,
                                           scale)

    # Split C into per-octave slices, then order them highest octave first.
    cqt_resp = [
        C[:, i * bins_per_octave:i * bins_per_octave + bins_per_octave, :]
        for i in range(n_octaves)
    ]
    cqt_resp = cqt_resp[::-1]

    if n_samples_total is None:
        # Kept in its own name so the `n_bins` argument is not clobbered.
        n_frames = cqt_resp[0].get_shape().as_list()[-1]
        n_samples_total = hop_length * n_frames
    print('n_samples_total:', n_samples_total)

    def _stretch_to_output(signal):
        # Resize axis 1 of a 2-D tensor to `n_samples_total` entries.
        return tf.image.resize_images(signal[:, :, tf.newaxis, tf.newaxis],
                                      [n_samples_total, 1])[:, :, 0, 0]

    if res_type != 'kaiser_fast':

        # Do the top octave before resampling to allow for fast resampling
        fft_basis, n_fft, _ = __cqt_filter_fft(sr,
                                               fmin_t,
                                               n_filters,
                                               bins_per_octave,
                                               tuning,
                                               filter_scale,
                                               norm,
                                               sparsity,
                                               window=window)

        fft_basis = np.linalg.pinv(fft_basis)
        fft_basis_tf = tf.transpose(tf.constant(fft_basis.astype(
            np.complex64)))
        # Invert the top octave; this replaces the input signal in `y`.
        y = __icqt_response_tf(cqt_resp[0], n_fft, hop_length, fft_basis_tf,
                               pad_mode, added_samples[0])

        y = _stretch_to_output(y)
        fmin_t /= 2
        fmax_t /= 2
        n_octaves -= 1

        filter_cutoff = fmax_t * (1 +
                                  0.5 * filters.window_bandwidth(window) / Q)

        res_type = 'kaiser_fast'

    # NOTE(review): the loop below always reads `cqt_resp[i + 1]` and
    # `added_samples[i + 1]`, which assumes the top-octave branch above ran.
    # When it is skipped, entry 0 is never used, the last iteration indexes
    # past the end of `cqt_resp`, and `y` is still the input signal rather
    # than a reconstruction. Left unchanged here because the intended
    # starting value of `y` on that path is not clear -- please confirm.

    # Make sure our hop is long enough to support the bottom octave
    num_twos = __num_two_factors(hop_length)
    if num_twos < n_octaves - 1:
        raise ParameterError('hop_length must be a positive integer '
                             'multiple of 2^{0:d} for {1:d}-octave CQT'.format(
                                 n_octaves - 1, n_octaves))

    # Now do the recursive bit
    fft_basis, n_fft, _ = __cqt_filter_fft(sr,
                                           fmin_t,
                                           n_filters,
                                           bins_per_octave,
                                           tuning,
                                           filter_scale,
                                           norm,
                                           sparsity,
                                           window=window)

    fft_basis_tf = tf.transpose(
        tf.constant(np.linalg.pinv(fft_basis.astype(np.complex64))))

    my_sr, my_hop = sr, hop_length

    # Iterate down the octaves
    for i in range(n_octaves):

        if i > 0:
            # No audio is resampled here (that step is disabled); only the
            # bookkeeping for the halved rate and hop is updated.
            my_sr /= 2.0
            my_hop //= 2

            # Compensate the filters for the accumulated downsampling.
            gain = np.sqrt(float(sr) / my_sr)
            octave_basis = fft_basis_tf / gain
        else:
            gain = None
            octave_basis = fft_basis_tf

        # Invert this octave and stretch it to the common output length.
        my_y = __icqt_response_tf(cqt_resp[i + 1], n_fft, my_hop,
                                  octave_basis, pad_mode,
                                  added_samples[i + 1])
        my_y = _stretch_to_output(my_y)
        if gain is not None:
            my_y = my_y / gain

        y += my_y

    return y
예제 #4
0
def cqt_tf(y,
           sr=22050,
           hop_length=512,
           fmin=None,
           n_bins=84,
           bins_per_octave=12,
           filter_scale=1,
           norm=1,
           sparsity=0.01,
           window='hann',
           scale=True,
           pad_mode='reflect',
           use_smoothing=True,
           return_added_samples=False,
           debug=False):
    """Constant-Q transform of a TensorFlow signal via recursive sub-sampling.

    The top octave is analysed first (unless fast resampling is already
    sufficient). The signal is then halved in rate once per remaining
    octave, and each octave is analysed with the same filter basis.

    Parameters
    ----------
    y : tensor
        Input signal. ``y.get_shape().as_list()[1]`` is used as its length,
        so axis 1 is presumably time -- TODO confirm.
    sr : number
        Sampling rate of `y`.
    hop_length : int
        Hop between successive CQT frames.
    fmin : float or None
        Minimum frequency; defaults to the frequency of note C1.
    n_bins, bins_per_octave : int
        Total number of bins and number of bins per octave.
    filter_scale, norm, sparsity, window
        Forwarded to ``__cqt_filter_fft``.
    scale : bool
        If true, divide the result by the square root of the filter lengths.
    pad_mode : str
        Forwarded to ``__cqt_response_tf``.
    use_smoothing : bool
        Forwarded to the early-downsample and resample helpers.
    return_added_samples : bool
        If true, also return the per-octave values reported by
        ``__cqt_response_tf``.
    debug : bool
        Forwarded to ``__cqt_response_tf``.

    Returns
    -------
    C, or (C, added_samples) when `return_added_samples` is true.

    Raises
    ------
    ParameterError
        If `hop_length` lacks the required factors of two, or the signal
        becomes shorter than two samples while descending the octaves.
    """
    tuning = 0.0

    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
    n_filters = min(bins_per_octave, n_bins)

    len_orig = y.get_shape().as_list()[1]

    if fmin is None:
        # C1 by default
        fmin = note_to_hz('C1')

    # Frequency range covered by the highest octave.
    top_freqs = cqt_frequencies(
        n_bins, fmin, bins_per_octave=bins_per_octave)[-bins_per_octave:]
    fmin_t = np.min(top_freqs)
    fmax_t = np.max(top_freqs)

    # Choose the resampling quality from the top-octave filter cutoff.
    Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1)
    filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q)
    nyquist = sr / 2.0
    res_type = 'kaiser_best'
    if filter_cutoff < audio.BW_FASTEST * nyquist:
        res_type = 'kaiser_fast'

    y, sr, hop_length = __early_downsample_tf(y, sr, hop_length, res_type,
                                              n_octaves, nyquist,
                                              filter_cutoff, scale,
                                              use_smoothing)

    def _basis_as_tensor(lowest_freq):
        # Build the filter basis for one octave starting at `lowest_freq`
        # and return it as a transposed complex64 constant plus the FFT size.
        raw_basis, fft_size, _ = __cqt_filter_fft(sr,
                                                  lowest_freq,
                                                  n_filters,
                                                  bins_per_octave,
                                                  tuning,
                                                  filter_scale,
                                                  norm,
                                                  sparsity,
                                                  window=window)
        raw_basis = raw_basis.astype('complex64')
        basis_const = tf.constant(raw_basis, dtype=tf.complex64)
        return tf.transpose(basis_const), fft_size

    octave_responses = []
    added_samples = []

    if res_type != 'kaiser_fast':
        # Handle the top octave before any resampling so that the remaining
        # octaves can use the fast resampler.
        fft_basis_tf, n_fft = _basis_as_tensor(fmin_t)
        response, extra = __cqt_response_tf(y, n_fft, hop_length,
                                            fft_basis_tf, pad_mode, debug)
        octave_responses.append(response)
        added_samples.append(extra)

        fmin_t = fmin_t / 2
        fmax_t = fmax_t / 2
        n_octaves = n_octaves - 1

        filter_cutoff = fmax_t * (1 +
                                  0.5 * filters.window_bandwidth(window) / Q)

        res_type = 'kaiser_fast'

    # The hop must be divisible by two once per octave still to process.
    if __num_two_factors(hop_length) < n_octaves - 1:
        raise ParameterError('hop_length must be a positive integer '
                             'multiple of 2^{0:d} for {1:d}-octave CQT'.format(
                                 n_octaves - 1, n_octaves))

    # One basis serves every remaining octave.
    fft_basis_tf, n_fft = _basis_as_tensor(fmin_t)

    cur_y, cur_sr, cur_hop = y, sr, hop_length

    for octave in range(n_octaves):

        if octave > 0:
            # Every octave after the first works on a half-rate signal.
            if cur_y.get_shape().as_list()[1] < 2:
                raise ParameterError('Input signal length={} is too short for '
                                     '{:d}-octave CQT'.format(
                                         len_orig, n_octaves))

            cur_y = audio_resample_tf(cur_y,
                                      cur_sr,
                                      cur_sr / 2.0,
                                      res_type=res_type,
                                      scale=True,
                                      use_smoothing=use_smoothing)
            # Rescale the filters to compensate for the downsampling.
            fft_basis_tf *= np.sqrt(2)

            cur_sr /= 2.0
            cur_hop //= 2

        response, extra = __cqt_response_tf(cur_y, n_fft, cur_hop,
                                            fft_basis_tf, pad_mode, debug)
        octave_responses.append(response)
        added_samples.append(extra)

    C = __trim_stack_tf(octave_responses, n_bins)

    if scale:
        filt_lengths = filters.constant_q_lengths(
            sr,
            fmin,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            tuning=tuning,
            window=window,
            filter_scale=filter_scale)
        lengths_tf = tf.constant(filt_lengths.astype('complex64'),
                                 dtype=tf.complex64)
        C /= tf.sqrt(lengths_tf[:, tf.newaxis])

    if not return_added_samples:
        return C
    return C, added_samples
예제 #5
0
def icqt(C,
         sr=22050,
         hop_length=512,
         fmin=None,
         bins_per_octave=12,
         tuning=0.0,
         filter_scale=1,
         norm=1,
         sparsity=0.01,
         window='hann',
         scale=True,
         amin=1e-6):
    '''Compute the inverse constant-Q transform.
    Given a constant-Q transform representation `C` of an audio signal `y`,
    this function produces an approximation `y_hat`.
    .. warning:: This implementation is unstable, and subject to change in
                 future versions of librosa.  We recommend that its use be
                 limited to sonification and diagnostic applications.
    Parameters
    ----------
    C : np.ndarray, [shape=(n_bins, n_frames)]
        Constant-Q representation as produced by `core.cqt`
    sr : number > 0 [scalar]
        Sampling rate used to build the filter bank
    hop_length : int > 0 [scalar]
        number of samples between successive frames
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to C1 ~= 32.70 Hz
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave
    tuning : float in `[-0.5, 0.5)` [scalar]
        Tuning offset in fractions of a bin (cents).
    filter_scale : float > 0 [scalar]
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to `sparsity`
        fraction of the energy in each basis.
        Set `sparsity=0` to disable sparsification.
        Note: this argument is accepted but not referenced by the body of
        this function.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If `True`, scale the CQT response by square-root the length
        of each channel's filter. This is analogous to `norm='ortho'` in FFT.
        If `False`, do not scale the CQT. This is analogous to `norm=None`
        in FFT.
    amin : float or None
        When applying squared window normalization, sample positions with
        coefficients below `amin` will be left as is.
        If `None`, then `amin` is inferred as the smallest valid floating
        point value.
    Returns
    -------
    y : np.ndarray, [shape=(n_samples), dtype=np.float]
        Audio time-series reconstructed from the CQT representation.
    See Also
    --------
    cqt
    Notes
    -----
    This function caches at level 40.
    (No caching decorator is visible on this definition -- verify.)
    Examples
    --------
    Using default parameters
    >>> y, sr = librosa.load(librosa.util.example_audio_file(), duration=15)
    >>> C = librosa.cqt(y=y, sr=sr)
    >>> y_hat = librosa.icqt(C=C, sr=sr)
    Or with a different hop length and frequency resolution:
    >>> hop_length = 256
    >>> bins_per_octave = 12 * 3
    >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
    ...                 bins_per_octave=bins_per_octave)
    >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
    ...                 bins_per_octave=bins_per_octave)
    '''
    # Always warn: the reconstruction below is approximate and unstable.
    warnings.warn(
        'librosa.icqt is unstable, and subject to change in future versions. '
        'Please use with caution.')

    n_bins, n_frames = C.shape
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))

    if amin is None:
        amin = util.tiny(C)

    if fmin is None:
        fmin = note_to_hz('C1')

    # Frequencies of the top octave; its minimum anchors the filter bank
    freqs = cqt_frequencies(n_bins,
                            fmin,
                            bins_per_octave=bins_per_octave,
                            tuning=tuning)[-bins_per_octave:]

    fmin_t = np.min(freqs)

    # Make the filter bank
    # (a single octave of filters, reused for every octave in the loop below)
    basis, lengths = filters.constant_q(sr=sr,
                                        fmin=fmin_t,
                                        n_bins=bins_per_octave,
                                        bins_per_octave=bins_per_octave,
                                        filter_scale=filter_scale,
                                        tuning=tuning,
                                        norm=norm,
                                        window=window,
                                        pad_fft=True)
    n_fft = basis.shape[1]

    # The extra factor of lengths**0.5 corrects for within-octave tapering
    basis = basis * np.sqrt(lengths[:, np.newaxis])

    # Estimate the gain per filter
    bdot = basis.conj().dot(basis.T)
    bscale = np.sum(np.abs(bdot), axis=1)

    # Half the FFT size is trimmed from each end of every octave's output
    n_trim = basis.shape[1] // 2

    # Per-bin divisor applied to C: ones when `scale` is set, otherwise the
    # square root of each filter length
    if scale:
        Cnorm = np.ones(n_bins)[:, np.newaxis]
    else:
        Cnorm = filters.constant_q_lengths(sr=sr,
                                           fmin=fmin,
                                           n_bins=n_bins,
                                           bins_per_octave=bins_per_octave,
                                           filter_scale=filter_scale,
                                           tuning=tuning,
                                           window=window)[:, np.newaxis]**0.5

    y = None

    # Revised algorithm:
    #   for each octave
    #      upsample old octave
    #      @--numba accelerate this loop?
    #      for each basis
    #         convolve with activation (valid-mode)
    #         divide by window sumsquare
    #         trim and add to total

    # Octaves are visited from the largest index down to 0
    for octave in range(n_octaves - 1, -1, -1):
        # Compute the slice index for the current octave
        # NOTE(review): both bounds carry an extra "- 1", so for octave == 0
        # the slice is [-bins_per_octave - 1, -1) and the final row of C is
        # never selected.  This looks like an off-by-one -- confirm before
        # relying on the output.
        slice_ = slice(-(octave + 1) * bins_per_octave - 1,
                       -(octave) * bins_per_octave - 1)

        # Project onto the basis
        C_oct = C[slice_] / Cnorm[slice_]
        # Use the last C_oct.shape[0] filters in case the slice is short
        basis_oct = basis[-C_oct.shape[0]:]

        y_oct = None

        # Make a dummy activation
        # (the hop is halved for each octave index; n is the output length
        # for this octave before trimming)
        oct_hop = hop_length // 2**octave
        n = n_fft + (C_oct.shape[1] - 1) * oct_hop

        for i in range(basis_oct.shape[0] - 1, -1, -1):
            # NOTE(review): `lengths[i]` is indexed by position in
            # `basis_oct`; this matches the filter only when `basis_oct`
            # spans the full octave -- verify for partial octaves.
            wss = filters.window_sumsquare(window,
                                           n_frames,
                                           hop_length=oct_hop,
                                           win_length=int(lengths[i]),
                                           n_fft=n_fft,
                                           norm=norm)

            wss *= lengths[i]**2

            # Construct the response for this filter
            y_oct_i = np.zeros(n, dtype=C_oct.dtype)
            __activation_fill(y_oct_i, basis_oct[i], C_oct[i], oct_hop)
            # Retain only the real part
            # Only do window normalization for sufficiently large window
            # coefficients
            y_oct_i = y_oct_i.real / np.maximum(amin, wss)

            if y_oct is None:
                y_oct = y_oct_i
            else:
                y_oct += y_oct_i

        # Remove the effects of zero-padding
        # NOTE(review): `i` is the leftover value from the loop above (0 once
        # the loop has run), so every octave is scaled by bscale[0] --
        # confirm this is intended.
        y_oct = y_oct[n_trim:-n_trim] * bscale[i]

        if y is None:
            y = y_oct
        else:
            # Up-sample the previous buffer and add in the new one
            # Scipy-resampling is fast here, since it's a power-of-two relation
            y = audio.resample(y, 1, 2, scale=True, res_type='scipy') + y_oct

    return y
예제 #6
0
def pseudo_cqt(y,
               sr=22050,
               hop_length=512,
               fmin=None,
               n_bins=84,
               bins_per_octave=12,
               tuning=0.0,
               filter_scale=1,
               norm=1,
               sparsity=0.01,
               window='hann',
               scale=True,
               pad_mode='reflect'):
    '''Compute the pseudo constant-Q transform of an audio signal.

    A single magnitude STFT is computed and projected onto the magnitude of
    the CQT filter basis. The FFT size comes from `__cqt_filter_fft`;
    it is intended to be the smallest power of 2 that is at least as large
    as both the longest CQT filter and twice the hop length.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)]
        Audio time series.
    sr : number > 0 [scalar]
        Sampling rate of `y`.
    hop_length : int > 0 [scalar]
        Number of samples between successive CQT columns.
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to C1 ~= 32.70 Hz.
    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at `fmin`.
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave.
    tuning : None or float in `[-0.5, 0.5)`
        Tuning offset in fractions of a bin (cents).
        If `None`, the tuning is estimated from the signal.
    filter_scale : float > 0
        Filter scale factor. Larger values use longer windows.
    norm : number
        Forwarded to the basis construction.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to `sparsity`
        fraction of the energy in each basis.
        Set `sparsity=0` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If `True`, divide the result by `sqrt(n_fft)`.
        If `False`, multiply each bin by `sqrt(length / n_fft)` instead,
        where `length` is that bin's filter length.
    pad_mode : string
        Padding mode for centered frame analysis.
        See also: `librosa.core.stft` and `np.pad`.

    Returns
    -------
    CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
        Pseudo Constant-Q energy for each frequency at each time.

    Raises
    ------
    ParameterError
        If `hop_length` is not an integer multiple of
        `2**(n_bins / bins_per_octave)`
        Or if `y` is too short to support the frequency range of the CQT.

    Notes
    -----
    This function caches at level 20.
    '''
    # Fill in defaults that depend on other arguments.
    if fmin is None:
        fmin = note_to_hz('C1')
    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr)

    basis, n_fft, _ = __cqt_filter_fft(sr,
                                       fmin,
                                       n_bins,
                                       bins_per_octave,
                                       tuning,
                                       filter_scale,
                                       norm,
                                       sparsity,
                                       hop_length=hop_length,
                                       window=window)

    # Only magnitudes are used, for both the basis and the spectrogram.
    basis_mag = np.abs(basis)
    stft_mag = np.abs(
        stft(y, n_fft=n_fft, hop_length=hop_length, pad_mode=pad_mode))

    # Project the spectrogram onto the pseudo-CQT basis.
    C = basis_mag.dot(stft_mag)

    if not scale:
        filt_lengths = filters.constant_q_lengths(
            sr,
            fmin,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            tuning=tuning,
            window=window,
            filter_scale=filter_scale)

        # In-place so that the dtype of C is preserved.
        C *= np.sqrt(filt_lengths[:, np.newaxis] / n_fft)
        return C

    # In-place so that the dtype of C is preserved.
    C /= np.sqrt(n_fft)
    return C
예제 #7
0
def cqt(y,
        sr=22050,
        hop_length=512,
        fmin=None,
        n_bins=84,
        bins_per_octave=12,
        tuning=0.0,
        filter_scale=1,
        norm=1,
        sparsity=0.01,
        window='hann',
        scale=True,
        pad_mode='reflect',
        res_type='scipy'):
    '''Compute the constant-Q transform of an audio signal.

    The implementation follows the recursive sub-sampling scheme of [1]_:
    one octave of filters is applied repeatedly while the signal is
    downsampled by two between octaves.

    .. [1] Schoerkhuber, Christian, and Anssi Klapuri.
        "Constant-Q transform toolbox for music processing."
        7th Sound and Music Computing Conference, Barcelona, Spain. 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)]
        Audio time series.
    sr : number > 0 [scalar]
        Sampling rate of `y`.
    hop_length : int > 0 [scalar]
        Number of samples between successive CQT columns.
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to C1 ~= 32.70 Hz.
    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at `fmin`.
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave.
    tuning : None or float in `[-0.5, 0.5)`
        Tuning offset in fractions of a bin (cents).
        If `None`, the tuning is estimated from the signal.
    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to `sparsity`
        fraction of the energy in each basis.
        Set `sparsity=0` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If `True`, scale the CQT response by square-root the length of
        each channel's filter.  This is analogous to `norm='ortho'` in FFT.
        If `False`, do not scale the CQT. This is analogous to
        `norm=None` in FFT.
    pad_mode : string
        Padding mode for centered frame analysis.
        See also: `librosa.core.stft` and `np.pad`.
    res_type : string
        Resampling mode passed to the early-downsample step. Any value
        other than `'kaiser_fast'` makes the top octave be analysed before
        resampling; the between-octave resampling then uses `'kaiser_fast'`.

    Returns
    -------
    CQT : np.ndarray [shape=(n_bins, t), dtype=np.complex or np.float]
        Constant-Q value each frequency at each time.

    Raises
    ------
    ParameterError
        If `hop_length` is not an integer multiple of
        `2**(n_bins / bins_per_octave)`
        Or if `y` is too short to support the frequency range of the CQT.

    See Also
    --------
    librosa.core.resample
    librosa.util.normalize

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a constant-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                          sr=sr, x_axis='time', y_axis='cqt_note')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Constant-Q power spectrum')
    >>> plt.tight_layout()

    Limit the frequency range

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                 n_bins=60))

    Using a higher frequency resolution

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                 n_bins=60 * 2, bins_per_octave=12 * 2))
    '''
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
    n_filters = min(bins_per_octave, n_bins)

    # Remembered only for the "too short" error message.
    len_orig = len(y)

    if fmin is None:
        # C1 by default
        fmin = note_to_hz('C1')
    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr)

    # Frequency range covered by the highest octave.
    top_freqs = cqt_frequencies(
        n_bins, fmin, bins_per_octave=bins_per_octave)[-bins_per_octave:]
    fmin_t = np.min(top_freqs)
    fmax_t = np.max(top_freqs)

    Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1)

    def _cutoff(top_frequency):
        # Upper edge of the passband of the filter centred at top_frequency.
        return top_frequency * (1 +
                                0.5 * filters.window_bandwidth(window) / Q)

    filter_cutoff = _cutoff(fmax_t)
    nyquist = sr / 2.0

    y, sr, hop_length = __early_downsample(y, sr, hop_length, res_type,
                                           n_octaves, nyquist, filter_cutoff,
                                           scale)

    def _octave_basis(lowest_freq):
        # Filter basis (and FFT size) for one octave starting at lowest_freq.
        octave_fft_basis, fft_size, _ = __cqt_filter_fft(sr,
                                                         lowest_freq,
                                                         n_filters,
                                                         bins_per_octave,
                                                         tuning,
                                                         filter_scale,
                                                         norm,
                                                         sparsity,
                                                         window=window)
        return octave_fft_basis, fft_size

    octave_responses = []

    if res_type != 'kaiser_fast':
        # Analyse the top octave before any resampling so the remaining
        # octaves can use the fast resampler.
        fft_basis, n_fft = _octave_basis(fmin_t)
        octave_responses.append(
            __cqt_response(y, n_fft, hop_length, fft_basis, pad_mode))

        fmin_t = fmin_t / 2
        fmax_t = fmax_t / 2
        n_octaves = n_octaves - 1

        filter_cutoff = _cutoff(fmax_t)

        res_type = 'kaiser_fast'

    # The hop must be divisible by two once per octave still to process.
    if __num_two_factors(hop_length) < n_octaves - 1:
        raise ParameterError('hop_length must be a positive integer '
                             'multiple of 2^{0:d} for {1:d}-octave CQT'.format(
                                 n_octaves - 1, n_octaves))

    # One basis serves every remaining octave.
    fft_basis, n_fft = _octave_basis(fmin_t)

    cur_y, cur_sr, cur_hop = y, sr, hop_length

    for octave in range(n_octaves):

        if octave > 0:
            # Every octave after the first works on a half-rate signal.
            if len(cur_y) < 2:
                raise ParameterError('Input signal length={} is too short for '
                                     '{:d}-octave CQT'.format(
                                         len_orig, n_octaves))

            cur_y = audio.resample(cur_y,
                                   cur_sr,
                                   cur_sr / 2.0,
                                   res_type=res_type,
                                   scale=True)
            # Rescale the filters to compensate for the downsampling.
            fft_basis[:] *= np.sqrt(2)

            cur_sr /= 2.0
            cur_hop //= 2

        octave_responses.append(
            __cqt_response(cur_y, n_fft, cur_hop, fft_basis, pad_mode))

    C = __trim_stack(octave_responses, n_bins)

    if scale:
        filt_lengths = filters.constant_q_lengths(
            sr,
            fmin,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            tuning=tuning,
            window=window,
            filter_scale=filter_scale)
        # In-place so that the dtype of C is preserved.
        C /= np.sqrt(filt_lengths[:, np.newaxis])

    return C
예제 #8
0
def hybrid_cqt(y,
               sr=22050,
               hop_length=512,
               fmin=None,
               n_bins=84,
               bins_per_octave=12,
               tuning=0.0,
               filter_scale=1,
               norm=1,
               sparsity=0.01,
               window='hann',
               scale=True,
               pad_mode='reflect'):
    '''Compute a hybrid constant-Q transform of an audio signal.

    The requested bins are split into two groups according to the length
    of their basis filters.  Bins whose filter length, rounded up to the
    next power of two, is smaller than `2 * hop_length` are computed with
    `pseudo_cqt`; all remaining bins are computed with `cqt`, and the
    magnitude of that result is kept.  The two partial results are then
    combined by `__trim_stack`.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)]
        Audio time series.
    sr : number > 0 [scalar]
        Sampling rate of `y`.
    hop_length : int > 0 [scalar]
        Number of samples between successive CQT columns.
    fmin : float > 0 [scalar] or None
        Minimum frequency.  If `None`, `note_to_hz('C1')` is used.
    n_bins : int > 0 [scalar]
        Total number of frequency bins, starting at `fmin`.
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave.
    tuning : None or float
        Tuning offset in fractions of a bin.
        If `None`, it is obtained from `estimate_tuning(y=y, sr=sr)`.
    filter_scale : float > 0
        Filter scale factor, forwarded to the filter-length computation
        and to both transforms.
    norm : number
        Forwarded unchanged to `pseudo_cqt` and `cqt`.
    sparsity : float in [0, 1)
        Forwarded unchanged to `pseudo_cqt` and `cqt`.
    window : str, tuple, number, or function
        Window specification for the basis filters.
    scale : bool
        Forwarded unchanged to `pseudo_cqt` and `cqt`.
    pad_mode : string
        Padding mode, forwarded unchanged to `pseudo_cqt` and `cqt`.

    Returns
    -------
    CQT : np.ndarray
        The result of `__trim_stack` applied to the partial responses and
        `n_bins`; the original documentation describes it as having
        shape `(n_bins, t)`.

    Raises
    ------
    ParameterError
        Not raised here directly; any error raised by `pseudo_cqt` or
        `cqt` (e.g. for an unsupported `hop_length` or a too-short `y`)
        propagates to the caller.

    See Also
    --------
    cqt
    pseudo_cqt
    '''

    # Resolve the defaults that depend on helpers or on the signal itself
    if fmin is None:
        fmin = note_to_hz('C1')

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr)

    # Center frequency of every requested bin
    bin_freqs = cqt_frequencies(n_bins,
                                fmin,
                                bins_per_octave=bins_per_octave,
                                tuning=tuning)

    # Length of the basis filter belonging to every requested bin
    filter_lengths = filters.constant_q_lengths(
        sr,
        fmin,
        n_bins=n_bins,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        window=window)

    # A bin goes to the pseudo CQT when its filter, padded up to a power
    # of two, still fits within two hop lengths
    padded_lengths = 2.0**np.ceil(np.log2(filter_lengths))
    use_pseudo = padded_lengths < 2 * hop_length

    pseudo_count = int(np.sum(use_pseudo))
    full_count = n_bins - pseudo_count

    # Keyword arguments that are identical for both transforms
    shared = dict(hop_length=hop_length,
                  bins_per_octave=bins_per_octave,
                  tuning=tuning,
                  filter_scale=filter_scale,
                  norm=norm,
                  sparsity=sparsity,
                  window=window,
                  scale=scale,
                  pad_mode=pad_mode)

    # The pseudo-CQT part is appended first, the full-CQT part second;
    # this order is what `__trim_stack` receives
    responses = []

    if pseudo_count > 0:
        # The pseudo CQT starts at the lowest frequency among its bins
        pseudo_fmin = np.min(bin_freqs[use_pseudo])
        responses.append(
            pseudo_cqt(y, sr, fmin=pseudo_fmin, n_bins=pseudo_count,
                       **shared))

    if full_count > 0:
        # Only the magnitude of the full CQT result is kept
        full_part = cqt(y, sr, fmin=fmin, n_bins=full_count, **shared)
        responses.append(np.abs(full_part))

    return __trim_stack(responses, n_bins)