示例#1
0
def split_frames(x: array,
                 frame_length: int,
                 hop_length: int,
                 axis: int = -1) -> array:
    """Slice a data array into (overlapping) frames.

    The frames are produced with ``as_strided`` on ``x``, so overlapping
    frames refer to the same underlying samples.

    This function is aligned with librosa.frame

    Parameters:
        x: numpy.ndarray
        The array to be framed.

        frame_length: int
        Number of samples in each frame.

        hop_length: int
        Number of samples between the starts of consecutive frames (>= 1).

        axis: int, either 0 or -1
        The axis along which to frame.

    Returns:
        For axis=-1, an array of shape x.shape[:-1] + (frame_length, n_frames);
        for axis=0, an array of shape (n_frames, frame_length) + x.shape[1:],
        where n_frames = 1 + (x.shape[axis] - frame_length) // hop_length.

    Raises:
        ParameterError: if x is not a numpy.ndarray, axis is neither 0 nor -1,
        x is shorter than frame_length along axis, or hop_length < 1.
    """

    if not isinstance(x, np.ndarray):
        raise ParameterError(
            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")

    # Validate the axis before it is used to index x.shape, so that an
    # unsupported axis is always reported as a ParameterError.
    if axis not in (0, -1):
        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")

    if x.shape[axis] < frame_length:
        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
                             f" for frame_length={frame_length:d}")

    if hop_length < 1:
        raise ParameterError(f"Invalid hop_length: {hop_length:d}")

    # The stride arithmetic below relies on a specific memory layout for each
    # framing axis; convert (with a warning) when the input does not have it.
    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
        warnings.warn(f"librosa.util.frame called with axis={axis} "
                      "on a non-contiguous input. This will result in a copy.")
        x = np.asfortranarray(x)
    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
        warnings.warn(f"librosa.util.frame called with axis={axis} "
                      "on a non-contiguous input. This will result in a copy.")
        x = np.ascontiguousarray(x)

    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
    strides = np.asarray(x.strides)

    # Byte step that advances one sample along the framing axis
    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize

    if axis == -1:
        # Frames become a new trailing axis
        shape = list(x.shape)[:-1] + [frame_length, n_frames]
        strides = list(strides) + [hop_length * new_stride]
    else:
        # axis == 0: frames become a new leading axis
        shape = [n_frames, frame_length] + list(x.shape)[1:]
        strides = [hop_length * new_stride] + list(strides)

    return as_strided(x, shape=shape, strides=strides)
示例#2
0
def mfcc(x,
         sr: int = 16000,
         spect: Optional[array] = None,
         n_mfcc: int = 20,
         dct_type: int = 2,
         norm: str = "ortho",
         lifter: int = 0,
         **kwargs) -> array:
    """Mel-frequency cepstral coefficients (MFCCs)

    The coefficients are the first n_mfcc rows of the DCT (taken along axis 0)
    of a mel-spectrogram. When spect is None the mel-spectrogram is computed
    with melspectrogram(x, sr=sr, **kwargs).

    This function is NOT strictly aligned with librosa. The following example shows how to get the
    same result with librosa:

    # paddleaudio mfcc:
    kwargs = {
        'window_size':512,
        'hop_length':320,
        'n_mels':64,
        'fmin':50,
        'to_db':False}
    a = mfcc(x,
        spect=None,
        n_mfcc=20,
        dct_type=2,
        norm='ortho',
        lifter=0,
        **kwargs)

    # librosa mfcc:
    spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
                                              win_length=512,
                                              hop_length=320,
                                              n_mels=64, fmin=50)
    b = librosa.feature.mfcc(x,
        sr=16000,
        S=spect,
        n_mfcc=20,
        dct_type=2,
        norm='ortho',
        lifter=0)

    assert np.mean( (a-b)**2) < 1e-8

    Parameters:
        x: the input waveform, only used when spect is None.

        sr: sample rate, forwarded to melspectrogram when spect is None.

        spect: a pre-computed spectrogram; the DCT is applied along its axis 0.

        n_mfcc: maximum number of coefficients (rows) to return.

        dct_type, norm: forwarded to scipy.fftpack.dct as type and norm.

        lifter: if > 0, each coefficient row k (1-based) is multiplied by
        sin(pi * k / lifter); 0 disables liftering.

        kwargs: extra keyword arguments forwarded to melspectrogram.

    Returns:
        The (optionally liftered) coefficients, with at most n_mfcc rows.

    Raises:
        ParameterError: if lifter is negative.
    """
    # Reject an invalid lifter before doing any expensive computation
    if not lifter >= 0:
        raise ParameterError(
            f"MFCC lifter={lifter} must be a non-negative number")

    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]

    if lifter == 0:
        return M

    # Size the lifter from M rather than n_mfcc: the slice above yields fewer
    # than n_mfcc rows when the spectrogram has fewer bands than requested.
    # NOTE(review): librosa scales by 1 + (lifter / 2) * sin(...); this
    # implementation multiplies by sin(...) alone - confirm this is intended.
    factor = np.sin(np.pi * np.arange(1, 1 + M.shape[0], dtype=M.dtype) /
                    lifter)
    return M * factor[:, np.newaxis]
示例#3
0
def random_crop1d(y: array, crop_len: int) -> array:
    """ Do random cropping on 1d input signal

    The input is a 1d signal, typically a sound waveform. A contiguous
    segment of crop_len samples is taken at a randomly drawn start offset.

    Parameters:
        y: numpy.ndarray, the 1d input signal.

        crop_len: int, number of samples to keep, 0 <= crop_len <= len(y).

    Returns:
        The slice y[idx:idx + crop_len] for a random offset idx.

    Raises:
        ParameterError: if y is not 1d or crop_len is outside [0, len(y)].
    """
    if y.ndim != 1:
        raise ParameterError('only accept 1d numpy array')
    n = len(y)
    if not 0 <= crop_len <= n:
        raise ParameterError(
            f'crop_len={crop_len} must be in the range [0, {n}]')
    # Assumes randint is numpy.random.randint, whose upper bound is
    # exclusive: the +1 makes the last valid offset (n - crop_len) reachable
    # and lets crop_len == n return the whole signal instead of failing.
    idx = randint(n - crop_len + 1)
    return y[idx:idx + crop_len]
示例#4
0
def center_crop1d(y: array, crop_len: int) -> array:
    """ Do center cropping on 1d input signal

    The input is a 1d signal, typically a sound waveform. The crop_len
    samples around the middle of the signal are kept; when the number of
    removed samples is odd, the extra sample is removed from the end.

    Parameters:
        y: numpy.ndarray, the 1d input signal.

        crop_len: int, number of samples to keep, 0 <= crop_len <= len(y).

    Returns:
        The slice y[start:start + crop_len] with start = (len(y) - crop_len) // 2.

    Raises:
        ParameterError: if y is not 1d or crop_len is outside [0, len(y)].
    """
    if y.ndim != 1:
        raise ParameterError(
            f'only accept 1d numpy array, but received y.ndim={y.ndim}')

    n = len(y)
    # A crop_len larger than n would give a negative start, which slices
    # from the end of the array and silently returns the wrong samples.
    if not 0 <= crop_len <= n:
        raise ParameterError(
            f'crop_len={crop_len} must be in the range [0, {n}]')
    start = (n - crop_len) // 2
    return y[start:start + crop_len]
示例#5
0
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int = 128,
                         fmin: float = 0.0,
                         fmax: Optional[float] = None,
                         htk: bool = False,
                         norm: str = "slaney",
                         dtype: type = np.float32):
    """Build the mel filter-bank matrix of triangular filters.

    This function is aligned with librosa.

    Parameters:
        sr: sample rate; also determines the default fmax (sr / 2).

        n_fft: FFT size; the matrix has 1 + n_fft // 2 columns.

        n_mels: number of mel filters (rows of the matrix).

        fmin, fmax, htk: forwarded to mel_frequencies to place the band edges.

        norm: only "slaney" is supported.

        dtype: dtype of the returned matrix.

    Returns:
        An array of shape (n_mels, 1 + n_fft // 2) holding one filter per row.

    Raises:
        ParameterError: if norm is not "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    upper_freq = float(sr) / 2 if fmax is None else fmax

    n_filters = int(n_mels)
    fbank = np.zeros((n_filters, int(1 + n_fft // 2)), dtype=dtype)

    # Frequency of every FFT bin, and the n_filters + 2 band edges
    bin_freqs = fft_frequencies(sr=sr, n_fft=n_fft)
    band_edges = mel_frequencies(n_filters + 2,
                                 fmin=fmin,
                                 fmax=upper_freq,
                                 htk=htk)

    edge_gaps = np.diff(band_edges)
    # offsets[e, b] = band_edges[e] - bin_freqs[b]
    offsets = np.subtract.outer(band_edges, bin_freqs)

    # Rising and falling slopes of all filters at once; each filter is the
    # pointwise minimum of its two slopes, clipped at zero.
    rising = -offsets[:n_filters] / edge_gaps[:n_filters, np.newaxis]
    falling = offsets[2:n_filters + 2] / edge_gaps[1:n_filters + 1,
                                                   np.newaxis]
    fbank[...] = np.maximum(0, np.minimum(rising, falling))

    # Slaney-style mel is scaled to be approx constant energy per channel
    band_widths = band_edges[2:n_filters + 2] - band_edges[:n_filters]
    fbank *= (2.0 / band_widths)[:, np.newaxis]

    # A filter with no positive weight is only acceptable when its lower
    # edge sits at 0
    lower_edge_is_zero = band_edges[:-2] == 0
    has_response = fbank.max(axis=1) > 0
    if not np.all(lower_edge_is_zero | has_response):
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return fbank
示例#6
0
def power_to_db(spect: array,
                ref: float = 1.0,
                amin: float = 1e-10,
                top_db: Optional[float] = 80.0) -> array:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units

    Computes ``10 * log10(spect / ref)``, with both spect and the reference
    clipped from below at amin before taking the logarithm.

    This function is aligned with librosa.

    Parameters:
        spect: the input power values; complex input is replaced by its
        magnitude (with a warning).

        ref: the reference value, or a callable that computes it from the
        input (e.g. np.max).

        amin: strictly positive lower bound applied before the logarithm.

        top_db: if not None, the output is clipped from below at
        (maximum of the output) - top_db.

    Returns:
        The input expressed in dB relative to ref.

    Raises:
        ParameterError: if amin <= 0 or top_db < 0.
    """
    power = np.asarray(spect)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    if np.issubdtype(power.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.")
        power = np.abs(power)

    # A callable ref derives the reference power from the data itself
    reference = ref(power) if callable(ref) else np.abs(ref)

    db = 10.0 * np.log10(np.maximum(amin, power))
    db -= 10.0 * np.log10(np.maximum(amin, reference))

    if top_db is None:
        return db

    if top_db < 0:
        raise ParameterError("top_db must be non-negative")

    # Limit the dynamic range to top_db below the peak
    floor = db.max() - top_db
    return np.maximum(db, floor)
示例#7
0
def _check_audio(y, mono=True) -> bool:
    """Determine whether a variable contains valid audio data.

    The audio y must be a non-empty, finite, floating-point np.ndarray with
    one or two dimensions; when mono is True it must be one-dimensional.

    Parameters:
        y: the object to validate.

        mono: if True, reject two-dimensional input.

    Returns:
        True when every check passes (failures are reported by raising).

    Raises:
        ParameterError: if y is not a numpy.ndarray, has an unsupported
        number of dimensions, is empty, is not floating-point, or contains
        non-finite values.
    """
    if not isinstance(y, np.ndarray):
        raise ParameterError("Audio data must be of type numpy.ndarray")
    # 0-d arrays carry no samples axis and cannot be treated as audio
    if y.ndim > 2 or y.ndim == 0:
        raise ParameterError(
            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")

    if mono and y.ndim == 2:
        raise ParameterError(
            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")

    # y.size is zero whenever any axis is empty, which covers both the 1-d
    # and 2-d layouts without indexing an axis that may not exist.
    if y.size == 0:
        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")

    if not np.issubdtype(y.dtype, np.floating):
        raise ParameterError("Audio data must be floating-point")

    if not np.isfinite(y).all():
        raise ParameterError("Audio buffer is not finite everywhere")

    return True
示例#8
0
def random_crop2d(s: array, crop_len: int, tempo_axis: int = 0) -> array:
    """ Do random cropping for 2D array, typically a spectrogram.

    The cropping is done in temporal direction on the time-freq input signal:
    a contiguous run of crop_len entries is kept along tempo_axis, starting at
    a randomly drawn offset, while all other axes are kept whole.

    Parameters:
        s: numpy.ndarray, the input array.

        crop_len: int, number of entries to keep along tempo_axis,
        0 <= crop_len <= s.shape[tempo_axis].

        tempo_axis: int, the axis to crop along.

    Returns:
        The cropped array, with shape[tempo_axis] == crop_len.

    Raises:
        ParameterError: if tempo_axis is out of range for s, or crop_len is
        outside [0, s.shape[tempo_axis]].
    """
    if not -s.ndim <= tempo_axis < s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    if not 0 <= crop_len <= n:
        raise ParameterError(
            f'crop_len={crop_len} must be in the range [0, {n}]')

    # Assumes randint is numpy.random.randint: its bound must be passed
    # positionally (high= alone is rejected) and is exclusive, hence the +1
    # so that the last valid offset (n - crop_len) can be drawn.
    idx = randint(n - crop_len + 1)
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]
示例#9
0
def center_crop_or_pad1d(y: array, crop_len: int) -> array:
    """ Bring a 1d input signal to the target length crop_len by center cropping or center padding

    The input is a 1d signal, typically a sound waveform. A signal that
    already has the target length is returned as is; a shorter one is handed
    to pad_center and a longer one to center_crop1d.
    """
    if y.ndim != 1:
        raise ParameterError(
            f'only accept 1d numpy array, but received y.ndim={y.ndim}')

    cur_len = len(y)
    if crop_len == cur_len:
        # Nothing to do: the very same array object is returned
        return y

    resize = pad_center if crop_len > cur_len else center_crop1d
    return resize(y, crop_len)
示例#10
0
def pad_center(data: array, size: int, axis: int = -1, **kwargs) -> array:
    """Pad an array to a target length along a target axis.

    This differs from `np.pad` by centering the data prior to padding,
    analogous to `str.center`. When the amount of padding is odd, the extra
    element goes after the data.

    Parameters:
        data: numpy.ndarray, the array to pad.

        size: int, the target length along axis; must be at least
        data.shape[axis].

        axis: int, the axis along which to pad.

        kwargs: extra keyword arguments forwarded to `np.pad`; mode defaults
        to "constant".

    Returns:
        The padded array, with shape[axis] == size.

    Raises:
        ParameterError: if size is smaller than data.shape[axis].
    """

    kwargs.setdefault("mode", "constant")
    n = data.shape[axis]
    lpad = int((size - n) // 2)

    # Floor division makes lpad negative exactly when size < n
    if lpad < 0:
        raise ParameterError(f"Target size ({size}) must be "
                             f"at least input size ({n})")

    lengths = [(0, 0)] * data.ndim
    lengths[axis] = (lpad, int(size - n - lpad))

    return np.pad(data, lengths, **kwargs)
示例#11
0
def mu_decode(y: array, mu: int = 255, quantized: bool = True) -> array:
    """Mu-law decoding.

    Expand a mu-law code back to a signal value.

    The input y is assumed to be in range [0,mu-1] when quantized is True
    (it is first mapped back to [-1,1]) and in [-1,1] otherwise.

    NOTE(review): the expansion below is scaled with mu - 1 where the
    reference formula uses mu - confirm this matches the encoder.

    Reference:
        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm

    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    top_code = mu - 1
    # undo the quantization: codes [0, top_code] -> [-1, 1]
    code = y * 2 / top_code - 1 if quantized else y
    expanded = (1 + top_code)**np.abs(code) - 1
    return np.sign(code) / top_code * expanded
示例#12
0
def melspectrogram(x: array,
                   sr: int = 16000,
                   window_size: int = 512,
                   hop_length: int = 320,
                   n_mels: int = 64,
                   fmin: int = 50,
                   fmax: Optional[float] = None,
                   window: str = 'hann',
                   center: bool = True,
                   pad_mode: str = 'reflect',
                   power: float = 2.0,
                   to_db: bool = True,
                   ref: float = 1.0,
                   amin: float = 1e-10,
                   top_db: Optional[float] = None) -> array:
    """Compute mel-spectrogram.

    The waveform is transformed with stft, the magnitude is raised to
    `power`, projected onto the mel filter bank built by
    compute_fbank_matrix, and optionally converted to dB with power_to_db.

    Parameters:
        x: numpy.ndarray
        The input waveform is a numpy array [shape=(n,)]

        sr: int
        The sample rate of the input waveform.

        window_size: int, typically 512, 1024, 2048, etc.
        The window size for framing, also used as n_fft for stft

        hop_length: int
        The hop length passed to stft.

        n_mels: int
        The number of mel filters.

        fmin, fmax: the frequency range of the mel filter bank, must
        satisfy 0 <= fmin < fmax.

        window, center, pad_mode: passed unchanged to stft.

        power: float
        The exponent applied to the stft magnitude (2.0 gives power).

        to_db: bool
        Whether to convert the result with power_to_db.

        ref, amin, top_db: passed to power_to_db when to_db is True.

    Returns:
        The mel-spectrogram in power scale or db scale(default)

    Raises:
        ParameterError: if x is not valid mono audio (see _check_audio) or
        fmin/fmax do not satisfy 0 <= fmin < fmax.

    Notes:
    1. sr is default to 16000, which is commonly used in speech/speaker processing.
    2. when fmax is None, it is set to sr//2.
    3. this function will convert mel spectrogram to db scale by default. This is different
    from that of librosa.

    """
    # Also rejects empty input, so no separate length check is needed
    _check_audio(x, mono=True)

    if fmax is None:
        fmax = sr // 2
    if fmin < 0 or fmin >= fmax:
        raise ParameterError('fmin and fmax must satisfy 0<=fmin<fmax')

    s = stft(x,
             n_fft=window_size,
             hop_length=hop_length,
             win_length=window_size,
             window=window,
             center=center,
             pad_mode=pad_mode)

    spect_power = np.abs(s)**power
    fb_matrix = compute_fbank_matrix(sr=sr,
                                     n_fft=window_size,
                                     n_mels=n_mels,
                                     fmin=fmin,
                                     fmax=fmax)
    # Project the linear-frequency bins onto the mel filters
    mel_spect = np.matmul(fb_matrix, spect_power)
    if to_db:
        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
    return mel_spect
示例#13
0
def stft(x: array,
         n_fft: int = 2048,
         hop_length: Optional[int] = None,
         win_length: Optional[int] = None,
         window: str = "hann",
         center: bool = True,
         dtype: type = np.complex64,
         pad_mode: str = "reflect") -> array:
    """Short-time Fourier transform (STFT).

    This function is aligned with librosa.

    Parameters:
        x: numpy.ndarray
        The input waveform, validated with _check_audio.

        n_fft: int
        The FFT size; every frame has n_fft samples.

        hop_length: int or None
        Samples between consecutive frames; defaults to win_length // 4.

        win_length: int or None
        The length of the analysis window, which is center-padded to n_fft;
        defaults to n_fft.

        window: the window specification passed to get_window.

        center: bool
        If True, x is padded by n_fft // 2 on both sides (using pad_mode)
        before framing.

        dtype: the dtype of the returned matrix.

        pad_mode: the `np.pad` mode used when center is True.

    Returns:
        The STFT matrix of shape (1 + n_fft // 2, n_frames).

    Raises:
        ParameterError: if x is not valid audio, or if center is False and
        n_fft is larger than the input length.
    """
    _check_audio(x)
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        if n_fft > x.shape[-1]:
            # Padding still yields at least one frame, so only warn here
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
            )
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)

    elif n_fft > x.shape[-1]:
        # Without padding not even a single full frame fits
        raise ParameterError(
            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), x_frames.shape[1]),
                           dtype=dtype,
                           order="F")
    # Constrain STFT block sizes to 256 KB
    MAX_MEM_BLOCK = 2**8 * 2**10
    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Transform the frames block by block (numpy fft) to bound the size of
    # the windowed temporary
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:,
                    bl_s:bl_t] = np.fft.rfft(fft_window * x_frames[:, bl_s:bl_t],
                                             axis=0)

    return stft_matrix