Exemplo n.º 1
0
    def __test(min_db):
        """Check the A-weighting at 1 kHz and, if given, the ``min_db`` floor.

        Args:
            min_db: Value forwarded as ``min_db`` to ``librosa.A_weighting``;
                when ``None`` the floor check is skipped.
        """
        # The weighting at 1 kHz must be 0 dB to within 1e-3.
        reference_gain = librosa.A_weighting(1000.0, min_db=min_db)
        assert np.allclose(reference_gain, 0, atol=1e-3)

        # Evaluate the curve on a linear grid between 20 Hz and 20 kHz.
        sweep = np.linspace(2e1, 2e4)
        weights = librosa.A_weighting(sweep, min_db=min_db)

        # Without a floor there is nothing more to verify.
        if min_db is None:
            return

        # No weight is allowed to fall below the requested floor.
        assert not (weights < min_db).any()
Exemplo n.º 2
0
def get_X(decision_length, fmin, hop_length, n_bins_per_octave, n_octaves,
          track_or_path):
    """Compute an A-weighted, log-compressed constant-Q magnitude spectrogram.

    Args:
        decision_length: Minimum number of audio samples. Shorter signals are
            zero-padded at the end up to this length.
        fmin: Lowest CQT frequency, forwarded to librosa.
        hop_length: CQT hop length, forwarded to librosa.
        n_bins_per_octave: Number of CQT bins per octave.
        n_octaves: Number of octaves; the CQT has
            ``n_octaves * n_bins_per_octave`` bins.
        track_or_path: Either a path string, loaded with
            ``librosa.core.load`` at its native rate and mixed to mono, or an
            object whose ``audio_data`` attribute unpacks to
            ``(sr, x_stereo)``.

    Returns:
        float32 array holding ``log1p(1000 * |CQT| * A_weights)``, where the
        A-weights are applied per CQT bin along the first axis.
    """
    # NOTE(review): `basestring` only exists on Python 2 -- confirm the target
    # interpreter before running this on Python 3.
    if isinstance(track_or_path, basestring):
        x_mono, sr = librosa.core.load(track_or_path, sr=None, mono=True)
    else:
        # Reading `audio_data` may emit warnings; silence them for this access.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            (sr, x_stereo) = track_or_path.audio_data
            warnings.resetwarnings()
        x_stereo = x_stereo.astype(np.float32)
        # Sum over axis 1 and rescale.
        # presumably 16-bit PCM with two channels on axis 1 -- TODO confirm.
        x_mono = np.sum(x_stereo, axis=1) / (32768.0 * 2)
    if x_mono.shape[0] < decision_length:
        # The number of missing samples is (target - current). The reversed
        # difference is negative here and makes np.zeros raise ValueError.
        padding_length = decision_length - x_mono.shape[0]
        padding = np.zeros(padding_length, dtype=np.float32)
        x_mono = np.hstack((x_mono, padding))
    n_bins = n_octaves * n_bins_per_octave
    freqs = librosa.cqt_frequencies(bins_per_octave=n_bins_per_octave,
                                    fmin=fmin,
                                    n_bins=n_bins)
    CQT = np.abs(
        librosa.cqt(x_mono,
                    bins_per_octave=n_bins_per_octave,
                    fmin=fmin,
                    hop_length=hop_length,
                    n_bins=n_bins,
                    sr=sr,
                    real=False))
    # A-weighting comes back in dB (floored at -80 dB); convert to a linear gain.
    A_weights_dB = librosa.A_weighting(freqs, min_db=-80.0)
    A_weights = (10.0**(A_weights_dB / 10))
    # Broadcast the per-bin gain over the remaining axis, then log-compress.
    X = np.log1p(1000.0 * CQT * A_weights[:, np.newaxis])
    X = X.astype(np.float32)
    return X
Exemplo n.º 3
0
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=50,
                     n_fft=2048,
                     range_db=120.0,
                     ref_db=20.7):
    """Compute A-weighted loudness in dB, one value per analysis frame.

    Args:
        audio: Tensor of shape [batch_size, audio_length] or [audio_length].
        sample_rate: Audio sample rate in Hz.
        frame_rate: Number of loudness frames per second.
        n_fft: FFT window size used for the STFT.
        range_db: Dynamic range in dB; each frequency bin is clamped to be no
            lower than ``-range_db``.
        ref_db: Reference level subtracted from the weighted dB spectrum.

    Returns:
        Loudness in dB, shape [batch_size, n_frames] or [n_frames], after
        ``pad_or_trim_to_expected_length`` adjusts it to
        ``int(audio_length / sample_rate * frame_rate)`` frames.
    """
    # Single examples get a temporary leading batch axis.
    is_1d = (len(audio.shape) == 1)
    if is_1d:
        audio = audio.unsqueeze(0)

    # STFT; layout is batch, frequency_bins, n_frames.
    hop_length = sample_rate // frame_rate
    spectrum = torch.stft(audio, n_fft=n_fft, hop_length=hop_length)

    # Per-bin level in dB. The epsilon inside sqrt avoids a nan gradient at
    # sqrt(0). presumably `amp` returns the squared magnitude -- TODO confirm.
    magnitude = torch.sqrt(amp(spectrum) + 1e-5)
    power_db = torch.log10(magnitude + 1e-5) * 20.0

    # A-weighting curve for the FFT bins, shaped to broadcast over batch and
    # frames, moved to the same device as the audio.
    bin_freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    weights_np = librosa.A_weighting(bin_freqs)[None, :, None]
    weights = torch.from_numpy(weights_np.astype(np.float32)).to(audio.device)

    # Weight, shift by the reference, clamp to the dynamic range, then
    # average across the frequency axis.
    weighted = power_db + weights
    weighted = weighted - ref_db
    clamped = torch.clamp(weighted, min=-range_db)
    loudness = torch.mean(clamped, dim=1)

    # Drop the temporary batch axis again.
    if is_1d:
        loudness = loudness[0]

    # Expected frame count; the duration in seconds may be fractional.
    duration_secs = audio.shape[-1] / float(sample_rate)
    expected_len = int(duration_secs * frame_rate)

    # Pad with the `-range_db` noise floor, or trim, to the expected length.
    return pad_or_trim_to_expected_length(loudness, expected_len, -range_db)
Exemplo n.º 4
0
 def __init__(self, sr, n_fft, min_db):
     """Precompute the A-weighting curve for the FFT bins.

     Args:
         sr: Sample rate passed to ``librosa.fft_frequencies``.
         n_fft: FFT size passed to ``librosa.fft_frequencies``.
         min_db: Stored on the instance as ``self.min_db``; it is not passed
             to ``librosa.A_weighting`` here.
     """
     super().__init__()
     self.min_db = min_db

     # The tiny offset keeps the first bin from being exactly 0 Hz.
     bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
     curve = librosa.A_weighting(bin_freqs + 1e-10)

     # Registered as a parameter, but with gradients disabled.
     self.a_weighting = torch.nn.Parameter(
         data=torch.from_numpy(curve),
         requires_grad=False,
     )
Exemplo n.º 5
0
def test_multi_frequency_weighting(kinds):
    """Compare ``multi_frequency_weighting`` against stacked single curves.

    Args:
        kinds: Weighting kinds passed to ``librosa.multi_frequency_weighting``.
            The expected value is always the A, Z, C stack, so this assumes
            ``kinds`` selects those three in that order -- TODO confirm
            against the test parametrization.
    """
    grid = np.linspace(2e1, 2e4)

    combined = librosa.multi_frequency_weighting(grid, kinds)
    individual = np.stack([
        librosa.A_weighting(grid),
        librosa.Z_weighting(grid),
        librosa.C_weighting(grid),
    ])

    # Absolute tolerance only: the relative tolerance is set to zero.
    assert np.allclose(combined, individual, rtol=0, atol=1e-3)
Exemplo n.º 6
0
def perceptual_weights():
    """Return the A-weighting curve minus ``REF_DB`` as a column vector.

    The curve is evaluated at the FFT bin frequencies given by
    ``torchcrepe.SAMPLE_RATE`` and ``torchcrepe.WINDOW_SIZE``.
    """
    bin_freqs = librosa.fft_frequencies(sr=torchcrepe.SAMPLE_RATE,
                                        n_fft=torchcrepe.WINDOW_SIZE)

    # librosa raises a RuntimeWarning for nearly inaudible frequencies and
    # falls back to its floor value for them. That fallback is acceptable
    # here, so the warning is suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        weights_db = librosa.A_weighting(bin_freqs)
        shifted = weights_db[:, None] - REF_DB
    return shifted
Exemplo n.º 7
0
    def _evaluate(self):
        """Apply an A-weighting gain to the "input" spectrum.

        Reads "input" via ``self.get``. When it is ``None``, "output" is set
        to ``None``. Otherwise a copy of the input has its ``magnitudes``
        multiplied by ``10 ** (alpha * weighting / 10)`` and is stored as
        "output".
        """
        spectrum = self.get("input")
        if spectrum is None:
            self.set("output", None)
            return

        # Evaluate the curve half a bin resolution above each frequency.
        shifted_freqs = spectrum.frequencies + spectrum.bin_resolution * 0.5
        weighting_db = librosa.A_weighting(shifted_freqs)

        weighted = spectrum.copy()
        # The weighting is in dB; "alpha" scales it before it is converted
        # to a linear factor. (For dB magnitudes the equivalent would be
        # adding the weighting instead.)
        linear_gain = 10**(self.get("alpha") * weighting_db / 10.0)
        weighted.magnitudes *= linear_gain
        self.set("output", weighted)
Exemplo n.º 8
0
def extract_loudness(signal, sampling_rate, block_size, n_fft=2048):
    """Compute a per-frame A-weighted log-magnitude loudness curve.

    Args:
        signal: Audio samples passed to the STFT.
        sampling_rate: Sample rate in Hz, used for the FFT bin frequencies.
        block_size: STFT hop length in samples.
        n_fft: FFT size, also used as the window length.

    Returns:
        The mean over axis 0 of the weighted log-magnitude spectrogram, with
        the last entry along the final axis dropped.
    """
    S = li.stft(
        signal,
        n_fft=n_fft,
        hop_length=block_size,
        win_length=n_fft,
        center=True,
    )
    # Natural log of the magnitude; the epsilon avoids log(0).
    # NOTE(review): the A-weighting added below is in dB while this is a
    # natural-log magnitude -- confirm the mixed units are intended.
    S = np.log(abs(S) + 1e-7)
    # Keyword arguments: newer librosa releases accept `sr`/`n_fft` by keyword
    # only, and older releases accept the keyword form as well.
    f = li.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
    a_weight = li.A_weighting(f)

    # Column vector so the per-bin weight broadcasts across frames.
    S = S + a_weight.reshape(-1, 1)

    # Average over axis 0 and drop the final entry of the last axis.
    S = np.mean(S, 0)[..., :-1]

    return S
Exemplo n.º 9
0
def calc_loudness(audio, rate=_AUDIO_RATE, center=False, hop_size=16, n_fft=_LD_N_FFT):
    """Compute A-weighted loudness in dB for each STFT frame.

    Adapted from magenta/ddsp/spectral_ops.py.

    Args:
        audio: Audio samples. When ``center`` is False the signal is
            zero-padded at the end before the STFT.
        rate: Sample rate in Hz used for the FFT bin frequencies.
        center: Forwarded to ``librosa.stft``. When it is exactly ``False``
            the end-padding described above is applied.
        hop_size: STFT hop length in samples.
        n_fft: FFT size.

    Returns:
        float32 array with the mean loudness over frequency bins, in dB
        relative to ``_REF_DB`` and floored per bin at ``-_LD_RANGE``.
    """
    # NOTE: this changes numpy's divide-by-zero handling for the whole
    # process, not only for this call.
    np.seterr(divide='ignore')

    # Get magnitudes.
    if center is False:
        # Pad the end so that a non-centred STFT yields
        # ceil(n_samples / hop_size) frames.
        n_samples_initial = int(audio.shape[-1])
        n_frames = int(np.ceil(n_samples_initial / hop_size))
        n_samples_final = (n_frames - 1) * hop_size + n_fft
        pad = n_samples_final - n_samples_initial
        audio = np.pad(audio, ((0, pad),), "constant")

    # Transposed so that frequency bins lie on the last axis.
    spectra = librosa.stft(
        audio, n_fft=n_fft, hop_length=hop_size, center=center).T

    # Compute power
    amplitude = np.abs(spectra)
    amin = 1e-20  # Avoid log(0) instabilities.
    power_db = np.log10(np.maximum(amin, amplitude))
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[np.newaxis, :]
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= _REF_DB
    loudness = np.maximum(loudness, -_LD_RANGE)

    # Average over frequency bins.
    mean_loudness_db = np.mean(loudness, axis=-1)
    return mean_loudness_db.astype(np.float32)
def get_loudness(wav,
                 sr,
                 n_fft=1280,
                 hop_length=320,
                 win_length=None,
                 ref=1.0,
                 min_db=-80.0):
    """Extract an A-weighted loudness curve from a waveform.

    The power spectrogram is weighted per frequency bin with the A-weighting
    curve as a linear gain, averaged over frequency, and converted to dB.

    Args:
        wav: Waveform (numpy array).
        sr: Sampling rate in Hz.
        n_fft: Number of FFT points.
        hop_length: STFT hop length.
        win_length: STFT window length (``None`` uses librosa's default).
        ref: Reference value passed to ``librosa.power_to_db``.
        min_db: Floor passed to ``librosa.A_weighting``.

    Returns:
        float32 array of shape (n_frames, 1) with the loudness in dB.
    """
    # Keyword arguments: newer librosa releases accept `sr` by keyword only.
    # The small offset keeps the first bin from being exactly 0 Hz.
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft) + 1e-6
    A_weighting = librosa.A_weighting(bin_freqs, min_db=min_db)
    # dB -> linear power gain.
    weighting = 10**(A_weighting / 10)

    power_spec = abs(
        librosa.stft(wav,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length))**2
    # Weighted mean over the frequency axis gives one power value per frame.
    loudness = np.mean(power_spec * weighting[:, None], axis=0)
    loudness = librosa.power_to_db(loudness, ref=ref)  # in db

    # Add a trailing singleton axis.
    return loudness[:, np.newaxis].astype(np.float32)
Exemplo n.º 11
0
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Compute A-weighted loudness in dB for each STFT frame.

    The computation runs on tensorflow ops when ``use_tf`` is True and on
    numpy otherwise.

    Args:
        audio: Numpy ndarray or tensor, either batched (2-D) or a single
            1-D example.
        sample_rate: Audio sample rate in Hz.
        frame_rate: Rate of loudness frames in Hz.
        n_fft: FFT window size.
        range_db: Dynamic range in decibels; each frequency bin is floored
            at ``-range_db``.
        ref_db: Reference level subtracted from the weighted dB spectrum.
        use_tf: Select the tensorflow code path instead of numpy.

    Returns:
        Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    # Choose the backend and its matching helpers in one place.
    if use_tf:
        lib = tf
        stft_fn = stft
        mean = tf.reduce_mean

        def log10(x):
            return tf.math.log(x) / tf.math.log(10.0)

        # Tensorflow path works on float32 tensors.
        audio = tf_float32(audio)
    else:
        lib = np
        stft_fn = stft_np
        mean = np.mean
        log10 = np.log10

    # Single examples get a temporary leading batch axis.
    is_1d = (len(audio.shape) == 1)
    if is_1d:
        audio = audio[lib.newaxis, :]

    # STFT with a hop that matches the requested frame rate.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    spectrum = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Magnitude in dB; the floor avoids log(0) instabilities.
    amin = 1e-20
    magnitude = lib.abs(spectrum)
    power_db = log10(lib.maximum(amin, magnitude)) * 20.0

    # A-weighting for the FFT bins, broadcast over batch and frames.
    bin_freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(bin_freqs)[lib.newaxis, lib.newaxis, :]

    # Weight, shift by the reference level, and apply the dynamic range.
    weighted = power_db + a_weighting
    weighted = weighted - ref_db
    floored = lib.maximum(weighted, -range_db)

    # Average over frequency bins (last axis).
    loudness = mean(floored, axis=-1)

    # Drop the temporary batch axis again.
    if is_1d:
        return loudness[0]
    return loudness
Exemplo n.º 12
0
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Compute A-weighted loudness in dB with a fixed number of frames.

    The computation runs on tensorflow ops when ``use_tf`` is True and on
    numpy otherwise.

    Args:
        audio: Numpy ndarray or tensor, either batched (2-D) or a single
            1-D example.
        sample_rate: Audio sample rate in Hz.
        frame_rate: Rate of loudness frames in Hz; must evenly divide
            ``sample_rate``.
        n_fft: FFT window size.
        range_db: Dynamic range in decibels; each frequency bin is floored
            at ``-range_db``.
        ref_db: Reference level subtracted from the weighted dB spectrum.
        use_tf: Select the tensorflow code path instead of numpy.

    Returns:
        Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,],
        padded or trimmed to ``int(audio_length / sample_rate * frame_rate)``.

    Raises:
        ValueError: If ``frame_rate`` does not evenly divide ``sample_rate``.
    """
    if sample_rate % frame_rate != 0:
        message = (
            'frame_rate: {} must evenly divide sample_rate: {}.'
            'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz'
        ).format(frame_rate, sample_rate)
        raise ValueError(message)

    # Choose the backend and its matching helpers in one place.
    if use_tf:
        lib, stft_fn, mean = tf, stft, tf.reduce_mean
        # Tensorflow path works on float32 tensors.
        audio = tf_float32(audio)
    else:
        lib, stft_fn, mean = np, stft_np, np.mean

    # Single examples get a temporary leading batch axis.
    is_1d = (len(audio.shape) == 1)
    if is_1d:
        audio = audio[lib.newaxis, :]

    # STFT with a hop that matches the requested frame rate.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    spectrum = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Magnitude converted to decibels.
    power_db = amplitude_to_db(lib.abs(spectrum), use_tf=use_tf)

    # A-weighting for the FFT bins, broadcast over batch and frames.
    bin_freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(bin_freqs)[lib.newaxis, lib.newaxis, :]

    # Weight, shift by the reference level, and apply the dynamic range.
    weighted = power_db + a_weighting
    weighted = weighted - ref_db
    floored = lib.maximum(weighted, -range_db)

    # Average over frequency bins (last axis).
    loudness = mean(floored, axis=-1)

    # Drop the temporary batch axis again.
    if is_1d:
        loudness = loudness[0]

    # Expected frame count; the duration in seconds may be fractional.
    duration_secs = audio.shape[-1] / float(sample_rate)
    expected_len = int(duration_secs * frame_rate)

    # Pad with the `-range_db` noise floor, or trim, to the expected length.
    return pad_or_trim_to_expected_length(loudness,
                                          expected_len,
                                          -range_db,
                                          use_tf=use_tf)
Exemplo n.º 13
0
def a_weighting(frequencies):
    """Return librosa's A-weighting for ``frequencies`` with a -160 dB floor."""
    floor_db = -160.
    return librosa.A_weighting(frequencies, min_db=floor_db)
Exemplo n.º 14
0
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=512,
                     range_db=DB_RANGE,
                     ref_db=0.0,
                     use_tf=True,
                     pad_end=True):
    """Compute perceptual loudness (A-weighted mean power) in dB.

    The weighting is applied to the power spectrum in linear scale, the
    weighted power is averaged over frequency, and the average is converted
    to decibels. Runs on tensorflow ops when ``use_tf`` is True and on numpy
    otherwise.

    Args:
        audio: Numpy ndarray or tensor, either batched (2-D) or a single
            1-D example.
        sample_rate: Audio sample rate in Hz.
        frame_rate: Rate of loudness frames in Hz; must evenly divide
            ``sample_rate``.
        n_fft: FFT window size.
        range_db: Dynamic range in decibels, forwarded to
            ``core.power_to_db`` and used (negated) as the padding value.
        ref_db: Reference level forwarded to ``core.power_to_db``.
        use_tf: Select the tensorflow code path instead of numpy.
        pad_end: Forwarded to the STFT function (zero padding at the end of
            the audio).

    Returns:
        Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,],
        padded or trimmed to ``int(audio_length / sample_rate * frame_rate)``.

    Raises:
        ValueError: If ``frame_rate`` does not evenly divide ``sample_rate``.
    """
    if sample_rate % frame_rate != 0:
        message = (
            'frame_rate: {} must evenly divide sample_rate: {}.'
            'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz'
        ).format(frame_rate, sample_rate)
        raise ValueError(message)

    # Choose the backend and its matching helpers in one place.
    if use_tf:
        lib, reduce_mean, stft_fn = tf, tf.reduce_mean, stft
        # Tensorflow path works on float32 tensors.
        audio = tf_float32(audio)
    else:
        lib, reduce_mean, stft_fn = np, np.mean, stft_np

    # Single examples get a temporary leading batch axis.
    is_1d = (len(audio.shape) == 1)
    if is_1d:
        audio = audio[lib.newaxis, :]

    # STFT with a hop that matches the requested frame rate.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    spectrum = stft_fn(audio, frame_size=n_fft, overlap=overlap,
                       pad_end=pad_end)

    # Power spectrum.
    magnitude = lib.abs(spectrum)
    power = magnitude**2

    # A-weighting is given in decibels; convert it to a linear power gain
    # that broadcasts over batch and frames.
    bin_freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(bin_freqs)[lib.newaxis, lib.newaxis, :]
    linear_gain = 10**(a_weighting / 10)
    weighted_power = power * linear_gain

    # Mean weighted power per frame, converted to decibels.
    avg_power = reduce_mean(weighted_power, axis=-1)
    loudness = core.power_to_db(avg_power,
                                ref_db=ref_db,
                                range_db=range_db,
                                use_tf=use_tf)

    # Drop the temporary batch axis again.
    if is_1d:
        loudness = loudness[0]

    # Expected frame count for the given audio length.
    expected_secs = audio.shape[-1] / float(sample_rate)
    expected_len = int(expected_secs * frame_rate)

    # Pad with the `-range_db` noise floor, or trim, to the expected length.
    return pad_or_trim_to_expected_length(loudness,
                                          expected_len,
                                          -range_db,
                                          use_tf=use_tf)
Exemplo n.º 15
0
출처: https://hyongdoc.tistory.com/402 [Doony Garage]
'''

import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt

# STFT / mel-spectrogram settings.
n_fft = 2048
win_length = 2048
hop_length = 1024
n_mels = 128

print("Loading data ...")
# `librosa.util.example_audio_file` is only present in older librosa
# releases; newer ones provide `librosa.example` instead. The original call
# is preferred when available so behaviour on old installs is unchanged.
if hasattr(librosa.util, 'example_audio_file'):
    audio_path = librosa.util.example_audio_file()
else:
    audio_path = librosa.example('trumpet')
y, sr = librosa.load(audio_path)

# STFT with librosa's default parameters, printed for inspection.
D = librosa.stft(y)
print(D)

# Magnitude STFT with the settings above, mapped onto a mel scale and shown
# in dB relative to ref=0.00002.
D = np.abs(librosa.stft(y, n_fft=n_fft, win_length = win_length, hop_length=hop_length))
mel_spec = librosa.feature.melspectrogram(S=D, sr=sr, n_mels=n_mels, hop_length=hop_length, win_length=win_length)
librosa.display.specshow(librosa.amplitude_to_db(mel_spec, ref=0.00002), sr=sr, hop_length = hop_length, y_axis='mel', x_axis='time')#, cmap = cm.jet)
plt.colorbar(format='%2.0f dB')
plt.show()

# A-weighting curve evaluated at 108 CQT bin frequencies starting from C1.
# Keyword arguments are used because newer librosa releases accept `fmin`
# by keyword only; older releases accept the keyword form as well.
freqs = librosa.cqt_frequencies(n_bins=108, fmin=librosa.note_to_hz('C1'))
aw = librosa.A_weighting(freqs)
plt.plot(freqs, aw)
plt.xlabel('Frequency (Hz)')
plt.ylabel('Weighting (log10)')
plt.title('A-Weighting of CQT frequencies')
plt.show()