def __test(min_db):
    # Check that 1KHz is around 0dB
    a_khz = librosa.A_weighting(1000.0, min_db=min_db)
    assert np.allclose(a_khz, 0, atol=1e-3)

    a_range = librosa.A_weighting(np.linspace(2e1, 2e4), min_db=min_db)
    # Check that the db cap works
    if min_db is not None:
        assert not np.any(a_range < min_db)
def get_X(decision_length, fmin, hop_length, n_bins_per_octave, n_octaves,
          track_or_path):
    # str check (Python 3; the original used the Python 2 `basestring`).
    if isinstance(track_or_path, str):
        x_mono, sr = librosa.core.load(track_or_path, sr=None, mono=True)
    else:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            (sr, x_stereo) = track_or_path.audio_data
            warnings.resetwarnings()
        x_stereo = x_stereo.astype(np.float32)
        # Downmix 16-bit stereo to mono in [-1, 1].
        x_mono = np.sum(x_stereo, axis=1) / (32768.0 * 2)

    # Zero-pad short signals up to the decision length.
    if x_mono.shape[0] < decision_length:
        padding_length = decision_length - x_mono.shape[0]
        padding = np.zeros(padding_length, dtype=np.float32)
        x_mono = np.hstack((x_mono, padding))

    n_bins = n_octaves * n_bins_per_octave
    freqs = librosa.cqt_frequencies(
        bins_per_octave=n_bins_per_octave, fmin=fmin, n_bins=n_bins)
    # librosa.cqt returns a complex spectrogram; take its magnitude.
    # (The legacy real=False flag was removed from modern librosa.)
    CQT = np.abs(
        librosa.cqt(x_mono, bins_per_octave=n_bins_per_octave, fmin=fmin,
                    hop_length=hop_length, n_bins=n_bins, sr=sr))
    # Apply A-weighting, converted from dB to a linear gain per CQT bin.
    A_weights_dB = librosa.A_weighting(freqs, min_db=-80.0)
    A_weights = 10.0 ** (A_weights_dB / 10)
    X = np.log1p(1000.0 * CQT * A_weights[:, np.newaxis])
    X = X.astype(np.float32)
    return X
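# Hypothetical call sketch (not from the source): the decision length and
# frame parameters below are illustrative, and librosa.example() merely
# provides a convenient audio path for the string branch.
import librosa

X = get_X(decision_length=131072,               # ~6 s at 22050 Hz
          fmin=librosa.note_to_hz('A1'),
          hop_length=1024,
          n_bins_per_octave=12,
          n_octaves=7,
          track_or_path=librosa.example('trumpet'))
print(X.shape)  # (n_octaves * n_bins_per_octave, n_frames)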
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=50,
                     n_fft=2048,
                     range_db=120.0,
                     ref_db=20.7):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    Args:
      audio: tensor. Shape [batch_size, audio_length] or [audio_length].
      sample_rate: Audio sample rate in Hz.
      frame_rate: Rate of loudness frames in Hz.
      n_fft: Fft window size.
      range_db: Sets the dynamic range of loudness in decibels. The minimum
        loudness (per frequency bin) corresponds to -range_db.
      ref_db: Sets the reference maximum perceptual loudness as given by
        (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
        corresponds to white noise with amplitude=1.0 and n_fft=2048. There is
        a slight dependence on fft_size due to different granularity of
        perceptual weighting.

    Returns:
      Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    if is_1d:
        audio = audio[None, :]

    # Take STFT. return_complex=False keeps the stacked (real, imag) output
    # that the pre-1.8 torch.stft convention (and `amp` below) expects.
    hop_length = sample_rate // frame_rate
    s = torch.stft(audio, n_fft=n_fft, hop_length=hop_length,
                   return_complex=False)  # [batch, frequency_bins, n_frames]

    # Compute power of each bin. `amp` is a module helper, presumably the
    # squared magnitude of the stacked STFT output.
    amplitude = torch.sqrt(amp(s) + 1e-5)  # sqrt(0) gives a nan gradient
    power_db = torch.log10(amplitude + 1e-5)
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[None, :, None]
    loudness = power_db + torch.from_numpy(
        a_weighting.astype(np.float32)).to(audio.device)

    # Set dynamic range.
    loudness -= ref_db
    loudness = torch.clamp(loudness, min=-range_db)

    # Average over frequency bins.
    loudness = torch.mean(loudness, dim=1)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector.
    n_secs = audio.shape[-1] / float(sample_rate)  # `n_secs` can have milliseconds
    expected_len = int(n_secs * frame_rate)

    # Pad with `-range_db` noise floor or trim vector.
    loudness = pad_or_trim_to_expected_length(loudness, expected_len, -range_db)
    return loudness
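# Minimal call sketch (assumes the module helpers `amp` and
# `pad_or_trim_to_expected_length` referenced above are defined; the random
# input is purely illustrative).
import torch

audio = torch.randn(2, 16000)  # two 1-second clips at 16 kHz
loudness = compute_loudness(audio, sample_rate=16000, frame_rate=50)
print(loudness.shape)  # torch.Size([2, 50]): 50 loudness frames per second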
def __init__(self, sr, n_fft, min_db):
    super().__init__()
    self.min_db = min_db
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    # Store the per-bin A-weighting curve as a frozen (non-trainable)
    # parameter; the small offset keeps the DC bin away from log10(0).
    self.a_weighting = torch.nn.Parameter(
        data=torch.from_numpy(librosa.A_weighting(freqs + 1e-10)),
        requires_grad=False,
    )
def test_multi_frequency_weighting(kinds):
    freq = np.linspace(2e1, 2e4)
    assert np.allclose(
        librosa.multi_frequency_weighting(freq, kinds),
        np.stack([
            librosa.A_weighting(freq),
            librosa.Z_weighting(freq),
            librosa.C_weighting(freq),
        ]),
        rtol=0,
        atol=1e-3)
def perceptual_weights():
    """A-weighted frequency-dependent perceptual loudness weights"""
    frequencies = librosa.fft_frequencies(sr=torchcrepe.SAMPLE_RATE,
                                          n_fft=torchcrepe.WINDOW_SIZE)

    # A warning is raised for nearly inaudible frequencies, but it ends up
    # defaulting to -100 db. That default is fine for our purposes.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        return librosa.A_weighting(frequencies)[:, None] - REF_DB
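# Shape-check sketch. REF_DB is a module-level constant in the original code;
# 20.7 (the DDSP-style reference used elsewhere in this section) is assumed.
import warnings
import librosa
import torchcrepe

REF_DB = 20.7  # assumed value; defined at module level in the original source

weights = perceptual_weights()
print(weights.shape)  # (WINDOW_SIZE // 2 + 1, 1): one A-weight per STFT bin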
def _evaluate(self):
    fft = self.get("input")
    if fft is None:
        self.set("output", None)
        return

    # Evaluate the A-weighting curve at each bin's center frequency.
    weighting = librosa.A_weighting(fft.frequencies + fft.bin_resolution * 0.5)
    new_fft = fft.copy()
    # Scale linear magnitudes by the alpha-weighted gain.
    new_fft.magnitudes *= 10 ** (self.get("alpha") * weighting / 10.0)
    # For dB magnitudes it would be additive instead:
    # new_fft.magnitudes += weighting
    self.set("output", new_fft)
def extract_loudness(signal, sampling_rate, block_size, n_fft=2048):
    S = li.stft(
        signal,
        n_fft=n_fft,
        hop_length=block_size,
        win_length=n_fft,
        center=True,
    )
    # Log-magnitude spectrogram; the epsilon avoids log(0).
    S = np.log(abs(S) + 1e-7)
    f = li.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
    a_weight = li.A_weighting(f)
    S = S + a_weight.reshape(-1, 1)
    # Average over frequency bins and drop the final (padded) frame.
    S = np.mean(S, 0)[..., :-1]
    return S
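# Usage sketch (the sine input and frame parameters are illustrative).
import numpy as np
import librosa as li

sr = 16000
block_size = 160  # 100 loudness frames per second at 16 kHz
t = np.arange(sr) / sr
signal = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

loudness = extract_loudness(signal, sr, block_size)
print(loudness.shape)  # (sr // block_size,) == (100,)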
def calc_loudness(audio, rate=_AUDIO_RATE, center=False, hop_size=16,
                  n_fft=_LD_N_FFT):
    """Compute loudness, add to example (ref is white noise, amplitude=1)."""
    # Copied from magenta/ddsp/spectral_ops.py
    np.seterr(divide='ignore')

    # Get magnitudes.
    # hop_size = int(_AUDIO_RATE // _F0_AND_LOUDNESS_RATE)
    if center is False:
        # Add padding to the end.
        n_samples_initial = int(audio.shape[-1])
        n_frames = int(np.ceil(n_samples_initial / hop_size))
        n_samples_final = (n_frames - 1) * hop_size + n_fft
        pad = n_samples_final - n_samples_initial
        audio = np.pad(audio, ((0, pad),), "constant")

    spectra = librosa.stft(
        audio, n_fft=n_fft, hop_length=hop_size, center=center).T

    # Compute power.
    amplitude = np.abs(spectra)
    amin = 1e-20  # Avoid log(0) instabilities.
    power_db = np.log10(np.maximum(amin, amplitude))
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[np.newaxis, :]
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= _REF_DB
    loudness = np.maximum(loudness, -_LD_RANGE)

    # Average over frequency bins.
    mean_loudness_db = np.mean(loudness, axis=-1)
    return mean_loudness_db.astype(np.float32)
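# Sketch of a call on white noise. The leading-underscore constants are
# module-level in the original; the DDSP-style values below are assumed and
# must be defined before calc_loudness itself.
import numpy as np
import librosa

_AUDIO_RATE = 16000   # assumed module constants
_LD_N_FFT = 2048
_REF_DB = 20.7
_LD_RANGE = 120.0

noise = np.random.uniform(-1.0, 1.0, _AUDIO_RATE).astype(np.float32)
ld = calc_loudness(noise)
print(ld.shape, float(ld.mean()))  # per-frame dB; near 0 for amplitude-1 noise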
def get_loudness(wav, sr, n_fft=1280, hop_length=320, win_length=None,
                 ref=1.0, min_db=-80.0):
    """
    Extract the loudness measurement of the signal.
    Feature is extracted using A-weighting of the signal frequencies.
    Args:
        wav - waveform (numpy array)
        sr - sampling rate
        n_fft - number of points for fft
        hop_length - stride of stft
        win_length - size of window of stft
        ref - reference for amplitude log-scale
        min_db - floor for db difference
    Returns:
        loudness - loudness of signal, shape (n_frames, 1)
    """
    A_weighting = librosa.A_weighting(
        librosa.fft_frequencies(sr=sr, n_fft=n_fft) + 1e-6, min_db=min_db)
    weighting = 10 ** (A_weighting / 10)
    power_spec = np.abs(
        librosa.stft(wav, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length)) ** 2
    loudness = np.mean(power_spec * weighting[:, None], axis=0)
    loudness = librosa.power_to_db(loudness, ref=ref)  # in dB
    return loudness[:, np.newaxis].astype(np.float32)
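# Usage sketch with librosa's bundled example clip (any mono waveform works).
import librosa

wav, sr = librosa.load(librosa.example('trumpet'), sr=16000)
loudness = get_loudness(wav, sr)
print(loudness.shape, loudness.dtype)  # (n_frames, 1) float32, in dB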
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    Function is differentiable if use_tf=True.

    Args:
      audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
        [audio_length,].
      sample_rate: Audio sample rate in Hz.
      frame_rate: Rate of loudness frames in Hz.
      n_fft: Fft window size.
      range_db: Sets the dynamic range of loudness in decibels. The minimum
        loudness (per frequency bin) corresponds to -range_db.
      ref_db: Sets the reference maximum perceptual loudness as given by
        (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
        corresponds to white noise with amplitude=1.0 and n_fft=2048. There is
        a slight dependence on fft_size due to different granularity of
        perceptual weighting.
      use_tf: Make function differentiable by using tensorflow.

    Returns:
      Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    # Pick tensorflow or numpy.
    lib = tf if use_tf else np

    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio) if use_tf else audio

    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    audio = audio[lib.newaxis, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    stft_fn = stft if use_tf else stft_np
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Compute power.
    amplitude = lib.abs(s)
    log10 = (lambda x: tf.math.log(x) / tf.math.log(10.0)) if use_tf else np.log10
    amin = 1e-20  # Avoid log(0) instabilities.
    power_db = log10(lib.maximum(amin, amplitude))
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= ref_db
    loudness = lib.maximum(loudness, -range_db)

    # Average over frequency bins.
    mean = tf.reduce_mean if use_tf else np.mean
    loudness = mean(loudness, axis=-1)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness
    return loudness
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    Function is differentiable if use_tf=True.

    Args:
      audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
        [audio_length,].
      sample_rate: Audio sample rate in Hz.
      frame_rate: Rate of loudness frames in Hz.
      n_fft: Fft window size.
      range_db: Sets the dynamic range of loudness in decibels. The minimum
        loudness (per frequency bin) corresponds to -range_db.
      ref_db: Sets the reference maximum perceptual loudness as given by
        (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
        corresponds to white noise with amplitude=1.0 and n_fft=2048. There is
        a slight dependence on fft_size due to different granularity of
        perceptual weighting.
      use_tf: Make function differentiable by using tensorflow.

    Returns:
      Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    if sample_rate % frame_rate != 0:
        raise ValueError(
            'frame_rate: {} must evenly divide sample_rate: {}. '
            'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz'
            .format(frame_rate, sample_rate))

    # Pick tensorflow or numpy.
    lib = tf if use_tf else np

    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio) if use_tf else audio

    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    audio = audio[lib.newaxis, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    stft_fn = stft if use_tf else stft_np
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Compute power.
    amplitude = lib.abs(s)
    power_db = amplitude_to_db(amplitude, use_tf=use_tf)

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= ref_db
    loudness = lib.maximum(loudness, -range_db)

    # Average over frequency bins.
    mean = tf.reduce_mean if use_tf else np.mean
    loudness = mean(loudness, axis=-1)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector.
    n_secs = audio.shape[-1] / float(sample_rate)  # `n_secs` can have milliseconds
    expected_len = int(n_secs * frame_rate)

    # Pad with `-range_db` noise floor or trim vector.
    loudness = pad_or_trim_to_expected_length(loudness, expected_len, -range_db,
                                              use_tf=use_tf)
    return loudness
def a_weighting(frequencies):
    return librosa.A_weighting(frequencies, min_db=-160.)
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=512,
                     range_db=DB_RANGE,
                     ref_db=0.0,
                     use_tf=True,
                     pad_end=True):
    """Perceptual loudness (weighted power) in dB.

    Function is differentiable if use_tf=True.

    Args:
      audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
        [audio_length,].
      sample_rate: Audio sample rate in Hz.
      frame_rate: Rate of loudness frames in Hz.
      n_fft: Fft window size.
      range_db: Sets the dynamic range of loudness in decibels. The minimum
        loudness (per frequency bin) corresponds to -range_db.
      ref_db: Sets the reference maximum perceptual loudness as given by
        (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The old (<v2.0.0)
        default value corresponded to white noise with amplitude=1.0 and
        n_fft=2048. With v2.0.0 it was set to 0.0 to be more consistent with
        power calculations that have a natural scale for 0 dB being
        amplitude=1.0.
      use_tf: Make function differentiable by using tensorflow.
      pad_end: Add zero padding at end of audio (like `same` convolution).

    Returns:
      Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    if sample_rate % frame_rate != 0:
        raise ValueError(
            'frame_rate: {} must evenly divide sample_rate: {}. '
            'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz'
            .format(frame_rate, sample_rate))

    # Pick tensorflow or numpy.
    lib = tf if use_tf else np
    reduce_mean = tf.reduce_mean if use_tf else np.mean
    stft_fn = stft if use_tf else stft_np

    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio) if use_tf else audio

    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    audio = audio[lib.newaxis, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=pad_end)

    # Compute power.
    amplitude = lib.abs(s)
    power = amplitude**2

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]

    # Perform weighting in linear scale, a_weighting given in decibels.
    weighting = 10**(a_weighting / 10)
    power = power * weighting

    # Average over frequencies (weighted power per bin).
    avg_power = reduce_mean(power, axis=-1)
    loudness = core.power_to_db(avg_power,
                                ref_db=ref_db,
                                range_db=range_db,
                                use_tf=use_tf)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector.
    expected_secs = audio.shape[-1] / float(sample_rate)
    expected_len = int(expected_secs * frame_rate)

    # Pad with `-range_db` noise floor or trim vector.
    loudness = pad_or_trim_to_expected_length(loudness, expected_len, -range_db,
                                              use_tf=use_tf)
    return loudness
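# Numpy-path sketch (assumes the DDSP-style helpers `stft_np`,
# `core.power_to_db`, and `pad_or_trim_to_expected_length`, plus the DB_RANGE
# constant, from the same module as the function above).
import numpy as np

audio = np.random.uniform(-1.0, 1.0, 16000).astype(np.float32)  # 1 s at 16 kHz
ld = compute_loudness(audio, use_tf=False)
print(ld.shape)  # (250,): loudness frames at the default 250 Hz frame rate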
Source: https://hyongdoc.tistory.com/402 [Doony Garage]
'''
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt

n_fft = 2048
win_length = 2048
hop_length = 1024
n_mels = 128

print("Loading data ...")
y, sr = librosa.load(librosa.example('nutcracker'))  # formerly librosa.util.example_audio_file()

D = librosa.stft(y)
print(D)

D = np.abs(librosa.stft(y, n_fft=n_fft, win_length=win_length,
                        hop_length=hop_length))
mel_spec = librosa.feature.melspectrogram(S=D, sr=sr, n_mels=n_mels,
                                          hop_length=hop_length,
                                          win_length=win_length)
librosa.display.specshow(librosa.amplitude_to_db(mel_spec, ref=0.00002),
                         sr=sr, hop_length=hop_length,
                         y_axis='mel', x_axis='time')  # , cmap=cm.jet
plt.colorbar(format='%2.0f dB')
plt.show()

# A_weighting returns values in dB, so the y-axis is labeled accordingly.
freqs = librosa.cqt_frequencies(108, fmin=librosa.note_to_hz('C1'))
aw = librosa.A_weighting(freqs)
plt.plot(freqs, aw)
plt.xlabel('Frequency (Hz)')
plt.ylabel('Weighting (dB)')
plt.title('A-Weighting of CQT frequencies')
plt.show()