def __test(tuning, accidental, octave, round_midi):
    note = 'A{:s}'.format(accidental)

    if octave is not None:
        note = '{:s}{:d}'.format(note, octave)
    else:
        octave = 0

    if tuning is not None:
        note = '{:s}{:+d}'.format(note, tuning)
    else:
        tuning = 0

    if round_midi:
        tuning = np.around(tuning, -2)

    hz_true = 440.0 * (2.0**(tuning * 0.01 / 12)) * (2.0**(octave - 4))

    if accidental == '#':
        hz_true *= 2.0**(1. / 12)
    elif accidental in list('b!'):
        hz_true /= 2.0**(1. / 12)

    hz = librosa.note_to_hz(note, round_midi=round_midi)
    assert np.allclose(hz, hz_true)

    hz = librosa.note_to_hz([note], round_midi=round_midi)
    assert np.allclose(hz[0], hz_true)
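# Hedged usage sketch (not part of the test above): note_to_hz accepts a
# single note name or a list, an optional cents offset ('+25' = 25 cents
# sharp), and round_midi to snap the pitch to the nearest semitone first.
import librosa

print(librosa.note_to_hz('A4'))                       # 440.0
print(librosa.note_to_hz('A4+25', round_midi=False))  # 25 cents above A4
print(librosa.note_to_hz(['C4', 'E4', 'G4']))         # array of 3 frequencies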
def make_signal(sr, duration, fmax="C8"): """ Generates a linear sine sweep """ fmin = librosa.note_to_hz("C1") / sr if fmax is None: fmax = 0.5 else: fmax = librosa.note_to_hz(fmax) / sr return np.sin(np.cumsum(2 * np.pi * np.logspace(np.log10(fmin), np.log10(fmax), num=duration * sr)))
def test_cqt():
    sr = 11025

    # Impulse train
    y = np.zeros(int(5.0 * sr))
    y[::sr] = 1.0

    # Hop size not long enough for num octaves
    # num_octaves = 6, 2**6 = 64 > 32
    yield (raises(librosa.ParameterError)(__test_cqt_size), y, sr, 32, None,
           72, 12, 0.0, 2, None, 1, 0.01)

    # Filters go beyond Nyquist: 500 Hz -> 4 octaves = 8000 Hz > 5512.5 Hz
    yield (raises(librosa.ParameterError)(__test_cqt_size), y, sr, 512, 500,
           48, 12, 0.0, 2, None, 1, 0.01)

    # Test with fmin near Nyquist
    for fmin in [3000, 4800]:
        for n_bins in [1, 2]:
            for bins_per_octave in [12]:
                yield (__test_cqt_size, y, sr, 512, fmin, n_bins,
                       bins_per_octave, 0.0, 2, None, 1, 0.01)

    # Test for no errors and correct output size
    for fmin in [None, librosa.note_to_hz('C2')]:
        for n_bins in [1, 12, 24, 48, 72, 74, 76]:
            for bins_per_octave in [12, 24]:
                for tuning in [None, 0, 0.25]:
                    for resolution in [1, 2]:
                        for norm in [1, 2]:
                            yield (__test_cqt_size, y, sr, 512, fmin, n_bins,
                                   bins_per_octave, tuning, resolution,
                                   None, norm, 0.01)
def test_cqt_white_noise():

    def __test(fmin, n_bins, scale, sr, y):
        C = np.abs(librosa.cqt(y=y, sr=sr, fmin=fmin, n_bins=n_bins,
                               scale=scale))

        if not scale:
            lengths = librosa.filters.constant_q_lengths(sr, fmin,
                                                         n_bins=n_bins)
            C /= np.sqrt(lengths[:, np.newaxis])

        # Only compare statistics across the time dimension;
        # we want ~ constant mean and variance across frequencies
        assert np.allclose(np.mean(C, axis=1), 1.0, atol=2.5e-1), np.mean(C, axis=1)
        assert np.allclose(np.std(C, axis=1), 0.5, atol=5e-1), np.std(C, axis=1)

    srand()
    for sr in [22050]:
        y = np.random.randn(30 * sr)

        for scale in [False, True]:
            for fmin in librosa.note_to_hz(['C1', 'C2']):
                for n_octaves in range(2, 4):
                    yield __test, fmin, n_octaves * 12, scale, sr, y
def test_hcqt_white_noise():

    def __test(fmin, n_bins, scale, sr, y):
        C = librosa.hybrid_cqt(y=y, sr=sr, fmin=fmin, n_bins=n_bins,
                               scale=scale)

        if not scale:
            lengths = librosa.filters.constant_q_lengths(sr, fmin,
                                                         n_bins=n_bins)
            C /= np.sqrt(lengths[:, np.newaxis])

        assert np.allclose(np.mean(C, axis=1), 1.0, atol=2.5e-1), np.mean(C, axis=1)
        assert np.allclose(np.std(C, axis=1), 0.5, atol=5e-1), np.std(C, axis=1)

    srand()
    for sr in [22050]:
        y = np.random.randn(30 * sr)

        for scale in [False, True]:
            for fmin in librosa.note_to_hz(['C1', 'C2']):
                for n_octaves in [6, 7]:
                    yield __test, fmin, n_octaves * 12, scale, sr, y
def test_constant_q():

    def __test(sr, fmin, n_bins, bins_per_octave, tuning, resolution,
               pad_fft, norm):
        F, lengths = librosa.filters.constant_q(sr,
                                                fmin=fmin,
                                                n_bins=n_bins,
                                                bins_per_octave=bins_per_octave,
                                                tuning=tuning,
                                                resolution=resolution,
                                                pad_fft=pad_fft,
                                                norm=norm)

        assert np.all(lengths <= F.shape[1])
        eq_(len(F), n_bins)

        if not pad_fft:
            return

        eq_(np.mod(np.log2(F.shape[1]), 1.0), 0.0)

        # Check for vanishing negative frequencies
        F_fft = np.abs(np.fft.fft(F, axis=1))
        # Normalize by row-wise peak
        F_fft = F_fft / np.max(F_fft, axis=1, keepdims=True)
        assert not np.any(F_fft[:, -F_fft.shape[1]//2:] > 1e-4)

    sr = 11025

    # Try to make a cq basis too close to nyquist
    yield (raises(librosa.ParameterError)(__test), sr, sr/2.0, 1, 12, 0, 1,
           True, 1)

    # with negative fmin
    yield (raises(librosa.ParameterError)(__test), sr, -60, 1, 12, 0, 1,
           True, 1)

    # with negative bins_per_octave
    yield (raises(librosa.ParameterError)(__test), sr, 60, 1, -12, 0, 1,
           True, 1)

    # with negative bins
    yield (raises(librosa.ParameterError)(__test), sr, 60, -1, 12, 0, 1,
           True, 1)

    # with negative resolution
    yield (raises(librosa.ParameterError)(__test), sr, 60, 1, 12, 0, -1,
           True, 1)

    # with negative norm
    yield (raises(librosa.ParameterError)(__test), sr, 60, 1, 12, 0, 1,
           True, -1)

    for fmin in [None, librosa.note_to_hz('C3')]:
        for n_bins in [12, 24]:
            for bins_per_octave in [12, 24]:
                for tuning in [0, 0.25]:
                    for resolution in [1, 2]:
                        for norm in [1, 2]:
                            for pad_fft in [False, True]:
                                yield (__test, sr, fmin, n_bins,
                                       bins_per_octave, tuning, resolution,
                                       pad_fft, norm)
def test_cqt():
    sr = 11025
    duration = 5.0

    y = make_signal(sr, duration)

    # incorrect hop length for a 6-octave analysis
    # num_octaves = 6, 2**(6-1) = 32 > 16
    for hop_length in [-1, 0, 16, 63, 65]:
        yield (raises(librosa.ParameterError)(__test_cqt_size), y, sr,
               hop_length, None, 72, 12, 0.0, 2, None, 1, 0.01)

    # Filters go beyond Nyquist: 500 Hz -> 4 octaves = 8000 Hz > 5512.5 Hz
    yield (raises(librosa.ParameterError)(__test_cqt_size), y, sr, 512, 500,
           4 * 12, 12, 0.0, 2, None, 1, 0.01)

    # Test with fmin near Nyquist
    for fmin in [3000, 4800]:
        for n_bins in [1, 2]:
            for bins_per_octave in [12]:
                yield (__test_cqt_size, y, sr, 512, fmin, n_bins,
                       bins_per_octave, 0.0, 2, None, 1, 0.01)

    # Test for no errors and correct output size
    for fmin in [None, librosa.note_to_hz('C2')]:
        for n_bins in [1, 12, 24, 48, 72, 74, 76]:
            for bins_per_octave in [12, 24]:
                for tuning in [None, 0, 0.25]:
                    for resolution in [1, 2]:
                        for norm in [1, 2]:
                            yield (__test_cqt_size, y, sr, 512, fmin, n_bins,
                                   bins_per_octave, tuning, resolution,
                                   None, norm, 0.01)
def __test(target_hz, resolution, bins_per_octave, tuning):
    y = np.sin(2 * np.pi * target_hz * t)
    tuning_est = librosa.estimate_tuning(resolution=resolution,
                                         bins_per_octave=bins_per_octave,
                                         y=y,
                                         sr=sr,
                                         n_fft=2048,
                                         fmin=librosa.note_to_hz('C4'),
                                         fmax=librosa.note_to_hz('G#9'))

    # Round to the proper number of decimals
    deviation = np.around(np.abs(tuning - tuning_est),
                          int(-np.log10(resolution)))

    # We'll accept an answer within three bins of the resolution
    assert deviation <= 3 * resolution
def test_hybrid_cqt():
    # This test verifies that hybrid and full cqt agree down to 1e-4
    # on 99% of bins which are nonzero (> 1e-8) in either representation.

    sr = 11025
    duration = 5.0

    y = make_signal(sr, duration, None)

    def __test(hop_length, fmin, n_bins, bins_per_octave, tuning, resolution,
               norm, sparsity, res_type):
        C2 = librosa.hybrid_cqt(y, sr=sr, hop_length=hop_length, fmin=fmin,
                                n_bins=n_bins,
                                bins_per_octave=bins_per_octave,
                                tuning=tuning, filter_scale=resolution,
                                norm=norm, sparsity=sparsity,
                                res_type=res_type)

        C1 = np.abs(librosa.cqt(y, sr=sr, hop_length=hop_length, fmin=fmin,
                                n_bins=n_bins,
                                bins_per_octave=bins_per_octave,
                                tuning=tuning, filter_scale=resolution,
                                norm=norm, sparsity=sparsity,
                                res_type=res_type))

        assert C1.shape == C2.shape

        # Check for numerical comparability
        idx1 = (C1 > 1e-4 * C1.max())
        idx2 = (C2 > 1e-4 * C2.max())

        perc = 0.99
        thresh = 1e-3

        idx = idx1 | idx2

        assert np.percentile(np.abs(C1[idx] - C2[idx]),
                             perc) < thresh * max(C1.max(), C2.max())

    for fmin in [None, librosa.note_to_hz('C2')]:
        for n_bins in [1, 12, 24, 48, 72, 74, 76]:
            for bins_per_octave in [12, 24]:
                for tuning in [None, 0, 0.25]:
                    for resolution in [1, 2]:
                        for norm in [1, 2]:
                            for res_type in [None, 'polyphase']:
                                yield (__test, 512, fmin, n_bins,
                                       bins_per_octave, tuning, resolution,
                                       norm, 0.01, res_type)
def test_hybrid_cqt():
    sr = 11025

    # Impulse train
    y = np.zeros(int(5.0 * sr))
    y[::sr] = 1.0

    def __test(hop_length, fmin, n_bins, bins_per_octave, tuning, resolution,
               norm, sparsity):
        C2 = librosa.hybrid_cqt(y, sr=sr, hop_length=hop_length, fmin=fmin,
                                n_bins=n_bins,
                                bins_per_octave=bins_per_octave,
                                tuning=tuning, resolution=resolution,
                                norm=norm, sparsity=sparsity)

        C1 = librosa.cqt(y, sr=sr, hop_length=hop_length, fmin=fmin,
                         n_bins=n_bins, bins_per_octave=bins_per_octave,
                         tuning=tuning, resolution=resolution, norm=norm,
                         sparsity=sparsity)

        eq_(C1.shape, C2.shape)

        # Check for numerical comparability
        assert np.mean(np.abs(C1 - C2)) < 1e-3

    # Hop size not long enough for num octaves
    # num_octaves = 6, 2**(72/12) = 64 > 32
    yield (raises(librosa.ParameterError)(__test), 32, None, 72, 12, 0.0, 2,
           1, 0.01)

    for fmin in [None, librosa.note_to_hz("C2")]:
        for n_bins in [1, 12, 24, 48, 72, 74, 76]:
            for bins_per_octave in [12, 24]:
                for tuning in [None, 0, 0.25]:
                    for resolution in [1, 2]:
                        for norm in [1, 2]:
                            yield (__test, 512, fmin, n_bins,
                                   bins_per_octave, tuning, resolution,
                                   norm, 0.01)
def __init__(self, name, sr, hop_length, n_octaves=8, over_sample=3,
             fmin=None):
    super(CQT, self).__init__(name, sr, hop_length)

    if fmin is None:
        fmin = librosa.note_to_hz('C1')

    self.n_octaves = n_octaves
    self.over_sample = over_sample
    self.fmin = fmin

    self.register('mag', [None, n_octaves * 12 * over_sample], np.float32)
    self.register('phase', [None, n_octaves * 12 * over_sample], np.float32)
def __init__(self, sr=32768, hop_length=1024, n_octaves=8, over_sample=3,
             fmin=None, dtype=np.float32):
    self.sr = sr
    self.hop_length = hop_length
    self.n_octaves = n_octaves
    self.over_sample = over_sample

    if fmin is None:
        fmin = librosa.note_to_hz('C1')
    self.fmin = fmin

    self.dtype = dtype
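# Hedged sketch of the analysis geometry implied by the defaults above
# (this only restates the stored parameters; nothing here is called by the
# class): over_sample bins per semitone gives 12 * over_sample bins per
# octave and n_octaves * 12 * over_sample bins in total.
import librosa

n_octaves, over_sample = 8, 3
bins_per_octave = 12 * over_sample    # 36
n_bins = n_octaves * bins_per_octave  # 288
fmin = librosa.note_to_hz('C1')       # ~32.7 Hz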
def fcqt(time_signal, fs, q_rate=q_rate_def, fmin=None, fratio=note_resolution,
         spThresh=0.0054, num_oct=6):
    # Default fmin
    if fmin is None:
        fmin = librosa.note_to_hz("C1")

    # Fixed hop size
    nhop = CQTHOP

    # Calculate Constant-Q properties
    freqs = GenMusicalScale(fmin, note_resolution, num_oct)  # center frequency of each bin
    nfreq = int(num_oct * note_resolution)  # number of frequency bins
    Q = int((1. / ((2**(1. / fratio)) - 1)) * q_rate)  # Eq.(2), Q value (Brown, 1992)
    sig_len = len(time_signal)  # number of samples
    nframe = int(sig_len / nhop)  # number of frames

    # N > max(N_k)
    fftLen = int(2**(ceil(log2(int(float(fs * Q) / freqs[0])))))
    h_fftLen = int(fftLen / 2)

    # ====================
    # Kernel matrix
    # ====================
    sparseKernel = zeros([nfreq, fftLen], dtype=complex128)
    for k in range(nfreq):
        tmpKernel = zeros(fftLen, dtype=complex128)
        freq = freqs[k]
        # N_k
        N_k = int(float(fs * Q) / freq)
        # Center the FFT window on the analysis region
        startWin = int((fftLen - N_k) / 2)
        tmpKernel[startWin:startWin + N_k] = (hammingWindow(N_k) / N_k) * exp(
            two_pi_j * Q * arange(N_k, dtype=float64) / N_k)
        # FFT (kernel matrix)
        sparseKernel[k] = np.fft.fft(tmpKernel)

    # Zero out sufficiently small values
    sparseKernel[abs(sparseKernel) <= spThresh] = 0
    # Convert to a sparse matrix
    sparseKernel = csr_matrix(sparseKernel)
    # Take the complex conjugate, normalized by the FFT length
    sparseKernel = sparseKernel.conjugate() / fftLen

    # ===========
    # Execution
    # ===========
    # New signal (zero-padded for the computation)
    new_sig = zeros(len(time_signal) + fftLen, dtype=float64)
    new_sig[h_fftLen:-h_fftLen] = time_signal

    ret = zeros([nframe, nfreq], dtype=complex128)
    for iiter in tqdm(range(nframe)):
        istart = iiter * nhop
        iend = istart + fftLen
        # FFT of the input frame
        sig_fft = np.fft.fft(new_sig[istart:iend])
        # Matrix product with the kernel
        ret[iiter] = sig_fft * sparseKernel.T

    return ret, freqs
def get_first_null_f0(items_handler: ItemsHandler,
                      start_offset: float,
                      min_duration: float,
                      end_offset: ty.Optional[float] = None,
                      min_note: str = 'C1',
                      max_note: str = 'C7',
                      frame_length: float = 2048,
                      win_length: ty.Optional[float] = None,
                      offset_units: LengthUnit = LengthUnit.ms,
                      length_units: LengthUnit = LengthUnit.samples) -> float:
    audio = items_handler.load_audio()[0]
    sr = items_handler.sr
    if length_units != LengthUnit.samples:
        if length_units != LengthUnit.ms:
            raise TypeError('length_units can be only of ms or samples')
        frame_length = length_convert(frame_length, sr, length_units,
                                      LengthUnit.samples)
        if win_length:
            win_length = length_convert(win_length, sr, length_units,
                                        LengthUnit.samples)
    hop_length = int(frame_length // 4)
    start_offset_int = ty.cast(
        int, length_convert(start_offset, sr, offset_units,
                            LengthUnit.samples))
    if start_offset_int:
        audio = audio[start_offset_int:]  # type:ignore
    if end_offset:
        end_offset_int = ty.cast(
            int, length_convert(end_offset, sr, offset_units,
                                LengthUnit.samples))
        audio = audio[:end_offset_int - start_offset_int]  # type:ignore
    min_duration_frms = length_convert(min_duration, sr, offset_units,
                                       LengthUnit.frames,
                                       hop_length=hop_length)
    fmin, fmax = lr.note_to_hz(min_note), lr.note_to_hz(max_note)
    f0s, v_flag, v_prob = lr.pyin(
        audio,
        fmin=fmin,
        fmax=fmax,
        sr=sr,
        win_length=None if win_length is None else win_length,
        frame_length=frame_length,
    )
    nulls = np.where(~v_flag)
    for idx, val in enumerate(nulls[0]):
        if val >= min_duration_frms:
            if v_flag[val + 1]:
                continue
            break
    if val < 5:
        raise PitchError(
            f'Cannot find null f0 at the reasonable frame (>=5): {v_flag}')
    val_normalized = length_convert(val, sr, LengthUnit.frames, offset_units,
                                    hop_length=hop_length)
    return start_offset + val_normalized
def save_rainbowgram_plot(audio,
                          sample_rate: int = 16000,
                          filename: str = None,
                          output_dir: str = "output") -> None:
    """
    Saves the spectrogram plot of the given audio to the given filename in
    the given output_dir. The resulting plot is a Constant-Q transform (CQT)
    spectrogram with the vertical axis being the amplitude converted to
    dB-scale, the intensity of lines proportional to the log magnitude of
    the power spectrum, and the color given by the derivative of the phase,
    making the phase visible as "rainbow colors", hence the name
    "rainbowgrams" (coined by the Magenta team).

    :param audio: the audio content, as a floating point time series
    :param sample_rate: the sampling rate of the file
    :param filename: the optional filename, set to "%Y-%m-%d_%H%M%S".png if None
    :param output_dir: the output dir
    """
    os.makedirs(output_dir, exist_ok=True)

    # Configuration from https://arxiv.org/abs/1704.01279
    # and https://gist.github.com/jesseengel/e223622e255bd5b8c9130407397a0494
    peak = 70
    hop_length = 256
    over_sample = 4
    res_factor = 0.8
    octaves = 6
    notes_per_octave = 10
    color_dict = {
        "red": ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
        "green": ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
        "blue": ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
        "alpha": ((0.0, 1.0, 1.0), (1.0, 0.0, 0.0))
    }
    color_mask = LinearSegmentedColormap("ColorMask", color_dict)
    plt.register_cmap(cmap=color_mask)

    # Init subplots: there is only one plot, but we have to use 2 cmaps,
    # which means 2 calls to ax.matshow that wouldn't work on a single plot.
    fig, ax = plt.subplots()
    plt.axis("off")

    bins_per_octave = int(notes_per_octave * over_sample)
    num_bins = int(octaves * notes_per_octave * over_sample)
    constant_q_transform = librosa.cqt(audio,
                                       sr=sample_rate,
                                       hop_length=hop_length,
                                       bins_per_octave=bins_per_octave,
                                       n_bins=num_bins,
                                       filter_scale=res_factor,
                                       fmin=librosa.note_to_hz("C2"))
    mag, phase = librosa.core.magphase(constant_q_transform)
    phase_angle = np.angle(phase)
    phase_unwrapped = np.unwrap(phase_angle)
    dphase = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1]
    dphase = np.concatenate([phase_unwrapped[:, 0:1], dphase], axis=1) / np.pi
    mag = (librosa.amplitude_to_db(mag, amin=1e-13, top_db=peak, ref=np.max)
           / peak) + 1
    ax.matshow(dphase[::-1, :], cmap=plt.cm.rainbow)
    ax.matshow(mag[::-1, :], cmap=color_mask)

    if not filename:
        date_and_time = time.strftime("%Y-%m-%d_%H%M%S")
        filename = f"{date_and_time}.png"
    path = os.path.join(output_dir, filename)
    plt.savefig(fname=path, dpi=600)
    plt.close(fig)
def probabilities(y, note_min, note_max, sr, frame_length, window_length,
                  hop_length, pitch_acc, voiced_acc, onset_acc, spread):
    """
    Estimate prior (observed) probabilities from an audio signal.

    Parameters
    ----------
    y : 1-D numpy array
        Array containing audio samples
    note_min : string, 'A#4' format
        Lowest note supported by this estimator
    note_max : string, 'A#4' format
        Highest note supported by this estimator
    sr : int
        Sample rate.
    frame_length : int
    window_length : int
    hop_length : int
        Parameters for FFT estimation
    pitch_acc : float, between 0 and 1
        Probability (estimated) that the pitch estimator is correct.
    voiced_acc : float, between 0 and 1
        Estimated accuracy of the "voiced" parameter.
    onset_acc : float, between 0 and 1
        Estimated accuracy of the onset detector.
    spread : float, between 0 and 1
        Probability that the singer/musician had a one-semitone deviation
        due to vibrato or glissando.

    Returns
    -------
    P : 2D numpy array
        P[j, t] is the prior probability of being in state j at time t.
    """
    fmin = librosa.note_to_hz(note_min)
    fmax = librosa.note_to_hz(note_max)
    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1

    # F0 and voicing
    f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin * 0.9, fmax * 1.1,
                                                sr, frame_length,
                                                window_length, hop_length)
    tuning = librosa.pitch_tuning(f0)
    f0_ = np.round(librosa.hz_to_midi(f0 - tuning)).astype(int)
    onsets = librosa.onset.onset_detect(y, sr=sr, hop_length=hop_length,
                                        backtrack=True)

    P = np.ones((n_notes * 2 + 1, len(f0)))

    for t in range(len(f0)):
        # Probability of silence or onset = 1 - voiced_prob
        # Probability of a note = voiced_prob * pitch_acc       (estimated note)
        # Probability of a note = voiced_prob * (1 - pitch_acc) (other notes)
        if not voiced_flag[t]:
            P[0, t] = voiced_acc
        else:
            P[0, t] = 1 - voiced_acc

        for j in range(n_notes):
            if t in onsets:
                P[(j * 2) + 1, t] = onset_acc
            else:
                P[(j * 2) + 1, t] = 1 - onset_acc

            if j + midi_min == f0_[t]:
                P[(j * 2) + 2, t] = pitch_acc
            elif np.abs(j + midi_min - f0_[t]) == 1:
                P[(j * 2) + 2, t] = pitch_acc * spread
            else:
                P[(j * 2) + 2, t] = 1 - pitch_acc

    return P
from keras.layers import *
from keras.models import Model
from keras.utils import plot_model
from numpy.lib.stride_tricks import as_strided
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd

# Reference [d]: Classical MIDI Files, https://www.mfiles.co.uk/classical-midi.htm

model = None
history = None
trainx, trainy = ([], [])
validx, validy = ([], [])

BINS = 88
FREF = librosa.note_to_hz('A0')  # Reference frequency = 27.50 Hz -> A0
# For 84 frequency bins:
# FREF = librosa.note_to_hz('C1')  # Reference frequency = 32.70 Hz -> C1

hPi = np.zeros(BINS)           # HMM initial state (note) probabilities
hSteps = np.zeros(BINS*2-1)    # Transition steps
hA = np.zeros([BINS, BINS])    # Transition matrix
hB = np.zeros([BINS, BINS])    # Emission matrix

fbins = librosa.cqt_frequencies(BINS, fmin=FREF)
FMAX = fbins[BINS-1]
STD = 25  # Standard deviation for the probability vector
Sr = 16000


def sample(filename):
    global trainx, trainy, validx, validy, Sr
def detectionOnsets(y):
    fmin = librosa.note_to_hz(Notemin)
    fmax = librosa.note_to_hz(Notemax)
    # Nmin = int((sr/(fmax*(2**(1/BINS_PER_OCTAVE)-1))))
    # Nmax = int((sr/(fmin*(2**(1/BINS_PER_OCTAVE)-1))))
    n_bins = int((librosa.note_to_midi(Notemax) - librosa.note_to_midi(Notemin))
                 * BINS_PER_OCTAVE / 12)
    Chrom = librosa.amplitude_to_db(np.abs(
        librosa.cqt(y=y, sr=sr, hop_length=STEP, fmin=fmin,
                    bins_per_octave=BINS_PER_OCTAVE, n_bins=n_bins)),
        ref=np.max)
    Nf = len(Chrom)
    N = len(Chrom[0])
    Diff = np.zeros((Nf, N))
    Dev = np.zeros(N)
    for j in range(1, N):
        for i in range(Nf):
            Diff[i, j] = np.abs(Chrom[i, j] - Chrom[i, j - 1])
        Dev[j] = sum(Diff[:, j])

    # THRESHOLD FUNCTION
    # Pad zeros at the head and the tail
    l = []
    Seuil = []
    Onsets = []
    for k in range(int(H / 2)):
        l.append(0)
    for val in Dev:
        l.append(val)
    for k in range(int(H / 2)):
        l.append(0)
    # Compute the median
    for i in range(N):
        Seuil.append(ALPHA + BETA * stat.median(l[i:i + H]))
        if Dev[i] > Seuil[i]:
            Onsets.append(i)

    times = librosa.frames_to_time(np.arange(N), sr=sr, hop_length=STEP)

    # ONSET PRUNING
    i = 0
    while i < (len(Onsets) - 1):
        while (i < (len(Onsets) - 1)) and (times[Onsets[i + 1]] <
                                           times[Onsets[i]] + T):
            if Dev[Onsets[i + 1]] < Dev[Onsets[i]]:
                del Onsets[i + 1]
            else:
                del Onsets[i]
        i = i + 1

    onset_frames = librosa.util.fix_frames(Onsets, x_min=0,
                                           x_max=Chrom.shape[1] - 1)
    onset_times = librosa.frames_to_time(onset_frames, sr=sr, hop_length=STEP)

    # Synchronize on the onsets, trimming the start and end of long frames
    ChromSync = np.zeros((Nf, len(onset_frames) - 1))
    n_att = int(librosa.time_to_frames(T_att, sr=sr, hop_length=STEP))
    for j in range(len(onset_frames) - 1):
        for i in range(Nf):
            ChromSync[i, j] = np.mean(
                Chrom[i][(onset_frames[j] + n_att):(onset_frames[j + 1] - n_att)])

    # Spectrum normalization
    if norm_spectre:
        for j in range(ChromSync.shape[1]):
            ChromSync[:, j] = librosa.power_to_db(
                librosa.db_to_power(ChromSync[:, j]) /
                np.sum(librosa.db_to_power(ChromSync[:, j])))

    # Display
    if plot_onsets:
        plt.figure(figsize=(13, 7))
        ax1 = plt.subplot(3, 1, 1)
        librosa.display.specshow(Chrom, bins_per_octave=BINS_PER_OCTAVE,
                                 fmin=fmin, y_axis='cqt_note', x_axis='time',
                                 x_coords=times)
        plt.title('CQT spectrogram')

        plt.subplot(3, 1, 2, sharex=ax1)
        plt.plot(times, Dev, label='Deviation')
        plt.plot(times, Seuil, color='g', label='Threshold')
        plt.vlines(times[Onsets], 0, Dev.max(), color='r', alpha=0.9,
                   linestyle='--', label='Onsets')
        plt.axis('tight')
        plt.legend(frameon=True, framealpha=0.75)

        ax1 = plt.subplot(3, 1, 3, sharex=ax1)
        librosa.display.specshow(ChromSync, bins_per_octave=BINS_PER_OCTAVE,
                                 fmin=fmin, y_axis='cqt_note', x_axis='time',
                                 x_coords=onset_times)
        plt.show()

    return onset_times
print(pw_l.shape)
ms_l = librosa.feature.melspectrogram(S=pw_l, n_mels=256)
ms_r = librosa.feature.melspectrogram(S=pw_r, n_mels=256)  # by default n_mels=128
print(ms_l.shape)

tranform = np.empty((2, 256, 431))
tranform[0] = ms_l
tranform[1] = ms_r

path_save = data_path + "\\" + save1_path + "\\"
np.save(path_save + 'airport-barcelona-0-0-a.npy', tranform)

# CQT
Cqt = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1'))
print(Cqt.size)
C = np.abs(Cqt)
freqs = librosa.cqt_frequencies(C.shape[0], fmin=librosa.note_to_hz('A1'))
print(freqs.size)
perceptual_Cqt = librosa.perceptual_weighting(C**2, freqs, ref=np.max)

plt.figure()
plt.subplot(2, 1, 1)
librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
                         fmin=librosa.note_to_hz('A1'),
                         y_axis='cqt_hz')
plt.title('Log CQT power')
plt.colorbar(format='%+2.0f dB')
plt.subplot(2, 1, 2)
librosa.display.specshow(perceptual_Cqt,
def demo(instrument):
    fund_freq = librosa.note_to_hz("C4")
    fname = os.path.join(AUDIO_SAMPLES_DIR, "{}_C4.wav".format(instrument))
    y, sr = librosa.load(fname)
    plot_signal_harmonics(y, sr, fund_freq)
    return Audio(fname)
def notes_to_audio(automaton=False, function=None, deterministic=True,
                   maxsamplesize=44100):
    if function is not None:
        print("###################################################")
        print("Function to Audio")
        print("###################################################")
        print("Function:", function)
        # Example:
        # >>> [eval('x*x+x+1') for x in range(1, 10)]
        # [3, 7, 13, 21, 31, 43, 57, 73, 91]
        notes = [eval(function) for x in range(0, 44100)]
        npnotes = np.asarray(notes)
        # scalednpnotes = np.int16(npnotes/np.max(npnotes)*32767)
        scalednpnotes = npnotes
        print("Notes :", scalednpnotes)
        print("Size of scaled notes:", len(scalednpnotes))
        write("function_synthesized_music.wav", maxsamplesize, scalednpnotes)
        return
    if function is None and automaton == False:
        print("###################################################")
        print("Notes to Audio")
        print("###################################################")
        npnotes = np.random.uniform(10, 100, 44100)
        # scalednpnotes = np.int16(npnotes/np.max(npnotes)*32767)
        scalednpnotes = npnotes
        print("Notes :", scalednpnotes)
        print("Size of scaled notes:", len(scalednpnotes))
        write("notes_synthesized_music.wav", maxsamplesize, scalednpnotes)
        return
    if automaton == True:
        print("###################################################")
        print("Automaton to Audio")
        print("###################################################")
        states2notes_machine_file = open("NotesStateMachine.txt", "r")
        states2notes_machine = ast.literal_eval(
            states2notes_machine_file.read())
        dfanotes = [
            int(librosa.note_to_hz(states2notes_machine['start-s1']) * 1000)
        ]
        prevstates = ['start']
        iter = 0
        while iter < maxsamplesize - 1:
            possibletransitions = []
            prevprevstates = prevstates
            prevstates = []
            for k, v in states2notes_machine.items():
                statetransition = k.split("-")
                if statetransition[0] in prevprevstates:
                    possibletransitions.append(states2notes_machine[k])
                    prevstates.append(statetransition[1])
                    if deterministic:
                        break
            for note in possibletransitions:
                hertz = librosa.note_to_hz(note)
                dfanotes.append(int(hertz * 1000))
            iter += 1
        npnotes = np.array(dfanotes)
        # scalednpnotes = np.int16(npnotes/np.max(npnotes)*32767)
        scalednpnotes = npnotes
        print("Notes :", scalednpnotes)
        print("Size of scaled dfanotes:", len(scalednpnotes))
        write("automaton_synthesized_music.wav", maxsamplesize, scalednpnotes)
        return
def test_constant_q():

    def __test(sr, fmin, n_bins, bins_per_octave, tuning, filter_scale,
               pad_fft, norm):
        F, lengths = librosa.filters.constant_q(
            sr,
            fmin=fmin,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            tuning=tuning,
            filter_scale=filter_scale,
            pad_fft=pad_fft,
            norm=norm)

        assert np.all(lengths <= F.shape[1])
        eq_(len(F), n_bins)

        if not pad_fft:
            return

        eq_(np.mod(np.log2(F.shape[1]), 1.0), 0.0)

        # Check for vanishing negative frequencies
        F_fft = np.abs(np.fft.fft(F, axis=1))
        # Normalize by row-wise peak
        F_fft = F_fft / np.max(F_fft, axis=1, keepdims=True)
        assert not np.any(F_fft[:, -F_fft.shape[1] // 2:] > 1e-4)

    sr = 11025

    # Try to make a cq basis too close to nyquist
    yield (raises(librosa.ParameterError)(__test), sr, sr / 2.0, 1, 12, 0, 1,
           True, 1)

    # with negative fmin
    yield (raises(librosa.ParameterError)(__test), sr, -60, 1, 12, 0, 1,
           True, 1)

    # with negative bins_per_octave
    yield (raises(librosa.ParameterError)(__test), sr, 60, 1, -12, 0, 1,
           True, 1)

    # with negative bins
    yield (raises(librosa.ParameterError)(__test), sr, 60, -1, 12, 0, 1,
           True, 1)

    # with negative filter_scale
    yield (raises(librosa.ParameterError)(__test), sr, 60, 1, 12, 0, -1,
           True, 1)

    # with negative norm
    yield (raises(librosa.ParameterError)(__test), sr, 60, 1, 12, 0, 1,
           True, -1)

    for fmin in [None, librosa.note_to_hz('C3')]:
        for n_bins in [12, 24]:
            for bins_per_octave in [12, 24]:
                for tuning in [0, 0.25]:
                    for filter_scale in [1, 2]:
                        for norm in [1, 2]:
                            for pad_fft in [False, True]:
                                yield (__test, sr, fmin, n_bins,
                                       bins_per_octave, tuning, filter_scale,
                                       pad_fft, norm)
def perceptual_cqt(y, sr):
    C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')))
    # fmin adapted to music; A-weighting is applied to the power CQT
    freqs = librosa.cqt_frequencies(C.shape[0], fmin=librosa.note_to_hz('A1'))
    perceptual_CQT = librosa.perceptual_weighting(C**2, freqs, ref=np.max)
    return perceptual_CQT
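# Hedged usage sketch for perceptual_cqt above (the example-file helper is
# available in older librosa releases; any mono signal works):
import librosa
y, sr = librosa.load(librosa.util.example_audio_file())
P = perceptual_cqt(y, sr)
print(P.shape)  # (n_bins, n_frames), in perceptually weighted dB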
# print some signal stuff
print("x: ", x.shape)
print("fs: ", fs)
print("hop: ", hop)
print("frame length: ", len(x) // hop)

# extract filename label
label = extract_label(file)

# --
# chroma features

# calc chroma
fmin = librosa.note_to_hz('C2')
chroma = calc_chroma(x, fs, hop, n_octaves=4, bins_per_octave=36, fmin=fmin)

# --
# onsets

h, a, b = 5, 0.09, 0.7
param_str = '{}_h-{}_a-{}_b-{}'.format(label, h, a, b).replace('.', 'p')

# calc onsets
#onsets, onset_times, c, thresh = calc_onsets(x, fs, N=N, hop=hop, adapt_frames=h, adapt_alpha=a, adapt_beta=b)

# --
def generate_dataset(mfw, base_path, audio_preview=False, use_cqt=True,
                     interactive_plots=False):
    utils.ensure_parent_exists(base_path)
    path_midi = "{}.mid".format(base_path)
    path_wave = "{}.wav".format(base_path)
    store_midi_and_wave(mfw.midi_file, path_midi, path_wave)

    if audio_preview:
        os.system("audacious '{}' &".format(path_wave))

    sr, wave_data = read_wave(path_wave)

    if use_cqt:
        print("Transforming data")
        bins_per_note = 4
        bins_per_octave = 12 * bins_per_note
        n_octaves = 9
        n_bins = n_octaves * bins_per_octave
        hop_length = 512

        lowest_note_name = "C1"  # cqt default
        lowest_note_hz = librosa.note_to_hz(lowest_note_name)
        lowest_note_midi = librosa.note_to_midi(lowest_note_name)

        # https://librosa.github.io/librosa/generated/librosa.core.cqt.html
        C = cqt(
            wave_data,
            sr=sr,
            fmin=lowest_note_hz,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            hop_length=hop_length,
            filter_scale=1.0,
            #sparsity=0.0,
            tuning=0.0,  # we don't want automatic tuning estimation
        )
        mag = np.abs(C).astype(np.float32)

        # 16th notes at 200 bpm are 800 notes/min = 13.3 notes/sec
        # => note duration = 75 ms
        print("Sample rate: {}".format(sr))
        print("Hop duration: {:.1f} ms".format(hop_length / sr * 1000))
        print("Length audio: {:.1f} sec".format(len(wave_data) / sr))
        print("Shape audio: {} [{}, {:.1f} MB]".format(
            wave_data.shape, wave_data.dtype, wave_data.nbytes / 1e6))
        print("Shape transformed: {} [{}, {:.1f} MB]".format(
            mag.shape, mag.dtype, mag.nbytes / 1e6))

        # Groundtruth extraction with same shape
        groundtruth = mfw.extract_groundtruth(
            raw_length=len(wave_data),
            sample_rate=sr,
            lowest_note=lowest_note_midi,
            highest_note=lowest_note_midi + n_octaves * 12,
            hop_length=hop_length,
            bins_per_note=4,
        )

        print("Storing dataset")
        path_X = "{}_X.npy".format(base_path)
        path_Y = "{}_Y.npy".format(base_path)
        np.save(path_X, mag)
        np.save(path_Y, groundtruth)

        print("Generating plots")
        plot_dataset(C, groundtruth, base_path, sr, lowest_note_hz,
                     hop_length, bins_per_octave, interactive_plots)

    else:
        np.save("wavedata.npy", wave_data)
        os.system("nim -r c ./src/process_wave_data wavedata.npy")
        data = np.load("wavedata_preprocessed.npy")
        data = data[::-1, :]
        fig, ax = plt.subplots(1, 1, figsize=(16, 10))
        plt.subplots_adjust(left=0.05, bottom=0.05, top=0.95, right=0.95)
        plt.imshow(data, aspect='auto')
        plt.show()
import os
import os.path as osp
from hashlib import sha1
from subprocess import DEVNULL, call, check_call

import librosa
import numpy as np
from scipy.spatial import ConvexHull

cqt_fmin = librosa.note_to_hz('A1')
ffmpeg_path = './codecs/ffmpeg.exe'


class Codec:
    # TODO: add flag indicating whether supporting hires files
    def __init__(self, path, cmd_args):
        '''
        path: path to the codec binary
        cmd_args: (extra) command line args for encoding, used for
            adjusting compression level
        '''
        self.path = osp.abspath(path)
        if cmd_args is None:
            self.cmd_args = []
        elif isinstance(cmd_args, (list, tuple)):
            self.cmd_args = list(cmd_args)
        else:
            self.cmd_args = [cmd_args]

    def encode(self, fin):
        pass

    def decode(self, fin):
def beatract(dir_name, file_name=-1, save_dir=-1, addable_option="-n",
             specific=4, threshold_length=8, show_graph=-1, save_graph=-1,
             debugmode=-1, time_variation=0.5):
    '''
    At given dir_name/file_name, extract the beat and save it to a txt file
    at save_dir.
    Args:
    Return:
    Raise:
        nothing.
    '''
    # if file_name is the default value, check all files in the directory.
    if file_name == -1:
        file_names = os.listdir(dir_name)
    else:
        file_names = [file_name]
    # if save_dir is the default value, save into the source directory.
    if save_dir == -1:
        save_dir = dir_name
    # now is the index of the file currently being processed.
    now = 0
    for file_name in file_names:
        now += 1
        if debugmode != -1:  # if debugmode is on, write debugging messages to console.
            print("Start extracting " + file_name + "... Now " + str(now) +
                  " / " + str(len(file_names)))
        dest_file = to_wav(dir_name, dir_name, file_name, addable_option)
        # to extract only a given length, pass a duration value to load.
        audio_list, sampling_rate = lb.load(dest_file, offset=0.0)
        if debugmode != -1:
            print("file opened..." + "... Now " + str(now) + " / " +
                  str(len(file_names)))
        music = lb.cqt(audio_list, sr=sampling_rate, fmin=lb.note_to_hz('C1'),
                       n_bins=60*specific, bins_per_octave=12*specific)
        if debugmode != -1:
            print("file CQT finished..." + "... Now " + str(now) + " / " +
                  str(len(file_names)))
        threshold = get_threshold(music)
        _, r_harmonic = parse_noise(music, threshold)
        if debugmode != -1:
            print("file CQT harmonics extracted..." + "... Now " + str(now) +
                  " / " + str(len(file_names)))
        note = stage_note(r_harmonic)
        _, note_list, icoef_table, _ = bt2.tie_note(note, threshold_length,
                                                    debug_mode=1)
        weights = bt2.weightract(r_harmonic, note, note_list, icoef_table)
        # Set time variation for input values.
        real_weights = bt2.set_time_variation(
            weights, get_music_time(sampling_rate, len(audio_list)),
            sampling_rate, time_variation=time_variation)
        save_to(save_dir, file_name.split(".")[0] + ".txt", real_weights)
        if debugmode != -1:
            print("finished extract file..." + "... Now " + str(now) + " / " +
                  str(len(file_names)))
        if show_graph != -1:  # if show graph is on...
            plt.figure()
            plt.plot(real_weights)
            plt.show()
        if save_graph != -1:  # if save graph is on...
            plt.figure()
            plt.plot(real_weights)
            plt.savefig(str(dir_name) + "/" + str(file_name.split(".")[0] + ".png"))
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import utils

# Define global variables.
CHANNELS = 1
RATE = 44100
FRAMES_PER_BUFFER = 1024 * 17
N_FFT = 4096
SCREEN_WIDTH = 178
ENERGY_THRESHOLD = 0.4

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Choose the frequency range of your log-spectrogram.
F_LO = librosa.note_to_hz('C2')
F_HI = librosa.note_to_hz('C9')
M = librosa.filters.mel(RATE, N_FFT, SCREEN_WIDTH, fmin=F_LO, fmax=F_HI)

p = pyaudio.PyAudio()
loaded_model = tf.keras.models.load_model(
    '/Users/Dodanto/Documents/GitHub/SnapPoint/SnapPoint.h5')


def mfcc(y):
    sample = librosa.feature.mfcc(y=y, sr=44100, n_mfcc=40)
    sample = numpy.expand_dims(sample.T, axis=0)
    return sample


def test(sample):
    data = mfcc(sample)
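# Hedged sketch of how a mel filterbank like M above is typically applied
# (y is an assumed mono audio buffer): project a power spectrogram through
# M to get a SCREEN_WIDTH-band log-spectrogram for display.
S = numpy.abs(librosa.stft(y, n_fft=N_FFT))**2  # power spectrogram
logspec = librosa.power_to_db(M.dot(S))         # (SCREEN_WIDTH, n_frames) dB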
    window = getattr(scipy.signal.windows, window_name)
    wdec = librosa.filters.__float_window(window)

    if n == int(n):
        n = int(n)
        assert np.allclose(wdec(n), window(n))
    else:
        wf = wdec(n)
        fn = int(np.floor(n))
        assert not np.any(wf[fn:])


@pytest.mark.parametrize("sr", [11025])
@pytest.mark.parametrize("fmin", [None, librosa.note_to_hz("C3")])
@pytest.mark.parametrize("n_bins", [12, 24])
@pytest.mark.parametrize("bins_per_octave", [12, 24])
@pytest.mark.parametrize("filter_scale", [1, 2])
@pytest.mark.parametrize("norm", [1, 2])
@pytest.mark.parametrize("pad_fft", [False, True])
def test_constant_q(sr, fmin, n_bins, bins_per_octave, filter_scale, pad_fft,
                    norm):
    F, lengths = librosa.filters.constant_q(
        sr=sr,
        fmin=fmin,
        n_bins=n_bins,
        bins_per_octave=bins_per_octave,
        filter_scale=filter_scale,
        pad_fft=pad_fft,
def segmentation(song, display=False):
    '''
    Takes in a song and returns a class containing the spectrogram, bpm,
    and major segments. It also fills the song's beatTrack and uses it in
    the segmentation algorithm.
    Algorithm written by: Brian McFee
    https://bmcfee.github.io/

    :param song: (Song) | song to segment
    :param display: (bool) | optional argument to display a graph of the
        segments using matplotlib
    :return: seg_dict (dict) | dictionary of segments
    '''
    import numpy as np
    import scipy
    import matplotlib.pyplot as plt
    import sklearn.cluster

    y = song.load.y
    sr = song.load.sr
    beat_track = song.beat_track

    BINS_PER_OCTAVE = 12 * 3
    N_OCTAVES = 7
    C = librosa.amplitude_to_db(
        np.abs(librosa.cqt(y=y, sr=sr,
                           bins_per_octave=BINS_PER_OCTAVE,
                           n_bins=N_OCTAVES * BINS_PER_OCTAVE)),
        ref=np.max)

    # To reduce dimensionality, we'll beat-synchronize the CQT
    tempo, beats = tuple(beat_track)
    Csync = librosa.util.sync(C, beats, aggregate=np.median)

    #####################################################################
    # Let's build a weighted recurrence matrix using beat-synchronous CQT
    # width=3 prevents links within the same bar
    # mode='affinity' here implements S_rep
    R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
                                          sym=True)

    # Enhance diagonals with a median filter (Equation 2)
    df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
    Rf = df(R, size=(1, 7))

    ###################################################################
    # Now let's build the sequence matrix (S_loc) using mfcc-similarity
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    Msync = librosa.util.sync(mfcc, beats)

    path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
    sigma = np.median(path_distance)
    path_sim = np.exp(-path_distance / sigma)

    R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)

    ##########################################################
    # And compute the balanced combination
    deg_path = np.sum(R_path, axis=1)
    deg_rec = np.sum(Rf, axis=1)

    mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)

    A = mu * Rf + (1 - mu) * R_path

    #####################################################
    # Now let's compute the normalized Laplacian
    L = scipy.sparse.csgraph.laplacian(A, normed=True)

    # and its spectral decomposition
    evals, evecs = scipy.linalg.eigh(L)

    # We can clean this up further with a median filter.
    # This can help smooth over small discontinuities
    evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))

    # cumulative normalization is needed for symmetric normalized
    # Laplacian eigenvectors
    Cnorm = np.cumsum(evecs**2, axis=1)**0.5

    # If we want k clusters, use the first k normalized eigenvectors.
    k = 5

    X = evecs[:, :k] / Cnorm[:, k - 1:k]

    #############################################################
    # Let's use these k components to cluster beats into segments
    KM = sklearn.cluster.KMeans(n_clusters=k)

    seg_ids = KM.fit_predict(X)

    bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])
    bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)
    bound_segs = list(seg_ids[bound_beats])
    bound_frames = beats[bound_beats]
    bound_frames = librosa.util.fix_frames(bound_frames, x_min=None,
                                           x_max=C.shape[1] - 1)

    bound_tuples = []
    for i in range(1, len(bound_frames)):
        bound_tuples.append((bound_frames[i - 1], bound_frames[i] - 1))
    bound_tuples = tuple(map(lambda x: librosa.frames_to_time(x),
                             bound_tuples))

    pairs = zip(bound_segs, bound_tuples)
    seg_dict = dict()
    for seg, frame in pairs:
        seg_dict.setdefault(seg, []).append(frame)

    if display:
        import matplotlib.patches as patches
        plt.figure(figsize=(12, 4))
        colors = plt.get_cmap('Paired', k)

        bound_times = librosa.frames_to_time(bound_frames)
        freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
                                        fmin=librosa.note_to_hz('C1'),
                                        bins_per_octave=BINS_PER_OCTAVE)

        librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                                 bins_per_octave=BINS_PER_OCTAVE,
                                 x_axis='time')
        ax = plt.gca()
        for interval, label in zip(zip(bound_times, bound_times[1:]),
                                   bound_segs):
            ax.add_patch(
                patches.Rectangle((interval[0], freqs[0]),
                                  interval[1] - interval[0],
                                  freqs[-1],
                                  facecolor=colors(label),
                                  alpha=0.50))
        plt.tight_layout()
        plt.show()

    return seg_dict
@raises(librosa.ParameterError)
def test_note_to_hz_badnote():
    librosa.note_to_hz("does not pass")
n_components = None
if len(argv) > 3:
    n_components = int(argv[3])

from librosa import load, cqt, logamplitude, note_to_midi, note_to_hz
import numpy as np

# load an audio file (with samplerate)
x, sr = load(filename)

# compute constant-Q transform (~ pitch-based STFT)
#hop_size = 512
pitch_max = note_to_midi('D5')
pitch_min = 'B3'
pitch_min_number = note_to_midi(pitch_min)
C = cqt(x, sr=sr, fmin=note_to_hz(pitch_min),
        n_bins=pitch_max - pitch_min_number)

# try some midi visualization
from Midi import midi_matrix
midi_mat = midi_matrix(midi_filename, min_pitch=note_to_midi(pitch_min))

# NMF
#V = np.log10(1 + 100000 * C**2)
V = np.abs(C).transpose()
W_zero = np.zeros((pitch_max - pitch_min_number,
                   pitch_max - pitch_min_number))
pitch = pitch_min_number
for comp in W_zero:
    comp[pitch - pitch_min_number] = 1.0
def chroma_cqt_processed(self, n_chroma=12, n_octaves=7, bins_per_octave=12,
                         fmin='C1', margin=8, kernel_size=31, power=2.0,
                         mask=False, params=None):
    """
    Adapted from librosa docs
    https://librosa.github.io/librosa_gallery/auto_examples/plot_chroma.html
    """
    from scipy.ndimage import median_filter

    if params is not None:
        if 'fmin' in params.keys():
            if isinstance(params['fmin'], str):
                fmin = librosa.note_to_hz(params['fmin'])
            else:
                fmin = params['fmin']
        if 'n_chroma' in params.keys():
            n_chroma = params['n_chroma']
        if 'n_octaves' in params.keys():
            n_octaves = params['n_octaves']
        if 'bins_per_octave' in params.keys():
            bins_per_octave = params['bins_per_octave']
        if 'margin' in params.keys():
            margin = params['margin']
        if 'kernel_size' in params.keys():
            kernel_size = params['kernel_size']
        if 'power' in params.keys():
            power = params['power']
        if 'mask' in params.keys():
            mask = params['mask']
    else:
        if fmin and isinstance(fmin, str):
            fmin = librosa.note_to_hz(fmin)

    harmonic = librosa.effects.harmonic(y=self.audio_vector, margin=margin,
                                        kernel_size=kernel_size, power=power,
                                        mask=mask)
    chroma_cqt_harm = librosa.feature.chroma_cqt(
        y=harmonic,
        sr=self.fs,
        bins_per_octave=bins_per_octave,
        hop_length=self.hop_length,
        fmin=fmin,
        n_chroma=n_chroma,
        n_octaves=n_octaves)
    chroma_filter = np.minimum(
        chroma_cqt_harm,
        librosa.decompose.nn_filter(chroma_cqt_harm,
                                    aggregate=np.median,
                                    metric='cosine'))
    chroma_smooth = median_filter(chroma_filter, size=(1, 9))

    return {
        'chroma_filtered': chroma_filter,
        'chroma_smoothed': chroma_smooth
    }
def add_pitch(self, **kwargs):
    self.pitch_fmin = kwargs.pop("pitch_fmin", librosa.note_to_hz('C2'))
    self.pitch_fmax = kwargs.pop("pitch_fmax", librosa.note_to_hz('C7'))
    self.pitch_avg = kwargs.pop("pitch_avg", None)
    self.pitch_std = kwargs.pop("pitch_std", None)
    self.pitch_norm = kwargs.pop("pitch_norm", False)
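# Hedged sketch: bounds like pitch_fmin/pitch_fmax above are what
# librosa.pyin expects for f0 tracking (y and sr are assumed to exist):
import librosa
f0, voiced_flag, voiced_prob = librosa.pyin(
    y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)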
def make_ground():
    # ground_path = '/mnt/Stuff/Acads/UGP/mycode/ground_voc'
    # ground_list = [x.split('.')[0].strip() for x in os.listdir(ground_path)]
    # os.chdir("./ground_voc")
    Fmin = librosa.note_to_hz('C2')
    Fmax = librosa.note_to_hz('C7')
    note = uf(np.arange(int(np.log2(Fmax/Fmin) * 12*1 + 1)), Fmin, 1)

    # melody2 = "/mnt/Stuff/Acads/UGP/medleydb/medleydb/data/Annotations/Melody/Melody2/"
    # mirk = "/mnt/data/datasets/MIR-1K/"
    bach = "/mnt/data/datasets/Bach10_v1.1/"

    # for song in os.listdir(melody2):
    #     print(song.rsplit(".")[0][:-8])
    #     path = melody2 + song
    #     liz = pd.read_csv(path, names=['time', 'freq'])
    #     liz = liz.to_numpy()
    #     N = int(liz.shape[0]/2 + 1)
    #     i = 0
    #     gl = np.zeros(N)
    #     for x in liz:
    #         if i % 2 == 0:
    #             gl[int(i/2)] = np.argwhere(note == find_nearest(note, x[1]))
    #         i += 1
    #     save_path = song.rsplit(".")[0][:-8] + ".npy"
    #     np.save(save_path, gl)
    #     print(" Done.")

    # for file in os.listdir(mirk):
    #     if file.split('.')[1] == 'pv':
    #         path = mirk + file
    #         ff = open(path, 'r')
    #         freq = [float(x) for x in ff]
    #         N = len(freq)
    #         i = 0
    #         gl = np.zeros(N)
    #         for x in freq:
    #             gl[i] = round(x)  # np.argwhere(note == find_nearest(note, x))
    #             i += 1
    #         save_path = file.split('.')[0] + '.npy'
    #         np.save(save_path, gl)
    #         print('Done.')

    for song in os.listdir(bach):
        print(song)
        file = bach + song + '/' + song + '-GTF0s.mat'
        f = loadmat(file)
        f = f['GTF0s']
        i = 0
        for ch in ['violin', 'clarinet', 'saxophone', 'bassoon']:
            fr = f[i]
            i += 1
            N = fr.shape[0]
            gl = np.zeros(N)
            j = 0
            for x in fr:
                gl[j] = x - 36 + 1
                j += 1
            save_path = 'ground_bach/' + song + '-' + ch + '.npy'
            np.save(save_path, gl)
        print('Done.')
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

y, sr = librosa.load(
    r"C:\Users\theko\Documents\Dataset\022035001\Tukiya_Tiruvadi.mp3",
    sr=22050)
cqt = librosa.cqt(y, sr, fmin=librosa.note_to_hz("C2"), n_bins=48)
c = np.abs(cqt)

# fig, ax = plt.subplots()
# img = librosa.display.specshow(librosa.amplitude_to_db(c, ref=np.max),
#                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
# ax.set_title('Constant-Q power spectrum')
# fig.colorbar(img, ax=ax, format="%+2.0f dB")

i = np.argmax(c, axis=0)
notes = np.zeros((48, ))
for i_ in i:
    notes[i_] += 1

notes_X = np.linspace(48, 96, 48, dtype='int32')
plt.plot(notes_X, notes)
plt.show()
def __init__(self, *args, **kwargs):
    super(self.__class__, self).__init__(*args, **kwargs)
    args, _ = parser.parse_known_args()
    if args.samplerate is None:
        self.samplerate = \
            int(sd.query_devices(args.input_device)['default_samplerate'])
    else:
        self.samplerate = int(args.samplerate)
    print(f"INFO -- Sampling rate at {self.samplerate} Hz")
    self.threadpool = QtCore.QThreadPool()
    self.q = queue.Queue()
    self.setFixedSize(args.width, args.height)
    self.mainbox = QtWidgets.QWidget()
    self.setCentralWidget(self.mainbox)
    self.layout = QtWidgets.QGridLayout()
    self.mainbox.setLayout(self.layout)

    # Widgets
    self.spec_plot = SpectrogramWidget()
    self.wave_plot = WaveFormWidget()
    for i, widget in enumerate([self.spec_plot, self.wave_plot]):
        self.layout.addWidget(widget, i, 0)

    # Initialize x and y
    self.length = self.samplerate * args.duration
    self.y = np.random.rand(self.length, len(args.channels))
    self.x = np.linspace(0, args.duration, num=self.length)
    self.zcr = librosa.feature.zero_crossing_rate(self.y.mean(axis=1))[0]

    # Wave plot
    self.waveline_1 = self.wave_plot.plot(x=self.x, y=self.y[:, 0],
                                          pen=pg.mkPen('g', width=0.5),
                                          name='channel_1')
    self.waveline_2 = self.wave_plot.plot(x=self.x, y=self.y[:, 1],
                                          pen=pg.mkPen('y', width=0.5),
                                          name='channel_2')
    self.waveline_3 = self.wave_plot.plot(
        x=np.linspace(0, args.duration, self.zcr.shape[0]),
        y=self.zcr,
        pen=pg.mkPen('r', width=2),
        name='zcr')

    # Spectrogram
    self.fmax = int(
        librosa.core.fft_frequencies(sr=self.samplerate,
                                     n_fft=args.n_fft)[-1])
    D = librosa.stft(y=self.y.mean(axis=1), n_fft=args.n_fft, center=False)
    self.specdata = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    # M = librosa.feature.melspectrogram(
    #     y=self.y.mean(axis=1),
    #     sr=self.samplerate,
    #     n_fft=args.n_fft,
    #     n_mels=args.n_mels)
    # self.specdata = librosa.power_to_db(S=M, ref=np.max)
    self.F0 = librosa.yin(y=self.y.mean(axis=1),
                          sr=self.samplerate,
                          frame_length=2048,
                          fmin=librosa.note_to_hz('C2'),
                          fmax=librosa.note_to_hz('C5'),
                          center=False)
    self.spec_image = pg.ImageItem(item=self.specdata.T)
    self.spec_plot.addItem(item=self.spec_image)
    self.f0_line = self.spec_plot.plot(
        x=np.linspace(0, args.duration, self.F0.shape[0]),
        y=self.F0,
        pen=pg.mkPen('r', width=2),
        name='f0')
    self.bar = pg.ColorBarItem(values=(librosa.note_to_hz('C2'),
                                       librosa.note_to_hz('C5')),
                               cmap=pg.colormap.get('CET-L9'))
    self.bar.setImageItem(self.spec_image)

    # Start audio stream and animations
    self.start_stream()
    if args.input_device == ('Virtual Input (VB-Audio Virtual Cable), '
                             'Windows DirectSound'):
        self.play_media(media_url=args.media_url, type='stream', volume=100)
    self.animate()
    self.show()
def main():
    """
    Compare the real variant with different max pooling settings to librosa HVQT.
    """

    # Select parameters to use across all implementations
    n_bins = 216  # 6 octaves
    gamma = None  # default gamma
    hop_length = 512
    bins_per_octave = 36
    fmin = librosa.note_to_hz('C1')
    harmonics = [0.5, 1, 2, 3, 4, 5]

    # Load an example piece of audio
    y, sr = librosa.load(librosa.util.example_audio_file())

    # Calculate the HVQT using librosa
    lib_start = time()
    lib_hvqt = librosa_hvqt(y, harmonics, sr, hop_length, fmin, n_bins,
                            bins_per_octave, gamma)
    print(f'Processing Time (Librosa): {time() - lib_start}')

    # Print a new line
    print()

    # Convert librosa HVQT to decibels
    lib_hvqt = librosa.amplitude_to_db(lib_hvqt, ref=np.max)

    # Set the device for the convolutional implementations
    device = 1
    device = torch.device(
        f'cuda:{device}' if torch.cuda.is_available() else 'cpu')

    # Add a batch and channel dimension to the audio, and make it a tensor
    y = torch.Tensor([[y]]).to(device)

    # Compare the real-only variant at each max pooling setting; the filters
    # stay fixed (random=False, update=False), so each pass differs only in max_p
    for max_p in [1, 2, 4, 8, 16, 32]:
        # Construct the real-only variant
        lhvqt_real = LHVQT(lvqt=LVQT_R,
                           harmonics=harmonics,
                           fs=sr,
                           hop_length=hop_length,
                           fmin=fmin,
                           n_bins=n_bins,
                           bins_per_octave=bins_per_octave,
                           gamma=gamma,
                           max_p=max_p,
                           random=False,
                           update=False,
                           to_db=False,
                           db_to_prob=False,
                           batch_norm=False,
                           var_drop=False).to(device)

        # Compute the response from the real variant
        rea_start = time()
        rea_hvqt = lhvqt_real(y)
        print(f'Processing Time (Real w/ MP={max_p}): {time() - rea_start}')

        # Remove from the device and convert back to ndarray
        rea_hvqt = rea_hvqt.squeeze(0).cpu().detach().numpy()

        # Convert HVQT to decibels
        rea_hvqt = librosa.amplitude_to_db(rea_hvqt, ref=np.max)

        # Compute similarities after putting all transforms on dB scale
        print('Real Variant Similarity (MP=%d): %1.2f%%' %
              (max_p, 100 * hvqt_similarity(rea_hvqt, lib_hvqt)))

        # Print a new line
        print()
bound_frames = librosa.util.fix_frames(bound_frames,
                                       x_min=None,
                                       x_max=C.shape[1]-1)

###################################################
# And plot the final segmentation over original CQT

# sphinx_gallery_thumbnail_number = 5

import matplotlib.patches as patches

plt.figure(figsize=(12, 4))
bound_times = librosa.frames_to_time(bound_frames)
freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
                                fmin=librosa.note_to_hz('C1'),
                                bins_per_octave=BINS_PER_OCTAVE)

librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                         bins_per_octave=BINS_PER_OCTAVE,
                         x_axis='time')
ax = plt.gca()

for interval, label in zip(zip(bound_times, bound_times[1:]), bound_segs):
    ax.add_patch(patches.Rectangle((interval[0], freqs[0]),
                                   interval[1] - interval[0],
                                   freqs[-1],
                                   facecolor=colors(label),
                                   alpha=0.50))

plt.tight_layout()
# file name of the mat file with onsets and midi notes----------------------
mat_file_name = '01-AchGottundHerr-GTF0s.mat'

# Loading file in memory-----------------------------------------------------
file_name = '01-AchGottundHerr_4Kanal.wav'
file_path = 'ignore/sounds/'
full_name = file_path + file_name
audio_data, sampling_rate = libr.load(full_name, sr=None, duration=5)

# CQT Params-------------------------------------------------------------------
hop = 256
start_note = 'C2'
cqt = libr.cqt(audio_data, sr=sampling_rate, hop_length=hop,
               fmin=libr.note_to_hz(start_note), n_bins=48,
               bins_per_octave=12)

# Define common harmonic structure----------------------------------------------
cqt_bins = 48
list_chs = [0, 12, 19, 24, 28, 31]

# First initialisation of fundamental frequency distribution---------------------
chs = initial_harmonics(list_chs, np.zeros((cqt_bins, 1)), option=1)
u, v = inverse_filter(cqt, chs, cqt_bins)
u_bar = non_linear_mapping(u)
len_u = len(u_bar[:, 0])

# iterative algorithm--------------------------------------------------------------
(num_rows, num_cols) = cqt.shape
import json
import numpy as np
import sys
import tensorflow as tf
import random
import os
import matplotlib.pyplot as plt
import librosa
from scipy import signal
import pickle

from wavenet import (WaveNetModel, time_to_batch, batch_to_time, causal_conv,
                     optimizer_factory, mu_law_decode, image2vector)

NOTES = ['D#3', 'G3', 'A#3']  # e-flat chord
NOTES_HZ = librosa.note_to_hz(NOTES)
SAMPLE_RATE_HZ = 2000.0  # Hz
TRAIN_ITERATIONS = 400
SAMPLE_DURATION = 0.5  # Seconds
SAMPLE_PERIOD_SECS = 1.0 / SAMPLE_RATE_HZ
MOMENTUM = 0.95
GENERATE_SAMPLES = 900
QUANTIZATION_CHANNELS = 256
NUM_SPEAKERS = 3
F1 = 155.56  # E-flat frequency in Hz
F2 = 196.00  # G frequency in Hz
F3 = 233.08  # B-flat frequency in Hz

receptive_field = 256
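# Hedged sketch: synthesize the e-flat chord from the constants above as a
# normalized sum of sines over SAMPLE_DURATION seconds at SAMPLE_RATE_HZ.
t = np.arange(0, SAMPLE_DURATION, SAMPLE_PERIOD_SECS)
chord = sum(np.sin(2 * np.pi * f * t) for f in NOTES_HZ) / len(NOTES_HZ)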
from __future__ import print_function

import librosa
from magenta.common import tf_utils
from magenta.models.onsets_frames_transcription import audio_transform
import tensorflow as tf

DEFAULT_SAMPLE_RATE = 16000
DEFAULT_SPEC_TYPE = 'cqt'
DEFAULT_SPEC_LOG_AMPLITUDE = False
DEFAULT_SPEC_MEL_HTK = False
DEFAULT_SPEC_HOP_LENGTH = 512
DEFAULT_SPEC_N_BINS = 264  # (88/12)*36 = 264
DEFAULT_SPEC_FMIN = librosa.note_to_hz(['A0'])[0]
DEFAULT_CQT_BINS_PER_OCTAVE = 36
DEFAULT_FRAMES_PER_SECOND = DEFAULT_SAMPLE_RATE / DEFAULT_SPEC_HOP_LENGTH

MIN_MIDI_PITCH = librosa.note_to_midi('A0')
MAX_MIDI_PITCH = librosa.note_to_midi('C8')
MIDI_PITCHES = MAX_MIDI_PITCH - MIN_MIDI_PITCH + 1

MAX_MIDI_VELOCITY = 127

DEFAULT_CROP_TRAINING_SEQUENCE_TO_NOTES = False

DEFAULT_ONSET_MODE = 'length_ms'
DEFAULT_ONSET_LENGTH = 100
DEFAULT_ONSET_DELAY = 0
from __future__ import print_function

import librosa
from magenta.common import tf_utils
from magenta.models.onsets_frames_transcription import audio_transform
import tensorflow as tf

DEFAULT_SAMPLE_RATE = 16000
DEFAULT_SPEC_TYPE = 'cqt'
DEFAULT_SPEC_LOG_AMPLITUDE = False
DEFAULT_SPEC_MEL_HTK = False
DEFAULT_SPEC_HOP_LENGTH = 512
DEFAULT_SPEC_N_BINS = 264  # (88/12)*36 = 264
DEFAULT_SPEC_FMIN = librosa.note_to_hz(['A0'])[0]
DEFAULT_CQT_BINS_PER_OCTAVE = 36
DEFAULT_FRAMES_PER_SECOND = DEFAULT_SAMPLE_RATE / DEFAULT_SPEC_HOP_LENGTH

MIN_MIDI_PITCH = librosa.note_to_midi('A0')
MAX_MIDI_PITCH = librosa.note_to_midi('C8')
MIDI_PITCHES = MAX_MIDI_PITCH - MIN_MIDI_PITCH + 1

MAX_MIDI_VELOCITY = 127

DEFAULT_ONSET_MODE = 'length_ms'
DEFAULT_ONSET_LENGTH = 100
DEFAULT_ONSET_DELAY = 0

DEFAULT_MIN_FRAME_OCCUPANCY_FOR_LABEL = 0.0