def interp_hcqt(audio_fpath=None, y=None, fs=None):
    """Compute the harmonic CQT from a given audio file or signal.

    Parameters
    ----------
    audio_fpath : str or None
        Path to audio file. Ignored if `y` is given.
    y : np.ndarray or None
        Audio signal. If None, audio is loaded from `audio_fpath`.
    fs : float or None
        Sample rate of `y`; the signal is resampled to SR if needed.

    Returns
    -------
    log_hcqt : np.ndarray
        Log-scaled harmonic CQT, rescaled to [0, 1]
    freq_grid : np.ndarray
        List of frequency values in Hz
    time_grid : np.ndarray
        List of time stamps in seconds
    """
    if y is None:
        y, fs = librosa.load(audio_fpath, sr=SR)
    else:
        y = librosa.resample(y, fs, SR)
        fs = SR

    # How many bins do we need?
    # n_bins_master = ceil(log2(max(HARMONICS))) * BPO + n_bins_plane
    n_bins_plane = N_OCTAVES * BINS_PER_OCTAVE
    n_bins_master = int(
        np.ceil(np.log2(np.max(HARMONICS))) * BINS_PER_OCTAVE) + n_bins_plane

    cqt_master = np.abs(
        librosa.cqt(y=y, sr=fs, hop_length=HOP_LENGTH, fmin=FMIN,
                    n_bins=n_bins_master, bins_per_octave=BINS_PER_OCTAVE))

    freq_grid = librosa.cqt_frequencies(N_OCTAVES * BINS_PER_OCTAVE, FMIN,
                                        bins_per_octave=BINS_PER_OCTAVE)
    freq_master = librosa.cqt_frequencies(n_bins_master, FMIN,
                                          bins_per_octave=BINS_PER_OCTAVE)

    # Interpolate the master CQT at each harmonic of the bin frequencies,
    # then crop back down to the base frequency plane
    hcqt = librosa.interp_harmonics(cqt_master, freq_master,
                                    HARMONICS)[:, :N_OCTAVES * BINS_PER_OCTAVE]

    # Compress to dB and rescale to [0, 1]
    log_hcqt = ((1.0 / 80.0) * librosa.core.amplitude_to_db(
        hcqt, ref=np.max)) + 1.0

    time_grid = librosa.core.frames_to_time(
        np.arange(log_hcqt.shape[-1]), sr=SR, hop_length=HOP_LENGTH)

    return log_hcqt, freq_grid, time_grid
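# A minimal usage sketch for interp_hcqt, assuming module-level constants
# roughly like the ones below (the original project's values may differ) and
# a hypothetical local audio file 'example.wav'.
import librosa
import numpy as np

SR = 22050
HOP_LENGTH = 256
FMIN = 32.7           # C1
HARMONICS = [1, 2, 3, 4, 5]
N_OCTAVES = 6
BINS_PER_OCTAVE = 60  # 5x oversampled semitones

log_hcqt, freq_grid, time_grid = interp_hcqt('example.wav')
print(log_hcqt.shape)                   # (n_harmonics, n_freqs, n_frames)
print(freq_grid.shape, time_grid.shape)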
def to_local_average_cents(salience, center=None):
    """Find the weighted average cents near the argmax bin."""
    if not hasattr(to_local_average_cents, 'cents_mapping'):
        # the bin number-to-cents mapping, cached on the function object
        freq_grid = librosa.cqt_frequencies(config.cqt_bins, config.fmin,
                                            config.bins_per_octave)
        to_local_average_cents.cents_mapping = (
            np.linspace(0, 7180, 360) + freq_grid[-1])

    if salience.ndim == 1:
        if center is None:
            center = int(np.argmax(salience))
        # Average over a 9-bin window around the peak
        start = max(0, center - 4)
        end = min(len(salience), center + 5)
        salience = salience[start:end]
        product_sum = np.sum(
            salience * to_local_average_cents.cents_mapping[start:end])
        weight_sum = np.sum(salience)
        return product_sum / weight_sum

    if salience.ndim == 2:
        return np.array([to_local_average_cents(salience[i, :])
                         for i in range(salience.shape[0])])

    raise ValueError("salience should be either a 1d or 2d ndarray")
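# A minimal sketch of calling to_local_average_cents on a salience map,
# assuming a config object exposing the cqt_bins/fmin/bins_per_octave fields
# the function reads (the values below are illustrative, not the project's).
import types
import numpy as np
import librosa

config = types.SimpleNamespace(cqt_bins=360, fmin=32.7, bins_per_octave=60)

salience = np.random.rand(100, 360)       # (n_frames, n_bins), e.g. model output
cents = to_local_average_cents(salience)  # one cents value per frame
print(cents.shape)                        # (100,)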
def test_wavelet(sr, fmin, n_bins, bins_per_octave, filter_scale, pad_fft,
                 norm, gamma):
    freqs = librosa.cqt_frequencies(fmin=fmin, n_bins=n_bins,
                                    bins_per_octave=bins_per_octave)
    F, lengths = librosa.filters.wavelet(freqs=freqs, sr=sr,
                                         filter_scale=filter_scale,
                                         pad_fft=pad_fft, norm=norm,
                                         gamma=gamma)

    assert np.all(lengths <= F.shape[1])
    assert len(F) == n_bins

    if not pad_fft:
        return

    # Padded filters should have a power-of-two length
    assert np.mod(np.log2(F.shape[1]), 1.0) == 0.0

    # Check for vanishing negative frequencies
    F_fft = np.abs(np.fft.fft(F, axis=1))
    # Normalize by row-wise peak
    F_fft = F_fft / np.max(F_fft, axis=1, keepdims=True)
    assert np.max(F_fft[:, -F_fft.shape[1] // 2:]) < 1e-3
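# Direct invocation of the test above with one illustrative parameter set;
# in the original suite these arguments are presumably supplied by a
# pytest.mark.parametrize decorator not shown here.
import numpy as np
import librosa

test_wavelet(sr=22050, fmin=32.7, n_bins=48, bins_per_octave=12,
             filter_scale=1, pad_fft=True, norm=1, gamma=0)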
def get_X(decision_length, fmin, hop_length, n_bins_per_octave, n_octaves,
          track_or_path):
    if isinstance(track_or_path, str):
        x_mono, sr = librosa.core.load(track_or_path, sr=None, mono=True)
    else:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            (sr, x_stereo) = track_or_path.audio_data
        warnings.resetwarnings()
        x_stereo = x_stereo.astype(np.float32)
        # Downmix 16-bit stereo to mono in [-1, 1]
        x_mono = np.sum(x_stereo, axis=1) / (32768.0 * 2)

    # Zero-pad signals shorter than the decision window
    if x_mono.shape[0] < decision_length:
        padding_length = decision_length - x_mono.shape[0]
        padding = np.zeros(padding_length, dtype=np.float32)
        x_mono = np.hstack((x_mono, padding))

    n_bins = n_octaves * n_bins_per_octave
    freqs = librosa.cqt_frequencies(bins_per_octave=n_bins_per_octave,
                                    fmin=fmin, n_bins=n_bins)
    CQT = np.abs(
        librosa.cqt(x_mono, bins_per_octave=n_bins_per_octave, fmin=fmin,
                    hop_length=hop_length, n_bins=n_bins, sr=sr, real=False))

    # Apply A-weighting per frequency bin, then log-compress
    A_weights_dB = librosa.A_weighting(freqs, min_db=-80.0)
    A_weights = (10.0 ** (A_weights_dB / 10))
    X = np.log1p(1000.0 * CQT * A_weights[:, np.newaxis])
    X = X.astype(np.float32)
    return X
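# Hypothetical call to get_X on a local file; the parameter values below are
# illustrative placeholders (one bin per semitone over 8 octaves, a ~6 s
# decision window at 22.05 kHz), not the original project's settings.
X = get_X(decision_length=131072, fmin=32.7, hop_length=512,
          n_bins_per_octave=12, n_octaves=8, track_or_path='example.wav')
print(X.shape)  # (n_bins, n_frames)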
def get_freq_grid():
    """Get the hcqt frequency grid."""
    (bins_per_octave, n_octaves, _, _, f_min, _,
     over_sample) = get_hcqt_params()
    freq_grid = librosa.cqt_frequencies(
        n_octaves * 12 * over_sample, f_min, bins_per_octave=bins_per_octave)
    return freq_grid
def extract_bar_cqt(sr, wav_data):
    """
    :param sr: sample rate of the wav file
    :param wav_data: single-channel wav data
    :return: the dB-scaled CQT of wav_data split at dynamically detected
        beats, along with the beat times in seconds
    """
    # Predominant local pulse (PLP) beat tracking with a log-normal tempo
    # prior centered at 120 BPM
    onset_env = librosa.onset.onset_strength(y=wav_data, sr=sr)
    prior = scipy.stats.lognorm(loc=np.log(120), scale=120, s=1)
    pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr,
                             hop_length=Config.HOP_LENGTH, prior=prior)
    beats_plp = np.flatnonzero(librosa.util.localmax(pulse))
    # Use the same hop length throughout so beat frames, times, and CQT
    # frames stay aligned
    times = librosa.times_like(pulse, sr=sr, hop_length=Config.HOP_LENGTH)

    frequencies = librosa.cqt_frequencies(
        n_bins=Config.N_BINS, fmin=Config.F_MIN,
        bins_per_octave=Config.BINS_PER_OCTAVE)  # (currently unused)

    cqt = np.abs(
        librosa.cqt(wav_data, sr=sr, hop_length=Config.HOP_LENGTH,
                    fmin=Config.F_MIN, n_bins=Config.N_BINS,
                    bins_per_octave=Config.BINS_PER_OCTAVE))
    cqt_db = librosa.amplitude_to_db(cqt, ref=np.max)

    # Slice the CQT between consecutive beat frames
    cqt_split = []
    for i, b in enumerate(beats_plp[:-1]):
        cqt_split.append(cqt_db[:, b:beats_plp[i + 1]])
    cqt_split.append(cqt_db[:, beats_plp[-1]:])

    return cqt_split, times[beats_plp]
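# Illustrative call, assuming a Config namespace like the one the function
# references; the values here are common defaults, not the project's.
import librosa
import scipy.stats

class Config:
    HOP_LENGTH = 512
    N_BINS = 84
    F_MIN = librosa.note_to_hz('C1')
    BINS_PER_OCTAVE = 12

y, sr = librosa.load(librosa.ex('nutcracker'))
cqt_split, beat_times = extract_bar_cqt(sr, y)
print(len(cqt_split), beat_times[:4])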
def centroid(spectrum, config=dict()):
    '''
    Computes the spectral centroid feature.

    Parameters
    ----------
    spectrum : np.ndarray [shape=(n_bins, n_frames)]
        Spectrum from which the feature is computed.
    config : dict
        Configuration dictionary. For the full list of parameters and their
        descriptions, see the README file. This function uses no extra
        parameters.

    Returns
    -------
    feature : np.ndarray [shape=(n_frames,)]
        Computed spectral centroid feature.
    '''
    # Pick the frequency axis matching the spectrum type, so the centroid
    # is computed in Hz rather than bin indices
    freq = None
    spectrum_type = get(config, 'spectrum.type')
    if spectrum_type == 'cqt':
        freq = librosa.cqt_frequencies(get(config, 'spectrum.n_bins'),
                                       fmin=librosa.note_to_hz('C1'))
    elif spectrum_type == 'mel':
        freq = librosa.mel_frequencies(n_mels=get(config, 'spectrum.n_bins'),
                                       htk=True)

    return librosa.feature.spectral_centroid(S=spectrum, freq=freq)[0]
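# A sketch of centroid() on a CQT spectrum, assuming get(config, key) does
# dotted-key lookup into a nested dict (that helper is not shown above).
import librosa
import numpy as np

y, sr = librosa.load(librosa.ex('trumpet'))
S = np.abs(librosa.cqt(y, sr=sr, n_bins=84))
cfg = {'spectrum': {'type': 'cqt', 'n_bins': 84}}
print(centroid(S, cfg)[:5])  # centroid in Hz, one value per frame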
def get_cqt_index(pitch, hparams):
    """Get the row closest to this pitch in a CQT spectrogram."""
    frequencies = librosa.cqt_frequencies(
        constants.TIMBRE_SPEC_BANDS,
        fmin=librosa.midi_to_hz(constants.MIN_TIMBRE_PITCH),
        bins_per_octave=constants.BINS_PER_OCTAVE)

    return np.abs(
        frequencies - librosa.midi_to_hz(pitch.numpy() - 1)).argmin()
def estimate_pitch(self, segment, threshold):
    freqs = librosa.cqt_frequencies(n_bins=self.n_bins,
                                    fmin=librosa.note_to_hz('C1'),
                                    bins_per_octave=12)
    # Below the threshold, report no pitch but keep the mean peak amplitude
    if segment.max() < threshold:
        return [None, np.mean(np.amax(segment, axis=0))]
    else:
        f0 = int(np.mean(np.argmax(segment, axis=0)))
        return [freqs[f0], np.mean(np.amax(segment, axis=0))]
def get_freq_grid():
    """Get the hcqt frequency grid.

    Function from https://github.com/rabitt/ismir2017-deepsalience
    """
    (bins_per_octave, n_octaves, _, _, f_min, _) = get_hcqt_params()
    freq_grid = librosa.cqt_frequencies(bins_per_octave * n_octaves, f_min,
                                        bins_per_octave=bins_per_octave)
    return freq_grid
def compute_hcqt(audio_fpath):
    """Compute the harmonic CQT from a given audio file.

    Parameters
    ----------
    audio_fpath : str
        path to audio file, or to a precomputed .npy dump

    Returns
    -------
    log_hcqt : np.ndarray
        Log-scaled harmonic CQT
    freq_grid : np.ndarray
        List of frequency values in Hz
    time_grid : np.ndarray
        List of time stamps in seconds
    """
    if audio_fpath.endswith('npy'):
        log_hcqt = np.load(audio_fpath)
    else:
        y, fs = librosa.load(audio_fpath, sr=SR)

        # One CQT per harmonic, each starting at h * FMIN
        cqt_list = []
        shapes = []
        for h in HARMONICS:
            cqt = librosa.cqt(y, sr=fs, hop_length=HOP_LENGTH,
                              fmin=FMIN * float(h),
                              n_bins=BINS_PER_OCTAVE * N_OCTAVES,
                              bins_per_octave=BINS_PER_OCTAVE)
            cqt_list.append(cqt)
            shapes.append(cqt.shape)

        # Trim all harmonics to the shortest time axis so they stack
        shapes_equal = [s == shapes[0] for s in shapes]
        if not all(shapes_equal):
            min_time = np.min([s[1] for s in shapes])
            new_cqt_list = []
            for i in range(len(cqt_list)):
                new_cqt_list.append(cqt_list[i][:, :min_time])
            cqt_list = new_cqt_list

        # Compress to dB and rescale to [0, 1]
        log_hcqt = ((1.0 / 80.0) * librosa.core.amplitude_to_db(
            np.abs(np.array(cqt_list)), ref=np.max)) + 1.0

    freq_grid = librosa.cqt_frequencies(BINS_PER_OCTAVE * N_OCTAVES, FMIN,
                                        bins_per_octave=BINS_PER_OCTAVE)
    time_grid = librosa.core.frames_to_time(range(log_hcqt.shape[2]), sr=SR,
                                            hop_length=HOP_LENGTH)

    return log_hcqt, freq_grid, time_grid
def load_hcqt(hcqt_filepath):
    log_hcqt = np.load(hcqt_filepath)
    freq_grid = librosa.cqt_frequencies(BINS_PER_OCTAVE * N_OCTAVES, FMIN,
                                        bins_per_octave=BINS_PER_OCTAVE)
    time_grid = librosa.core.frames_to_time(range(log_hcqt.shape[2]), sr=SR,
                                            hop_length=HOP_LENGTH)
    return log_hcqt, freq_grid, time_grid
def compute_pitches(self, display_plot_frame=-1):
    overall_chromagram = Chromagram()

    # first C = C3
    notes = librosa.cqt_frequencies(12, fmin=librosa.note_to_hz('C3'))

    self.specgram_to_plot = []
    for n in range(12):
        for octave in range(1, self.num_octave + 1):
            for harmonic in range(1, self.num_harmonic + 1):
                f_candidate = notes[n] * octave * harmonic
                window_size = int((8 / f_candidate) * self.fs)
                chromagram = Chromagram()

                for frame, x_t in enumerate(
                        frame_cutter(self.x, window_size)):
                    real_window_size = max(x_t.shape[0], window_size)
                    window = numpy.hanning(real_window_size)
                    s, f = mlab.magnitude_spectrum(x_t, Fs=self.fs,
                                                   window=window)
                    # Keep the lower half of the spectrum
                    s = s[:int(s.shape[0] / 2)]
                    f = f[:int(f.shape[0] / 2)]
                    s[s < 0] = 0.0  # clip

                    might_append_1 = s.copy()
                    might_append_2 = []

                    # Iteratively pick spectral peaks and suppress their
                    # harmonic multiples
                    for _ in range(self.harmonic_elim_runs):
                        max_freq_idx = s.argmax(axis=0)
                        max_f = f[max_freq_idx]
                        try:
                            note = librosa.hz_to_note(max_f, octave=False)
                            chromagram[note] += s[max_freq_idx]
                            might_append_2.append((max_freq_idx, max_f, note))
                        except (ValueError, OverflowError):
                            continue
                        for harmonic_index_multiple in range(
                                1, self.harmonic_multiples_elim):
                            elim_freq = harmonic_index_multiple * max_f
                            # NB: exact float match; only bins exactly at
                            # the harmonic frequency are zeroed
                            elim_index = numpy.where(f == elim_freq)
                            s[elim_index] -= s[elim_index]

                    might_append_3 = s.copy()

                    if frame == display_plot_frame:
                        # plot once and stop
                        display_plot_frame = -1
                        _display_plots(self.clip_name, self.x,
                                       ((might_append_1, might_append_2,
                                         might_append_3)))

                overall_chromagram += chromagram

    return overall_chromagram
def cqtgram(y, sr, hop_length=512, octave_bins=24, n_octaves=8, fmin=40,
            perceptual_weighting=False):
    s_complex = librosa.cqt(
        y, sr=sr, hop_length=hop_length, bins_per_octave=octave_bins,
        n_bins=octave_bins * n_octaves, fmin=fmin)
    specgram = np.abs(s_complex)

    if perceptual_weighting:
        # A-weight the power spectrum using the CQT bin frequencies
        freqs = librosa.cqt_frequencies(specgram.shape[0], fmin=fmin,
                                        bins_per_octave=octave_bins)
        specgram = librosa.perceptual_weighting(specgram**2, freqs,
                                                ref=np.max)
    else:
        specgram = librosa.amplitude_to_db(specgram, ref=np.max)

    return specgram
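# Quick usage sketch of cqtgram on a bundled librosa example clip.
import librosa
import numpy as np

y, sr = librosa.load(librosa.ex('trumpet'))
S = cqtgram(y, sr, perceptual_weighting=True)
print(S.shape)  # (octave_bins * n_octaves, n_frames)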
def test_cqt_frequencies(n_bins, fmin, bins_per_octave, tuning):
    freqs = librosa.cqt_frequencies(n_bins, fmin,
                                    bins_per_octave=bins_per_octave,
                                    tuning=tuning)

    # Make sure we get the right number of bins
    assert len(freqs) == n_bins

    # And that the first bin matches fmin by tuning
    assert np.allclose(freqs[0],
                       fmin * 2.0**(float(tuning) / bins_per_octave))

    # And that we have constant Q
    Q = np.diff(np.log2(freqs))
    assert np.allclose(Q, 1.0 / bins_per_octave)
def predictOne(self, samples: Signal):
    """Calculates the CQT of the given audio using librosa.

    Args:
        samples (Signal): The samples of the audio.

    Returns:
        tuple of Signal: The CQT of the audio.
    """
    sr = samples.sampleRate
    hop_length = self.parameters["hopLength"].value
    n_bins = self.parameters["binNumber"].value
    cqt_sr = sr / hop_length
    cqt = librosa.cqt(samples.values, sr=sr, hop_length=hop_length,
                      n_bins=n_bins)
    linear_cqt = np.abs(cqt)

    if self.parameters["scale"].value == "Amplitude":
        result = linear_cqt
    elif self.parameters["scale"].value == "Power":
        result = linear_cqt**2
    elif self.parameters["scale"].value == "MSAF":
        result = librosa.amplitude_to_db(linear_cqt**2, ref=np.max)
        # Inverting the dB scale (don't know if this is correct)
        result += np.min(result) * -1
    elif self.parameters["scale"].value == "Power dB":
        # Based on librosa: standard power spectrum in dB
        result = librosa.amplitude_to_db(linear_cqt, ref=np.max)
        result += np.min(result) * -1
    elif self.parameters["scale"].value == "Perceived dB":
        freqs = librosa.cqt_frequencies(linear_cqt.shape[0],
                                        fmin=librosa.note_to_hz('C1'))
        result = librosa.perceptual_weighting(linear_cqt**2, freqs,
                                              ref=np.max)
        result += np.min(result) * -1
    else:
        raise ValueError("parameterScale is not a correct value")

    return (Signal(result.T, sampleRate=cqt_sr),)
def cqt(self, data: np.array = None, hop_lengths: List[int] = None,
        bins_per_octave: int = 12):
    # data: shape = (sample number, channel number)
    # hop_lengths: how many samples lie between two selected sample segments
    if not self.__opened:
        raise Exception('load an audio file first!')
    if hop_lengths is None:
        hop_lengths = [512] * (self.__channels if data is None
                               else data.shape[-1])
    assert (len(hop_lengths) == self.__channels if data is None
            else len(hop_lengths) == data.shape[-1])

    normalized = self.normalize(data)
    channels = list()
    for i in range(normalized.shape[-1]):
        normalized_channel = normalized[:, i]
        # channel_results.shape = (88, hop number)
        channel_results = cqt(normalized_channel, self.__frame_rate,
                              hop_lengths[i], fmin=note_to_hz('A0'),
                              n_bins=88, bins_per_octave=bins_per_octave)
        channels.append(channel_results)
    # spectrum.shape = (channel number, 88, hop number)
    spectrum = np.stack(channels, axis=0)
    freqs = cqt_frequencies(88, fmin=note_to_hz('A0'),
                            bins_per_octave=bins_per_octave)
    return spectrum, freqs
def compute_pitches(self, display_plot_frame=-1):
    # first C = C3
    notes = librosa.cqt_frequencies(12, fmin=librosa.note_to_hz('C3'))
    divisor_ratio = (self.fs / 4.0) / self.frame_size
    self.dft_maxes = []
    overall_chromagram = Chromagram()

    for frame, x in enumerate(frame_cutter(self.x, self.frame_size)):
        chromagram = Chromagram()
        x = x * scipy.signal.hamming(self.frame_size)
        x_dft = numpy.sqrt(numpy.absolute(numpy.fft.rfft(x)))

        for n in range(12):
            chroma_sum = 0.0
            for octave in range(1, self.num_octave + 1):
                note_sum = 0.0
                for harmonic in range(1, self.num_harmonic + 1):
                    x_dft_max = float("-inf")  # sentinel
                    # Search a window of bins around the expected DFT bin
                    # for this note/octave/harmonic
                    k_prime = numpy.round(
                        (notes[n] * octave * harmonic) / divisor_ratio)
                    k0 = int(k_prime - self.num_bins * harmonic)
                    k1 = int(k_prime + self.num_bins * harmonic)
                    best_ind = None
                    for k in range(k0, k1):
                        curr_ = x_dft[k]
                        if curr_ > x_dft_max:
                            x_dft_max = curr_
                            best_ind = k
                    # Weight higher harmonics less
                    note_sum += x_dft_max * (1.0 / harmonic)
                    self.dft_maxes.append((k0, best_ind, k1))
                chroma_sum += note_sum
            chromagram[n] += chroma_sum
        overall_chromagram += chromagram

        if frame == display_plot_frame:
            _display_plots(self.clip_name, self.fs, self.frame_size, x_dft,
                           self.x, x, self.dft_maxes)

    return overall_chromagram
def _compute_cqt(self, y, sr):
    """Compute a CQT.

    Parameters
    ----------
    y : np.array
        Audio signal
    sr : float
        Audio signal sample rate

    Returns
    -------
    cqt : np.array [n_freqs, n_samples]
        Normalized amplitude CQT.
    samples : np.array [n_samples]
        CQT time stamps, in samples.
    freqs : np.array [n_freqs]
        CQT frequencies.
    """
    fmin = librosa.note_to_hz(self.min_note)
    bins_per_octave = 12
    n_cqt_bins = bins_per_octave * self.n_octaves
    cqt = np.abs(
        librosa.cqt(y, sr=sr, hop_length=self.hop_size, fmin=fmin,
                    filter_scale=self.filter_scale,
                    bins_per_octave=bins_per_octave, n_bins=n_cqt_bins))
    cqt = self._norm_matrix(cqt)

    n_time_frames = cqt.shape[1]
    freqs = librosa.cqt_frequencies(fmin=fmin,
                                    bins_per_octave=bins_per_octave,
                                    n_bins=n_cqt_bins)
    samples = librosa.frames_to_samples(range(n_time_frames),
                                        hop_length=self.hop_size)

    return cqt, samples, freqs
def toggle_y_axis(self):
    freqs = librosa.cqt_frequencies(300, fmin=librosa.note_to_hz('C2'),
                                    bins_per_octave=60)
    f_axis_dict = []

    if (self.radio_y_frequency.isChecked()
            and self.radio_cqt_option.isChecked()):
        freqs_formatted = ['%.2f' % elem for elem in freqs]
        f_axis_dict = list(dict(enumerate(freqs_formatted)).items())
        self.p1.setLabel('left', "Frequency", units='Hz')
        major_f_ticks = f_axis_dict[::300 // 4]
        del f_axis_dict[::300 // 4]
        minor_f_ticks = f_axis_dict
    elif (self.radio_y_note.isChecked()
          and self.radio_cqt_option.isChecked()):
        notes = librosa.hz_to_note(freqs)[::5]
        temp_dict = {}
        for i, note in enumerate(notes):
            temp_dict[i * 5] = note
        f_axis_dict = list(temp_dict.items())
        self.p1.setLabel('left', "Notes", units='')
        major_f_ticks = f_axis_dict[::60 // 4]
        del f_axis_dict[::60 // 4]
        minor_f_ticks = f_axis_dict
    elif (self.radio_y_frequency.isChecked()
          and self.radio_plca_option.isChecked()):
        freqs_formatted = ['%.2f' % elem for elem in freqs]
        freqs = freqs_formatted[::5]
        f_axis_dict = list(dict(enumerate(freqs)).items())
        self.p1.setLabel('left', "Frequency", units='Hz')
        major_f_ticks = f_axis_dict[::60 // 4]
        del f_axis_dict[::60 // 4]
        minor_f_ticks = f_axis_dict
    elif (self.radio_y_note.isChecked()
          and self.radio_plca_option.isChecked()):
        notes = librosa.hz_to_note(freqs)[::5]
        f_axis_dict = list(dict(enumerate(notes)).items())
        self.p1.setLabel('left', "Notes", units='')
        major_f_ticks = f_axis_dict[::60 // 4]
        del f_axis_dict[::60 // 4]
        minor_f_ticks = f_axis_dict

    # Hand the major/minor tick lists to the pyqtgraph axis
    newLeftTicks = self.p1.getAxis('left')
    newLeftTicks.setTicks([major_f_ticks, minor_f_ticks])
def librosaSpectrum(self):
    import librosa.display
    measurementPath = os.path.join(os.path.dirname(__file__),
                                   '../test/data', 'eot.wav')
    # measurement = ms.loadSignalFromWav(measurementPath)
    y, sr = librosa.load(measurementPath, sr=500)
    # no. of octaves in the CQT (have to divide sr by 8 because we remove one
    # octave for Nyquist and another to fit the filter inside Nyquist, I think)
    bins_per_octave = 24
    n_bins = math.floor(bins_per_octave * math.log2(sr / 8)) - 1
    fmin = 4.0
    cqt_freq = librosa.cqt_frequencies(n_bins, fmin,
                                       bins_per_octave=bins_per_octave)
    C = librosa.core.cqt(y, sr, hop_length=2**12, fmin=4.0,
                         bins_per_octave=bins_per_octave, n_bins=n_bins,
                         filter_scale=2)
    # RMS and peak magnitude per CQT bin across time
    spectrum = np.sqrt(np.mean(np.abs(C)**2, axis=-1))
    peak = np.sqrt(np.max(np.abs(C)**2, axis=-1))
    plt.figure()
    plt.xlim(4, 250)
    plt.semilogx(cqt_freq,
                 librosa.amplitude_to_db(spectrum, ref=np.max(peak)))
    # plt.semilogx(cqt_freq, librosa.amplitude_to_db(peak, ref=np.max(peak)))
    measurement = Signal(y, fs=500)
    f, Pxx = measurement.peakSpectrum()
    # plt.semilogx(f, librosa.amplitude_to_db(Pxx, ref=np.max(Pxx)))
    f, Pxx_spec = measurement.spectrum()
    plt.semilogx(f, librosa.amplitude_to_db(Pxx_spec, ref=np.max(Pxx)))
    plt.tight_layout()
    plt.show()
def processAudio(f_method='fft', b_method='times'):
    # Get raw PCM data
    y, sr = librosa.load(PATH_TO_AUDIO, duration=DURATION, sr=SAMPLE_RATE,
                         mono=True)

    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # Beat track on the percussive signal
    tempo, beat_frames = librosa.beat.beat_track(y=y_percussive,
                                                 sr=SAMPLE_RATE)
    if b_method == 'times':
        B = librosa.frames_to_time(beat_frames, sr=SAMPLE_RATE)
    else:
        B = beat_frames

    if f_method == 'fft':
        D = librosa.stft(y, n_fft=FFT_SIZE, hop_length=HOP_LENGTH,
                         center=True)
        F = librosa.fft_frequencies(sr=SAMPLE_RATE, n_fft=FFT_SIZE)
    elif f_method == 'cqt':
        D = librosa.cqt(y, sr=SAMPLE_RATE, hop_length=HOP_LENGTH, n_bins=84,
                        bins_per_octave=12, fmin=28)
        F = librosa.cqt_frequencies(84, 28, bins_per_octave=12)

    D = np.transpose(librosa.amplitude_to_db(np.abs(D), ref=np.max))

    return {
        "d": D,
        "f": F,
        "b": B,
    }
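# Illustrative module-level constants that processAudio reads; the names are
# taken from the function body, the values are typical placeholders, and
# 'example.wav' is a hypothetical input file.
PATH_TO_AUDIO = 'example.wav'
DURATION = 30.0
SAMPLE_RATE = 22050
FFT_SIZE = 2048
HOP_LENGTH = 512

out = processAudio(f_method='cqt')
print(out['d'].shape, out['f'].shape, out['b'][:4])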
def cqtgram(self, y, hop_length=512, octave_bins=24, n_octaves=8, fmin=40,
            perceptual_weighting=False):
    # NB: real=False, ref_power, and logamplitude are pre-0.6 librosa API;
    # newer versions use amplitude_to_db/power_to_db and ref=...
    S_complex = librosa.cqt(y, sr=self.sr, hop_length=hop_length,
                            bins_per_octave=octave_bins,
                            n_bins=octave_bins * n_octaves, fmin=fmin,
                            real=False)
    S = np.abs(S_complex)
    if perceptual_weighting:
        freqs = librosa.cqt_frequencies(S.shape[0], fmin=fmin,
                                        bins_per_octave=octave_bins)
        S = librosa.perceptual_weighting(S**2, freqs, ref_power=np.max)
    else:
        S = librosa.logamplitude(S**2, ref_power=np.max)
    return S
bound_frames = librosa.util.fix_frames(bound_frames,
                                       x_min=None,
                                       x_max=C.shape[1] - 1)

###################################################
# And plot the final segmentation over original CQT

# sphinx_gallery_thumbnail_number = 5
import matplotlib.patches as patches
import json

# plt.figure(figsize=(12, 4))
bound_times = librosa.frames_to_time(bound_frames)
freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
                                fmin=librosa.note_to_hz('C1'),
                                bins_per_octave=BINS_PER_OCTAVE)
print(len(bound_times))

with open("./output-segments.json", 'w+') as output:
    json.dump(bound_times.tolist(), output, indent=4)

librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                         bins_per_octave=BINS_PER_OCTAVE, x_axis='time')

ax = plt.gca()
for interval, label in zip(zip(bound_times, bound_times[1:]), bound_segs):
    ax.add_patch(patches.Rectangle((interval[0], freqs[0]),
                                   interval[1] - interval[0],
                                   freqs[-1],
                                   facecolor=colors(label),
                                   alpha=0.50))
def process_output(atb):
    freq_grid = librosa.cqt_frequencies(config.cqt_bins, config.fmin,
                                        config.bins_per_octave)
    time_grid = np.linspace(0, config.hoptime * atb.shape[0], atb.shape[0])
    time_grid, est_freqs = get_multif0(atb.T, freq_grid, time_grid)
    return time_grid, est_freqs
def icqt_recursive(C, sr=22050, hop_length=512, fmin=None,
                   bins_per_octave=12, filter_scale=None, norm=1, scale=True,
                   window='hann'):
    n_octaves = int(np.ceil(float(C.shape[0]) / bins_per_octave))

    if fmin is None:
        fmin = librosa.note_to_hz('C1')

    # Frequencies of the top octave
    freqs = librosa.cqt_frequencies(
        C.shape[0], fmin,
        bins_per_octave=bins_per_octave)[-bins_per_octave:]
    fmin_t = np.min(freqs)
    fmax_t = np.max(freqs)

    # Make the filter bank
    f, lengths = librosa.filters.constant_q(sr=sr, fmin=fmin_t,
                                            n_bins=bins_per_octave,
                                            bins_per_octave=bins_per_octave,
                                            filter_scale=filter_scale,
                                            norm=norm, window=window)
    if scale:
        f = f / np.sqrt(lengths[:, np.newaxis])
    else:
        f = f / lengths[:, np.newaxis]

    n_trim = f.shape[1] // 2

    # Sparsify the filters (the Hermitian transpose happens at projection
    # time via fb.conj().T)
    f = librosa.util.sparsify_rows(f)

    y = None
    for octave in range(n_octaves - 1, -1, -1):
        # Compute the slice index for the current octave
        slice_ = slice(-(octave + 1) * bins_per_octave - 1,
                       -(octave) * bins_per_octave - 1)

        # Project onto the basis
        C_ = C[slice_]
        fb = f[-C_.shape[0]:]  # / np.sqrt(lengths[-C_.shape[0]:, np.newaxis])
        Cf = fb.conj().T.dot(C_)

        # Overlap-add the responses
        y_oct = np.zeros(int(f.shape[1] +
                             (2**(-octave) * hop_length * C.shape[1])),
                         dtype=f.dtype)
        for i in range(Cf.shape[1]):
            y_oct[int(i * hop_length * 2**(-octave)):
                  int(i * hop_length * 2**(-octave) + Cf.shape[0])] += Cf[:, i]

        if y is None:
            y = y_oct
            continue

        # Up-sample the previous buffer and add in the new one
        y = (librosa.core.resample(y.real, 1, 2, scale=True) +
             1.j * librosa.core.resample(y.imag, 1, 2, scale=True))
        y = y[n_trim:-n_trim] / 2 + y_oct

    # Chop down the length
    y = librosa.util.fix_length(y.real, f.shape[1] + hop_length * C.shape[1])
    y *= 2**n_octaves

    # Trim off the center-padding
    return np.ascontiguousarray(y[n_trim:-n_trim])
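# A round-trip sketch: forward CQT, then approximate inversion with
# icqt_recursive. Written against the same pre-0.6 librosa API the function
# assumes (positional librosa.core.resample, librosa.filters.constant_q);
# filter_scale is passed explicitly since constant_q expects a number.
import librosa
import numpy as np

y, sr = librosa.load(librosa.util.example_audio_file(), duration=3.0)
C = librosa.cqt(y, sr=sr, hop_length=512, n_bins=7 * 12, bins_per_octave=12)
y_hat = icqt_recursive(C, sr=sr, hop_length=512, bins_per_octave=12,
                       filter_scale=1)
print(len(y), len(y_hat))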
librosa.display.specshow(librosa.logamplitude(melspec, ref_power=np.max),
                         y_axis='mel', sr=sr, cmap='viridis')
plt.xlabel('Original mel spectrum')
plt.subplot(2, 1, 2)
librosa.display.specshow(librosa.logamplitude(melspec2, ref_power=np.max),
                         y_axis='mel', sr=sr, cmap='viridis', x_axis='time')
plt.xlabel('Reconstructed signal')
plt.tight_layout()


# In[799]:

# Nyquist rate implied by the top CQT bin
sr_max = librosa.cqt_frequencies(
    n_bins=C.shape[0], fmin=librosa.note_to_hz('C1'),
    bins_per_octave=int(12 * over_sample))[-1] * 2


# In[800]:

# Band-limit y to sr_max by resampling down and back up
y_filt = librosa.resample(librosa.resample(y, sr, sr_max), sr_max, sr)


# In[801]:

mir_eval.separation.evaluate(y[np.newaxis, :], y2[np.newaxis, :])


# In[802]:
def decode_song(x, sr, detect_ends=True, draw=False):
    """Given an audio signal, returns the transcribed music as a DecodedSong
    instance."""
    n_bins = NUM_KEYS * BINS_PER_NOTE
    bins_per_octave = 12 * BINS_PER_NOTE
    Cxx = librosa.cqt(x, sr=sr, n_bins=n_bins,
                      bins_per_octave=bins_per_octave, fmin=FMIN)
    fs = librosa.cqt_frequencies(n_bins, FMIN, bins_per_octave)
    ts = librosa.frames_to_time(np.arange(Cxx.shape[1]), sr=sr)
    Sxx = librosa.amplitude_to_db(np.abs(Cxx)**2)

    if draw and False:  # intentionally disabled
        librosa.display.specshow(Sxx, sr=sr, x_axis='time', y_axis='off')
        plt.show()

    # compute onset strength envelope
    onset_envelope = get_onset_envelope(Sxx, ts, draw=draw)
    times = librosa.frames_to_time(np.arange(len(onset_envelope)), sr=sr)

    # display STFT for debugging
    """
    Sxx = librosa.core.stft(x, n_fft=4096, hop_length=512)
    fs = librosa.fft_frequencies(sr=sr, n_fft=4096)
    Sxx = librosa.amplitude_to_db(np.abs(Sxx)**2)
    if draw:
        librosa.display.specshow(Sxx, sr=sr, x_axis='time', y_axis='off')
        plt.show()
    """

    min_note_length = 0.075 * sr // 512 + 1  # ~75 ms
    # onset_envelope = librosa.onset.onset_strength(y=y, sr=sr, feature=librosa.cqt)
    onset_frames = librosa.onset.onset_detect(onset_envelope=onset_envelope,
                                              sr=sr, wait=min_note_length)

    # look ahead of note onsets to find the best frames to detect each note
    note_detect_frames = []
    for n in range(len(onset_frames)):
        if n == len(onset_frames) - 1:
            next_onset = len(onset_envelope) - 1
        else:
            next_onset = onset_frames[n + 1]
        onset_start = onset_frames[n]
        note_frame = int((next_onset - onset_start) * 0.5) + onset_start
        diff = np.diff(onset_envelope[onset_start:])
        decreasing = np.argwhere(diff < 0).flatten()
        if len(decreasing):
            # found a place to detect the note
            onset_forward_shift = int(0.15 // (ts[1] - ts[0]) + 1)
            note_frame2 = decreasing[0] + onset_forward_shift + onset_start
            # don't detect after the next note
            note_frame = min(note_frame, note_frame2)
        note_detect_frames.append(note_frame)
    note_detect_frames = np.array(note_detect_frames)

    onset_times = ts[onset_frames]
    note_detect_times = ts[note_detect_frames]

    if draw:
        # display onset envelope, onsets, and note detection frames
        plt.plot(times, onset_envelope, label='Onset strength')
        plt.vlines(onset_times, 0, onset_envelope.max(), color='r',
                   alpha=0.9, linestyle='--', label='Onsets')
        plt.vlines(note_detect_times, 0, onset_envelope.max(), color='g',
                   alpha=0.9, linestyle='--', label='Note detect')
        plt.axis('tight')
        plt.legend(frameon=True, framealpha=0.75)
        plt.show()

    # Sharpen low bins with a frequency-dependent exponent
    Sxx = librosa.amplitude_to_db(
        np.power(np.abs(Cxx).T, np.linspace(3, 2, num=Cxx.shape[0])).T)

    omit_notes = []
    song = DecodedSong()
    # iterate through onsets and add notes
    for n in range(len(onset_frames)):
        t = onset_times[n]
        if n == len(onset_frames) - 1:
            tnext = ts[-1]
        else:
            tnext = onset_times[n + 1]
        i = min(note_detect_frames[n], Sxx.shape[1] - 1)
        bins, notes, freqs, volumes = get_note_bins_at_index(
            Sxx, fs, i, omit_notes=omit_notes, peak_note=False, draw=False,
            polyphonic=False)
        omit_notes = notes
        # if the last note has NOT ended, the next note has to be of a
        # different frequency
        end_i = detect_note_end(bins[0], i, Sxx)
        tnext_detect = ts[end_i]
        if tnext_detect < tnext:
            omit_notes = []
        if freqs:
            for m in range(len(freqs)):
                freq = freqs[m]
                if detect_ends:
                    tnext = min(tnext_detect, tnext)
                song.add_note(Note(freq, t, tnext - t, volume=1))

    return song
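# A hypothetical driver for decode_song. NUM_KEYS, BINS_PER_NOTE, and FMIN
# mirror the constants the function reads (an 88-key piano range is assumed),
# and the module's helpers (get_onset_envelope, get_note_bins_at_index,
# detect_note_end, DecodedSong, Note) must be available for this to run.
import librosa

NUM_KEYS = 88
BINS_PER_NOTE = 1
FMIN = librosa.note_to_hz('A0')

x, sr = librosa.load('piano_take.wav')  # hypothetical recording
song = decode_song(x, sr, detect_ends=True)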
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd

# Reference [d]: Classical MIDI Files, https://www.mfiles.co.uk/classical-midi.htm

model = None
history = None
trainx, trainy = ([], [])
validx, validy = ([], [])

BINS = 88
FREF = librosa.note_to_hz('A0')  # Reference frequency = 27.50 Hz -> A0
# For 84 frequency bins:
# FREF = librosa.note_to_hz('C1')  # Reference frequency = 32.70 Hz -> C1
fbins = librosa.cqt_frequencies(BINS, fmin=FREF)
FMAX = fbins[BINS - 1]
STD = 25  # Standard deviation for the probability vector


def sample(filename):
    global trainx, trainy, validx, validy
    data, Sr = librosa.load("%s.mp3" % filename)
    D = np.abs(librosa.cqt(data, sr=Sr, fmin=FREF, n_bins=BINS))
    Spec = librosa.amplitude_to_db(librosa.magphase(D)[0], ref=np.min).T
    num_samples = 0  # Number of time frame samples per sound file
    with open("%s.txt" % filename) as f:
        for line in f:
            (i1, i2, i3) = line.split()
def segmentation(song, display=False):
    '''
    Takes in a song and returns a dictionary mapping each major segment to
    its time intervals, computed from the CQT spectrogram and the song's
    beat track. It also fills the song's beatTrack and uses it in the
    segmentation algorithm.

    Algorithm written by: Brian McFee
    https://bmcfee.github.io/

    :param song: (Song) | song to segment
    :param display: (bool) | optional argument to display a graph of the
        segments using matplotlib
    :return: seg_dict (dict) | dictionary of segments
    '''
    import numpy as np
    import scipy
    import matplotlib.pyplot as plt
    import sklearn.cluster

    y = song.load.y
    sr = song.load.sr
    beat_track = song.beat_track

    BINS_PER_OCTAVE = 12 * 3
    N_OCTAVES = 7
    C = librosa.amplitude_to_db(
        librosa.cqt(y=y, sr=sr, bins_per_octave=BINS_PER_OCTAVE,
                    n_bins=N_OCTAVES * BINS_PER_OCTAVE),
        ref=np.max)

    # To reduce dimensionality, we'll beat-synchronize the CQT
    tempo, beats = tuple(beat_track)
    Csync = librosa.util.sync(C, beats, aggregate=np.median)

    #####################################################################
    # Let's build a weighted recurrence matrix using beat-synchronous CQT
    # width=3 prevents links within the same bar
    # mode='affinity' here implements S_rep
    R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
                                          sym=True)

    # Enhance diagonals with a median filter (Equation 2)
    df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
    Rf = df(R, size=(1, 7))

    ###################################################################
    # Now let's build the sequence matrix (S_loc) using mfcc-similarity
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    Msync = librosa.util.sync(mfcc, beats)

    path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
    sigma = np.median(path_distance)
    path_sim = np.exp(-path_distance / sigma)

    R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)

    ##########################################################
    # And compute the balanced combination
    deg_path = np.sum(R_path, axis=1)
    deg_rec = np.sum(Rf, axis=1)

    mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)
    A = mu * Rf + (1 - mu) * R_path

    #####################################################
    # Now let's compute the normalized Laplacian
    L = scipy.sparse.csgraph.laplacian(A, normed=True)

    # and its spectral decomposition
    evals, evecs = scipy.linalg.eigh(L)

    # We can clean this up further with a median filter.
    # This can help smooth over small discontinuities
    evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))

    # cumulative normalization is needed for symmetric normalized Laplacian
    # eigenvectors
    Cnorm = np.cumsum(evecs**2, axis=1)**0.5

    # If we want k clusters, use the first k normalized eigenvectors
    k = 5
    X = evecs[:, :k] / Cnorm[:, k - 1:k]

    #############################################################
    # Let's use these k components to cluster beats into segments
    KM = sklearn.cluster.KMeans(n_clusters=k)
    seg_ids = KM.fit_predict(X)

    bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])
    bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)
    bound_segs = list(seg_ids[bound_beats])
    bound_frames = beats[bound_beats]
    bound_frames = librosa.util.fix_frames(bound_frames, x_min=None,
                                           x_max=C.shape[1] - 1)

    # Convert boundary frames to (start, end) time tuples per segment
    bound_tuples = []
    for i in range(1, len(bound_frames)):
        bound_tuples.append((bound_frames[i - 1], bound_frames[i] - 1))
    bound_tuples = tuple(map(lambda x: librosa.frames_to_time(x),
                             bound_tuples))

    pairs = zip(bound_segs, bound_tuples)
    seg_dict = dict()
    for seg, frame in pairs:
        seg_dict.setdefault(seg, []).append(frame)

    if display:
        import matplotlib.patches as patches
        plt.figure(figsize=(12, 4))
        colors = plt.get_cmap('Paired', k)
        bound_times = librosa.frames_to_time(bound_frames)
        freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
                                        fmin=librosa.note_to_hz('C1'),
                                        bins_per_octave=BINS_PER_OCTAVE)
        librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                                 bins_per_octave=BINS_PER_OCTAVE,
                                 x_axis='time')
        ax = plt.gca()
        for interval, label in zip(zip(bound_times, bound_times[1:]),
                                   bound_segs):
            ax.add_patch(
                patches.Rectangle((interval[0], freqs[0]),
                                  interval[1] - interval[0], freqs[-1],
                                  facecolor=colors(label), alpha=0.50))
        plt.tight_layout()
        plt.show()

    return seg_dict
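# A sketch of calling segmentation(), assuming a Song-like object exposing
# .load.y, .load.sr, and .beat_track as the function expects (the actual
# Song class is not shown above, so a SimpleNamespace stands in for it).
import types
import librosa

y, sr = librosa.load(librosa.ex('nutcracker'))
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
song = types.SimpleNamespace(
    load=types.SimpleNamespace(y=y, sr=sr),
    beat_track=(tempo, beats))

segments = segmentation(song, display=False)
print({seg: len(intervals) for seg, intervals in segments.items()})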