def test_vqt(
    y_cqt,
    sr_cqt,
    hop_length,
    fmin,
    n_bins,
    gamma,
    bins_per_octave,
    tuning,
    filter_scale,
    norm,
    res_type,
    sparsity,
):
    C = librosa.vqt(
        y=y_cqt,
        sr=sr_cqt,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        gamma=gamma,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        res_type=res_type,
    )

    # type is complex
    assert np.iscomplexobj(C)

    # number of bins is correct
    assert C.shape[0] == n_bins
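# A minimal, self-contained invocation mirroring the fixture-driven test above.
# The concrete values (a 110 Hz tone, 72 bins, default tuning/norm/sparsity) are
# illustrative assumptions, not the actual pytest parametrization.
import numpy as np
import librosa

y = librosa.tone(110, sr=22050, duration=1.0)
C = librosa.vqt(y=y, sr=22050, hop_length=512, fmin=None, n_bins=72,
                gamma=None, bins_per_octave=12, tuning=0.0,
                filter_scale=1, norm=1, sparsity=0.01)
assert np.iscomplexobj(C)
assert C.shape[0] == 72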
def AMT_Framing(filename_):
    # Audio Processing
    #filename = 'Guns N Roses-Sweet Child O Mine Intro.wav'
    filename = "{}".format(filename_)
    x, fs = librosa.load(filename, sr=None, mono=True, duration=DURATION)

    V = librosa.vqt(x, sr=fs, hop_length=hop_length, fmin=fmin, n_bins=n_bins,
                    gamma=20, bins_per_octave=bins_per_octave, tuning=tuning,
                    filter_scale=filter_scale, norm=norm, sparsity=0.01,
                    window='hann', scale=scale, pad_mode=pad_mode,
                    res_type=res_type, dtype=dtype)

    # Mapping Magnitude spectrogram to the Mel Scale
    V_mel = np.abs(V)
    logFrame = librosa.amplitude_to_db(V_mel)
    mels = librosa.feature.melspectrogram(S=V_mel, sr=fs, n_mels=n_mels,
                                          n_fft=hop_length * 2,
                                          hop_length=hop_length)

    np_array_list = []
    np_array_list.append(mels)

    frame_windows_list = []
    numSlices_list = []
    Y_numSlices = 625

    for i in range(len(np_array_list)):
        VQT_result = np_array_list[i]
        paddedX = np.zeros(
            (VQT_result.shape[0], VQT_result.shape[1] + WINDOW_SIZE - 1),
            dtype=float)
        pad_amount = int(WINDOW_SIZE / 2)
        paddedX[:, pad_amount:-pad_amount] = VQT_result
        frame_windows = np.array([
            paddedX[:, j:j + WINDOW_SIZE]
            for j in range(VQT_result.shape[1])
        ])
        #frame_windows = np.expand_dims(frame_windows, axis=3)
        numSlices = min(frame_windows.shape[0], Y_numSlices)
        numSlices_list.append(numSlices)
        frame_windows_list.append(frame_windows[:numSlices])

    audio_frames = np.concatenate(frame_windows_list, axis=0)
    #audio_frames = frame_windows_list
    #storingData(audio_frames)
    #print("Windows shape: ",audio_frames.shape)
    return audio_frames
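# Shape walk-through for AMT_Framing above, as a standalone sketch with
# illustrative values (n_mels = 229 and WINDOW_SIZE = 7 are assumptions; the
# symmetric padding in AMT_Framing only lines up when WINDOW_SIZE is odd).
import numpy as np

n_mels_demo, n_frames_demo, window_demo = 229, 500, 7
mels_demo = np.random.rand(n_mels_demo, n_frames_demo)

padded = np.zeros((n_mels_demo, n_frames_demo + window_demo - 1))
pad = window_demo // 2
padded[:, pad:-pad] = mels_demo

windows = np.array([padded[:, j:j + window_demo] for j in range(n_frames_demo)])
print(windows.shape)   # (500, 229, 7): one (n_mels x WINDOW_SIZE) slice per spectrogram frame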
def AMT(filename_):
    # Audio Processing
    # Loading the Audios
    # Path Configuration
    #path = os.getcwd() + '/' + filename_
    filename = "{}".format(filename_)
    x, fs = librosa.load(filename, sr=None, mono=True, duration=12)

    # VQT Computation
    V = librosa.vqt(x, sr=fs, hop_length=hop_length, fmin=fmin, n_bins=n_bins,
                    gamma=20, bins_per_octave=bins_per_octave, tuning=tuning,
                    filter_scale=filter_scale, norm=norm, sparsity=0.01,
                    window='hann', scale=scale, pad_mode=pad_mode,
                    res_type=res_type, dtype=dtype)
    V_mel = np.abs(V)

    # Conversion into the Mel-Scale to display and save Mel-spectrogram for prediction
    melspec(V_mel, filename)
def test_vqt(
    y_cqt_110,
    sr_cqt,
    hop_length,
    fmin,
    n_bins,
    gamma,
    bins_per_octave,
    tuning,
    filter_scale,
    norm,
    res_type,
    sparsity,
):
    C = librosa.vqt(
        y=y_cqt_110,
        sr=sr_cqt,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        gamma=gamma,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        res_type=res_type,
    )

    # type is complex
    assert np.iscomplexobj(C)

    # number of bins is correct
    assert C.shape[0] == n_bins

    if fmin is None:
        fmin = librosa.note_to_hz("C1")

    # check for peaks if 110 is within range
    if 110 <= fmin * 2 ** (n_bins / bins_per_octave):
        peaks = np.argmax(np.abs(C), axis=0)

        # This is our most common peak index in the CQT spectrum
        # we use the mode here over frames to sidestep transient effects
        # at the beginning and end of the CQT
        common_peak = scipy.stats.mode(peaks)[0][0]

        # Convert peak index to frequency
        peak_frequency = fmin * 2 ** (common_peak / bins_per_octave)
def VQT_from_file(audio_file, bins_per_octave=60, n_octaves=8, gamma=20):
    y, fs = librosa.load(audio_file, sr=25600)
    vqt = librosa.vqt(y, sr=fs, hop_length=256,
                      fmin=SpectrogramUtil.FMIN,
                      n_bins=bins_per_octave * n_octaves,
                      bins_per_octave=bins_per_octave,
                      gamma=gamma)
    log_vqt = ((1. / 80.) * librosa.amplitude_to_db(np.abs(np.array(vqt)),
                                                    ref=np.max)) + 1.
    return log_vqt
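# Illustrative call to VQT_from_file above; the file path is hypothetical and
# SpectrogramUtil.FMIN is assumed to be defined elsewhere in the project.
# Note that amplitude_to_db(..., ref=np.max) is clipped to [-80, 0] dB by its
# default top_db=80, so the (1/80)*dB + 1 scaling maps the output into [0, 1].
log_vqt = VQT_from_file('guitar_sample.wav')
print(log_vqt.shape)                 # (bins_per_octave * n_octaves, n_frames) = (480, n_frames)
print(log_vqt.min(), log_vqt.max())  # within [0.0, 1.0]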
def process_mel_spectrogram(self, n_mels, CQT=False, VQT=False):
    if CQT:
        C = np.abs(librosa.cqt(self.audio_waveform, sr=self.sample_rate, n_bins=112))
        CQT = librosa.amplitude_to_db(C, ref=np.max)
        self.mel_spectrogram = np.flipud(CQT)
    elif VQT:
        V = np.abs(librosa.vqt(self.audio_waveform, sr=self.sample_rate, n_bins=112))
        VQT = librosa.amplitude_to_db(V, ref=np.max)
        self.mel_spectrogram = np.flipud(VQT)
    else:
        mel_spectrogram = np.flipud(librosa.feature.melspectrogram(y=self.audio_waveform,
                                                                   sr=self.sample_rate,
                                                                   n_mels=n_mels))
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        self.mel_spectrogram = log_mel_spectrogram

    # Durations in seconds: MIDI length from tempo (microseconds per beat) and PPQ,
    # audio length from the frame count at librosa's default hop of 512 samples
    midi_time = self.midi_note_array.shape[1] * self.tempo / self.PPQ / 1000000
    audio_time = self.mel_spectrogram.shape[1] * 512 / self.sample_rate

    # Fixing audio and midi discrepancy
    time_difference = audio_time - midi_time
    time_difference_sample_ticks = round(time_difference * self.sample_rate / 512)
    print(time_difference_sample_ticks)
    if time_difference_sample_ticks > 0:
        self.mel_spectrogram = self.mel_spectrogram[:, 0:-time_difference_sample_ticks]
    self.song_total_sample_ticks = self.mel_spectrogram.shape[1]
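# Worked example of the alignment arithmetic above (illustrative numbers, not
# taken from any particular recording): a 96000-tick MIDI roll at
# tempo = 500000 us/beat and PPQ = 480 spans 100 s, while 4340 spectrogram
# frames at hop 512 / 22050 Hz span ~100.77 s, so ~33 trailing frames are trimmed.
tempo, PPQ, sample_rate = 500000, 480, 22050
midi_time = 96000 * tempo / PPQ / 1000000      # 100.0 seconds
audio_time = 4340 * 512 / sample_rate          # ~100.77 seconds
print(round((audio_time - midi_time) * sample_rate / 512))   # 33 frames to trim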
ax.set_ylabel('log Hz', size=15)
ax.set_xlabel('Time', size=15)
plt.show()

mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
print(mel_spectrogram)
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

C = np.abs(librosa.cqt(y, sr=sr, n_bins=112))
CQT = librosa.amplitude_to_db(C, ref=np.max)
print(CQT)

V = np.abs(librosa.vqt(y, sr=sr, n_bins=112))
VQT = librosa.amplitude_to_db(V, ref=np.max)
print(VQT)

X = librosa.stft(y)
Xdb = librosa.amplitude_to_db(abs(X))

fig, axs = plt.subplots(1, 3, figsize=(15, 20))
axs[0].imshow(np.flipud(VQT), aspect='auto', interpolation='nearest')
axs[1].imshow(np.flipud(log_mel_spectrogram), aspect='auto', interpolation='nearest')
axs[2].imshow(np.flipud(CQT), aspect='auto', interpolation='nearest')
plt.show()

# fig, axs = plt.subplots()
# plt.grid(b=None)
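# Optional sketch (not part of the original script): the same three spectrograms
# can be drawn with calibrated time/frequency axes via librosa.display.specshow.
# The axis choices below assume the default fmin (C1) and the default hop length.
import librosa.display

fig, axs = plt.subplots(1, 3, figsize=(15, 5))
librosa.display.specshow(VQT, sr=sr, x_axis='time', y_axis='cqt_hz', ax=axs[0])
axs[0].set_title('VQT (dB)')
librosa.display.specshow(log_mel_spectrogram, sr=sr, x_axis='time', y_axis='mel', ax=axs[1])
axs[1].set_title('log-mel (dB)')
librosa.display.specshow(CQT, sr=sr, x_axis='time', y_axis='cqt_hz', ax=axs[2])
axs[2].set_title('CQT (dB)')
plt.show()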
def librosa_hvqt(audio, harmonics, sample_rate, hop_length, fmin, n_bins, bins_per_octave, gamma):
    """
    Compute an HVQT using Librosa.

    Parameters
    ----------
    audio : ndarray (N)
        Audio to transform,
        N - number of samples
    harmonics : list of ints
        Specific harmonics to stack across the harmonic dimension
    sample_rate : int or float
        Number of samples per second of audio
    hop_length : int
        Number of samples between frames
    fmin : float
        Lowest center frequency in basis
    n_bins : int
        Number of basis functions in the filterbank
    bins_per_octave : int
        Number of basis functions per octave
    gamma : float
        Bandwidth offset to smoothly vary Q-factor

    Returns
    ----------
    hvqt : ndarray (H x F x T)
        Harmonic Variable-Q Transform (HVQT) for the provided audio,
        H - number of harmonics
        F - number of bins
        T - number of time steps (frames)
    """

    # Initialize a list to hold the harmonic-wise transforms
    hvqt = list()

    # Initialize a list to hold the number of frames for each transform
    frames = list()

    # Loop through harmonics
    for h in range(len(harmonics)):
        # Compute the true minimum center frequency for this harmonic
        h_fmin = harmonics[h] * fmin

        # Compute the VQT for this harmonic
        vqt = librosa.vqt(audio,
                          sr=sample_rate,
                          hop_length=hop_length,
                          fmin=h_fmin,
                          n_bins=n_bins,
                          gamma=gamma,
                          bins_per_octave=bins_per_octave)

        # Keep track of the number of frames produced
        frames.append(vqt.shape[-1])

        # Add the VQT to the collection
        hvqt.append(np.expand_dims(vqt, axis=0))

    # Determine the maximum number of frames that can be concatenated
    max_frames = min(frames)

    # Perform any trimming and concatenate
    hvqt = np.concatenate([vqt[..., :max_frames] for vqt in hvqt])

    # Take the magnitude
    hvqt = np.abs(hvqt)

    return hvqt
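# A minimal usage sketch for librosa_hvqt above. The file path, harmonic set,
# and transform parameters are illustrative assumptions chosen so the top bin
# stays safely below the Nyquist frequency at librosa's default 22050 Hz rate.
import librosa
import numpy as np

audio, sample_rate = librosa.load('example_track.wav')   # hypothetical file, sr defaults to 22050

hvqt = librosa_hvqt(audio,
                    harmonics=[1, 2, 3, 4, 5],
                    sample_rate=sample_rate,
                    hop_length=512,
                    fmin=librosa.note_to_hz('C1'),
                    n_bins=120,
                    bins_per_octave=24,
                    gamma=20)

# (H, F, T): 5 harmonics x 120 bins x number of frames
print(hvqt.shape)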