def test_yin_multi(y_multi):
    # yin broadcasts over leading channel axes: the multichannel result
    # must match per-channel calls on each row.
    y, sr = y_multi
    Pall = librosa.yin(y, fmin=30, fmax=300)
    P0 = librosa.yin(y[0], fmin=30, fmax=300)
    P1 = librosa.yin(y[1], fmin=30, fmax=300)
    assert np.allclose(Pall[0], P0)
    assert np.allclose(Pall[1], P1)
    assert not np.allclose(P0, P1)
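# A minimal standalone sketch of the same multichannel behavior, using a
# synthetic stereo signal in place of the y_multi fixture:
import numpy as np
import librosa

sr = 22050
t = np.arange(sr) / sr  # one second
y = np.stack([np.sin(2 * np.pi * 110 * t),   # channel 0: 110 Hz
              np.sin(2 * np.pi * 220 * t)])  # channel 1: 220 Hz
f0 = librosa.yin(y, fmin=50, fmax=500, sr=sr)
# f0 has shape (2, n_frames): row 0 tracks ~110 Hz, row 1 ~220 Hz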
def compute(self, samples):
    # Track F0 on the mono mixdown; center=False avoids padding at block edges
    self.F0 = librosa.yin(y=samples.mean(axis=1),
                          sr=self.samplerate,
                          frame_length=2048,
                          fmin=librosa.note_to_hz('C2'),
                          fmax=librosa.note_to_hz('C5'),
                          center=False)
def analyze(cls, events: List[CorpusEvent], metadata: Metadata) -> List[CorpusEvent]:
    if not FeatureUtils.is_valid_audio(events, metadata):
        raise FeatureError(f"Feature '{cls.__name__}' does not support content of "
                           f"type {metadata.content_type.__class__.__name__}")
    metadata: AudioMetadata = typing.cast(AudioMetadata, metadata)
    # TODO: Pass the pitch bounds as parameters rather than hard-coding them
    yin_frames: np.ndarray = librosa.yin(metadata.foreground_data, fmin=50, fmax=4186,
                                         sr=metadata.sr, frame_length=2048,
                                         hop_length=metadata.hop_length)
    # Convert Hz to rounded MIDI pitch; 8.175798915643707 Hz is MIDI note 0
    yin_midipitches: np.ndarray = np.round(
        12 * np.log2(yin_frames / 8.175798915643707))
    for event in events:
        onset_frame: int = librosa.time_to_frames(
            event.onset, sr=metadata.sr, hop_length=metadata.hop_length)
        end_frame: int = librosa.time_to_frames(
            event.onset + event.duration, sr=metadata.sr,
            hop_length=metadata.hop_length)
        # Most common MIDI pitch over the event's frames
        hist, _ = np.histogram(yin_midipitches[onset_frame:end_frame],
                               bins=128, range=(0, 128))
        pitch: int = int(np.argmax(hist))
        event.set_feature(cls(value=pitch))
    return events
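# The manual Hz-to-MIDI conversion above is equivalent to librosa.hz_to_midi,
# since 8.175798915643707 Hz is the frequency of MIDI note 0; a quick check:
import numpy as np
import librosa

f = np.array([110.0, 440.0, 1046.5])  # A2, A4, C6
manual = np.round(12 * np.log2(f / 8.175798915643707))
assert np.allclose(manual, np.round(librosa.hz_to_midi(f)))  # [45, 69, 84]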
def calculate_f0(file_path, fmin=65, fmax=2093, sample_rate=44100, frame_length=2048,
                 win_length=735 * 2, hop_length=735, threshold=0.1, seuil_voice=250):
    """
    Compute the fundamental frequency with the YIN algorithm. Any F0 estimate at
    or above seuil_voice is treated as unvoiced and reset to 0.

    :param file_path: path of the .wav file
    :param fmin: minimum frequency searched, in Hz
    :param fmax: maximum frequency searched, in Hz
    :param sample_rate: sample rate to load the wav file at
    :param frame_length: analysis frame length in samples
    :param win_length: YIN window length in samples
    :param hop_length: hop between frames in samples
    :param threshold: trough threshold for YIN's absolute-threshold step
    :param seuil_voice: voicing cutoff in Hz ("seuil" is French for threshold)
    :return: the F0 contour and the voiced/unvoiced flags, as two numpy arrays
    """
    wav_file, _ = librosa.load(file_path, sr=sample_rate)
    result_f0 = librosa.yin(wav_file, fmin=fmin, fmax=fmax, sr=sample_rate,
                            frame_length=frame_length, win_length=win_length,
                            hop_length=hop_length, trough_threshold=threshold)
    # Distinguish voiced and unvoiced parts of the waveform: estimates at or
    # above the cutoff are considered unvoiced, so their F0 is set to 0
    result_f0[result_f0 >= seuil_voice] = 0
    # Voiced/unvoiced flags: 1 wherever a valid F0 remains
    uv_flags = np.zeros(result_f0.shape)
    uv_flags[result_f0 > 0] = 1
    return result_f0, uv_flags
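# A minimal usage sketch for calculate_f0 (the file path is a placeholder):
import numpy as np

f0, uv = calculate_f0('speech.wav', fmin=65, fmax=2093, seuil_voice=250)
voiced = f0[uv == 1]
print(f"{voiced.size} voiced frames, mean F0 = {voiced.mean():.1f} Hz")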
def __call__(self, data):
    signal = data['signal']
    sr = data['sample_rate']
    # 25 ms window, 10 ms hop
    self.n_fft = int(np.ceil(0.025 * sr))
    self.win_length = int(np.ceil(0.025 * sr))
    self.hop_length = int(np.ceil(0.01 * sr))
    audio_mfcc = librosa.feature.mfcc(y=signal.numpy().reshape(-1), sr=sr, n_mfcc=13,
                                      n_fft=self.n_fft, hop_length=self.hop_length)
    # librosa.pyin could be used here instead to also obtain voicing flags
    f0 = librosa.yin(y=signal.numpy().reshape(-1), sr=sr,
                     frame_length=self.n_fft, hop_length=self.hop_length,
                     fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    # Zero out implausible estimates and NaNs
    f0[f0 > 500] = 0
    f0[np.isnan(f0)] = 0
    delta1 = librosa.feature.delta(audio_mfcc)
    delta2 = librosa.feature.delta(audio_mfcc, order=2)
    # Stack MFCCs, both deltas, and F0 as one (n_frames, 40) feature matrix
    mfcc_result = np.concatenate((audio_mfcc.transpose(), delta1.transpose(),
                                  delta2.transpose(), f0.reshape((-1, 1))),
                                 axis=1)
    mfcc_result = torch.from_numpy(mfcc_result)
    mfcc_result = mfcc_pad2(mfcc_result)
    data['MFCC'] = mfcc_result
    data['input'] = mfcc_result
    return data
def __call__(self, data):
    signal = data['signal']
    sr = data['sample_rate']
    # 25 ms window, 10 ms hop
    self.n_fft = int(np.ceil(0.025 * sr))
    self.win_length = int(np.ceil(0.025 * sr))
    self.hop_length = int(np.ceil(0.01 * sr))
    audio_mfcc = torch.FloatTensor(
        librosa.feature.mfcc(y=signal.numpy().reshape(-1), sr=sr, n_mfcc=13,
                             n_fft=self.n_fft, hop_length=self.hop_length))
    # librosa.pyin could be used here instead to also obtain voicing flags
    f0 = torch.FloatTensor(
        librosa.yin(y=signal.numpy().reshape(-1), sr=sr,
                    frame_length=self.n_fft, hop_length=self.hop_length,
                    fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')))
    # Zero out implausible estimates and NaNs
    f0[f0 > 500] = 0
    f0[torch.isnan(f0)] = 0
    delta1 = torch.FloatTensor(librosa.feature.delta(audio_mfcc.numpy()))
    delta2 = torch.FloatTensor(librosa.feature.delta(audio_mfcc.numpy(), order=2))
    f0_delta1 = torch.FloatTensor(librosa.feature.delta(f0.numpy())).view(1, -1)
    f0_delta2 = torch.FloatTensor(
        librosa.feature.delta(f0.numpy(), order=2)).view(1, -1)
    # Append F0 (and its deltas) as an extra row to each channel,
    # then stack into a (3, 14, n_frames) tensor
    audio_mfcc = torch.cat((audio_mfcc, f0.view(1, -1)))
    delta1 = torch.cat((delta1, f0_delta1))
    delta2 = torch.cat((delta2, f0_delta2))
    mfcc_result = torch.stack([audio_mfcc, delta1, delta2])
    mfcc_result = mfcc_pad1(mfcc_result)
    data['MFCC'] = mfcc_result
    data['input'] = mfcc_result
    return data
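# Both transforms above rely on librosa.feature.mfcc and librosa.yin producing
# the same number of frames for matching frame/hop lengths (both center their
# frames by default). A standalone check with hypothetical 16 kHz parameters:
import librosa

sr = 16000
y = librosa.tone(220, sr=sr, duration=1.0)
n_fft = int(0.025 * sr)  # 400 samples = 25 ms
hop = int(0.010 * sr)    # 160 samples = 10 ms
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop)
f0 = librosa.yin(y, fmin=100, fmax=500, sr=sr, frame_length=n_fft, hop_length=hop)
assert mfcc.shape[1] == f0.shape[0]  # equal frame counts, so they stack cleanly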
def calculate_f0_without_threshold(file_path, fmin=65, fmax=2093, sample_rate=44100,
                                   frame_length=2048, win_length=735 * 2,
                                   hop_length=735, threshold=0.1):
    """
    Compute the fundamental frequency with YIN, without applying the voicing cutoff.

    :param file_path: path of the .wav file
    :param fmin: minimum frequency searched, in Hz
    :param fmax: maximum frequency searched, in Hz
    :param sample_rate: sample rate to load the wav file at
    :param frame_length: analysis frame length in samples
    :param win_length: YIN window length in samples
    :param hop_length: hop between frames in samples
    :param threshold: trough threshold for YIN's absolute-threshold step
    :return: a numpy array containing the raw F0 contour
    """
    wav_file, _ = librosa.load(file_path, sr=sample_rate)
    return librosa.yin(wav_file, fmin=fmin, fmax=fmax, sr=sample_rate,
                       frame_length=frame_length, win_length=win_length,
                       hop_length=hop_length, trough_threshold=threshold)
import sounddevice as sd
from scipy.io.wavfile import write
import librosa

fs = 44100   # sample rate
seconds = 4  # duration of recording

print('start recording')
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()  # wait until recording is finished
print('finished recording')
write('output.wav', fs, myrecording)  # save as wav file

y, sr = librosa.load('output.wav')
# Guitar strings: E2=82.41 Hz, A2=110 Hz, D3=146.8 Hz, G3=196 Hz,
# B3=246.9 Hz, E4=329.6 Hz. This search range brackets the high E string.
f0 = librosa.yin(y, fmin=310, fmax=400, sr=sr)
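# A hedged follow-up: for a tuner-style readout, the median estimate can be
# mapped to the nearest note name with librosa.hz_to_note (this assumes the
# plucked string dominates the recording):
import numpy as np

f0_med = np.median(f0)  # median is robust to occasional octave errors
print(f"{f0_med:.1f} Hz ~ {librosa.hz_to_note(f0_med)}")  # e.g. "330.1 Hz ~ E4"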
y = normalize_signal(y)
f_min = 80    # Hz
f_max = 1300  # Hz
taus = np.arange(start=sr // f_max, stop=sr // f_min)  # candidate lags in samples
coinv_window = int(taus[-1] + 1)
n_windows = len(y) // coinv_window - 1

pitches1 = step1(y)
pitches2 = step2(y)
pitches3 = step3(y)
pitches4 = step4(y)
pitches5 = step5(y)
pitches6 = step6(y)
# Keyword arguments (fmin/fmax are keyword-only in recent librosa); pass sr
# so lags are converted to Hz at the true sample rate
librosa_pitches = librosa.yin(y, fmin=100, fmax=1300, sr=sr,
                              win_length=coinv_window)

fig, axs = plt.subplots(3, 2)
custom_xlim = 0, len(y) / sr
custom_ylim = 0, max(pitches5) * 1.25
axs[0, 0].plot(np.linspace(0, len(y) / sr, num=n_windows), pitches1)
axs[0, 0].hlines(440, -0.1 * len(y) / sr, len(y) / sr * 1.1,
                 colors='r', linestyles='--')
axs[0, 0].set_title('Step 1')
axs[0, 1].plot(np.linspace(0, len(y) / sr, num=n_windows), pitches2)
axs[0, 1].hlines(440, -0.1 * len(y) / sr, len(y) / sr * 1.1,
                 colors='r', linestyles='--')
def __init__(self, *args, **kwargs):
    super(self.__class__, self).__init__(*args, **kwargs)
    args, _ = parser.parse_known_args()
    if args.samplerate is None:
        self.samplerate = \
            int(sd.query_devices(args.input_device)['default_samplerate'])
    else:
        self.samplerate = int(args.samplerate)
    print(f"INFO -- Sampling rate at {self.samplerate} Hz")
    self.threadpool = QtCore.QThreadPool()
    self.q = queue.Queue()
    self.setFixedSize(args.width, args.height)
    self.mainbox = QtWidgets.QWidget()
    self.setCentralWidget(self.mainbox)
    self.layout = QtWidgets.QGridLayout()
    self.mainbox.setLayout(self.layout)

    # Widgets
    self.spec_plot = SpectrogramWidget()
    self.wave_plot = WaveFormWidget()
    for i, widget in enumerate([self.spec_plot, self.wave_plot]):
        self.layout.addWidget(widget, i, 0)

    # Initialize x and y
    self.length = self.samplerate * args.duration
    self.y = np.random.rand(self.length, len(args.channels))
    self.x = np.linspace(0, args.duration, num=self.length)
    self.zcr = librosa.feature.zero_crossing_rate(self.y.mean(axis=1))[0]

    # Wave plot
    self.waveline_1 = self.wave_plot.plot(x=self.x, y=self.y[:, 0],
                                          pen=pg.mkPen('g', width=0.5),
                                          name='channel_1')
    self.waveline_2 = self.wave_plot.plot(x=self.x, y=self.y[:, 1],
                                          pen=pg.mkPen('y', width=0.5),
                                          name='channel_2')
    self.waveline_3 = self.wave_plot.plot(
        x=np.linspace(0, args.duration, self.zcr.shape[0]),
        y=self.zcr, pen=pg.mkPen('r', width=2), name='zcr')

    # Spectrogram
    self.fmax = int(
        librosa.core.fft_frequencies(sr=self.samplerate, n_fft=args.n_fft)[-1])
    D = librosa.stft(y=self.y.mean(axis=1), n_fft=args.n_fft, center=False)
    self.specdata = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    # Alternative: a mel spectrogram via librosa.feature.melspectrogram
    # and librosa.power_to_db could replace the linear-frequency STFT here.
    self.F0 = librosa.yin(y=self.y.mean(axis=1),
                          sr=self.samplerate,
                          frame_length=2048,
                          fmin=librosa.note_to_hz('C2'),
                          fmax=librosa.note_to_hz('C5'),
                          center=False)
    self.spec_image = pg.ImageItem(item=self.specdata.T)
    self.spec_plot.addItem(item=self.spec_image)
    self.f0_line = self.spec_plot.plot(
        x=np.linspace(0, args.duration, self.F0.shape[0]),
        y=self.F0, pen=pg.mkPen('r', width=2), name='f0')
    self.bar = pg.ColorBarItem(values=(librosa.note_to_hz('C2'),
                                       librosa.note_to_hz('C5')),
                               cmap=pg.colormap.get('CET-L9'))
    self.bar.setImageItem(self.spec_image)

    # Start audio stream and animations
    self.start_stream()
    if args.input_device == 'Virtual Input (VB-Audio Virtual Cable), Windows DirectSound':
        self.play_media(media_url=args.media_url, type='stream', volume=100)
    self.animate()
    self.show()
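# Side note on the F0 time axis above: librosa.times_like can compute frame
# timestamps directly. A minimal standalone sketch, assuming frame_length=2048,
# the default hop of frame_length // 4, and center=False analysis (passing
# n_fft shifts the stamps to the frame centers):
import numpy as np
import librosa

sr = 44100
y = np.random.rand(sr)  # 1 s of noise as a stand-in for the mono mixdown
f0 = librosa.yin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C5'),
                 sr=sr, frame_length=2048, center=False)
times = librosa.times_like(f0, sr=sr, hop_length=512, n_fft=2048)
assert times.shape == f0.shape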
              'audio_files/oh/oh_0.wav', 'audio_files/eeh/eeh_0.wav',
              'audio_files/ah/ah_1.wav', 'audio_files/oh/oh_1.wav',
              'audio_files/eeh/eeh_1.wav', 'audio_files/ah/ah_2.wav',
              'audio_files/oh/oh_2.wav', 'audio_files/eeh/eeh_2.wav']

sythesized = np.zeros(0)
recorded = np.zeros(0)

for i, file_name in enumerate(file_names):
    y, sr = librosa.load(file_name, sr=44100)
    # Pass sr so YIN converts lag to Hz at the true sample rate;
    # fmin/fmax are keyword-only in recent librosa
    pitch = librosa.yin(y, fmin=50, fmax=1000, sr=sr)
    fr = np.mean(pitch)  # average F0 of the recording
    n_fft = 8096
    hop_size = 256
    p = librosa.stft(y, n_fft=n_fft, hop_length=hop_size)
    d = librosa.amplitude_to_db(np.abs(p), ref=np.max)
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    # Time-averaged spectrum, shifted so its minimum sits at 0 dB
    m = np.mean(d, axis=1)
    m -= np.min(m)

def moving_average(x, w):
    # Pad with edge values so the output matches the input length
    x = np.hstack((x[0] * np.ones(w // 2), x, x[-1] * np.ones(w // 2 - 1)))
    ma = np.convolve(x, np.ones(w), 'valid') / w
    return ma
def f0(wave_form, sample_rate, hop_length, fmin=80, fmax=10000):
    # Pass sample_rate through to yin; otherwise it assumes the 22050 Hz default
    return yin(wave_form, fmin=fmin, fmax=fmax, sr=sample_rate,
               hop_length=hop_length).reshape(-1, 1)
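# A quick shape check for the wrapper, on a synthetic 440 Hz tone (assumes the
# default frame_length of 2048):
import numpy as np
import librosa

sr = 22050
y = librosa.tone(440, sr=sr, duration=1.0)
out = f0(y, sample_rate=sr, hop_length=256)
print(out.shape)  # (n_frames, 1): a column vector ready to stack with features
assert np.allclose(np.median(out), 440, rtol=0.02)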
def pitch(samples: Samples) -> float:
    # fmin/fmax are keyword-only in recent librosa
    freqs = librosa.yin(samples.data, fmin=65, fmax=2000, sr=samples.rate)
    return float(np.mean(freqs))
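# np.mean is sensitive to occasional octave errors in the YIN track; a
# hypothetical variant using the median as a more robust aggregate:
def pitch_median(samples: Samples) -> float:
    freqs = librosa.yin(samples.data, fmin=65, fmax=2000, sr=samples.rate)
    return float(np.median(freqs))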
def get_f0(self):
    # Keyword arguments make the bounds explicit (fmin=65 Hz ~ C2, fmax=2093 Hz ~ C7)
    f0 = lr.yin(self.samples, fmin=65, fmax=2093, sr=SAMPLE_RATE)
    return np.mean(f0)