def test_pitch_feats(self, kwargs):
    """Check ``F.compute_kaldi_pitch`` against the ``compute-kaldi-pitch-feats`` CLI.

    The same sinusoid is generated twice at ``kwargs['sample_rate']``: once as
    ``float32`` and fed to the Python implementation, once as ``int16`` and
    written to a temporary wav file that the Kaldi command reads. Both outputs
    are then compared with ``self.assert_equal``.

    Args:
        kwargs: Keyword arguments forwarded to ``F.compute_kaldi_pitch`` and,
            via ``convert_args``, turned into command-line options. Must
            contain the key ``'sample_rate'``.
    """
    rate = kwargs['sample_rate']

    # Python side: float32 signal, first entry along the leading dimension.
    float_wave = get_sinusoid(dtype='float32', sample_rate=rate)
    actual = F.compute_kaldi_pitch(float_wave[0], **kwargs)

    # Kaldi side: the int16 signal is written to disk for the CLI to read.
    int_wave = get_sinusoid(dtype='int16', sample_rate=rate)
    wav_path = self.get_temp_path('test.wav')
    save_wav(wav_path, int_wave, rate)

    cli_options = convert_args(**kwargs)
    kaldi_cmd = ['compute-kaldi-pitch-feats'] + cli_options + ['scp:-', 'ark:-']
    reference = run_kaldi(kaldi_cmd, 'scp', wav_path)

    self.assert_equal(actual, expected=reference)
def extract_features(x, sr):
    """Compute a set of acoustic features for a waveform.

    Uses a 50 ms analysis window and a 10 ms step, both converted to sample
    counts with ``sr``. Mel-based features use 128 mel bins capped at 8000 Hz.

    Args:
        x: Waveform tensor. Indexed with ``[0]`` throughout, so a leading
            (presumably channel) dimension is expected -- TODO confirm.
        sr: Sample rate of ``x`` in Hz.

    Returns:
        dict with the keys:
            ``"Waveform"``: ``x[0]``.
            ``"MelSpectrogram"``: mel spectrogram passed through
                ``AT.AmplitudeToDB``.
            ``"MFCC"``: 40 MFCCs, standardised (mean/std) along ``dim=1``.
            ``"Pitch"``: first entry of the ``AF.compute_kaldi_pitch`` output;
                per the torchaudio docs its last dimension carries both pitch
                and NCCF.
            ``"Intensity"``: log of the mel spectrogram averaged over
                ``dim=0``, taken before the dB conversion.
    """
    hop_seconds = 0.01
    window_seconds = 0.05

    # Mel settings shared by the spectrogram and the MFCC front end.
    mel_settings = {
        "n_fft": int(window_seconds * sr),
        "hop_length": int(hop_seconds * sr),
        "n_mels": 128,
        "f_max": 8000,
    }

    to_mel = AT.MelSpectrogram(sample_rate=sr, **mel_settings)
    mel_power = to_mel(x)[0]
    # Intensity must be derived from the raw mel spectrogram, not the dB one.
    intensity = mel_power.mean(dim=0).log()
    mel_db = AT.AmplitudeToDB()(mel_power)

    to_mfcc = AT.MFCC(sample_rate=sr, n_mfcc=40, melkwargs=dict(mel_settings))
    raw_mfcc = to_mfcc(x)[0]
    mfcc_mean = raw_mfcc.mean(dim=1, keepdim=True)
    mfcc_std = raw_mfcc.std(dim=1, keepdim=True)
    normalised_mfcc = (raw_mfcc - mfcc_mean) / mfcc_std

    # compute_kaldi_pitch takes its frame sizes in milliseconds.
    kaldi_pitch = AF.compute_kaldi_pitch(
        x,
        sample_rate=sr,
        frame_length=window_seconds * 1000,
        frame_shift=hop_seconds * 1000,
        snip_edges=True,
        min_f0=70,
        max_f0=350,
        penalty_factor=0.01,
    )

    features = {}
    features["Waveform"] = x[0]
    features["MelSpectrogram"] = mel_db
    features["MFCC"] = normalised_mfcc
    features["Pitch"] = kaldi_pitch[0]
    features["Intensity"] = intensity
    return features
plot_pitch(waveform, sample_rate, pitch)
play_audio(waveform, sample_rate)

######################################################################
# Kaldi Pitch (beta)
# ------------------
#
# The Kaldi Pitch feature [1] is a pitch detection mechanism tuned for
# automatic speech recognition (ASR) applications. It is a beta feature in
# ``torchaudio`` and is available only in ``functional``.
#
# 1. A pitch extraction algorithm tuned for automatic speech recognition
#
#    P. Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
#    Khudanpur
#
#    2014 IEEE International Conference on Acoustics, Speech and Signal
#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
#    10.1109/ICASSP.2014.6854049.
#    [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
#    [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
#

waveform, sample_rate = get_speech_sample(resample=16000)

pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate)
# The last dimension holds two values per frame; index 0 is used as the
# pitch track and index 1 as the second curve passed to the plot.
pitch = pitch_feature[..., 0]
nfcc = pitch_feature[..., 1]

plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc)
play_audio(waveform, sample_rate)
def func(tensor):
    """Run ``F.compute_kaldi_pitch`` on ``tensor`` at a fixed 44.1 kHz sample rate."""
    # Explicit float annotation keeps the rate typed as a float.
    fixed_rate: float = 44100.0
    return F.compute_kaldi_pitch(tensor, fixed_rate)