def test_window_length(self):
    """Compare a size-512 / window_length-400 round trip with a size-400 one.

    The time signal restored from the STFT computed with ``size=512`` and
    ``window_length=400`` has to match the signal restored from a plain
    ``size=400`` STFT that uses the same shift.
    """
    fft_size, shift, window_length = 512, 160, 400

    spectrum = stft(self.x, fft_size, shift, window_length=window_length)
    restored = istft(
        spectrum, fft_size, shift, window_length=window_length)

    reference_spectrum = stft(self.x, window_length, shift)
    reference = istft(reference_spectrum, window_length, shift)

    tc.assert_equal(spectrum.shape, (243, 257))
    tc.assert_allclose(reference, restored, rtol=1e-6, atol=1e-6)
def test_restore_time_signal_with_str_window(self):
    """Round trip stft -> istft with the window selected by its name."""
    signal = self.x
    spectrum = stft(signal, window='hann')

    restored = istft(
        spectrum, 1024, 256, window='hann', num_samples=len(signal))

    tc.assert_almost_equal(signal, restored)
    tc.assert_equal(spectrum.shape, (154, 513))
def stft(self, x):
    """Apply the paderbox STFT to ``x`` with this object's STFT settings.

    The size, shift and fading options are read from ``self.stft_size``,
    ``self.stft_shift`` and ``self.stft_fading``.
    """
    # Function-scope import, kept local exactly like before.
    from paderbox.transform.module_stft import stft as _stft

    options = {
        'size': self.stft_size,
        'shift': self.stft_shift,
        'fading': self.stft_fading,
    }
    return _stft(x, **options)
def test_compare_with_matlab(self):
    """Check the Python STFT against the result of the MATLAB toolbox."""
    signal = self.x
    python_result = stft(signal, symmetric_window=True)

    matlab = Mlab().process
    matlab.set_variable('y', signal)
    matlab.run_code('Y = transform.stft(y(:), 1024, 256, @blackman);')
    # The MATLAB result is transposed before it is compared.
    matlab_result = matlab.get_variable('Y').T

    tc.assert_almost_equal(matlab_result, python_result)
def test_restore_time_signal_from_stft_and_istft_odd_parameter(self):
    """Round trip with an odd FFT size, a random shift and a random window.

    The drawn parameters are passed as the error message so that a failing
    combination can be reproduced.
    """
    signal = self.x
    import random

    params = {
        'size': 151,  # Test uneven size (fixed instead of drawn randomly)
        'shift': np.random.randint(40, 100),
        'window': random.choice(['blackman', 'hann', 'hamming']),
        'fading': 'full',
    }

    spectrum = stft(signal, **params)
    restored = istft(spectrum, **params, num_samples=signal.shape[-1])

    assert restored.dtype == np.float64, (restored.dtype, signal.dtype)
    tc.assert_almost_equal(signal, restored, err_msg=str(params))
def test_batch_mode(self):
    """Check that stft handles additional (batch) axes around the time axis.

    All channels carry the same signal, so every slice of a batched result
    has to equal the single-channel reference ``X``.
    """
    # Reference
    X = stft_single_channel(self.x)

    # One batch axis in front of the time axis.
    x1 = np.array([self.x, self.x])
    X1 = stft(x1)
    tc.assert_equal(X1.shape, (2, 154, 513))
    # np.ndindex(2) yields 1-tuples, so indexing keeps a length-1 axis
    # that squeeze() removes again.
    for d in np.ndindex(2):
        tc.assert_equal(X1[d, :, :].squeeze(), X)

    # Two batch axes in front of the time axis.
    x11 = np.array([x1, x1])
    X11 = stft(x11)
    tc.assert_equal(X11.shape, (2, 2, 154, 513))
    for d, k in np.ndindex(2, 2):
        tc.assert_equal(X11[d, k, :, :].squeeze(), X)

    # Time on axis 0, one batch axis behind it.
    x2 = x1.transpose()
    X2 = stft(x2, axis=0)
    tc.assert_equal(X2.shape, (154, 513, 2))
    for d in np.ndindex(2):
        tc.assert_equal(X2[:, :, d].squeeze(), X)

    # Time on axis 1, batch axes on both sides.
    x21 = np.array([x2, x2])
    X21 = stft(x21, axis=1)
    tc.assert_equal(X21.shape, (2, 154, 513, 2))
    for d, k in np.ndindex(2, 2):
        tc.assert_equal(X21[d, :, :, k].squeeze(), X)

    # Time on axis 0, two batch axes behind it.
    x22 = x21.swapaxes(0, 1)
    X22 = stft(x22, axis=0)
    tc.assert_equal(X22.shape, (154, 513, 2, 2))
    for d, k in np.ndindex(2, 2):
        tc.assert_equal(X22[:, :, d, k].squeeze(), X)
def test_spectrogram_and_energy(self):
    """Check shape, realness and non-negativity of spectrogram and energy."""
    stft_signal = stft(self.x)
    spec = stft_to_spectrogram(stft_signal)
    frame_energy = spectrogram_to_energy_per_frame(spec)

    tc.assert_equal(stft_signal.shape, (154, 513))

    # Spectrogram: same shape as the STFT, real valued and >= 0.
    tc.assert_equal(spec.shape, (154, 513))
    tc.assert_isreal(spec)
    tc.assert_array_greater_equal(spec, 0)

    # Energy: one real, non-negative value per frame.
    tc.assert_equal(frame_energy.shape, (154, ))
    tc.assert_isreal(frame_energy)
    tc.assert_array_greater_equal(frame_energy, 0)
def test_restore_time_signal_from_stft_and_istft(self):
    """Round trip with default stft parameters, cropping the istft output."""
    signal = self.x
    spectrum = stft(signal)

    # istft is called without num_samples here, so the result is cropped
    # to the input length before comparing.
    restored = istft(spectrum, 1024, 256)[:len(signal)]

    tc.assert_almost_equal(signal, restored)
    tc.assert_equal(spectrum.shape, (154, 513))
def test_stft_frame_count(self):
    """Check the STFT output shape for signal lengths around frame borders.

    Each entry pairs one stft parameter set with a list of
    ``(number of samples, expected STFT shape)`` tuples.
    """
    cases = [
        (
            dict(size=1024, shift=256, fading=False),
            [
                (1023, (1, 513)),
                (1024, (1, 513)),
                (1025, (2, 513)),
            ],
        ),
        (
            dict(size=1024, shift=256, fading=True),
            [
                (1023, (7, 513)),
                (1024, (7, 513)),
                (1025, (8, 513)),
            ],
        ),
        (
            dict(size=512, shift=160, window_length=400, fading=False),
            [
                (399, (1, 257)),
                (400, (1, 257)),
                (401, (2, 257)),
                (559, (2, 257)),
                (560, (2, 257)),
                (561, (3, 257)),
            ],
        ),
    ]

    for stft_params, expectations in cases:
        for num_samples, expected_shape in expectations:
            noise = np.random.normal(size=[num_samples])
            spectrum = stft(noise, **stft_params)
            tc.assert_equal(spectrum.shape, expected_shape)
def test_restore_time_signal_from_stft_and_istft_kaldi_params(self):
    """Round trip with size 400 and shift 160."""
    size, shift = 400, 160
    signal = self.x

    spectrum = stft(signal, size=size, shift=shift)
    restored = istft(spectrum, size, shift)[:len(signal)]

    tc.assert_almost_equal(signal, restored)
    tc.assert_equal(spectrum.shape, (243, 201))
def test_restore_time_signal_from_stft_and_istft_with_num_samples(self):
    """Round trip where istft receives the signal length as num_samples."""
    signal = self.x
    spectrum = stft(signal)

    restored = istft(spectrum, 1024, 256, num_samples=len(signal))

    tc.assert_almost_equal(signal, restored)
    tc.assert_equal(spectrum.shape, (154, 513))
def fbank(
        time_signal: np.ndarray,
        sample_rate: int = 16000,
        window_length: int = 400,
        stft_shift: int = 160,
        number_of_filters: int = 23,
        stft_size: int = 512,
        lowest_frequency: float = 0.,
        highest_frequency: Optional[float] = None,
        preemphasis_factor: float = 0.97,
        window: Callable = scipy.signal.windows.hamming,
        denoise: bool = False,
):
    """Compute Mel-filterbank energy features from an audio signal.

    Processing chain: pre-emphasis, STFT (without fading), spectrogram
    scaled by ``1 / stft_size``, Mel filterbank (no logarithm).

    Source: https://github.com/jameslyons/python_speech_features
    Tutorial: http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/ # noqa
    Illustrations: http://ntjenkins.upb.de/view/PythonToolbox/job/python_toolbox_notebooks/HTML_Report/toolbox_examples/transform/06%20-%20Additional%20features.html

    Args:
        time_signal: the audio signal from which to compute features.
            Should be an N*1 array.
        sample_rate: the sample rate of the signal we are working with.
        window_length: the length of the analysis window in samples.
            Default is 400 (25 milliseconds @ 16kHz).
        stft_shift: the step between successive windows in samples.
            Default is 160 (10 milliseconds @ 16kHz).
        number_of_filters: the number of filters in the filterbank.
            Default is 23.
        stft_size: the FFT size. Default is 512.
        lowest_frequency: lowest band edge of the mel filters in Hz.
            Default is 0.
        highest_frequency: highest band edge of the mel filters in Hz.
            Any falsy value (``None``, ``0``) selects ``sample_rate / 2``.
        preemphasis_factor: coefficient of the preemphasis filter.
            0 is no filter. Default is 0.97.
        window: window function used for the stft.
        denoise: if True, the minimum taken along axis 0 of the feature
            array is subtracted from the features (in place).

    Returns:
        A numpy array of size (frames by number_of_filters) containing the
        Mel filterbank features.
    """
    if not highest_frequency:
        highest_frequency = sample_rate / 2

    emphasized = preemphasis_with_offset_compensation(
        time_signal, preemphasis_factor)

    stft_signal = stft(
        emphasized,
        size=stft_size,
        shift=stft_shift,
        window=window,
        window_length=window_length,
        fading=None,
    )
    spectrogram = stft_to_spectrogram(stft_signal) / stft_size

    mel_transform = MelTransform(
        sample_rate=sample_rate,
        stft_size=stft_size,
        number_of_filters=number_of_filters,
        lowest_frequency=lowest_frequency,
        highest_frequency=highest_frequency,
        log=False,
    )
    feature = mel_transform(spectrogram)

    if denoise:
        feature -= np.min(feature, axis=0)
    return feature
def modmfcc(time_signal, sample_rate=16000,
            stft_win_len=400, stft_shift=160, numcep=30,
            number_of_filters=40, stft_size=512,
            lowest_frequency=0, highest_frequency=None,
            preemphasis_factor=0.97, ceplifter=22,
            stft_window=scipy.signal.windows.hamming,
            mod_length=16, mod_shift=8,
            mod_window=scipy.signal.windows.hamming,
            avg_length=1, avg_shift=1):
    """
    Compute Mod-MFCC features from an audio signal.

    MFCC features are computed first. A second STFT (the modulation
    analysis) is then applied to them along axis -2 and its magnitude is
    taken. If ``avg_length > 1`` the result is additionally segmented and
    averaged along axis -3.

    :param time_signal: the audio signal from which to compute features.
        Should be an channels x samples array.
    :param sample_rate: the sample rate of the signal we are working with.
        Default is 16000.
    :param stft_win_len: the length of the analysis window. In samples.
        Default is 400 (25 milliseconds @ 16kHz).
    :param stft_shift: the step between successive windows. In samples.
        Default is 160 (10 milliseconds @ 16kHz).
    :param numcep: the number of cepstrum to return, Default is 30.
    :param number_of_filters: number of filters in the filterbank,
        Default is 40.
    :param stft_size: the FFT size. Default is 512.
    :param lowest_frequency: lowest band edge of mel filters.
        In Hz, Default is 0.
    :param highest_frequency: highest band edge of mel filters.
        In Hz, Default is samplerate/2.
    :param preemphasis_factor: apply preemphasis filter with
        preemphasis_factor as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: the liftering coefficient to use.
        ceplifter <= 0 disables lifter. Default is 22.
    :param stft_window: the window function to use for fbank features.
        Default is hamming window.
    :param mod_length: size of the modulation STFT applied to the MFCCs.
        Default is 16.
    :param mod_shift: shift of the modulation STFT. Default is 8.
    :param mod_window: window function of the modulation STFT.
        Default is hamming window.
    :param avg_length: number of consecutive entries along axis -3 that
        are averaged; values <= 1 disable averaging. Default is 1.
    :param avg_shift: shift between consecutive averaging segments.
        Must not exceed avg_length. Default is 1.
    :returns: A numpy array with the magnitudes of the modulation STFT.
        NOTE(review): the exact axis layout depends on how ``stft`` and
        ``segment_axis`` arrange their output axes -- confirm.
    :raises AssertionError: if ``avg_length < avg_shift``.
    """
    # Validate the averaging parameters before doing any expensive work.
    assert avg_length >= avg_shift, (avg_length, avg_shift)

    x = mfcc(time_signal, sample_rate=sample_rate,
             window_length=stft_win_len, window=stft_window,
             stft_shift=stft_shift, stft_size=stft_size,
             number_of_filters=number_of_filters,
             lowest_frequency=lowest_frequency,
             highest_frequency=highest_frequency,
             preemphasis_factor=preemphasis_factor,
             ceplifter=ceplifter, numcep=numcep)

    # Modulation analysis: STFT over the MFCC trajectories, magnitude only.
    x = np.abs(
        stft(x, size=mod_length, shift=mod_shift, window=mod_window,
             axis=-2, fading=False))

    if avg_length > 1:
        # Cut axis -3 into (padded) segments and average within each one.
        x = segment_axis(x, length=avg_length, shift=avg_shift,
                         end='pad', axis=-3)
        x = np.mean(x, axis=-3)
    return x