示例#1
0
    def test_window_length(self):
        """Round-trip with ``window_length=400`` must match a 400/160 round-trip.

        The signal is transformed and reconstructed once with the positional
        arguments ``512, 160`` plus ``window_length=400`` and once with the
        positional arguments ``400, 160``; both reconstructions have to agree.
        The positional arguments are presumably ``size`` and ``shift`` --
        confirm against the ``stft``/``istft`` signatures.
        """
        spectrum = stft(self.x, 512, 160, window_length=400)
        reconstructed = istft(spectrum, 512, 160, window_length=400)

        # Reference reconstruction without a separate window length.
        reference = istft(stft(self.x, 400, 160), 400, 160)

        tc.assert_equal(spectrum.shape, (243, 257))
        tc.assert_allclose(reference, reconstructed, rtol=1e-6, atol=1e-6)
示例#2
0
    def test_restore_time_signal_with_str_window(self):
        """STFT followed by ISTFT restores the signal when the window is a name.

        The window is passed as the string ``'hann'`` to both transforms and
        ``num_samples`` tells ``istft`` how long the restored signal must be.
        """
        signal = self.x
        spectrum = stft(signal, window='hann')
        restored = istft(
            spectrum, 1024, 256, window='hann', num_samples=len(signal))

        tc.assert_almost_equal(signal, restored)
        tc.assert_equal(spectrum.shape, (154, 513))
示例#3
0
 def stft(self, x):
     """Apply the paderbox STFT to ``x`` using this instance's settings.

     Args:
         x: Input forwarded unchanged as the first argument of
             ``paderbox.transform.module_stft.stft``.

     Returns:
         Whatever the paderbox ``stft`` returns for ``x`` with
         ``size=self.stft_size``, ``shift=self.stft_shift`` and
         ``fading=self.stft_fading``.
     """
     # Imported inside the method (as before); the alias avoids confusion
     # with this method's own name.
     from paderbox.transform.module_stft import stft as _paderbox_stft

     stft_options = dict(
         size=self.stft_size,
         shift=self.stft_shift,
         fading=self.stft_fading,
     )
     return _paderbox_stft(x, **stft_options)
示例#4
0
 def test_compare_with_matlab(self):
     """Compare the Python STFT with the result of MATLAB's ``transform.stft``.

     The same signal is pushed into a MATLAB process as variable ``y``,
     transformed there, and the transposed result must be almost equal to
     the Python STFT computed with ``symmetric_window=True``.
     """
     signal = self.x
     python_result = stft(signal, symmetric_window=True)

     matlab = Mlab().process
     matlab.set_variable('y', signal)
     matlab.run_code('Y = transform.stft(y(:), 1024, 256, @blackman);')
     # Transposed, presumably so the MATLAB layout matches the Python one.
     matlab_result = matlab.get_variable('Y').T

     tc.assert_almost_equal(matlab_result, python_result)
示例#5
0
 def test_restore_time_signal_from_stft_and_istft_odd_parameter(self):
     """STFT/ISTFT round-trip with an odd size and randomly drawn settings.

     The shift and the window name are drawn at random on every run; the
     chosen settings are attached to the failure message so that a failing
     combination can be identified.
     """
     import random

     signal = self.x
     transform_kwargs = {
         'size': 151,  # deliberately uneven size
         'shift': np.random.randint(40, 100),
         'window': random.choice(['blackman', 'hann', 'hamming']),
         'fading': 'full',
     }

     spectrum = stft(signal, **transform_kwargs)
     restored = istft(
         spectrum, num_samples=signal.shape[-1], **transform_kwargs)

     assert restored.dtype == np.float64, (restored.dtype, signal.dtype)
     tc.assert_almost_equal(
         signal, restored, err_msg=str(transform_kwargs))
示例#6
0
    def test_batch_mode(self):
        """Batched STFT must reproduce the single-channel STFT on every slice.

        The signal is stacked along leading and trailing axes in several
        layouts. For each layout the output shape is checked, and every
        individual channel is compared against ``stft_single_channel``.
        """
        # Single-channel reference that every batched slice has to match.
        reference = stft_single_channel(self.x)

        # Two channels in front of the time axis (time axis is the last one).
        x1 = np.array([self.x, self.x])
        X1 = stft(x1)
        tc.assert_equal(X1.shape, (2, 154, 513))
        # Plain integer indices drop the channel axis directly, so no
        # ``squeeze`` is needed (``np.ndindex(2)`` would yield 1-tuples).
        for d in range(2):
            tc.assert_equal(X1[d, :, :], reference)

        # Two leading batch axes.
        x11 = np.array([x1, x1])
        X11 = stft(x11)
        tc.assert_equal(X11.shape, (2, 2, 154, 513))
        for d, k in np.ndindex(2, 2):
            tc.assert_equal(X11[d, k, :, :], reference)

        # Time axis first, channel axis last.
        x2 = x1.transpose()
        X2 = stft(x2, axis=0)
        tc.assert_equal(X2.shape, (154, 513, 2))
        for d in range(2):
            tc.assert_equal(X2[:, :, d], reference)

        # Time axis in the middle of a leading and a trailing batch axis.
        x21 = np.array([x2, x2])
        X21 = stft(x21, axis=1)
        tc.assert_equal(X21.shape, (2, 154, 513, 2))
        for d, k in np.ndindex(2, 2):
            tc.assert_equal(X21[d, :, :, k], reference)

        # Time axis first, followed by two batch axes.
        x22 = x21.swapaxes(0, 1)
        X22 = stft(x22, axis=0)
        tc.assert_equal(X22.shape, (154, 513, 2, 2))
        for d, k in np.ndindex(2, 2):
            tc.assert_equal(X22[:, :, d, k], reference)
示例#7
0
    def test_spectrogram_and_energy(self):
        """Spectrogram and per-frame energy are real, non-negative and well shaped.

        The STFT of the signal is converted with ``stft_to_spectrogram`` and
        the result is reduced with ``spectrogram_to_energy_per_frame``.
        """
        spectrum = stft(self.x)
        spec = stft_to_spectrogram(spectrum)
        frame_energy = spectrogram_to_energy_per_frame(spec)

        tc.assert_equal(spectrum.shape, (154, 513))

        # Both derived quantities get the same three checks, in this order:
        # expected shape, real-valued, element-wise >= 0.
        expectations = (
            (spec, (154, 513)),
            (frame_energy, (154, )),
        )
        for values, expected_shape in expectations:
            tc.assert_equal(values.shape, expected_shape)
            tc.assert_isreal(values)
            tc.assert_array_greater_equal(values, 0)
示例#8
0
    def test_restore_time_signal_from_stft_and_istft(self):
        """Default STFT followed by ``istft(X, 1024, 256)`` restores the signal.

        The reconstruction is truncated to the original length before the
        comparison.
        """
        signal = self.x
        spectrum = stft(signal)

        restored = istft(spectrum, 1024, 256)
        tc.assert_almost_equal(signal, restored[:len(signal)])
        tc.assert_equal(spectrum.shape, (154, 513))
示例#9
0
    def test_stft_frame_count(self):
        """Check the number of STFT frames around the frame-count boundaries.

        For each parameter set, random signals of several lengths are
        transformed and the resulting ``(frames, bins)`` shape is compared
        with the expected value.
        """
        # Each entry: (stft keyword arguments, number of frequency bins,
        #              [(signal length, expected number of frames), ...])
        scenarios = [
            (
                dict(size=1024, shift=256, fading=False),
                513,
                [(1023, 1), (1024, 1), (1025, 2)],
            ),
            (
                dict(size=1024, shift=256, fading=True),
                513,
                [(1023, 7), (1024, 7), (1025, 8)],
            ),
            (
                dict(size=512, shift=160, window_length=400, fading=False),
                257,
                [(399, 1), (400, 1), (401, 2),
                 (559, 2), (560, 2), (561, 3)],
            ),
        ]

        for stft_params, num_bins, length_to_frames in scenarios:
            for num_samples, num_frames in length_to_frames:
                signal = np.random.normal(size=[num_samples])
                spectrum = stft(signal, **stft_params)
                tc.assert_equal(spectrum.shape, (num_frames, num_bins))
示例#10
0
    def test_restore_time_signal_from_stft_and_istft_kaldi_params(self):
        """STFT/ISTFT round-trip with ``size=400`` and ``shift=160``.

        The reconstruction is truncated to the original length before the
        comparison.
        """
        signal = self.x
        spectrum = stft(signal, size=400, shift=160)

        restored = istft(spectrum, 400, 160)
        tc.assert_almost_equal(signal, restored[:len(signal)])
        tc.assert_equal(spectrum.shape, (243, 201))
示例#11
0
    def test_restore_time_signal_from_stft_and_istft_with_num_samples(self):
        """STFT/ISTFT round-trip where ``istft`` receives ``num_samples``.

        Instead of slicing the reconstruction afterwards, the original length
        is passed to ``istft`` via ``num_samples``.
        """
        signal = self.x
        spectrum = stft(signal)

        restored = istft(spectrum, 1024, 256, num_samples=len(signal))
        tc.assert_almost_equal(signal, restored)
        tc.assert_equal(spectrum.shape, (154, 513))
示例#12
0
def fbank(time_signal: np.ndarray,
          sample_rate: int = 16000,
          window_length: int = 400,
          stft_shift: int = 160,
          number_of_filters: int = 23,
          stft_size: int = 512,
          lowest_frequency: float = 0.,
          highest_frequency: Optional[float] = None,
          preemphasis_factor: float = 0.97,
          window: Callable = scipy.signal.windows.hamming,
          denoise: bool = False):
    """Compute Mel-filterbank energy features from an audio signal.

    Processing chain: preemphasis (with offset compensation) -> STFT ->
    spectrogram scaled by ``1 / stft_size`` -> Mel transform (``log=False``).

    Source: https://github.com/jameslyons/python_speech_features
    Tutorial: http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/ # noqa

    Illustrations: http://ntjenkins.upb.de/view/PythonToolbox/job/python_toolbox_notebooks/HTML_Report/toolbox_examples/transform/06%20-%20Additional%20features.html

    Args:
        time_signal: the audio signal from which to compute features.
            Should be an N*1 array.
        sample_rate: the sample rate of the signal we are working with.
        window_length: the length of the analysis window in samples.
            Default is 400 (25 milliseconds @ 16kHz).
        stft_shift: the step between successive windows in samples.
            Default is 160 (10 milliseconds @ 16kHz).
        number_of_filters: the number of filters in the filterbank,
            default 23.
        stft_size: the FFT size. Default is 512.
        lowest_frequency: lowest band edge of the mel filters.
            In Hz, default is 0.
        highest_frequency: highest band edge of the mel filters in Hz.
            ``None`` (or any other falsy value such as 0) selects
            ``sample_rate / 2``.
        preemphasis_factor: coefficient of the preemphasis filter.
            0 is no filter. Default is 0.97.
        window: window function used for the STFT.
        denoise: if True, the minimum taken over axis 0 of the feature
            array is subtracted from the features (in place).

    Returns: A numpy array of size (frames by number_of_filters) containing
        the Mel filterbank features.

    """
    if not highest_frequency:
        # Same fallback as ``highest_frequency or sample_rate / 2``.
        highest_frequency = sample_rate / 2

    emphasized_signal = preemphasis_with_offset_compensation(
        time_signal, preemphasis_factor)

    stft_options = dict(
        size=stft_size,
        shift=stft_shift,
        window=window,
        window_length=window_length,
        fading=None,
    )
    spectrogram = stft_to_spectrogram(
        stft(emphasized_signal, **stft_options)) / stft_size

    to_mel = MelTransform(
        sample_rate=sample_rate,
        stft_size=stft_size,
        number_of_filters=number_of_filters,
        lowest_frequency=lowest_frequency,
        highest_frequency=highest_frequency,
        log=False,
    )
    feature = to_mel(spectrogram)

    if denoise:
        feature -= np.min(feature, axis=0)
    return feature
示例#13
0
def modmfcc(time_signal,
            sample_rate=16000,
            stft_win_len=400,
            stft_shift=160,
            numcep=30,
            number_of_filters=40,
            stft_size=512,
            lowest_frequency=0,
            highest_frequency=None,
            preemphasis_factor=0.97,
            ceplifter=22,
            stft_window=scipy.signal.windows.hamming,
            mod_length=16,
            mod_shift=8,
            mod_window=scipy.signal.windows.hamming,
            avg_length=1,
            avg_shift=1):
    """
    Compute Mod-MFCC features from an audio signal.

    MFCC features are computed first; a second STFT is then applied along
    axis ``-2`` of the MFCC array and its magnitude is taken. Optionally the
    result is averaged over segments along axis ``-3``.

    :param time_signal: the audio signal from which to compute features.
        Should be an channels x samples array.
    :param sample_rate: the sample rate of the signal we are working with.
        Default is 16000.
    :param stft_win_len: the length of the analysis window. In samples.
        Default is 400 (25 milliseconds @ 16kHz).
    :param stft_shift: the step between successive windows. In samples.
        Default is 160 (10 milliseconds @ 16kHz).
    :param numcep: the number of cepstrum to return, Default is 30.
    :param number_of_filters: number of filters in the filterbank,
        Default is 40.
    :param stft_size: the FFT size. Default is 512.
    :param lowest_frequency: lowest band edge of mel filters. In Hz,
        Default is 0.
    :param highest_frequency: highest band edge of mel filters. In Hz,
        Default is samplerate/2.
    :param preemphasis_factor: apply preemphasis filter with preemphasis_factor
        as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: the liftering coefficient to use.
        ceplifter <= 0 disables lifter.
        Default is 22.
    :param stft_window: the window function to use for fbank features. Default is
        hamming window.
    :param mod_length: ``size`` of the second STFT that is applied to the
        MFCC features. Default is 16.
    :param mod_shift: ``shift`` of the second STFT. Default is 8.
    :param mod_window: the window function of the second STFT. Default is
        hamming window.
    :param avg_length: segment length used for averaging along axis ``-3``.
        Averaging is only performed if ``avg_length > 1``. Default is 1.
    :param avg_shift: shift between successive averaging segments. Must not
        exceed ``avg_length``. Default is 1.
    :returns: A numpy array with the magnitudes of the second STFT
        (averaged if ``avg_length > 1``).
        NOTE(review): the exact axis layout depends on ``stft`` and
        ``segment_axis`` -- confirm before relying on a specific shape.
    :raises AssertionError: if ``avg_length < avg_shift``.
    """
    mfcc_features = mfcc(time_signal,
                         sample_rate=sample_rate,
                         window_length=stft_win_len,
                         window=stft_window,
                         stft_shift=stft_shift,
                         stft_size=stft_size,
                         number_of_filters=number_of_filters,
                         lowest_frequency=lowest_frequency,
                         highest_frequency=highest_frequency,
                         preemphasis_factor=preemphasis_factor,
                         ceplifter=ceplifter,
                         numcep=numcep)

    # Second STFT along axis -2 of the MFCC array; only the magnitude is kept.
    x = np.abs(
        stft(mfcc_features,
             size=mod_length,
             shift=mod_shift,
             window=mod_window,
             axis=-2,
             fading=False))

    # Kept as an assert so callers see the same exception type as before.
    assert avg_length >= avg_shift
    if avg_length > 1:
        # Cut axis -3 into (padded) segments and average over axis -3 of the
        # segmented array.
        x = segment_axis(x,
                         length=avg_length,
                         shift=avg_shift,
                         end='pad',
                         axis=-3)
        x = np.mean(x, axis=-3)
    return x