import librosa
import scipy.fftpack
import torch
import torchaudio
import torchaudio.functional as F
from torchaudio_unittest import common_utils  # test helpers; import path assumed

class TestLibrosaCompatibility(common_utils.TorchaudioTestCase):  # wrapper class assumed
    def test_vad(self):
        common_utils.set_audio_backend('default')
        filepath = common_utils.get_asset_path("vad-go-mono-32000.wav")
        waveform, sample_rate = torchaudio.load(filepath)
        self.assert_batch_consistencies(F.vad, waveform,
                                        sample_rate=sample_rate)
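
    # `assert_batch_consistencies` is referenced above but not defined in this
    # snippet. A minimal sketch of what such a helper might do follows; the
    # name, signature, and batch size are assumptions, not torchaudio's actual
    # test helper.
    def assert_batch_consistencies(self, functional, waveform, **kwargs):
        batch_size = 3
        # Repeat the waveform along a new batch dimension, then compare the
        # batched call against per-example calls of the same functional.
        batch = waveform.unsqueeze(0).repeat(batch_size, 1, 1)
        expected = torch.stack(
            [functional(batch[i], **kwargs) for i in range(batch_size)])
        actual = functional(batch, **kwargs)
        self.assertEqual(actual, expected, atol=1e-5, rtol=1e-5)
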
    def assert_compatibilities(self, n_fft, hop_length, power, n_mels, n_mfcc,
                               sample_rate):
        common_utils.set_audio_backend('default')
        path = common_utils.get_asset_path('sinewave.wav')
        sound, sample_rate = common_utils.load_wav(path)
        sound_librosa = sound.cpu().numpy().squeeze()  # (64000)

        # test core spectrogram
        spect_transform = torchaudio.transforms.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=power)
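        # librosa's private _spectrogram helper returns a (spectrogram, n_fft)
        # tuple computed as |STFT|**power, which is what Spectrogram produces.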
        out_librosa, _ = librosa.core.spectrum._spectrogram(
            y=sound_librosa, n_fft=n_fft, hop_length=hop_length, power=power)

        out_torch = spect_transform(sound).squeeze().cpu()
        self.assertEqual(out_torch,
                         torch.from_numpy(out_librosa),
                         atol=1e-5,
                         rtol=1e-5)

        # test mel spectrogram
        melspect_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            window_fn=torch.hann_window,
            hop_length=hop_length,
            n_mels=n_mels,
            n_fft=n_fft)
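        # htk=True and norm=None make librosa build the same mel filterbank
        # that torchaudio.transforms.MelSpectrogram uses by default.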
        librosa_mel = librosa.feature.melspectrogram(y=sound_librosa,
                                                     sr=sample_rate,
                                                     n_fft=n_fft,
                                                     hop_length=hop_length,
                                                     n_mels=n_mels,
                                                     htk=True,
                                                     norm=None)
        librosa_mel_tensor = torch.from_numpy(librosa_mel)
        torch_mel = melspect_transform(sound).squeeze().cpu()
        self.assertEqual(torch_mel.type(librosa_mel_tensor.dtype),
                         librosa_mel_tensor,
                         atol=5e-3,
                         rtol=1e-5)

        # test AmplitudeToDB (spectrogram -> dB scale)
        power_to_db_transform = torchaudio.transforms.AmplitudeToDB(
            'power', 80.)
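        # 'power' selects 10 * log10 scaling and 80. clamps the dynamic range
        # to 80 dB (top_db), matching librosa.power_to_db's default top_db.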
        power_to_db_torch = power_to_db_transform(
            spect_transform(sound)).squeeze().cpu()
        power_to_db_librosa = librosa.core.spectrum.power_to_db(out_librosa)
        self.assertEqual(power_to_db_torch,
                         torch.from_numpy(power_to_db_librosa),
                         atol=5e-3,
                         rtol=1e-5)

        mag_to_db_transform = torchaudio.transforms.AmplitudeToDB(
            'magnitude', 80.)
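        # librosa.amplitude_to_db takes the magnitude of its input internally,
        # so torch.abs(sound) keeps the two inputs comparable.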
        mag_to_db_torch = mag_to_db_transform(torch.abs(sound)).squeeze().cpu()
        mag_to_db_librosa = librosa.core.spectrum.amplitude_to_db(
            sound_librosa)
        self.assertEqual(mag_to_db_torch,
                         torch.from_numpy(mag_to_db_librosa),
                         atol=5e-3,
                         rtol=1e-5)

        power_to_db_torch = power_to_db_transform(
            melspect_transform(sound)).squeeze().cpu()
        db_librosa = librosa.core.spectrum.power_to_db(librosa_mel)
        db_librosa_tensor = torch.from_numpy(db_librosa)
        self.assertEqual(power_to_db_torch.type(db_librosa_tensor.dtype),
                         db_librosa_tensor,
                         atol=5e-3,
                         rtol=1e-5)

        # test MFCC
        melkwargs = {'hop_length': hop_length, 'n_fft': n_fft}
        mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate,
                                                    n_mfcc=n_mfcc,
                                                    norm='ortho',
                                                    melkwargs=melkwargs)

        # librosa.feature.mfcc cannot forward all of these kwargs cleanly,
        # since melspectrogram and mfcc share some keyword names. We instead
        # follow the function body in
        # https://librosa.github.io/librosa/_modules/librosa/feature/spectral.html#melspectrogram
        # and mirror the call it would make with the correct args:
        #
        # librosa_mfcc = librosa.feature.mfcc(
        #     y=sound_librosa, sr=sample_rate, n_mfcc = n_mfcc,
        #     hop_length=hop_length, n_fft=n_fft, htk=True, norm=None, n_mels=n_mels)

        librosa_mfcc = scipy.fftpack.dct(db_librosa,
                                         axis=0,
                                         type=2,
                                         norm='ortho')[:n_mfcc]
        librosa_mfcc_tensor = torch.from_numpy(librosa_mfcc)
        torch_mfcc = mfcc_transform(sound).squeeze().cpu()

        self.assertEqual(torch_mfcc.type(librosa_mfcc_tensor.dtype),
                         librosa_mfcc_tensor,
                         atol=5e-3,
                         rtol=1e-5)
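
    # A hypothetical driver for the helper above. The parameter values are
    # illustrative assumptions, not the project's actual test grid.
    def test_compatibilities(self):
        self.assert_compatibilities(n_fft=400, hop_length=200, power=2.0,
                                    n_mels=128, n_mfcc=40, sample_rate=16000)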