def test_audio_preprocessors(self):
    """Smoke-test audio preprocessors on a speech-label data layer.

    Always checks the mel-spectrogram feature count; when torchaudio is
    installed, additionally checks the spectrogram, MFCC, and time-stretch
    augmentation outputs. When torchaudio is missing, verifies that the
    torchaudio-backed preprocessors raise ``ModuleNotFoundError`` instead
    of degrading silently.

    NOTE(review): another method in this file is also named
    ``test_audio_preprocessors``; if both live in the same TestCase class
    the later definition shadows this one and only one test runs —
    confirm intent and rename one of them.
    """
    batch_size = 2
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=batch_size,
        # placement=DeviceType.GPU,
        drop_last=False,
        shuffle=False,
    )

    installed_torchaudio = True
    try:
        import torchaudio
    except ModuleNotFoundError:
        installed_torchaudio = False
        # Without torchaudio, constructing these preprocessors must fail
        # loudly with the same ModuleNotFoundError.
        with self.assertRaises(ModuleNotFoundError):
            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        with self.assertRaises(ModuleNotFoundError):
            to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

    if installed_torchaudio:
        to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
        # probability=1.0 so the augmentation always fires; rate bounded
        # to [0.9, 1.1] so the output-length assertions below are valid.
        time_stretch_augment = nemo_asr.TimeStretchAugmentation(
            self.featurizer_config['sample_rate'], probability=1.0, min_speed_rate=0.9, max_speed_rate=1.1
        )

    to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)

    for batch in dl.data_iterator:
        input_signals, seq_lengths, _, _ = batch
        input_signals = input_signals.to(to_melspec._device)
        seq_lengths = seq_lengths.to(to_melspec._device)

        melspec = to_melspec.forward(input_signals, seq_lengths)

        if installed_torchaudio:
            spec = to_spectrogram.forward(input_signals, seq_lengths)
            mfcc = to_mfcc.forward(input_signals, seq_lengths)
            ts_input_signals = time_stretch_augment.forward(input_signals, seq_lengths)

        # Check that number of features is what we expect
        self.assertTrue(melspec[0].shape[1] == 50)

        if installed_torchaudio:
            self.assertTrue(spec[0].shape[1] == 201)  # n_fft // 2 + 1 bins
            self.assertTrue(mfcc[0].shape[1] == 15)
            # Time-stretched length must stay within +/-15% of one second
            # of audio (speed rates 0.9-1.1 plus slack).
            timesteps = ts_input_signals[0].shape[1]
            self.assertTrue(timesteps <= int(1.15 * self.featurizer_config['sample_rate']))
            self.assertTrue(timesteps >= int(0.85 * self.featurizer_config['sample_rate']))
def test_audio_preprocessors(self):
    """Smoke-test audio preprocessors on an audio-to-text data layer.

    Always checks the mel-spectrogram feature count; when torchaudio is
    installed, additionally checks the spectrogram and MFCC feature
    counts. When torchaudio is missing, verifies that the
    torchaudio-backed preprocessors raise ``ModuleNotFoundError`` instead
    of degrading silently.

    NOTE(review): another method in this file is also named
    ``test_audio_preprocessors``; if both live in the same TestCase class
    the later definition shadows the earlier one and only one test runs —
    confirm intent and rename one of them.
    """
    batch_size = 5
    dl = nemo_asr.AudioToTextDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=batch_size,
        # placement=DeviceType.GPU,
        drop_last=True,
        shuffle=False,
    )

    installed_torchaudio = True
    try:
        import torchaudio
    except ModuleNotFoundError:
        installed_torchaudio = False
        # Without torchaudio, constructing these preprocessors must fail
        # loudly with the same ModuleNotFoundError.
        with self.assertRaises(ModuleNotFoundError):
            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        with self.assertRaises(ModuleNotFoundError):
            to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

    if installed_torchaudio:
        to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

    to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)

    for batch in dl.data_iterator:
        input_signals, seq_lengths, _, _ = batch
        input_signals = input_signals.to(to_melspec._device)
        seq_lengths = seq_lengths.to(to_melspec._device)

        melspec = to_melspec.forward(input_signals, seq_lengths)

        if installed_torchaudio:
            spec = to_spectrogram.forward(input_signals, seq_lengths)
            mfcc = to_mfcc.forward(input_signals, seq_lengths)

        # Check that number of features is what we expect
        self.assertTrue(melspec[0].shape[1] == 50)

        if installed_torchaudio:
            self.assertTrue(spec[0].shape[1] == 201)  # n_fft // 2 + 1 bins
            self.assertTrue(mfcc[0].shape[1] == 15)