# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import os import shutil import tempfile import unittest from transformers import MCTCTProcessor, is_speech_available, is_torch_available from transformers.file_utils import FEATURE_EXTRACTOR_NAME from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizer from transformers.testing_utils import require_torch, require_torchaudio if is_speech_available() and is_torch_available(): from transformers import MCTCTFeatureExtractor from .test_feature_extraction_mctct import floats_list @require_torch @require_torchaudio class MCTCTProcessorTest(unittest.TestCase): def setUp(self): vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split( " ") vocab_tokens = dict(zip(vocab, range(len(vocab)))) self.add_kwargs_tokens_map = { "pad_token": "<pad>",
import shutil import tempfile import unittest from pathlib import Path from shutil import copyfile from transformers import Speech2TextTokenizer, is_speech_available from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, require_torchaudio from transformers.utils import FEATURE_EXTRACTOR_NAME from .test_feature_extraction_speech_to_text import floats_list if is_speech_available(): from transformers import Speech2TextFeatureExtractor, Speech2TextProcessor SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") @require_torch @require_torchaudio @require_sentencepiece class Speech2TextProcessorTest(unittest.TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() vocab = ["<s>", "<pad>", "</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est"] vocab_tokens = dict(zip(vocab, range(len(vocab))))
class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for ``Speech2TextFeatureExtractor``: batched/un-batched calls and
    cepstral mean and variance normalization (CMVN) under the different
    padding/truncation strategies."""

    feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None

    def setUp(self):
        self.feat_extract_tester = Speech2TextFeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector, var_tol=1e-3):
        # After CMVN each feature dimension should have ~0 mean and ~1 variance.
        # `var_tol` is loosened by callers for un-padded inputs, where the
        # variance estimate is less accurate.
        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < var_tol))

    def test_call(self):
        # Tests that all call wrap to encode_plus and batch_encode_plus
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
        self.assertTrue(input_features.ndim == 3)
        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)

        # Test not batched input: list input and numpy input must match
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_cepstral_mean_and_variance_normalization(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 16, None]
        # "do_not_pad" (non-numpified) normalization is less accurate, hence the
        # larger variance tolerance for that case.
        var_tolerances = [1e-3, 1e-3, 1e-1]
        for max_length, padding, var_tol in zip(max_lengths, paddings, var_tolerances):
            inputs = feature_extractor(
                speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            # only the attended (un-padded) frames must be normalized
            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]], var_tol)
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]], var_tol)
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]], var_tol)

    def test_cepstral_mean_and_variance_normalization_np(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 16, None]
        var_tolerances = [1e-3, 1e-3, 1e-1]
        for max_length, padding, var_tol in zip(max_lengths, paddings, var_tolerances):
            inputs = feature_extractor(
                speech_inputs,
                max_length=max_length,
                padding=padding,
                return_tensors="np",
                return_attention_mask=True,
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]], var_tol)
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]], var_tol)
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]], var_tol)

    def test_cepstral_mean_and_variance_normalization_trunc(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        inputs = feature_extractor(
            speech_inputs,
            padding="max_length",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        # inputs 1 and 2 were truncated down to max_length, so every frame is attended
        self._check_zero_mean_unit_variance(input_features[1])
        self._check_zero_mean_unit_variance(input_features[2])
class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for ``MCTCTFeatureExtractor``: batched/un-batched calls, cepstral
    mean and variance normalization (CMVN) under the different padding and
    truncation strategies, and dtype handling in ``pad``."""

    feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None

    def setUp(self):
        self.feat_extract_tester = MCTCTFeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector):
        # After CMVN the attended frames should have ~0 mean and ~1 variance.
        self.assertTrue(np.all(np.mean(input_vector) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector) - 1) < 1e-3))

    def _check_double_precision_pad(self, feature_extractor):
        # `pad` must down-cast float64 inputs to float32 for both the numpy and
        # the torch return paths.
        import torch

        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()
        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
            self.assertTrue(np_processed.input_features.dtype == np.float32)
            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
            self.assertTrue(pt_processed.input_features.dtype == torch.float32)

    def test_call(self):
        # Tests that all call wrap to encode_plus and batch_encode_plus
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 8000, 10000, and 12000
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
        self.assertTrue(input_features.ndim == 3)
        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)

        # Test not batched input: list input and numpy input must match
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_cepstral_mean_and_variance_normalization(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 16, None]
        for max_length, padding in zip(max_lengths, paddings):
            inputs = feature_extractor(
                speech_inputs,
                padding=padding,
                max_length=max_length,
                return_attention_mask=True,
                truncation=max_length is not None,  # reference to #16419
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            # only the attended (un-padded) frames must be normalized
            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])

    def test_cepstral_mean_and_variance_normalization_np(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 16, None]
        for max_length, padding in zip(max_lengths, paddings):
            inputs = feature_extractor(
                speech_inputs,
                max_length=max_length,
                padding=padding,
                return_tensors="np",
                return_attention_mask=True,
                truncation=max_length is not None,
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
            # frames past the attended length are padding and must be (near) zero
            self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
            # fix: the original re-checked input_features[0] here (copy-paste bug);
            # example 1's padded region is the one that must be zero
            self.assertTrue(input_features[1][fbank_feat_lengths[1] :].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])

    def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        inputs = feature_extractor(
            speech_inputs,
            padding="max_length",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        # inputs 1 and 2 were truncated down to max_length, so every frame is attended
        self._check_zero_mean_unit_variance(input_features[1])
        self._check_zero_mean_unit_variance(input_features[2])

    def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        inputs = feature_extractor(
            speech_inputs,
            padding="longest",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
        self._check_zero_mean_unit_variance(input_features[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_features.shape, (3, 4, 24))

        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        inputs = feature_extractor(
            speech_inputs,
            padding="longest",
            max_length=16,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
        self._check_zero_mean_unit_variance(input_features[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_features.shape, (3, 16, 24))

    def test_double_precision_pad(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        self._check_double_precision_pad(feature_extractor)

    def test_different_window(self):
        # same dtype contract must hold with a non-default window function
        init_dict = self.feat_extract_tester.prepare_feat_extract_dict()
        init_dict["win_function"] = "hann_window"
        feature_extractor = self.feature_extraction_class(**init_dict)
        self._check_double_precision_pad(feature_extractor)
class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for ``Speech2TextFeatureExtractor``: batched/un-batched calls and
    cepstral mean and variance normalization (CMVN) under the different
    padding/truncation strategies."""

    feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None

    def setUp(self):
        self.feat_extract_tester = Speech2TextFeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector, var_tol=1e-3):
        # After CMVN each feature dimension should have ~0 mean and ~1 variance.
        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < var_tol))

    def test_call(self):
        # Tests that all call wrap to encode_plus and batch_encode_plus
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
        self.assertTrue(input_features.ndim == 3)
        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)

        # Test not batched input: list input and numpy input must match
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_cepstral_mean_and_variance_normalization(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        # TODO(Patrick, Suraj, Anton) - It's surprising that "non-padded/non-numpified" padding
        # results in quite inaccurate variance computation after (see 5e-1 tolerance)
        # Issue is filed and PR is underway: https://github.com/huggingface/transformers/issues/13539
        # paddings = ["longest", "max_length", "do_not_pad"]
        # max_lengths = [None, 16, None]
        # var_tolerances = [1e-3, 1e-3, 5e-1]
        paddings = ["longest", "max_length"]
        max_lengths = [None, 16]
        var_tolerances = [1e-3, 1e-3]
        for max_length, padding, var_tol in zip(max_lengths, paddings, var_tolerances):
            inputs = feature_extractor(
                speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            # only the attended (un-padded) frames must be normalized
            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]], var_tol)
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]], var_tol)
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]], var_tol)

    def test_cepstral_mean_and_variance_normalization_np(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        # TODO(Patrick, Suraj, Anton) - It's surprising that "non-padded/non-numpified" padding
        # results in quite inaccurate variance computation after (see 5e-1 tolerance)
        # Issue is filed and PR is underway: https://github.com/huggingface/transformers/issues/13539
        # paddings = ["longest", "max_length", "do_not_pad"]
        # max_lengths = [None, 16, None]
        # var_tolerances = [1e-3, 1e-3, 5e-1]
        paddings = ["longest", "max_length"]
        max_lengths = [None, 16]
        var_tolerances = [1e-3, 1e-3]
        for max_length, padding, var_tol in zip(max_lengths, paddings, var_tolerances):
            inputs = feature_extractor(
                speech_inputs, max_length=max_length, padding=padding, return_tensors="np", return_attention_mask=True
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]], var_tol)
            # frames past the attended length are padding and must be (near) zero
            self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]], var_tol)
            # fix: the original checked input_features[0] here (copy-paste bug);
            # example 1's padded region is the one that must be zero
            self.assertTrue(input_features[1][fbank_feat_lengths[1] :].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]], var_tol)

    def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        inputs = feature_extractor(
            speech_inputs,
            padding="max_length",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        # inputs 1 and 2 were truncated down to max_length, so every frame is attended
        self._check_zero_mean_unit_variance(input_features[1])
        self._check_zero_mean_unit_variance(input_features[2])

    def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        inputs = feature_extractor(
            speech_inputs,
            padding="longest",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
        self._check_zero_mean_unit_variance(input_features[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_features.shape, (3, 4, 24))

        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        inputs = feature_extractor(
            speech_inputs,
            padding="longest",
            max_length=16,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
        self._check_zero_mean_unit_variance(input_features[2])

        # fix: longest input here is 6 frames, i.e. max_length > longest, so the
        # batch is padded to the longest input (not to max_length); the original
        # comment claimed the opposite case
        self.assertEqual(input_features.shape, (3, 6, 24))