class Speech2Text2StandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (Speech2Text2Decoder, Speech2Text2ForCausalLM) if is_torch_available() else () all_generative_model_classes = (Speech2Text2ForCausalLM,) if is_torch_available() else () test_pruning = False def setUp( self, ): self.model_tester = Speech2Text2StandaloneDecoderModelTester(self, is_training=False) self.config_tester = ConfigTester(self, config_class=Speech2Text2Config) # not implemented currently def test_inputs_embeds(self): pass # speech2text2 has no base model def test_save_load_fast_init_from_base(self): pass # speech2text2 has no base model def test_save_load_fast_init_to_base(self): pass def test_config(self): self.config_tester.run_common_tests() def test_decoder_model_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) # decoder cannot keep gradients def test_retain_grad_hidden_states_attentions(self): return
class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( Speech2TextModel, Speech2TextForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = ( Speech2TextForConditionalGeneration, ) if is_torch_available() else () is_encoder_decoder = True test_pruning = False test_missing_keys = False test_torchscript = True input_name = "input_features" def setUp(self): self.model_tester = Speech2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Speech2TextConfig) self.maxDiff = 3000 def test_config(self): self.config_tester.run_common_tests() def test_save_load_strict(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() for model_class in self.all_model_classes: model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model2, info = model_class.from_pretrained( tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) def test_model_forward(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_forward(*config_and_inputs) def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs( *config_and_inputs) def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common( ) self.model_tester.check_encoder_decoder_model_standalone( *config_and_inputs) # not implemented currently def test_inputs_embeds(self): pass # training is not supported yet def test_training(self): pass def test_training_gradient_checkpointing(self): pass def test_generate_fp16(self): config, input_dict = self.model_tester.prepare_config_and_inputs() input_features = input_dict["input_features"] attention_mask = input_dict["attention_mask"] model = Speech2TextForConditionalGeneration(config).eval().to( torch_device) if torch_device == "cuda": input_features = input_features.half() model.half() model.generate(input_features, attention_mask=attention_mask) model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) signature = inspect.signature(model.forward) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] expected_arg_names = [ "input_features", "attention_mask", "decoder_input_ids", "decoder_attention_mask", ] expected_arg_names.extend([ "head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs" ] if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names else ["encoder_outputs"]) self.assertListEqual(arg_names[:len(expected_arg_names)], expected_arg_names) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), expected_num_layers) if hasattr(self.model_tester, "encoder_seq_length"): seq_length = self.model_tester.encoder_seq_length else: seq_length = self.model_tester.seq_length subsampled_seq_length = model._get_feat_extract_output_lengths( seq_length) self.assertListEqual( list(hidden_states[0].shape[-2:]), [subsampled_seq_length, self.model_tester.hidden_size], ) if config.is_encoder_decoder: hidden_states = outputs.decoder_hidden_states self.assertIsInstance(hidden_states, (list, tuple)) self.assertEqual(len(hidden_states), expected_num_layers) seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) self.assertListEqual( list(hidden_states[0].shape[-2:]), [decoder_seq_length, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.return_dict = True seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) model.to(torch_device) model.eval() subsampled_encoder_seq_length = model._get_feat_extract_output_lengths( encoder_seq_length) subsampled_encoder_key_length = model._get_feat_extract_output_lengths( encoder_key_length) with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length ], ) out_len = len(outputs) correct_outlen = 5 # loss is at first position if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned self.assertEqual(out_len, correct_outlen) # decoder attentions decoder_attentions = outputs.decoder_attentions self.assertIsInstance(decoder_attentions, (list, tuple)) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length ], ) # cross attentions cross_attentions = outputs.cross_attentions self.assertIsInstance(cross_attentions, (list, tuple)) self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(cross_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, decoder_seq_length, subsampled_encoder_key_length, ], ) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): outputs = model( **self._prepare_for_class(inputs_dict, model_class)) added_hidden_states = 2 self.assertEqual(out_len + added_hidden_states, len(outputs)) self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length ], ) def test_resize_tokens_embeddings(self): ( original_config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: return for model_class in self.all_model_classes: config = copy.deepcopy(original_config) model = model_class(config) model.to(torch_device) if self.model_tester.is_training is False: model.eval() model_vocab_size = config.vocab_size # Retrieve the embeddings and clone theme model_embed = model.resize_token_embeddings(model_vocab_size) cloned_embeddings = model_embed.weight.clone() # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size model_embed = model.resize_token_embeddings(model_vocab_size + 10) self.assertEqual(model.config.vocab_size, model_vocab_size + 10) # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size model_embed = model.resize_token_embeddings(model_vocab_size - 15) self.assertEqual(model.config.vocab_size, model_vocab_size - 15) # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) # make sure that decoder_input_ids are resized if "decoder_input_ids" in inputs_dict: inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) model(**self._prepare_for_class(inputs_dict, model_class)) # Check that adding and removing tokens has not modified the first part of the embedding matrix. models_equal = True for p1, p2 in zip(cloned_embeddings, model_embed.weight): if p1.data.ne(p2.data).sum() > 0: models_equal = False self.assertTrue(models_equal) def test_resize_embeddings_untied(self): ( original_config, inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: return original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: return for model_class in self.all_model_classes: config = copy.deepcopy(original_config) model = model_class(config).to(torch_device) # if no output embeddings -> leave test if model.get_output_embeddings() is None: continue # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size model_vocab_size = config.vocab_size model.resize_token_embeddings(model_vocab_size + 10) self.assertEqual(model.config.vocab_size, model_vocab_size + 10) output_embeds = model.get_output_embeddings() self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) # Check bias if present if output_embeds.bias is not None: self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size model.resize_token_embeddings(model_vocab_size - 15) self.assertEqual(model.config.vocab_size, model_vocab_size - 15) # Check that it actually resizes the embeddings matrix output_embeds = model.get_output_embeddings() self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) # Check bias if present if output_embeds.bias is not None: self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) # Check that the model can still do a forward pass successfully (every parameter should be resized) if "decoder_input_ids" in inputs_dict: inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) def test_generate_without_input_ids(self): pass @staticmethod def _get_encoder_outputs(model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1): encoder = model.get_encoder() encoder_outputs = encoder( input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) encoder_outputs[ "last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( num_interleave, dim=0) input_ids = input_ids[:, :, 0] input_ids = torch.zeros_like( input_ids[:, :1], dtype=torch.long) + model._get_decoder_start_token_id() attention_mask = None return encoder_outputs, input_ids, attention_mask def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): batch_size, seq_length = input_ids.shape[:2] subsampled_seq_length = self.model_tester.get_subsampled_output_lengths( seq_length) num_sequences_in_output = batch_size * num_return_sequences gen_len = (output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length) # scores self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) # Attentions # encoder self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, subsampled_seq_length) # decoder self._check_attentions_for_generate( num_sequences_in_output, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], config=config, use_cache=use_cache, ) # Hidden States # encoder self._check_encoder_hidden_states_for_generate( output.encoder_hidden_states, batch_size, config, subsampled_seq_length) # decoder self._check_hidden_states_for_generate( num_sequences_in_output, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], config=config, use_cache=use_cache, ) def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: return configs_no_init = _config_zero_init( config) # To be sure we have no Nan configs_no_init.torchscript = True for model_class in self.all_model_classes: model = model_class(config=configs_no_init) model.to(torch_device) model.eval() inputs = self._prepare_for_class(inputs_dict, model_class) try: model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward input_features = inputs["input_features"] attention_mask = inputs["attention_mask"] decoder_input_ids = inputs["decoder_input_ids"] decoder_attention_mask = inputs["decoder_attention_mask"] traced_model = torch.jit.trace( model, (input_features, attention_mask, decoder_input_ids, decoder_attention_mask)) except RuntimeError: self.fail("Couldn't trace module.") with tempfile.TemporaryDirectory() as tmp_dir_name: pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") try: torch.jit.save(traced_model, pt_file_name) except Exception: self.fail("Couldn't save module.") try: loaded_model = torch.jit.load(pt_file_name) except Exception: self.fail("Couldn't load module.") model.to(torch_device) model.eval() loaded_model.to(torch_device) loaded_model.eval() model_state_dict = model.state_dict() loaded_model_state_dict = loaded_model.state_dict() self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) models_equal = True for layer_name, p1 in model_state_dict.items(): p2 = loaded_model_state_dict[layer_name] if p1.data.ne(p2.data).sum() > 0: models_equal = False self.assertTrue(models_equal)
from transformers.file_utils import cached_property from transformers.testing_utils import ( is_torch_available, require_sentencepiece, require_tokenizers, require_torch, require_torchaudio, slow, torch_device, ) from ..generation.test_generation_utils import GenerationTesterMixin from ..test_configuration_common import ConfigTester from ..test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor if is_torch_available(): import torch from transformers import Speech2TextForConditionalGeneration, Speech2TextModel, Speech2TextProcessor from transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextDecoder, Speech2TextEncoder def prepare_speech_to_text_inputs_dict( config, input_features, decoder_input_ids, attention_mask=None, decoder_attention_mask=None, head_mask=None, decoder_head_mask=None, cross_attn_head_mask=None,