def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = self.get_config()

    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
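# A note on the helpers used above: `ids_tensor`, `floats_tensor`, and
# `random_attention_mask` are the random-tensor utilities from the Transformers test
# suite. The sketch below only illustrates what they roughly do, in plain PyTorch;
# the signatures are inferred from the calls above and may not match the exact
# library version.
import math
import random

import torch


def ids_tensor(shape, vocab_size, rng=None):
    """Random int64 tensor of the given shape with values in [0, vocab_size)."""
    rng = rng or random.Random()
    values = [rng.randint(0, vocab_size - 1) for _ in range(math.prod(shape))]
    return torch.tensor(values, dtype=torch.long).view(*shape)


def floats_tensor(shape, scale=1.0, rng=None):
    """Random float32 tensor of the given shape with values in [0, scale)."""
    rng = rng or random.Random()
    values = [rng.random() * scale for _ in range(math.prod(shape))]
    return torch.tensor(values, dtype=torch.float).view(*shape)


def random_attention_mask(shape, rng=None):
    """Random 0/1 mask; the last position is forced to 1 so no row is fully masked."""
    mask = ids_tensor(shape, vocab_size=2, rng=rng)
    mask[:, -1] = 1
    return mask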
def check_ctc_training(self, config, input_values, *args):
    config.ctc_zero_infinity = True
    model = Data2VecAudioForCTC(config=config)
    model.to(torch_device)
    model.train()

    # freeze feature encoder
    model.freeze_feature_encoder()

    input_values = input_values[:3]

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size)

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0

        if max_length_labels[i] < labels.shape[-1]:
            # it's important that we make sure that target lengths are at least
            # one shorter than logit lengths to prevent -inf
            labels[i, max_length_labels[i] - 1:] = -100

    loss = model(input_values, labels=labels).loss
    self.parent.assertFalse(torch.isinf(loss).item())

    loss.backward()
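# `_get_feat_extract_output_lengths` above maps raw-waveform lengths to the number of
# frames produced by the convolutional feature encoder, which bounds how long the CTC
# labels may be. A rough standalone sketch of that computation, assuming
# Wav2Vec2-style kernel/stride settings (the defaults below are illustrative, borrowed
# from the base Wav2Vec2 feature encoder, not from any specific config in these tests):
import torch


def feat_extract_output_lengths(
    input_lengths, conv_kernel=(10, 3, 3, 3, 3, 2, 2), conv_stride=(5, 2, 2, 2, 2, 2, 2)
):
    """Apply the 1D-conv length formula layer by layer: floor((length - kernel) / stride) + 1."""
    lengths = torch.as_tensor(input_lengths)
    for kernel, stride in zip(conv_kernel, conv_stride):
        lengths = torch.div(lengths - kernel, stride, rounding_mode="floor") + 1
    return lengths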
def check_seq_classifier_loss(self, config, input_values, *args):
    model = Data2VecAudioForSequenceClassification(config=config)
    model.to(torch_device)

    # make sure that dropout is disabled
    model.eval()

    input_values = input_values[:3]
    attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0
        attention_mask[i, input_lengths[i]:] = 0

    masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
    unmasked_loss = model(input_values, labels=labels).loss.item()

    self.parent.assertTrue(isinstance(masked_loss, float))
    self.parent.assertTrue(isinstance(unmasked_loss, float))
    self.parent.assertTrue(masked_loss != unmasked_loss)
def check_ctc_loss(self, config, input_values, *args):
    model = SEWDForCTC(config=config)
    model.to(torch_device)

    # make sure that dropout is disabled
    model.eval()

    input_values = input_values[:3]
    attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0
        attention_mask[i, input_lengths[i]:] = 0

    model.config.ctc_loss_reduction = "sum"
    sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

    model.config.ctc_loss_reduction = "mean"
    mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

    self.parent.assertTrue(isinstance(sum_loss, float))
    self.parent.assertTrue(isinstance(mean_loss, float))
def check_xvector_training(self, config, *args):
    config.ctc_zero_infinity = True
    model = UniSpeechSatForXVector(config=config)
    model.to(torch_device)
    model.train()

    # freeze everything but the classification head
    model.freeze_base_model()

    # use a longer sequence length to account for TDNN temporal downsampling
    input_values = floats_tensor([self.batch_size, self.seq_length * 2], self.vocab_size)

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0

    loss = model(input_values, labels=labels).loss
    self.parent.assertFalse(torch.isinf(loss).item())

    loss.backward()
def prepare_config_and_inputs_for_decoder(self):
    (
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
    ) = self.prepare_config_and_inputs()

    config.is_decoder = True
    encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
    encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    return (
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        encoder_hidden_states,
        encoder_attention_mask,
    )
def test_retain_grad_hidden_states_attentions(self):
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.output_hidden_states = True
    config.output_attentions = True

    # no need to test all models as different heads yield the same functionality
    model_class = self.all_model_classes[0]
    model = model_class(config)
    model.to(torch_device)

    # set layer drop to 0
    model.config.layerdrop = 0.0

    input_values = inputs_dict["input_values"]

    input_lengths = torch.tensor(
        [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device
    )
    output_lengths = model._get_feat_extract_output_lengths(input_lengths)

    labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size)
    inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
    inputs_dict["labels"] = labels

    outputs = model(**inputs_dict)

    output = outputs[0]

    # Encoder-/Decoder-only models
    hidden_states = outputs.hidden_states[0]
    hidden_states.retain_grad()

    output.flatten()[0].backward(retain_graph=True)

    self.assertIsNotNone(hidden_states.grad)
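# The test above relies on `retain_grad()`: by default only leaf tensors keep their
# gradients after backward(), so the intermediate hidden states need retain_grad() for
# the final assertion to observe a non-None .grad. A minimal, model-independent
# illustration in plain PyTorch:
import torch

x = torch.randn(2, 3, requires_grad=True)  # leaf tensor
h = x * 2                                  # non-leaf intermediate
h.retain_grad()                            # keep the intermediate's gradient
loss = h.sum()
loss.backward()
assert x.grad is not None                  # leaf gradients are always kept
assert h.grad is not None                  # kept only because of retain_grad()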
def test_save_load_custom_head(self):
    model_name = "bert-base-uncased"
    model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead})
    model1 = AutoModelWithHeads.from_pretrained(model_name, config=model_config)
    model2 = AutoModelWithHeads.from_pretrained(model_name, config=model_config)
    config = {"head_type": "tag", "num_labels": 3, "layers": 2, "activation_function": "tanh"}
    model1.add_custom_head("custom_head", config)

    with tempfile.TemporaryDirectory() as temp_dir:
        model1.save_head(temp_dir, "custom_head")
        model2.load_head(temp_dir)

    model1.eval()
    model2.eval()

    in_data = ids_tensor((1, 128), 1000)
    output1 = model1(in_data)
    output2 = model2(in_data)
    self.assertEqual(output1[0].size(), output2[0].size())

    # compare the head modules themselves, not the head config entries
    state1 = model1.heads["custom_head"].state_dict()
    state2 = model2.heads["custom_head"].state_dict()
    for (k1, v1), (k2, v2) in zip(state1.items(), state2.items()):
        self.assertTrue(torch.equal(v1, v2))
def test_custom_head_from_model_config(self):
    model_name = "bert-base-uncased"
    model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead})
    model = AutoModelWithHeads.from_pretrained(model_name, config=model_config)
    config = {"head_type": "tag", "num_labels": 3, "layers": 2, "activation_function": "tanh"}
    model.add_custom_head("custom_head", config)
    model.eval()

    in_data = ids_tensor((1, 128), 1000)
    output1 = model(in_data)

    model.add_tagging_head("tagging_head", num_labels=3, layers=2)
    output2 = model(in_data)

    self.assertEqual(output1[0].size(), output2[0].size())
def check_labels_out_of_vocab(self, config, input_values, *args):
    model = SEWDForCTC(config)
    model.to(torch_device)
    model.train()

    input_values = input_values[:3]

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100)

    with pytest.raises(ValueError):
        model(input_values, labels=labels)
def check_seq_classifier_training(self, config, input_values, *args):
    config.ctc_zero_infinity = True
    model = SEWDForSequenceClassification(config=config)
    model.to(torch_device)
    model.train()

    # freeze everything but the classification head
    model.freeze_base_model()

    input_values = input_values[:3]

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0

    loss = model(input_values, labels=labels).loss
    self.parent.assertFalse(torch.isinf(loss).item())

    loss.backward()
def create_and_check_decoder_model_past_large_inputs(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    encoder_hidden_states,
    encoder_attention_mask,
):
    config.is_decoder = True
    config.add_cross_attention = True
    model = Data2VecTextForCausalLM(config=config).to(torch_device).eval()

    # make sure that ids don't start with pad token
    mask = input_ids.ne(config.pad_token_id).long()
    input_ids = input_ids * mask

    # first forward pass
    outputs = model(
        input_ids,
        attention_mask=input_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        use_cache=True,
    )
    past_key_values = outputs.past_key_values

    # create hypothetical multiple next tokens to extend next_input_ids with
    next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)

    # make sure that ids don't start with pad token
    mask = next_tokens.ne(config.pad_token_id).long()
    next_tokens = next_tokens * mask
    next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)

    # append to next input_ids and attention mask
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)

    output_from_no_past = model(
        next_input_ids,
        attention_mask=next_attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        output_hidden_states=True,
    )["hidden_states"][0]
    output_from_past = model(
        next_tokens,
        attention_mask=next_attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        past_key_values=past_key_values,
        output_hidden_states=True,
    )["hidden_states"][0]

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

    self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))