def check_save_and_load_encoder_decoder_model(
    self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs
):
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()
    with torch.no_grad():
        outputs = enc_dec_model(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )
        out_2 = outputs[0].cpu().numpy()
        out_2[np.isnan(out_2)] = 0

        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
            enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname)
            enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname)
            # Reload the model from the saved encoder/decoder checkpoints and run it on the same inputs,
            # so the comparison below actually exercises the save/load round trip.
            enc_dec_model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_pretrained_model_name_or_path=encoder_tmp_dirname,
                decoder_pretrained_model_name_or_path=decoder_tmp_dirname,
            )
            enc_dec_model.to(torch_device)

            after_outputs = enc_dec_model(
                pixel_values=pixel_values,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_1 = after_outputs[0].cpu().numpy()
            out_1[np.isnan(out_1)] = 0
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)
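# A minimal illustrative sketch of how the helper above is typically driven by a test
# method; the test name and the `prepare_config_and_inputs` helper are assumptions
# based on the usual test-mixin pattern, not part of the code shown here.
def test_save_and_load_from_encoder_decoder_pretrained(self):
    config_inputs_dict = self.prepare_config_and_inputs()
    self.check_save_and_load_encoder_decoder_model(**config_inputs_dict)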
def check_equivalence_tf_to_pt(self, config, decoder_config, inputs_dict):
    encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

    # Using `_tf_model`, the test will fail, because the weights of `_tf_model` get extended before saving
    # the encoder/decoder models.
    # There was a (very) ugly potential fix, which wasn't integrated into `transformers`: see
    # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
    # (the change in `src/transformers/modeling_tf_utils.py`)
    _tf_model = TFVisionEncoderDecoderModel(encoder_decoder_config)
    # Make sure the model is built
    _tf_model(**inputs_dict)

    # Using `tf_model` to pass the test.
    encoder = _tf_model.encoder.__class__(encoder_decoder_config.encoder)
    decoder = _tf_model.decoder.__class__(encoder_decoder_config.decoder)
    # Make sure the models are built
    encoder(encoder.dummy_inputs)
    decoder(decoder.dummy_inputs)
    tf_model = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        tf_model.encoder.save_pretrained(encoder_tmp_dirname)
        tf_model.decoder.save_pretrained(decoder_tmp_dirname)
        pt_model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_tf=True, decoder_from_tf=True
        )
        # This is only for copying some specific attributes of this particular model.
        pt_model.config = tf_model.config

        self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
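# A minimal sketch of what the `check_pt_tf_equivalence` helper referenced above could
# look like; this is an assumption for illustration only. The suite's real helper may
# compare more outputs, whereas this version only checks that the PyTorch and
# TensorFlow logits agree within a small tolerance.
def check_pt_tf_equivalence(self, pt_model, tf_model, inputs_dict):
    pt_model.to(torch_device)
    pt_model.eval()

    # Convert the TF inputs to PyTorch tensors (ids/masks become int64 for torch embeddings).
    pt_inputs = {}
    for name, value in inputs_dict.items():
        tensor = torch.from_numpy(value.numpy())
        if tensor.dtype in (torch.int32, torch.int64):
            tensor = tensor.long()
        pt_inputs[name] = tensor.to(torch_device)

    with torch.no_grad():
        pt_outputs = pt_model(**pt_inputs)
    tf_outputs = tf_model(**inputs_dict)

    max_diff = np.amax(np.abs(pt_outputs.logits.cpu().numpy() - tf_outputs.logits.numpy()))
    self.assertLessEqual(max_diff, 1e-5)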
def check_encoder_decoder_model_from_pretrained(
    self, config, decoder_config, decoder_input_ids, decoder_attention_mask, return_dict, pixel_values=None, **kwargs
):
    encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
    kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict}
    enc_dec_model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs)
    enc_dec_model.to(torch_device)
    outputs_encoder_decoder = enc_dec_model(
        pixel_values=pixel_values,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        output_hidden_states=True,
        return_dict=True,
    )

    self.assertEqual(
        outputs_encoder_decoder["logits"].shape, decoder_input_ids.shape + (decoder_config.vocab_size,)
    )
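# A minimal sketch of test methods that could exercise the helper above with both
# `return_dict` settings; `prepare_config_and_inputs` is an assumed mixin method,
# not something defined in this snippet.
def test_encoder_decoder_model_from_pretrained(self):
    config_inputs_dict = self.prepare_config_and_inputs()
    self.check_encoder_decoder_model_from_pretrained(**config_inputs_dict, return_dict=False)

def test_encoder_decoder_model_from_pretrained_return_dict(self):
    config_inputs_dict = self.prepare_config_and_inputs()
    self.check_encoder_decoder_model_from_pretrained(**config_inputs_dict, return_dict=True)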
def get_pretrained_model_and_inputs(self):
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        "hf-internal-testing/tiny-random-vit", "hf-internal-testing/tiny-bert"
    )
    batch_size = 13
    pixel_values = floats_tensor(
        [
            batch_size,
            model.encoder.config.num_channels,
            model.encoder.config.image_size,
            model.encoder.config.image_size,
        ]
    )
    # for ViT, the sequence length is equal to the number of patches + 1 (for the [CLS] token)
    seq_len = (model.encoder.config.image_size // model.encoder.config.patch_size) ** 2 + 1
    attention_mask = random_attention_mask([batch_size, seq_len])
    decoder_input_ids = ids_tensor([batch_size, 4], model.decoder.config.vocab_size)
    decoder_attention_mask = random_attention_mask([batch_size, 4])
    inputs = {
        "pixel_values": pixel_values,
        "attention_mask": attention_mask,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
    }
    return model, inputs
def get_pretrained_model_and_inputs(self):
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        "hf-internal-testing/tiny-random-deit", "hf-internal-testing/tiny-random-roberta"
    )
    batch_size = 13
    pixel_values = floats_tensor(
        [
            batch_size,
            model.encoder.config.num_channels,
            model.encoder.config.image_size,
            model.encoder.config.image_size,
        ]
    )
    # for DEiT, the sequence length is equal to the number of patches + 2 (for the [CLS] and distillation tokens)
    decoder_input_ids = ids_tensor([batch_size, 4], model.decoder.config.vocab_size)
    decoder_attention_mask = random_attention_mask([batch_size, 4])
    inputs = {
        "pixel_values": pixel_values,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
    }
    return model, inputs
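# A minimal sketch, for illustration only, of how the `get_pretrained_model_and_inputs`
# helpers above could be consumed by a whole-model save/load round-trip test; the test
# name and exact flow are assumptions rather than the suite's actual implementation.
def test_real_model_save_load_from_pretrained(self):
    model, inputs = self.get_pretrained_model_and_inputs()
    model.to(torch_device)

    with torch.no_grad():
        outputs = model(
            pixel_values=inputs["pixel_values"],
            decoder_input_ids=inputs["decoder_input_ids"],
            decoder_attention_mask=inputs["decoder_attention_mask"],
        )
        out_2 = outputs[0].cpu().numpy()

        with tempfile.TemporaryDirectory() as tmp_dirname:
            # Save the composite model as a single checkpoint and reload it.
            model.save_pretrained(tmp_dirname)
            reloaded = VisionEncoderDecoderModel.from_pretrained(tmp_dirname)
            reloaded.to(torch_device)

            after_outputs = reloaded(
                pixel_values=inputs["pixel_values"],
                decoder_input_ids=inputs["decoder_input_ids"],
                decoder_attention_mask=inputs["decoder_attention_mask"],
            )
            out_1 = after_outputs[0].cpu().numpy()
            self.assertLessEqual(np.amax(np.abs(out_1 - out_2)), 1e-5)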