    def test_encoder_decoder_save_load_from_encoder_decoder(self):
        config = self.get_encoder_decoder_config_small()

        # Create two random ViT/GPT2 models for vit-gpt2 and build their weights
        # (incl. cross-attention weights) by running them on dummy inputs.
        encoder = TFViTModel(config.encoder)
        encoder(encoder.dummy_inputs)
        decoder = TFGPT2LMHeadModel(config.decoder)
        decoder(decoder.dummy_inputs)

        encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder,
                                                           decoder=decoder)

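        # Random test inputs: a batch of 13 images (NCHW) plus one decoder token per sample.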
        pixel_values = floats_tensor([
            13,
            encoder.config.num_channels,
            encoder.config.image_size,
            encoder.config.image_size,
        ])
        decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size)

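        # Reference logits from the in-memory model, before any save/load round trip.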
        logits_orig = encoder_decoder_orig(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids).logits

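        # Round trip 1: save encoder and decoder separately, then reload them into a combined model.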
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_path = os.path.join(tmp_dirname, "encoder")
            decoder_path = os.path.join(tmp_dirname, "decoder")

            encoder.save_pretrained(encoder_path)
            decoder.save_pretrained(decoder_path)

            encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_path, decoder_path)

        logits_1 = encoder_decoder(pixel_values=pixel_values,
                                   decoder_input_ids=decoder_input_ids).logits

        self.assertTrue(
            abs(logits_orig.numpy().sum() - logits_1.numpy().sum()) < 1e-3)

        max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)

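        # Round trip 2: save and reload the combined model as a single checkpoint.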
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder.save_pretrained(tmp_dirname)
            encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained(
                tmp_dirname)

        logits_2 = encoder_decoder(pixel_values=pixel_values,
                                   decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)

    def create_and_check_model(self, config, pixel_values, labels):
        model = TFViTModel(config=config)
        result = model(pixel_values, training=False)
        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(self.image_size)
        patch_size = to_2tuple(self.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                          patch_size[0])
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, num_patches + 1, self.hidden_size))

        # Test with an image of a different size than the one specified in the config.
        image_size = self.image_size // 2
        pixel_values = pixel_values[:, :, :image_size, :image_size]
        result = model(pixel_values,
                       interpolate_pos_encoding=True,
                       training=False)
        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(image_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                          patch_size[0])
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, num_patches + 1, self.hidden_size))

    def test_model_from_pretrained(self):
        model = TFViTModel.from_pretrained("google/vit-base-patch16-224")
        self.assertIsNotNone(model)

    def get_encoder_decoder_models(self):
        # Load pretrained Hub checkpoints; the decoder takes the config from get_decoder_config().
        encoder_model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
        decoder_model = TFGPT2LMHeadModel.from_pretrained("gpt2", config=self.get_decoder_config(), name="decoder")
        return {"encoder": encoder_model, "decoder": decoder_model}

    def get_encoder_decoder_model(self, config, decoder_config):
        # Build fresh, randomly initialized encoder/decoder models from the given configs.
        encoder_model = TFViTModel(config, name="encoder")
        decoder_model = TFGPT2LMHeadModel(decoder_config, name="decoder")
        return encoder_model, decoder_model
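
    # A minimal sketch (not part of the original tests) of how the helper above
    # could be wired into a check. Assumes this mixin inherits unittest-style
    # assertions and that config/decoder_config, pixel_values and
    # decoder_input_ids come from the mixin's usual input preparation.
    def check_encoder_decoder_model(self, config, decoder_config, pixel_values, decoder_input_ids):
        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
        enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
        outputs = enc_dec_model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
        # The last logits dimension must match the decoder vocabulary size.
        self.assertEqual(outputs.logits.shape[-1], decoder_config.vocab_size)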