def test_encoder_decoder_save_load_from_encoder_decoder(self):
    config = self.get_encoder_decoder_config_small()

    # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
    encoder = TFViTModel(config.encoder)
    encoder(encoder.dummy_inputs)
    decoder = TFGPT2LMHeadModel(config.decoder)
    decoder(decoder.dummy_inputs)

    encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

    pixel_values = floats_tensor(
        [
            13,
            encoder.config.num_channels,
            encoder.config.image_size,
            encoder.config.image_size,
        ]
    )
    decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size)

    logits_orig = encoder_decoder_orig(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

    # Save encoder and decoder separately, then rebuild the composite model from the two checkpoints
    # and check that the logits are unchanged.
    with tempfile.TemporaryDirectory() as tmp_dirname:
        encoder_path = os.path.join(tmp_dirname, "encoder")
        decoder_path = os.path.join(tmp_dirname, "decoder")
        encoder.save_pretrained(encoder_path)
        decoder.save_pretrained(decoder_path)
        encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_path, decoder_path)

        logits_1 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits
        self.assertTrue(abs(logits_orig.numpy().sum() - logits_1.numpy().sum()) < 1e-3)

        max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)

    # Save and reload the composite model as a whole and check the logits again.
    with tempfile.TemporaryDirectory() as tmp_dirname:
        encoder_decoder.save_pretrained(tmp_dirname)
        encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)

        logits_2 = encoder_decoder(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)
def create_and_check_model(self, config, pixel_values, labels):
    model = TFViTModel(config=config)
    result = model(pixel_values, training=False)
    # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
    image_size = to_2tuple(self.image_size)
    patch_size = to_2tuple(self.patch_size)
    num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
    self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))

    # Test with an image with different size than the one specified in config.
    image_size = self.image_size // 2
    pixel_values = pixel_values[:, :, :image_size, :image_size]
    result = model(pixel_values, interpolate_pos_encoding=True, training=False)
    # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
    image_size = to_2tuple(image_size)
    num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
    self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
def create_and_check_model(self, config, pixel_values, labels):
    model = TFViTModel(config=config)
    result = model(pixel_values, training=False)
    self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    # Test with an image with different size than the one specified in config.
    image_size = self.image_size // 2
    pixel_values = pixel_values[:, :, :image_size, :image_size]
    result = model(pixel_values, interpolate_pos_encoding=True, training=False)
    # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
    seq_length = (image_size // self.patch_size) ** 2 + 1
    self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, seq_length, self.hidden_size))
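# For reference, a minimal sketch of the sequence-length arithmetic the two
# create_and_check_model variants above rely on. The concrete numbers below are
# illustrative only and are not taken from this tester's config.
image_size, patch_size = 224, 16
num_patches = (image_size // patch_size) ** 2          # 14 * 14 = 196
seq_length = num_patches + 1                           # +1 for the [CLS] token -> 197

# After halving the image, as in the interpolate_pos_encoding check above:
half_size = image_size // 2                            # 112
seq_length_half = (half_size // patch_size) ** 2 + 1   # 7 * 7 + 1 = 50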
def test_model_from_pretrained(self):
    model = TFViTModel.from_pretrained("google/vit-base-patch16-224")
    self.assertIsNotNone(model)
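# A minimal usage sketch for the checkpoint loaded above (not part of the test
# itself). The input shape assumes the default google/vit-base-patch16-224
# configuration: 3 channels, 224x224 images, 16x16 patches, hidden size 768.
import tensorflow as tf

from transformers import TFViTModel

model = TFViTModel.from_pretrained("google/vit-base-patch16-224")
# pixel_values are channels-first: (batch_size, num_channels, height, width)
pixel_values = tf.random.uniform((1, 3, 224, 224))
outputs = model(pixel_values, training=False)
print(outputs.last_hidden_state.shape)  # (1, 197, 768): 196 patches + [CLS]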
def get_encoder_decoder_models(self):
    encoder_model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
    decoder_model = TFGPT2LMHeadModel.from_pretrained("gpt2", config=self.get_decoder_config(), name="decoder")
    return {"encoder": encoder_model, "decoder": decoder_model}
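# Assumed usage of the dict returned above (a sketch, not taken from this file):
# the pretrained encoder/decoder pair is passed straight to the composite model.
models = self.get_encoder_decoder_models()
encoder_decoder = TFVisionEncoderDecoderModel(**models)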
def get_encoder_decoder_model(self, config, decoder_config):
    encoder_model = TFViTModel(config, name="encoder")
    decoder_model = TFGPT2LMHeadModel(decoder_config, name="decoder")
    return encoder_model, decoder_model
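# Assumed usage of the helper above, mirroring the save/load test at the top of
# this section: combine the randomly initialized pair into one model and run a
# forward pass. pixel_values and decoder_input_ids are placeholders here.
encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
outputs = enc_dec_model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)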