    def test_inference(self):
        model_name = "openai/clip-vit-base-patch32"
        model = TFCLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)

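        # prepare a test image and two candidate captions for the processor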
        image = prepare_img()
        inputs = processor(text=["a photo of a cat", "a photo of a dog"],
                           images=image,
                           padding=True,
                           return_tensors="tf")

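        # run the forward pass in inference mode (training=False disables dropout)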
        outputs = model(**inputs, training=False)

        # verify the logits shapes: (num_images, num_texts) and (num_texts, num_images)
        self.assertEqual(
            outputs.logits_per_image.shape,
            tf.TensorShape(
                (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
        )
        self.assertEqual(
            outputs.logits_per_text.shape,
            tf.TensorShape(
                (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
        )

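        # reference logits for the test image against the two captions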
        expected_logits = tf.constant([[24.5701, 19.3049]])

        tf.debugging.assert_near(outputs.logits_per_image,
                                 expected_logits,
                                 atol=1e-3)

    def test_model_from_pretrained(self):
        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFCLIPModel.from_pretrained(model_name)
            self.assertIsNotNone(model)