    def test_save_load_pretrained_additional_features(self):
        processor = CLIPProcessor(
            tokenizer=self.get_tokenizer(),
            feature_extractor=self.get_feature_extractor())
        processor.save_pretrained(self.tmpdirname)

        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)",
                                                  eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(
            do_normalize=False, padding_value=1.0)

        processor = CLIPProcessor.from_pretrained(self.tmpdirname,
                                                  bos_token="(BOS)",
                                                  eos_token="(EOS)",
                                                  do_normalize=False,
                                                  padding_value=1.0)

        self.assertEqual(processor.tokenizer.get_vocab(),
                         tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)

        self.assertEqual(processor.feature_extractor.to_json_string(),
                         feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor,
                              CLIPFeatureExtractor)

    def test_save_load_pretrained_default(self):
        tokenizer_slow = self.get_tokenizer()
        tokenizer_fast = self.get_rust_tokenizer()
        feature_extractor = self.get_feature_extractor()

        processor_slow = CLIPProcessor(tokenizer=tokenizer_slow,
                                       feature_extractor=feature_extractor)
        processor_slow.save_pretrained(self.tmpdirname)
        processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname,
                                                       use_fast=False)

        processor_fast = CLIPProcessor(tokenizer=tokenizer_fast,
                                       feature_extractor=feature_extractor)
        processor_fast.save_pretrained(self.tmpdirname)
        processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(processor_slow.tokenizer.get_vocab(),
                         tokenizer_slow.get_vocab())
        self.assertEqual(processor_fast.tokenizer.get_vocab(),
                         tokenizer_fast.get_vocab())
        self.assertEqual(tokenizer_slow.get_vocab(),
                         tokenizer_fast.get_vocab())
        self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
        self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)

        self.assertEqual(processor_slow.feature_extractor.to_json_string(),
                         feature_extractor.to_json_string())
        self.assertEqual(processor_fast.feature_extractor.to_json_string(),
                         feature_extractor.to_json_string())
        self.assertIsInstance(processor_slow.feature_extractor,
                              CLIPFeatureExtractor)
        self.assertIsInstance(processor_fast.feature_extractor,
                              CLIPFeatureExtractor)
Example #3
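    # batch_decode on the processor should simply forward to the underlying tokenizer.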
    def test_tokenizer_decode(self):
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        decoded_processor = processor.batch_decode(predicted_ids)
        decoded_tok = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(decoded_tok, decoded_processor)
Example #4
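    # Round trip: a processor saved with save_pretrained() should come back with
    # the same tokenizer vocab and the same feature-extractor configuration.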
    def test_save_load_pretrained_default(self):
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()

        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        processor.save_pretrained(self.tmpdirname)
        processor = CLIPProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertIsInstance(processor.tokenizer, CLIPTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)
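Example #5
    # Integration test: zero-shot image-text scores from TFCLIPModel for one image
    # and two captions, checked against reference logits.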
    def test_inference(self):
        model_name = "openai/clip-vit-base-patch32"
        model = TFCLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)

        image = prepare_img()
        inputs = processor(text=["a photo of a cat", "a photo of a dog"],
                           images=image,
                           padding=True,
                           return_tensors="tf")

        outputs = model(**inputs, training=False)

        # verify the logits
        self.assertEqual(
            outputs.logits_per_image.shape,
            tf.TensorShape(
                (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
        )
        self.assertEqual(
            outputs.logits_per_text.shape,
            tf.TensorShape(
                (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
        )

        expected_logits = tf.constant([[24.5701, 19.3049]])

        tf.debugging.assert_near(outputs.logits_per_image,
                                 expected_logits,
                                 atol=1e-3)
Example #6
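    # The same zero-shot check against the pretrained GroupViT checkpoint (PyTorch),
    # with GroupViT-specific reference logits.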
    def test_inference(self):
        model_name = "nvidia/groupvit-gcc-yfcc"
        model = GroupViTModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)

        image = prepare_img()
        inputs = processor(text=["a photo of a cat", "a photo of a dog"],
                           images=image,
                           padding=True,
                           return_tensors="pt")

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        self.assertEqual(
            outputs.logits_per_image.shape,
            torch.Size(
                (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
        )
        self.assertEqual(
            outputs.logits_per_text.shape,
            torch.Size(
                (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
        )

        expected_logits = torch.tensor([[13.3523, 6.3629]])

        self.assertTrue(
            torch.allclose(outputs.logits_per_image,
                           expected_logits,
                           atol=1e-3))
Example #7
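    # PyTorch CLIP on torch_device: verifies the shapes of logits_per_image and
    # logits_per_text as well as the reference logit values.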
    def test_inference(self):
        model_name = "openai/clip-vit-base-patch32"
        model = CLIPModel.from_pretrained(model_name).to(torch_device)
        processor = CLIPProcessor.from_pretrained(model_name)

        image = prepare_img()
        inputs = processor(text=["a photo of a cat", "a photo of a dog"],
                           images=image,
                           padding=True,
                           return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        self.assertEqual(
            outputs.logits_per_image.shape,
            torch.Size(
                (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
        )
        self.assertEqual(
            outputs.logits_per_text.shape,
            torch.Size(
                (inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
        )

        expected_logits = torch.tensor([[24.5701, 19.3049]],
                                       device=torch_device)

        self.assertTrue(
            torch.allclose(outputs.logits_per_image,
                           expected_logits,
                           atol=1e-3))
Example #8
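    # Passing images through the processor should match the feature extractor's
    # own output (compared via tensor sums).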
    def test_feature_extractor(self):
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        image_input = self.prepare_image_inputs()

        input_feat_extract = feature_extractor(image_input, return_tensors="np")
        input_processor = processor(images=image_input, return_tensors="np")

        for key in input_feat_extract.keys():
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
Example #9
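    # Passing text through the processor should match the tokenizer's own output.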
    def test_tokenizer(self):
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        input_str = "lower newer"

        encoded_processor = processor(text=input_str)

        encoded_tok = tokenizer(input_str)

        for key in encoded_tok.keys():
            self.assertListEqual(encoded_tok[key], encoded_processor[key])
Example #10
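    # A combined text+image call should return input_ids, attention_mask and
    # pixel_values; calling the processor with no input should raise ValueError.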
    def test_processor(self):
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        input_str = "lower newer"
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)

        self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])

        # test if it raises when no input is passed
        with pytest.raises(ValueError):
            processor()
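Example #11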
def convert_groupvit_checkpoint(checkpoint_path,
                                pytorch_dump_folder_path,
                                model_name="groupvit-gcc-yfcc",
                                push_to_hub=False):
    """
    Copy/paste/tweak model's weights to the Transformers design.
    """
    config = GroupViTConfig()
    model = GroupViTModel(config).eval()

    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    new_state_dict = convert_state_dict(state_dict, config)
    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict,
                                                          strict=False)
    assert missing_keys == ["text_model.embeddings.position_ids"]
    assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0)

    # verify result
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    image = prepare_img()
    inputs = processor(text=["a photo of a cat", "a photo of a dog"],
                       images=image,
                       padding=True,
                       return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    if model_name == "groupvit-gcc-yfcc":
        expected_logits = torch.tensor([[13.3523, 6.3629]])
    elif model_name == "groupvit-gcc-redcaps":
        expected_logits = torch.tensor([[16.1873, 8.6230]])
    else:
        raise ValueError(f"Model name {model_name} not supported.")
    assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)

    processor.save_pretrained(pytorch_dump_folder_path)
    model.save_pretrained(pytorch_dump_folder_path)
    print("Successfully saved processor and model to",
          pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing to the hub...")
        processor.push_to_hub(model_name, organization="nielsr")
        model.push_to_hub(model_name, organization="nielsr")
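

# `prepare_img` is not shown in this snippet. In similar conversion/test scripts it
# is a small helper that fetches a sample image, roughly like the sketch below
# (the exact image URL is an assumption):
def prepare_img():
    import requests
    from PIL import Image

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    return Image.open(requests.get(url, stream=True).raw)


# A minimal command-line driver for the conversion function might look like this
# (sketch; the actual flag names and defaults are assumptions):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", type=str, required=True,
                        help="Path to the original GroupViT checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True,
                        help="Where to save the converted model and processor.")
    parser.add_argument("--model_name", type=str, default="groupvit-gcc-yfcc")
    parser.add_argument("--push_to_hub", action="store_true")
    args = parser.parse_args()

    convert_groupvit_checkpoint(args.checkpoint_path,
                                args.pytorch_dump_folder_path,
                                args.model_name, args.push_to_hub)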
Example #12
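# Load a pretrained Mask R-CNN for region proposals and a local CLIP checkpoint;
# run_clip_proposal embeds padded box crops with CLIP's vision tower.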
import random

import matplotlib.pyplot as plt
import torchvision
from transformers import CLIPModel, CLIPProcessor

# AgnosticRoIExtractor is a project-specific helper; its import is not shown here.
# IMPORT MODELS
maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(
    pretrained=True)
maskrcnn_model.eval()

roi_extractor = AgnosticRoIExtractor(maskrcnn_model)
roi_extractor.eval()

clip_model = CLIPModel.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)
clip_processor = CLIPProcessor.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)


def run_clip_proposal(image, boxes, padding):
    # Crop each proposal box (with padding) out of the image and embed the crops
    # with CLIP's vision tower.
    images, new_boxes = image_clipper(image, boxes, padding)
    inputs = clip_processor.feature_extractor(images=images,
                                              return_tensors="pt")
    vision_outputs = clip_model.vision_model(**inputs)
    image_embeds = vision_outputs[1]  # pooled output
    image_embeds = clip_model.visual_projection(image_embeds)
    # L2-normalize so the embeddings can be compared by cosine similarity.
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    return image_embeds


def image_clipper(image, boxes, padding):
Example #13
from sentence_transformers import SentenceTransformer, util, models
from PIL import ImageFile, Image
import numpy as np
import requests




###########

image = Image.open('two_dogs_in_snow.jpg')

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)
#vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
#image_embeds = model.visual_projection(vision_outputs[1])

#print(image_embeds.shape)
#exit()
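
# The zero-shot scores live in output.logits_per_image; a softmax over the
# caption axis turns them into probabilities (minimal usage sketch):
probs = output.logits_per_image.softmax(dim=1)  # shape: (num_images, num_captions)
print(probs)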



#Load CLIP model
clip = models.CLIPModel()
Example #14
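# Extract Mask R-CNN region proposals for every image in a Seesaw dataset,
# optionally attach CLIP embeddings for each box, and write the results to
# parquet shards.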
def preprocess_roi_dataset(
    sds: SeesawDatasetManager,
    output_path,
    clip_model_path=None,
    cpu=False,
    image_limiter=None,
    box_limiter=100,
    padding=5,
):
    if (not cpu) and torch.cuda.is_available():
        device = torch.device("cuda")
        print("USING GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    dataset = sds.get_pytorch_dataset()
    output_path = resolve_path(output_path)
    assert not os.path.exists(output_path), "output path already exists"
    clip = False
    if clip_model_path:
        clip = True
        clip_model_path = resolve_path(clip_model_path)
        assert os.path.exists(clip_model_path), "clip model path doesn't exist"

    dirname = os.path.basename(output_path)
    dirpath = os.path.dirname(output_path)
    output_path = f"{dirpath}/.tmp.{dirname}"
    final_output_path = f"{dirpath}/{dirname}"

    os.makedirs(dirpath, exist_ok=True)

    if os.path.exists(output_path):  # remove old tmpfile
        shutil.rmtree(output_path)

    os.makedirs(output_path)
    '''
    vector_path = f"{output_path}/vectors"
    os.makedirs(vector_path)

    model_link = f"{output_path}/model"
    os.symlink(model_path, model_link)

    dataset_link = f"{output_path}/dataset"
    os.symlink(sds.dataset_root, dataset_link)

    real_prefix = f"{os.path.realpath(sds.image_root)}/"
    read_paths = ((real_prefix + sds.paths)).tolist()
    read_paths = [os.path.normpath(p) for p in read_paths]
    meta_dict = dict(zip(read_paths, zip(sds.paths, np.arange(len(sds.paths)))))
    '''

    maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True).to(device)
    maskrcnn_model.eval()

    roi_extractor = AgnosticRoIExtractor(maskrcnn_model).to(device)
    roi_extractor.eval()
    roi_extractor.model.rpn.min_size = 10
    roi_extractor.model.rpn.nms_thresh = 0

    clip_model = CLIPModel.from_pretrained(clip_model_path).to(device)
    clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
    print("Models defined")
    ims = []
    paths = []
    with torch.no_grad():
        for i in range(len(dataset)):
            if i % 2000 == 0:
                if i != 0:
                    ans = list(zip(paths, output))
                    df = to_dataframe(ans)
                    df['dbidx'] = dbidx
                    if clip:
                        df['clip_feature'] = clip_features
                    #clip_array = run_clip_on_proposal()
                    #df.assign(clip_feature_vector=TensorArray(clip_array))
                    df.to_parquet(output_path + "/" + str(i) + ".parquet")
                clip_features = []
                output = []
                paths = []
                dbidx = []

            data = dataset[i]
            ims.append(data['image'])
            paths.append(data['file_path'])
            images = torchvision.transforms.ToTensor()(
                data['image']).unsqueeze(0).to(device)
            print("starting roi")
            a = roi_extractor(images)[0]
            if a['scores'].shape[0] > box_limiter:
                a['boxes'] = torch.split(a['boxes'], box_limiter)[0]
                a['scores'] = torch.split(a['scores'], box_limiter)[0]
                a['features'] = torch.split(a['features'].detach(),
                                            box_limiter)[0]
            dbidx.extend([i] * len(a['scores']))
            if clip:
                clip_array = run_clip_proposal(data['image'], a['boxes'],
                                               padding, clip_model,
                                               clip_processor, device)
                a['clip_feature_vector'] = clip_array
                clip_features += clip_array.tolist()
            output.append(a)
            print(i)

        ans = list(zip(paths, output))
        df = to_dataframe(ans)
        df['dbidx'] = dbidx
        if clip:
            df['clip_feature'] = clip_features
        #clip_array = run_clip_on_proposal()
        #df.assign(clip_feature_vector=TensorArray(clip_array))
        df.to_parquet(output_path + "/final.parquet")

        os.rename(output_path, final_output_path)
Example #15
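    # Ray Tune setup: report progress with a CLIReporter and fine-tune CLIP
    # starting from a locally stored clip-vit-base-patch32 checkpoint.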
    reporter = CLIReporter(
        parameter_columns=[],
        metric_columns=[
            metric,
            "training_iteration",
        ],
        max_report_frequency=60,
    )

    base_model = CLIPModel.from_pretrained(
        "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32")
    init_config = base_model.config
    init_state_dict = base_model.state_dict()

    processor = CLIPProcessor.from_pretrained(
        "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32")

    trainable = make_trainable(
        num_epochs=args.max_epochs,
        gpus_per_trial=gpus_per_trial,
        dataset=bird_dataset,
        init_config=init_config,
        init_state_dict=init_state_dict,
        processor=processor,
    )

    analysis = tune.run(
        trainable,
        resources_per_trial={
            "cpu": cpus_per_trial,
            "gpu": gpus_per_trial