def test_save_load_pretrained_additional_features(self):
    processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
    processor.save_pretrained(self.tmpdirname)

    tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
    feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)

    processor = CLIPProcessor.from_pretrained(
        self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
    )

    self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
    self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)

    self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
    self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)
def test_save_load_pretrained_default(self):
    tokenizer_slow = self.get_tokenizer()
    tokenizer_fast = self.get_rust_tokenizer()
    feature_extractor = self.get_feature_extractor()

    processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
    processor_slow.save_pretrained(self.tmpdirname)
    processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)

    processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
    processor_fast.save_pretrained(self.tmpdirname)
    processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)

    self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab())
    self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab())
    self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab())
    self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
    self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)

    self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
    self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
    self.assertIsInstance(processor_slow.feature_extractor, CLIPFeatureExtractor)
    self.assertIsInstance(processor_fast.feature_extractor, CLIPFeatureExtractor)
def test_tokenizer_decode(self):
    feature_extractor = self.get_feature_extractor()
    tokenizer = self.get_tokenizer()

    processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

    predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

    decoded_processor = processor.batch_decode(predicted_ids)
    decoded_tok = tokenizer.batch_decode(predicted_ids)

    self.assertListEqual(decoded_tok, decoded_processor)
def test_save_load_pretrained_default(self):
    tokenizer = self.get_tokenizer()
    feature_extractor = self.get_feature_extractor()

    processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
    processor.save_pretrained(self.tmpdirname)
    processor = CLIPProcessor.from_pretrained(self.tmpdirname)

    self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
    self.assertIsInstance(processor.tokenizer, CLIPTokenizer)

    self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
    self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)
def test_inference(self):
    model_name = "openai/clip-vit-base-patch32"
    model = TFCLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)

    image = prepare_img()
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
    )

    outputs = model(**inputs, training=False)

    # verify the logits
    self.assertEqual(
        outputs.logits_per_image.shape,
        tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
    )
    self.assertEqual(
        outputs.logits_per_text.shape,
        tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
    )

    expected_logits = tf.constant([[24.5701, 19.3049]])

    tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
def test_inference(self):
    model_name = "nvidia/groupvit-gcc-yfcc"
    model = GroupViTModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)

    image = prepare_img()
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
    )

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    self.assertEqual(
        outputs.logits_per_image.shape,
        torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
    )
    self.assertEqual(
        outputs.logits_per_text.shape,
        torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
    )

    expected_logits = torch.tensor([[13.3523, 6.3629]])

    self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
def test_inference(self):
    model_name = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_name).to(torch_device)
    processor = CLIPProcessor.from_pretrained(model_name)

    image = prepare_img()
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
    ).to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    self.assertEqual(
        outputs.logits_per_image.shape,
        torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
    )
    self.assertEqual(
        outputs.logits_per_text.shape,
        torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
    )

    expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device)

    self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
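# For reference outside the test harness: the logits checked in the tests above are
# scaled image-text similarity scores, and application code usually converts them to
# per-caption probabilities with a softmax. A minimal standalone sketch of that step
# (standard CLIP usage; the image path below is a placeholder, not from the tests).
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("cat.png")  # placeholder image path
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
)

with torch.no_grad():
    outputs = model(**inputs)

probs = outputs.logits_per_image.softmax(dim=1)  # shape: (num_images, num_texts)
print(probs)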
def test_feature_extractor(self):
    feature_extractor = self.get_feature_extractor()
    tokenizer = self.get_tokenizer()

    processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

    image_input = self.prepare_image_inputs()

    input_feat_extract = feature_extractor(image_input, return_tensors="np")
    input_processor = processor(images=image_input, return_tensors="np")

    for key in input_feat_extract.keys():
        self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
    feature_extractor = self.get_feature_extractor()
    tokenizer = self.get_tokenizer()

    processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

    input_str = "lower newer"

    encoded_processor = processor(text=input_str)
    encoded_tok = tokenizer(input_str)

    for key in encoded_tok.keys():
        self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self):
    feature_extractor = self.get_feature_extractor()
    tokenizer = self.get_tokenizer()

    processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

    input_str = "lower newer"
    image_input = self.prepare_image_inputs()

    inputs = processor(text=input_str, images=image_input)

    self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])

    # test if it raises when no input is passed
    with pytest.raises(ValueError):
        processor()
def convert_groupvit_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False
):
    """
    Copy/paste/tweak model's weights to the Transformers design.
    """
    config = GroupViTConfig()
    model = GroupViTModel(config).eval()

    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    new_state_dict = convert_state_dict(state_dict, config)
    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
    assert missing_keys == ["text_model.embeddings.position_ids"]
    assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0)

    # verify result
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    image = prepare_img()
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs)

    if model_name == "groupvit-gcc-yfcc":
        expected_logits = torch.tensor([[13.3523, 6.3629]])
    elif model_name == "groupvit-gcc-redcaps":
        expected_logits = torch.tensor([[16.1873, 8.6230]])
    else:
        raise ValueError(f"Model name {model_name} not supported.")
    assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)

    processor.save_pretrained(pytorch_dump_folder_path)
    model.save_pretrained(pytorch_dump_folder_path)
    print("Successfully saved processor and model to", pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing to the hub...")
        processor.push_to_hub(model_name, organization="nielsr")
        model.push_to_hub(model_name, organization="nielsr")
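# A minimal sketch of how the conversion function above might be driven from the
# command line. The argparse flag names here are assumptions for illustration, not
# taken from the original conversion script.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the original GroupViT checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True, help="Where to save the converted model.")
    parser.add_argument("--model_name", type=str, default="groupvit-gcc-yfcc")
    parser.add_argument("--push_to_hub", action="store_true")
    args = parser.parse_args()

    convert_groupvit_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub
    )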
import random

import matplotlib.pyplot as plt
import torchvision
from transformers import CLIPModel, CLIPProcessor

# IMPORT MODELS
# AgnosticRoIExtractor is a project-local helper assumed to be importable elsewhere in this codebase.
maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
maskrcnn_model.eval()

roi_extractor = AgnosticRoIExtractor(maskrcnn_model)
roi_extractor.eval()

clip_model = CLIPModel.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)
clip_processor = CLIPProcessor.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)


def run_clip_proposal(image, boxes, padding):
    # crop the proposal boxes out of the image, then embed the crops with CLIP's vision tower
    images, new_boxes = image_clipper(image, boxes, padding)
    inputs = clip_processor.feature_extractor(images=images, return_tensors="pt")
    vision_outputs = clip_model.vision_model(**inputs)
    image_embeds = vision_outputs[1]
    image_embeds = clip_model.visual_projection(image_embeds)
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    return image_embeds


def image_clipper(image, boxes, padding):
from sentence_transformers import SentenceTransformer, util, models
from PIL import ImageFile, Image
import numpy as np
import requests

###########

image = Image.open('two_dogs_in_snow.jpg')

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)

output = model(**inputs)
#vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
#image_embeds = model.visual_projection(vision_outputs[1])
#print(image_embeds.shape)
#exit()

#Load CLIP model
clip = models.CLIPModel()
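# The commented-out lines above (vision_model -> pooled output -> visual_projection)
# correspond to what CLIPModel.get_image_features does. A minimal sketch, continuing
# from the `model` and `inputs` built above, that uses the built-in transformers
# helpers instead; shown only for illustration.
import torch

with torch.no_grad():
    image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])
    text_embeds = model.get_text_features(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )

print(image_embeds.shape)  # (1, 512) for clip-vit-base-patch32
print(text_embeds.shape)   # (2, 512)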
def preprocess_roi_dataset(
    sds: SeesawDatasetManager,
    output_path,
    clip_model_path=None,
    cpu=False,
    image_limiter=None,
    box_limiter=100,
    padding=5,
):
    if (not cpu) and torch.cuda.is_available():
        device = torch.device("cuda")
        print("USING GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")

    dataset = sds.get_pytorch_dataset()
    output_path = resolve_path(output_path)
    assert not os.path.exists(output_path), "output path already exists"
    clip = False
    if clip_model_path:
        clip = True
        clip_model_path = resolve_path(clip_model_path)
        assert os.path.exists(clip_model_path), "clip model path doesn't exist"

    dirname = os.path.basename(output_path)
    dirpath = os.path.dirname(output_path)
    output_path = f"{dirpath}/.tmp.{dirname}"
    final_output_path = f"{dirpath}/{dirname}"

    os.makedirs(dirpath, exist_ok=True)
    if os.path.exists(output_path):  # remove old tmpfile
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    '''
    vector_path = f"{output_path}/vectors"
    os.makedirs(vector_path)

    model_link = f"{output_path}/model"
    os.symlink(model_path, model_link)

    dataset_link = f"{output_path}/dataset"
    os.symlink(sds.dataset_root, dataset_link)

    real_prefix = f"{os.path.realpath(sds.image_root)}/"
    read_paths = ((real_prefix + sds.paths)).tolist()
    read_paths = [os.path.normpath(p) for p in read_paths]
    meta_dict = dict(zip(read_paths, zip(sds.paths, np.arange(len(sds.paths)))))
    '''

    maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True).to(device)
    maskrcnn_model.eval()

    roi_extractor = AgnosticRoIExtractor(maskrcnn_model).to(device)
    roi_extractor.eval()
    roi_extractor.model.rpn.min_size = 10
    roi_extractor.model.rpn.nms_thresh = 0

    clip_model = CLIPModel.from_pretrained(clip_model_path).to(device)
    clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
    print("Models defined")

    ims = []
    paths = []
    with torch.no_grad():
        for i in range(len(dataset)):
            if i % 2000 == 0:
                if i != 0:
                    ans = list(zip(paths, output))
                    df = to_dataframe(ans)
                    df['dbidx'] = dbidx
                    if clip:
                        df['clip_feature'] = clip_features
                    #clip_array = run_clip_on_proposal()
                    #df.assign(clip_feature_vector=TensorArray(clip_array))
                    df.to_parquet(output_path + "/" + str(i) + ".parquet")
                clip_features = []
                output = []
                paths = []
                dbidx = []
            data = dataset[i]
            ims.append(data['image'])
            paths.append(data['file_path'])
            images = torchvision.transforms.ToTensor()(data['image']).unsqueeze(0).to(device)
            print("starting roi")
            a = roi_extractor(images)[0]
            if a['scores'].shape[0] > box_limiter:
                a['boxes'] = torch.split(a['boxes'], box_limiter)[0]
                a['scores'] = torch.split(a['scores'], box_limiter)[0]
                a['features'] = torch.split(a['features'].detach(), box_limiter)[0]
            dbidx.extend([i] * len(a['scores']))
            if clip:
                clip_array = run_clip_proposal(
                    data['image'], a['boxes'], padding, clip_model, clip_processor, device
                )
                a['clip_feature_vector'] = clip_array
                clip_features += clip_array.tolist()
            output.append(a)
            print(i)

    ans = list(zip(paths, output))
    df = to_dataframe(ans)
    df['dbidx'] = dbidx
    if clip:
        df['clip_feature'] = clip_features
    #clip_array = run_clip_on_proposal()
    #df.assign(clip_feature_vector=TensorArray(clip_array))
    df.to_parquet(output_path + "/final.parquet")
    os.rename(output_path, final_output_path)
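# The function above writes its results as parquet shards ("<N>.parquet" plus "final.parquet")
# under final_output_path. A minimal sketch of loading them back for inspection with pandas;
# the path below is a placeholder, and the exact column set depends on to_dataframe().
import glob

import pandas as pd

shards = sorted(glob.glob("/path/to/final_output_path/*.parquet"))  # placeholder path
df = pd.concat([pd.read_parquet(p) for p in shards], ignore_index=True)
print(df.columns.tolist())  # expect at least 'dbidx' and, when CLIP was enabled, 'clip_feature'
print(len(df), "proposal rows")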
reporter = CLIReporter(
    parameter_columns=[],
    metric_columns=[
        metric,
        "training_iteration",
    ],
    max_report_frequency=60,
)

base_model = CLIPModel.from_pretrained(
    "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32"
)
init_config = base_model.config
init_state_dict = base_model.state_dict()
processor = CLIPProcessor.from_pretrained(
    "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32"
)

trainable = make_trainable(
    num_epochs=args.max_epochs,
    gpus_per_trial=gpus_per_trial,
    dataset=bird_dataset,
    init_config=init_config,
    init_state_dict=init_state_dict,
    processor=processor,
)

analysis = tune.run(
    trainable,
    resources_per_trial={
        "cpu": cpus_per_trial,
        "gpu": gpus_per_trial