def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = CLIPConfig.from_pretrained(config_path)
    else:
        config = CLIPConfig(projection_dim=512, text_config={}, vision_config={})

    hf_model = CLIPModel(config).eval()

    # load the original OpenAI CLIP checkpoint for comparison
    pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
    pt_model = pt_model.eval()

    copy_text_model_and_projection(hf_model, pt_model)
    copy_vison_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

    # sanity check: both implementations should produce matching logits on a dummy input
    input_ids = torch.arange(0, 77).unsqueeze(0)
    pixel_values = torch.randn(1, 3, 224, 224)

    hf_logits_per_image, hf_logits_per_text = hf_model(
        input_ids=input_ids, pixel_values=pixel_values, return_dict=True
    )[1:3]
    pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)

    assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3)
    assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3)

    hf_model.save_pretrained(pytorch_dump_folder_path)
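A minimal sketch of how this converter might be driven from the command line; the flag names below are illustrative assumptions, not necessarily those of the upstream script.

# Hypothetical CLI wrapper for convert_clip_checkpoint; flag names are
# illustrative assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", required=True,
                        help="Path to the original OpenAI CLIP checkpoint")
    parser.add_argument("--pytorch_dump_folder_path", required=True,
                        help="Where to write the converted transformers model")
    parser.add_argument("--config_path", default=None,
                        help="Optional CLIPConfig to load instead of the default")
    args = parser.parse_args()
    convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)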
def clip_fine_tune(
    config,
    num_epochs,
    num_gpus,
    dataset: pa.Table,
    init_config: CLIPConfig,
    init_state_dict: dict,
    processor: CLIPProcessor,
):
    # Running Ray Tune under SLURM: drop these variables so Lightning does not
    # mistake the trial for a SLURM-managed distributed job.
    if "SLURM_NTASKS" in os.environ:
        del os.environ["SLURM_NTASKS"]
    if "SLURM_JOB_NAME" in os.environ:
        del os.environ["SLURM_JOB_NAME"]

    bird_dataset = dataset
    data_mod = MultiModalDataModule(
        dataset=bird_dataset,
        processor=processor,
        test_size=config["test_size"],
        batch_size=config["batch_size"],
        val_batch_size=config["val_batch_size"],
        num_workers=config["num_workers"],
    )

    # rebuild the model from config and load the shared initial weights
    clip_model = CLIPModel(init_config)
    clip_model.load_state_dict(init_state_dict)

    model = CLIPFineTunedModel(clip_model, **config)

    tune_cbs = [TuneReportCheckpointCallback(["val_loss"], on="validation_end")]
    logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")

    trainer = pl.Trainer(
        logger=logger,
        num_sanity_val_steps=0,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        progress_bar_refresh_rate=0,
        log_every_n_steps=1,
        callbacks=[LearningRateMonitor(logging_interval="step")] + tune_cbs,
    )

    trainer.validate(model, data_mod)
    trainer.fit(model, data_mod)
    return trainer
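`make_trainable`, referenced later in the tuning driver, is not shown here. A plausible sketch, assuming it simply binds the fixed arguments with Ray Tune's `tune.with_parameters` so Tune only supplies the per-trial `config` dict:

# Hypothetical sketch of make_trainable (not shown in this codebase).
from ray import tune

def make_trainable(num_epochs, gpus_per_trial, dataset, init_config,
                   init_state_dict, processor):
    # with_parameters stores the large objects (dataset, state dict) in the
    # Ray object store instead of serializing them into every trial.
    return tune.with_parameters(
        clip_fine_tune,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
        dataset=dataset,
        init_config=init_config,
        init_state_dict=init_state_dict,
        processor=processor,
    )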
def test_inference(self):
    model_name = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_name).to(torch_device)
    processor = CLIPProcessor.from_pretrained(model_name)

    image = prepare_img()
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=image,
        padding=True,
        return_tensors="pt",
    ).to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # verify the logits
    self.assertEqual(
        outputs.logits_per_image.shape,
        torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
    )
    self.assertEqual(
        outputs.logits_per_text.shape,
        torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
    )

    expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device)

    self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
    model = CLIPModel(config).to(torch_device).eval()
    result = model(input_ids, pixel_values, attention_mask)
    self.parent.assertEqual(
        result.logits_per_image.shape,
        (self.vision_model_tester.batch_size, self.text_model_tester.batch_size),
    )
    self.parent.assertEqual(
        result.logits_per_text.shape,
        (self.text_model_tester.batch_size, self.vision_model_tester.batch_size),
    )
def __init__(self, path_or_model, **kwargs):
    super().__init__()
    # TODO: it's easier to let PL save everything so it can also restore with
    # .load_from_checkpoint. Then, instead of passing a model or path, we should
    # pass a config to construct a blank CLIP model and load_state_dict to get
    # the weights.
    self.save_hyperparameters(ignore="path_or_model")

    if isinstance(path_or_model, str):
        path = path_or_model
        model = CLIPModel.from_pretrained(path)
    elif isinstance(path_or_model, CLIPModel):
        model = path_or_model
    else:
        raise TypeError("path_or_model must be a checkpoint path or a CLIPModel")

    # reset the temperature to the configured initial value
    model.logit_scale = nn.Parameter(
        torch.tensor(
            self.hparams.logit_scale_init,
            device=model.logit_scale.device,
            dtype=model.logit_scale.dtype,
        )
    )

    self.model = model
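A hedged usage sketch for this constructor. `logit_scale_init` is the only hyperparameter visible above; the checkpoint name and the initial value (ln(1/0.07), the initialization used in the original CLIP paper) are illustrative.

# Usage sketch with hypothetical values: wrap a pretrained checkpoint and
# reset the temperature to ln(1/0.07) as in the original CLIP paper.
import math

ft_model = CLIPFineTunedModel(
    "openai/clip-vit-base-patch32",
    logit_scale_init=math.log(1 / 0.07),
)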
from seesaw.roi_extractor import AgnosticRoIExtractor, to_dataframe
import tensorflow as tf
import random
import matplotlib.pyplot as plt
import torchvision
from transformers import CLIPModel, CLIPProcessor

# IMPORT MODELS
maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
maskrcnn_model.eval()

roi_extractor = AgnosticRoIExtractor(maskrcnn_model)
roi_extractor.eval()

clip_model = CLIPModel.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)
clip_processor = CLIPProcessor.from_pretrained(
    "/home/gridsan/groups/fastai/omoll/seesaw_root2/models/clip-vit-base-patch32/"
)

def run_clip_proposal(image, boxes, padding):
    # image_clipper is assumed to be a project helper that crops each
    # (padded) box out of the image.
    images, new_boxes = image_clipper(image, boxes, padding)
    inputs = clip_processor.feature_extractor(images=images, return_tensors="pt")
    vision_outputs = clip_model.vision_model(**inputs)
    image_embeds = vision_outputs[1]  # pooled output
    image_embeds = clip_model.visual_projection(image_embeds)
    # L2-normalize so the embeddings are comparable by cosine similarity
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    return image_embeds
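A hypothetical usage sketch for `run_clip_proposal`: embed two candidate boxes from one image. The file name and box coordinates are made up for illustration.

# Hypothetical usage: the file name and boxes are placeholders.
from PIL import Image
import torch

img = Image.open("example.jpg").convert("RGB")
boxes = torch.tensor([[10.0, 20.0, 110.0, 140.0],
                      [50.0, 60.0, 210.0, 230.0]])  # (x1, y1, x2, y2)
embeds = run_clip_proposal(img, boxes, padding=5)
print(embeds.shape)  # one 512-d unit vector per box for ViT-B/32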
def test_model_from_pretrained(self):
    for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = CLIPModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
from sentence_transformers import SentenceTransformer, util, models
from PIL import ImageFile, Image
import numpy as np
import requests

###########

image = Image.open('two_dogs_in_snow.jpg')

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# note: the processor's keyword is `text`, not `texts`
inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)

#vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
#image_embeds = model.visual_projection(vision_outputs[1])
#print(image_embeds.shape)
#exit()

# Load CLIP model
clip = models.CLIPModel()
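To actually read off which caption matches the image, the image-text logits can be softmaxed over the text axis; a small follow-on sketch:

# Follow-on sketch: turn the image-text similarity logits into a probability
# distribution over the two candidate captions.
probs = output.logits_per_image.softmax(dim=1)
print(probs)  # e.g. tensor([[p_cat, p_dog]])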
def preprocess_roi_dataset(
    sds: SeesawDatasetManager,
    output_path,
    clip_model_path=None,
    cpu=False,
    image_limiter=None,
    box_limiter=100,
    padding=5,
):
    if (not cpu) and torch.cuda.is_available():
        device = torch.device("cuda")
        print("USING GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")

    dataset = sds.get_pytorch_dataset()
    output_path = resolve_path(output_path)
    assert not os.path.exists(output_path), "output path already exists"

    clip = False
    if clip_model_path:
        clip = True
        clip_model_path = resolve_path(clip_model_path)
        assert os.path.exists(clip_model_path), "clip model path doesn't exist"

    # write into a temporary directory, then rename atomically at the end
    dirname = os.path.basename(output_path)
    dirpath = os.path.dirname(output_path)
    output_path = f"{dirpath}/.tmp.{dirname}"
    final_output_path = f"{dirpath}/{dirname}"

    os.makedirs(dirpath, exist_ok=True)
    if os.path.exists(output_path):  # remove old tmpfile
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    '''
    vector_path = f"{output_path}/vectors"
    os.makedirs(vector_path)

    model_link = f"{output_path}/model"
    os.symlink(model_path, model_link)

    dataset_link = f"{output_path}/dataset"
    os.symlink(sds.dataset_root, dataset_link)

    real_prefix = f"{os.path.realpath(sds.image_root)}/"
    read_paths = ((real_prefix + sds.paths)).tolist()
    read_paths = [os.path.normpath(p) for p in read_paths]
    meta_dict = dict(zip(read_paths, zip(sds.paths, np.arange(len(sds.paths)))))
    '''

    maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True).to(device)
    maskrcnn_model.eval()

    roi_extractor = AgnosticRoIExtractor(maskrcnn_model).to(device)
    roi_extractor.eval()
    roi_extractor.model.rpn.min_size = 10
    roi_extractor.model.rpn.nms_thresh = 0

    # only load CLIP when a model path was actually given
    if clip:
        clip_model = CLIPModel.from_pretrained(clip_model_path).to(device)
        clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
    print("Models defined")

    ims = []
    paths = []
    with torch.no_grad():
        for i in range(len(dataset)):
            if i % 2000 == 0:
                if i != 0:
                    # flush a shard of results to parquet every 2000 images
                    ans = list(zip(paths, output))
                    df = to_dataframe(ans)
                    df['dbidx'] = dbidx
                    if clip:
                        df['clip_feature'] = clip_features
                    #clip_array = run_clip_on_proposal()
                    #df.assign(clip_feature_vector=TensorArray(clip_array))
                    df.to_parquet(output_path + "/" + str(i) + ".parquet")
                clip_features = []
                output = []
                paths = []
                dbidx = []

            data = dataset[i]
            ims.append(data['image'])
            paths.append(data['file_path'])
            images = torchvision.transforms.ToTensor()(
                data['image']).unsqueeze(0).to(device)

            print("starting roi")
            a = roi_extractor(images)[0]
            if a['scores'].shape[0] > box_limiter:
                # keep only the first box_limiter proposals
                a['boxes'] = torch.split(a['boxes'], box_limiter)[0]
                a['scores'] = torch.split(a['scores'], box_limiter)[0]
                a['features'] = torch.split(a['features'].detach(), box_limiter)[0]
            dbidx.extend([i] * len(a['scores']))

            if clip:
                clip_array = run_clip_proposal(data['image'], a['boxes'], padding,
                                               clip_model, clip_processor, device)
                a['clip_feature_vector'] = clip_array
                clip_features += clip_array.tolist()

            output.append(a)
            print(i)

    # write the final (possibly partial) shard, then publish the directory
    ans = list(zip(paths, output))
    df = to_dataframe(ans)
    df['dbidx'] = dbidx
    if clip:
        df['clip_feature'] = clip_features
    #clip_array = run_clip_on_proposal()
    #df.assign(clip_feature_vector=TensorArray(clip_array))
    df.to_parquet(output_path + "/final.parquet")
    os.rename(output_path, final_output_path)
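Note that this function calls `run_clip_proposal` with six arguments, unlike the three-argument, global-state version shown earlier. A plausible sketch of the variant assumed here, adapted from that earlier definition; `image_clipper` is still assumed to be a project helper that crops each padded box out of the image.

# Plausible sketch of the 6-argument run_clip_proposal assumed above,
# adapted from the earlier global-state version.
def run_clip_proposal(image, boxes, padding, clip_model, clip_processor, device):
    images, new_boxes = image_clipper(image, boxes, padding)
    inputs = clip_processor.feature_extractor(images=images, return_tensors="pt").to(device)
    vision_outputs = clip_model.vision_model(**inputs)
    image_embeds = clip_model.visual_projection(vision_outputs[1])  # pooled output
    return image_embeds / image_embeds.norm(dim=-1, keepdim=True)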
scheduler = ASHAScheduler(
    max_t=max(grace_period, args.max_epochs),
    grace_period=grace_period,
    reduction_factor=2,
)

reporter = CLIReporter(
    parameter_columns=[],
    metric_columns=[
        metric,
        "training_iteration",
    ],
    max_report_frequency=60,
)

base_model = CLIPModel.from_pretrained(
    "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32")
init_config = base_model.config
init_state_dict = base_model.state_dict()
processor = CLIPProcessor.from_pretrained(
    "/home/gridsan/omoll/xmodexp/notebooks/models/clip-vit-base-patch32")

trainable = make_trainable(
    num_epochs=args.max_epochs,
    gpus_per_trial=gpus_per_trial,
    dataset=bird_dataset,
    init_config=init_config,
    init_state_dict=init_state_dict,
    processor=processor,
)
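The trainable, scheduler, and reporter would then be handed to Ray Tune; a sketch assuming the legacy `tune.run` API consistent with the rest of this code. The search space, `num_samples`, and resource figures are placeholder assumptions.

# Sketch of launching the search; the config values, num_samples, and
# resources_per_trial here are placeholder assumptions.
analysis = tune.run(
    trainable,
    config={
        "batch_size": tune.choice([32, 64]),
        "val_batch_size": 64,
        "test_size": 0.2,
        "num_workers": 4,
    },
    metric=metric,
    mode="min",
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter,
    resources_per_trial={"cpu": 4, "gpu": gpus_per_trial},
)
print("best config:", analysis.best_config)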