Example #1
    def test_inference_interpolate_pos_encoding(self):
        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
        # allowing to interpolate the pre-trained position embeddings in order to use
        # the model on higher resolutions. The DINO model by Facebook AI leverages this
        # to visualize self-attention on higher resolution images.
        model = ViTModel.from_pretrained("facebook/dino-vits8").to(
            torch_device)

        feature_extractor = ViTFeatureExtractor.from_pretrained(
            "facebook/dino-vits8", size=480)
        image = prepare_img()
        inputs = feature_extractor(images=image, return_tensors="pt")
        pixel_values = inputs.pixel_values.to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(pixel_values, interpolate_pos_encoding=True)

        # verify the last hidden state
        expected_shape = torch.Size((1, 3601, 384))
        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

        expected_slice = torch.tensor([[4.2340, 4.3906, -6.6692],
                                       [4.5463, 1.8928, -6.7257],
                                       [4.4429, 0.8496, -5.8585]]).to(torch_device)

        self.assertTrue(
            torch.allclose(outputs.last_hidden_state[0, :3, :3],
                           expected_slice,
                           atol=1e-4))
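
The expected shape above follows directly from the interpolation: at 480x480 with DINO ViT-S/8 (patch size 8, hidden size 384), the encoder emits one token per 8x8 patch plus the [CLS] token. A quick arithmetic check of the 3601 value:

# Sanity check for the expected sequence length used in the test above.
image_size, patch_size, hidden_size = 480, 8, 384
num_patches = (image_size // patch_size) ** 2      # 60 * 60 = 3600
seq_length = num_patches + 1                       # + [CLS] token = 3601
assert (1, seq_length, hidden_size) == (1, 3601, 384)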
Example #2
def save_pretrained_model(config):
    try:

        feature_extractor = ViTFeatureExtractor.from_pretrained(
            'google/vit-base-patch16-224-in21k')
        model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

        feature_extractor.save_pretrained(config.pretrained_vitfe_path)
        model.save_pretrained(config.pretrained_vit_path)
    except Exception as e:
        print(f'Error - {e}')
        return 1
    return 0
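
The saved artifacts can later be reloaded from the same directories; a minimal sketch, assuming config exposes the same pretrained_vitfe_path and pretrained_vit_path attributes used above:

# Sketch: reloading the artifacts saved by save_pretrained_model (assumed paths).
from transformers import ViTFeatureExtractor, ViTModel

def load_pretrained_model(config):
    feature_extractor = ViTFeatureExtractor.from_pretrained(config.pretrained_vitfe_path)
    model = ViTModel.from_pretrained(config.pretrained_vit_path)
    return feature_extractor, model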
Example #3
    def __init__(self):
        super(Encoder_ViT_Pretrained, self).__init__()

        #        with open("model.pickle", "rb") as f:
        #            self.pretrained_model = pickle.load(f)
        self.pretrained_model = ViTModel.from_pretrained(
            'google/vit-base-patch16-224-in21k')

        #        print(self.pretrained_model)

        # Freeze all layers, as they will only be used for inference
        for param in self.pretrained_model.parameters():
            param.requires_grad = False
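
With every parameter frozen, the wrapped ViT behaves as a fixed feature extractor. A minimal standalone sketch of the same idea (the dummy batch and shapes are illustrative, not part of the original class):

# Sketch: frozen ViT backbone used purely for feature extraction (illustrative).
import torch
from transformers import ViTModel

backbone = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
for param in backbone.parameters():
    param.requires_grad = False   # no gradients flow into the backbone
backbone.eval()

pixel_values = torch.randn(2, 3, 224, 224)   # dummy batch of two images
with torch.no_grad():
    features = backbone(pixel_values).last_hidden_state   # shape (2, 197, 768)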
Example #4
 def create_and_check_model(self, config, pixel_values, labels):
     model = ViTModel(config=config)
     model.to(torch_device)
     model.eval()
     result = model(pixel_values)
     self.parent.assertEqual(
         result.last_hidden_state.shape,
         (self.batch_size, self.seq_length, self.hidden_size))
Example #5
 def create_and_check_model(self, config, pixel_values, labels):
     model = ViTModel(config=config)
     model.to(torch_device)
     model.eval()
     result = model(pixel_values)
     # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
     image_size = to_2tuple(self.image_size)
     patch_size = to_2tuple(self.patch_size)
     num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
     self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
Example #6
    def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self):
        config = self.get_encoder_decoder_config_small()

        # create two random ViT/GPT2 models for the ViT-GPT2 pair and initialize weights (+ cross-attention weights)
        encoder_pt = ViTModel(config.encoder).to(torch_device).eval()
        decoder_pt = GPT2LMHeadModel(config.decoder).to(torch_device).eval()

        encoder_decoder_pt = VisionEncoderDecoderModel(encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval()

        pixel_values = floats_tensor(
            [
                13,
                encoder_pt.config.num_channels,
                encoder_pt.config.image_size,
                encoder_pt.config.image_size,
            ]
        )
        decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size)

        pt_pixel_values = torch.tensor(pixel_values.numpy(), device=torch_device, dtype=torch.float)
        pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(), device=torch_device, dtype=torch.long)

        logits_pt = encoder_decoder_pt(pixel_values=pt_pixel_values, decoder_input_ids=pt_decoder_input_ids).logits

        # PyTorch => TensorFlow
        with tempfile.TemporaryDirectory() as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2:
            encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1)
            encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2)
            encoder_decoder_tf = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                tmp_dirname_1, tmp_dirname_2, encoder_from_pt=True, decoder_from_pt=True
            )

        logits_tf = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=3)

        # Make sure `from_pretrained` following `save_pretrained` works and gives the same result
        # (See https://github.com/huggingface/transformers/pull/14016)
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder_tf.save_pretrained(tmp_dirname)
            encoder_decoder_tf = TFVisionEncoderDecoderModel.from_pretrained(tmp_dirname)

            logits_tf_2 = encoder_decoder_tf(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits

            max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy()))
            self.assertAlmostEqual(max_diff, 0.0, places=3)
Example #7
File: train.py  Project: ksg14/DHIC
def train(feature_extractor, model, decoder, dataloader):
    for image, caption, target, target_seq_len in dataloader:
        # print(f'image shape - {image.shape}')
        # print(f'caption - {caption.shape}')
        # print(f'target - {target.shape}')
        # print(f'target_seq_len shape - {target_seq_len.shape}')
        # print(f'target_seq_len - {target_seq_len}')

        # print(f'image[0].shape {image[0].shape}')

        # print(f'max - {image.max()}')
        # print(f'min - {image.min()}')

        # split the batch tensor into a list of per-image tensors for the feature extractor
        images_list = [image[i] for i in range(config.batch_sz)]
        # print(type(images_list))
        # print(type(images_list[0]))
        # print(images_list[0].shape)

        inputs = feature_extractor(images=images_list, return_tensors="pt")
        outputs = model(**inputs, output_attentions=False, output_hidden_states=False)
        last_hidden_states = outputs.last_hidden_state

        print(f'output shape - {last_hidden_states.shape}')
        break


if __name__ == '__main__':
    config = Config()

    text_transform = ToSequence(tokenizer=indic_tokenize.trivial_tokenize)
    image_transform = T.Compose([T.ToTensor(), T.Resize((224, 224))])

    train_dataset = HVGDataset(config.train_captions, config.word_to_index_path, config.index_to_word_path, config.images_path, config.max_len, text_transform=text_transform, image_transform=image_transform)
    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_sz, shuffle=True)

    feature_extractor = ViTFeatureExtractor.from_pretrained(config.pretrained_vitfe_path)
    model = ViTModel.from_pretrained(config.pretrained_vit_path)

    # decoder is defined elsewhere in the project (not shown in this excerpt)
    train(feature_extractor=feature_extractor,
          model=model,
          decoder=decoder,
          dataloader=train_dataloader)
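
For reference, if the checkpoint saved by Example #2 (ViT-base/16 at 224x224) is the one loaded from config.pretrained_vit_path, the shape printed inside train should be (config.batch_sz, 197, 768):

# Expected encoder output shape for the loop above (assuming ViT-base/16, 224x224 inputs).
image_size, patch_size, hidden_size = 224, 16, 768
seq_length = (image_size // patch_size) ** 2 + 1   # 196 patches + [CLS] = 197
# last_hidden_states.shape -> torch.Size([config.batch_sz, 197, 768])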
Example #8
def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViT structure.
    """

    # define default ViT configuration
    config = ViTConfig()
    base_model = False
    # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size
    if vit_name[-5:] == "in21k":
        base_model = True
        config.patch_size = int(vit_name[-12:-10])
        config.image_size = int(vit_name[-9:-6])
    else:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(
            open(cached_download(hf_hub_url(repo_id, filename)), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        config.patch_size = int(vit_name[-6:-4])
        config.image_size = int(vit_name[-3:])
    # size of the architecture
    if "deit" in vit_name:
        if vit_name[9:].startswith("tiny"):
            config.hidden_size = 192
            config.intermediate_size = 768
            config.num_hidden_layers = 12
            config.num_attention_heads = 3
        elif vit_name[9:].startswith("small"):
            config.hidden_size = 384
            config.intermediate_size = 1536
            config.num_hidden_layers = 12
            config.num_attention_heads = 6
        else:
            pass
    else:
        if vit_name[4:].startswith("small"):
            config.hidden_size = 768
            config.intermediate_size = 2304
            config.num_hidden_layers = 8
            config.num_attention_heads = 8
        elif vit_name[4:].startswith("base"):
            pass
        elif vit_name[4:].startswith("large"):
            config.hidden_size = 1024
            config.intermediate_size = 4096
            config.num_hidden_layers = 24
            config.num_attention_heads = 16
        elif vit_name[4:].startswith("huge"):
            config.hidden_size = 1280
            config.intermediate_size = 5120
            config.num_hidden_layers = 32
            config.num_attention_heads = 16

    # load original model from timm
    timm_model = timm.create_model(vit_name, pretrained=True)
    timm_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = timm_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if vit_name[-5:] == "in21k":
        model = ViTModel(config).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor
    if "deit" in vit_name:
        feature_extractor = DeiTFeatureExtractor(size=config.image_size)
    else:
        feature_extractor = ViTFeatureExtractor(size=config.image_size)
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    if base_model:
        timm_pooled_output = timm_model.forward_features(pixel_values)
        assert timm_pooled_output.shape == outputs.pooler_output.shape
        assert torch.allclose(timm_pooled_output,
                              outputs.pooler_output,
                              atol=1e-3)
    else:
        timm_logits = timm_model(pixel_values)
        assert timm_logits.shape == outputs.logits.shape
        assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {vit_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
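
A conversion function like this is normally driven by a small command-line entry point; a minimal sketch (the flag names are assumptions based on the function signature above):

# Hypothetical CLI wrapper for convert_vit_checkpoint (flag names are assumptions).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--vit_name", default="vit_base_patch16_224", type=str,
                        help="Name of the timm ViT model to convert.")
    parser.add_argument("--pytorch_dump_folder_path", required=True, type=str,
                        help="Path to the output PyTorch model directory.")
    args = parser.parse_args()
    convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path)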
Example #9
 def get_vision_text_model(self, vision_config, text_config):
     vision_model = ViTModel(vision_config).eval()
     text_model = BertModel(text_config).eval()
     return vision_model, text_model
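
This helper pairs a ViT vision tower with a BERT text tower; in transformers such a pair can be wrapped into a VisionTextDualEncoderModel. A minimal sketch, assuming it runs in the same test context with valid ViT/BERT configs in scope:

# Sketch: wrapping the ViT/BERT pair above into a dual-encoder model (assumed test context).
from transformers import VisionTextDualEncoderModel

vision_model, text_model = self.get_vision_text_model(vision_config, text_config)
dual_encoder = VisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model).eval()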
Example #10
    def __init__(
        self,
        height: int,
        width: int,
        num_channels: int = 3,
        use_pretrained: bool = True,
        pretrained_model: str = "google/vit-base-patch16-224",
        saved_weights_in_checkpoint: bool = False,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        gradient_checkpointing: bool = False,
        patch_size: int = 16,
        trainable: bool = True,
        output_attentions: bool = False,
        **kwargs,
    ):
        """Creates a ViT encoder using transformers.ViTModel.

        use_pretrained: If True, uses a pretrained transformer based on the
            pretrained_model argument.
        pretrained_model: If str, expects the path to a pretrained model or the id of
            a model on huggingface.co, and ignores the configuration provided in
            the arguments.
        """
        super().__init__()
        try:
            from transformers import ViTConfig, ViTModel
        except ModuleNotFoundError:
            raise RuntimeError(
                "transformers is not installed. "
                "In order to install all image feature dependencies run "
                "pip install ludwig[image]")

        # map parameter input feature config names to internal names
        img_height = height
        img_width = width
        in_channels = num_channels

        img_width = img_width or img_height
        if img_width != img_height:
            raise ValueError("img_height and img_width should be identical.")
        self._input_shape = (in_channels, img_height, img_width)

        if use_pretrained and not saved_weights_in_checkpoint:
            self.transformer = ViTModel.from_pretrained(pretrained_model)
        else:
            config = ViTConfig(
                image_size=img_height,
                num_channels=in_channels,
                patch_size=patch_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                hidden_dropout_prob=hidden_dropout_prob,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                layer_norm_eps=layer_norm_eps,
                gradient_checkpointing=gradient_checkpointing,
            )
            self.transformer = ViTModel(config)

        if trainable:
            self.transformer.train()
        else:
            freeze_parameters(self.transformer)

        self._output_shape = (self.transformer.config.hidden_size, )
        self.output_attentions = output_attentions
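
The freeze_parameters helper called above is not shown in this snippet; a minimal implementation consistent with how it is used might look like this (an assumption, not the library's actual code):

# Assumed minimal implementation of the freeze_parameters helper used above.
import torch.nn as nn

def freeze_parameters(module: nn.Module) -> None:
    for param in module.parameters():
        param.requires_grad = False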
Example #11
 def test_model_from_pretrained(self):
     for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         model = ViTModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
Example #12
 def get_encoder_decoder_model(self, config, decoder_config):
     encoder_model = ViTModel(config).eval()
     decoder_model = TrOCRForCausalLM(decoder_config).eval()
     return encoder_model, decoder_model
Example #13
 def get_encoder_decoder_model(self, config, decoder_config):
     encoder_model = ViTModel(config).eval()
     decoder_model = BertLMHeadModel(decoder_config).eval()
     return encoder_model, decoder_model
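
These encoder/decoder pairs are combined exactly as in Example #6: the ViT encoder and the causal-LM decoder are wrapped into a VisionEncoderDecoderModel. A minimal sketch, assuming it runs in the same test context with valid configs in scope:

# Sketch: combining an encoder/decoder pair from the helpers above (assumed test context).
from transformers import VisionEncoderDecoderModel

encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
enc_dec = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model).eval()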
Example #14
File: demo.py  Project: AK391/dino
import torch
from torchvision import transforms as pth_transforms
import numpy as np
from PIL import Image
from transformers import ViTFeatureExtractor, ViTModel
import os
import gradio as gr

torch.hub.download_url_to_file(
    'https://cdn.pixabay.com/photo/2018/08/12/16/59/ara-3601194_1280.jpg',
    'parrot.jpg')
torch.hub.download_url_to_file(
    'https://cdn.pixabay.com/photo/2016/12/13/00/13/rabbit-1903016_1280.jpg',
    'rabbit.jpg')

# let's use the small DINO model (DeiT-S backbone) trained with a patch size of 8
model = ViTModel.from_pretrained("nielsr/dino_deits8", add_pooling_layer=False)


def apply_mask(image, mask, color, alpha=0.5):
    for c in range(3):
        image[:, :, c] = image[:, :, c] * (
            1 - alpha * mask) + alpha * mask * color[c] * 255
    return image


def random_colors(N, bright=True):
    """
    Generate random colors.
    """
    brightness = 1.0 if bright else 0.7
    hsv = [(i / N, 1, brightness) for i in range(N)]
Example #15
def convert_vit_checkpoint(model_name,
                           pytorch_dump_folder_path,
                           base_model=True):
    """
    Copy/paste/tweak model's weights to our ViT structure.
    """

    # define default ViT configuration
    config = ViTConfig()
    # patch_size
    if model_name[-1] == "8":
        config.patch_size = 8
    # set labels if required
    if not base_model:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    # size of the architecture
    if model_name in ["dino_vits8", "dino_vits16"]:
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6

    # load original model from torch hub
    original_model = torch.hub.load("facebookresearch/dino:main", model_name)
    original_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = original_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model=base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if base_model:
        model = ViTModel(config, add_pooling_layer=False).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor
    feature_extractor = ViTFeatureExtractor()
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    if base_model:
        final_hidden_state_cls_token = original_model(pixel_values)
        assert torch.allclose(final_hidden_state_cls_token,
                              outputs.last_hidden_state[:, 0, :],
                              atol=1e-1)
    else:
        logits = original_model(pixel_values)
        assert logits.shape == outputs.logits.shape
        assert torch.allclose(logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
Example #16
def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our VisionEncoderDecoderModel structure.
    """
    # define encoder and decoder configs based on checkpoint_url
    encoder_config = ViTConfig(image_size=384, qkv_bias=False)
    decoder_config = TrOCRConfig()

    # size of the architecture
    if "base" in checkpoint_url:
        decoder_config.encoder_hidden_size = 768
    elif "large" in checkpoint_url:
        # use ViT-large encoder
        encoder_config.hidden_size = 1024
        encoder_config.intermediate_size = 4096
        encoder_config.num_hidden_layers = 24
        encoder_config.num_attention_heads = 16
        decoder_config.encoder_hidden_size = 1024
    else:
        raise ValueError(
            "Should either find 'base' or 'large' in checkpoint URL")

    # the large-printed + stage1 checkpoints use sinusoidal position embeddings, with no layernorm afterwards
    if "large-printed" in checkpoint_url or "stage1" in checkpoint_url:
        decoder_config.tie_word_embeddings = False
        decoder_config.activation_function = "relu"
        decoder_config.max_position_embeddings = 1024
        decoder_config.scale_embedding = True
        decoder_config.use_learned_position_embeddings = False
        decoder_config.layernorm_embedding = False

    # load HuggingFace model
    encoder = ViTModel(encoder_config, add_pooling_layer=False)
    decoder = TrOCRForCausalLM(decoder_config)
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.eval()

    # load state_dict of original model, rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url,
                                                    map_location="cpu",
                                                    check_hash=True)["model"]

    rename_keys = create_rename_keys(encoder_config, decoder_config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, encoder_config)

    # remove parameters we don't need
    del state_dict["encoder.deit.head.weight"]
    del state_dict["encoder.deit.head.bias"]
    del state_dict["decoder.version"]

    # add prefix to decoder keys
    for key, val in state_dict.copy().items():
        val = state_dict.pop(key)
        if key.startswith("decoder") and "output_projection" not in key:
            state_dict["decoder.model." + key] = val
        else:
            state_dict[key] = val

    # load state dict
    model.load_state_dict(state_dict)

    # Check outputs on an image
    feature_extractor = ViTFeatureExtractor(size=encoder_config.image_size)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
    processor = TrOCRProcessor(feature_extractor, tokenizer)

    pixel_values = processor(images=prepare_img(checkpoint_url),
                             return_tensors="pt").pixel_values

    # verify logits
    decoder_input_ids = torch.tensor(
        [[model.config.decoder.decoder_start_token_id]])
    outputs = model(pixel_values=pixel_values,
                    decoder_input_ids=decoder_input_ids)
    logits = outputs.logits

    expected_shape = torch.Size([1, 1, 50265])
    if "trocr-base-handwritten" in checkpoint_url:
        expected_slice = torch.tensor([
            -1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764,
            1.7560, 8.7358, -1.5311
        ])
    elif "trocr-large-handwritten" in checkpoint_url:
        expected_slice = torch.tensor([
            -2.6437, -1.3129, -2.2596, -5.3455, 6.3539, 1.7604, 5.4991, 1.4702,
            5.6113, 2.0170
        ])
    elif "trocr-base-printed" in checkpoint_url:
        expected_slice = torch.tensor([
            -5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284,
            -1.0232, -1.9661, -3.9210
        ])
    elif "trocr-large-printed" in checkpoint_url:
        expected_slice = torch.tensor([
            -6.0162, -7.0959, 4.4155, -5.1063, 7.0468, -3.1631, 2.6466,
            -0.3081, -0.8106, -1.7535
        ])

    if "stage1" not in checkpoint_url:
        assert logits.shape == expected_shape, "Shape of logits not as expected"
        assert torch.allclose(
            logits[0, 0, :10], expected_slice,
            atol=1e-3), "First elements of logits not as expected"

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving processor to {pytorch_dump_folder_path}")
    processor.save_pretrained(pytorch_dump_folder_path)
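
Once converted and saved, the checkpoint can be used for OCR-style generation; a minimal inference sketch that loads from the dump folder produced above (the image path is a placeholder):

# Sketch: text recognition with the converted TrOCR checkpoint (image path is a placeholder).
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained(pytorch_dump_folder_path)
model = VisionEncoderDecoderModel.from_pretrained(pytorch_dump_folder_path)

image = Image.open("text_line.png").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])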