Example #1
    def test_inference_interpolate_pos_encoding(self):
        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
        # which allows interpolating the pre-trained position embeddings so that the
        # model can be used at higher resolutions. The DINO model by Facebook AI leverages
        # this to visualize self-attention on higher-resolution images.
        model = ViTModel.from_pretrained("facebook/dino-vits8").to(
            torch_device)

        feature_extractor = ViTFeatureExtractor.from_pretrained(
            "facebook/dino-vits8", size=480)
        image = prepare_img()
        inputs = feature_extractor(images=image, return_tensors="pt")
        pixel_values = inputs.pixel_values.to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(pixel_values, interpolate_pos_encoding=True)

        # verify the last hidden states
        expected_shape = torch.Size((1, 3601, 384))
        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

        expected_slice = torch.tensor([[4.2340, 4.3906, -6.6692],
                                       [4.5463, 1.8928, -6.7257],
                                       [4.4429, 0.8496, -5.8585]]).to(torch_device)

        self.assertTrue(
            torch.allclose(outputs.last_hidden_state[0, :3, :3],
                           expected_slice,
                           atol=1e-4))
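
The expected sequence length follows directly from the interpolation: at resolution 480 with the ViT-S/8 patch size of 8, the image is split into 60 x 60 patches plus one [CLS] token. A quick sanity check (patch size and hidden size assumed from facebook/dino-vits8):

# Sanity check for the expected shape above (assumes facebook/dino-vits8: patch_size=8, hidden_size=384)
image_size = 480
patch_size = 8
num_patches = (image_size // patch_size) ** 2   # 60 * 60 = 3600
seq_len = num_patches + 1                       # +1 for the [CLS] token
assert seq_len == 3601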
Example #2
    def __init__(self, model_name="ydshieh/vit-gpt2-coco-en", device=None):
        """
        ```
        ImageCaptioner constructor

        Args:
          model_name(str): name of the image captioning model
          device(str): device to use (e.g., 'cuda', 'cpu')
        ```
        """
        if not I.PIL_INSTALLED:
            raise Exception(
                "PIL is not installed. Please install with: pip install pillow>=9.0.1"
            )

        super().__init__(device=device,
                         quantize=False,
                         min_transformers_version="4.12.3")
        self.model_name = model_name
        from transformers import (
            AutoTokenizer,
            VisionEncoderDecoderModel,
            ViTFeatureExtractor,
        )

        self.model = VisionEncoderDecoderModel.from_pretrained(
            self.model_name).to(self.torch_device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.extractor = ViTFeatureExtractor.from_pretrained(self.model_name)
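
A minimal sketch of how these attributes could be combined to caption an image; the helper name and generation settings are illustrative (mirroring Example #3 below), not necessarily the library's actual API:

# Illustrative helper, not the library's actual API: captions a PIL image with the
# model/tokenizer/extractor initialized in the constructor above.
def generate_caption(captioner, img):
    pixel_values = captioner.extractor(images=img, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(captioner.torch_device)
    output_ids = captioner.model.generate(pixel_values, max_length=16, num_beams=4)
    preds = captioner.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]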
Example #3
    def test_inference_coco_en(self):

        loc = "ydshieh/vit-gpt2-coco-en"

        feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
        tokenizer = AutoTokenizer.from_pretrained(loc)
        model = VisionEncoderDecoderModel.from_pretrained(loc)
        model.to(torch_device)
        model.eval()

        # We will verify our results on an image of cute cats
        img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)

        decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)

        with torch.no_grad():
            logits = model(pixel_values, decoder_input_ids)[0].detach().cpu().numpy()

        # verify the logits
        expected_shape = (1, 1, model.config.decoder.vocab_size)
        self.assertEqual(logits.shape, expected_shape)

        EXPECTED_LOGIT_SLICE = np.array(
            [
                -38.705807,
                -30.639929,
                -31.41903,
                -39.012012,
                -38.38696,
                -34.887207,
                -33.290855,
                -35.68447,
                -38.508484,
                -36.124645,
            ]
        )
        max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
        self.assertLessEqual(max_diff, 1e-4)

        def generate_step(pixel_values):

            outputs = model.generate(
                pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True
            )
            output_ids = outputs.sequences
            preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            preds = [pred.strip() for pred in preds]

            return preds, outputs.sequences_scores.detach().cpu().numpy()

        preds, scores = generate_step(pixel_values)

        EXPECTED_SCORES = np.array([-0.59562886])
        max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
        self.assertLessEqual(max_diff, 1e-4)

        # should produce
        # ["a cat laying on top of a couch next to another cat"]
        self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"])
Example #4
    def test_inference_coco_en(self):

        loc = "ydshieh/vit-gpt2-coco-en"

        feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
        tokenizer = AutoTokenizer.from_pretrained(loc)
        model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)

        img = prepare_img()
        pixel_values = feature_extractor(images=img,
                                         return_tensors="np").pixel_values

        decoder_input_ids = np.array([[model.config.decoder_start_token_id]])
        logits = model(pixel_values, decoder_input_ids)[0]
        logits = np.array(logits)

        # verify the logits
        expected_shape = (1, 1, model.config.decoder.vocab_size)
        self.assertEqual(logits.shape, expected_shape)

        EXPECTED_LOGIT_SLICE = np.array([
            -38.705837,
            -30.639936,
            -31.41905,
            -39.01204,
            -38.38698,
            -34.887215,
            -33.29087,
            -35.684475,
            -38.50852,
            -36.124676,
        ])
        max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE))
        self.assertLessEqual(max_diff, 1e-4)

        def generate_step(pixel_values):

            outputs = model.generate(pixel_values, max_length=16, num_beams=4)
            output_ids = outputs.sequences
            preds = tokenizer.batch_decode(output_ids,
                                           skip_special_tokens=True)
            preds = [pred.strip() for pred in preds]

            return preds, outputs.scores

        preds, scores = generate_step(pixel_values)

        EXPECTED_SCORES = np.array([-0.59563464])
        scores = np.array(scores)
        max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
        self.assertLessEqual(max_diff, 1e-4)

        # should produce
        # ["a cat laying on top of a couch next to another cat"]
        self.assertEqual(
            preds, ["a cat laying on top of a couch next to another cat"])
Example #5
def save_pretrained_model(config):
    try:

        feature_extractor = ViTFeatureExtractor.from_pretrained(
            'google/vit-base-patch16-224-in21k')
        model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

        feature_extractor.save_pretrained(config.pretrained_vitfe_path)
        model.save_pretrained(config.pretrained_vit_path)
    except Exception as e:
        print(f'Error - {str(e)}')
        return 1
    return 0
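
Loading the saved artifacts back later mirrors Example #7; a short sketch, with the paths being whatever the config object holds:

# Re-load the locally saved feature extractor and model from the paths used above
feature_extractor = ViTFeatureExtractor.from_pretrained(config.pretrained_vitfe_path)
model = ViTModel.from_pretrained(config.pretrained_vit_path)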
Example #6
File: demo.py Project: AK391/dino
def visualize(image):
    image = image.convert("RGB")
    feature_extractor = ViTFeatureExtractor(do_resize=True,
                                            size=224,
                                            image_mean=[0.485, 0.456, 0.406],
                                            image_std=[0.229, 0.224, 0.225])  # ImageNet normalization std
    img = feature_extractor(
        images=image, return_tensors="pt"
    ).pixel_values  # shape (1, 3, 224, 224); the batch dimension is kept

    outputs = model(pixel_values=img, output_attentions=True)

    # we are only interested in the attention maps of the last layer
    attentions = outputs['attentions'][-1]
    nh = attentions.shape[1]  # number of heads

    # we keep only the output patch attention
    attentions = attentions[0, :, 0, 1:].reshape(nh, -1)

    threshold = 0.6
    w_featmap = img.shape[-2] // model.config.patch_size
    h_featmap = img.shape[-1] // model.config.patch_size

    # we keep only a certain percentage of the mass
    val, idx = torch.sort(attentions)
    val /= torch.sum(val, dim=1, keepdim=True)
    cumval = torch.cumsum(val, dim=1)
    th_attn = cumval > (1 - threshold)
    idx2 = torch.argsort(idx)
    for head in range(nh):
        th_attn[head] = th_attn[head][idx2[head]]
    th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float()
    # interpolate
    th_attn = nn.functional.interpolate(th_attn.unsqueeze(0),
                                        scale_factor=model.config.patch_size,
                                        mode="nearest")[0].cpu().numpy()

    attentions = attentions.reshape(nh, w_featmap, h_featmap)
    attentions = nn.functional.interpolate(
        attentions.unsqueeze(0),
        scale_factor=model.config.patch_size,
        mode="nearest")[0].cpu()
    attentions = attentions.detach().numpy()

    # show and save attentions heatmaps

    plt.axis("off")
    plt.imshow(attentions[2])
    return plt
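
A possible driver for this function, assuming `model` is a module-level ViTModel loaded from the DINO checkpoint of Example #1 (file names are placeholders):

from PIL import Image
from transformers import ViTModel

model = ViTModel.from_pretrained("facebook/dino-vits8")  # assumed global used by visualize()
fig = visualize(Image.open("input.jpg"))
fig.savefig("attention_head_2.png", bbox_inches="tight")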
Example #7
File: train.py Project: ksg14/DHIC
def train (feature_extractor, model, decoder, dataloader):
	for image, caption, target, target_seq_len in dataloader:
		# print (f'image shape - {image.shape}')
		# print (f'caption - {caption.shape}')
		# print (f'target - {target.shape}')
		# print (f'target_seq_len shape- {target_seq_len.shape}')
		# print (f'target_seq_len - {target_seq_len}')

		# print (f'image[0].shape {image [0].shape}')

		# print (f'max - {image.max ()}')
		# print (f'min - {image.min ()}')

		images_list = [image [i] for i in range (config.batch_sz)]
		# print (type (images_list))
		# print (type (images_list [0]))
		# print (images_list [0].shape)

		inputs = feature_extractor(images=images_list, return_tensors="pt")
		outputs = model(**inputs, output_attentions=False, output_hidden_states=False)
		last_hidden_states = outputs.last_hidden_state

		print (f'output shape - {last_hidden_states.shape}')
		break


if __name__ == '__main__':
	config = Config ()

	text_transform = ToSequence (tokenizer=indic_tokenize.trivial_tokenize)
	image_transform = T.Compose ([T.ToTensor(), T.Resize ((224, 224))])

	train_dataset = HVGDataset (config.train_captions, config.word_to_index_path, config.index_to_word_path, config.images_path, config.max_len, text_transform=text_transform, image_transform=image_transform)
	train_dataloader = DataLoader (train_dataset, batch_size=config.batch_sz, shuffle=True)

	feature_extractor = ViTFeatureExtractor.from_pretrained(config.pretrained_vitfe_path)
	model = ViTModel.from_pretrained(config.pretrained_vit_path)

	

	train (feature_extractor=feature_extractor,
			model=model,
			decoder=decoder,
			dataloader=train_dataloader)
Example #8
def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViT structure.
    """

    # define default ViT configuration
    config = ViTConfig()
    base_model = False
    # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size
    if vit_name[-5:] == "in21k":
        base_model = True
        config.patch_size = int(vit_name[-12:-10])
        config.image_size = int(vit_name[-9:-6])
    else:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(
            open(cached_download(hf_hub_url(repo_id, filename)), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        config.patch_size = int(vit_name[-6:-4])
        config.image_size = int(vit_name[-3:])
    # size of the architecture
    if "deit" in vit_name:
        if vit_name[9:].startswith("tiny"):
            config.hidden_size = 192
            config.intermediate_size = 768
            config.num_hidden_layers = 12
            config.num_attention_heads = 3
        elif vit_name[9:].startswith("small"):
            config.hidden_size = 384
            config.intermediate_size = 1536
            config.num_hidden_layers = 12
            config.num_attention_heads = 6
        else:
            pass
    else:
        if vit_name[4:].startswith("small"):
            config.hidden_size = 768
            config.intermediate_size = 2304
            config.num_hidden_layers = 8
            config.num_attention_heads = 8
        elif vit_name[4:].startswith("base"):
            pass
        elif vit_name[4:].startswith("large"):
            config.hidden_size = 1024
            config.intermediate_size = 4096
            config.num_hidden_layers = 24
            config.num_attention_heads = 16
        elif vit_name[4:].startswith("huge"):
            config.hidden_size = 1280
            config.intermediate_size = 5120
            config.num_hidden_layers = 32
            config.num_attention_heads = 16

    # load original model from timm
    timm_model = timm.create_model(vit_name, pretrained=True)
    timm_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = timm_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if vit_name[-5:] == "in21k":
        model = ViTModel(config).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor
    if "deit" in vit_name:
        feature_extractor = DeiTFeatureExtractor(size=config.image_size)
    else:
        feature_extractor = ViTFeatureExtractor(size=config.image_size)
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    if base_model:
        timm_pooled_output = timm_model.forward_features(pixel_values)
        assert timm_pooled_output.shape == outputs.pooler_output.shape
        assert torch.allclose(timm_pooled_output,
                              outputs.pooler_output,
                              atol=1e-3)
    else:
        timm_logits = timm_model(pixel_values)
        assert timm_logits.shape == outputs.logits.shape
        assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {vit_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
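
Driving the conversion directly from Python would look like the call below; the timm model name and output directory are placeholders (the real script typically wraps this in argparse):

# Illustrative call; "vit_base_patch16_224" is a timm model name, the output path is a placeholder
convert_vit_checkpoint(vit_name="vit_base_patch16_224",
                       pytorch_dump_folder_path="./vit-base-patch16-224")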
Example #9
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, CustomTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Initialize our dataset.
    ds = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
    )

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in ds.keys(
    ) else data_args.train_val_split
    if isinstance(data_args.train_val_split,
                  float) and data_args.train_val_split > 0.0:
        split = ds["train"].train_test_split(data_args.train_val_split)
        ds["train"] = split["train"]
        ds["validation"] = split["test"]

    # Load pretrained model and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = ViTMAEConfig.from_pretrained(model_args.config_name,
                                              **config_kwargs)
    elif model_args.model_name_or_path:
        config = ViTMAEConfig.from_pretrained(model_args.model_name_or_path,
                                              **config_kwargs)
    else:
        config = ViTMAEConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    # adapt config
    config.update({
        "mask_ratio": model_args.mask_ratio,
        "norm_pix_loss": model_args.norm_pix_loss,
    })

    # create feature extractor
    if model_args.feature_extractor_name:
        feature_extractor = ViTFeatureExtractor.from_pretrained(
            model_args.feature_extractor_name, **config_kwargs)
    elif model_args.model_name_or_path:
        feature_extractor = ViTFeatureExtractor.from_pretrained(
            model_args.model_name_or_path, **config_kwargs)
    else:
        feature_extractor = ViTFeatureExtractor()

    # create model
    if model_args.model_name_or_path:
        model = ViTMAEForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = ViTMAEForPreTraining(config)

    if training_args.do_train:
        column_names = ds["train"].column_names
    else:
        column_names = ds["validation"].column_names

    if data_args.image_column_name is not None:
        image_column_name = data_args.image_column_name
    elif "image" in column_names:
        image_column_name = "image"
    elif "img" in column_names:
        image_column_name = "img"
    else:
        image_column_name = column_names[0]

    # transformations as done in original MAE paper
    # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
    transforms = Compose([
        Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        RandomResizedCrop(feature_extractor.size,
                          scale=(0.2, 1.0),
                          interpolation=InterpolationMode.BICUBIC),
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize(mean=feature_extractor.image_mean,
                  std=feature_extractor.image_std),
    ])

    def preprocess_images(examples):
        """Preprocess a batch of images by applying transforms."""

        examples["pixel_values"] = [
            transforms(image) for image in examples[image_column_name]
        ]
        return examples

    if training_args.do_train:
        if "train" not in ds:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(
                range(data_args.max_train_samples))
        # Set the training transforms
        ds["train"].set_transform(preprocess_images)

    if training_args.do_eval:
        if "validation" not in ds:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            ds["validation"] = (ds["validation"].shuffle(
                seed=training_args.seed).select(
                    range(data_args.max_eval_samples)))
        # Set the validation transforms
        ds["validation"].set_transform(preprocess_images)

    # Compute absolute learning rate
    total_train_batch_size = (training_args.train_batch_size *
                              training_args.gradient_accumulation_steps *
                              training_args.world_size)
    if training_args.base_learning_rate is not None:
        training_args.learning_rate = training_args.base_learning_rate * total_train_batch_size / 256

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"] if training_args.do_train else None,
        eval_dataset=ds["validation"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=collate_fn,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kwargs = {
        "tasks": "masked-auto-encoding",
        "dataset": data_args.dataset_name,
        "tags": ["masked-auto-encoding"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
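
The `collate_fn` handed to the Trainer is not part of this excerpt; a minimal version, stacking the per-example pixel values into a batch, might look like this (an assumption about the missing helper):

import torch

def collate_fn(examples):
    # stack the transformed images into a single (batch, channels, height, width) tensor
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    return {"pixel_values": pixel_values}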
Example #10
# @author Loreto Parisi (loretoparisi at gmail dot com)
# Copyright (c) 2021 Loreto Parisi (loretoparisi at gmail dot com)

import os, sys
import torch
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests

BASE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0, os.path.join(BASE_PATH, '..'))
from lpdutils.lpimagedataset import LPImageDataSet

# to choose a different model by image size, patch size, and number of parameters, see README
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-large-patch16-224',
    cache_dir=os.getenv("cache_dir", "../../models"))
model = ViTForImageClassification.from_pretrained(
    'google/vit-large-patch16-224',
    cache_dir=os.getenv("cache_dir", "../../models"))

# load local dataset
batch_size = 2
num_workers = 2
my_dataset = LPImageDataSet(os.path.join(
    os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'imagenet'),
                            transform=LPImageDataSet.transform)
imageloader = torch.utils.data.DataLoader(my_dataset,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          num_workers=num_workers)
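
The listing is cut off here; a plausible continuation runs the standard single-image classification check with the model and extractor already created (the COCO image URL is an assumption):

# Assumed continuation: classify one image and print the predicted ImageNet label
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])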
Example #11
def main():
    ds = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config,
        data_files=data_args.data_files,
        cache_dir=model_args.cache_dir,
    )

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in ds.keys(
    ) else data_args.train_val_split
    if isinstance(data_args.train_val_split,
                  float) and data_args.train_val_split > 0.0:
        split = ds["train"].train_test_split(data_args.train_val_split)
        ds["train"] = split["train"]
        ds["validation"] = split["test"]

    # Load pretrained model and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kw = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_version,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = ViTMAEConfig.from_pretrained(model_args.config_name,
                                              **config_kw)
    elif model_args.model_name:
        config = ViTMAEConfig.from_pretrained(model_args.model_name,
                                              **config_kw)
    else:
        config = ViTMAEConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    # adapt config
    config.update({
        "mask_ratio": model_args.mask_ratio,
        "norm_pix_loss": model_args.norm_pix_loss,
    })

    # create feature extractor
    if model_args.feature_extractor:
        feature_extractor = ViTFeatureExtractor.from_pretrained(
            model_args.feature_extractor, **config_kw)
    elif model_args.model_name:
        feature_extractor = ViTFeatureExtractor.from_pretrained(
            model_args.model_name, **config_kw)
    else:
        feature_extractor = ViTFeatureExtractor()

    # create model
    if model_args.model_name:
        model = ViTMAEForPreTraining.from_pretrained(
            model_args.model_name,
            from_tf=bool(".ckpt" in model_args.model_name),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_version,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model")
        model = ViTMAEForPreTraining(config)

    if training_args.do_train:
        column_names = ds["train"].column_names
    else:
        column_names = ds["validation"].column_names

    if data_args.image_column_name is not None:
        image_column_name = data_args.image_column_name
    elif "image" in column_names:
        image_column_name = "image"
    elif "img" in column_names:
        image_column_name = "img"
    else:
        image_column_name = column_names[0]

    # transformations as done in original MAE paper
    # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
    transforms = Compose([
        Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        RandomResizedCrop(feature_extractor.size,
                          scale=(0.2, 1.0),
                          interpolation=InterpolationMode.BICUBIC),
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize(mean=feature_extractor.image_mean,
                  std=feature_extractor.image_std),
    ])

    def preprocess_images(examples):
        """Preprocess a batch of images by applying transforms."""

        examples["pixel_values"] = [
            transforms(image) for image in examples[image_column_name]
        ]
        return examples

    if training_args.do_train:
        if "train" not in ds:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            ds["train"] = (ds["train"].shuffle(seed=training_args.seed).select(
                range(data_args.max_train_samples)))
        # Set the training transforms
        ds["train"].set_transform(preprocess_images)

    if training_args.do_eval:
        if "validation" not in ds:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            ds["validation"] = (ds["validation"].shuffle(
                seed=training_args.seed).select(
                    range(data_args.max_eval_samples)))
        # Set the validation transforms
        ds["validation"].set_transform(preprocess_images)

    # Compute absolute learning rate
    total_train_batch_size = (training_args.train_batch_size *
                              training_args.grad_accumulation_steps *
                              training_args.world_size)
    if training_args.base_lr is not None:
        training_args.lr = training_args.base_lr * total_train_batch_size / 256

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"] if training_args.do_train else None,
        eval_dataset=ds["validation"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=collate_fn,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kw = {
        "tasks": "masked-auto-encoding",
        "dataset": data_args.dataset_name,
        "tags": ["masked-auto-encoding"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kw)
    else:
        trainer.create_model_card(**kw)
Example #12
def main(args):

    if use_ViT_Enc:
        print("It is using ViT encoder!!!!")
        transform = None
        feature_extractor = ViTFeatureExtractor.from_pretrained(
            'google/vit-base-patch16-224-in21k')

    else:
        feature_extractor = None
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((args['image_size'], args['image_size'])),
            # The normalization parameters depend on the model we are going to use.
            # If we apply transfer learning from a model that was trained on ImageNet,
            # we should use the ImageNet statistics to normalize the dataset.
            # Otherwise we could simply normalize the values between -1 and 1 using
            # the standard mean and standard deviation.
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    dataset = Flickr8kDataset(dataset_folder='data',
                              transform=transform,
                              reduce=True,
                              vocab_max_size=args['vocabulary_size'],
                              feature_extractor=feature_extractor)

    # Create the model
    if use_ViT_Enc:
        model = ViTImageCaptioningModel(
            embed_size=args['embedding_dimension'],
            vocab=dataset.vocab,
            caption_max_length=args['captions_max_length'],
        ).to(device)
    else:
        model = ImageCaptioningModel(
            image_features_dim=args['image_features_dimension'],
            embed_size=args['embedding_dimension'],
            vocab=dataset.vocab,
            caption_max_length=args['captions_max_length'],
        ).to(device)

    # Perform the split of the dataset
    train_split, test_split = split_subsets(dataset, all_captions=True)

    train_loader = DataLoader(train_split,
                              shuffle=True,
                              batch_size=args['batch_size'],
                              collate_fn=CapsCollate(
                                  pad_idx=dataset.vocab.word_to_index['<PAD>'],
                                  batch_first=True))

    test_loader = DataLoader(test_split,
                             shuffle=True,
                             batch_size=args['batch_size'],
                             collate_fn=CapsCollate(
                                 pad_idx=dataset.vocab.word_to_index['<PAD>'],
                                 batch_first=True))

    optimizer = optim.Adam(model.parameters(),
                           lr=args['learning_rate'],
                           betas=(0.9, 0.98),
                           eps=1e-9)
    criterion = nn.CrossEntropyLoss(
        ignore_index=dataset.vocab.word_to_index['<PAD>'])

    train(num_epochs=args['epochs'],
          model=model,
          train_loader=train_loader,
          test_loader=test_loader,
          optimizer=optimizer,
          criterion=criterion,
          device=device,
          log_interval=args['log_interval'])
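
The `args` dictionary is expected to carry the hyperparameters read inside `main`; a hypothetical example with illustrative values (not the project's defaults):

args = {
    'image_size': 224,
    'vocabulary_size': 5000,
    'embedding_dimension': 256,
    'image_features_dimension': 2048,
    'captions_max_length': 20,
    'batch_size': 32,
    'learning_rate': 1e-4,
    'epochs': 10,
    'log_interval': 100,
}
main(args)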
Example #13
def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our VisionEncoderDecoderModel structure.
    """
    # define encoder and decoder configs based on checkpoint_url
    encoder_config = ViTConfig(image_size=384, qkv_bias=False)
    decoder_config = TrOCRConfig()

    # size of the architecture
    if "base" in checkpoint_url:
        decoder_config.encoder_hidden_size = 768
    elif "large" in checkpoint_url:
        # use ViT-large encoder
        encoder_config.hidden_size = 1024
        encoder_config.intermediate_size = 4096
        encoder_config.num_hidden_layers = 24
        encoder_config.num_attention_heads = 16
        decoder_config.encoder_hidden_size = 1024
    else:
        raise ValueError(
            "Should either find 'base' or 'large' in checkpoint URL")

    # the large-printed + stage1 checkpoints use sinusoidal position embeddings, with no layernorm afterwards
    if "large-printed" in checkpoint_url or "stage1" in checkpoint_url:
        decoder_config.tie_word_embeddings = False
        decoder_config.activation_function = "relu"
        decoder_config.max_position_embeddings = 1024
        decoder_config.scale_embedding = True
        decoder_config.use_learned_position_embeddings = False
        decoder_config.layernorm_embedding = False

    # load HuggingFace model
    encoder = ViTModel(encoder_config, add_pooling_layer=False)
    decoder = TrOCRForCausalLM(decoder_config)
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.eval()

    # load state_dict of original model, rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url,
                                                    map_location="cpu",
                                                    check_hash=True)["model"]

    rename_keys = create_rename_keys(encoder_config, decoder_config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, encoder_config)

    # remove parameters we don't need
    del state_dict["encoder.deit.head.weight"]
    del state_dict["encoder.deit.head.bias"]
    del state_dict["decoder.version"]

    # add prefix to decoder keys
    for key, val in state_dict.copy().items():
        val = state_dict.pop(key)
        if key.startswith("decoder") and "output_projection" not in key:
            state_dict["decoder.model." + key] = val
        else:
            state_dict[key] = val

    # load state dict
    model.load_state_dict(state_dict)

    # Check outputs on an image
    feature_extractor = ViTFeatureExtractor(size=encoder_config.image_size)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
    processor = TrOCRProcessor(feature_extractor, tokenizer)

    pixel_values = processor(images=prepare_img(checkpoint_url),
                             return_tensors="pt").pixel_values

    # verify logits
    decoder_input_ids = torch.tensor(
        [[model.config.decoder.decoder_start_token_id]])
    outputs = model(pixel_values=pixel_values,
                    decoder_input_ids=decoder_input_ids)
    logits = outputs.logits

    expected_shape = torch.Size([1, 1, 50265])
    if "trocr-base-handwritten" in checkpoint_url:
        expected_slice = torch.tensor([
            -1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764,
            1.7560, 8.7358, -1.5311
        ])
    elif "trocr-large-handwritten" in checkpoint_url:
        expected_slice = torch.tensor([
            -2.6437, -1.3129, -2.2596, -5.3455, 6.3539, 1.7604, 5.4991, 1.4702,
            5.6113, 2.0170
        ])
    elif "trocr-base-printed" in checkpoint_url:
        expected_slice = torch.tensor([
            -5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284,
            -1.0232, -1.9661, -3.9210
        ])
    elif "trocr-large-printed" in checkpoint_url:
        expected_slice = torch.tensor([
            -6.0162, -7.0959, 4.4155, -5.1063, 7.0468, -3.1631, 2.6466,
            -0.3081, -0.8106, -1.7535
        ])

    if "stage1" not in checkpoint_url:
        assert logits.shape == expected_shape, "Shape of logits not as expected"
        assert torch.allclose(
            logits[0, 0, :10], expected_slice,
            atol=1e-3), "First elements of logits not as expected"

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving processor to {pytorch_dump_folder_path}")
    processor.save_pretrained(pytorch_dump_folder_path)
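
An illustrative invocation; the checkpoint URL is a placeholder, but it must contain "base" or "large" (and, where relevant, "handwritten", "printed" or "stage1") for the branches above to pick the right configuration:

convert_tr_ocr_checkpoint(
    checkpoint_url="https://example.com/trocr-base-handwritten.pt",  # placeholder URL
    pytorch_dump_folder_path="./trocr-base-handwritten",
)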
Example #14
 def get_feature_extractor(self, **kwargs):
     return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
Example #15
def convert_vit_checkpoint(model_name,
                           pytorch_dump_folder_path,
                           base_model=True):
    """
    Copy/paste/tweak model's weights to our ViT structure.
    """

    # define default ViT configuration
    config = ViTConfig()
    # patch_size
    if model_name[-1] == "8":
        config.patch_size = 8
    # set labels if required
    if not base_model:
        config.num_labels = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    # size of the architecture
    if model_name in ["dino_vits8", "dino_vits16"]:
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6

    # load original model from torch hub
    original_model = torch.hub.load("facebookresearch/dino:main", model_name)
    original_model.eval()

    # load state_dict of original model, remove and rename some keys
    state_dict = original_model.state_dict()
    if base_model:
        remove_classification_head_(state_dict)
    rename_keys = create_rename_keys(config, base_model=base_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config, base_model)

    # load HuggingFace model
    if base_model:
        model = ViTModel(config, add_pooling_layer=False).eval()
    else:
        model = ViTForImageClassification(config).eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image, prepared by ViTFeatureExtractor
    feature_extractor = ViTFeatureExtractor()
    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
    pixel_values = encoding["pixel_values"]
    outputs = model(pixel_values)

    if base_model:
        final_hidden_state_cls_token = original_model(pixel_values)
        assert torch.allclose(final_hidden_state_cls_token,
                              outputs.last_hidden_state[:, 0, :],
                              atol=1e-1)
    else:
        logits = original_model(pixel_values)
        assert logits.shape == outputs.logits.shape
        assert torch.allclose(logits, outputs.logits, atol=1e-3)

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
Example #16
 def default_feature_extractor(self):
     return ViTFeatureExtractor.from_pretrained(
         "google/vit-base-patch16-224") if is_vision_available() else None
Example #17
 def default_feature_extractor(self):
     return ViTFeatureExtractor.from_pretrained(
         "facebook/vit-mae-base") if is_vision_available() else None
Example #18
# ### Data loading
#
# First we specify the pre-trained ViT model we are going to use. The
# model ["google/vit-base-patch16-224"]
# (https://huggingface.co/google/vit-base-patch16-224) is pre-trained
# on ImageNet-21k (14 million images, 21,843 classes) at resolution
# 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000
# classes) at resolution 224x224.
#
# We'll use a pre-trained ViT feature extractor that matches the ViT
# model to preprocess the input images.

VITMODEL = 'google/vit-base-patch16-224'

feature_extractor = ViTFeatureExtractor.from_pretrained(VITMODEL)

# Next we define functions to load and preprocess the images:


def _load_and_process_image(path, label):
    img = Image.open(path.numpy()).convert("RGB")
    proc_img = feature_extractor(images=img,
                                 return_tensors="np")['pixel_values']
    return np.squeeze(proc_img), label


def load_and_process_image(path, label):
    image, label = tf.py_function(_load_and_process_image, (path, label),
                                  (tf.float32, tf.int32))
    image.set_shape([None, None, None])
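
Assuming `_load_and_process_image` goes on to set the label shape and return the (image, label) pair, the wrapper is meant to be mapped over a tf.data pipeline; a short sketch, where the path and label lists are placeholders:

# Hypothetical pipeline built on load_and_process_image; image_paths/labels are placeholders
train_ds = tf.data.Dataset.from_tensor_slices((image_paths, labels))
train_ds = train_ds.map(load_and_process_image, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)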