def get_swinv2_config(swinv2_name):
    config = Swinv2Config()
    name_split = swinv2_name.split("_")

    model_size = name_split[1]
    if "to" in name_split[3]:
        img_size = int(name_split[3][-3:])
    else:
        img_size = int(name_split[3])
    if "to" in name_split[2]:
        window_size = int(name_split[2][-2:])
    else:
        window_size = int(name_split[2][6:])

    if model_size == "tiny":
        embed_dim = 96
        depths = (2, 2, 6, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "small":
        embed_dim = 96
        depths = (2, 2, 18, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "base":
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
    else:
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)

    if "to" in swinv2_name:
        config.pretrained_window_sizes = (12, 12, 12, 6)

    if ("22k" in swinv2_name) and ("to" not in swinv2_name):
        num_classes = 21841
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-22k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    else:
        num_classes = 1000
        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    config.image_size = img_size
    config.num_labels = num_classes
    config.embed_dim = embed_dim
    config.depths = depths
    config.num_heads = num_heads
    config.window_size = window_size

    return config
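A minimal usage sketch of the name parsing above; the timm-style model names are assumptions, not an exhaustive list of what the conversion script supports, and each call also fetches the matching id2label file from the Hub.
# Hedged usage sketch: the names below are assumed timm-style swinv2 names.
for name in ["swinv2_tiny_window8_256", "swinv2_base_window12to16_192to256_22kft1k"]:
    config = get_swinv2_config(name)
    print(name, config.image_size, config.window_size, config.embed_dim)
    # -> 256 / 8 / 96 for the first name, 256 / 16 / 128 for the second (under these assumptions)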
Example #2
def from_hub(repo_id: str, **kwargs: Any):
    """Instantiate & load a pretrained model from HF hub.

    >>> from doctr.models import from_hub
    >>> model = from_hub("mindee/fasterrcnn_mobilenet_v3_large_fpn")

    Args:
        repo_id: HuggingFace model hub repo
        kwargs: kwargs of `hf_hub_download` or `snapshot_download`

    Returns:
        Model loaded with the checkpoint
    """

    # Get the config
    with open(hf_hub_download(repo_id, filename="config.json", **kwargs),
              "rb") as f:
        cfg = json.load(f)

    arch = cfg["arch"]
    task = cfg["task"]
    cfg.pop("arch")
    cfg.pop("task")

    if task == "classification":
        model = models.classification.__dict__[arch](
            pretrained=False,
            classes=cfg["classes"],
            num_classes=cfg["num_classes"])
    elif task == "detection":
        model = models.detection.__dict__[arch](pretrained=False)
    elif task == "recognition":
        model = models.recognition.__dict__[arch](
            pretrained=False,
            input_shape=cfg["input_shape"],
            vocab=cfg["vocab"])
    elif task == "obj_detection" and is_torch_available():
        model = models.obj_detection.__dict__[arch](
            pretrained=False,
            image_mean=cfg["mean"],
            image_std=cfg["std"],
            max_size=cfg["input_shape"][-1],
            num_classes=len(cfg["classes"]),
        )

    # update model cfg
    model.cfg = cfg

    # Load checkpoint
    if is_torch_available():
        state_dict = torch.load(hf_hub_download(repo_id,
                                                filename="pytorch_model.bin",
                                                **kwargs),
                                map_location="cpu")
        model.load_state_dict(state_dict)
    else:  # tf
        repo_path = snapshot_download(repo_id, **kwargs)
        model.load_weights(os.path.join(repo_path, "tf_model", "weights"))

    return model
Example #3
def download_from_string(
    path, module, keys, package, quantized=False
):
    model = path
    repo_id = f'{HUGGINGFACE_USERNAME}/{module}-{model}'

    if quantized:
        repo_id = f'{repo_id}-quantized'
        try:
            list_repo_files(repo_id)
        except Exception:
            raise ValueError(
                f'Quantized model for `{os.path.join(module, model)}` is not available, please load normal model.'
            )
        logger.warning('Load quantized model will cause accuracy drop.')

    files = {}
    for k, file in keys.items():
        if '/' in file:
            splitted = os.path.split(file)
            repo_id_ = splitted[0].replace('/', '-')
            repo_id_ = f'{HUGGINGFACE_USERNAME}/{repo_id_}'
            file = splitted[1]
        else:
            repo_id_ = repo_id
        files[k] = hf_hub_download(repo_id_, file)

    return files
Example #4
def get_mobilevit_config(mobilevit_name):
    config = MobileViTConfig()

    # size of the architecture
    if "mobilevit_s" in mobilevit_name:
        config.hidden_sizes = [144, 192, 240]
        config.neck_hidden_sizes = [16, 32, 64, 96, 128, 160, 640]
    elif "mobilevit_xs" in mobilevit_name:
        config.hidden_sizes = [96, 120, 144]
        config.neck_hidden_sizes = [16, 32, 48, 64, 80, 96, 384]
    elif "mobilevit_xxs" in mobilevit_name:
        config.hidden_sizes = [64, 80, 96]
        config.neck_hidden_sizes = [16, 16, 24, 48, 64, 80, 320]
        config.hidden_dropout_prob = 0.05
        config.expand_ratio = 2.0

    if mobilevit_name.startswith("deeplabv3_"):
        config.image_size = 512
        config.output_stride = 16
        config.num_labels = 21
        filename = "pascal-voc-id2label.json"
    else:
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"

    repo_id = "datasets/huggingface/label-files"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
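A hedged usage sketch for the MobileViT helper; the model names are assumptions based on the branches above.
# A "deeplabv3_" prefix selects the 21 PASCAL VOC labels; plain names fall back to ImageNet-1k.
cls_config = get_mobilevit_config("mobilevit_xs")           # 1000 labels, xs hidden sizes
seg_config = get_mobilevit_config("deeplabv3_mobilevit_s")  # 21 labels, image_size 512, output_stride 16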
Example #5
def get_yolos_config(yolos_name):
    config = YolosConfig()

    # size of the architecture
    if "yolos_ti" in yolos_name:
        config.hidden_size = 192
        config.intermediate_size = 768
        config.num_hidden_layers = 12
        config.num_attention_heads = 3
        config.image_size = [800, 1333]
        config.use_mid_position_embeddings = False
    elif yolos_name == "yolos_s_dWr":
        config.hidden_size = 330
        config.num_hidden_layers = 14
        config.num_attention_heads = 6
        config.intermediate_size = 1320
    elif "yolos_s" in yolos_name:
        config.hidden_size = 384
        config.intermediate_size = 1536
        config.num_hidden_layers = 12
        config.num_attention_heads = 6
    elif "yolos_b" in yolos_name:
        config.image_size = [800, 1344]

    config.num_labels = 91
    repo_id = "datasets/huggingface/label-files"
    filename = "coco-detection-id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config
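And a quick hedged check of the YOLOS variant selection (the name is assumed from the branches above).
# Every variant ends up with the 91 COCO detection labels; only the backbone dimensions differ.
tiny_config = get_yolos_config("yolos_ti")
print(tiny_config.hidden_size, tiny_config.num_labels)  # 192, 91 under these assumptions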
Example #6
def get_videomae_config(model_name):
    config = VideoMAEConfig()

    if "large" in model_name:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        config.decoder_num_hidden_layers = 12
        config.decoder_num_attention_heads = 8
        config.decoder_hidden_size = 512
        config.decoder_intermediate_size = 2048

    if "finetuned" not in model_name:
        config.use_mean_pooling = False

    if "finetuned" in model_name:
        repo_id = "datasets/huggingface/label-files"
        if "kinetics" in model_name:
            config.num_labels = 400
            filename = "kinetics400-id2label.json"
        elif "ssv2" in model_name:
            config.num_labels = 174
            filename = "something-something-v2-id2label.json"
        else:
            raise ValueError(
                "Model name should either contain 'kinetics' or 'ssv2' in case it's fine-tuned."
            )
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    return config
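A hedged sketch of how the VideoMAE name is interpreted; the names below are illustrative.
# Fine-tuned checkpoints must name their label set; anything else raises a ValueError.
k400_config = get_videomae_config("videomae-base-finetuned-kinetics")  # 400 Kinetics-400 labels
ssv2_config = get_videomae_config("videomae-large-finetuned-ssv2")     # 174 SSv2 labels, large backbone
# get_videomae_config("videomae-base-finetuned") would raise ValueError.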
Example #7
def convert_weights_and_push(save_directory: Path,
                             model_name: str = None,
                             push_to_hub: bool = True):
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000
    expected_shape = (1, num_labels)

    repo_id = "datasets/huggingface/label-files"
    num_labels = num_labels
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}

    id2label = id2label
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(ResNetConfig,
                                       num_labels=num_labels,
                                       id2label=id2label,
                                       label2id=label2id)

    names_to_config = {
        "resnet18":
        ImageNetPreTrainedConfig(depths=[2, 2, 2, 2],
                                 hidden_sizes=[64, 128, 256, 512],
                                 layer_type="basic"),
        "resnet26":
        ImageNetPreTrainedConfig(depths=[2, 2, 2, 2],
                                 hidden_sizes=[256, 512, 1024, 2048],
                                 layer_type="bottleneck"),
        "resnet34":
        ImageNetPreTrainedConfig(depths=[3, 4, 6, 3],
                                 hidden_sizes=[64, 128, 256, 512],
                                 layer_type="basic"),
        "resnet50":
        ImageNetPreTrainedConfig(depths=[3, 4, 6, 3],
                                 hidden_sizes=[256, 512, 1024, 2048],
                                 layer_type="bottleneck"),
        "resnet101":
        ImageNetPreTrainedConfig(depths=[3, 4, 23, 3],
                                 hidden_sizes=[256, 512, 1024, 2048],
                                 layer_type="bottleneck"),
        "resnet152":
        ImageNetPreTrainedConfig(depths=[3, 8, 36, 3],
                                 hidden_sizes=[256, 512, 1024, 2048],
                                 layer_type="bottleneck"),
    }

    if model_name:
        convert_weight_and_push(model_name, names_to_config[model_name],
                                save_directory, push_to_hub)
    else:
        for model_name, config in names_to_config.items():
            convert_weight_and_push(model_name, config, save_directory,
                                    push_to_hub)
    return config, expected_shape
Example #8
def download_checkpoints(dataset=None):
    """Downloads the checkpoints from the HuggingFace Hub for the specified
    dataset

    Args:
        - dataset (str): one of 'fastmri' or 'OASIS'. Defaults to None,
        which means that both are downloaded.
    """
    if dataset is None:
        for d in ['fastmri', 'OASIS']:
            download_checkpoints(dataset=d)
    else:
        for model_name in MODEL_NAMES:
            repo_id = REPO_ID_BASE.format(model_name=model_name,
                                          dataset=dataset)
            hf_hub_download(
                repo_id=repo_id,
                filename='model_weights.h5',
                cache_dir=Path(CHECKPOINTS_DIR) / 'checkpoints',
                force_filename=f'{model_name}-{dataset}.hdf5',
            )
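Usage follows directly from the docstring (MODEL_NAMES, REPO_ID_BASE and CHECKPOINTS_DIR are module-level constants in the surrounding code).
download_checkpoints("fastmri")  # fetch only the fastMRI weights
download_checkpoints()           # no argument: fetch both fastMRI and OASIS weights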
Example #9
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs):
        if os.path.isdir(pretrained_model_name_or_path):
            block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME)
        else:
            block_records_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs
            )
        block_records = np.load(block_records_path, allow_pickle=True)

        tokenizer = RealmTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

        return cls(block_records, tokenizer)
Example #10
    def test_inference_for_pretraining(self):
        model = VideoMAEForPreTraining.from_pretrained(
            "MCG-NJU/videomae-base-short").to(torch_device)

        feature_extractor = self.default_feature_extractor
        video = prepare_video()
        inputs = feature_extractor(video, return_tensors="pt").to(torch_device)

        # add boolean mask, indicating which patches to mask
        local_path = hf_hub_download(
            repo_id="hf-internal-testing/bool-masked-pos",
            filename="bool_masked_pos.pt")
        inputs["bool_masked_pos"] = torch.load(local_path)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor(
            [[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302],
             [0.5862, 0.7468, 0.7325]],
            device=torch_device)
        self.assertEqual(outputs.logits.shape, expected_shape)
        self.assertTrue(
            torch.allclose(outputs.logits[0, :3, :3],
                           expected_slice,
                           atol=1e-4))

        # verify the loss (`config.norm_pix_loss` = `True`)
        expected_loss = torch.tensor([0.5142], device=torch_device)
        self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))

        # verify the loss (`config.norm_pix_loss` = `False`)
        model = VideoMAEForPreTraining.from_pretrained(
            "MCG-NJU/videomae-base-short",
            norm_pix_loss=False).to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        expected_loss = torch.tensor([0.6469], device=torch_device)
        self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4))
Example #11
def get_convnext_config(checkpoint_url):
    config = ConvNextConfig()

    if "tiny" in checkpoint_url:
        depths = [3, 3, 9, 3]
        hidden_sizes = [96, 192, 384, 768]
    if "small" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [96, 192, 384, 768]
    if "base" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [128, 256, 512, 1024]
    if "large" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [192, 384, 768, 1536]
    if "xlarge" in checkpoint_url:
        depths = [3, 3, 27, 3]
        hidden_sizes = [256, 512, 1024, 2048]

    if "1k" in checkpoint_url:
        num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        expected_shape = (1, 1000)
    else:
        num_labels = 21841
        filename = "imagenet-22k-id2label.json"
        expected_shape = (1, 21841)

    repo_id = "datasets/huggingface/label-files"
    config.num_labels = num_labels
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    if "1k" not in checkpoint_url:
        # this dataset contains 21843 labels but the model only has 21841
        # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18
        del id2label[9205]
        del id2label[15027]
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    config.hidden_sizes = hidden_sizes
    config.depths = depths

    return config, expected_shape
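A hedged sketch; the URL below is purely illustrative, since only its substrings drive the config selection and only the label file is actually downloaded.
# Hypothetical URL: "tiny" picks the depths/hidden sizes, "1k" picks the 1000-label head.
config, expected_shape = get_convnext_config("https://example.com/convnext_tiny_1k_224.pth")
print(config.depths, config.num_labels, expected_shape)  # [3, 3, 9, 3], 1000, (1, 1000)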
Example #12
    def _get_hf_hub_pretrained_model_info(cls,
                                          model_name: str,
                                          refresh_cache: bool = False
                                          ) -> (type, str):
        """
        Resolve the HuggingFace Hub model pretrained information given a model name.
        The model name must be of general syntax ``{source_repo}/{model_name}``.

        Note:
            The ``{source_repo}`` need not be ``nvidia``, it can be any public repository, even external to Nvidia.
            This allows public, externally contributed models to be run freely using Nvidia NeMo.

        Args:
            model_name: Str name of the model, in the Hub form ``{source_repo}/{model_name}`` described above.
            refresh_cache: Bool, determines whether cache must be refreshed (model is re-downloaded).

        Returns:
            A tuple of details describing :
            -   The resolved class of the model. Since the source is external to NeMo, always default to using
                the calling class. Depend on target class resolution by restore_from() for calling the correct class.
            -   The path to the NeMo model (.nemo file) in some cached directory (managed by HF Hub).
        """
        # Resolve the model name without origin for filename
        resolved_model_filename = model_name.split("/")[-1] + '.nemo'

        # Check if api token exists, use if it does
        is_token_available = HfFolder.get_token() is not None

        # Try to load the model from the Huggingface Hub
        path = hf_hub_download(
            repo_id=model_name,
            filename=resolved_model_filename,
            library_name="nemo",
            library_version=nemo.__version__,
            force_download=refresh_cache,
            use_auth_token=is_token_available,
        )

        # Cannot pre-resolve the specific class without double instantiation (first for config, second for model params)
        # Default to current class, and perform basic class path resolution (handled via restore_from() + target class)
        class_ = cls

        return class_, path
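A hedged call-site sketch; the repo id is illustrative and the calling class can be any ModelPT subclass, as the docstring notes.
# Hypothetical caller: an ASR model class standing in for "cls".
from nemo.collections.asr.models import EncDecCTCModel

cls_, nemo_path = EncDecCTCModel._get_hf_hub_pretrained_model_info(
    model_name="nvidia/stt_en_conformer_ctc_small"  # resolved to stt_en_conformer_ctc_small.nemo
)
model = cls_.restore_from(nemo_path)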
Example #13
def download_from_dict(file, s3_file, package, quantized=False):
    home, _ = _get_home(package=package)
    if quantized:
        if 'quantized' not in file:
            f = file['model'].replace(home, '').split('/')
            raise ValueError(
                f'Quantized model for {f[1]} module is not available, please load normal model.'
            )
        model = 'quantized'
        logger.warning('Load quantized model will cause accuracy drop.')
    else:
        model = 'model'

    files = {}
    for k, file in s3_file.items():
        base_path, file = os.path.split(file)
        base_path = base_path.replace('/', '-')
        base_path = f'{HUGGINGFACE_USERNAME}/{base_path}'
        logger.info(f'downloading frozen {base_path}/{file}')
        files[k] = hf_hub_download(base_path, file)
    return files
Example #14
def load_trained_model(name_or_path):
    import huggingface_hub as HFhub
    tokenizer = AutoTokenizer.from_pretrained(name_or_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(name_or_path)
    # load preprocessing_kwargs from the model repo on HF hub, or from the local model directory
    kwargs_filename = None
    if name_or_path.startswith(
            "kleinay/"
    ) and 'preprocessing_kwargs.json' in HFhub.list_repo_files(name_or_path):
        kwargs_filename = HFhub.hf_hub_download(
            repo_id=name_or_path, filename="preprocessing_kwargs.json")
    elif Path(name_or_path).is_dir() and (Path(name_or_path) /
                                          "experiment_kwargs.json").exists():
        kwargs_filename = Path(name_or_path) / "experiment_kwargs.json"

    if kwargs_filename:
        preprocessing_kwargs = json.load(open(kwargs_filename))
        # integrate into model.config (for decoding args, e.g. "num_beams"), and save also as standalone object for preprocessing
        model.config.preprocessing_kwargs = Namespace(**preprocessing_kwargs)
        model.config.update(preprocessing_kwargs)
    return model, tokenizer
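A hedged usage sketch; the repo id below is illustrative.
# Loads the seq2seq model and, when the repo ships preprocessing_kwargs.json, folds it into model.config.
model, tokenizer = load_trained_model("kleinay/qanom-seq2seq-model-baseline")
num_beams = getattr(model.config, "num_beams", None)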
Example #15
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kw,
    ):
        if os.path.isdir(pretrained_model_name_or_path):
            block_records_path = os.path.join(pretrained_model_name_or_path,
                                              _REALM_BLOCK_RECORDS_FILENAME)
        else:
            block_records_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=_REALM_BLOCK_RECORDS_FILENAME,
                **kw,
            )
        block_records = np.load(block_records_path, allow_pickle=True)

        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, *init_inputs, **kw)

        return cls(block_records, tokenizer)
Example #16
    def __init__(self, model_id: str):

        # Reload Keras SavedModel
        self.model = from_pretrained_keras(model_id)

        # Handle binary classification with a single output unit.
        self.single_output_unit = self.model.output_shape[1] == 1
        self.num_labels = 2 if self.single_output_unit else self.model.output_shape[
            1]

        # Config is required to know the mapping to label.
        config_file = hf_hub_download(model_id, filename=CONFIG_FILENAME)
        with open(config_file) as config:
            config = json.load(config)

        self.id2label = config.get(
            "id2label", {str(i): f"LABEL_{i}"
                         for i in range(self.num_labels)})

        # Define PIL Interpolation method. Uses Image.NEAREST by default.
        self.interpolation = _PIL_INTERPOLATION_METHODS.get(
            config.get("interpolation", "nearest"), None)
        # Return at most the top 5 predicted classes
        self.top_k = 5
Example #17
def main():
    args = parse_args()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_semantic_segmentation_no_trainer", args)

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
    # in the environment
    accelerator = (Accelerator(log_with=args.report_to,
                               logging_dir=args.output_dir)
                   if args.with_tracking else Accelerator())
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    # We set device_specific to True as we want different data augmentation per device.
    if args.seed is not None:
        set_seed(args.seed, device_specific=True)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name,
                                               token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"),
                      "w+") as gitignore:
                if "step_*" not in gitignore:
                    gitignore.write("step_*\n")
                if "epoch_*" not in gitignore:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Load dataset
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # TODO support datasets from local folders
    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)

    # Rename column names to standardized names (only "image" and "label" need to be present)
    if "pixel_values" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"pixel_values": "image"})
    if "annotation" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"annotation": "label"})

    # If we don't have a validation split, split off a percentage of train as validation.
    args.train_val_split = None if "validation" in dataset.keys(
    ) else args.train_val_split
    if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
        split = dataset["train"].train_test_split(args.train_val_split)
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]

    # Prepare label mappings.
    # We'll include these in the model's config to get human readable labels in the Inference API.
    if args.dataset_name == "scene_parse_150":
        repo_id = "datasets/huggingface/label-files"
        filename = "ade20k-id2label.json"
    else:
        repo_id = f"datasets/{args.dataset_name}"
        filename = "id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    # Load pretrained model and feature extractor
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        id2label=id2label,
                                        label2id=label2id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        args.model_name_or_path)
    model = AutoModelForSemanticSegmentation.from_pretrained(
        args.model_name_or_path, config=config)

    # Preprocessing the datasets
    # Define torchvision transforms to be applied to each image + target.
    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
    train_transforms = Compose([
        ReduceLabels() if args.reduce_labels else Identity(),
        RandomCrop(size=feature_extractor.size),
        RandomHorizontalFlip(flip_prob=0.5),
        PILToTensor(),
        ConvertImageDtype(torch.float),
        Normalize(mean=feature_extractor.image_mean,
                  std=feature_extractor.image_std),
    ])
    # Define torchvision transform to be applied to each image.
    # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
    val_transforms = Compose([
        ReduceLabels() if args.reduce_labels else Identity(),
        Resize(size=(feature_extractor.size, feature_extractor.size)),
        PILToTensor(),
        ConvertImageDtype(torch.float),
        Normalize(mean=feature_extractor.image_mean,
                  std=feature_extractor.image_std),
    ])

    def preprocess_train(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"],
                                 example_batch["label"]):
            image, target = train_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)

        return encoding

    def preprocess_val(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"],
                                 example_batch["label"]):
            image, target = val_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)

        return encoding

    with accelerator.main_process_first():
        train_dataset = dataset["train"].with_transform(preprocess_train)
        eval_dataset = dataset["validation"].with_transform(preprocess_val)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=default_data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=default_data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    optimizer = torch.optim.AdamW(
        list(model.parameters()),
        lr=args.learning_rate,
        betas=[args.adam_beta1, args.adam_beta2],
        eps=args.adam_epsilon,
    )

    # Figure out how many steps we should save the Accelerator states
    if hasattr(args.checkpointing_steps, "isdigit"):
        checkpointing_steps = args.checkpointing_steps
        if args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
    else:
        checkpointing_steps = None

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps /
                                      num_update_steps_per_epoch)

    # Instantiate metric
    metric = evaluate.load("mean_iou")

    # We need to initialize the trackers we use, and also store our configuration.
    # We initialize the trackers only on main process because `accelerator.log`
    # only logs on main process and we don't want empty logs/runs on other processes.
    if args.with_tracking:
        if accelerator.is_main_process:
            experiment_config = vars(args)
            # TensorBoard cannot log Enums, need the raw value
            experiment_config["lr_scheduler_type"] = experiment_config[
                "lr_scheduler_type"].value
            accelerator.init_trackers("semantic_segmentation_no_trainer",
                                      experiment_config)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
            accelerator.print(
                f"Resumed from checkpoint: {args.resume_from_checkpoint}")
            accelerator.load_state(args.resume_from_checkpoint)
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
            dirs.sort(key=os.path.getctime)
            path = dirs[-1]  # after sorting, the most recent checkpoint is last
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    for epoch in range(starting_epoch, args.num_train_epochs):
        if args.with_tracking:
            total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    completed_steps += 1
                    continue
            outputs = model(**batch)
            loss = outputs.loss
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps }"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

                    if args.push_to_hub and epoch < args.num_train_epochs - 1:
                        accelerator.wait_for_everyone()
                        unwrapped_model = accelerator.unwrap_model(model)
                        unwrapped_model.save_pretrained(
                            args.output_dir,
                            is_main_process=accelerator.is_main_process,
                            save_function=accelerator.save,
                        )
                        if accelerator.is_main_process:
                            feature_extractor.save_pretrained(args.output_dir)
                            repo.push_to_hub(
                                commit_message=
                                f"Training in progress {completed_steps} steps",
                                blocking=False,
                                auto_lfs_prune=True,
                            )

            if completed_steps >= args.max_train_steps:
                break

        logger.info("***** Running evaluation *****")
        model.eval()
        samples_seen = 0
        for step, batch in enumerate(
                tqdm(eval_dataloader,
                     disable=not accelerator.is_local_main_process)):
            with torch.no_grad():
                outputs = model(**batch)

            upsampled_logits = torch.nn.functional.interpolate(
                outputs.logits,
                size=batch["labels"].shape[-2:],
                mode="bilinear",
                align_corners=False)
            predictions = upsampled_logits.argmax(dim=1)

            predictions, references = accelerator.gather(
                (predictions, batch["labels"]))

            # If we are in a multiprocess environment, the last batch has duplicates
            if accelerator.num_processes > 1:
                if step == len(eval_dataloader) - 1:
                    predictions = predictions[:len(eval_dataloader.dataset) -
                                              samples_seen]
                    references = references[:len(eval_dataloader.dataset) -
                                            samples_seen]
                else:
                    samples_seen += references.shape[0]

            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metrics = metric.compute(
            num_labels=len(id2label),
            ignore_index=255,
            reduce_labels=False,  # we've already reduced the labels before
        )
        logger.info(f"epoch {epoch}: {eval_metrics}")

        if args.with_tracking:
            accelerator.log(
                {
                    "mean_iou": eval_metrics["mean_iou"],
                    "mean_accuracy": eval_metrics["mean_accuracy"],
                    "overall_accuracy": eval_metrics["overall_accuracy"],
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                    "step": completed_steps,
                },
                step=completed_steps,
            )

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save)
            if accelerator.is_main_process:
                feature_extractor.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}",
                    blocking=False,
                    auto_lfs_prune=True)

        if args.checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save)
        if accelerator.is_main_process:
            feature_extractor.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training",
                                 auto_lfs_prune=True)

            with open(os.path.join(args.output_dir, "all_results.json"),
                      "w") as f:
                json.dump(
                    {
                        "eval_overall_accuracy":
                        eval_metrics["overall_accuracy"]
                    }, f)
Example #18
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_semantic_segmentation", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Load dataset
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # TODO support datasets from local folders
    dataset = load_dataset(data_args.dataset_name,
                           cache_dir=model_args.cache_dir)

    # Rename column names to standardized names (only "image" and "label" need to be present)
    if "pixel_values" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"pixel_values": "image"})
    if "annotation" in dataset["train"].column_names:
        dataset = dataset.rename_columns({"annotation": "label"})

    # If we don't have a validation split, split off a percentage of train as validation.
    data_args.train_val_split = None if "validation" in dataset.keys(
    ) else data_args.train_val_split
    if isinstance(data_args.train_val_split,
                  float) and data_args.train_val_split > 0.0:
        split = dataset["train"].train_test_split(data_args.train_val_split)
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]

    # Prepare label mappings.
    # We'll include these in the model's config to get human readable labels in the Inference API.
    if data_args.dataset_name == "scene_parse_150":
        repo_id = "datasets/huggingface/label-files"
        filename = "ade20k-id2label.json"
    else:
        repo_id = f"datasets/{data_args.dataset_name}"
        filename = "id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: str(k) for k, v in id2label.items()}

    # Load the mean IoU metric from the datasets package
    metric = evaluate.load("mean_iou")

    # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    @torch.no_grad()
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        logits_tensor = torch.from_numpy(logits)
        # scale the logits to the size of the label
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        metrics = metric.compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
            reduce_labels=feature_extractor.reduce_labels,
        )
        # add per category metrics as individual key-value pairs
        per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
        per_category_iou = metrics.pop("per_category_iou").tolist()

        metrics.update({
            f"accuracy_{id2label[i]}": v
            for i, v in enumerate(per_category_accuracy)
        })
        metrics.update(
            {f"iou_{id2label[i]}": v
             for i, v in enumerate(per_category_iou)})

        return metrics

    config = AutoConfig.from_pretrained(
        model_args.config_name or model_args.model_name_or_path,
        label2id=label2id,
        id2label=id2label,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSemanticSegmentation.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Define torchvision transforms to be applied to each image + target.
    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
    train_transforms = Compose([
        ReduceLabels() if data_args.reduce_labels else Identity(),
        RandomCrop(size=feature_extractor.size),
        RandomHorizontalFlip(flip_prob=0.5),
        PILToTensor(),
        ConvertImageDtype(torch.float),
        Normalize(mean=feature_extractor.image_mean,
                  std=feature_extractor.image_std),
    ])
    # Define torchvision transform to be applied to each image.
    # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
    val_transforms = Compose([
        ReduceLabels() if data_args.reduce_labels else Identity(),
        Resize(size=(feature_extractor.size, feature_extractor.size)),
        PILToTensor(),
        ConvertImageDtype(torch.float),
        Normalize(mean=feature_extractor.image_mean,
                  std=feature_extractor.image_std),
    ])

    def preprocess_train(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"],
                                 example_batch["label"]):
            image, target = train_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)

        return encoding

    def preprocess_val(example_batch):
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"],
                                 example_batch["label"]):
            image, target = val_transforms(image.convert("RGB"), target)
            pixel_values.append(image)
            labels.append(target)

        encoding = dict()
        encoding["pixel_values"] = torch.stack(pixel_values)
        encoding["labels"] = torch.stack(labels)

        return encoding

    if training_args.do_train:
        if "train" not in dataset:
            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            dataset["train"] = (dataset["train"].shuffle(
                seed=training_args.seed).select(
                    range(data_args.max_train_samples)))
        # Set the training transforms
        dataset["train"].set_transform(preprocess_train)

    if training_args.do_eval:
        if "validation" not in dataset:
            raise ValueError("--do_eval requires a validation dataset")
        if data_args.max_eval_samples is not None:
            dataset["validation"] = (dataset["validation"].shuffle(
                seed=training_args.seed).select(
                    range(data_args.max_eval_samples)))
        # Set the validation transforms
        dataset["validation"].set_transform(preprocess_val)

    # Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"] if training_args.do_train else None,
        eval_dataset=dataset["validation"] if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=feature_extractor,
        data_collator=default_data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "dataset": data_args.dataset_name,
        "tags": ["image-segmentation", "vision"],
    }
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
def convert_segformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our SegFormer structure.
    """

    # load default SegFormer configuration
    config = SegformerConfig()
    encoder_only = False

    # set attributes based on model_name
    repo_id = "datasets/huggingface/label-files"
    if "segformer" in model_name:
        size = model_name[len("segformer.") : len("segformer.") + 2]
        if "ade" in model_name:
            config.num_labels = 150
            filename = "ade20k-id2label.json"
            expected_shape = (1, 150, 128, 128)
        elif "city" in model_name:
            config.num_labels = 19
            filename = "cityscapes-id2label.json"
            expected_shape = (1, 19, 128, 128)
        else:
            raise ValueError(f"Model {model_name} not supported")
    elif "mit" in model_name:
        encoder_only = True
        size = model_name[4:6]
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        expected_shape = (1, 1000)
    else:
        raise ValueError(f"Model {model_name} not supported")

    # set config attributes
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    if size == "b0":
        pass
    elif size == "b1":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 256
    elif size == "b2":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 4, 6, 3]
    elif size == "b3":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 4, 18, 3]
    elif size == "b4":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 8, 27, 3]
    elif size == "b5":
        config.hidden_sizes = [64, 128, 320, 512]
        config.decoder_hidden_size = 768
        config.depths = [3, 6, 40, 3]
    else:
        raise ValueError(f"Size {size} not supported")

    # load feature extractor (only resize + normalize)
    feature_extractor = SegformerFeatureExtractor(
        image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
    )

    # prepare image
    image = prepare_img()
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    logger.info(f"Converting model {model_name}...")

    # load original state dict
    if encoder_only:
        state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    else:
        state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))["state_dict"]

    # rename keys
    state_dict = rename_keys(state_dict, encoder_only=encoder_only)
    if not encoder_only:
        del state_dict["decode_head.conv_seg.weight"]
        del state_dict["decode_head.conv_seg.bias"]

    # key and value matrices need special treatment
    read_in_k_v(state_dict, config)

    # create HuggingFace model and load state dict
    if encoder_only:
        config.reshape_last_stage = False
        model = SegformerForImageClassification(config)
    else:
        model = SegformerForSemanticSegmentation(config)
    model.load_state_dict(state_dict)
    model.eval()

    # forward pass
    outputs = model(pixel_values)
    logits = outputs.logits

    # set expected_slice based on model name
    # ADE20k checkpoints
    if model_name == "segformer.b0.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]],
                [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]],
                [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]],
            ]
        )
    elif model_name == "segformer.b1.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-7.5820, -8.7231, -8.3215], [-8.0600, -10.3529, -10.0304], [-7.5208, -9.4103, -9.6239]],
                [[-12.6918, -13.8994, -13.7137], [-13.3196, -15.7523, -15.4789], [-12.9343, -14.8757, -14.9689]],
                [[-11.1911, -11.9421, -11.3243], [-11.3342, -13.6839, -13.3581], [-10.3909, -12.1832, -12.4858]],
            ]
        )
    elif model_name == "segformer.b2.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-11.8173, -14.3850, -16.3128], [-14.5648, -16.5804, -18.6568], [-14.7223, -15.7387, -18.4218]],
                [[-15.7290, -17.9171, -19.4423], [-18.3105, -19.9448, -21.4661], [-17.9296, -18.6497, -20.7910]],
                [[-15.0783, -17.0336, -18.2789], [-16.8771, -18.6870, -20.1612], [-16.2454, -17.1426, -19.5055]],
            ]
        )
    elif model_name == "segformer.b3.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-9.0878, -10.2081, -10.1891], [-9.3144, -10.7941, -10.9843], [-9.2294, -10.3855, -10.5704]],
                [[-12.2316, -13.9068, -13.6102], [-12.9161, -14.3702, -14.3235], [-12.5233, -13.7174, -13.7932]],
                [[-14.6275, -15.2490, -14.9727], [-14.3400, -15.9687, -16.2827], [-14.1484, -15.4033, -15.8937]],
            ]
        )
    elif model_name == "segformer.b4.512x512.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-12.3144, -13.2447, -14.0802], [-13.3614, -14.5816, -15.6117], [-13.3340, -14.4433, -16.2219]],
                [[-19.2781, -20.4128, -20.7506], [-20.6153, -21.6566, -22.0998], [-19.9800, -21.0430, -22.1494]],
                [[-18.8739, -19.7804, -21.1834], [-20.1233, -21.6765, -23.2944], [-20.0315, -21.2641, -23.6944]],
            ]
        )
    elif model_name == "segformer.b5.640x640.ade.160k":
        expected_slice = torch.tensor(
            [
                [[-9.5524, -12.0835, -11.7348], [-10.5229, -13.6446, -14.5662], [-9.5842, -12.8851, -13.9414]],
                [[-15.3432, -17.5323, -17.0818], [-16.3330, -18.9255, -19.2101], [-15.1340, -17.7848, -18.3971]],
                [[-12.6072, -14.9486, -14.6631], [-13.7629, -17.0907, -17.7745], [-12.7899, -16.1695, -17.1671]],
            ]
        )
    # Cityscapes checkpoints
    elif model_name == "segformer.b0.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-11.9295, -13.4057, -14.8106], [-13.3431, -14.8179, -15.3781], [-14.2836, -15.5942, -16.1588]],
                [[-11.4906, -12.8067, -13.6564], [-13.1189, -14.0500, -14.1543], [-13.8748, -14.5136, -14.8789]],
                [[0.5374, 0.1067, -0.4742], [0.1141, -0.2255, -0.7099], [-0.3000, -0.5924, -1.3105]],
            ]
        )
    elif model_name == "segformer.b0.512x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-7.8217, -9.8767, -10.1717], [-9.4438, -10.9058, -11.4047], [-9.7939, -12.3495, -12.1079]],
                [[-7.1514, -9.5336, -10.0860], [-9.7776, -11.6822, -11.8439], [-10.1411, -12.7655, -12.8972]],
                [[0.3021, 0.0805, -0.2310], [-0.0328, -0.1605, -0.2714], [-0.1408, -0.5477, -0.6976]],
            ]
        )
    elif model_name == "segformer.b0.640x1280.city.160k":
        expected_slice = torch.tensor(
            [
                [
                    [-1.1372e01, -1.2787e01, -1.3477e01],
                    [-1.2536e01, -1.4194e01, -1.4409e01],
                    [-1.3217e01, -1.4888e01, -1.5327e01],
                ],
                [
                    [-1.4791e01, -1.7122e01, -1.8277e01],
                    [-1.7163e01, -1.9192e01, -1.9533e01],
                    [-1.7897e01, -1.9991e01, -2.0315e01],
                ],
                [
                    [7.6723e-01, 4.1921e-01, -7.7878e-02],
                    [4.7772e-01, 9.5557e-03, -2.8082e-01],
                    [3.6032e-01, -2.4826e-01, -5.1168e-01],
                ],
            ]
        )
    elif model_name == "segformer.b0.768x768.city.160k":
        expected_slice = torch.tensor(
            [
                [[-9.4959, -11.3087, -11.7479], [-11.0025, -12.6540, -12.3319], [-11.4064, -13.0487, -12.9905]],
                [[-9.8905, -11.3084, -12.0854], [-11.1726, -12.7698, -12.9583], [-11.5985, -13.3278, -14.1774]],
                [[0.2213, 0.0192, -0.2466], [-0.1731, -0.4213, -0.4874], [-0.3126, -0.6541, -1.1389]],
            ]
        )
    elif model_name == "segformer.b1.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
                [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
                [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
            ]
        )
    elif model_name == "segformer.b2.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-16.0976, -16.4856, -17.3962], [-16.6234, -19.0342, -19.7685], [-16.0900, -18.0661, -19.1180]],
                [[-18.4750, -18.8488, -19.5074], [-19.4030, -22.1570, -22.5977], [-19.1191, -20.8486, -22.3783]],
                [[-4.5178, -5.5037, -6.5109], [-5.0884, -7.2174, -8.0334], [-4.4156, -5.8117, -7.2970]],
            ]
        )
    elif model_name == "segformer.b3.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-14.2081, -14.4732, -14.1977], [-14.5867, -16.4423, -16.6356], [-13.4441, -14.9685, -16.8696]],
                [[-14.4576, -14.7073, -15.0451], [-15.0816, -17.6237, -17.9873], [-14.4213, -16.0199, -18.5992]],
                [[-4.7349, -4.9588, -5.0966], [-4.3210, -6.9325, -7.2591], [-3.4312, -4.7484, -7.1917]],
            ]
        )
    elif model_name == "segformer.b4.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-11.7737, -11.9526, -11.3273], [-13.6692, -14.4574, -13.8878], [-13.8937, -14.6924, -15.9345]],
                [[-14.6706, -14.5330, -14.1306], [-16.1502, -16.8180, -16.4269], [-16.8338, -17.8939, -20.1746]],
                [[1.0491, 0.8289, 1.0310], [1.1044, 0.5219, 0.8055], [1.0899, 0.6926, 0.5590]],
            ]
        )
    elif model_name == "segformer.b5.1024x1024.city.160k":
        expected_slice = torch.tensor(
            [
                [[-12.5641, -13.4777, -13.0684], [-13.9587, -15.8983, -16.6557], [-13.3109, -15.7350, -16.3141]],
                [[-14.7074, -15.4352, -14.5944], [-16.6353, -18.1663, -18.6120], [-15.1702, -18.0329, -18.1547]],
                [[-1.7990, -2.0951, -1.7784], [-2.6397, -3.8245, -3.9686], [-1.5264, -2.8126, -2.9316]],
            ]
        )
    else:
        predicted_class_idx = logits.argmax(-1).item()
        print("Predicted class:", model.config.id2label[predicted_class_idx])

    # verify logits
    if not encoder_only:
        assert logits.shape == expected_shape
        assert torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-2)

    # finally, save model and feature extractor
    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
Example #20
def cached_file(
    path_or_repo_id: Union[str, os.PathLike],
    filename: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    user_agent: Optional[Union[str, Dict[str, str]]] = None,
    _raise_exceptions_for_missing_entries: bool = True,
    _raise_exceptions_for_connection_errors: bool = True,
    _commit_hash: Optional[str] = None,
):
    """
    Tries to locate a file in a local folder or repo, downloading and caching it if necessary.

    Args:
        path_or_repo_id (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a model repo on huggingface.co.
            - a path to a *directory* potentially containing the file.
        filename (`str`):
            The name of the file to locate in `path_or_repo_id`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete an incompletely received file. Attempts to resume the download if such a file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the file from local files (no download is attempted).
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo).

    Examples:

    ```python
    # Download a model weight from the Hub and cache it.
    model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin")
    ```"""
    # Private arguments
    #     _raise_exceptions_for_missing_entries: if False, do not raise an exception for missing entries but return
    #         None.
    #     _raise_exceptions_for_connection_errors: if False, do not raise an exception for connection errors but return
    #         None.
    #     _commit_hash: passed when we are chaining several calls to various files (e.g. when loading a tokenizer or
    #         a pipeline). If files are cached for this commit hash, avoid calls to head and get from the cache.
    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True
    if subfolder is None:
        subfolder = ""

    path_or_repo_id = str(path_or_repo_id)
    full_filename = os.path.join(subfolder, filename)
    if os.path.isdir(path_or_repo_id):
        resolved_file = os.path.join(os.path.join(path_or_repo_id, subfolder),
                                     filename)
        if not os.path.isfile(resolved_file):
            if _raise_exceptions_for_missing_entries:
                raise EnvironmentError(
                    f"Could not locate {full_filename} inside {path_or_repo_id}."
                )
            else:
                return None
        return resolved_file

    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if _commit_hash is not None:
        # If the file is cached under that commit hash, we return it directly.
        resolved_file = try_to_load_from_cache(cache_dir,
                                               path_or_repo_id,
                                               full_filename,
                                               commit_hash=_commit_hash)
        if resolved_file is not None:
            return resolved_file

    user_agent = http_user_agent(user_agent)
    try:
        # Load from URL or cache if already cached
        with _patch_hf_hub_tqdm():
            resolved_file = hf_hub_download(
                path_or_repo_id,
                filename,
                subfolder=None if len(subfolder) == 0 else subfolder,
                revision=revision,
                cache_dir=cache_dir,
                user_agent=user_agent,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                use_auth_token=use_auth_token,
                local_files_only=local_files_only,
            )

    except RepositoryNotFoundError:
        raise EnvironmentError(
            f"{path_or_repo_id} is not a local folder and is not a valid model identifier "
            "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
            "pass a token having permission to this repo with `use_auth_token` or log in with "
            "`huggingface-cli login` and pass `use_auth_token=True`.")
    except RevisionNotFoundError:
        raise EnvironmentError(
            f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
            "for this model name. Check the model page at "
            f"'https://huggingface.co/{path_or_repo_id}' for available revisions."
        )
    except EntryNotFoundError:
        if not _raise_exceptions_for_missing_entries:
            return None
        if revision is None:
            revision = "main"
        raise EnvironmentError(
            f"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout "
            f"'https://huggingface.co/{path_or_repo_id}/{revision}' for available files."
        )
    except HTTPError as err:
        # First we try to see if we have a cached version (not up to date):
        resolved_file = try_to_load_from_cache(cache_dir,
                                               path_or_repo_id,
                                               full_filename,
                                               revision=revision)
        if resolved_file is not None:
            return resolved_file
        if not _raise_exceptions_for_connection_errors:
            return None

        raise EnvironmentError(
            f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}"
        )
    except ValueError as err:
        # HuggingFace Hub returns a ValueError for a missing file when local_files_only=True, so we need to catch it
        # here. It could be handled above in the `EntryNotFoundError` branch if hf_hub sent a different error message.
        if LOCAL_FILES_ONLY_HF_ERROR in err.args[
                0] and local_files_only and not _raise_exceptions_for_missing_entries:
            return None

        # Otherwise we try to see if we have a cached version (not up to date):
        resolved_file = try_to_load_from_cache(cache_dir,
                                               path_or_repo_id,
                                               full_filename,
                                               revision=revision)
        if resolved_file is not None:
            return resolved_file
        if not _raise_exceptions_for_connection_errors:
            return None
        raise EnvironmentError(
            f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this file, couldn't find it in the"
            f" cached files and it looks like {path_or_repo_id} is not the path to a directory containing a file named"
            f" {full_filename}.\nCheckout your internet connection or see how to run the library in offline mode at"
            " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
        )

    return resolved_file
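
# A minimal usage sketch for cached_file defined above (the repo id and filenames are
# illustrative): resolve a file from the Hub, and opt out of exceptions for missing
# entries so that an absent file simply yields None instead of raising.
resolved_config = cached_file("bert-base-uncased", "config.json")
optional_file = cached_file(
    "bert-base-uncased",
    "added_tokens.json",
    _raise_exceptions_for_missing_entries=False,
)
if optional_file is None:
    print("Repo has no added_tokens.json, continuing without it")
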
def convert_vilt_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViLT structure.
    """

    # define configuration and initialize HuggingFace model
    config = ViltConfig(image_size=384,
                        patch_size=32,
                        tie_word_embeddings=False)
    mlm_model = False
    vqa_model = False
    nlvr_model = False
    irtr_model = False
    if "vqa" in checkpoint_url:
        vqa_model = True
        config.num_labels = 3129
        repo_id = "datasets/huggingface/label-files"
        filename = "vqa2-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        model = ViltForQuestionAnswering(config)
    elif "nlvr" in checkpoint_url:
        nlvr_model = True
        config.num_labels = 2
        config.id2label = {0: "False", 1: "True"}
        config.label2id = {v: k for k, v in config.id2label.items()}
        config.modality_type_vocab_size = 3
        model = ViltForImagesAndTextClassification(config)
    elif "irtr" in checkpoint_url:
        irtr_model = True
        model = ViltForImageAndTextRetrieval(config)
    elif "mlm_itm" in checkpoint_url:
        mlm_model = True
        model = ViltForMaskedLM(config)
    else:
        raise ValueError("Unknown model type")

    # load state_dict of original model, remove and rename some keys
    state_dict = torch.hub.load_state_dict_from_url(
        checkpoint_url, map_location="cpu")["state_dict"]
    rename_keys = create_rename_keys(config, vqa_model, nlvr_model, irtr_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict, config)
    if mlm_model or irtr_model:
        ignore_keys = ["itm_score.fc.weight", "itm_score.fc.bias"]
        for k in ignore_keys:
            state_dict.pop(k, None)

    # load state dict into HuggingFace model
    model.eval()
    if mlm_model:
        missing_keys, unexpected_keys = model.load_state_dict(state_dict,
                                                              strict=False)
        assert missing_keys == ["mlm_score.decoder.bias"]
    else:
        model.load_state_dict(state_dict)

    # Define processor
    feature_extractor = ViltFeatureExtractor(size=384)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    processor = ViltProcessor(feature_extractor, tokenizer)

    # Forward pass on example inputs (image + text)
    if nlvr_model:
        image1 = Image.open(
            requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg",
                         stream=True).raw)
        image2 = Image.open(
            requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg",
                         stream=True).raw)
        text = (
            "The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
            " standing.")
        encoding_1 = processor(image1, text, return_tensors="pt")
        encoding_2 = processor(image2, text, return_tensors="pt")
        outputs = model(
            input_ids=encoding_1.input_ids,
            pixel_values=encoding_1.pixel_values,
            pixel_values_2=encoding_2.pixel_values,
        )
    else:
        image = Image.open(
            requests.get(
                "http://images.cocodataset.org/val2017/000000039769.jpg",
                stream=True).raw)
        if mlm_model:
            text = "a bunch of [MASK] laying on a [MASK]."
        else:
            text = "How many cats are there?"
        encoding = processor(image, text, return_tensors="pt")
        outputs = model(**encoding)

    # Verify outputs
    if mlm_model:
        expected_shape = torch.Size([1, 11, 30522])
        expected_slice = torch.tensor([-12.5061, -12.5123, -12.5174])
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, 0, :3],
                              expected_slice,
                              atol=1e-4)

        # verify masked token prediction equals "cats"
        predicted_id = outputs.logits[0, 4, :].argmax(-1).item()
        assert tokenizer.decode([predicted_id]) == "cats"
    elif vqa_model:
        expected_shape = torch.Size([1, 3129])
        expected_slice = torch.tensor([-15.9495, -18.1472, -10.3041])
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)

        # verify vqa prediction equals "2"
        predicted_idx = outputs.logits.argmax(-1).item()
        assert model.config.id2label[predicted_idx] == "2"
    elif nlvr_model:
        expected_shape = torch.Size([1, 2])
        expected_slice = torch.tensor([-2.8721, 2.1291])
        assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)
        assert outputs.logits.shape == expected_shape

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model and processor to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    processor.save_pretrained(pytorch_dump_folder_path)
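
# Rough follow-up sketch (not part of the original script): reload a converted VQA
# checkpoint for inference. The dump path below is an illustrative assumption.
import requests
import torch
from PIL import Image
from transformers import ViltForQuestionAnswering, ViltProcessor

processor = ViltProcessor.from_pretrained("./vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("./vilt-b32-finetuned-vqa")

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg",
                 stream=True).raw)
encoding = processor(image, "How many cats are there?", return_tensors="pt")
with torch.no_grad():
    logits = model(**encoding).logits
print("Predicted answer:", model.config.id2label[logits.argmax(-1).item()])
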
Example #22
# @author Loreto Parisi (loretoparisi at gmail dot com)
# Copyright (c) 2022 Loreto Parisi (loretoparisi at gmail dot com)

import os
from huggingface_hub import hf_hub_download
import fasttext


def prob2labels(predictions, decimal_places=2):
    labels = {}
    # predictions are ordered by decreasing probability
    int2label_dict = predictions[0]
    for (index, value) in enumerate(predictions[1]):
        label = int2label_dict[index].replace('__label__', '')
        labels[label] = round(value, decimal_places)
    return labels


model_path = hf_hub_download("julien-c/fasttext-language-id",
                             "lid.176.bin",
                             cache_dir=os.getenv("cache_dir", "../../models"))

# Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
# LP: ignore this Warning: https://fasttext.cc/blog/2019/06/25/blog-post.html
model = fasttext.load_model(model_path)

sequence = "The head of the United Nations says there is no military solution in Syria"
num_labels = 3
predictions = model.predict(sequence, k=num_labels)

print(sequence, prob2labels(predictions))
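
# Optional variation on the call above: fasttext's predict() also accepts a `threshold`
# argument, so low-confidence languages can be filtered out directly. The 0.5 cutoff is
# an arbitrary illustrative choice.
filtered = model.predict(sequence, k=num_labels, threshold=0.5)
print(sequence, prob2labels(filtered))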
Example #23
def convert_perceiver_checkpoint(pickle_file, pytorch_dump_folder_path, architecture="MLM"):
    with open(pickle_file, "rb") as f:
        checkpoint = pickle.loads(f.read())
    state = None
    if isinstance(checkpoint, dict) and architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        params = checkpoint["params"]
        state = checkpoint["state"]
    else:
        params = checkpoint
    state_dict = dict()
    for scope_name, parameters in hk.data_structures.to_mutable_dict(params).items():
        for param_name, param in parameters.items():
            state_dict[scope_name + "/" + param_name] = param
    if state is not None:
        for scope_name, parameters in hk.data_structures.to_mutable_dict(state).items():
            for param_name, param in parameters.items():
                state_dict[scope_name + "/" + param_name] = param
    rename_keys(state_dict, architecture=architecture)
    cfg = PerceiverConfig()
    subsampling = None
    repo_id = "datasets/huggingface/label-files"
    if architecture == "MLM":
        cfg.qk_channels = 8 * 32
        cfg.v_channels = 1280
        model = PerceiverForMaskedLM(cfg)
    elif "image_classification" in architecture:
        cfg.num_latents = 512
        cfg.d_latents = 1024
        cfg.d_model = 512
        cfg.num_blocks = 8
        cfg.num_self_attends_per_block = 6
        cfg.num_cross_attention_heads = 1
        cfg.num_self_attention_heads = 8
        cfg.qk_channels = None
        cfg.v_channels = None
        # set labels
        cfg.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        cfg.id2label = id2label
        cfg.label2id = {v: k for k, v in id2label.items()}
        if architecture == "image_classification":
            cfg.image_size = 224
            model = PerceiverForImageClassificationLearned(cfg)
        elif architecture == "image_classification_fourier":
            cfg.d_model = 261
            model = PerceiverForImageClassificationFourier(cfg)
        elif architecture == "image_classification_conv":
            cfg.d_model = 322
            model = PerceiverForImageClassificationConvProcessing(cfg)
        else:
            raise ValueError(f"Architecture {architecture} not supported")
    elif architecture == "optical_flow":
        cfg.num_latents = 2048
        cfg.d_latents = 512
        cfg.d_model = 322
        cfg.num_blocks = 1
        cfg.num_self_attends_per_block = 24
        cfg.num_self_attention_heads = 16
        cfg.num_cross_attention_heads = 1
        model = PerceiverForOpticalFlow(cfg)
    elif architecture == "multimodal_autoencoding":
        cfg.num_latents = 28 * 28 * 1
        cfg.d_latents = 512
        cfg.d_model = 704
        cfg.num_blocks = 1
        cfg.num_self_attends_per_block = 8
        cfg.num_self_attention_heads = 8
        cfg.num_cross_attention_heads = 1
        cfg.num_labels = 700
        # define dummy inputs + subsampling (as each forward pass is only on a chunk of image + audio data)
        images = torch.randn((1, 16, 3, 224, 224))
        audio = torch.randn((1, 30720, 1))
        nchunks = 128
        image_chunk_size = np.prod((16, 224, 224)) // nchunks
        audio_chunk_size = audio.shape[1] // cfg.samples_per_patch // nchunks
        # process the first chunk
        chunk_idx = 0
        subsampling = {
            "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
            "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
            "label": None,
        }
        model = PerceiverForMultimodalAutoencoding(cfg)
        # set labels
        filename = "kinetics700-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        cfg.id2label = id2label
        cfg.label2id = {v: k for k, v in id2label.items()}
    else:
        raise ValueError(f"Architecture {architecture} not supported")
    model.eval()
    model.load_state_dict(state_dict)
    input_mask = None
    if architecture == "MLM":
        tokenizer = PerceiverTokenizer.from_pretrained(
            "/Users/NielsRogge/Documents/Perceiver/Tokenizer files"
        )
        text = "This is an incomplete sentence where some words are missing."
        encoding = tokenizer(text, padding="max_length", return_tensors="pt")
        # mask " missing.". Note that the model performs much better if the masked chunk starts with a space.
        encoding.input_ids[0, 51:60] = tokenizer.mask_token_id
        inputs = encoding.input_ids
        input_mask = encoding.attention_mask
    elif architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        feature_extractor = PerceiverFeatureExtractor()
        image = prepare_img()
        encoding = feature_extractor(image, return_tensors="pt")
        inputs = encoding.pixel_values
    elif architecture == "optical_flow":
        inputs = torch.randn(1, 2, 27, 368, 496)
    elif architecture == "multimodal_autoencoding":
        images = torch.randn((1, 16, 3, 224, 224))
        audio = torch.randn((1, 30720, 1))
        inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
    if architecture == "multimodal_autoencoding":
        outputs = model(
            inputs=inputs, attention_mask=input_mask, subsampled_output_points=subsampling
        )
    else:
        outputs = model(inputs=inputs, attention_mask=input_mask)
    logits = outputs.logits
    if not isinstance(logits, dict):
        print("Shape of logits:", logits.shape)
    else:
        for k, v in logits.items():
            print(f"Shape of logits of modality {k}", v.shape)
    if architecture == "MLM":
        expected_slice = torch.tensor(
            [
                [-11.8336, -11.6850, -11.8483],
                [-12.8149, -12.5863, -12.7904],
                [-12.8440, -12.6410, -12.8646],
            ]
        )
        assert torch.allclose(logits[0, :3, :3], expected_slice)
        masked_tokens_predictions = logits[0, 51:60].argmax(dim=-1).tolist()
        expected_list = [38, 115, 111, 121, 121, 111, 116, 109, 52]
        assert masked_tokens_predictions == expected_list
        print("Greedy predictions:")
        print(masked_tokens_predictions)
        print()
        print("Predicted string:")
        print(tokenizer.decode(masked_tokens_predictions))

    elif architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        print("Predicted class:", model.config.id2label[logits.argmax(-1).item()])
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
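
# Rough sketch (not part of the original script): the multimodal autoencoder above only
# decodes one chunk of the outputs per forward pass, so a full reconstruction iterates
# over all chunks. The chunk sizes mirror the values used above; the checkpoint path is
# an illustrative assumption.
import numpy as np
import torch
from transformers import PerceiverForMultimodalAutoencoding

model = PerceiverForMultimodalAutoencoding.from_pretrained("./perceiver-multimodal-dump")
images = torch.randn((1, 16, 3, 224, 224))
audio = torch.randn((1, 30720, 1))
nchunks = 128
image_chunk_size = np.prod((16, 224, 224)) // nchunks
audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks
reconstructed_audio = []
for chunk_idx in range(nchunks):
    subsampling = {
        "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
        "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
        "label": None,
    }
    with torch.no_grad():
        outputs = model(
            inputs=dict(image=images, audio=audio, label=torch.zeros((1, 700))),
            subsampled_output_points=subsampling,
        )
    reconstructed_audio.append(outputs.logits["audio"])
print("Reconstructed audio shape:", torch.cat(reconstructed_audio, dim=1).shape)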
def convert_weights_and_push(save_directory: Path,
                             model_name: str = None,
                             push_to_hub: bool = True):
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000

    repo_id = "datasets/huggingface/label-files"
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(VanConfig,
                                       num_labels=num_labels,
                                       id2label=id2label,
                                       label2id=label2id)

    names_to_config = {
        "van-tiny":
        ImageNetPreTrainedConfig(
            hidden_sizes=[32, 64, 160, 256],
            depths=[3, 3, 5, 2],
            mlp_ratios=[8, 8, 4, 4],
        ),
        "van-small":
        ImageNetPreTrainedConfig(
            hidden_sizes=[64, 128, 320, 512],
            depths=[2, 2, 4, 2],
            mlp_ratios=[8, 8, 4, 4],
        ),
        "van-base":
        ImageNetPreTrainedConfig(
            hidden_sizes=[64, 128, 320, 512],
            depths=[3, 3, 12, 3],
            mlp_ratios=[8, 8, 4, 4],
        ),
        "van-large":
        ImageNetPreTrainedConfig(
            hidden_sizes=[64, 128, 320, 512],
            depths=[3, 5, 27, 3],
            mlp_ratios=[8, 8, 4, 4],
        ),
    }

    names_to_original_models = {
        "van-tiny": van_tiny,
        "van-small": van_small,
        "van-base": van_base,
        "van-large": van_large,
    }

    names_to_original_checkpoints = {
        "van-tiny":
        "https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar",
        "van-small":
        "https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar",
        "van-base":
        "https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar",
        "van-large":
        "https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar",
    }

    if model_name:
        convert_weight_and_push(
            model_name,
            names_to_config[model_name],
            checkpoint=names_to_original_checkpoints[model_name],
            from_model=names_to_original_models[model_name](),
            save_directory=save_directory,
            push_to_hub=push_to_hub,
        )
    else:
        for model_name, config in names_to_config.items():
            convert_weight_and_push(
                model_name,
                config,
                checkpoint=names_to_original_checkpoints[model_name],
                from_model=names_to_original_models[model_name](),
                save_directory=save_directory,
                push_to_hub=push_to_hub,
            )
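
# Hypothetical driver for the function above (the save directory and the chosen variant
# are illustrative assumptions, not the original script's CLI): convert a single VAN
# variant, or all of them by passing model_name=None.
from pathlib import Path

save_directory = Path("converted-van")
save_directory.mkdir(exist_ok=True)
convert_weights_and_push(save_directory, model_name="van-tiny", push_to_hub=False)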
def main():
    args = get_args()

    is_finetuned = "ft1k" in args.hf_checkpoint_name
    is_large = "large" in args.hf_checkpoint_name

    if is_finetuned:
        # To convert Beit's data2vec_vision to HF you need to copy
        # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py
        # into this folder.
        import modeling_finetune  # noqa: F401
    else:
        # To convert Beit's data2vec_vision to HF you need to copy
        # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py
        # into this folder
        # IMPORTANT: Note that for now we've only converted the down-stream
        # model and not the full pretrained model. This means for the integration
        # test you need to add a `return x` after the following line:
        # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197
        # to make the integration test pass.
        import modeling_cyclical  # noqa: F401

    # 1. Create model config
    config = Data2VecVisionConfig()
    if is_finetuned:
        config.use_relative_position_bias = True
        config.use_shared_relative_position_bias = False
        config.use_mean_pooling = True
        config.num_labels = 1000

        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    else:
        config.use_relative_position_bias = False
        config.use_shared_relative_position_bias = True
        config.use_mean_pooling = False

    if is_large:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16

    # 2. Load Beit model
    orig_model = load_beit_model(args, is_finetuned, is_large)
    orig_model.eval()

    # 3. Forward Beit model
    feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
    image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
    encoding = feature_extractor(images=image, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
    with torch.no_grad():
        orig_model_output = orig_model(*orig_args)

    # 4. Load HF Data2VecVision model
    if is_finetuned:
        hf_model = Data2VecVisionForImageClassification(config)
        hf_model.eval()
        has_lm_head = False
        hf_prefix = "data2vec_vision."
    else:
        hf_model = Data2VecVisionModel(config)
        hf_model.eval()
        has_lm_head = True
        hf_prefix = ""

    rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
    state_dict = orig_model.state_dict()
    for src, dest in rename_keys:
        val = state_dict.pop(src)
        state_dict[dest] = val

    read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
    missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
    print("HF missing", missing_keys)
    print("HF unexpected_keys", unexpected_keys)

    # 5. Forward HF Data2VecVision model
    with torch.no_grad():
        hf_model_output = hf_model(pixel_values)

    hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state

    # 6. Compare
    max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item()

    print(f"max_absolute_diff = {max_absolute_diff}")
    success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    # 7. Save
    print(f"Saving to {args.hf_checkpoint_name}")
    hf_model.save_pretrained(args.hf_checkpoint_name)
    feature_extractor.save_pretrained(args.hf_checkpoint_name)
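
# main() relies on a get_args() helper defined elsewhere in the script; a rough sketch of
# what it could look like. Only `hf_checkpoint_name` is clearly required by the code above;
# the `beit_checkpoint` flag is an assumption about how load_beit_model locates the weights.
def get_args():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--beit_checkpoint", type=str, required=True,
                        help="Path to the original data2vec_vision (Beit) checkpoint.")
    parser.add_argument("--hf_checkpoint_name", type=str, required=True,
                        help="Output directory; its name also selects the config (e.g. 'large', 'ft1k').")
    return parser.parse_args()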
Example #26
def convert_poolformer_checkpoint(model_name, checkpoint_path,
                                  pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our PoolFormer structure.
    """

    # load default PoolFormer configuration
    config = PoolFormerConfig()

    # set attributes based on model_name
    repo_id = "datasets/huggingface/label-files"
    size = model_name[-3:]
    config.num_labels = 1000
    filename = "imagenet-1k-id2label.json"
    expected_shape = (1, 1000)

    # set config attributes
    id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}
    if size == "s12":
        config.depths = [2, 2, 6, 2]
        config.hidden_sizes = [64, 128, 320, 512]
        config.mlp_ratio = 4.0
        crop_pct = 0.9
    elif size == "s24":
        config.depths = [4, 4, 12, 4]
        config.hidden_sizes = [64, 128, 320, 512]
        config.mlp_ratio = 4.0
        crop_pct = 0.9
    elif size == "s36":
        config.depths = [6, 6, 18, 6]
        config.hidden_sizes = [64, 128, 320, 512]
        config.mlp_ratio = 4.0
        config.layer_scale_init_value = 1e-6
        crop_pct = 0.9
    elif size == "m36":
        config.depths = [6, 6, 18, 6]
        config.hidden_sizes = [96, 192, 384, 768]
        config.mlp_ratio = 4.0
        config.layer_scale_init_value = 1e-6
        crop_pct = 0.95
    elif size == "m48":
        config.depths = [8, 8, 24, 8]
        config.hidden_sizes = [96, 192, 384, 768]
        config.mlp_ratio = 4.0
        config.layer_scale_init_value = 1e-6
        crop_pct = 0.95
    else:
        raise ValueError(f"Size {size} not supported")

    # load feature extractor
    feature_extractor = PoolFormerFeatureExtractor(crop_pct=crop_pct)

    # Prepare image
    image = prepare_img()
    pixel_values = feature_extractor(images=image,
                                     return_tensors="pt").pixel_values

    logger.info(f"Converting model {model_name}...")

    # load original state dict
    state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))

    # rename keys
    state_dict = rename_keys(state_dict)

    # create HuggingFace model and load state dict
    model = PoolFormerForImageClassification(config)
    model.load_state_dict(state_dict)
    model.eval()

    # forward pass
    outputs = model(pixel_values)
    logits = outputs.logits

    # define expected logit slices for different models
    if size == "s12":
        expected_slice = torch.tensor([-0.3045, -0.6758, -0.4869])
    elif size == "s24":
        expected_slice = torch.tensor([0.4402, -0.1374, -0.8045])
    elif size == "s36":
        expected_slice = torch.tensor([-0.6080, -0.5133, -0.5898])
    elif size == "m36":
        expected_slice = torch.tensor([0.3952, 0.2263, -1.2668])
    elif size == "m48":
        expected_slice = torch.tensor([0.1167, -0.0656, -0.3423])
    else:
        raise ValueError(f"Size {size} not supported")

    # verify logits
    assert logits.shape == expected_shape
    assert torch.allclose(logits[0, :3], expected_slice, atol=1e-2)

    # finally, save model and feature extractor
    logger.info(
        f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}..."
    )
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
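
# Example invocation of the function above (the paths are illustrative assumptions; the
# checkpoint would be an original PoolFormer weights file, e.g. from the sail-sg/poolformer
# release):
convert_poolformer_checkpoint(
    model_name="poolformer_s12",
    checkpoint_path="./poolformer_s12.pth.tar",
    pytorch_dump_folder_path="./poolformer-s12-converted",
)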
def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BEiT structure.
    """

    # define default BEiT configuration
    config = BeitConfig()
    has_lm_head = False
    is_semantic = False
    repo_id = "datasets/huggingface/label-files"
    # set config parameters based on URL
    if checkpoint_url[-9:-4] == "pt22k":
        # masked image modeling
        config.use_shared_relative_position_bias = True
        config.use_mask_token = True
        has_lm_head = True
    elif checkpoint_url[-9:-4] == "ft22k":
        # intermediate fine-tuning on ImageNet-22k
        config.use_relative_position_bias = True
        config.num_labels = 21841
        filename = "imagenet-22k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        # this dataset contains 21843 labels but the model only has 21841
        # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18
        del id2label[9205]
        del id2label[15027]
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    elif checkpoint_url[-8:-4] == "to1k":
        # fine-tuning on ImageNet-1k
        config.use_relative_position_bias = True
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        if "384" in checkpoint_url:
            config.image_size = 384
        if "512" in checkpoint_url:
            config.image_size = 512
    elif "ade20k" in checkpoint_url:
        # fine-tuning
        config.use_relative_position_bias = True
        config.num_labels = 150
        filename = "ade20k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        config.image_size = 640
        is_semantic = True
    else:
        raise ValueError(
            "Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'"
        )

    # size of the architecture
    if "base" in checkpoint_url:
        pass
    elif "large" in checkpoint_url:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16
        if "ade20k" in checkpoint_url:
            config.image_size = 640
            config.out_indices = [7, 11, 15, 23]
    else:
        raise ValueError(
            "Should either find 'base' or 'large' in checkpoint URL")

    # load state_dict of original model, remove and rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url,
                                                    map_location="cpu",
                                                    check_hash=True)
    state_dict = state_dict[
        "model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"]

    rename_keys = create_rename_keys(config,
                                     has_lm_head=has_lm_head,
                                     is_semantic=is_semantic)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    read_in_q_k_v(state_dict,
                  config,
                  has_lm_head=has_lm_head,
                  is_semantic=is_semantic)
    if is_semantic:
        # add prefix to decoder keys
        for key, val in state_dict.copy().items():
            val = state_dict.pop(key)
            if key.startswith("backbone.fpn"):
                key = key.replace("backbone.fpn", "fpn")
            state_dict[key] = val

    # load HuggingFace model
    if checkpoint_url[-9:-4] == "pt22k":
        model = BeitForMaskedImageModeling(config)
    elif "ade20k" in checkpoint_url:
        model = BeitForSemanticSegmentation(config)
    else:
        model = BeitForImageClassification(config)
    model.eval()
    model.load_state_dict(state_dict)

    # Check outputs on an image
    if is_semantic:
        feature_extractor = BeitFeatureExtractor(size=config.image_size,
                                                 do_center_crop=False)
        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
        image = Image.open(ds[0]["file"])
    else:
        feature_extractor = BeitFeatureExtractor(size=config.image_size,
                                                 resample=Image.BILINEAR,
                                                 do_center_crop=False)
        image = prepare_img()

    encoding = feature_extractor(images=image, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    outputs = model(pixel_values)
    logits = outputs.logits

    # verify logits
    expected_shape = torch.Size([1, 1000])
    if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"):
        expected_shape = torch.Size([1, 196, 8192])
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"):
        expected_shape = torch.Size([1, 196, 8192])
    elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"):
        expected_shape = torch.Size([1, 21841])
        expected_logits = torch.tensor([2.2288, 2.4671, 0.7395])
        expected_class_idx = 2397
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"):
        expected_shape = torch.Size([1, 21841])
        expected_logits = torch.tensor([1.6881, -0.2787, 0.5901])
        expected_class_idx = 2396
    elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"):
        expected_logits = torch.tensor([0.1241, 0.0798, -0.6569])
        expected_class_idx = 285
    elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108])
        expected_class_idx = 281
    elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"):
        expected_logits = torch.tensor([0.4610, -0.0928, 0.2086])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith(
            "beit_large_patch16_224_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith(
            "beit_large_patch16_384_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([[-0.5122, 0.5117, -0.2113]])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith(
            "beit_large_patch16_512_pt22k_ft22kto1k"):
        expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852])
        expected_class_idx = 761
    elif checkpoint_url[:-4].endswith(
            "beit_base_patch16_640_pt22k_ft22ktoade20k"):
        expected_shape = (1, 150, 160, 160)
        expected_logits = torch.tensor([
            [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561],
             [-2.9549, -1.3228, -2.1347]],
            [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277],
             [-3.8356, -2.4643, -3.3535]],
            [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035],
             [3.2413, 4.7813, 4.9969]],
        ])
    elif checkpoint_url[:-4].endswith(
            "beit_large_patch16_640_pt22k_ft22ktoade20k"):
        expected_shape = (1, 150, 160, 160)
        expected_logits = torch.tensor([
            [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251],
             [-3.4198, -1.8004, -2.9062]],
            [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755],
             [-4.2791, -3.1874, -4.1681]],
            [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518],
             [4.5550, 6.2495, 6.5154]],
        ])
    else:
        raise ValueError("Can't verify logits as model is not supported")

    assert logits.shape == expected_shape, "Shape of logits not as expected"
    if not has_lm_head:
        if is_semantic:
            assert torch.allclose(
                logits[0, :3, :3, :3], expected_logits,
                atol=1e-3), "First elements of logits not as expected"
        else:
            print("Predicted class idx:", logits.argmax(-1).item())
            assert torch.allclose(
                logits[0, :3], expected_logits,
                atol=1e-3), "First elements of logits not as expected"
            assert logits.argmax(-1).item(
            ) == expected_class_idx, "Predicted class index not as expected"
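
    # Follow-up sketch (not part of the original script): for the ADE20k checkpoints the
    # logits are predicted at reduced resolution (e.g. (1, 150, 160, 160) above), so a
    # usable segmentation map is typically obtained by upsampling to the input size and
    # taking the argmax over the class dimension.
    if is_semantic:
        upsampled_logits = torch.nn.functional.interpolate(
            logits, size=pixel_values.shape[-2:], mode="bilinear", align_corners=False
        )
        segmentation_map = upsampled_logits.argmax(dim=1)
        print("Segmentation map shape:", segmentation_map.shape)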

    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
Example #28
    def __init__(self, model_id: str):
        model_path = hf_hub_download(model_id,
                                     "model.bin",
                                     library_name="fasttext")
        self.model = fasttext.load_model(model_path)
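
# The __init__ above is a method of some fasttext inference wrapper; a self-contained
# sketch of what the surrounding class could look like. The class name and the __call__
# signature are assumptions, not taken from the original code.
import fasttext
from huggingface_hub import hf_hub_download


class FastTextLanguageIdPipeline:
    def __init__(self, model_id: str):
        model_path = hf_hub_download(model_id, "model.bin", library_name="fasttext")
        self.model = fasttext.load_model(model_path)

    def __call__(self, text: str, top_k: int = 1):
        labels, scores = self.model.predict(text, k=top_k)
        return [
            {"label": label.replace("__label__", ""), "score": float(score)}
            for label, score in zip(labels, scores)
        ]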
Example #29
def prepare_video():
    file = hf_hub_download(
        repo_id="datasets/hf-internal-testing/spaghetti-video",
        filename="eating_spaghetti.npy")
    video = np.load(file)
    return list(video)
Example #30
def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path,
                                model_name, push_to_hub):
    config = get_videomae_config(model_name)

    if "finetuned" in model_name:
        model = VideoMAEForVideoClassification(config)
    else:
        model = VideoMAEForPreTraining(config)

    # download original checkpoint, hosted on Google Drive
    output = "pytorch_model.bin"
    gdown.cached_download(checkpoint_url, output, quiet=False)
    files = torch.load(output, map_location="cpu")
    if "model" in files:
        state_dict = files["model"]
    else:
        state_dict = files["module"]
    new_state_dict = convert_state_dict(state_dict, config)

    model.load_state_dict(new_state_dict)
    model.eval()

    # verify model on basic input
    feature_extractor = VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5],
                                                 image_std=[0.5, 0.5, 0.5])
    video = prepare_video()
    inputs = feature_extractor(video, return_tensors="pt")

    if "finetuned" not in model_name:
        local_path = hf_hub_download(
            repo_id="hf-internal-testing/bool-masked-pos",
            filename="bool_masked_pos.pt")
        inputs["bool_masked_pos"] = torch.load(local_path)

    outputs = model(**inputs)
    logits = outputs.logits

    model_names = [
        # Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600)
        "videomae-base-short",
        "videomae-base-short-finetuned-kinetics",
        "videomae-base",
        "videomae-base-finetuned-kinetics",
        "videomae-large",
        "videomae-large-finetuned-kinetics",
        # Something-Something-v2 checkpoints (short = pretrained only for 800 epochs instead of 2400)
        "videomae-base-short-ssv2",
        "videomae-base-short-finetuned-ssv2",
        "videomae-base-ssv2",
        "videomae-base-finetuned-ssv2",
    ]

    # NOTE: logits were tested with image_mean and image_std equal to [0.5, 0.5, 0.5] and [0.5, 0.5, 0.5]
    if model_name == "videomae-base":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089],
                                       [0.6701, 0.7487, 0.6209],
                                       [0.4287, 0.5158, 0.4773]])
    elif model_name == "videomae-base-short":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508],
                                       [0.7401, 0.8958, 0.8302],
                                       [0.5862, 0.7468, 0.7325]])
        # we verified the loss both for normalized and unnormalized targets for this one
        expected_loss = torch.tensor(
            [0.5142]) if config.norm_pix_loss else torch.tensor([0.6469])
    elif model_name == "videomae-large":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7149, 0.7997, 0.6966],
                                       [0.6768, 0.7869, 0.6948],
                                       [0.5139, 0.6221, 0.5605]])
    elif model_name == "videomae-large-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.0771, 0.0011, -0.3625])
    elif model_name == "videomae-base-short-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.6588, 0.0990, -0.2493])
    elif model_name == "videomae-base-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.3669, -0.0688, -0.2421])
    elif model_name == "videomae-base-short-ssv2":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.4712, 0.5296, 0.5786],
                                       [0.2278, 0.2729, 0.4026],
                                       [0.0352, 0.0730, 0.2506]])
    elif model_name == "videomae-base-short-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([-0.0537, -0.1539, -0.3266])
    elif model_name == "videomae-base-ssv2":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.8131, 0.8727, 0.8546],
                                       [0.7366, 0.9377, 0.8870],
                                       [0.5935, 0.8874, 0.8564]])
    elif model_name == "videomae-base-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([0.1961, -0.8337, -0.6389])
    else:
        raise ValueError(
            f"Model name not supported. Should be one of {model_names}")

    # verify logits
    assert logits.shape == expected_shape
    if "finetuned" in model_name:
        assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4)
    else:
        print("Logits:", logits[0, :3, :3])
        assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)
    print("Logits ok!")

    # verify loss, if applicable
    if model_name == "videomae-base-short":
        loss = outputs.loss
        assert torch.allclose(loss, expected_loss, atol=1e-4)
        print("Loss ok!")

    if pytorch_dump_folder_path is not None:
        print(
            f"Saving model and feature extractor to {pytorch_dump_folder_path}"
        )
        feature_extractor.save_pretrained(pytorch_dump_folder_path)
        model.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print("Pushing to the hub...")
        model.push_to_hub(model_name, organization="nielsr")