Example #1
    def from_pretrained(cls, pretrained_model_name_or_path: str):
        """
        Defining this method allows the model to be loaded directly from the
        Hugging Face Hub, just like other HF models.
        """
        model_id = pretrained_model_name_or_path

        if len(model_id.split("/")) == 1:
            name = model_id
        else:
            username, name = model_id.split("/")

        if name in os.listdir():
            print("LOADING config & model weights from local directory")
            config_file = os.path.join(name, "config.json")
            model_file = os.path.join(name, "pytorch_model.bin")
        else:
            config_url = hf_bucket_url(model_id, filename="config.json")
            config_file = cached_path(config_url)
            # download and load only the adapter weights from the Hugging Face Hub;
            # the corresponding BERT weights are loaded when the class is instantiated
            model_url = hf_bucket_url(model_id, filename="pytorch_model.bin")
            model_file = cached_path(model_url)

        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
        config = Dict.from_nested_dict(config)

        state_dict = torch.load(model_file, map_location="cpu")
        # initialize the model from the given config (base BERT weights are restored in __init__)
        model = cls(config)
        # now restoring adapter weights
        model.load_state_dict(state_dict, strict=False)
        model.eval()

        return model
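A minimal usage sketch; `AdapterModel` is an assumed name for the class defining this method, and the repo id is a placeholder:

# Hypothetical usage of the loader above.
model = AdapterModel.from_pretrained("username/adapter-model")  # placeholder repo id
assert not model.training  # from_pretrained already called model.eval()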
Example #2
 def test_revision_not_found(self):
     # Valid filename but non-existent revision
     url = hf_bucket_url(MODEL_ID,
                         filename=CONFIG_NAME,
                         revision=REVISION_ID_INVALID)
     with self.assertRaisesRegex(RevisionNotFoundError, "404 Client Error"):
         _ = get_from_cache(url)
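The test relies on module-level fixtures not shown here; a purely illustrative set of stand-ins:

# Illustrative stand-ins for the fixtures these tests reference (not the originals).
MODEL_ID = "user/dummy-model"   # any small repo on the Hub
CONFIG_NAME = "config.json"
REVISION_ID_INVALID = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"  # commit sha that does not exist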
Example #3
def model_file_from_short_name(short_name, model_type):
    """Get model weights file by short name"""
    model_file = hf_bucket_url(
        short_name,
        filename=(TF2_WEIGHTS_NAME if model_type == 'tf' else WEIGHTS_NAME),
        use_cdn=True)
    return model_file
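Given the standard transformers constants (`WEIGHTS_NAME = "pytorch_model.bin"`, `TF2_WEIGHTS_NAME = "tf_model.h5"`), the same short name resolves to framework-specific URLs; a sketch:

pt_url = model_file_from_short_name("bert-base-uncased", model_type="pt")  # .../pytorch_model.bin
tf_url = model_file_from_short_name("bert-base-uncased", model_type="tf")  # .../tf_model.h5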
Example #4
 def test_lfs_object(self):
     url = hf_bucket_url(MODEL_ID,
                         filename=WEIGHTS_NAME,
                         revision=REVISION_ID_DEFAULT)
     filepath = get_from_cache(url, force_download=True)
     metadata = filename_to_url(filepath)
     self.assertEqual(metadata, (url, f'"{PINNED_SHA256}"'))
Example #5
 def test_standard_object_rev(self):
     # Same object, but different revision
     url = hf_bucket_url(MODEL_ID,
                         filename=CONFIG_NAME,
                         revision=REVISION_ID_ONE_SPECIFIC_COMMIT)
     filepath = get_from_cache(url, force_download=True)
     metadata = filename_to_url(filepath)
     self.assertNotEqual(metadata[1], f'"{PINNED_SHA1}"')
Example #6
def convert_pt_checkpoint_to_tf(model_type,
                                pytorch_checkpoint_path,
                                config_file,
                                tf_dump_path,
                                compare_with_pt_model=False,
                                use_cached_models=True):
    if model_type not in MODEL_CLASSES:
        raise ValueError(
            "Unrecognized model type, should be one of {}.".format(
                list(MODEL_CLASSES.keys())))

    config_class, model_class, pt_model_class, aws_config_map = MODEL_CLASSES[
        model_type]

    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file],
                                  force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(
        str(config)))
    tf_model = model_class(config)

    # Resolve the PyTorch checkpoint (download from the Hub if given a shortcut name)
    if pytorch_checkpoint_path in aws_config_map.keys():
        pytorch_checkpoint_url = hf_bucket_url(pytorch_checkpoint_path,
                                               filename=WEIGHTS_NAME)
        pytorch_checkpoint_path = cached_path(
            pytorch_checkpoint_url, force_download=not use_cached_models)
    # Load PyTorch checkpoint in tf2 model:
    tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model,
                                                    pytorch_checkpoint_path)

    if compare_with_pt_model:
        tfo = tf_model(tf_model.dummy_inputs,
                       training=False)  # build the network

        state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
        pt_model = pt_model_class.from_pretrained(
            pretrained_model_name_or_path=None,
            config=config,
            state_dict=state_dict)

        with torch.no_grad():
            pto = pt_model(**pt_model.dummy_inputs)

        np_pt = pto[0].numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(
            diff)

    # Save the TensorFlow model
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format="h5")
Example #7
def download_file_from_hf(pretrained_model_name_or_path: str,
                          file_name: str) -> str:
    # Load model
    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, file_name)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path,
                                            file_name)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {}".format(
                        file_name,
                        pretrained_model_name_or_path,
                    ))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=file_name,
                revision=None,
                mirror=None,
            )

        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=None,
                force_download=False,
                proxies=None,
                resume_download=False,
                local_files_only=False,
            )
        except EnvironmentError as err:
            logger.error(err)
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on"
                f"'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a"
                f"file named one of {file_name}.\n\n")
            raise EnvironmentError(msg)

        if resolved_archive_file == archive_file:
            logger.info("loading weights file {}".format(archive_file))
        else:
            logger.info("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    return resolved_archive_file
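Usage sketch; identifiers and filenames are illustrative:

# Download (or reuse from cache) individual files from a Hub repo.
config_path = download_file_from_hf("bert-base-uncased", "config.json")
weights_path = download_file_from_hf("bert-base-uncased", "pytorch_model.bin")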
Example #8
def load_cached_hf_parameters(model_name_or_path, cache_dir):
    archive_file = hf_bucket_url(
        model_name_or_path,
        filename='pytorch_model.bin'
    )
    resolved_archive_file = cached_path(
        archive_file,
        cache_dir=cache_dir
    )
    state_dict = torch.load(resolved_archive_file, map_location="cpu")
    return state_dict
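For instance (passing cache_dir=None falls back to the default transformers cache directory):

# Inspect a few parameter names and shapes from the cached checkpoint.
state_dict = load_cached_hf_parameters("bert-base-uncased", cache_dir=None)
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))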
Example #9
 def _get_config_dict(cls, path, **kw):
     local_files_only = kw.pop("local_files_only", False)
     from_pipeline = kw.pop("_from_pipeline", None)
     user_agent = {
         "file_type": "config",
         "from_auto_class": kw.pop("_from_auto", False)
     }
     if from_pipeline is not None:
         user_agent["using_pipeline"] = from_pipeline
     if is_offline_mode() and not local_files_only:
         log.info("Offline mode: forcing local_files_only=True")
         local_files_only = True
     path = str(path)
     if os.path.isfile(path) or is_remote_url(path):
         x = path
     else:
         f = kw.pop("_configuration_file", CONFIG_NAME)
         if os.path.isdir(path):
             x = os.path.join(path, f)
         else:
             x = hf_bucket_url(path,
                               filename=f,
                               revision=kw.pop("revision", None),
                               mirror=None)
     try:
         x2 = cached_path(
             x,
             cache_dir=kw.pop("cache_dir", None),
             force_download=kw.pop("force_download", False),
             proxies=kw.pop("proxies", None),
             resume_download=kw.pop("resume_download", False),
             local_files_only=local_files_only,
             use_auth_token=kw.pop("use_auth_token", None),
             user_agent=user_agent,
         )
     except (RepositoryNotFoundError, RevisionNotFoundError,
             EntryNotFoundError, HTTPError) as e:
         # surface hub/network failures uniformly, preserving the original cause
         raise OSError(str(e)) from e
     try:
         y = cls._dict_from_json_file(x2)
     except (json.JSONDecodeError, UnicodeDecodeError) as e:
         raise OSError() from e
     if x2 == x:
         log.info(f"loading {x}")
     else:
         log.info(f"loading {x} from cache at {x2}")
     return y, kw
Example #10
def load_model_from_cache(model_name_or_path, model_arch, cache_dir, filename,
                          config):
    url = hf_bucket_url(model_name_or_path, filename=filename)
    path = cached_path(url, cache_dir=cache_dir) + "." + model_arch
    xml_path = path + ".xml"
    bin_path = path + ".bin"
    model = None
    if os.path.exists(xml_path) and os.path.exists(bin_path):
        logger.info(f"Load OpenVINO model from cache: {xml_path}")
        model = load_ov_model_from_ir(xml_path, bin_path, config)
    return model, path
Example #11
def is_pretrained_model(model_name):
    # check if it's a built-in pre-trained config:
    if model_name in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
        return True
    
    # check if it's a model on the huggingface model hub:
    url = hf_bucket_url(model_name, CONFIG_NAME)
    r = requests.head(url)
    return r.status_code == 200
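A sketch of the expected behavior (note the fallback issues a real HEAD request):

# Built-in shortcut names short-circuit before any network call.
assert is_pretrained_model("bert-base-uncased")
# Unknown identifiers fall through to the HEAD request and return False on a 404.
assert not is_pretrained_model("no-such-user/no-such-model")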
Example #12
def tokenizer_files_from_short_name(short_name):
    """Get all possible files for a tokenizer model by short name"""
    use_fast = False
    config = AutoConfig.from_pretrained(short_name)
    vocab_files = []
    for config_class, (tokenizer_class_py,
                       tokenizer_class_fast) in TOKENIZER_MAPPING.items():
        if isinstance(config, config_class):
            tokenizer_class = tokenizer_class_fast if (
                use_fast and tokenizer_class_fast) else tokenizer_class_py
            vocab_files = list(tokenizer_class.vocab_files_names.values())
    additional_files = [
        ADDED_TOKENS_FILE, SPECIAL_TOKENS_MAP_FILE, TOKENIZER_CONFIG_FILE,
        FULL_TOKENIZER_FILE
    ]
    tokenizer_files = []
    for filename in vocab_files + additional_files:
        tokenizer_files.append(
            hf_bucket_url(short_name, filename=filename, use_cdn=False))
    return tokenizer_files
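For a BERT-style checkpoint this typically yields a `vocab.txt` URL plus the shared tokenizer files; a sketch:

# Print candidate tokenizer file URLs for a short name.
for url in tokenizer_files_from_short_name("bert-base-uncased"):
    print(url)  # vocab.txt, added_tokens.json, special_tokens_map.json, ...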
Example #13
def _check_and_rename_pretrained_model_file(pretrained_model_dir, model_id,
                                            file_name, use_cdn):
    target_file_path = join_path(pretrained_model_dir, model_id, file_name)
    if os.path.exists(target_file_path):
        return True
    file_url = hf_bucket_url(model_id, file_name, use_cdn=use_cdn)
    file_dir_path = join_path(pretrained_model_dir, model_id)
    url_file_name = url_to_filename(file_url)

    matching_files = [
        file
        for file in fnmatch.filter(os.listdir(file_dir_path), url_file_name +
                                   ".*")
        if not file.endswith(".json") and not file.endswith(".lock")
    ]
    if len(matching_files) > 0:
        found_file_name = join_path(file_dir_path, matching_files[-1])
        os.rename(found_file_name, target_file_path)
        return True
    return False
Example #14
from pathlib import Path
import shutil
import os
import numpy as np
import torch
import subprocess

config_path = BART_PRETRAINED_CONFIG_ARCHIVE_MAP['bart-large-xsum']
vocab_path = vocab_url    # assumed defined earlier in the original script
merges_path = merges_url  # assumed defined earlier in the original script
weights_path = 'bart-large-xsum'

target_path = Path.home() / 'rustbert' / 'bart-large-xsum'

temp_config = get_from_cache(config_path)
temp_vocab = get_from_cache(vocab_path)
temp_merges = get_from_cache(merges_path)
temp_weights = get_from_cache(
    hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))

os.makedirs(str(target_path), exist_ok=True)

config_path = str(target_path / 'config.json')
vocab_path = str(target_path / 'vocab.txt')
merges_path = str(target_path / 'merges.txt')
model_path = str(target_path / 'model.bin')

shutil.copy(temp_config, config_path)
shutil.copy(temp_vocab, vocab_path)
shutil.copy(temp_merges, merges_path)
shutil.copy(temp_weights, model_path)

weights = torch.load(temp_weights, map_location='cpu')
nps = {}
Example #15
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                        **kwargs):
        """Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
        To train the model, you should first set it back in training mode with ``model.train()``.

        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
        It is up to you to train those weights with a downstream fine-tuning task.

        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.

        Parameters:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files, overriding the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). These behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
        from_tf = kwargs.pop('from_tf', False)
        force_download = kwargs.pop('force_download', False)
        proxies = kwargs.pop('proxies', None)
        output_loading_info = kwargs.pop('output_loading_info', False)
        random_init = kwargs.pop("random_init", False)
        use_cdn = kwargs.pop("use_cdn", True)
        local_files_only = kwargs.pop("local_files_only", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        kwargs_config = kwargs.copy()

        mapping_keys_state_dic = kwargs.pop("mapping_keys_state_dic", None)
        kwargs_config.pop("mapping_keys_state_dic", None)

        if config is None:

            config, model_kwargs = cls.config_class.from_pretrained(
                pretrained_model_name_or_path,
                *model_args,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                **kwargs_config)
        else:
            model_kwargs = kwargs

        # Load model
        if pretrained_model_name_or_path is not None:
            if os.path.isdir(pretrained_model_name_or_path):
                if from_tf and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     TF_WEIGHTS_NAME + ".index")):
                    # Load from a TF 1.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                TF_WEIGHTS_NAME + ".index")
                elif from_tf and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     TF2_WEIGHTS_NAME)):
                    # Load from a TF 2.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                TF2_WEIGHTS_NAME)
                elif os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     WEIGHTS_NAME)):
                    # Load from a PyTorch checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                WEIGHTS_NAME)
                else:
                    raise EnvironmentError(
                        "Error no file named {} found in directory {} or `from_tf` set to False"
                        .format(
                            [
                                WEIGHTS_NAME, TF2_WEIGHTS_NAME,
                                TF_WEIGHTS_NAME + ".index"
                            ],
                            pretrained_model_name_or_path,
                        ))
            elif os.path.isfile(
                    pretrained_model_name_or_path) or is_remote_url(
                        pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                assert (
                    from_tf
                ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                    pretrained_model_name_or_path + ".index")
                archive_file = pretrained_model_name_or_path + ".index"
            else:
                archive_file = hf_bucket_url(
                    pretrained_model_name_or_path,
                    filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME),
                    use_cdn=use_cdn,
                )

            try:
                # Load from URL or cache if already cached
                resolved_archive_file = cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                )
                if resolved_archive_file is None:
                    raise EnvironmentError
            except EnvironmentError:
                msg = (
                    f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                    f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                    f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n"
                )
                raise EnvironmentError(msg)

            if resolved_archive_file == archive_file:
                logger.info("loading weights file {}".format(archive_file))
            else:
                logger.info("loading weights file {} from cache at {}".format(
                    archive_file, resolved_archive_file))
        else:
            resolved_archive_file = None

        # Instantiate model.

        model = cls(config, *model_args, **model_kwargs)

        if state_dict is None and not from_tf:
            state_dict = torch.load(resolved_archive_file, map_location='cpu')

        missing_keys = []
        unexpected_keys = []
        error_msgs = []

        if from_tf:
            if resolved_archive_file.endswith('.index'):
                # Load from a TensorFlow 1.X checkpoint - provided by original authors
                model = cls.load_tf_weights(
                    model, config,
                    resolved_archive_file[:-6])  # Remove the '.index'
            else:
                # Load from our TensorFlow 2.0 checkpoints
                try:
                    from transformers import load_tf2_checkpoint_in_pytorch_model
                    model = load_tf2_checkpoint_in_pytorch_model(
                        model, resolved_archive_file, allow_missing_keys=True)
                except ImportError as e:
                    logger.error(
                        "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
                    )
                    raise e
        else:
            # Convert old format to new format if needed from a PyTorch state_dict
            old_keys = []
            new_keys = []

            for key in state_dict.keys():
                new_key = None
                if 'gamma' in key:
                    new_key = key.replace('gamma', 'weight')
                if 'beta' in key:
                    new_key = key.replace('beta', 'bias')
                if new_key:
                    old_keys.append(key)
                    new_keys.append(new_key)
            for old_key, new_key in zip(old_keys, new_keys):
                state_dict[new_key] = state_dict.pop(old_key)

            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata
            # assert mapping_keys_state_dic is not None, "ERROR did not found mapping dicts for {} ".format(pretrained_model_name_or_path)
            # mapping_keys_state_dic = {"roberta": "encoder", "lm_head": "head.mlm"}
            if mapping_keys_state_dic is not None:
                assert isinstance(mapping_keys_state_dic, dict), \
                    "mapping_keys_state_dic must be a dict"
                print(
                    "INFO: loading an original pretrained checkpoint; "
                    "renaming some keys with mapping {}".format(
                        mapping_keys_state_dic))
                state_dict = cls.adapt_state_dic_to_multitask(
                    state_dict,
                    keys_mapping=mapping_keys_state_dic,
                    add_prefix=pretrained_model_name_or_path ==
                    "asafaya/bert-base-arabic")

            def load(module, prefix=''):

                local_metadata = {"version": 1}

                if not prefix.startswith("head") or prefix.startswith(
                        "head.mlm"):
                    assert len(
                        missing_keys
                    ) == 0, "ERROR {} missing keys in state_dict {}".format(
                        prefix, missing_keys)
                else:
                    if len(missing_keys) > 0:
                        print(
                            "Warning {} missing keys in state_dict {} (expected for task-specific fine-tuning)"
                            .format(prefix, missing_keys))

                module._load_from_state_dict(state_dict, prefix,
                                             local_metadata, True,
                                             missing_keys, unexpected_keys,
                                             error_msgs)
                for name, child in module._modules.items():

                    # load_params_only_ls = kwargs.get("load_params_only_ls ")
                    not_load_params_ls = kwargs.get(
                        "not_load_params_ls") if kwargs.get(
                            "not_load_params_ls") is not None else []
                    assert isinstance(
                        not_load_params_ls, list
                    ), f"Argument error not_load_params_ls should be a list but is {not_load_params_ls}"
                    matching_not_load = []
                    # RANDOM-INIT
                    for pattern in not_load_params_ls:
                        matching = re.match(pattern, prefix + name)
                        if matching is not None:
                            matching_not_load.append(matching)
                    if len(matching_not_load) > 0:
                        # at least one "not load" pattern matched --> skip loading this parameter
                        print("MATCH not loading : {} parameters {} ".format(
                            prefix + name, not_load_params_ls))
                    if child is not None and len(matching_not_load) == 0:
                        #print("MODEL loading : child {} full {} ".format(name, prefix + name + '.'))
                        load(child, prefix + name + '.')
                    else:
                        print(
                            "MODEL not loading : child {} matching_not_load {} "
                            .format(child, matching_not_load))

            # Make sure we are able to load base models as well as derived models (with heads)
            start_prefix = ''
            model_to_load = model
            if not hasattr(model, cls.base_model_prefix) and any(
                    s.startswith(cls.base_model_prefix)
                    for s in state_dict.keys()):
                start_prefix = cls.base_model_prefix + '.'
            if hasattr(model, cls.base_model_prefix) and not any(
                    s.startswith(cls.base_model_prefix)
                    for s in state_dict.keys()):
                model_to_load = getattr(model, cls.base_model_prefix)
            if not random_init:
                load(model_to_load, prefix=start_prefix)
            else:
                print("WARNING : RANDOM INTIALIZATION OF BERTMULTITASK")

            if len(missing_keys) > 0:
                logger.info(
                    "Weights of {} not initialized from pretrained model: {}".
                    format(model.__class__.__name__, missing_keys))
            if len(unexpected_keys) > 0:
                logger.info(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys))
            if len(error_msgs) > 0:
                raise RuntimeError(
                    'Error(s) in loading state_dict for {}:\n\t{}'.format(
                        model.__class__.__name__, "\n\t".join(error_msgs)))

        if hasattr(model, 'tie_weights'):
            model.tie_weights()  # make sure word embedding weights are still tied

        # Set model in evaluation mode to deactivate Dropout modules by default
        model.eval()

        if output_loading_info:
            loading_info = {
                "missing_keys": missing_keys,
                "unexpected_keys": unexpected_keys,
                "error_msgs": error_msgs
            }
            return model, loading_info

        return model
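A sketch of the extra keyword arguments this variant adds on top of the standard signature (`mapping_keys_state_dic`, `not_load_params_ls`, `random_init`); the class name `BertMultiTask` is an assumption:

# Hypothetical call; the class name is a placeholder and the mapping follows
# the commented-out example in the code above.
model = BertMultiTask.from_pretrained(
    "bert-base-uncased",
    mapping_keys_state_dic={"roberta": "encoder", "lm_head": "head.mlm"},
    not_load_params_ls=[r"head\."],  # regex patterns for parameters to leave randomly initialized
)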
Example #16
    def from_pretrained(cls, model_name_or_path, *model_args, **kwargs):
        cache_dir = kwargs.get("cache_dir", None)
        from_pt = kwargs.pop("from_pt", False)
        from_tf = kwargs.pop("from_tf", False)
        from_ov = kwargs.get("from_ov", not (from_pt | from_tf))
        force_download = kwargs.get("force_download", False)
        resume_download = kwargs.get("resume_download", False)
        proxies = kwargs.get("proxies", None)
        local_files_only = kwargs.get("local_files_only", False)
        use_auth_token = kwargs.get("use_auth_token", None)
        revision = kwargs.get("revision", None)
        from_pipeline = kwargs.get("_from_pipeline", None)
        from_auto_class = kwargs.get("_from_auto", False)

        config = kwargs.get(
            "config") if "config" in kwargs else AutoConfig.from_pretrained(
                model_name_or_path)

        if from_pt:
            model = cls._pt_auto_model.from_pretrained(model_name_or_path,
                                                       *model_args, **kwargs)
            net = load_ov_model_from_pytorch(model)
            return OVPreTrainedModel(net, model.config)
        elif from_tf:
            model, cache_path = load_model_from_cache(model_name_or_path,
                                                      cls.__name__, cache_dir,
                                                      TF2_WEIGHTS_NAME, config)
            if model is not None:
                return model
            model = cls._tf_auto_model.from_pretrained(model_name_or_path,
                                                       *model_args, **kwargs)
            return load_ov_model_from_tf(model, cache_path)

        user_agent = {
            "file_type": "model",
            "framework": "openvino",
            "from_auto_class": from_auto_class
        }
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline

        # Load model
        OV_BIN_NAME = OV_WEIGHTS_NAME.replace(".xml", ".bin")
        if model_name_or_path is not None:
            if os.path.isdir(model_name_or_path):
                if (from_ov and os.path.isfile(
                        os.path.join(model_name_or_path, OV_WEIGHTS_NAME))
                        and os.path.isfile(
                            os.path.join(model_name_or_path, OV_BIN_NAME))):
                    # Load from an OpenVINO IR
                    archive_files = [
                        os.path.join(model_name_or_path, name)
                        for name in [OV_WEIGHTS_NAME, OV_BIN_NAME]
                    ]
                else:
                    raise EnvironmentError(
                        f"Error no files named {[OV_WEIGHTS_NAME, OV_BIN_NAME]} found in directory "
                        f"{model_name_or_path} or `from_ov` set to False")
            # elif os.path.isfile(model_name_or_path) or is_remote_url(model_name_or_path):
            #     archive_file = model_name_or_path
            else:
                names = [OV_WEIGHTS_NAME, OV_BIN_NAME]
                archive_files = [
                    hf_bucket_url(
                        model_name_or_path,
                        filename=name,
                        revision=revision,
                    ) for name in names
                ]

            # redirect to the cache, if necessary
            try:
                resolved_archive_files = [
                    cached_path(
                        archive_file,
                        cache_dir=cache_dir,
                        force_download=force_download,
                        proxies=proxies,
                        resume_download=resume_download,
                        local_files_only=local_files_only,
                        use_auth_token=use_auth_token,
                        user_agent=user_agent,
                    ) for archive_file in archive_files
                ]
            except EnvironmentError as err:
                logger.error(err)
                name = model_name_or_path
                msg = (
                    f"Can't load weights for '{name}'. Make sure that:\n\n"
                    f"- '{name}' is a correct model identifier listed on 'https://huggingface.co/models'\n"
                    f"  (make sure '{name}' is not a path to a local directory with something else, in that case)\n\n"
                    f"- or '{name}' is the correct path to a directory containing a file named {OV_WEIGHTS_NAME}.\n\n"
                )
                raise EnvironmentError(msg)

            if resolved_archive_files == archive_files:
                logger.info(f"loading weights file {archive_files}")
            else:
                logger.info(
                    f"loading weights file {archive_files} from cache at {resolved_archive_files}"
                )
        else:
            resolved_archive_files = None

        return load_ov_model_from_ir(*resolved_archive_files, config=config)
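Usage sketch; `OVAutoModel` stands in for whatever wrapper class defines the classmethod above:

# Hypothetical usage; the wrapper class name is a placeholder.
model = OVAutoModel.from_pretrained("bert-base-uncased")                    # fetch OpenVINO IR from the Hub
pt_model = OVAutoModel.from_pretrained("bert-base-uncased", from_pt=True)  # convert from PyTorch instead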
Example #17
def get_pretrained_state_dict(pretrained_model_name_or_path, *model_args,
                              **kwargs):
    """Get PyTorch state dict via HuggingFace transformers library."""
    config = kwargs.pop("config", None)
    state_dict = kwargs.pop("state_dict", None)
    cache_dir = kwargs.pop("cache_dir", None)
    # from_tf = kwargs.pop("from_tf", False)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    output_loading_info = kwargs.pop("output_loading_info", False)
    local_files_only = kwargs.pop("local_files_only", False)
    use_cdn = kwargs.pop("use_cdn", True)
    mirror = kwargs.pop("mirror", None)

    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path,
                                            WEIGHTS_NAME)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {}".format(
                        WEIGHTS_NAME,
                        pretrained_model_name_or_path,
                    ))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        elif os.path.isfile(pretrained_model_name_or_path + ".index"):
            raise NotImplementedError(
                "Loading TensorFlow checkpoints is not supported")
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=WEIGHTS_NAME,
                use_cdn=use_cdn,
                mirror=mirror,
            )

        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
            )
            if resolved_archive_file is None:
                raise EnvironmentError
        except EnvironmentError:
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named {WEIGHTS_NAME}.\n\n"
            )
            raise EnvironmentError(msg)

        if resolved_archive_file == archive_file:
            print("loading weights file {}".format(archive_file))
        else:
            print("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    if state_dict is None:
        try:
            state_dict = torch.load(resolved_archive_file, map_location="cpu")
        except Exception:
            raise OSError(
                "Unable to load weights from pytorch checkpoint file.")
    return state_dict
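Usage sketch; this fetches only the tensors, without building a model:

# Fetch the raw PyTorch state dict without instantiating a model.
state_dict = get_pretrained_state_dict("bert-base-uncased")
print(f"{len(state_dict)} tensors loaded")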
Example #18
 def test_file_not_found(self):
     # Valid revision (None) but missing file.
     url = hf_bucket_url(MODEL_ID, filename="missing.bin")
     with self.assertRaisesRegex(requests.exceptions.HTTPError,
                                 "404 Client Error"):
         _ = get_from_cache(url)
Example #19
    def get_config_dict(cls,
                        pretrained_model_name_or_path: str,
                        pretrained_config_archive_map: Optional[Dict] = None,
                        **kwargs) -> Tuple[Dict, Dict]:
        """
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used
        for instantiating a Config using `from_dict`.

        Parameters:
            pretrained_model_name_or_path (:obj:`string`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            pretrained_config_archive_map (:obj:`Dict[str, str]`, `optional`):
                A map of `shortcut names` to `urls`. By default, uses the current class attribute.

        Returns:
            :obj:`Tuple[Dict, Dict]`: The configuration dictionary and the remaining unused kwargs.

        """
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)

        if pretrained_config_archive_map is None:
            pretrained_config_archive_map = cls.pretrained_config_archive_map

        if pretrained_model_name_or_path in pretrained_config_archive_map:
            config_file = pretrained_config_archive_map[
                pretrained_model_name_or_path]
        elif os.path.isdir(pretrained_model_name_or_path):
            config_file = os.path.join(pretrained_model_name_or_path,
                                       CONFIG_NAME)
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            config_file = pretrained_model_name_or_path
        else:
            config_file = hf_bucket_url(pretrained_model_name_or_path,
                                        postfix=CONFIG_NAME)

        try:
            # Load from URL or cache if already cached
            resolved_config_file = cached_path(
                config_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
            )
            # Load config dict
            if resolved_config_file is None:
                raise EnvironmentError
            config_dict = cls._dict_from_json_file(resolved_config_file)

        except EnvironmentError:
            if pretrained_model_name_or_path in pretrained_config_archive_map:
                msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
                    config_file)
            else:
                msg = (
                    "Can't load '{}'. Make sure that:\n\n"
                    "- '{}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                    "- or '{}' is the correct path to a directory containing a '{}' file\n\n"
                    .format(
                        pretrained_model_name_or_path,
                        pretrained_model_name_or_path,
                        pretrained_model_name_or_path,
                        CONFIG_NAME,
                    ))
            raise EnvironmentError(msg)

        except json.JSONDecodeError:
            msg = (
                "Couldn't reach server at '{}' to download configuration file or "
                "configuration file is not a valid JSON file. "
                "Please check network or file content here: {}.".format(
                    config_file, resolved_config_file))
            raise EnvironmentError(msg)

        if resolved_config_file == config_file:
            logger.info("loading configuration file {}".format(config_file))
        else:
            logger.info(
                "loading configuration file {} from cache at {}".format(
                    config_file, resolved_config_file))

        return config_dict, kwargs
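Usage sketch; `SomeConfig` is a placeholder for a concrete `PretrainedConfig` subclass:

# Resolve the config dict plus whatever kwargs were not consumed.
config_dict, unused_kwargs = SomeConfig.get_config_dict(
    "bert-base-uncased", output_attentions=True)
print(config_dict["hidden_size"], unused_kwargs)  # e.g. 768 {'output_attentions': True}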
Example #20
from pathlib import Path
import shutil
import os
import numpy as np
import torch
import subprocess

config_path = T5_PRETRAINED_CONFIG_ARCHIVE_MAP['t5-base']
vocab_path = PRETRAINED_VOCAB_FILES_MAP['vocab_file']['t5-base']
weights_path = 't5-base'

target_path = Path.home() / 'rustbert' / 't5-base'

temp_config = get_from_cache(config_path)
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))

os.makedirs(str(target_path), exist_ok=True)

config_path = str(target_path / 'config.json')
vocab_path = str(target_path / 'spiece.model')
model_path = str(target_path / 'model.bin')

shutil.copy(temp_config, config_path)
shutil.copy(temp_vocab, vocab_path)
shutil.copy(temp_weights, model_path)

weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    k = k.replace("gamma", "weight").replace("beta", "bias")
    # assumed continuation: collect contiguous numpy copies for later serialization
    nps[k] = np.ascontiguousarray(v.cpu().numpy())
Example #21
def config_file_from_short_name(short_name):
    return hf_bucket_url(short_name, filename=CONFIG_NAME, use_cdn=False)
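The helper returns a URL rather than a local path; the exact host depends on the transformers version. Illustrative:

url = config_file_from_short_name("bert-base-uncased")
# e.g. https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased/config.json
# (older releases; use_cdn=False bypasses the CDN)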
Example #22
from pathlib import Path
import shutil
import os
import numpy as np
import torch
import subprocess

config_path = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP["bert-base-uncased"]
vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["bert-base-uncased"]
weights_path = "bert-base-uncased"

target_path = Path().absolute()

temp_config = get_from_cache(config_path)
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(
    hf_bucket_url(weights_path, filename="pytorch_model.bin"))

os.makedirs(str(target_path), exist_ok=True)

config_path = str(target_path / 'config.json')
vocab_path = str(target_path / 'vocab.txt')
model_path = str(target_path / 'model.bin')

shutil.copy(temp_config, config_path)
shutil.copy(temp_vocab, vocab_path)
shutil.copy(temp_weights, model_path)

weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    k = k.replace("gamma", "weight").replace("beta", "bias")
    # assumed continuation: collect contiguous numpy copies for later serialization
    nps[k] = np.ascontiguousarray(v.cpu().numpy())
Example #23
def from_pretrained_detailed(model_class, pretrained_model_name_or_path,
                             *model_args, **kwargs):
    r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.

    The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
    It is up to you to train those weights with a downstream fine-tuning task.

    The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.

    Parameters:
        pretrained_model_name_or_path: either:
            - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
            - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
            - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.

        model_args: (`optional`) Sequence of positional arguments:
            All remaining positional arguments will be passed to the underlying model's ``__init__`` method

        config: (`optional`) one of:
                - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
                - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`

            Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

        from_pt: (`optional`) boolean, default False:
            Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument).

        cache_dir: (`optional`) string:
            Path to a directory in which a downloaded pre-trained model
            configuration should be cached if the standard cache should not be used.

        force_download: (`optional`) boolean, default False:
            Force to (re-)download the model weights and configuration files, overriding the cached versions if they exist.

        resume_download: (`optional`) boolean, default False:
            Do not delete an incompletely received file. Attempt to resume the download if such a file exists.

        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
            The proxies are used on each request.

        output_loading_info: (`optional`) boolean:
            Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

        kwargs: (`optional`) Remaining dictionary of keyword arguments:
            Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). These behave differently depending on whether a `config` is provided or automatically loaded:

            - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
            - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
            
            - If layer pruning is supported, ``layer_pruning`` is passed as a dictionary containing the layer pruning configuration:
                - strategy:
                    one of {`top`, `buttom`, `symmetric`, `alternate`, `custom`}
                - k:
                    the number of layers to prune; mandatory if strategy is one of {`top`, `buttom`, `symmetric`, `alternate`}
                - layers_indexes:
                    an array of layer indexes to prune; mandatory if strategy is `custom`
                - is_odd:
                    whether the alternation keeps odd-indexed layers; mandatory if strategy is `alternate`

    Examples::

        # For example purposes. Not runnable.
        model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
        config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
        model = BertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)

    """
    config = kwargs.pop("config", None)
    cache_dir = kwargs.pop("cache_dir", None)
    from_pt = kwargs.pop("from_pt", False)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    output_loading_info = kwargs.pop("output_loading_info", False)
    local_files_only = kwargs.pop("local_files_only", False)
    use_cdn = kwargs.pop("use_cdn", True)

    # mwahdan: Read layer_pruning config if exist
    layer_pruning = kwargs.pop("layer_pruning", None)

    # Load config if we don't provide a configuration
    if not isinstance(config, PretrainedConfig):
        config_path = config if config is not None else pretrained_model_name_or_path
        config, model_kwargs = model_class.config_class.from_pretrained(
            config_path,
            *model_args,
            cache_dir=cache_dir,
            return_unused_kwargs=True,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            local_files_only=local_files_only,
            **kwargs,
        )
    else:
        model_kwargs = kwargs

    # Load model
    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(
                    os.path.join(pretrained_model_name_or_path,
                                 TF2_WEIGHTS_NAME)):
                # Load from a TF 2.0 checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path,
                                            TF2_WEIGHTS_NAME)
            elif from_pt and os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path,
                                            WEIGHTS_NAME)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {} or `from_pt` set to False"
                    .format([WEIGHTS_NAME, TF2_WEIGHTS_NAME],
                            pretrained_model_name_or_path))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        elif os.path.isfile(pretrained_model_name_or_path + ".index"):
            archive_file = pretrained_model_name_or_path + ".index"
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=(WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME),
            )

        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
            )
            if resolved_archive_file is None:
                raise EnvironmentError
        except EnvironmentError:
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {TF2_WEIGHTS_NAME}, {WEIGHTS_NAME}.\n\n"
            )
            raise EnvironmentError(msg)
        if resolved_archive_file == archive_file:
            logger.info("loading weights file {}".format(archive_file))
        else:
            logger.info("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    # mwahdan: Modify config
    if layer_pruning:
        layer_pruning_k = layer_pruning_layers_indexes = layer_pruning_is_odd = None
        layer_pruning_strategy = get_mandatory_parameter(
            'strategy', layer_pruning)
        # note: 'buttom' (sic) is the spelling this config expects
        if layer_pruning_strategy in {'top', 'buttom', 'symmetric'}:
            layer_pruning_k = get_mandatory_parameter('k', layer_pruning)
            config, original_num_layers = modify_num_of_layers(
                config, k=layer_pruning_k)
        elif layer_pruning_strategy == 'custom':
            layer_pruning_layers_indexes = get_mandatory_parameter(
                'layers_indexes', layer_pruning)
            config, original_num_layers = modify_num_of_layers(
                config, layers_indexes=layer_pruning_layers_indexes)
        elif layer_pruning_strategy == 'alternate':
            layer_pruning_k = get_mandatory_parameter('k', layer_pruning)
            layer_pruning_is_odd = get_mandatory_parameter(
                'is_odd', layer_pruning)
            config, original_num_layers = modify_num_of_layers(
                config, k=layer_pruning_k, is_alternate=True)
        else:
            raise ValueError('`%s` is not a supported layer pruning strategy' %
                             layer_pruning_strategy)

    # Instantiate model.
    model = model_class(config, *model_args, **model_kwargs)

    # mwahdan: Rename layers
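    # (presumably so that `load_weights(..., by_name=True)` below can match
    # the kept layers against their original names in the checkpoint)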
    if layer_pruning:
        model = rename_layers_in_strategy(model, layer_pruning_strategy,
                                          original_num_layers, layer_pruning_k,
                                          layer_pruning_layers_indexes,
                                          layer_pruning_is_odd)

    if from_pt:
        # Load from a PyTorch checkpoint
        model = load_pytorch_checkpoint_in_tf2_model(model,
                                                     resolved_archive_file,
                                                     allow_missing_keys=True)
        # mwahdan: Rename layers
        if layer_pruning is not None:
            model = rename_layers(model)
        return model

    model(model.dummy_inputs,
          training=False)  # build the network with dummy inputs

    assert os.path.isfile(
        resolved_archive_file), "Error retrieving file {}".format(
            resolved_archive_file)
    # 'by_name' allows us to do transfer learning by skipping/adding layers
    # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
    try:
        # added skip_mismatch=True because we will prune full layers
        model.load_weights(resolved_archive_file,
                           by_name=True,
                           skip_mismatch=True)
    except OSError:
        raise OSError(
            "Unable to load weights from h5 file. "
            "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. "
        )

    model(model.dummy_inputs, training=False)  # Make sure restore ops are run

    # mwahdan: Rename layers
    if layer_pruning is not None:
        model = rename_layers(model)

    # Compare the checkpoint's layer names with the model's to report loading information
    with h5py.File(resolved_archive_file, "r") as f:
        if "layer_names" not in f.attrs and "model_weights" in f:
            f = f["model_weights"]
        hdf5_layer_names = set(
            hdf5_format.load_attributes_from_hdf5_group(f, "layer_names"))
    model_layer_names = set(layer.name for layer in model.layers)
    missing_keys = list(model_layer_names - hdf5_layer_names)
    unexpected_keys = list(hdf5_layer_names - model_layer_names)
    error_msgs = []

    if len(unexpected_keys) > 0:
        logger.warning(
            f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when "
            f"initializing {model.__class__.__name__}: {unexpected_keys}\n"
            f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task "
            f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n"
            f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect "
            f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
        )
    else:
        logger.warning(
            f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n"
        )
    if len(missing_keys) > 0:
        logger.warning(
            f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} "
            f"and are newly initialized: {missing_keys}\n"
            f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."
        )
    else:
        logger.warning(
            f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n"
            f"If your task is similar to the task the model of the ckeckpoint was trained on, "
            f"you can already use {model.__class__.__name__} for predictions without further training."
        )
    if len(error_msgs) > 0:
        raise RuntimeError("Error(s) in loading weights for {}:\n\t{}".format(
            model.__class__.__name__, "\n\t".join(error_msgs)))
    if output_loading_info:
        loading_info = {
            "missing_keys": missing_keys,
            "unexpected_keys": unexpected_keys,
            "error_msgs": error_msgs
        }
        return model, loading_info

    return model
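
# Usage sketch (an addition, not from the original source): assuming the
# method above is bound as `from_pretrained` on a TF model class such as
# TFBertModel, the layer-pruning hook would be driven like this. The model id
# is illustrative, and whether `k` counts kept or pruned layers depends on
# `modify_num_of_layers`, which is not shown here.
model = TFBertModel.from_pretrained(
    "bert-base-uncased",
    layer_pruning={"strategy": "top", "k": 6},
)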
Example #24
            else:
                output_dict = outputs
        else:
            output_dict = {}
            for k, v in outputs.items():
                if format_columns is not None and k not in format_columns and not output_all_columns:
                    continue
                if format_columns is None or k in format_columns:
                    v = map_nested(command, v, **map_nested_kwargs)
                output_dict[k] = v
        return output_dict


ds = FixedDataset.from_file('../WRITTEN/dataset.arrow')
ds.set_format(type='tensorflow', columns=['input_ids'], shape=[2048])
mirrored_strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"])
with mirrored_strategy.scope():
    config_name = 'gpt2'
    model = TFGPT2LMHeadModel.from_pretrained(config_name)
    gpt2_weights_file_url = hf_bucket_url(config_name,
                                          filename=TF2_WEIGHTS_NAME)
    gpt2_weights_file = cached_path(gpt2_weights_file_url)
    model.load_weights(gpt2_weights_file, by_name=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss)
    model.fit(tf.data.Dataset.from_tensor_slices(ds['input_ids']),
              epochs=2,
              steps_per_epoch=115)
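
# Note (added): `from_pretrained(config_name)` above already restores the
# pretrained GPT-2 weights, so the explicit hf_bucket_url / cached_path /
# load_weights round trip is redundant unless layers are renamed in between.
# `fit` is also given an unbatched dataset with no labels; a sketch of one
# way to prepare next-token-prediction pairs (an assumption about the
# intended training objective):
def to_lm_pair(ids):
    # predict token t+1 from tokens up to t
    return ids[:-1], ids[1:]

train_ds = (tf.data.Dataset.from_tensor_slices(ds['input_ids'])
            .map(to_lm_pair)
            .batch(8, drop_remainder=True))
model.fit(train_ds, epochs=2)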
Example #25
#! /usr/bin/python3 -i
# coding=utf-8

import os
PACKAGE_DIR=os.path.abspath(os.path.dirname(__file__))
DOWNLOAD_DIR=os.path.join(PACKAGE_DIR,"models")

from transformers.file_utils import hf_bucket_url
MODEL_URL=hf_bucket_url("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/")

import numpy
from spacy.language import Language
from spacy.symbols import LANG,NORM,LEMMA,POS,TAG,DEP,HEAD
from spacy.tokens import Doc,Span,Token
from spacy.util import get_lang_class

class SuParKanbunLanguage(Language):
  lang="lzh"
  max_length=10**6
  def __init__(self,BERT,Danku):
    self.Defaults.lex_attr_getters[LANG]=lambda _text:"lzh"
    try:
      # spaCy v2: the vocab factory lives on Defaults
      self.vocab=self.Defaults.create_vocab()
      self.pipeline=[]
    except AttributeError:
      # spaCy v3: Defaults.create_vocab was removed; use spacy.vocab.create_vocab
      from spacy.vocab import create_vocab
      self.vocab=create_vocab("lzh",self.Defaults)
      self._components=[]
      self._disabled=set()
    self.tokenizer=SuParKanbunTokenizer(BERT,Danku,self.vocab)
    self._meta={
Example #26
 def test_model_not_found(self):
     # Invalid model identifier.
     url = hf_bucket_url("bert-base", filename="pytorch_model.bin")
     with self.assertRaisesRegex(RepositoryNotFoundError,
                                 "404 Client Error"):
         _ = get_from_cache(url)
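
# A companion sketch (an assumption, mirroring the test above): a valid repo
# id with a filename that does not exist in the repo should raise
# EntryNotFoundError instead. MODEL_ID is assumed to be defined as in the
# earlier test fixtures.
 def test_file_not_found(self):
     # Valid model, but the file does not exist in the repo.
     url = hf_bucket_url(MODEL_ID, filename="missing.bin")
     with self.assertRaisesRegex(EntryNotFoundError, "404 Client Error"):
         _ = get_from_cache(url)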