def from_pretrained(cls, pretrained_model_name_or_path: str):
    """
    This method enables loading directly from the Hugging Face Hub,
    just like other HF models are loaded.
    """
    model_id = pretrained_model_name_or_path
    if len(model_id.split("/")) == 1:
        name = model_id
    else:
        username, name = model_id.split("/")
    if name in os.listdir():
        print("LOADING config & model weights from local directory")
        config_file = os.path.join(name, "config.json")
        model_file = os.path.join(name, "pytorch_model.bin")
    else:
        config_url = hf_bucket_url(model_id, filename="config.json")
        config_file = cached_path(config_url)
        # download & load only the adapter weights from the huggingface hub;
        # the corresponding bert weights are loaded when the class is initialized
        model_url = hf_bucket_url(model_id, filename="pytorch_model.bin")
        model_file = cached_path(model_url)
    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)
    config = Dict.from_nested_dict(config)
    state_dict = torch.load(model_file, map_location="cpu")
    # randomly initialize the model from the given config, with bert weights restored
    model = cls(config)
    # now restore the adapter weights
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    return model
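A minimal usage sketch for the loader above, not part of the original snippet: it assumes the method is bound as a @classmethod on a hypothetical adapter model class, here called AdapterModel.

# AdapterModel is a hypothetical name; the real class is whatever defines from_pretrained above.
model = AdapterModel.from_pretrained("username/my-adapter-model")  # resolved via hf_bucket_url + cached_path
model = AdapterModel.from_pretrained("my_local_checkpoint_dir")    # picked up from the current working directory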
def get_affect_words_and_int(affect_class):
    emotions = "https://raw.githubusercontent.com/ishikasingh/Affective-text-gen/master/NRC-Emotion-Intensity-Lexicon-v1.txt"
    filepath = cached_path(emotions)
    with open(filepath, "r") as f:
        words = f.read().strip().split("\n")[1:]
    words = [w.split("\t") for w in words]
    return ([w[0] for w in words if w[1] == affect_class],
            [float(w[-1]) for w in words if w[1] == affect_class])
def download(url, file, dir="."):
    import shutil
    from transformers.file_utils import cached_path

    # Fetch the remote filesize listing to check whether `file` is available.
    t = os.path.join(dir, "filesize.txt")
    shutil.copy(cached_path(url + "filesize.txt"), t)
    with open(t, "r") as f:
        r = f.read()
    ft = 0
    for line in r.split("\n"):
        s = line.split()
        if len(s) == 2 and s[0] == file:
            ft = int(s[1])
    if ft == 0:
        return
    shutil.copy(cached_path(url + file), os.path.join(dir, file))
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str],
                             tokenizer) -> List[List[List[int]]]:
    bow_indices = []
    # Several entries may be passed; currently e.g. ['military'].
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            # Known preset: resolve the file via transformers' cached_path, i.e. here
            # cached_path("https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt")
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            # Otherwise treat it as a plain path.
            filepath = id_or_path
        # The file contains the list of words related to the topic (e.g. "military");
        # collect them into `words`.
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        # Tokenize each word and append the resulting index lists.
        bow_indices.append(
            [tokenizer.encode(word.strip(), add_prefix_space=True, add_special_tokens=False)
             for word in words])
    return bow_indices
def detect(self, texts):
    """
    Detects the language for each element in texts.

    Args:
        texts: list of text

    Returns:
        list of languages
    """
    if not self.detector:
        # Suppress unnecessary warning
        fasttext.FastText.eprint = lambda x: None
        # Load language detection model
        path = cached_path(self.langdetect)
        self.detector = fasttext.load_model(path)
    # Transform texts to format expected by language detection model
    texts = [x.lower().replace("\n", " ").replace("\r\n", " ") for x in texts]
    return [x[0].split("__")[-1] for x in self.detector.predict(texts)[0]]
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str],
                             tokenizer,
                             omit_file=None) -> List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        if omit_file:
            with open(omit_file, "r") as f:
                omit_words = f.read().strip().split("\n")
            new_words = [word for word in words if word not in omit_words]
            bow_indices.append([
                tokenizer.encode(word.strip(),
                                 add_prefix_space=True,
                                 add_special_tokens=False)
                for word in new_words
            ])
        else:
            bow_indices.append([
                tokenizer.encode(word.strip(),
                                 add_prefix_space=True,
                                 add_special_tokens=False)
                for word in words
            ])
    return bow_indices
def setUp(self):
    super().setUp()
    data_cached = cached_path(
        "https://cdn-datasets.huggingface.co/translation/wmt_en_ro-tr40k-va0.5k-te0.5k.tar.gz",
        extract_compressed_file=True,
    )
    self.data_dir = f"{data_cached}/wmt_en_ro-tr40k-va0.5k-te0.5k"
def get_bag_of_words_indices(bag_of_words_ids_or_paths, tokenizer):
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        print(id_or_path)
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append(
            [tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
    return bow_indices
def download_file_from_hf(pretrained_model_name_or_path: str, file_name: str) -> str:
    # Load model
    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(os.path.join(pretrained_model_name_or_path, file_name)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path, file_name)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {}".format(
                        file_name, pretrained_model_name_or_path))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=file_name,
                revision=None,
                mirror=None,
            )
        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=None,
                force_download=False,
                proxies=None,
                resume_download=False,
                local_files_only=False,
            )
        except EnvironmentError as err:
            logger.error(err)
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on "
                f"'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a "
                f"file named one of {file_name}.\n\n")
            raise EnvironmentError(msg)
        if resolved_archive_file == archive_file:
            logger.info("loading weights file {}".format(archive_file))
        else:
            logger.info("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None
    return resolved_archive_file
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str],
                             tokenizer) -> List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append(
            [tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
    return bow_indices
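A minimal usage sketch for the PPLM-style bag-of-words helpers above, not from the original source: it assumes BAG_OF_WORDS_ARCHIVE_MAP contains the "military" preset (as in the PPLM example) and uses a GPT-2 tokenizer.

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
bow_indices = get_bag_of_words_indices(["military"], tokenizer)
print(len(bow_indices[0]))  # number of words in the "military" word list
print(bow_indices[0][0])    # token ids of the first word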
def load_model_from_cache(model_name_or_path, model_arch, cache_dir, filename, config):
    url = hf_bucket_url(model_name_or_path, filename=filename)
    path = cached_path(url, cache_dir=cache_dir) + "." + model_arch
    xml_path = path + ".xml"
    bin_path = path + ".bin"
    model = None
    if os.path.exists(xml_path) and os.path.exists(bin_path):
        logger.info(f"Load OpenVINO model from cache: {xml_path}")
        model = load_ov_model_from_ir(xml_path, bin_path, config)
    return model, path
def load_cached_hf_parameters(model_name_or_path, cache_dir):
    archive_file = hf_bucket_url(model_name_or_path, filename='pytorch_model.bin')
    resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
    state_dict = torch.load(resolved_archive_file, map_location="cpu")
    return state_dict
def _get_config_dict(cls, path, **kw):
    local_files_only = kw.pop("local_files_only", False)
    from_pipeline = kw.pop("_from_pipeline", None)
    user_agent = {
        "file_type": "config",
        "from_auto_class": kw.pop("_from_auto", False)
    }
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline
    if is_offline_mode() and not local_files_only:
        log.info("Offline mode: forcing local_files_only=True")
        local_files_only = True
    path = str(path)
    if os.path.isfile(path) or is_remote_url(path):
        x = path
    else:
        f = kw.pop("_configuration_file", CONFIG_NAME)
        if os.path.isdir(path):
            x = os.path.join(path, f)
        else:
            x = hf_bucket_url(path, filename=f, revision=kw.pop("revision", None), mirror=None)
    try:
        x2 = cached_path(
            x,
            cache_dir=kw.pop("cache_dir", None),
            force_download=kw.pop("force_download", False),
            proxies=kw.pop("proxies", None),
            resume_download=kw.pop("resume_download", False),
            local_files_only=local_files_only,
            use_auth_token=kw.pop("use_auth_token", None),
            user_agent=user_agent,
        )
    except RepositoryNotFoundError as e:
        raise OSError() from e
    except RevisionNotFoundError as e:
        raise OSError() from e
    except EntryNotFoundError as e:
        raise OSError() from e
    except HTTPError as e:
        raise OSError() from e
    except OSError as e:
        raise e
    try:
        y = cls._dict_from_json_file(x2)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        raise OSError() from e
    if x2 == x:
        log.info(f"loading {x}")
    else:
        log.info(f"loading {x} from cache at {x2}")
    return y, kw
def get_classifier(name):
    params = DISCRIMINATOR_MODELS_PARAMS[name]
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    return resolved_archive_file
def get_classifier(
        name: Optional[str],
        class_label: Union[str, int],
        device: str,
        verbosity_level: int = REGULAR
) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    """
    Load a previously saved small PyTorch discriminator classifier by name.
    """
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(class_size=params['class_size'],
                                    embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
        if class_label in params["class_vocab"]:
            label_id = params["class_vocab"][class_label]
        else:
            label_id = params["default_class"]
            if verbosity_level >= REGULAR:
                print("class_label {} not in class_vocab".format(class_label))
                print("available values are: {}".format(params["class_vocab"]))
                print("using default class {}".format(label_id))
    elif isinstance(class_label, int):
        if class_label in set(params["class_vocab"].values()):
            label_id = class_label
        else:
            label_id = params["default_class"]
            if verbosity_level >= REGULAR:
                print("class_label {} not in class_vocab".format(class_label))
                print("available values are: {}".format(params["class_vocab"]))
                print("using default class {}".format(label_id))
    else:
        label_id = params["default_class"]

    return classifier, label_id
def get_classifier(name: str, device: str) -> ClassificationHead:
    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(class_size=params["class_size"],
                                    embed_size=params["embed_size"]).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()
    return classifier
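A minimal usage sketch for the discriminator loaders above, not from the original source: it assumes DISCRIMINATOR_MODELS_PARAMS contains a "sentiment" entry, as in the PPLM example these helpers follow, and that its embed_size is 1024 (an assumption taken from the PPLM GPT-2 medium heads).

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = get_classifier("sentiment", device)
# The head maps a mean-pooled hidden state of size embed_size to class logits.
hidden = torch.zeros(1, 1024, device=device)  # 1024 = assumed embed_size
logits = classifier(hidden)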
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str],
                             tokenizer) -> List[List[List[int]]]:
    """
    Tokenize the bag-of-words (BOW) word lists.

    :param bag_of_words_ids_or_paths:
    :param tokenizer:
    :return:
    """
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append(
            [tokenizer.encode(word.strip(), add_prefix_space=True, add_special_tokens=False)
             for word in words])
    return bow_indices
def get_classifier(
        model,
        name: Optional[str],
        class_label: Union[str, int],
        device: str) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(class_size=params["class_size"],
                                    embed_size=params["embed_size"]).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError(
            "Either url or path have to be specified in the discriminator model parameters"
        )
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
        if class_label in params["class_vocab"]:
            label_id = params["class_vocab"][class_label]
        else:
            label_id = params["default_class"]
            print("class_label {} not in class_vocab".format(class_label))
            print("available values are: {}".format(params["class_vocab"]))
            print("using default class {}".format(label_id))
    elif isinstance(class_label, int):
        if class_label in set(params["class_vocab"].values()):
            label_id = class_label
        else:
            label_id = params["default_class"]
            print("class_label {} not in class_vocab".format(class_label))
            print("available values are: {}".format(params["class_vocab"]))
            print("using default class {}".format(label_id))
    else:
        label_id = params["default_class"]

    return classifier, label_id
def load_from_cache(pretrained_model_name_or_path, s3_dict, **kwargs):
    # Adjusted from HF Transformers to fit loading WordEmbeddings from deepset's s3
    # Load from URL or cache if already cached
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)

    s3_file = s3_dict[pretrained_model_name_or_path]
    try:
        resolved_file = cached_path(
            s3_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
        )
        if resolved_file is None:
            raise EnvironmentError
    except EnvironmentError:
        if pretrained_model_name_or_path in s3_dict:
            msg = "Couldn't reach server at '{}' to download data.".format(s3_file)
        else:
            msg = (
                "Model name '{}' was not found in model name list. "
                "We assumed '{}' was a path, a model identifier, or url to a configuration file or "
                "a directory containing such a file but couldn't find any such file at this path or url."
                .format(pretrained_model_name_or_path, s3_file))
        raise EnvironmentError(msg)

    if resolved_file == s3_file:
        logger.info("loading file {}".format(s3_file))
    else:
        logger.info("loading file {} from cache at {}".format(s3_file, resolved_file))
    return resolved_file
def get_classifier(discrim_meta: Optional[dict],
                   device: str) -> Optional[ClassificationHead]:
    if discrim_meta is None:
        return None

    params = discrim_meta
    classifier = ClassificationHead(class_size=params['class_size'],
                                    embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()
    return classifier
def get_bag_of_words_indices_rhyming(bag_of_words_ids_or_paths: List[str],
                                     tokenizer,
                                     rhyming_words: List[str]) -> List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            if len(rhyming_words) > 0:
                words = rhyming_words
            else:
                words = f.read().strip().split("\n")
            # words.extend(rhyming_words)
        bow_indices.append([
            tokenizer.encode(word.strip(),
                             add_prefix_space=True,
                             add_special_tokens=False)
            for word in words
        ])
    return bow_indices
def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
    """
    Instantiate a PreTrainedBertModel from a pre-trained model file.
    Download and cache the pre-trained model file if needed.
    """
    if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
        vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
    else:
        vocab_file = pretrained_model_name
    if os.path.isdir(vocab_file):
        vocab_file = os.path.join(vocab_file, VOCAB_NAME)
    # redirect to the cache, if necessary
    try:
        resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
    except FileNotFoundError:
        logger.error(
            "Model name '{}' was not found in model name list ({}). "
            "We assumed '{}' was a path or url but couldn't find any file "
            "associated to this path or url.".format(
                pretrained_model_name,
                ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                vocab_file))
        return None
    if resolved_vocab_file == vocab_file:
        logger.info("loading vocabulary file {}".format(vocab_file))
    else:
        logger.info("loading vocabulary file {} from cache at {}".format(
            vocab_file, resolved_vocab_file))
    if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
        # If we're using a pretrained model, ensure the tokenizer won't index
        # sequences longer than the number of positional embeddings.
        max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
        kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
    # Instantiate tokenizer.
    tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
    return tokenizer
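A minimal usage sketch for the tokenizer loader above, not from the original source: it assumes the method is bound as a classmethod on the old pytorch_pretrained_bert-style BertTokenizer.

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", cache_dir="/tmp/bert_cache")
tokens = tokenizer.tokenize("Hello, world!")
ids = tokenizer.convert_tokens_to_ids(tokens)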
def load_from_bert_pretrained(cls, config_file, pretrained_model_name='bert-base-uncased', **kwargs):
    model = cls(config_file, **kwargs)
    model(model.dummy_inputs, training=False)

    # Map this model's layer indices to the checkpoint's layer indices.
    ckpt_layer_mapping = {}
    for vind, ckpt_ind in enumerate(model.config.ckpt_layer_mapping.split(',')):
        ckpt_layer_mapping['layer_._{}'.format(vind)] = 'layer_._{}'.format(ckpt_ind)

    archive_file = hf_bucket_url(pretrained_model_name, filename=TF2_WEIGHTS_NAME, use_cdn=True)
    resolved_archive_file = cached_path(archive_file,
                                        cache_dir=None,
                                        force_download=False,
                                        resume_download=False,
                                        proxies=None)
    f = h5py.File(resolved_archive_file, mode='r')
    layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
    g = f[layer_names[0]]
    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
    weights_map = {'/'.join(name.split('/')[2:]): i for i, name in enumerate(weight_names)}

    weight_value_tuples = []
    w_names = []
    for w in model.layers[0].weights:
        w_name = '/'.join(w.name.split('/')[3:])
        for k in ckpt_layer_mapping:
            if k in w_name:
                w_name = w_name.replace(k, ckpt_layer_mapping[k])
                break
        if w_name in weights_map and w.shape == weight_values[weights_map[w_name]].shape:
            w_names.append(w_name)
            weight_value_tuples.append((w, weight_values[weights_map[w_name]]))

    logger.info("Loaded %d weights" % (len(w_names)))
    logger.info("Loaded weights names are: %s" % (", ".join(w_names)))
    K.batch_set_value(weight_value_tuples)
    print("Loaded %d weights" % (len(w_names)))
    print("Loaded weights names are: %s" % (", ".join(w_names)))
    model(model.dummy_inputs, training=False)
    return model
def __init__(self,
             vocab_size=30522,
             bert_model_name=None,
             num_hidden_layers=12,
             num_attention_heads=12,
             max_position_embeddings=512,
             intermediate_size=3072,
             hidden_size=768,
             dropout=0.1):
    config = BertConfig(vocab_size_or_config_json_file=vocab_size,
                        num_hidden_layers=num_hidden_layers,
                        num_attention_heads=num_attention_heads,
                        max_position_embeddings=max_position_embeddings,
                        intermediate_size=intermediate_size,
                        hidden_size=hidden_size,
                        attention_probs_dropout_prob=dropout,
                        hidden_act="gelu",
                        hidden_dropout_prob=dropout,
                        initializer_range=0.02,
                        layer_norm_eps=1e-12,
                        type_vocab_size=2)
    super(TransformerEncoder, self).__init__(config=config)
    if bert_model_name:
        # extract file path
        if bert_model_name in BERT_PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = BERT_PRETRAINED_MODEL_ARCHIVE_MAP[bert_model_name]
        else:
            archive_file = bert_model_name
        archive_file = cached_path(archive_file)
        bert_state_dict = torch.load(archive_file)
        bert_state_keys = bert_state_dict.keys()
        for k, p in self.named_parameters():
            bert_key = "bert." + k
            if bert_key in bert_state_keys and bert_state_dict[bert_key].size() == p.size():
                p.data.copy_(bert_state_dict[bert_key].data)
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str],
                             tokenizer) -> List[List[List[int]]]:
    """
    Turns the list of words from a file into a list of lists of token indices.
    Each line gets its own list of tokens.
    """
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append([
            tokenizer.encode(
                word.strip(),  # apparently splits long words into several shorter sub-word tokens
                add_prefix_space=True,
                add_special_tokens=False)
            for word in words
        ])
    return bow_indices
def get_regressor(
        name: Optional[str],
        device: str,
        verbosity_level: int = REGULAR
) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = RegressionHead1(embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()
    return classifier
def get_classifier(
        discrim_meta: Optional[dict],
        class_label: Union[str, int],
        device: str) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    if discrim_meta is None:
        return None, None

    params = discrim_meta
    classifier = ClassificationHead(class_size=params['class_size'],
                                    embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
        if class_label in params["class_vocab"]:
            label_id = params["class_vocab"][class_label]
        else:
            label_id = params["default_class"]
    elif isinstance(class_label, int):
        if class_label in set(params["class_vocab"].values()):
            label_id = class_label
        else:
            label_id = params["default_class"]
    else:
        label_id = params["default_class"]

    return classifier, label_id
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
    """Instantiate a pretrained pytorch model from a pre-trained model configuration.

    The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
    To train the model, you should first set it back in training mode with ``model.train()``.

    The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX
    do not come pre-trained with the rest of the model. It is up to you to train those weights with a
    downstream fine-tuning task.

    The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY,
    therefore those weights are discarded.

    Parameters:
        pretrained_model_name_or_path: either:
            - a string with the `shortcut name` of a pre-trained model to load from cache or download,
              e.g.: ``bert-base-uncased``.
            - a path to a `directory` containing model weights saved using
              :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
            - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`).
              In this case, ``from_tf`` should be set to True and a configuration object should be provided
              as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint
              in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            - None if you are both providing the configuration and state dictionary
              (resp. with keyword arguments ``config`` and ``state_dict``).

        model_args: (`optional`) Sequence of positional arguments:
            All remaining positional arguments will be passed to the underlying model's ``__init__`` method.

        config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
            Configuration for the model to use instead of an automatically loaded configuration.
            Configuration can be automatically loaded when:
            - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
              pretrained model), or
            - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
              by supplying the save directory.
            - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
              configuration JSON file named `config.json` is found in the directory.

        state_dict: (`optional`) dict:
            an optional state dictionary for the model to use instead of a state dictionary loaded from saved
            weights file. This option can be used if you want to create a model from a pretrained configuration
            but load your own weights. In this case though, you should check if using
            :func:`~transformers.PreTrainedModel.save_pretrained` and
            :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

        cache_dir: (`optional`) string:
            Path to a directory in which a downloaded pre-trained model configuration should be cached
            if the standard cache should not be used.

        force_download: (`optional`) boolean, default False:
            Force to (re-)download the model weights and configuration files and override the cached versions
            if they exist.

        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.:
            {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.

        output_loading_info: (`optional`) boolean:
            Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and
            error messages.

        kwargs: (`optional`) Remaining dictionary of keyword arguments:
            Can be used to update the configuration object (after it being loaded) and initiate the model
            (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided
            or automatically loaded:
            - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
              underlying model's ``__init__`` method (we assume all relevant updates to the configuration
              have already been done).
            - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
              initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
              ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
              with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
              attribute will be passed to the underlying model's ``__init__`` function.

    Examples::

        model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
        model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
    config = kwargs.pop('config', None)
    state_dict = kwargs.pop('state_dict', None)
    cache_dir = kwargs.pop('cache_dir', None)
    from_tf = kwargs.pop('from_tf', False)
    force_download = kwargs.pop('force_download', False)
    proxies = kwargs.pop('proxies', None)
    output_loading_info = kwargs.pop('output_loading_info', False)
    random_init = kwargs.pop("random_init", False)
    use_cdn = kwargs.pop("use_cdn", True)
    local_files_only = kwargs.pop("local_files_only", False)
    resume_download = kwargs.pop("resume_download", False)
    kwargs_config = kwargs.copy()
    mapping_keys_state_dic = kwargs.pop("mapping_keys_state_dic", None)
    kwargs_config.pop("mapping_keys_state_dic", None)

    if config is None:
        config, model_kwargs = cls.config_class.from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            cache_dir=cache_dir,
            return_unused_kwargs=True,
            force_download=force_download,
            **kwargs_config)
    else:
        model_kwargs = kwargs

    # Load model
    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if from_tf and os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
                # Load from a TF 1.0 checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
            elif from_tf and os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
                # Load from a TF 2.0 checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
            elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {} or `from_tf` set to False".format(
                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                        pretrained_model_name_or_path,
                    ))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        elif os.path.isfile(pretrained_model_name_or_path + ".index"):
            assert from_tf, (
                "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from "
                "this checkpoint".format(pretrained_model_name_or_path + ".index"))
            archive_file = pretrained_model_name_or_path + ".index"
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME),
                use_cdn=use_cdn,
            )

        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
            )
            if resolved_archive_file is None:
                raise EnvironmentError
        except EnvironmentError:
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n"
            )
            raise EnvironmentError(msg)

        if resolved_archive_file == archive_file:
            logger.info("loading weights file {}".format(archive_file))
        else:
            logger.info("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    # Instantiate model.
    model = cls(config, *model_args, **model_kwargs)

    if state_dict is None and not from_tf:
        state_dict = torch.load(resolved_archive_file, map_location='cpu')

    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    if from_tf:
        if resolved_archive_file.endswith('.index'):
            # Load from a TensorFlow 1.X checkpoint - provided by original authors
            model = cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
        else:
            # Load from our TensorFlow 2.0 checkpoints
            try:
                from transformers import load_tf2_checkpoint_in_pytorch_model
                model = load_tf2_checkpoint_in_pytorch_model(
                    model, resolved_archive_file, allow_missing_keys=True)
            except ImportError as e:
                logger.error(
                    "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
                    "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
                )
                raise e
    else:
        # Convert old format to new format if needed from a PyTorch state_dict
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        # assert mapping_keys_state_dic is not None, "ERROR did not find mapping dicts for {} ".format(pretrained_model_name_or_path)
        # mapping_keys_state_dic = {"roberta": "encoder", "lm_head": "head.mlm"}
        if mapping_keys_state_dic is not None:
            assert isinstance(mapping_keys_state_dic, dict), "ERROR "
            print("INFO : from loading from pretrained method (assuming loading original google model : "
                  "need to rename some keys {})".format(mapping_keys_state_dic))
            state_dict = cls.adapt_state_dic_to_multitask(
                state_dict,
                keys_mapping=mapping_keys_state_dic,
                add_prefix=pretrained_model_name_or_path == "asafaya/bert-base-arabic")

        def load(module, prefix=''):
            local_metadata = {"version": 1}
            if not prefix.startswith("head") or prefix.startswith("head.mlm"):
                assert len(missing_keys) == 0, \
                    "ERROR {} missing keys in state_dict {}".format(prefix, missing_keys)
            else:
                if len(missing_keys) == 0:
                    print("Warning {} missing keys in state_dict {} (warning expected for task-specific fine-tuning)"
                          .format(prefix, missing_keys))
            module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                         missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                # load_params_only_ls = kwargs.get("load_params_only_ls ")
                not_load_params_ls = kwargs.get("not_load_params_ls") if kwargs.get(
                    "not_load_params_ls") is not None else []
                assert isinstance(not_load_params_ls, list), \
                    f"Argument error not_load_params_ls should be a list but is {not_load_params_ls}"
                matching_not_load = []
                # RANDOM-INIT
                for pattern in not_load_params_ls:
                    matching = re.match(pattern, prefix + name)
                    if matching is not None:
                        matching_not_load.append(matching)
                if len(matching_not_load) > 0:
                    # at least one of the not-load patterns matched --> should not load this child
                    print("MATCH not loading : {} parameters {} ".format(prefix + name, not_load_params_ls))
                if child is not None and len(matching_not_load) == 0:
                    # print("MODEL loading : child {} full {} ".format(name, prefix + name + '.'))
                    load(child, prefix + name + '.')
                else:
                    print("MODEL not loading : child {} matching_not_load {} ".format(
                        child, matching_not_load))

        # Make sure we are able to load base models as well as derived models (with heads)
        start_prefix = ''
        model_to_load = model
        if not hasattr(model, cls.base_model_prefix) and any(
                s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
            start_prefix = cls.base_model_prefix + '.'
        if hasattr(model, cls.base_model_prefix) and not any(
                s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
            model_to_load = getattr(model, cls.base_model_prefix)

        if not random_init:
            load(model_to_load, prefix=start_prefix)
        else:
            print("WARNING : RANDOM INITIALIZATION OF BERTMULTITASK")

    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, unexpected_keys))
    if len(error_msgs) > 0:
        raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
            model.__class__.__name__, "\n\t".join(error_msgs)))

    if hasattr(model, 'tie_weights'):
        model.tie_weights()  # make sure word embedding weights are still tied

    # Set model in evaluation mode to deactivate DropOut modules by default
    model.eval()

    if output_loading_info:
        loading_info = {
            "missing_keys": missing_keys,
            "unexpected_keys": unexpected_keys,
            "error_msgs": error_msgs
        }
        return model, loading_info

    return model
def from_pretrained(cls, model_name_or_path, *model_args, **kwargs):
    cache_dir = kwargs.get("cache_dir", None)
    from_pt = kwargs.pop("from_pt", False)
    from_tf = kwargs.pop("from_tf", False)
    from_ov = kwargs.get("from_ov", not (from_pt | from_tf))
    force_download = kwargs.get("force_download", False)
    resume_download = kwargs.get("resume_download", False)
    proxies = kwargs.get("proxies", None)
    local_files_only = kwargs.get("local_files_only", False)
    use_auth_token = kwargs.get("use_auth_token", None)
    revision = kwargs.get("revision", None)
    from_pipeline = kwargs.get("_from_pipeline", None)
    from_auto_class = kwargs.get("_from_auto", False)

    config = kwargs.get("config") if "config" in kwargs else AutoConfig.from_pretrained(
        model_name_or_path)

    if from_pt:
        model = cls._pt_auto_model.from_pretrained(model_name_or_path, *model_args, **kwargs)
        net = load_ov_model_from_pytorch(model)
        return OVPreTrainedModel(net, model.config)
    elif from_tf:
        model, cache_path = load_model_from_cache(model_name_or_path, cls.__name__,
                                                  cache_dir, TF2_WEIGHTS_NAME, config)
        if model is not None:
            return model
        model = cls._tf_auto_model.from_pretrained(model_name_or_path, *model_args, **kwargs)
        return load_ov_model_from_tf(model, cache_path)

    user_agent = {
        "file_type": "model",
        "framework": "openvino",
        "from_auto_class": from_auto_class
    }
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    # Load model
    OV_BIN_NAME = OV_WEIGHTS_NAME.replace(".xml", ".bin")
    if model_name_or_path is not None:
        if os.path.isdir(model_name_or_path):
            if (from_ov and os.path.isfile(os.path.join(model_name_or_path, OV_WEIGHTS_NAME))
                    and os.path.isfile(os.path.join(model_name_or_path, OV_BIN_NAME))):
                # Load from an OpenVINO IR
                archive_files = [
                    os.path.join(model_name_or_path, name)
                    for name in [OV_WEIGHTS_NAME, OV_BIN_NAME]
                ]
            else:
                raise EnvironmentError(
                    f"Error no files named {[OV_WEIGHTS_NAME, OV_BIN_NAME]} found in directory "
                    f"{model_name_or_path} or `from_ov` set to False")
        # elif os.path.isfile(model_name_or_path) or is_remote_url(model_name_or_path):
        #     archive_file = model_name_or_path
        else:
            names = [OV_WEIGHTS_NAME, OV_BIN_NAME]
            archive_files = [
                hf_bucket_url(
                    model_name_or_path,
                    filename=name,
                    revision=revision,
                ) for name in names
            ]

        # redirect to the cache, if necessary
        try:
            resolved_archive_files = [
                cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    use_auth_token=use_auth_token,
                    user_agent=user_agent,
                ) for archive_file in archive_files
            ]
        except EnvironmentError as err:
            logger.error(err)
            name = model_name_or_path
            msg = (
                f"Can't load weights for '{name}'. Make sure that:\n\n"
                f"- '{name}' is a correct model identifier listed on 'https://huggingface.co/models'\n"
                f"  (make sure '{name}' is not a path to a local directory with something else, in that case)\n\n"
                f"- or '{name}' is the correct path to a directory containing a file named {OV_WEIGHTS_NAME}.\n\n"
            )
            raise EnvironmentError(msg)

        if resolved_archive_files == archive_files:
            logger.info(f"loading weights file {archive_files}")
        else:
            logger.info(
                f"loading weights file {archive_files} from cache at {resolved_archive_files}")
    else:
        resolved_archive_files = None

    return load_ov_model_from_ir(*resolved_archive_files, config=config)
def get_pretrained_state_dict(pretrained_model_name_or_path, *model_args, **kwargs):
    """Get PyTorch state dict via HuggingFace transformers library."""
    config = kwargs.pop("config", None)
    state_dict = kwargs.pop("state_dict", None)
    cache_dir = kwargs.pop("cache_dir", None)
    # from_tf = kwargs.pop("from_tf", False)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    output_loading_info = kwargs.pop("output_loading_info", False)
    local_files_only = kwargs.pop("local_files_only", False)
    use_cdn = kwargs.pop("use_cdn", True)
    mirror = kwargs.pop("mirror", None)

    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {}".format(
                        WEIGHTS_NAME, pretrained_model_name_or_path))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        elif os.path.isfile(pretrained_model_name_or_path + ".index"):
            assert False, "Loading TensorFlow checkpoints is not supported"
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=WEIGHTS_NAME,
                use_cdn=use_cdn,
                mirror=mirror,
            )
        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
            )
            if resolved_archive_file is None:
                raise EnvironmentError
        except EnvironmentError:
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named {WEIGHTS_NAME}.\n\n"
            )
            raise EnvironmentError(msg)

        if resolved_archive_file == archive_file:
            print("loading weights file {}".format(archive_file))
        else:
            print("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    if state_dict is None:
        try:
            state_dict = torch.load(resolved_archive_file, map_location="cpu")
        except Exception:
            raise OSError("Unable to load weights from pytorch checkpoint file.")

    return state_dict
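A minimal usage sketch for the state-dict helper above, not from the original source: it assumes an older transformers version where WEIGHTS_NAME is "pytorch_model.bin", and the key shown is illustrative of the bert-base-uncased checkpoint layout.

state_dict = get_pretrained_state_dict("bert-base-uncased", cache_dir="/tmp/hf_cache")
print(len(state_dict))  # number of parameter tensors in the checkpoint
print(state_dict["bert.embeddings.word_embeddings.weight"].shape)  # expected torch.Size([30522, 768])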