def _download_termtree(self, filename): default_root = os.path.join(MODEL_HOME, 'ernie-ctm') fullname = os.path.join(default_root, filename) url = URLS[filename] if not os.path.exists(fullname): get_path_from_url(url, default_root) return fullname
def _get_data(self, mode, **kwargs): """Downloads dataset.""" default_root = os.path.join(DATA_HOME, self.__class__.__name__) data_dir = os.path.join(default_root, "aclImdb", mode) if not os.path.exists(data_dir): path = get_path_from_url(self.URL, default_root, self.MD5) return data_dir
def __init__( self, root=None, mode='train', ): assert mode in [ "train", "test" ], "Unknown mode %s, it should be 'train' or 'test'." % mode if root is None: root = DATA_HOME data_dir = os.path.join(root, "aclImdb") if not os.path.exists(data_dir): data_dir = get_path_from_url(self.URL, root, self.MD5) self.examples = self._read_data_file(data_dir, mode)
# Shape: (batch_size, num_classes) logits = self.output_layer(fc2_out) return logits if __name__ == '__main__': assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." paddle.set_device(args.device) # Loads vocab. vocab_path = "./dict.txt" if not os.path.exists(vocab_path): # download in current directory get_path_from_url(WORD_DICT_URL, "./") vocab = data.load_vocab(vocab_path) if '[PAD]' not in vocab: vocab['[PAD]'] = len(vocab) # Loads dataset. train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"]) # Constructs the newtork. model = BoWModel(vocab_size=len(vocab), num_classes=len(train_ds.label_list), vocab_path=vocab_path, use_token_embedding=args.use_token_embedding) if args.use_token_embedding: vocab = model.embedder.vocab data.set_tokenizer(vocab)
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ Instantiate an instance of `PretrainedModel` from a predefined model specified by name or path. Args: pretrained_model_name_or_path (str): A name of or a file path to a pretrained model. *args (tuple): position arguments for `__init__`. If provide, use this as position argument values for model initialization. **kwargs (dict): keyword arguments for `__init__`. If provide, use this to update pre-defined keyword argument values for model initialization. If the key is in base model `__init__`, update keyword argument of base model; else update keyword argument of derived model. Returns: PretrainedModel: An instance of PretrainedModel. """ pretrained_models = list(cls.pretrained_init_configuration.keys()) resource_files = {} init_configuration = {} if pretrained_model_name_or_path in pretrained_models: for file_id, map_list in cls.pretrained_resource_files_map.items(): resource_files[file_id] = map_list[ pretrained_model_name_or_path] init_configuration = copy.deepcopy( cls. pretrained_init_configuration[pretrained_model_name_or_path]) else: if os.path.isdir(pretrained_model_name_or_path): for file_id, file_name in cls.resource_files_names.items(): full_file_name = os.path.join( pretrained_model_name_or_path, file_name) resource_files[file_id] = full_file_name resource_files["model_config_file"] = os.path.join( pretrained_model_name_or_path, cls.model_config_file) else: raise ValueError( "Calling {}.from_pretrained() with a model identifier or the " "path to a directory instead. 
The supported model " "identifiers are as follows: {}".format( cls.__name__, cls.pretrained_init_configuration.keys())) default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path) resolved_resource_files = {} for file_id, file_path in resource_files.items(): path = os.path.join(default_root, file_path.split('/')[-1]) if file_path is None or os.path.isfile(file_path): resolved_resource_files[file_id] = file_path elif os.path.exists(path): logger.info("Already cached %s" % path) resolved_resource_files[file_id] = path else: logger.info("Downloading %s and saved to %s" % (file_path, default_root)) resolved_resource_files[file_id] = get_path_from_url( file_path, default_root) # Prepare model initialization kwargs # Did we saved some inputs and kwargs to reload ? model_config_file = resolved_resource_files.pop( "model_config_file", None) if model_config_file is not None: with io.open(model_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) else: init_kwargs = init_configuration # position args are stored in kwargs, maybe better not include init_args = init_kwargs.pop("init_args", ()) # class name corresponds to this configuration init_class = init_kwargs.pop("init_class", cls.base_model_class.__name__) # Check if the loaded config matches the current model class's __init__ # arguments. If not match, the loaded config is for the base model class. 
if init_class == cls.base_model_class.__name__: base_args = init_args base_kwargs = init_kwargs derived_args = () derived_kwargs = {} base_arg_index = None else: # extract config for base model derived_args = list(init_args) derived_kwargs = init_kwargs base_arg = None for i, arg in enumerate(init_args): if isinstance(arg, dict) and "init_class" in arg: assert arg.pop( "init_class") == cls.base_model_class.__name__, ( "pretrained base model should be {}").format( cls.base_model_class.__name__) base_arg_index = i base_arg = arg break for arg_name, arg in init_kwargs.items(): if isinstance(arg, dict) and "init_class" in arg: assert arg.pop( "init_class") == cls.base_model_class.__name__, ( "pretrained base model should be {}").format( cls.base_model_class.__name__) base_arg_index = arg_name base_arg = arg break base_args = base_arg.pop("init_args", ()) base_kwargs = base_arg if cls == cls.base_model_class: # Update with newly provided args and kwargs for base model base_args = base_args if not args else args base_kwargs.update(kwargs) model = cls(*base_args, **base_kwargs) else: # Update with newly provided args and kwargs for derived model base_parameters_dict = inspect.signature( cls.base_model_class.__init__).parameters for k, v in kwargs.items(): if k in base_parameters_dict: base_kwargs[k] = v base_model = cls.base_model_class(*base_args, **base_kwargs) if base_arg_index is not None: derived_args[base_arg_index] = base_model else: derived_args = (base_model, ) # assume at the first position derived_args = derived_args if not args else args derived_parameters_dict = inspect.signature( cls.__init__).parameters for k, v in kwargs.items(): if k in derived_parameters_dict: derived_kwargs[k] = v model = cls(*derived_args, **derived_kwargs) # Maybe need more ways to load resources. 
weight_path = list(resolved_resource_files.values())[0] assert weight_path.endswith( ".pdparams"), "suffix of weight must be .pdparams" state_dict = paddle.load(weight_path) # Make sure we are able to load base models as well as derived models # (with heads) start_prefix = "" model_to_load = model state_to_load = state_dict unexpected_keys = [] missing_keys = [] if not hasattr(model, cls.base_model_prefix) and any( s.startswith(cls.base_model_prefix) for s in state_dict.keys()): # base model state_to_load = {} start_prefix = cls.base_model_prefix + "." for k, v in state_dict.items(): if k.startswith(cls.base_model_prefix): state_to_load[k[len(start_prefix):]] = v else: unexpected_keys.append(k) if hasattr(model, cls.base_model_prefix) and not any( s.startswith(cls.base_model_prefix) for s in state_dict.keys()): # derived model (base model with heads) model_to_load = getattr(model, cls.base_model_prefix) for k in model.state_dict().keys(): if not k.startswith(cls.base_model_prefix): missing_keys.append(k) if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}". format(model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) model_to_load.set_state_dict(state_to_load) if paddle.in_dynamic_mode(): return model return model, state_to_load
    def _from_pretrained(cls,
                         pretrained_model_name_or_path,
                         task=None,
                         *model_args,
                         **kwargs):
        """Resolve the concrete model class and delegate to its `from_pretrained`.

        Dispatches by (1) built-in model name, (2) local directory, or
        (3) community-contributed model name, importing the matching
        `paddlenlp.transformers.<name>.modeling` module dynamically.
        """
        if task:
            # NOTE(review): `== True` is an identity-style comparison on what
            # looks like a bool flag; `if cls._task_choice:` would be the
            # idiomatic form — confirm _task_choice is strictly a bool.
            if cls._task_choice == True:
                cls._name_mapping = get_name_mapping(task)
            else:
                print('We only support task choice for AutoModel.')
        # Flatten every registered name tuple into one membership list.
        all_model_names = []
        for pretrained_model_names, model_name in cls._pretrained_model_dict.items(
        ):
            for name in pretrained_model_names:
                all_model_names.append(name)
        # From built-in pretrained models
        if pretrained_model_name_or_path in all_model_names:
            for pretrained_model_names, model_name in cls._pretrained_model_dict.items(
            ):
                # From built-in pretrained models
                for pattern in pretrained_model_names:
                    if pattern == pretrained_model_name_or_path:
                        # Two-step lookup: model key -> import class name ->
                        # module directory name.
                        init_class = cls._name_mapping[model_name +
                                                       '_Import_Class']
                        class_name = cls._name_mapping[init_class]
                        import_class = importlib.import_module(
                            f"paddlenlp.transformers.{class_name}.modeling")
                        model_class = getattr(import_class, init_class)
                        return model_class.from_pretrained(
                            pretrained_model_name_or_path, *model_args,
                            **kwargs)
        # From local dir path
        elif os.path.isdir(pretrained_model_name_or_path):
            config_file = os.path.join(pretrained_model_name_or_path,
                                       cls.model_config_file)
            # NOTE(review): if the config file is absent this branch falls
            # through and returns None implicitly — confirm intended.
            if os.path.exists(config_file):
                with io.open(config_file, encoding="utf-8") as f:
                    init_kwargs = json.load(f)
                # class name corresponds to this configuration
                init_class = init_kwargs.pop("init_class", None)
                if init_class:
                    # Infer the architecture from the configured class name.
                    for model_flag, name in MAPPING_NAMES.items():
                        if model_flag in init_class:
                            model_name = model_flag + 'Model'
                            break
                else:
                    # From pretrained_model_name_or_path
                    for model_flag, name in MAPPING_NAMES.items():
                        if name in pretrained_model_name_or_path.lower():
                            model_name = model_flag + 'Model'
                            break
                # NOTE(review): if neither loop matched, `model_name` is
                # unbound here and this raises NameError — confirm every
                # supported identifier matches MAPPING_NAMES.
                init_class = cls._name_mapping[model_name + '_Import_Class']
                class_name = cls._name_mapping[init_class]
                import_class = importlib.import_module(
                    f"paddlenlp.transformers.{class_name}.modeling")
                # NOTE(review): `model_name` is reused to hold the resolved
                # class object here (shadowing the string above).
                model_name = getattr(import_class, init_class)
                return model_name.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
        # Assuming from community-contributed pretrained models
        else:
            community_config_path = os.path.join(
                COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
                cls.model_config_file)
            default_root = os.path.join(MODEL_HOME,
                                        pretrained_model_name_or_path)
            try:
                # Download (or reuse cached) community model config.
                resolved_vocab_file = get_path_from_url(
                    community_config_path, default_root)
            except RuntimeError as err:
                logger.error(err)
                raise RuntimeError(
                    f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
                    f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                    "- a correct model-identifier of built-in pretrained models,\n"
                    "- or a correct model-identifier of community-contributed pretrained models,\n"
                    "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
                )
            if os.path.exists(resolved_vocab_file):
                with io.open(resolved_vocab_file, encoding="utf-8") as f:
                    init_kwargs = json.load(f)
                # class name corresponds to this configuration
                init_class = init_kwargs.pop("init_class", None)
                if init_class:
                    for model_flag, name in MAPPING_NAMES.items():
                        if model_flag in init_class:
                            model_name = model_flag + 'Model'
                            break
                else:
                    # From pretrained_model_name_or_path
                    for model_flag, name in MAPPING_NAMES.items():
                        if name in pretrained_model_name_or_path.lower():
                            model_name = model_flag + 'Model'
                            break
                init_class = cls._name_mapping[model_name + '_Import_Class']
                class_name = cls._name_mapping[init_class]
                import_class = importlib.import_module(
                    f"paddlenlp.transformers.{class_name}.modeling")
                model_name = getattr(import_class, init_class)
                return model_name.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """
        Creates an instance of `PretrainedModel`. Model weights are loaded
        by specifying name of a built-in pretrained model, or a community
        contributed model, or a local file directory path.

        Args:
            pretrained_model_name_or_path (str): Name of pretrained model or
                dir path to load from. The string can be:

                - Name of a built-in pretrained model
                - Name of a community-contributed pretrained model.
                - Local directory path which contains model weights file("model_state.pdparams")
                  and model config file ("model_config.json").
            *args (tuple): Position arguments for model `__init__`. If provided,
                use these as position argument values for model initialization.
            **kwargs (dict): Keyword arguments for model `__init__`. If provided,
                use these to update pre-defined keyword argument values for model
                initialization. If the keyword is in `__init__` argument names of
                base model, update argument values of the base model; else update
                argument values of derived model.

        Returns:
            PretrainedModel: An instance of `PretrainedModel`.

        Example:
            .. code-block::

                from paddlenlp.transformers import BertForSequenceClassification

                # Name of built-in pretrained model
                model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

                # Name of community-contributed pretrained model
                model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')

                # Load from local directory path
                model = BertForSequenceClassification.from_pretrained('./my_bert/')
        """
        pretrained_models = list(cls.pretrained_init_configuration.keys())
        resource_files = {}
        init_configuration = {}
        # From built-in pretrained models
        if pretrained_model_name_or_path in pretrained_models:
            for file_id, map_list in cls.pretrained_resource_files_map.items():
                resource_files[file_id] = map_list[
                    pretrained_model_name_or_path]
            init_configuration = copy.deepcopy(
                cls.
                pretrained_init_configuration[pretrained_model_name_or_path])
        # From local dir path
        elif os.path.isdir(pretrained_model_name_or_path):
            for file_id, file_name in cls.resource_files_names.items():
                full_file_name = os.path.join(pretrained_model_name_or_path,
                                              file_name)
                resource_files[file_id] = full_file_name
            resource_files["model_config_file"] = os.path.join(
                pretrained_model_name_or_path, cls.model_config_file)
        else:
            # Assuming from community-contributed pretrained models
            for file_id, file_name in cls.resource_files_names.items():
                full_file_name = os.path.join(COMMUNITY_MODEL_PREFIX,
                                              pretrained_model_name_or_path,
                                              file_name)
                resource_files[file_id] = full_file_name
            resource_files["model_config_file"] = os.path.join(
                COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
                cls.model_config_file)
        default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
        resolved_resource_files = {}
        for file_id, file_path in resource_files.items():
            # None or an existing local file is taken as-is; note the check
            # happens before the cache path is derived (file_path may be None).
            if file_path is None or os.path.isfile(file_path):
                resolved_resource_files[file_id] = file_path
                continue
            path = os.path.join(default_root, file_path.split('/')[-1])
            if os.path.exists(path):
                logger.info("Already cached %s" % path)
                resolved_resource_files[file_id] = path
            else:
                logger.info("Downloading %s and saved to %s" %
                            (file_path, default_root))
                try:
                    resolved_resource_files[file_id] = get_path_from_url(
                        file_path, default_root)
                except RuntimeError as err:
                    logger.error(err)
                    raise RuntimeError(
                        f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
                        f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                        "- a correct model-identifier of built-in pretrained models,\n"
                        "- or a correct model-identifier of community-contributed pretrained models,\n"
                        "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
                    )
        # Prepare model initialization kwargs
        # Did we saved some inputs and kwargs to reload ?
        model_config_file = resolved_resource_files.pop(
            "model_config_file", None)
        if model_config_file is not None:
            with io.open(model_config_file, encoding="utf-8") as f:
                init_kwargs = json.load(f)
        else:
            init_kwargs = init_configuration
        # position args are stored in kwargs, maybe better not include
        init_args = init_kwargs.pop("init_args", ())
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class",
                                     cls.base_model_class.__name__)
        # Check if the loaded config matches the current model class's __init__
        # arguments. If not match, the loaded config is for the base model class.
        if init_class == cls.base_model_class.__name__:
            base_args = init_args
            base_kwargs = init_kwargs
            derived_args = ()
            derived_kwargs = {}
            base_arg_index = None
        else:
            # extract config for base model: stored as a nested dict carrying
            # its own "init_class" inside the derived config's args/kwargs.
            derived_args = list(init_args)
            derived_kwargs = init_kwargs
            base_arg = None
            for i, arg in enumerate(init_args):
                if isinstance(arg, dict) and "init_class" in arg:
                    assert arg.pop(
                        "init_class") == cls.base_model_class.__name__, (
                            "pretrained base model should be {}").format(
                                cls.base_model_class.__name__)
                    base_arg_index = i
                    base_arg = arg
                    break
            for arg_name, arg in init_kwargs.items():
                if isinstance(arg, dict) and "init_class" in arg:
                    assert arg.pop(
                        "init_class") == cls.base_model_class.__name__, (
                            "pretrained base model should be {}").format(
                                cls.base_model_class.__name__)
                    base_arg_index = arg_name
                    base_arg = arg
                    break
            base_args = base_arg.pop("init_args", ())
            base_kwargs = base_arg
        if cls == cls.base_model_class:
            # Update with newly provided args and kwargs for base model
            base_args = base_args if not args else args
            base_kwargs.update(kwargs)
            model = cls(*base_args, **base_kwargs)
        else:
            # Update with newly provided args and kwargs for derived model;
            # keywords matching the base __init__ signature go to the base.
            base_parameters_dict = inspect.signature(
                cls.base_model_class.__init__).parameters
            for k, v in kwargs.items():
                if k in base_parameters_dict:
                    base_kwargs[k] = v
            base_model = cls.base_model_class(*base_args, **base_kwargs)
            if base_arg_index is not None:
                derived_args[base_arg_index] = base_model
            else:
                derived_args = (base_model, )  # assume at the first position
            derived_args = derived_args if not args else args
            derived_parameters_dict = inspect.signature(
                cls.__init__).parameters
            for k, v in kwargs.items():
                if k in derived_parameters_dict:
                    derived_kwargs[k] = v
            model = cls(*derived_args, **derived_kwargs)
        # Maybe need more ways to load resources.
        weight_path = resolved_resource_files["model_state"]
        assert weight_path.endswith(
            ".pdparams"), "suffix of weight must be .pdparams"
        state_dict = paddle.load(weight_path)
        # Make sure we are able to load base models as well as derived models
        # (with heads)
        start_prefix = ""
        model_to_load = model
        state_to_load = state_dict
        unexpected_keys = []
        missing_keys = []
        if not hasattr(model, cls.base_model_prefix) and any(
                s.startswith(cls.base_model_prefix)
                for s in state_dict.keys()):
            # base model: strip the base-model prefix from checkpoint keys.
            state_to_load = {}
            start_prefix = cls.base_model_prefix + "."
            for k, v in state_dict.items():
                if k.startswith(cls.base_model_prefix):
                    state_to_load[k[len(start_prefix):]] = v
                else:
                    unexpected_keys.append(k)
        if hasattr(model, cls.base_model_prefix) and not any(
                s.startswith(cls.base_model_prefix)
                for s in state_dict.keys()):
            # derived model (base model with heads): load into the base
            # submodule; head parameters remain newly initialized.
            model_to_load = getattr(model, cls.base_model_prefix)
            for k in model.state_dict().keys():
                if not k.startswith(cls.base_model_prefix):
                    missing_keys.append(k)
        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".
                format(model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        if paddle.in_dynamic_mode():
            model_to_load.set_state_dict(state_to_load)
            return model
        # Static graph: let the caller feed the state dict into the program.
        return model, state_to_load
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """
        Creates an instance of `PretrainedModel`. Model weights are loaded
        by specifying name of a built-in pretrained model, or a community
        contributed model, or a local file directory path.

        Args:
            pretrained_model_name_or_path (str): Name of pretrained model or
                dir path to load from. The string can be:

                - Name of a built-in pretrained model
                - Name of a community-contributed pretrained model.
                - Local directory path which contains model weights file("model_state.pdparams")
                  and model config file ("model_config.json").
            *args (tuple): Position arguments for model `__init__`. If provided,
                use these as position argument values for model initialization.
            **kwargs (dict): Keyword arguments for model `__init__`. If provided,
                use these to update pre-defined keyword argument values for model
                initialization. If the keyword is in `__init__` argument names of
                base model, update argument values of the base model; else update
                argument values of derived model.
            load_state_as_np (bool, optional): The weights read in can be chosen
                to be placed on CPU or GPU though the model is on the default
                device. If `True`, load the model weights as `numpy.ndarray` on
                CPU. Otherwise, weights would be loaded as tensors on the default
                device. Note that if on GPU, the latter would create extra
                temporary tensors in addition to the model weights, which
                doubles the memory usage. Thus it is suggested to use `True`
                for big models on GPU. Default to `False`.

        Returns:
            PretrainedModel: An instance of `PretrainedModel`.

        Example:
            .. code-block::

                from paddlenlp.transformers import BertForSequenceClassification

                # Name of built-in pretrained model
                model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

                # Name of community-contributed pretrained model
                model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')

                # Load from local directory path
                model = BertForSequenceClassification.from_pretrained('./my_bert/')
        """
        pretrained_models = list(cls.pretrained_init_configuration.keys())
        resource_files = {}
        init_configuration = {}
        # Pop the loading-only option so it is not forwarded to __init__.
        load_state_as_np = kwargs.pop("load_state_as_np", False)
        # From built-in pretrained models
        if pretrained_model_name_or_path in pretrained_models:
            for file_id, map_list in cls.pretrained_resource_files_map.items():
                resource_files[file_id] = map_list[
                    pretrained_model_name_or_path]
            init_configuration = copy.deepcopy(
                cls.
                pretrained_init_configuration[pretrained_model_name_or_path])
        # From local dir path
        elif os.path.isdir(pretrained_model_name_or_path):
            for file_id, file_name in cls.resource_files_names.items():
                full_file_name = os.path.join(pretrained_model_name_or_path,
                                              file_name)
                resource_files[file_id] = full_file_name
            resource_files["model_config_file"] = os.path.join(
                pretrained_model_name_or_path, cls.model_config_file)
        else:
            # Assuming from community-contributed pretrained models
            for file_id, file_name in cls.resource_files_names.items():
                full_file_name = os.path.join(COMMUNITY_MODEL_PREFIX,
                                              pretrained_model_name_or_path,
                                              file_name)
                resource_files[file_id] = full_file_name
            resource_files["model_config_file"] = os.path.join(
                COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
                cls.model_config_file)
        default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
        resolved_resource_files = {}
        for file_id, file_path in resource_files.items():
            # None or an existing local file is taken as-is; the check runs
            # before the cache path is derived (file_path may be None).
            if file_path is None or os.path.isfile(file_path):
                resolved_resource_files[file_id] = file_path
                continue
            path = os.path.join(default_root, file_path.split('/')[-1])
            if os.path.exists(path):
                logger.info("Already cached %s" % path)
                resolved_resource_files[file_id] = path
            else:
                logger.info("Downloading %s and saved to %s" %
                            (file_path, default_root))
                try:
                    resolved_resource_files[file_id] = get_path_from_url(
                        file_path, default_root)
                except RuntimeError as err:
                    logger.error(err)
                    raise RuntimeError(
                        f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
                        f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                        "- a correct model-identifier of built-in pretrained models,\n"
                        "- or a correct model-identifier of community-contributed pretrained models,\n"
                        "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
                    )
        # Prepare model initialization kwargs
        # Did we saved some inputs and kwargs to reload ?
        model_config_file = resolved_resource_files.pop(
            "model_config_file", None)
        if model_config_file is not None:
            with io.open(model_config_file, encoding="utf-8") as f:
                init_kwargs = json.load(f)
        else:
            init_kwargs = init_configuration
        # position args are stored in kwargs, maybe better not include
        init_args = init_kwargs.pop("init_args", ())
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class",
                                     cls.base_model_class.__name__)
        # Check if the loaded config matches the current model class's __init__
        # arguments. If not match, the loaded config is for the base model class.
        if init_class == cls.base_model_class.__name__:
            base_args = init_args
            base_kwargs = init_kwargs
            derived_args = ()
            derived_kwargs = {}
            base_arg_index = None
        else:
            # extract config for base model: stored as a nested dict carrying
            # its own "init_class" inside the derived config's args/kwargs.
            derived_args = list(init_args)
            derived_kwargs = init_kwargs
            base_arg = None
            for i, arg in enumerate(init_args):
                if isinstance(arg, dict) and "init_class" in arg:
                    assert arg.pop(
                        "init_class") == cls.base_model_class.__name__, (
                            "pretrained base model should be {}").format(
                                cls.base_model_class.__name__)
                    base_arg_index = i
                    base_arg = arg
                    break
            for arg_name, arg in init_kwargs.items():
                if isinstance(arg, dict) and "init_class" in arg:
                    assert arg.pop(
                        "init_class") == cls.base_model_class.__name__, (
                            "pretrained base model should be {}").format(
                                cls.base_model_class.__name__)
                    base_arg_index = arg_name
                    base_arg = arg
                    break
            base_args = base_arg.pop("init_args", ())
            base_kwargs = base_arg
        if cls == cls.base_model_class:
            # Update with newly provided args and kwargs for base model
            base_args = base_args if not args else args
            base_kwargs.update(kwargs)
            model = cls(*base_args, **base_kwargs)
        else:
            # Update with newly provided args and kwargs for derived model;
            # keywords matching the base __init__ signature go to the base.
            base_parameters_dict = inspect.signature(
                cls.base_model_class.__init__).parameters
            for k, v in kwargs.items():
                if k in base_parameters_dict:
                    base_kwargs[k] = v
            base_model = cls.base_model_class(*base_args, **base_kwargs)
            if base_arg_index is not None:
                derived_args[base_arg_index] = base_model
            else:
                derived_args = (base_model, )  # assume at the first position
            derived_args = derived_args if not args else args
            derived_parameters_dict = inspect.signature(
                cls.__init__).parameters
            for k, v in kwargs.items():
                if k in derived_parameters_dict:
                    derived_kwargs[k] = v
            model = cls(*derived_args, **derived_kwargs)
        # Maybe need more ways to load resources.
        weight_path = resolved_resource_files["model_state"]
        assert weight_path.endswith(
            ".pdparams"), "suffix of weight must be .pdparams"
        # NOTE: Allow to load partial model for model parallel.
        # TODO(guosheng): To make model loading for the model parallel automatic,
        # maybe we should make rank 0 worker load weights of the full model on
        # CPU, then split weights into multiple parts and pickle separately.
        # The other workers wait util pickle finish and then load the corresponding
        # partial weights. Also we can directly use separate weight files for
        # simplicity.
        state_dict = paddle.load(weight_path, return_numpy=load_state_as_np)
        # Make sure we are able to load base models as well as derived models
        # (with heads)
        start_prefix = ""
        model_to_load = model
        state_to_load = state_dict
        unexpected_keys = []
        missing_keys = []
        if not hasattr(model, cls.base_model_prefix) and any(
                s.startswith(cls.base_model_prefix)
                for s in state_dict.keys()):
            # base model: strip the base-model prefix from checkpoint keys.
            state_to_load = {}
            start_prefix = cls.base_model_prefix + "."
            for k, v in state_dict.items():
                if k.startswith(cls.base_model_prefix):
                    state_to_load[k[len(start_prefix):]] = v
                else:
                    unexpected_keys.append(k)
        if hasattr(model, cls.base_model_prefix) and not any(
                s.startswith(cls.base_model_prefix)
                for s in state_dict.keys()):
            # derived model (base model with heads): load into the base
            # submodule; head parameters remain newly initialized.
            model_to_load = getattr(model, cls.base_model_prefix)
            for k in model.state_dict().keys():
                if not k.startswith(cls.base_model_prefix):
                    missing_keys.append(k)
        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".
                format(model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        # Allow the float16 model to load float32 weights, which decreases memory
        # usage in model loading stage and is useful to big models.
        dtype_prefix_len = len("paddle.")  # paddle.float16
        for k, v in model_to_load.state_dict().items():
            if not isinstance(v, np.ndarray):
                dtype = str(v.dtype)[dtype_prefix_len:]
                # TODO(guosheng): add warnings for unmatched dtypes
                if k in state_to_load:
                    state_to_load[k] = state_to_load[k].astype(dtype)
        # Logging model download statistics
        download_check(pretrained_model_name_or_path, "from_pretrained")
        # For model parallel if FasterGeneration
        # To avoid recursive import temporarily.
        import paddlenlp.ops.faster_transformer.transformer.decoding as ft_decoding
        # Keep only the weights this parallel rank owns (no-op otherwise).
        state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(
            model_to_load, state_to_load)
        if paddle.in_dynamic_mode():
            model_to_load.set_state_dict(state_to_load)
            return model
        # Static graph: let the caller feed the state dict into the program.
        return model, state_to_load
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
    """
    Creates an instance of `AutoTokenizer`. Related resources are loaded by
    specifying name of a built-in pretrained model, or a community-contributed
    pretrained model, or a local file directory path.

    Args:
        pretrained_model_name_or_path (str): Name of pretrained model or dir path
            to load from. The string can be:

            - Name of built-in pretrained model
            - Name of a community-contributed pretrained model.
            - Local directory path which contains tokenizer related resources
              and tokenizer config file ("tokenizer_config.json").
        *model_args (tuple): position arguments for model `__init__`. If provided,
            use these as position argument values for tokenizer initialization.
        **kwargs (dict): keyword arguments for model `__init__`. If provided,
            use these to update pre-defined keyword argument values for tokenizer
            initialization.

    Returns:
        PretrainedTokenizer: An instance of `PretrainedTokenizer`.

    Raises:
        RuntimeError: If resources for a community model cannot be downloaded,
            or if no tokenizer class can be resolved for
            `pretrained_model_name_or_path`.

    Example:
        .. code-block::

            from paddlenlp.transformers import AutoTokenizer

            # Name of built-in pretrained model
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Name of community-contributed pretrained model
            tokenizer = AutoTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Load from local directory path
            tokenizer = AutoTokenizer.from_pretrained('./my_bert/')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>
    """

    def _import_tokenizer_class(init_class):
        # Resolve the tokenizer class object from its configured class name
        # via the module path recorded in `cls._name_mapping`.
        class_name = cls._name_mapping[init_class]
        import_class = importlib.import_module(
            f"paddlenlp.transformers.{class_name}.tokenizer")
        return getattr(import_class, init_class)

    def _tokenizer_class_from_pattern():
        # Fallback: infer the tokenizer class by matching known name patterns
        # against the lowercased model name/path. Returns None if nothing matches.
        # NOTE: previously used bare `print`; switched to `logger` for
        # consistency with the rest of this method.
        logger.info(
            'We use pattern recognition to recognize the Tokenizer class.')
        for key, pattern in cls._name_mapping.items():
            if pattern in pretrained_model_name_or_path.lower():
                tokenizer_name = _import_tokenizer_class(key)
                logger.info(
                    f"The 'pretrained_model_name_or_path' is {pretrained_model_name_or_path}, we import {tokenizer_name}."
                )
                return tokenizer_name
        return None

    def _tokenizer_class_from_config(config_path):
        # Read a tokenizer config JSON and resolve the tokenizer class from
        # its "init_class" entry; fall back to pattern recognition.
        # (This logic was previously duplicated in the local-dir and
        # community branches.)
        with io.open(config_path, encoding="utf-8") as f:
            init_kwargs = json.load(f)
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class", None)
        if init_class:
            return _import_tokenizer_class(init_class)
        return _tokenizer_class_from_pattern()

    all_tokenizer_names = [
        name for names in cls._tokenizer_mapping.keys() for name in names
    ]

    # From built-in pretrained models
    if pretrained_model_name_or_path in all_tokenizer_names:
        for names, tokenizer_class in cls._tokenizer_mapping.items():
            for pattern in names:
                if pattern == pretrained_model_name_or_path:
                    return tokenizer_class.from_pretrained(
                        pretrained_model_name_or_path, *model_args, **kwargs)
    # From local dir path
    elif os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path,
                                   cls.tokenizer_config_file)
        if os.path.exists(config_file):
            tokenizer_class = _tokenizer_class_from_config(config_file)
            if tokenizer_class is not None:
                return tokenizer_class.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
    # Assuming from community-contributed pretrained models
    else:
        community_config_path = os.path.join(COMMUNITY_MODEL_PREFIX,
                                             pretrained_model_name_or_path,
                                             cls.tokenizer_config_file)
        default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
        try:
            resolved_vocab_file = get_path_from_url(community_config_path,
                                                    default_root)
        except RuntimeError as err:
            logger.error(err)
            raise RuntimeError(
                f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
                f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                "- a correct model-identifier of built-in pretrained models,\n"
                "- or a correct model-identifier of community-contributed pretrained models,\n"
                "- or the correct path to a directory containing relevant tokenizer files.\n"
            )
        if os.path.exists(resolved_vocab_file):
            tokenizer_class = _tokenizer_class_from_config(resolved_vocab_file)
            if tokenizer_class is not None:
                return tokenizer_class.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
    # Previously this method silently returned None here (missing config file
    # or no pattern match), which surfaced later as a confusing AttributeError.
    # Fail loudly with an actionable message instead.
    raise RuntimeError(
        f"Can't recognize a tokenizer class for '{pretrained_model_name_or_path}'. "
        "Please make sure its tokenizer_config.json contains an 'init_class' "
        "entry, or that its name matches a known tokenizer pattern.")
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
    """
    Creates an instance of `AutoTokenizer`. Related resources are loaded by
    specifying name of a built-in pretrained model, or a community-contributed
    pretrained model, or a local file directory path.

    Args:
        pretrained_model_name_or_path (str): Name of pretrained model or dir path
            to load from. The string can be:

            - Name of built-in pretrained model
            - Name of a community-contributed pretrained model.
            - Local directory path which contains tokenizer related resources
              and tokenizer config file ("tokenizer_config.json").
        *model_args (tuple): position arguments for model `__init__`. If provided,
            use these as position argument values for tokenizer initialization.
        **kwargs (dict): keyword arguments for model `__init__`. If provided,
            use these to update pre-defined keyword argument values for tokenizer
            initialization. May include ``use_faster`` (bool, default False) to
            request the faster (C++) tokenizer implementation for built-in
            models when available.

    Returns:
        PretrainedTokenizer: An instance of `PretrainedTokenizer`.

    Raises:
        RuntimeError: If resources for a community model cannot be downloaded,
            or if no tokenizer class can be resolved for
            `pretrained_model_name_or_path`.

    Example:
        .. code-block::

            from paddlenlp.transformers import AutoTokenizer

            # Name of built-in pretrained model
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Name of community-contributed pretrained model
            tokenizer = AutoTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Load from local directory path
            tokenizer = AutoTokenizer.from_pretrained('./my_bert/')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>
    """

    def _import_tokenizer_class(init_class):
        # Resolve the tokenizer class object from its configured class name
        # via the module path recorded in `cls._name_mapping`.
        class_name = cls._name_mapping[init_class]
        import_class = importlib.import_module(
            f"paddlenlp.transformers.{class_name}.tokenizer")
        return getattr(import_class, init_class)

    def _tokenizer_class_from_pattern():
        # Fallback: infer the tokenizer class by matching known name patterns
        # against the lowercased model name/path. Returns None if nothing matches.
        # NOTE: previously used bare `print`; switched to `logger` for
        # consistency with the rest of this method.
        logger.info(
            'We use pattern recognition to recognize the Tokenizer class.')
        for key, pattern in cls._name_mapping.items():
            if pattern in pretrained_model_name_or_path.lower():
                return _import_tokenizer_class(key)
        return None

    def _tokenizer_class_from_config(config_path):
        # Read a tokenizer config JSON and resolve the tokenizer class from
        # its "init_class" (or HF-style "tokenizer_class") entry; fall back
        # to pattern recognition. Previously the "tokenizer_class" fallback
        # existed only in the local-dir branch; it now applies to community
        # configs too, for consistency.
        with io.open(config_path, encoding="utf-8") as f:
            init_kwargs = json.load(f)
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class", None)
        if init_class is None:
            init_class = init_kwargs.pop("tokenizer_class", None)
        if init_class:
            return _import_tokenizer_class(init_class)
        return _tokenizer_class_from_pattern()

    # default not to use faster tokenizer
    use_faster = kwargs.pop("use_faster", False)

    all_tokenizer_names = [
        name for names in cls._tokenizer_mapping.keys() for name in names
    ]

    # From built-in pretrained models
    if pretrained_model_name_or_path in all_tokenizer_names:
        for names, tokenizer_classes in cls._tokenizer_mapping.items():
            for pattern in names:
                if pattern != pretrained_model_name_or_path:
                    continue
                actual_tokenizer_class = None
                # Default setting the python tokenizer to actual_tokenizer_class
                # (entries are (class, is_faster) pairs).
                for tokenizer_class in tokenizer_classes:
                    if not tokenizer_class[1]:
                        actual_tokenizer_class = tokenizer_class[0]
                        break
                if use_faster:
                    if is_faster_tokenizers_available():
                        is_support_faster_tokenizer = False
                        for tokenizer_class in tokenizer_classes:
                            if tokenizer_class[1]:
                                actual_tokenizer_class = tokenizer_class[0]
                                is_support_faster_tokenizer = True
                                break
                        if not is_support_faster_tokenizer:
                            logger.warning(
                                f"The tokenizer {actual_tokenizer_class} doesn't have the faster version."
                                " Please check the map `paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES`"
                                " to see which faster tokenizers are currently supported."
                            )
                    else:
                        logger.warning(
                            "Can't find the faster_tokenizers package, "
                            "please ensure install faster_tokenizers correctly. "
                            "You can install faster_tokenizers by `pip install faster_tokenizers`"
                            "(Currently only work for linux platform).")
                logger.info("We are using %s to load '%s'." %
                            (actual_tokenizer_class,
                             pretrained_model_name_or_path))
                return actual_tokenizer_class.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
    # From local dir path
    elif os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path,
                                   cls.tokenizer_config_file)
        if os.path.exists(config_file):
            tokenizer_class = _tokenizer_class_from_config(config_file)
            if tokenizer_class is not None:
                logger.info("We are using %s to load '%s'." %
                            (tokenizer_class, pretrained_model_name_or_path))
                return tokenizer_class.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
    # Assuming from community-contributed pretrained models
    else:
        community_config_path = os.path.join(COMMUNITY_MODEL_PREFIX,
                                             pretrained_model_name_or_path,
                                             cls.tokenizer_config_file)
        default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
        try:
            resolved_vocab_file = get_path_from_url(community_config_path,
                                                    default_root)
        except RuntimeError as err:
            logger.error(err)
            raise RuntimeError(
                f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
                f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                "- a correct model-identifier of built-in pretrained models,\n"
                "- or a correct model-identifier of community-contributed pretrained models,\n"
                "- or the correct path to a directory containing relevant tokenizer files.\n"
            )
        if os.path.exists(resolved_vocab_file):
            tokenizer_class = _tokenizer_class_from_config(resolved_vocab_file)
            if tokenizer_class is not None:
                logger.info("We are using %s to load '%s'." %
                            (tokenizer_class, pretrained_model_name_or_path))
                return tokenizer_class.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
    # Previously this method silently returned None here (missing config file
    # or no pattern match), which surfaced later as a confusing AttributeError.
    # Fail loudly with an actionable message instead.
    raise RuntimeError(
        f"Can't recognize a tokenizer class for '{pretrained_model_name_or_path}'. "
        "Please make sure its tokenizer_config.json contains an 'init_class' "
        "entry, or that its name matches a known tokenizer pattern.")
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    """
    Instantiate an instance of `PretrainedTokenizer` from a predefined
    tokenizer specified by name or path, which always corresponds to a
    pretrained model.

    Args:
        pretrained_model_name_or_path (str): A name of or a file path to a
            pretrained model.
        *args (tuple): position arguments for `__init__`. If provided, use
            these as position argument values for tokenizer initialization.
        **kwargs (dict): keyword arguments for `__init__`. If provided, use
            these to update pre-defined keyword argument values for tokenizer
            initialization.

    Returns:
        PretrainedTokenizer: An instance of PretrainedTokenizer.

    Raises:
        ValueError: If `pretrained_model_name_or_path` is neither a known
            model identifier nor an existing directory.
    """
    known_models = list(cls.pretrained_init_configuration.keys())
    resource_paths = {}
    predefined_config = {}
    if pretrained_model_name_or_path in known_models:
        # Built-in model: pick up resource URLs and the predefined
        # initialization configuration (deep-copied so it can be mutated).
        for resource_id, name_to_url in cls.pretrained_resource_files_map.items():
            resource_paths[resource_id] = name_to_url[
                pretrained_model_name_or_path]
        predefined_config = copy.deepcopy(
            cls.pretrained_init_configuration[pretrained_model_name_or_path])
    elif os.path.isdir(pretrained_model_name_or_path):
        # Local directory: every expected resource file should live inside it.
        for resource_id, resource_name in cls.resource_files_names.items():
            resource_paths[resource_id] = os.path.join(
                pretrained_model_name_or_path, resource_name)
        resource_paths["tokenizer_config_file"] = os.path.join(
            pretrained_model_name_or_path, cls.tokenizer_config_file)
    else:
        raise ValueError(
            "Calling {}.from_pretrained() with a model identifier or the "
            "path to a directory instead. The supported model "
            "identifiers are as follows: {}".format(
                cls.__name__, cls.pretrained_init_configuration.keys()))

    default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
    resolved_paths = {}
    for resource_id, path_or_url in resource_paths.items():
        if path_or_url is None or os.path.isfile(path_or_url):
            # Already local (or absent): use as-is.
            resolved_paths[resource_id] = path_or_url
        else:
            # Remote resource: fetch it (or reuse the cache) under default_root.
            resolved_paths[resource_id] = get_path_from_url(
                path_or_url, default_root, None)

    # Prepare tokenizer initialization kwargs: prefer a saved
    # tokenizer config file over the predefined configuration.
    saved_config_file = resolved_paths.pop("tokenizer_config_file", None)
    if saved_config_file is None:
        init_kwargs = predefined_config
    else:
        with io.open(saved_config_file, encoding="utf-8") as f:
            init_kwargs = json.load(f)

    # position args are stored in kwargs, maybe better not include
    init_args = init_kwargs.pop("init_args", ())
    init_kwargs.pop("init_class", None)

    # Newly provided args and kwargs take precedence over saved ones.
    if args:
        init_args = args
    init_kwargs.update(kwargs)

    # Merge resolved resource paths into init_kwargs when not already set.
    # Maybe need more ways to load resources.
    for resource_id, resolved_path in resolved_paths.items():
        init_kwargs.setdefault(resource_id, resolved_path)

    # TODO(guosheng): avoid reduplication of position args and key word args
    return cls(*init_args, **init_kwargs)