Example #1
def load_encoder_from_transformers_weights(
    encoder: nn.Module, weights_dict: dict, return_remainder=False
):
    """Find encoder weights in weights dict, load them into encoder, return any remaining weights.

    TODO: clarify how we know the encoder weights will be prefixed by transformer name.

    Args:
        encoder (PreTrainedModel): Transformer w/o heads (embedding layer + self-attention layer).
        weights_dict (Dict): model weights.
        return_remainder (bool): If True, return any leftover weights.

    Returns:
        Dict containing any leftover weights.

    """
    remainder_weights_dict = {}
    load_weights_dict = {}
    model_arch = ModelArchitectures.from_encoder(encoder=encoder)
    encoder_prefix = MODEL_PREFIX[model_arch] + "."
    # Encoder
    for k, v in weights_dict.items():
        if k.startswith(encoder_prefix):
            load_weights_dict[strings.remove_prefix(k, encoder_prefix)] = v
        else:
            remainder_weights_dict[k] = v
    encoder.load_state_dict(load_weights_dict)
    if return_remainder:
        return remainder_weights_dict
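
A hedged usage sketch (the checkpoint path and the weight key mentioned in the comment are hypothetical, and `torch` is assumed to be imported at module level): keys prefixed with the encoder's transformer name are stripped and loaded into the encoder, everything else is returned when `return_remainder=True`.

# Hypothetical usage: split a full-model checkpoint into encoder weights and leftovers.
weights_dict = torch.load("/path/to/model_weights.p")  # hypothetical path
leftover = load_encoder_from_transformers_weights(
    encoder=encoder, weights_dict=weights_dict, return_remainder=True
)
# Head weights such as "classifier.out_proj.weight" (hypothetical key) remain in `leftover`.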
Example #2
def get_tokenizer(model_type, tokenizer_path):
    """Instantiate a tokenizer for a given model type.

    Args:
        model_type (str): model shortcut name.
        tokenizer_path (str): path to tokenizer directory.

    Returns:
        Tokenizer for the given model type.

    """
    model_arch = ModelArchitectures.from_model_type(model_type)
    tokenizer_class = resolve_tokenizer_class(model_type)
    if model_arch in [ModelArchitectures.BERT]:
        if "-cased" in model_type:
            do_lower_case = False
        elif "-uncased" in model_type:
            do_lower_case = True
        else:
            raise RuntimeError(model_type)
    elif model_arch in [
            ModelArchitectures.XLM,
            ModelArchitectures.ROBERTA,
            ModelArchitectures.XLM_ROBERTA,
    ]:
        do_lower_case = False
    elif model_arch in [ModelArchitectures.ALBERT]:
        do_lower_case = True
    else:
        raise RuntimeError(str(tokenizer_class))
    tokenizer = tokenizer_class.from_pretrained(tokenizer_path,
                                                do_lower_case=do_lower_case)
    return tokenizer
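
A short usage sketch (the model type and path are hypothetical): casing is derived from the model type, so a "-uncased" BERT model lower-cases input while RoBERTa-family models never do.

# Hypothetical call: "-uncased" in the model type selects do_lower_case=True.
tokenizer = get_tokenizer(
    model_type="bert-base-uncased",
    tokenizer_path="/path/to/tokenizer_dir",
)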
Example #3
@contextlib.contextmanager
def output_hidden_states_context(encoder):
    """Temporarily set the encoder to output hidden states, restoring the previous value on exit."""
    model_arch = ModelArchitectures.from_encoder(encoder)
    if model_arch in (
            ModelArchitectures.BERT,
            ModelArchitectures.ROBERTA,
            ModelArchitectures.ALBERT,
            ModelArchitectures.XLM_ROBERTA,
            ModelArchitectures.ELECTRA,
    ):
        if hasattr(encoder.encoder, "output_hidden_states"):
            # Transformers < v2
            modified_obj = encoder.encoder
        elif hasattr(encoder.encoder.config, "output_hidden_states"):
            # Transformers >= v3
            modified_obj = encoder.encoder.config
        else:
            raise RuntimeError(
                f"Failed to convert model {type(encoder)} to output hidden states"
            )
        old_value = modified_obj.output_hidden_states
        modified_obj.output_hidden_states = True
        yield
        modified_obj.output_hidden_states = old_value
    elif model_arch in (ModelArchitectures.BART, ModelArchitectures.MBART):
        yield
        return
    else:
        raise KeyError(model_arch)
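
Because the function yields once in the middle, it is used as a context manager (hence the @contextlib.contextmanager decorator above). A hedged usage sketch; the forward-call arguments are assumed to match the encoder's signature.

# Temporarily force the encoder to return all hidden states, then restore the old flag.
with output_hidden_states_context(encoder):
    output = encoder(input_ids=input_ids, attention_mask=input_mask)
# Outside the block, output_hidden_states is back to its previous value.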
Example #4
def create_taskmodel(task, encoder, **taskmodel_kwargs) -> Taskmodel:
    """Creates, initializes and returns the task model for a given task type and encoder.

    Args:
        task (Task): Task object associated with the taskmodel being created.
        encoder (JiantTransformersModel): Transformer w/o heads
            (embedding layer + self-attention layer).
        **taskmodel_kwargs: Additional args for taskmodel setup

    Raises:
        KeyError if task does not have valid TASK_TYPE.

    Returns:
        Taskmodel

    """
    head_kwargs = {}
    head_kwargs["hidden_size"] = encoder.get_hidden_size()
    head_kwargs["hidden_dropout_prob"] = encoder.get_hidden_dropout_prob()
    head_kwargs["vocab_size"] = encoder.config.vocab_size
    head_kwargs["model_arch"] = ModelArchitectures(encoder.config.model_type)

    if hasattr(encoder, "hidden_act"):
        head_kwargs["hidden_act"] = encoder.config.hidden_act
    if hasattr(encoder, "layer_norm_eps"):
        head_kwargs["layer_norm_eps"] = encoder.config.layer_norm_eps

    head = JiantHeadFactory()(task, **head_kwargs)

    taskmodel = JiantTaskModelFactory()(task, encoder, head,
                                        **taskmodel_kwargs)
    return taskmodel
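
A hedged usage sketch; the task name, the prebuilt `task_dict`, and `taskmodels_config` are hypothetical and would normally come from the surrounding setup code (see Example #5).

# Hypothetical: build the task model for an MNLI-style classification task.
taskmodel = create_taskmodel(
    task=task_dict["mnli"],
    encoder=encoder,
    **taskmodels_config.get_taskmodel_kwargs("mnli"),
)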
Example #5
def setup_jiant_model(
    hf_pretrained_model_name_or_path: str,
    model_config_path: str,
    task_dict: Dict[str, Task],
    taskmodels_config: container_setup.TaskmodelsConfig,
):
    """Sets up tokenizer, encoder, and task models, and instantiates and returns a JiantModel.

    Args:
        hf_pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            Can be either:

                - A string, the `model id` of a predefined tokenizer hosted inside a model
                  repo on huggingface.co. Valid model ids can be located at the root-level,
                  like ``bert-base-uncased``, or namespaced under
                  a user or organization name, like ``dbmdz/bert-base-german-cased``.
                - A path to a `directory` containing vocabulary files required by the
                  tokenizer, for instance saved using the
                  :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
                  ``./my_model_directory/``.
                - A path or url to a single saved vocabulary file if and only if
                  the tokenizer only requires a single vocabulary file (like Bert or XLNet),
                  e.g.: ``./my_model_directory/vocab.txt``. (Not
                  applicable to all derived classes)
        model_config_path (str): Path to the JSON file containing the configuration parameters.
        task_dict (Dict[str, tasks.Task]): map from task name to task instance.
        taskmodels_config: mapping from tasks to task models, along with task-model configurations.

    Returns:
        JiantModel nn.Module.

    """
    model = transformers.AutoModel.from_pretrained(hf_pretrained_model_name_or_path)
    model_arch = ModelArchitectures.from_model_type(model.base_model_prefix)
    transformers_class_spec = TRANSFORMERS_CLASS_SPEC_DICT[model_arch]
    tokenizer = transformers.AutoTokenizer.from_pretrained(hf_pretrained_model_name_or_path)
    ancestor_model = get_ancestor_model(
        transformers_class_spec=transformers_class_spec, model_config_path=model_config_path,
    )
    encoder = get_encoder(model_arch=model_arch, ancestor_model=ancestor_model)
    taskmodels_dict = {
        taskmodel_name: create_taskmodel(
            task=task_dict[task_name_list[0]],  # Take the first task
            model_arch=model_arch,
            encoder=encoder,
            taskmodel_kwargs=taskmodels_config.get_taskmodel_kwargs(taskmodel_name),
        )
        for taskmodel_name, task_name_list in get_taskmodel_and_task_names(
            taskmodels_config.task_to_taskmodel_map
        ).items()
    }
    return primary.JiantModel(
        task_dict=task_dict,
        encoder=encoder,
        taskmodels_dict=taskmodels_dict,
        task_to_taskmodel_map=taskmodels_config.task_to_taskmodel_map,
        tokenizer=tokenizer,
    )
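
A usage sketch under the assumption that `task_dict` and `taskmodels_config` have already been constructed; the model name and config path are hypothetical.

jiant_model = setup_jiant_model(
    hf_pretrained_model_name_or_path="bert-base-uncased",
    model_config_path="/path/to/model_config.json",  # hypothetical path
    task_dict=task_dict,
    taskmodels_config=taskmodels_config,
)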
Example #6
def get_output_from_encoder(encoder, input_ids, segment_ids,
                            input_mask) -> EncoderOutput:
    """Pass inputs to encoder, return encoder output.

    Args:
        encoder: bare model outputting raw hidden-states without any specific head.
        input_ids: token indices (see huggingface.co/transformers/glossary.html#input-ids).
        segment_ids: token type ids (see huggingface.co/transformers/glossary.html#token-type-ids).
        input_mask: attention mask (see huggingface.co/transformers/glossary.html#attention-mask).

    Raises:
        RuntimeError if encoder output contains less than 2 elements.

    Returns:
        EncoderOutput containing pooled and unpooled model outputs as well as any other outputs.

    """
    model_arch = ModelArchitectures.from_encoder(encoder)
    if model_arch in [
            ModelArchitectures.BERT,
            ModelArchitectures.ROBERTA,
            ModelArchitectures.ALBERT,
            ModelArchitectures.XLM_ROBERTA,
    ]:
        pooled, unpooled, other = get_output_from_standard_transformer_models(
            encoder=encoder,
            input_ids=input_ids,
            segment_ids=segment_ids,
            input_mask=input_mask,
        )
    elif model_arch == ModelArchitectures.ELECTRA:
        pooled, unpooled, other = get_output_from_electra(
            encoder=encoder,
            input_ids=input_ids,
            segment_ids=segment_ids,
            input_mask=input_mask,
        )
    elif model_arch in [
            ModelArchitectures.BART,
            ModelArchitectures.MBART,
    ]:
        pooled, unpooled, other = get_output_from_bart_models(
            encoder=encoder,
            input_ids=input_ids,
            input_mask=input_mask,
        )
    else:
        raise KeyError(model_arch)

    # Extend later with attention, hidden_acts, etc
    if other:
        return EncoderOutput(pooled=pooled, unpooled=unpooled, other=other)
    else:
        return EncoderOutput(pooled=pooled, unpooled=unpooled)
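
A hedged usage sketch; the batch attribute names are assumptions about how the dataloader packages its tensors.

encoder_output = get_output_from_encoder(
    encoder=encoder,
    input_ids=batch.input_ids,      # [batch_size, seq_len]
    segment_ids=batch.segment_ids,  # [batch_size, seq_len]
    input_mask=batch.input_mask,    # [batch_size, seq_len]
)
pooled = encoder_output.pooled      # [batch_size, hidden_size]
unpooled = encoder_output.unpooled  # [batch_size, seq_len, hidden_size]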
Example #7
    def __call__(cls, hf_model):
        """Returns the JiantTransformersModel wrapper class for the corresponding Hugging Face
        Transformer model.

        Args:
            hf_model (PreTrainedModel): Hugging Face model to convert to JiantTransformersModel

        Returns:
            JiantTransformersModel: Jiant wrapper class for Hugging Face model
        """
        encoder_class = cls.registry[ModelArchitectures(
            hf_model.config.model_type)]
        encoder = encoder_class(hf_model)
        return encoder
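
A hedged usage sketch; `ModelFactory` is a stand-in for the enclosing factory class, whose name is not shown in this snippet, and the Hugging Face model name is hypothetical.

hf_model = transformers.AutoModel.from_pretrained("roberta-base")
encoder = ModelFactory()(hf_model)  # returns the jiant wrapper instance for the RoBERTa model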
Example #8
def load_encoder_from_transformers_weights(encoder: nn.Module,
                                           weights_dict: dict,
                                           return_remainder=False):
    """Find encoder weights in weights dict, load them into encoder, return any remaining weights.

    TODO: clarify how we know the encoder weights will be prefixed by transformer name.

    Args:
        encoder (PreTrainedModel): Transformer w/o heads (embedding layer + self-attention layer).
        weights_dict (Dict): model weights.
        return_remainder (bool): If True, return any leftover weights.

    Returns:
        Dict containing any leftover weights.

    """
    remainder_weights_dict = {}
    load_weights_dict = {}
    model_arch = ModelArchitectures.from_model_type(
        model_type=encoder.config.model_type)
    encoder_prefix = model_arch.value + "."
    # Encoder
    for k, v in weights_dict.items():
        if k.startswith(encoder_prefix):
            load_weights_dict[strings.remove_prefix(k, encoder_prefix)] = v
        elif k.startswith(encoder_prefix.split("-")[0]):
            # workaround for deberta-v2
            # remove "-v2" suffix. weight names are prefixed with "deberta" and not "deberta-v2"
            load_weights_dict[strings.remove_prefix(
                k,
                encoder_prefix.split("-")[0] + ".")] = v
        else:
            remainder_weights_dict[k] = v
    encoder.load_state_dict(load_weights_dict, strict=False)
    if remainder_weights_dict:
        warnings.warn("The following weights were not loaded: {}".format(
            remainder_weights_dict.keys()))
    if return_remainder:
        return remainder_weights_dict
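
A small, self-contained illustration of the deberta-v2 workaround above: the architecture value is "deberta-v2", but the checkpoint keys are prefixed with "deberta", so the prefix is truncated at the "-" before stripping (the key below is hypothetical).

encoder_prefix = "deberta-v2."
key = "deberta.encoder.layer.0.attention.self.query_proj.weight"  # hypothetical key
base_prefix = encoder_prefix.split("-")[0] + "."  # "deberta."
assert key.startswith(encoder_prefix.split("-")[0])
print(key[len(base_prefix):])  # equivalent to strings.remove_prefix(key, base_prefix)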
Example #9
def get_model_arch_from_jiant_model(jiant_model: nn.Module) -> ModelArchitectures:
    return ModelArchitectures.from_encoder(encoder=jiant_model.encoder)
Example #10
    @classmethod
    def build_featurization_spec(cls, model_type, max_seq_length):
        """Build the featurization spec for the given model type and maximum sequence length."""
        model_arch = ModelArchitectures.from_model_type(model_type)
        model_class = cls.get_registry()[model_arch]
        return model_class.get_feat_spec(model_type, max_seq_length)
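
A hedged usage sketch; `ModelFactory` again stands in for the enclosing factory class, and the model type and length are hypothetical.

feat_spec = ModelFactory.build_featurization_spec(
    model_type="bert-base-uncased", max_seq_length=128
)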