Example #1
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        masked_language_modeling: bool = True,
    ) -> None:
        TokenEmbedder.__init__(self)  # Call the base class constructor
        tokenizer = PretrainedTransformerTokenizer(model_name)
        self.masked_language_modeling = masked_language_modeling

        if self.masked_language_modeling:
            self.config = AutoConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            # We only need access to the HF tokenizer if we are masked language modeling
            self.tokenizer = tokenizer.tokenizer
            # The only differences when masked language modeling are:
            # 1) `output_hidden_states` must be True to get access to token embeddings.
            # 2) We need to use `AutoModelForMaskedLM` to get the correct model
            self.transformer_model = AutoModelForMaskedLM.from_pretrained(
                model_name, config=self.config)
        # Everything after the if statement (including the else) is copied directly from:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
        else:
            from allennlp.common import cached_transformers

            self.transformer_model = cached_transformers.get(
                model_name, True, override_weights_file,
                override_weights_strip_prefix)
            self.config = self.transformer_model.config

        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model,
                                             sub_module)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        self._num_added_start_tokens = len(
            tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False
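The comments in the snippet above call out the two masked-language-modeling specifics: `output_hidden_states=True` on the config and `AutoModelForMaskedLM` as the model class. A minimal standalone sketch of what those two settings provide, using only Hugging Face transformers (the checkpoint name and input text are placeholders):

from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer

model_name = "bert-base-uncased"  # placeholder checkpoint
config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name, config=config)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
outputs = model(**inputs)

# `AutoModelForMaskedLM` exposes MLM logits over the vocabulary...
print(outputs.logits.shape)             # (batch, seq_len, vocab_size)
# ...and `output_hidden_states=True` exposes the per-layer token embeddings.
print(len(outputs.hidden_states))       # num_hidden_layers + 1 (embedding layer included)
print(outputs.hidden_states[-1].shape)  # (batch, seq_len, hidden_size)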
Example #2
    def from_params(
            cls, vocab: Vocabulary,
            params: Params) -> 'BasicTextFieldEmbedder':  # type: ignore
        # pylint: disable=arguments-differ,bad-super-call

        # The original `from_params` for this class was designed in a way that didn't agree
        # with the constructor. The constructor wants a 'token_embedders' parameter that is a
        # `Dict[str, TokenEmbedder]`, but the original `from_params` implementation expected those
        # key-value pairs to be top-level in the params object.
        #
        # This breaks our 'configuration wizard' and configuration checks. Hence, going forward,
        # the params need a 'token_embedders' key so that they line up with what the constructor wants.
        # For now, the old behavior is still supported, but produces a DeprecationWarning.

        embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
        if embedder_to_indexer_map is not None:
            embedder_to_indexer_map = embedder_to_indexer_map.as_dict(
                quiet=True)
        allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False)
        use_fp16 = params.pop_bool("use_fp16", False)
        token_embedder_params = params.pop('token_embedders', None)

        if token_embedder_params is not None:
            # New way: explicitly specified, so use it.
            token_embedders = {
                name: TokenEmbedder.from_params(subparams, vocab=vocab)
                for name, subparams in token_embedder_params.items()
            }

        else:
            # Warn that the original behavior is deprecated
            warnings.warn(
                DeprecationWarning(
                    "the token embedders for BasicTextFieldEmbedder should now "
                    "be specified as a dict under the 'token_embedders' key, "
                    "not as top-level key-value pairs"))

            token_embedders = {}
            keys = list(params.keys())
            for key in keys:
                embedder_params = params.pop(key)
                token_embedders[key] = TokenEmbedder.from_params(
                    vocab=vocab, params=embedder_params)

        params.assert_empty(cls.__name__)
        return cls(use_fp16, token_embedders, embedder_to_indexer_map,
                   allow_unmatched_keys)
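The long comment in this example is about configuration layout rather than modeling: the constructor expects a `token_embedders` dict, while legacy configs placed each embedder at the top level of the params. A small sketch of the two layouts as `Params` objects (the `embedding` entry is only an illustrative choice):

from allennlp.common import Params

# New layout: embedders nested under "token_embedders", matching the constructor.
new_style = Params({
    "token_embedders": {
        "tokens": {"type": "embedding", "embedding_dim": 100},
    },
})

# Deprecated layout: the same embedder as a top-level key; still parsed,
# but it triggers the DeprecationWarning raised in the `else` branch above.
old_style = Params({
    "tokens": {"type": "embedding", "embedding_dim": 100},
})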
Example #3
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
     token_embedders = {}
     keys = list(params.keys())
     for key in keys:
         embedder_params = params.pop(key)
         token_embedders[key] = TokenEmbedder.from_params(vocab, embedder_params)
     params.assert_empty(cls.__name__)
     return cls(token_embedders)
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':  # type: ignore
        # pylint: disable=arguments-differ,bad-super-call

        # The original `from_params` for this class was designed in a way that didn't agree
        # with the constructor. The constructor wants a 'token_embedders' parameter that is a
        # `Dict[str, TokenEmbedder]`, but the original `from_params` implementation expected those
        # key-value pairs to be top-level in the params object.
        #
        # This breaks our 'configuration wizard' and configuration checks. Hence, going forward,
        # the params need a 'token_embedders' key so that they line up with what the constructor wants.
        # For now, the old behavior is still supported, but produces a DeprecationWarning.

        embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
        if embedder_to_indexer_map is not None:
            embedder_to_indexer_map = embedder_to_indexer_map.as_dict(quiet=True)
        allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False)

        token_embedder_params = params.pop('token_embedders', None)

        if token_embedder_params is not None:
            # New way: explicitly specified, so use it.
            token_embedders = {
                    name: TokenEmbedder.from_params(subparams, vocab=vocab)
                    for name, subparams in token_embedder_params.items()
            }

        else:
            # Warn that the original behavior is deprecated
            warnings.warn(DeprecationWarning("the token embedders for BasicTextFieldEmbedder should now "
                                             "be specified as a dict under the 'token_embedders' key, "
                                             "not as top-level key-value pairs"))

            token_embedders = {}
            keys = list(params.keys())
            for key in keys:
                embedder_params = params.pop(key)
                token_embedders[key] = TokenEmbedder.from_params(vocab=vocab, params=embedder_params)

        params.assert_empty(cls.__name__)
        return cls(token_embedders, embedder_to_indexer_map, allow_unmatched_keys)
Example #6
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
        embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
        if embedder_to_indexer_map is not None:
            embedder_to_indexer_map = embedder_to_indexer_map.as_dict(quiet=True)

        token_embedders = {}
        keys = list(params.keys())
        for key in keys:
            embedder_params = params.pop(key)
            token_embedders[key] = TokenEmbedder.from_params(vocab, embedder_params)

        params.assert_empty(cls.__name__)
        return cls(token_embedders, embedder_to_indexer_map)
Example #7
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'OptionalTextFieldEmbedder':
     token_embedders = {}
     keys = list(params.keys())
     for key in keys:
         embedder_params = params.pop(key)
         if ("embedding_dim" in embedder_params and \
             embedder_params.get("embedding_dim") == 0):
             # skip 0-dimensional embedders
             continue
         token_embedders[key] = TokenEmbedder.from_params(
             vocab, embedder_params)
     params.assert_empty(cls.__name__)
     return cls(token_embedders)
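This variant differs from the plain `BasicTextFieldEmbedder.from_params` only in the early `continue`: entries whose `embedding_dim` is 0 are dropped instead of being turned into a `TokenEmbedder`. A sketch of the kind of input it handles (keys are placeholders):

from allennlp.common import Params

params = Params({
    "tokens": {"type": "embedding", "embedding_dim": 100},  # kept
    "pos_tags": {"type": "embedding", "embedding_dim": 0},  # skipped by the `continue`
})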
Example #8
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':  # type: ignore
        # pylint: disable=arguments-differ
        embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None)
        if embedder_to_indexer_map is not None:
            embedder_to_indexer_map = embedder_to_indexer_map.as_dict(quiet=True)
        allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False)

        token_embedders = {}
        keys = list(params.keys())
        for key in keys:
            embedder_params = params.pop(key)
            token_embedders[key] = TokenEmbedder.from_params(vocab=vocab, params=embedder_params)
        params.assert_empty(cls.__name__)
        return cls(token_embedders, embedder_to_indexer_map, allow_unmatched_keys)
Example #9
 def test_registry_has_builtin_token_embedders(self):
     assert TokenEmbedder.by_name("embedding").__name__ == 'Embedding'
     assert TokenEmbedder.by_name("character_encoding").__name__ == 'TokenCharactersEncoder'
Example #10
    # Custom vocab_to_cache logic requires a from_params implementation.
    @classmethod
    def from_params(cls, vocab, params):  # type: ignore
        # pylint: disable=arguments-differ
        params.add_file_to_archive(u'options_file')
        params.add_file_to_archive(u'weight_file')
        options_file = params.pop(u'options_file')
        weight_file = params.pop(u'weight_file')
        requires_grad = params.pop(u'requires_grad', False)
        do_layer_norm = params.pop_bool(u'do_layer_norm', False)
        dropout = params.pop_float(u"dropout", 0.5)
        namespace_to_cache = params.pop(u"namespace_to_cache", None)
        if namespace_to_cache is not None:
            vocab_to_cache = list(
                vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
        else:
            vocab_to_cache = None
        projection_dim = params.pop_int(u"projection_dim", None)
        params.assert_empty(cls.__name__)
        return cls(options_file=options_file,
                   weight_file=weight_file,
                   do_layer_norm=do_layer_norm,
                   dropout=dropout,
                   requires_grad=requires_grad,
                   projection_dim=projection_dim,
                   vocab_to_cache=vocab_to_cache)


ElmoTokenEmbedder = TokenEmbedder.register(u"elmo_token_embedder")(
    ElmoTokenEmbedder)
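The last two lines register the class by calling `TokenEmbedder.register(...)` directly, which is the form a futurized (Python 2 compatible) build of AllenNLP emits; in ordinary code the same registration is written as a decorator. A sketch with a hypothetical embedder (the name `pass_through_example` is made up):

import torch
from allennlp.modules.token_embedders import TokenEmbedder


@TokenEmbedder.register("pass_through_example")  # hypothetical registration name
class PassThroughExample(TokenEmbedder):
    """Returns its input unchanged; exists only to show the decorator form."""

    def __init__(self, dim: int) -> None:
        super().__init__()
        self._dim = dim

    def get_output_dim(self) -> int:
        return self._dim

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return tokens


# `by_name` recovers the registered class, as the registry tests above assert.
assert TokenEmbedder.by_name("pass_through_example") is PassThroughExample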
Example #11
 def test_registry_has_builtin_token_embedders(self):
     assert TokenEmbedder.by_name("embedding").__name__ == "from_vocab_or_file"
     assert TokenEmbedder.by_name("character_encoding").__name__ == "TokenCharactersEncoder"
Example #12
 def test_registry_has_builtin_token_embedders(self):
     assert TokenEmbedder.by_name("embedding").__name__ == 'Embedding'
     assert TokenEmbedder.by_name("character_encoding").__name__ == 'TokenCharactersEncoder'
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        gradient_checkpointing: Optional[bool] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        transformer_kwargs: Optional[Dict[str, Any]] = None,
        masked_language_modeling: bool = True,
        load_directory: Optional[str] = None
    ) -> None:
        TokenEmbedder.__init__(self)  # Call the base class constructor
        tokenizer = PretrainedTransformerTokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs)
        self.masked_language_modeling = masked_language_modeling

        if self.masked_language_modeling:
            self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
            # We only need access to the HF tokenizer if we are masked language modeling
            self.tokenizer = tokenizer.tokenizer
            # The only differences when masked language modeling are:
            # 1) `output_hidden_states` must be True to get access to token embeddings.
            # 2) We need to use `AutoModelForMaskedLM` to get the correct model
            # self.transformer_model = RobertaForAugment.from_pretrained()
            self.transformer_model = AutoModelForMaskedLM.from_pretrained(
                model_name, config=self.config, **(transformer_kwargs or {})
            )

            if load_directory is not None:
                print("Loading Model from:", load_directory)
                state = torch.load(load_directory)
                model_dict = self.transformer_model.state_dict()
                # ckpt__dict = state['state_dict']
                state = {k: v for k, v in state.items() if k in model_dict}
                model_dict.update(state) 
                self.transformer_model.load_state_dict(model_dict, strict=False)
                print("Loading Model from:", load_directory, "...Finished.")
        # Everything after the if statement (including the else) is copied directly from:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
        else:
            from allennlp.common import cached_transformers

            self.transformer_model = cached_transformers.get(
                model_name, True, override_weights_file, override_weights_strip_prefix
            )
            self.config = self.transformer_model.config

        if gradient_checkpointing is not None:
            self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model, sub_module)

        # print("max_length", max_length)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        self.encoder = BertEncoder(self.config)
        self.layer = torch.nn.ModuleList([BertLayer(self.config)
                                    for _ in range(self.config.num_hidden_layers)])
        self.embeddings = BertEmbeddings(self.config)
        self.output_hidden_states = self.config.output_hidden_states

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False

        # The dropout lines below come from ``TokenCharactersEncoder.__init__`` (whose
        # other methods follow); there, ``dropout`` is a constructor argument.
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x

    def get_output_dim(self):
        return self._encoder._module.get_output_dim()  # pylint: disable=protected-access

    def forward(self, token_characters):  # pylint: disable=arguments-differ
        mask = (token_characters != 0).long()
        return self._dropout(
            self._encoder(self._embedding(token_characters), mask))

    # The setdefault requires a custom from_params
    @classmethod
    def from_params(cls, vocab, params):  # type: ignore
        # pylint: disable=arguments-differ
        embedding_params = params.pop(u"embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_characters" by default.
        embedding_params.setdefault(u"vocab_namespace", u"token_characters")
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params = params.pop(u"encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop_float(u"dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)


TokenCharactersEncoder = TokenEmbedder.register(u"character_encoding")(
    TokenCharactersEncoder)
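The `load_directory` branch in the `__init__` above performs a partial checkpoint load: it keeps only the checkpoint keys that exist in the current model and then loads non-strictly. The same logic as a standalone helper (the function name and path are placeholders):

import torch


def load_partial_state(model: torch.nn.Module, checkpoint_path: str) -> None:
    """Load whatever overlaps between a checkpoint and `model`, ignoring the rest."""
    state = torch.load(checkpoint_path, map_location="cpu")
    model_dict = model.state_dict()
    # Keep only the keys the current model actually has.
    filtered = {k: v for k, v in state.items() if k in model_dict}
    model_dict.update(filtered)
    # strict=False tolerates keys that are missing from the checkpoint.
    model.load_state_dict(model_dict, strict=False)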
Example #15
        else:
            weight = None

        return cls(num_embeddings=num_embeddings,
                   embedding_dim=embedding_dim,
                   projection_dim=projection_dim,
                   weight=weight,
                   padding_index=padding_index,
                   trainable=trainable,
                   max_norm=max_norm,
                   norm_type=norm_type,
                   scale_grad_by_freq=scale_grad_by_freq,
                   sparse=sparse)


Embedding = TokenEmbedder.register(u"embedding")(Embedding)


def _read_pretrained_embeddings_file(file_uri,
                                     embedding_dim,
                                     vocab,
                                     namespace=u"tokens"):
    u"""
    Returns an embedding matrix for the given vocabulary using the pretrained embeddings
    contained in the given file. Embeddings for tokens not found in the pretrained embedding file
    are randomly initialized using a normal distribution with mean and standard deviation equal to
    those of the pretrained embeddings.

    We support two file formats:

        * text format - utf-8 encoded text file with space separated fields: [word] [dim 1] [dim 2] ...
        * hdf5 format - hdf5 file containing an embedding matrix as a ``torch.Tensor``.
    """


        # (From ``OpenaiTransformerEmbedder.forward``; the page jumps into the middle of
        # that method here.)
        positional_encodings = get_range_vector(
            num_timesteps, device=get_device_of(inputs)) + vocab_size

        # Combine the inputs with positional encodings
        batch_tensor = torch.stack(
            [
                inputs,  # (batch_size, num_timesteps)
                positional_encodings.expand(batch_size, num_timesteps)
            ],
            dim=-1)

        byte_pairs_mask = inputs != 0

        # Embeddings is num_output_layers x (batch_size, num_timesteps, embedding_dim)
        layer_activations = self._transformer(batch_tensor)

        # Output of scalar_mix is (batch_size, num_timesteps, embedding_dim)
        mix = self._scalar_mix(layer_activations, byte_pairs_mask)

        # These embeddings are one per byte-pair, but we want one per original _word_.
        # So we choose the embedding corresponding to the last byte pair for each word,
        # which is captured by the ``offsets`` input.
        range_vector = get_range_vector(batch_size,
                                        device=get_device_of(mix)).unsqueeze(1)
        last_byte_pair_embeddings = mix[range_vector, offsets]

        return last_byte_pair_embeddings


OpenaiTransformerEmbedder = TokenEmbedder.register(
    u"openai_transformer_embedder")(OpenaiTransformerEmbedder)