def test_mask(self):
    # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
    for model in ["bert-base-uncased", "roberta-base"]:
        allennlp_tokenizer = PretrainedTransformerTokenizer(model)
        indexer = PretrainedTransformerIndexer(model_name=model)
        string_no_specials = "AllenNLP is great"
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        expected_masks = [1] * len(indexed["token_ids"])
        assert indexed["mask"] == expected_masks

        max_length = 10
        padding_lengths = {key: max_length for key in indexed.keys()}
        padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
        padding_length = max_length - len(indexed["mask"])
        expected_masks = expected_masks + ([0] * padding_length)
        assert len(padded_tokens["mask"]) == max_length
        assert padded_tokens["mask"].tolist() == expected_masks

        assert len(padded_tokens["token_ids"]) == max_length
        padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id] * padding_length
        assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix
def test_mask(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_no_specials = "AllenNLP is great"
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    expected_masks = [1] * len(indexed["token_ids"])
    assert indexed["mask"] == expected_masks

    max_length = 10
    padding_lengths = {"token_ids": max_length, "mask": max_length}
    padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
    padding_length = max_length - len(indexed["mask"])
    expected_masks = expected_masks + ([0] * padding_length)
    assert len(padded_tokens["mask"]) == max_length
    assert padded_tokens["mask"].tolist() == expected_masks
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
    """
    Use this indexer when (for whatever reason) you are not using a corresponding
    `PretrainedTransformerTokenizer` on your input. We assume that you used a tokenizer that splits
    strings into words, while the transformer expects wordpieces as input. This indexer splits the
    words into wordpieces and flattens them out. You should use the corresponding
    `PretrainedTransformerMismatchedEmbedder` to embed these wordpieces and then pull out a single
    vector for each original word.

    Registered as a `TokenIndexer` with name "pretrained_transformer_mismatched".

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
    max_length : `int`, optional (default = `None`)
        If positive, split the document into segments of this many tokens (including special tokens)
        before feeding them into the embedder. The embedder embeds these segments independently and
        concatenates the results to get the original document representation. Should be set to the
        same value as the `max_length` option on the `PretrainedTransformerMismatchedEmbedder`.
    tokenizer_kwargs : `Dict[str, Any]`, optional (default = `None`)
        Dictionary with
        [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
        for `AutoTokenizer.from_pretrained`.
    """  # noqa: E501

    def __init__(
        self,
        model_name: str,
        namespace: str = "tags",
        max_length: int = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # The matched version vs. the mismatched one.
        self._matched_indexer = PretrainedTransformerIndexer(
            model_name,
            namespace=namespace,
            max_length=max_length,
            tokenizer_kwargs=tokenizer_kwargs,
            **kwargs,
        )
        self._allennlp_tokenizer = self._matched_indexer._allennlp_tokenizer
        self._tokenizer = self._matched_indexer._tokenizer
        self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
        self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self._matched_indexer.count_vocab_items(token, counter)

    @overrides
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
            [t.text for t in tokens]
        )

        # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
        # That results in the embedding for the token being all zeros.
        offsets = [x if x is not None else (-1, -1) for x in offsets]

        output: IndexedTokenList = {
            "token_ids": [t.text_id for t in wordpieces],
            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": [t.type_id for t in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output)

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output = self._matched_indexer.get_empty_token_list()
        output["offsets"] = []
        output["wordpiece_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = self._matched_indexer.as_padded_tensor_dict(tokens, padding_lengths)
        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(
                offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
            )
        )
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerMismatchedIndexer):
            for key in self.__dict__:
                if key == "_tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean. So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented
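# Illustrative usage sketch (not part of the original source). It assumes an
# AllenNLP 1.x environment in which the class above is importable; the model
# name "bert-base-uncased" and the sentence are placeholders.
def _mismatched_indexer_usage_sketch():
    tokens = [Token(t) for t in "AllenNLP is great".split()]  # word-level tokens
    indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-uncased")

    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(tokens, vocab)

    # "mask" and "offsets" are word-level (one entry per original token), while
    # "token_ids", "type_ids", and "wordpiece_mask" are wordpiece-level.
    assert len(indexed["mask"]) == len(tokens)
    assert len(indexed["offsets"]) == len(tokens)
    assert len(indexed["wordpiece_mask"]) == len(indexed["token_ids"])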
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
    """
    Use this indexer when (for whatever reason) you are not using a corresponding
    `PretrainedTransformerTokenizer` on your input. We assume that you used a tokenizer that splits
    strings into words, while the transformer expects wordpieces as input. This indexer splits the
    words into wordpieces and flattens them out. You should use the corresponding
    `PretrainedTransformerMismatchedEmbedder` to embed these wordpieces and then pull out a single
    vector for each original word.

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
    """

    def __init__(self, model_name: str, namespace: str = "tags", **kwargs) -> None:
        super().__init__(**kwargs)
        # The matched version vs. the mismatched one.
        self._matched_indexer = PretrainedTransformerIndexer(model_name, namespace, **kwargs)
        # add_special_tokens=False since we don't want wordpieces to be surrounded by special tokens
        self._allennlp_tokenizer = PretrainedTransformerTokenizer(
            model_name, add_special_tokens=False
        )
        self._tokenizer = self._allennlp_tokenizer.tokenizer
        (
            self._num_added_start_tokens,
            self._num_added_end_tokens,
        ) = self._determine_num_special_tokens_added()

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self._matched_indexer.count_vocab_items(token, counter)

    @overrides
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        orig_token_mask = [1] * len(tokens)
        tokens, offsets = self._intra_word_tokenize(tokens)
        # {"token_ids": ..., "mask": ...}
        output = self._matched_indexer.tokens_to_indices(tokens, vocabulary)
        # Insert type ids for the special tokens.
        output["type_ids"] = self._tokenizer.create_token_type_ids_from_sequences(
            output["token_ids"]
        )
        # Insert the special tokens themselves.
        output["token_ids"] = self._tokenizer.build_inputs_with_special_tokens(output["token_ids"])
        output["mask"] = orig_token_mask
        output["offsets"] = [
            (start + self._num_added_start_tokens, end + self._num_added_start_tokens)
            for start, end in offsets
        ]
        output["wordpiece_mask"] = [1] * len(output["token_ids"])
        return output

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output = self._matched_indexer.get_empty_token_list()
        output["offsets"] = []
        output["wordpiece_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = self._matched_indexer.as_padded_tensor_dict(tokens, padding_lengths)
        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(
                offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
            )
        )
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerMismatchedIndexer):
            for key in self.__dict__:
                if key == "_tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean. So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented

    def _intra_word_tokenize(
        self, tokens: List[Token]
    ) -> Tuple[List[Token], List[Tuple[int, int]]]:
        """
        Tokenizes each word into wordpieces separately. Also calculates offsets such that
        wordpieces[offsets[i][0]:offsets[i][1] + 1] corresponds to the original i-th token.

        Does not insert special tokens.
        """
        wordpieces: List[Token] = []
        offsets = []
        cumulative = 0
        for token in tokens:
            subword_wordpieces = self._allennlp_tokenizer.tokenize(token.text)
            wordpieces.extend(subword_wordpieces)

            start_offset = cumulative
            cumulative += len(subword_wordpieces)
            end_offset = cumulative - 1  # inclusive
            offsets.append((start_offset, end_offset))

        return wordpieces, offsets

    def _determine_num_special_tokens_added(self) -> Tuple[int, int]:
        """
        Determines the number of tokens `self._tokenizer` adds to a sequence (currently doesn't
        consider sequence pairs) at the start and end.

        # Returns

        The number of tokens (`int`) that are inserted at the start and end of a sequence.
        """
        # Uses a slightly higher index to avoid tokenizer doing special things to lower-indexed
        # tokens which might be special.
        dummy = [1000]
        inserted = self._tokenizer.build_inputs_with_special_tokens(dummy)

        num_start = num_end = 0
        seen_dummy = False
        for idx in inserted:
            if idx == dummy[0]:
                if seen_dummy:  # seeing it twice
                    raise ValueError("Cannot auto-determine the number of special tokens added.")
                seen_dummy = True
                continue

            if not seen_dummy:
                num_start += 1
            else:
                num_end += 1

        assert num_start + num_end == self._tokenizer.num_added_tokens()
        return num_start, num_end
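# Worked example (illustrative, not from the original source) of the offset
# arithmetic in `_intra_word_tokenize` above, using a made-up wordpiece split;
# a real BERT tokenizer may segment these words differently.
def _offset_arithmetic_sketch():
    word_to_wordpieces = {
        "AllenNLP": ["allen", "##nl", "##p"],
        "is": ["is"],
        "great": ["great"],
    }

    wordpieces, offsets, cumulative = [], [], 0
    for word in ["AllenNLP", "is", "great"]:
        pieces = word_to_wordpieces[word]
        wordpieces.extend(pieces)
        # offsets[i] is the inclusive [start, end] wordpiece span of the i-th word
        offsets.append((cumulative, cumulative + len(pieces) - 1))
        cumulative += len(pieces)

    assert wordpieces == ["allen", "##nl", "##p", "is", "great"]
    assert offsets == [(0, 2), (3, 3), (4, 4)]
    # wordpieces[start:end + 1] recovers the pieces of the original word:
    assert wordpieces[offsets[0][0]:offsets[0][1] + 1] == ["allen", "##nl", "##p"]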
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
    """
    Use this indexer when (for whatever reason) you are not using a corresponding
    `PretrainedTransformerTokenizer` on your input. We assume that you used a tokenizer that splits
    strings into words, while the transformer expects wordpieces as input. This indexer splits the
    words into wordpieces and flattens them out. You should use the corresponding
    `PretrainedTransformerMismatchedEmbedder` to embed these wordpieces and then pull out a single
    vector for each original word.

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
    max_length : `int`, optional (default = `None`)
        If positive, split the document into segments of this many tokens (including special tokens)
        before feeding them into the embedder. The embedder embeds these segments independently and
        concatenates the results to get the original document representation. Should be set to the
        same value as the `max_length` option on the `PretrainedTransformerMismatchedEmbedder`.
    """

    def __init__(
        self, model_name: str, namespace: str = "tags", max_length: int = None, **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # The matched version vs. the mismatched one.
        self._matched_indexer = PretrainedTransformerIndexer(
            model_name, namespace, max_length, **kwargs
        )
        self._tokenizer = self._matched_indexer._tokenizer
        self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
        self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self._matched_indexer.count_vocab_items(token, counter)

    @overrides
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        indices, offsets = self._intra_word_tokenize(tokens)
        # `create_token_type_ids_from_sequences()` inserts special tokens itself, so we strip
        # the ones we already added before calling it.
        type_ids = self._tokenizer.create_token_type_ids_from_sequences(
            indices[self._num_added_start_tokens : -self._num_added_end_tokens]
        )

        output: IndexedTokenList = {
            "token_ids": indices,
            "mask": [1] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": type_ids,
            "offsets": offsets,
            "wordpiece_mask": [1] * len(indices),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output)

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output = self._matched_indexer.get_empty_token_list()
        output["offsets"] = []
        output["wordpiece_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = self._matched_indexer.as_padded_tensor_dict(tokens, padding_lengths)
        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(
                offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
            )
        )
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerMismatchedIndexer):
            for key in self.__dict__:
                if key == "_tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean. So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented

    def _intra_word_tokenize(
        self, tokens: List[Token]
    ) -> Tuple[List[int], List[Tuple[int, int]]]:
        """
        Tokenizes each word into wordpieces separately and returns the wordpiece IDs.
        Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1]
        corresponds to the original i-th token.

        This function inserts special tokens.
        """
        wordpieces: List[int] = []
        offsets = []
        cumulative = self._num_added_start_tokens
        for token in tokens:
            subword_wordpieces = self._tokenizer.encode(token.text, add_special_tokens=False)
            wordpieces.extend(subword_wordpieces)

            start_offset = cumulative
            cumulative += len(subword_wordpieces)
            end_offset = cumulative - 1  # inclusive
            offsets.append((start_offset, end_offset))

        wordpieces = self._tokenizer.build_inputs_with_special_tokens(wordpieces)
        assert cumulative + self._num_added_end_tokens == len(wordpieces)

        return wordpieces, offsets
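# Worked example (illustrative, not from the original source) of why this
# version's offsets start at `_num_added_start_tokens`: the special tokens are
# inserted into the wordpiece sequence itself. Assumes a BERT-style tokenizer
# adding one [CLS] at the start and one [SEP] at the end; all IDs are made up.
def _shifted_offset_sketch():
    word_to_ids = {"AllenNLP": [2116, 3363, 2361], "is": [2003], "great": [2307]}
    num_added_start_tokens, num_added_end_tokens = 1, 1  # [CLS] ... [SEP]
    cls_id, sep_id = 101, 102

    wordpieces, offsets, cumulative = [], [], num_added_start_tokens
    for word in ["AllenNLP", "is", "great"]:
        ids = word_to_ids[word]
        wordpieces.extend(ids)
        offsets.append((cumulative, cumulative + len(ids) - 1))  # inclusive span
        cumulative += len(ids)

    # Stand-in for `build_inputs_with_special_tokens` on a BERT-style tokenizer.
    wordpieces = [cls_id] + wordpieces + [sep_id]

    assert cumulative + num_added_end_tokens == len(wordpieces)
    # Offsets already account for the prepended [CLS], so they index into `wordpieces`:
    assert offsets == [(1, 3), (4, 4), (5, 5)]
    assert wordpieces[offsets[0][0]:offsets[0][1] + 1] == [2116, 3363, 2361]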