Example #1
    def test_mask(self):
        # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
        for model in ["bert-base-uncased", "roberta-base"]:
            allennlp_tokenizer = PretrainedTransformerTokenizer(model)
            indexer = PretrainedTransformerIndexer(model_name=model)
            string_no_specials = "AllenNLP is great"
            allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
            vocab = Vocabulary()
            indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
            expected_masks = [1] * len(indexed["token_ids"])
            assert indexed["mask"] == expected_masks
            max_length = 10
            padding_lengths = {key: max_length for key in indexed.keys()}
            padded_tokens = indexer.as_padded_tensor_dict(
                indexed, padding_lengths)
            padding_length = max_length - len(indexed["mask"])
            expected_masks = expected_masks + ([0] * padding_length)
            assert len(padded_tokens["mask"]) == max_length
            assert padded_tokens["mask"].tolist() == expected_masks

            assert len(padded_tokens["token_ids"]) == max_length
            padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id
                              ] * padding_length
            assert padded_tokens["token_ids"][-padding_length:].tolist(
            ) == padding_suffix
Example #2
 def test_as_array_produces_token_sequence(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",
                                               do_lowercase=True)
     indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                            do_lowercase=True)
     tokens = tokenizer.tokenize("AllenNLP is great")
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = [Token(token) for token in tokens]
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #3
 def __init__(self,
              model_name: str,
              namespace: str = "tags",
              max_length: int = None,
              **kwargs) -> None:
     super().__init__(**kwargs)
      # The matched indexer, as opposed to the mismatched one.
     self._matched_indexer = PretrainedTransformerIndexer(
         model_name, namespace, max_length, **kwargs)
     self._tokenizer = self._matched_indexer._tokenizer
     self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
     self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
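For reference, a minimal usage sketch of the mismatched indexer built around the wrapped matched indexer above (an illustration based on the current AllenNLP API, not taken from the original source): it accepts word-level tokens, handles wordpiece segmentation internally, and returns an "offsets" entry with one span per input word.

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

# Word-level tokens go in; the indexer produces wordpiece ids plus offsets
# that map each original word back to its wordpiece span.
indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-uncased")
tokens = [Token("AllenNLP"), Token("is"), Token("great")]
indexed = indexer.tokens_to_indices(tokens, Vocabulary())
assert len(indexed["offsets"]) == len(tokens)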
Example #4
 def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
         "AllenNLP is great!", "Really it is!")
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #5
 def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
         "AllenNLP is great!", "Really it is!")
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #6
 def check_vocab_size(model_name: str):
     namespace = "tags"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
      # Here we copy the entire transformers vocab.
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
Example #7
 def test_transformers_vocab_sizes(self, model_name):
     namespace = "tags"
     tokenizer = cached_transformers.get_tokenizer(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
      # Here we copy the entire transformers vocab.
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
Example #8
 def test_transformers_vocabs_added_correctly(self):
     namespace, model_name = "tags", "roberta-base"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
      # Here we copy the entire transformers vocab.
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_token_to_index_vocabulary(
         namespace=namespace) == tokenizer.encoder
Example #9
 def test_as_array_produces_token_sequence_roberta(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     string_specials = "<s> AllenNLP is great </s>"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #10
 def test_as_array_produces_token_sequence_bert_cased(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     string_specials = "[CLS] AllenNLP is great [SEP]"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #11
    def test_indices_to_tokens(self):
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased")
        indexer_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased", max_length=4)
        indexer_no_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")
        string_no_specials = "AllenNLP is great"

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer_no_max_length.tokens_to_indices(
            allennlp_tokens, vocab)
        tokens_from_indices = indexer_no_max_length.indices_to_tokens(
            indexed, vocab)

        self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

        indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
        tokens_from_indices = indexer_max_length.indices_to_tokens(
            indexed, vocab)

        # For now we are not removing special tokens introduced from max_length
        sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
        expected = (allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] +
                    sep_cls + allennlp_tokens[5:])

        self._assert_tokens_equal(expected, tokens_from_indices)
Example #12
    def test_type_ids_when_folding(self):
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased", add_special_tokens=False)
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                               max_length=6)
        first_string = "How do trees get online?"
        second_string = "They log in!"

        tokens = allennlp_tokenizer.add_special_tokens(
            allennlp_tokenizer.tokenize(first_string),
            allennlp_tokenizer.tokenize(second_string))
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(tokens, vocab)
        assert min(indexed["type_ids"]) == 0
        assert max(indexed["type_ids"]) == 1
Example #13
 def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
     tokenizer = cached_transformers.get_tokenizer("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer(
         "roberta-base", add_special_tokens=False)
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.add_special_tokens(
         allennlp_tokenizer.tokenize("AllenNLP is great!"),
         allennlp_tokenizer.tokenize("Really it is!"),
     )
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #14
    def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, True, False, False],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
        assert bert_vectors.requires_grad == (train_parameters
                                              or not last_layer_only)
Example #15
    def __init__(
        self,
        pretrained_model_pth: str = None,
    ):
        """
        Two-stage data reader:
            1. Read s first, then o and p.
            2. Read o first, then s and p.
        :param token_indexers:
        :param pretrained_model_pth:
        :param lazy:
        """
        super().__init__(False)
        if pretrained_model_pth is None:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        else:
            # Note: the indexer has to be replaced here, because the vocabulary has changed.
            self._token_indexers = {
                "tokens": PretrainedTransformerIndexer(pretrained_model_pth)
            }

        if pretrained_model_pth is not None:
            self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_pth)
        else:
            self.pretrained_tokenizer = None
Example #16
    def __init__(self,
                 transformer_model_name: str = "bert-base-cased",
                 length_limit: int = 384,
                 question_length_limit: int = 64,
                 stride: int = 128,
                 raise_errors: bool = False,
                 tokenizer_kwargs: Dict[str, Any] = None,
                 one_instance_per_query: bool = False,
                 max_instances: int = None,
                 **kwargs) -> None:
        """
        Initialize the RecordTaskReader.
        """
        super(RecordTaskReader,
              self).__init__(manual_distributed_sharding=True, **kwargs)

        # Save the values passed to __init__ to protected attributes
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens":
            PretrainedTransformerIndexer(transformer_model_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        }
        self._length_limit = length_limit
        self._query_len_limit = question_length_limit
        self._stride = stride
        self._raise_errors = raise_errors
        self._cls_token = '@placeholder'
        self._max_instances = max_instances
        self._one_instance_per_query = one_instance_per_query
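The length_limit and stride arguments above imply the usual sliding-window treatment of passages longer than the transformer's window. A rough sketch of that arithmetic (window_starts is a hypothetical helper used only for illustration, not part of the reader):

def window_starts(num_wordpieces: int, length_limit: int = 384, stride: int = 128):
    """Start offsets of overlapping windows that cover a long passage."""
    starts = [0]
    while starts[-1] + length_limit < num_wordpieces:
        starts.append(starts[-1] + stride)
    return starts

# A 600-wordpiece passage with the defaults above yields windows starting at 0, 128 and 256.
assert window_starts(600) == [0, 128, 256]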
Example #17
 def __init__(
     self,
     lazy: bool = False,
     cache_directory: Optional[str] = None,
     max_instances: Optional[int] = None,
     min_num_candidate: int = 3,
     max_num_candidate: int = 5,
     transformer_model_name_or_archive_path: str = "bert-base-uncased",
 ) -> None:
     super().__init__(lazy=lazy,
                      cache_directory=cache_directory,
                      max_instances=max_instances)
     if "tar.gz" in transformer_model_name_or_archive_path:
         config = extract_config_from_archive(
             transformer_model_name_or_archive_path)
         model_name = config.as_dict(
         )["dataset_reader"]["tokenizer"]["model_name"]
     else:
         model_name = transformer_model_name_or_archive_path
     self._tokenizer = PretrainedTransformerTokenizer(
         model_name=model_name, add_special_tokens=False)
     self._tokenindexer = PretrainedTransformerIndexer(
         model_name=model_name)
     self._min_num_candidate = min_num_candidate
     self._max_num_candidate = max_num_candidate
Example #18
    def test_text_to_instance_with_bert_tokenizer_and_indexer(self):
        tokenizer = PretrainedTransformerTokenizer('bert-base-cased',
                                                   do_lowercase=False)
        token_indexer = PretrainedTransformerIndexer('bert-base-cased',
                                                     do_lowercase=False)
        reader = MaskedLanguageModelingReader(tokenizer,
                                              {'bert': token_indexer})
        instance = reader.text_to_instance(
            sentence='This is AllenNLP [MASK] token .', targets=['This'])
        assert [t.text for t in instance['tokens']] == [
            '[CLS]', 'This', 'is', 'Allen', '##NL', '##P', '[MASK]', 'token',
            '.', '[SEP]'
        ]
        assert [i.sequence_index for i in instance['mask_positions']] == [6]
        assert [t.text for t in instance['target_ids']] == ['This']

        vocab = Vocabulary()
        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {'tokens', 'mask_positions', 'target_ids'}
        bert_token_ids = tensor_dict['tokens']['bert'].numpy().tolist()
        target_ids = tensor_dict['target_ids']['bert'].numpy().tolist()
        # I don't know what wordpiece id BERT is going to assign to 'This', but it at least should
        # be the same between the input and the target.
        assert target_ids[0] == bert_token_ids[1]
Example #19
    def __init__(self,
                 pretrained_model: str = None,
                 tokenizer: Optional[Tokenizer] = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 add_prefix: bool = False,
                 combine_input_fields: bool = True,
                 sample: int = -1) -> None:
        super().__init__()

        if pretrained_model is not None:
            self._tokenizer = PretrainedTransformerTokenizer(
                pretrained_model, max_length=max_pieces)
            token_indexer = PretrainedTransformerIndexer(pretrained_model)
            self._token_indexers = {'tokens': token_indexer}
        else:
            self._tokenizer = tokenizer or SpacyTokenizer()
            self._token_indexers = token_indexers or {
                "tokens": SingleIdTokenIndexer()
            }

        self._sample = sample
        self._add_prefix = add_prefix
        self._combine_input_fields = combine_input_fields
        self._debug_prints = -1
Example #20
    def test_text_to_instance_with_bert_tokenizer_and_indexer(self):
        tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
        token_indexer = PretrainedTransformerIndexer("bert-base-cased")
        reader = NextTokenLmReader(tokenizer, {"bert": token_indexer})
        instance = reader.text_to_instance(sentence="AllenNLP is very", target="very")
        assert [t.text for t in instance["tokens"]] == [
            "[CLS]",
            "Allen",
            "##NL",
            "##P",
            "is",
            "very",
            "[SEP]",
        ]
        assert [t.text for t in instance["target_ids"]] == ["very"]

        vocab = Vocabulary()
        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {"tokens", "target_ids"}
        bert_token_ids = tensor_dict["tokens"]["bert"].numpy().tolist()
        target_ids = tensor_dict["target_ids"]["bert"].numpy().tolist()
        # I don't know what wordpiece id BERT is going to assign to 'very', but it at least should
        # be the same between the input and the target.
        assert target_ids[0] == bert_token_ids[5]
Example #21
    def __init__(self,
                 pretrained_model: str,
                 max_pieces: int = 512,
                 num_choices: int = 4,
                 add_prefix: Dict[str, str] = None,
                 sample: int = -1) -> None:
        super().__init__()

        self._tokenizer = PretrainedTransformerTokenizer(pretrained_model)
        self._tokenizer_no_special_tokens = PretrainedTransformerTokenizer(
            pretrained_model, add_special_tokens=False)
        # self._tokenizer_internal = self._tokenizer._tokenizer
        self._tokenizer_internal = self._tokenizer.tokenizer
        token_indexer = PretrainedTransformerIndexer(pretrained_model)
        self._token_indexers = {'tokens': token_indexer}

        self._max_pieces = max_pieces
        self._sample = sample
        self._num_choices = num_choices
        self._add_prefix = add_prefix or {}

        for model in [
                "roberta", "bert", "openai-gpt", "gpt2", "transfo-xl", "xlnet",
                "xlm"
        ]:
            if model in pretrained_model:
                self._model_type = model
                break
Example #22
 def __init__(self, hparams, em_dict, rm_dict, entity_mentions, tokensD,
              all_known_e2, all_known_e1, scoresD_tail, scoresD_head, mode,
              max_instances, world_size) -> None:
     super().__init__(manual_distributed_sharding=True,
                      manual_multi_process_sharding=True,
                      max_instances=max_instances,
                      world_size=world_size)
     self.hparams = hparams
     self.em_dict = em_dict
     self.rm_dict = rm_dict
     self.entity_mentions = entity_mentions
     self._tokenizer = PretrainedTransformerTokenizer('bert-base-cased')
     self._token_indexer = {
         "tokens": PretrainedTransformerIndexer('bert-base-cased')
     }
     self._tokensD = tokensD
     self.random_indexes = list(range(len(em_dict)))
     random.shuffle(self.random_indexes)
     self.key_index = 0
     self.all_known_e1 = all_known_e1
     self.all_known_e2 = all_known_e2
     self.scoresD_tail = scoresD_tail
     self.scoresD_head = scoresD_head
     self.mode = mode
     self.factsD = None
     if hparams.retrieve_facts:
         self.factsD = pickle.load(open(hparams.retrieve_facts, 'rb'))
Example #23
    def __init__(self,
                 transformer_model_name: str = "bert-base-cased",
                 length_limit: int = 384,
                 stride: int = 128,
                 skip_invalid_examples: bool = False,
                 max_query_length: int = 64,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            calculate_character_offsets=True)
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit
        self.stride = stride
        self.skip_invalid_examples = skip_invalid_examples
        self.max_query_length = max_query_length
        self.non_content_type_id = max(
            self._tokenizer.tokenizer.encode_plus(
                "left", "right", return_token_type_ids=True)["token_type_ids"])

        # workaround for a bug in the transformers library
        if "distilbert" in transformer_model_name:
            self.non_content_type_id = 0
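The encode_plus probe above is a compact way to discover which token_type_id a tokenizer assigns to the second ("non-content") segment of a pair. A minimal check of the idea, assuming the transformers library: BERT-style tokenizers give the second segment type id 1, while RoBERTa does not use segment ids at all.

from transformers import AutoTokenizer

for name, expected in [("bert-base-cased", 1), ("roberta-base", 0)]:
    tokenizer = AutoTokenizer.from_pretrained(name)
    type_ids = tokenizer.encode_plus(
        "left", "right", return_token_type_ids=True)["token_type_ids"]
    assert max(type_ids) == expected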
Example #24
    def __init__(self,
                 pretrained_model_pth: str = None,
                 lazy: bool = False,
                 mode: str = 'sop'):
        """
        Three-stage data reader:
            1. Read s first, then o, and finally pos.
            2. Read o first, then s, and finally p.
        :param pretrained_model_pth:
        :param lazy:
        :param mode: which reading order to use (i.e. which element is read first)
        """
        super().__init__(lazy)
        if pretrained_model_pth is None:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {
                "tokens": PretrainedTransformerIndexer(pretrained_model_pth)
            }

        self.pretrained_tokenizer = None
        if pretrained_model_pth is not None:
            self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_pth)
        assert mode in ['sop', 'osp']
        self.mode = mode
Example #25
    def test_text_to_instance_with_bert_tokenizer_and_indexer(self):
        tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
        token_indexer = PretrainedTransformerIndexer("bert-base-cased")
        reader = MaskedLanguageModelingReader(tokenizer,
                                              {"bert": token_indexer})
        instance = reader.text_to_instance(
            sentence="This is AllenNLP [MASK] token .", targets=["This"])
        assert [t.text for t in instance["tokens"]] == [
            "[CLS]",
            "This",
            "is",
            "Allen",
            "##NL",
            "##P",
            "[MASK]",
            "token",
            ".",
            "[SEP]",
        ]
        assert [i.sequence_index for i in instance["mask_positions"]] == [6]
        assert [t.text for t in instance["target_ids"]] == ["This"]

        vocab = Vocabulary()
        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {"tokens", "mask_positions", "target_ids"}
        bert_token_ids = tensor_dict["tokens"]["bert"]["token_ids"].numpy(
        ).tolist()
        target_ids = tensor_dict["target_ids"]["bert"]["token_ids"].numpy(
        ).tolist()
        # I don't know what wordpiece id BERT is going to assign to 'This', but it at least should
        # be the same between the input and the target.
        assert target_ids[0] == bert_token_ids[1]
Example #26
 def token_indexer_returner(self):
     huggingface_name, do_lower_case = self.huggingfacename_returner()
     return {
         'tokens':
         PretrainedTransformerIndexer(model_name=huggingface_name,
                                      do_lowercase=do_lower_case)
     }
Example #27
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              max_tokens: int = None,
              pseudo: bool = False if args.pseudo == False else True):
     super().__init__(lazy)
     # self.tokenizer = tokenizer or WhitespaceTokenizer()
     # self.token_indexers = token_indexers or {"tokens":SingleIdTokenIndexer()}
     # from collections import defaultdict
     # ags = defaultdict(list)
     ags = {
         "additional_special_tokens":
         ("[pseudo1]", "[pseudo2]", "[pseudo3]", "[pseudo4]", "[pseudo5]",
          "[pseudo6]", "[pseudo7]", "[pseudo8]", "[pseudo9]")
     }
     self.tokenizer = tokenizer or PretrainedTransformerTokenizer(
         "bert-large-uncased",
         tokenizer_kwargs=(ags if args.pseudo else {}))
     self.token_indexers = token_indexers or {
         "tokens": PretrainedTransformerIndexer("bert-large-uncased")
     }
     self.max_tokens = max_tokens
     self.pseudo = pseudo
     self.tags = tags
Example #28
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        domain_identifier: str = None,
        bert_model_name: str = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if token_indexers is not None:
            self._token_indexers = token_indexers
        elif bert_model_name is not None:
            from allennlp.data.token_indexers import PretrainedTransformerIndexer

            self._token_indexers = {
                "tokens": PretrainedTransformerIndexer(bert_model_name)
            }
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier

        if bert_model_name is not None:
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                bert_model_name)
            self.lowercase_input = "uncased" in bert_model_name
        else:
            self.bert_tokenizer = None
            self.lowercase_input = False
Example #29
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self.transformer_model = "bert-base-uncased"
      self.tokenizer = PretrainedTransformerTokenizer(
          model_name=self.transformer_model, add_special_tokens=False, max_length=512)
      self.token_indexer = PretrainedTransformerIndexer(
          model_name=self.transformer_model, max_length=512)
Example #30
    def test_end_to_end_t5(
        self,
        train_parameters: bool,
        last_layer_only: bool,
        gradient_checkpointing: bool,
    ):
        tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
        token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params(
            {
                "token_embedders": {
                    "bert": {
                        "type": "pretrained_transformer",
                        "model_name": "patrickvonplaten/t5-tiny-random",
                        "train_parameters": train_parameters,
                        "last_layer_only": last_layer_only,
                        "gradient_checkpointing": gradient_checkpointing,
                        "sub_module": "encoder",
                    }
                }
            }
        )
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, False, False],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 8, 64)
        assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)