def test_mask(self):
    # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
    for model in ["bert-base-uncased", "roberta-base"]:
        allennlp_tokenizer = PretrainedTransformerTokenizer(model)
        indexer = PretrainedTransformerIndexer(model_name=model)
        string_no_specials = "AllenNLP is great"
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        expected_masks = [1] * len(indexed["token_ids"])
        assert indexed["mask"] == expected_masks

        max_length = 10
        padding_lengths = {key: max_length for key in indexed.keys()}
        padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
        padding_length = max_length - len(indexed["mask"])
        expected_masks = expected_masks + ([0] * padding_length)
        assert len(padded_tokens["mask"]) == max_length
        assert padded_tokens["mask"].tolist() == expected_masks

        assert len(padded_tokens["token_ids"]) == max_length
        padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id] * padding_length
        assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix
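A minimal sketch (not part of the test suite above) of the core call pattern these tests exercise: tokenize, index against a fresh Vocabulary, then pad to a fixed length. It assumes allennlp is installed and the bert-base-uncased weights can be downloaded.

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

# Tokenize (special tokens are added by default) and index against an empty vocab;
# the transformer's own vocabulary is copied into the indexer's namespace on first use.
tokens = tokenizer.tokenize("AllenNLP is great")
indexed = indexer.tokens_to_indices(tokens, Vocabulary())
# indexed contains "token_ids", "mask", and "type_ids" lists of equal length.

# Pad everything to length 10; mask entries beyond the real tokens become 0/False,
# and token ids are padded with the tokenizer's pad_token_id.
padding_lengths = {key: 10 for key in indexed}
tensors = indexer.as_padded_tensor_dict(indexed, padding_lengths)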
def test_as_array_produces_token_sequence(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", do_lowercase=True)
    tokens = tokenizer.tokenize("AllenNLP is great")
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = [Token(token) for token in tokens]
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def __init__(self,
             model_name: str,
             namespace: str = "tags",
             max_length: int = None,
             **kwargs) -> None:
    super().__init__(**kwargs)
    # The matched indexer, as opposed to the mismatched one.
    self._matched_indexer = PretrainedTransformerIndexer(
        model_name, namespace, max_length, **kwargs)
    self._tokenizer = self._matched_indexer._tokenizer
    self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
    self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
        "AllenNLP is great!", "Really it is!")
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
        "AllenNLP is great!", "Really it is!")
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def check_vocab_size(model_name: str):
    namespace = "tags"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # Here we copy the entire transformers vocab.
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size
def test_transformers_vocab_sizes(self, model_name):
    namespace = "tags"
    tokenizer = cached_transformers.get_tokenizer(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # Here we copy the entire transformers vocab.
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size
def test_transformers_vocabs_added_correctly(self):
    namespace, model_name = "tags", "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # Here we copy the entire transformers vocab.
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_token_to_index_vocabulary(namespace=namespace) == tokenizer.encoder
def test_as_array_produces_token_sequence_roberta(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    string_specials = "<s> AllenNLP is great </s>"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Tokens tokenized with our pretrained tokenizer have indices in them.
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def test_as_array_produces_token_sequence_bert_cased(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Tokens tokenized with our pretrained tokenizer have indices in them.
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_indices_to_tokens(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer_max_length = PretrainedTransformerIndexer(
        model_name="bert-base-uncased", max_length=4)
    indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_no_specials = "AllenNLP is great"

    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer_no_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_no_max_length.indices_to_tokens(indexed, vocab)

    self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

    indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_max_length.indices_to_tokens(indexed, vocab)

    # For now we are not removing special tokens introduced from max_length.
    sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
    expected = (allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] + sep_cls +
                allennlp_tokens[5:])

    self._assert_tokens_equal(expected, tokens_from_indices)
def test_type_ids_when_folding(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-uncased", add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=6)
    first_string = "How do trees get online?"
    second_string = "They log in!"

    tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize(first_string),
        allennlp_tokenizer.tokenize(second_string))
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(tokens, vocab)
    assert min(indexed["type_ids"]) == 0
    assert max(indexed["type_ids"]) == 1
def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
    tokenizer = cached_transformers.get_tokenizer("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "roberta-base", add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize("AllenNLP is great!"),
        allennlp_tokenizer.tokenize("Really it is!"),
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = [
        "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
    ]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased",
                "train_parameters": train_parameters,
                "last_layer_only": last_layer_only,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, False, False],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def __init__(
    self,
    pretrained_model_pth: str = None,
):
    """
    Two-stage data reader:
    1. Read s first, then read o and p.
    2. Read o first, then read s and p.
    :param token_indexers:
    :param pretrained_model_pth:
    :param lazy:
    """
    super().__init__(False)
    if pretrained_model_pth is None:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    else:
        # Note: the indexer has to be swapped in here, because the vocabulary has changed.
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(pretrained_model_pth)
        }
    if pretrained_model_pth is not None:
        self.pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_pth)
    else:
        self.pretrained_tokenizer = None
def __init__(self,
             transformer_model_name: str = "bert-base-cased",
             length_limit: int = 384,
             question_length_limit: int = 64,
             stride: int = 128,
             raise_errors: bool = False,
             tokenizer_kwargs: Dict[str, Any] = None,
             one_instance_per_query: bool = False,
             max_instances: int = None,
             **kwargs) -> None:
    """
    Initialize the RecordTaskReader.
    """
    super(RecordTaskReader, self).__init__(manual_distributed_sharding=True, **kwargs)

    # Save the values passed to __init__ to protected attributes.
    self._tokenizer = PretrainedTransformerTokenizer(
        transformer_model_name,
        add_special_tokens=False,
        tokenizer_kwargs=tokenizer_kwargs,
    )
    self._token_indexers = {
        "tokens": PretrainedTransformerIndexer(
            transformer_model_name, tokenizer_kwargs=tokenizer_kwargs)
    }
    self._length_limit = length_limit
    self._query_len_limit = question_length_limit
    self._stride = stride
    self._raise_errors = raise_errors
    self._cls_token = '@placeholder'
    self._max_instances = max_instances
    self._one_instance_per_query = one_instance_per_query
def __init__(
    self,
    lazy: bool = False,
    cache_directory: Optional[str] = None,
    max_instances: Optional[int] = None,
    min_num_candidate: int = 3,
    max_num_candidate: int = 5,
    transformer_model_name_or_archive_path: str = "bert-base-uncased",
) -> None:
    super().__init__(lazy=lazy,
                     cache_directory=cache_directory,
                     max_instances=max_instances)
    if "tar.gz" in transformer_model_name_or_archive_path:
        config = extract_config_from_archive(transformer_model_name_or_archive_path)
        model_name = config.as_dict()["dataset_reader"]["tokenizer"]["model_name"]
    else:
        model_name = transformer_model_name_or_archive_path
    self._tokenizer = PretrainedTransformerTokenizer(
        model_name=model_name, add_special_tokens=False)
    self._tokenindexer = PretrainedTransformerIndexer(model_name=model_name)
    self._min_num_candidate = min_num_candidate
    self._max_num_candidate = max_num_candidate
def test_text_to_instance_with_bert_tokenizer_and_indexer(self):
    tokenizer = PretrainedTransformerTokenizer('bert-base-cased', do_lowercase=False)
    token_indexer = PretrainedTransformerIndexer('bert-base-cased', do_lowercase=False)
    reader = MaskedLanguageModelingReader(tokenizer, {'bert': token_indexer})
    instance = reader.text_to_instance(
        sentence='This is AllenNLP [MASK] token .', targets=['This'])
    assert [t.text for t in instance['tokens']] == [
        '[CLS]', 'This', 'is', 'Allen', '##NL', '##P', '[MASK]', 'token', '.', '[SEP]'
    ]
    assert [i.sequence_index for i in instance['mask_positions']] == [6]
    assert [t.text for t in instance['target_ids']] == ['This']

    vocab = Vocabulary()
    instance.index_fields(vocab)
    tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
    assert tensor_dict.keys() == {'tokens', 'mask_positions', 'target_ids'}
    bert_token_ids = tensor_dict['tokens']['bert'].numpy().tolist()
    target_ids = tensor_dict['target_ids']['bert'].numpy().tolist()
    # We don't know what wordpiece id BERT is going to assign to 'This', but it at least
    # should be the same between the input and the target.
    assert target_ids[0] == bert_token_ids[1]
def __init__(self,
             pretrained_model: str = None,
             tokenizer: Optional[Tokenizer] = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_pieces: int = 512,
             add_prefix: bool = False,
             combine_input_fields: bool = True,
             sample: int = -1) -> None:
    super().__init__()

    if pretrained_model is not None:
        self._tokenizer = PretrainedTransformerTokenizer(
            pretrained_model, max_length=max_pieces)
        token_indexer = PretrainedTransformerIndexer(pretrained_model)
        self._token_indexers = {'tokens': token_indexer}
    else:
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    self._sample = sample
    self._add_prefix = add_prefix
    self._combine_input_fields = combine_input_fields
    self._debug_prints = -1
def test_text_to_instance_with_bert_tokenizer_and_indexer(self):
    tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    token_indexer = PretrainedTransformerIndexer("bert-base-cased")
    reader = NextTokenLmReader(tokenizer, {"bert": token_indexer})
    instance = reader.text_to_instance(sentence="AllenNLP is very", target="very")
    assert [t.text for t in instance["tokens"]] == [
        "[CLS]",
        "Allen",
        "##NL",
        "##P",
        "is",
        "very",
        "[SEP]",
    ]
    assert [t.text for t in instance["target_ids"]] == ["very"]

    vocab = Vocabulary()
    instance.index_fields(vocab)
    tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
    assert tensor_dict.keys() == {"tokens", "target_ids"}
    bert_token_ids = tensor_dict["tokens"]["bert"].numpy().tolist()
    target_ids = tensor_dict["target_ids"]["bert"].numpy().tolist()
    # We don't know what wordpiece id BERT is going to assign to 'very', but it at least
    # should be the same between the input and the target.
    assert target_ids[0] == bert_token_ids[5]
def __init__(self,
             pretrained_model: str,
             max_pieces: int = 512,
             num_choices: int = 4,
             add_prefix: Dict[str, str] = None,
             sample: int = -1) -> None:
    super().__init__()

    self._tokenizer = PretrainedTransformerTokenizer(pretrained_model)
    self._tokenizer_no_special_tokens = PretrainedTransformerTokenizer(
        pretrained_model, add_special_tokens=False)
    # self._tokenizer_internal = self._tokenizer._tokenizer
    self._tokenizer_internal = self._tokenizer.tokenizer
    token_indexer = PretrainedTransformerIndexer(pretrained_model)
    self._token_indexers = {'tokens': token_indexer}

    self._max_pieces = max_pieces
    self._sample = sample
    self._num_choices = num_choices
    self._add_prefix = add_prefix or {}

    for model in ["roberta", "bert", "openai-gpt", "gpt2", "transfo-xl", "xlnet", "xlm"]:
        if model in pretrained_model:
            self._model_type = model
            break
def __init__(self, hparams, em_dict, rm_dict, entity_mentions, tokensD, all_known_e2,
             all_known_e1, scoresD_tail, scoresD_head, mode, max_instances,
             world_size) -> None:
    super().__init__(manual_distributed_sharding=True,
                     manual_multi_process_sharding=True,
                     max_instances=max_instances,
                     world_size=world_size)
    self.hparams = hparams
    self.em_dict = em_dict
    self.rm_dict = rm_dict
    self.entity_mentions = entity_mentions
    self._tokenizer = PretrainedTransformerTokenizer('bert-base-cased')
    self._token_indexer = {
        "tokens": PretrainedTransformerIndexer('bert-base-cased')
    }
    self._tokensD = tokensD
    self.random_indexes = list(range(len(em_dict)))
    random.shuffle(self.random_indexes)
    self.key_index = 0
    self.all_known_e1 = all_known_e1
    self.all_known_e2 = all_known_e2
    self.scoresD_tail = scoresD_tail
    self.scoresD_head = scoresD_head
    self.mode = mode
    self.factsD = None
    if hparams.retrieve_facts:
        self.factsD = pickle.load(open(hparams.retrieve_facts, 'rb'))
def __init__(self,
             transformer_model_name: str = "bert-base-cased",
             length_limit: int = 384,
             stride: int = 128,
             skip_invalid_examples: bool = False,
             max_query_length: int = 64,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._tokenizer = PretrainedTransformerTokenizer(
        transformer_model_name,
        add_special_tokens=False,
        calculate_character_offsets=True)
    self._token_indexers = {
        "tokens": PretrainedTransformerIndexer(transformer_model_name)
    }
    self.length_limit = length_limit
    self.stride = stride
    self.skip_invalid_examples = skip_invalid_examples
    self.max_query_length = max_query_length
    self.non_content_type_id = max(
        self._tokenizer.tokenizer.encode_plus(
            "left", "right", return_token_type_ids=True)["token_type_ids"])

    # Workaround for a bug in the transformers library.
    if "distilbert" in transformer_model_name:
        self.non_content_type_id = 0
def __init__(self,
             pretrained_model_pth: str = None,
             lazy: bool = False,
             mode: str = 'sop'):
    """
    Three-stage data reader:
    1. Read s first, then o, and finally p and its position.
    2. Read o first, then s, and finally p.
    :param pretrained_model_pth:
    :param lazy:
    :param mode: selects which element is read first
    """
    super().__init__(lazy)
    if pretrained_model_pth is None:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(pretrained_model_pth)
        }
    self.pretrained_tokenizer = None
    if pretrained_model_pth is not None:
        self.pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_pth)
    assert mode in ['sop', 'osp']
    self.mode = mode
def test_text_to_instance_with_bert_tokenizer_and_indexer(self):
    tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    token_indexer = PretrainedTransformerIndexer("bert-base-cased")
    reader = MaskedLanguageModelingReader(tokenizer, {"bert": token_indexer})
    instance = reader.text_to_instance(
        sentence="This is AllenNLP [MASK] token .", targets=["This"])
    assert [t.text for t in instance["tokens"]] == [
        "[CLS]",
        "This",
        "is",
        "Allen",
        "##NL",
        "##P",
        "[MASK]",
        "token",
        ".",
        "[SEP]",
    ]
    assert [i.sequence_index for i in instance["mask_positions"]] == [6]
    assert [t.text for t in instance["target_ids"]] == ["This"]

    vocab = Vocabulary()
    instance.index_fields(vocab)
    tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
    assert tensor_dict.keys() == {"tokens", "mask_positions", "target_ids"}
    bert_token_ids = tensor_dict["tokens"]["bert"]["token_ids"].numpy().tolist()
    target_ids = tensor_dict["target_ids"]["bert"]["token_ids"].numpy().tolist()
    # We don't know what wordpiece id BERT is going to assign to 'This', but it at least
    # should be the same between the input and the target.
    assert target_ids[0] == bert_token_ids[1]
def token_indexer_returner(self):
    huggingface_name, do_lower_case = self.huggingfacename_returner()
    return {
        'tokens': PretrainedTransformerIndexer(model_name=huggingface_name,
                                               do_lowercase=do_lower_case)
    }
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = None,
             pseudo: bool = bool(args.pseudo)):
    super().__init__(lazy)
    # self.tokenizer = tokenizer or WhitespaceTokenizer()
    # self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    ags = {
        "additional_special_tokens":
        ("[pseudo1]", "[pseudo2]", "[pseudo3]", "[pseudo4]", "[pseudo5]",
         "[pseudo6]", "[pseudo7]", "[pseudo8]", "[pseudo9]")
    }
    self.tokenizer = tokenizer or PretrainedTransformerTokenizer(
        "bert-large-uncased", tokenizer_kwargs=(ags if args.pseudo else {}))
    self.token_indexers = token_indexers or {
        "tokens": PretrainedTransformerIndexer("bert-large-uncased")
    }
    self.max_tokens = max_tokens
    self.pseudo = pseudo
    self.tags = tags
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    domain_identifier: str = None,
    bert_model_name: str = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    if token_indexers is not None:
        self._token_indexers = token_indexers
    elif bert_model_name is not None:
        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {"tokens": PretrainedTransformerIndexer(bert_model_name)}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    self._domain_identifier = domain_identifier

    if bert_model_name is not None:
        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.lowercase_input = "uncased" in bert_model_name
    else:
        self.bert_tokenizer = None
        self.lowercase_input = False
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self.transformer_model = "bert-base-uncased"
    self.tokenizer = PretrainedTransformerTokenizer(
        model_name=self.transformer_model, add_special_tokens=False, max_length=512)
    self.token_indexer = PretrainedTransformerIndexer(
        model_name=self.transformer_model, max_length=512)
def test_end_to_end_t5(
    self,
    train_parameters: bool,
    last_layer_only: bool,
    gradient_checkpointing: bool,
):
    tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
    token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "patrickvonplaten/t5-tiny-random",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                    "gradient_checkpointing": gradient_checkpointing,
                    "sub_module": "encoder",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, False, False],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 8, 64)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
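Many of the dataset readers above only show __init__ storing the indexer in self._token_indexers; the step where that dict is attached to a TextField happens in their text_to_instance methods, which are not included here. Below is a minimal, hedged sketch of that typical pattern; ExampleReader is a hypothetical stand-in for the DatasetReader subclasses above, not code from any of them.

from typing import Dict

from allennlp.data import Instance, TokenIndexer
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer


class ExampleReader:  # hypothetical stand-in for a DatasetReader subclass
    def __init__(self, model_name: str = "bert-base-uncased") -> None:
        self._tokenizer = PretrainedTransformerTokenizer(model_name)
        self._token_indexers: Dict[str, TokenIndexer] = {
            "tokens": PretrainedTransformerIndexer(model_name)
        }

    def text_to_instance(self, text: str) -> Instance:
        # The indexer dict is attached to the TextField; the actual indexing happens
        # later, when the instance is indexed against a Vocabulary (see the tests above).
        tokens = self._tokenizer.tokenize(text)
        return Instance({"tokens": TextField(tokens, self._token_indexers)})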