def test_indices_to_tokens(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer_max_length = PretrainedTransformerIndexer(
        model_name="bert-base-uncased", max_length=4
    )
    indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_no_specials = "AllenNLP is great"

    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer_no_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_no_max_length.indices_to_tokens(indexed, vocab)

    self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

    indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_max_length.indices_to_tokens(indexed, vocab)

    # For now we are not removing special tokens introduced from max_length
    sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
    expected = (
        allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] + sep_cls + allennlp_tokens[5:]
    )

    self._assert_tokens_equal(expected, tokens_from_indices)
def test_mask(self):
    # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
    for model in ["bert-base-uncased", "roberta-base"]:
        allennlp_tokenizer = PretrainedTransformerTokenizer(model)
        indexer = PretrainedTransformerIndexer(model_name=model)
        string_no_specials = "AllenNLP is great"
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        expected_masks = [1] * len(indexed["token_ids"])
        assert indexed["mask"] == expected_masks

        max_length = 10
        padding_lengths = {key: max_length for key in indexed.keys()}
        padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
        padding_length = max_length - len(indexed["mask"])
        expected_masks = expected_masks + ([0] * padding_length)
        assert len(padded_tokens["mask"]) == max_length
        assert padded_tokens["mask"].tolist() == expected_masks

        assert len(padded_tokens["token_ids"]) == max_length
        padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id] * padding_length
        assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix
def test_as_array_produces_token_sequence(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    tokens = tokenizer.tokenize("AllenNLP is great")
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = [Token(token) for token in tokens]
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    tokens = tokenizer.tokenize("AllenNLP is great")
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = [Token(token) for token in tokens]
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def test_as_array_produces_token_sequence(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lowercase=True)
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", do_lowercase=True)
    tokens = tokenizer.tokenize("AllenNLP is great")
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = [Token(token) for token in tokens]
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
        "AllenNLP is great!", "Really it is!"
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
        "AllenNLP is great!", "Really it is!"
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_as_array_produces_token_sequence_roberta(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    string_specials = "<s> AllenNLP is great </s>"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def test_transformers_vocabs_added_correctly(self):
    namespace, model_name = "tags", "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_token_to_index_vocabulary(namespace=namespace) == tokenizer.encoder
def check_vocab_size(model_name: str):
    namespace = "tags"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size
def test_as_array_produces_token_sequence_bert_cased(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_transformers_vocab_sizes(self, model_name):
    namespace = "tags"
    tokenizer = cached_transformers.get_tokenizer(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size
def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
    tokenizer = cached_transformers.get_tokenizer("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base", add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize("AllenNLP is great!"),
        allennlp_tokenizer.tokenize("Really it is!"),
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def test_type_ids_when_folding(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-uncased", add_special_tokens=False
    )
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=6)
    first_string = "How do trees get online?"
    second_string = "They log in!"
    tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize(first_string), allennlp_tokenizer.tokenize(second_string)
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(tokens, vocab)
    assert min(indexed["type_ids"]) == 0
    assert max(indexed["type_ids"]) == 1
def test_mask(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_no_specials = "AllenNLP is great"
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    expected_masks = [1] * len(indexed["token_ids"])
    assert indexed["mask"] == expected_masks

    max_length = 10
    padding_lengths = {"token_ids": max_length, "mask": max_length}
    padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
    padding_length = max_length - len(indexed["mask"])
    expected_masks = expected_masks + ([0] * padding_length)
    assert len(padded_tokens["mask"]) == max_length
    assert padded_tokens["mask"].tolist() == expected_masks
def test_long_sequence_splitting(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    assert len(expected_ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = expected_ids[0], expected_ids[-1]
    expected_ids = (
        expected_ids[:3]
        + [sep_id, cls_id]
        + expected_ids[3:5]
        + [sep_id, cls_id]
        + expected_ids[5:]
    )

    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
    assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
    assert indexed["mask"] == [1] * 7  # original length
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
    """
    Use this indexer when (for whatever reason) you are not using a corresponding
    `PretrainedTransformerTokenizer` on your input. We assume that you used a tokenizer that splits
    strings into words, while the transformer expects wordpieces as input. This indexer splits the
    words into wordpieces and flattens them out. You should use the corresponding
    `PretrainedTransformerMismatchedEmbedder` to embed these wordpieces and then pull out a single
    vector for each original word.

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
    """

    def __init__(self, model_name: str, namespace: str = "tags", **kwargs) -> None:
        super().__init__(**kwargs)
        # The matched version vs. the mismatched one.
        self._matched_indexer = PretrainedTransformerIndexer(model_name, namespace, **kwargs)
        # add_special_tokens=False since we don't want wordpieces to be surrounded by special tokens
        self._allennlp_tokenizer = PretrainedTransformerTokenizer(
            model_name, add_special_tokens=False
        )
        self._tokenizer = self._allennlp_tokenizer.tokenizer
        (
            self._num_added_start_tokens,
            self._num_added_end_tokens,
        ) = self._determine_num_special_tokens_added()

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self._matched_indexer.count_vocab_items(token, counter)

    @overrides
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        orig_token_mask = [1] * len(tokens)
        tokens, offsets = self._intra_word_tokenize(tokens)

        # {"token_ids": ..., "mask": ...}
        output = self._matched_indexer.tokens_to_indices(tokens, vocabulary)

        # Insert type ids for the special tokens.
        output["type_ids"] = self._tokenizer.create_token_type_ids_from_sequences(
            output["token_ids"]
        )
        # Insert the special tokens themselves.
        output["token_ids"] = self._tokenizer.build_inputs_with_special_tokens(output["token_ids"])
        output["mask"] = orig_token_mask
        output["offsets"] = [
            (start + self._num_added_start_tokens, end + self._num_added_start_tokens)
            for start, end in offsets
        ]
        output["wordpiece_mask"] = [1] * len(output["token_ids"])
        return output

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output = self._matched_indexer.get_empty_token_list()
        output["offsets"] = []
        output["wordpiece_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = self._matched_indexer.as_padded_tensor_dict(tokens, padding_lengths)
        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(
                offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
            )
        )
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerMismatchedIndexer):
            for key in self.__dict__:
                if key == "_tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean. So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented

    def _intra_word_tokenize(
        self, tokens: List[Token]
    ) -> Tuple[List[Token], List[Tuple[int, int]]]:
        """
        Tokenizes each word into wordpieces separately. Also calculates offsets such that
        wordpieces[offsets[i][0]:offsets[i][1] + 1] corresponds to the original i-th token.
        Does not insert special tokens.
        """
        wordpieces: List[Token] = []
        offsets = []
        cumulative = 0
        for token in tokens:
            subword_wordpieces = self._allennlp_tokenizer.tokenize(token.text)
            wordpieces.extend(subword_wordpieces)

            start_offset = cumulative
            cumulative += len(subword_wordpieces)
            end_offset = cumulative - 1  # inclusive
            offsets.append((start_offset, end_offset))

        return wordpieces, offsets

    def _determine_num_special_tokens_added(self) -> Tuple[int, int]:
        """
        Determines the number of tokens `self._tokenizer` adds to a sequence (currently doesn't
        consider sequence pairs) at the start and end.

        # Returns

        The number of tokens (`int`) that are inserted at the start and end of a sequence.
        """
        # Uses a slightly higher index to avoid the tokenizer doing special things to lower-indexed
        # tokens which might be special.
        dummy = [1000]
        inserted = self._tokenizer.build_inputs_with_special_tokens(dummy)

        num_start = num_end = 0
        seen_dummy = False
        for idx in inserted:
            if idx == dummy[0]:
                if seen_dummy:  # seeing it twice
                    raise ValueError("Cannot auto-determine the number of special tokens added.")
                seen_dummy = True
                continue

            if not seen_dummy:
                num_start += 1
            else:
                num_end += 1

        assert num_start + num_end == self._tokenizer.num_added_tokens()
        return num_start, num_end
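

# A minimal usage sketch (not part of the original file): it shows how the mismatched indexer is
# meant to be driven and what the `offsets` entries mean per `_intra_word_tokenize`. The function
# name and the example sentence are hypothetical; the indexer, `Token`, and `Vocabulary` calls are
# the ones defined or imported above, and running it assumes the "bert-base-uncased" tokenizer can
# be downloaded.
def _example_mismatched_indexing():
    indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-uncased")
    words = [Token(text) for text in ["AllenNLP", "is", "great"]]
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(words, vocab)

    # "mask" has one entry per original word, while "wordpiece_mask" has one entry per wordpiece
    # (including the special tokens added by build_inputs_with_special_tokens).
    assert len(indexed["mask"]) == len(words)
    assert len(indexed["wordpiece_mask"]) == len(indexed["token_ids"])

    # offsets[i] is the inclusive (start, end) wordpiece span of the i-th original word, already
    # shifted past the leading special tokens, so there is exactly one pair per input word.
    assert len(indexed["offsets"]) == len(words)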