def test_lookup_tokens(self):
    text = "let's tokenize this"
    tokenizer = Tokenizer()
    vocab = Vocabulary(text.split() + [BOS, EOS])
    tokens, start_idx, end_idx = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        add_bos_token=False,
        add_eos_token=False,
    )
    self.assertEqual(tokens, [0, 1, 2])
    self.assertEqual(start_idx, (0, 6, 15))
    self.assertEqual(end_idx, (5, 14, 19))
    tokens, start_idx, end_idx = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        add_bos_token=True,
        add_eos_token=True,
    )
    self.assertEqual(tokens, [3, 0, 1, 2, 4])
    self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
    self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
def _lookup_tokens(self, text: str, seq_len: Optional[int] = None):
    """
    This function knows how to call lookup_tokens with the correct settings
    for this model. The default behavior is to wrap the numberized text with
    distinct BOS and EOS tokens. The resulting vector would look something
    like this:

        [BOS, token1_id, ..., tokenN_id, EOS]

    The function also takes an optional seq_len parameter, which is used to
    customize truncation when there are multiple text fields. By default
    max_seq_len is used. It's up to the numberize function of the class to
    decide how to use the seq_len param. For example:

    - In sentence pair classification, we might want both pieces of text to
      have the same length, each half of the max_seq_len supported by the
      model.
    - In QA, we might want to truncate the context with a seq_len that is
      longer than the one used for the question.
    """
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        bos_token=self.vocab.bos_token,
        eos_token=self.vocab.eos_token,
        max_seq_len=seq_len if seq_len else self.max_seq_len,
    )
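A minimal usage sketch of the default behavior described in the docstring, reusing the three-word fixture and vocab ids from the tests in this section (word ids 0-2, BOS 3, EOS 4); the `tensorizer` instance is hypothetical and stands in for any model whose _lookup_tokens wraps with distinct BOS/EOS tokens:

# Hypothetical call against an assumed `tensorizer` instance.
tokens, start_idx, end_idx = tensorizer._lookup_tokens("let's tokenize this")
# tokens    -> [3, 0, 1, 2, 4]     # [BOS, token ids ..., EOS]
# start_idx -> (-1, 0, 6, 15, -1)  # character offsets; -1 marks special tokens
# end_idx   -> (-1, 5, 14, 19, -1)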
def _numberize_and_wrap(self, text: str, seq_len: int) -> List[List[int]]:
    sentence = (
        [self.special_token]
        + lookup_tokens(
            text,
            tokenizer=self.tokenizer,
            vocab=self.vocab,
            max_seq_len=seq_len,
        )[0]
        + [self.special_token]
    )
    return [sentence]
def _lookup_tokens(self, text):
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        add_bos_token=False,
        add_eos_token=self.add_eos_token,
        max_seq_len=self.max_seq_len,
    )
def _lookup_tokens(self, text: str, seq_len: Optional[int] = None):
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        bos_token=None,
        eos_token=self.vocab.eos_token,
        max_seq_len=seq_len if seq_len else self.max_seq_len,
    )
def _lookup_tokens(self, text):
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        bos_token=None,
        eos_token=EOS,
        max_seq_len=self.max_seq_len,
    )
def _lookup_tokens(self, text: str, seq_len: int) -> List[str]:
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        bos_token=self.vocab.eos_token,
        eos_token=self.vocab.eos_token,
        use_eos_token_for_bos=True,
        max_seq_len=seq_len,
    )
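With use_eos_token_for_bos=True the EOS token is used in place of a distinct BOS, so both ends of the sequence get the same id. A hedged illustration, borrowing the vocab ids from the test fixture above (EOS id 4 comes from that fixture, not from this model's vocab) and reusing the hypothetical `tensorizer` from the earlier sketch:

# Hypothetical result when EOS doubles as BOS.
tokens, _, _ = tensorizer._lookup_tokens("let's tokenize this", seq_len=10)
# tokens -> [4, 0, 1, 2, 4]  # the EOS id appears at both ends; no separate BOS id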
def _lookup_tokens(self, text: str, seq_len: Optional[int] = None):
    # The BOS token is added explicitly in numberize()
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        bos_token=None,
        eos_token=self.vocab.eos_token,
        max_seq_len=seq_len if seq_len else self.max_seq_len,
    )
def _lookup_tokens(self, text: str, seq_len: Optional[int] = None):
    # The BOS token is added explicitly in numberize(), so reserve one slot
    # for it by subtracting 1 from max_seq_len.
    max_seq_len = (seq_len or self.max_seq_len) - 1
    return lookup_tokens(
        text,
        tokenizer=self.tokenizer,
        vocab=self.vocab,
        bos_token=None,
        eos_token=self.vocab.eos_token,
        max_seq_len=max_seq_len,
    )
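The subtraction only makes sense next to the numberize() that the comment refers to; below is a hedged sketch of that pairing. The method body, the row/column access, and the get_bos_index() helper are assumptions for illustration, not the actual implementation:

# Hypothetical numberize() sketch: _lookup_tokens above leaves one slot free
# (max_seq_len - 1) so that prepending the BOS id here never exceeds the budget.
def numberize(self, row):
    tokens, start_idx, end_idx = self._lookup_tokens(row[self.column])
    tokens = [self.vocab.get_bos_index()] + tokens  # assumed vocab helper
    return tokens, len(tokens)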
def test_lookup_tokens(self):
    text = "let's tokenize this"
    tokenizer = Tokenizer()
    vocab = Vocabulary(text.split() + [SpecialTokens.BOS, SpecialTokens.EOS])
    tokens, start_idx, end_idx = lookup_tokens(
        text, tokenizer=tokenizer, vocab=vocab, bos_token=None, eos_token=None
    )
    self.assertEqual(tokens, [0, 1, 2])
    self.assertEqual(start_idx, (0, 6, 15))
    self.assertEqual(end_idx, (5, 14, 19))
    tokens, start_idx, end_idx = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        bos_token=SpecialTokens.BOS,
        eos_token=SpecialTokens.EOS,
    )
    self.assertEqual(tokens, [3, 0, 1, 2, 4])
    self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
    self.assertEqual(end_idx, (-1, 5, 14, 19, -1))