Example #1
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() + [BOS, EOS])
        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=False,
            add_eos_token=False,
        )
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=True,
            add_eos_token=True,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
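
The character offsets returned alongside the token ids refer back to the original string: each (start_idx, end_idx) pair slices out the surface form of the corresponding token, while the -1 entries for BOS/EOS mark positions that have no span in the text. A minimal sketch (not part of the original test) that checks this against the values asserted above:

    text = "let's tokenize this"
    start_idx = (0, 6, 15)
    end_idx = (5, 14, 19)

    # Slicing with each (start, end) pair recovers the surface tokens
    # that were numberized to ids [0, 1, 2] above.
    spans = [text[s:e] for s, e in zip(start_idx, end_idx)]
    assert spans == ["let's", "tokenize", "this"]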
Example #2
    def _lookup_tokens(self, text: str, seq_len: int = None):
        """
        This function knows how to call lookup_tokens with the correct
        settings for this model. The default behavior is to wrap the
        numberized text with distinct BOS and EOS tokens. The resulting
        vector would look something like this:
            [BOS, token1_id, . . . tokenN_id, EOS]

        The function also takes an optional seq_len parameter which is
        used to customize truncation in case we have multiple text fields.
        By default max_seq_len is used. It's upto the numberize function of
        the class to decide how to use the seq_len param.

        For example:
        - In the case of sentence pair classification, we might want both
        pieces of text have the same length which is half of the
        max_seq_len supported by the model.
        - In the case of QA, we might want to truncate the context by a
        seq_len which is longer than what we use for the question.
        """
        return lookup_tokens(
            text,
            tokenizer=self.tokenizer,
            vocab=self.vocab,
            bos_token=self.vocab.bos_token,
            eos_token=self.vocab.eos_token,
            max_seq_len=seq_len if seq_len else self.max_seq_len,
        )
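
The docstring above leaves it to each caller's numberize function to decide what seq_len to pass in. A minimal sketch of the sentence pair case it describes, with a hypothetical numberize method and hypothetical field names (text_a, text_b) that are not taken from the original source:

    def numberize(self, row):
        # Sentence pair classification: give each piece of text half of the
        # model's max_seq_len, as suggested by the docstring above.
        per_text_len = self.max_seq_len // 2
        tokens_a, _, _ = self._lookup_tokens(row["text_a"], seq_len=per_text_len)
        tokens_b, _, _ = self._lookup_tokens(row["text_b"], seq_len=per_text_len)
        return tokens_a + tokens_b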
Example #3
 def _numberize_and_wrap(self, text: str, seq_len: int) -> List[List[int]]:
     sentence = ([self.special_token] +
                 lookup_tokens(text,
                               tokenizer=self.tokenizer,
                               vocab=self.vocab,
                               max_seq_len=seq_len)[0] +
                 [self.special_token])
     return [sentence]
Example #4
 def _lookup_tokens(self, text):
     return lookup_tokens(
         text,
         tokenizer=self.tokenizer,
         vocab=self.vocab,
         add_bos_token=False,
         add_eos_token=self.add_eos_token,
         max_seq_len=self.max_seq_len,
     )
Example #5
 def _lookup_tokens(self, text: str, seq_len: int = None):
     return lookup_tokens(
         text,
         tokenizer=self.tokenizer,
         vocab=self.vocab,
         bos_token=None,
         eos_token=self.vocab.eos_token,
         max_seq_len=seq_len if seq_len else self.max_seq_len,
     )
Example #6
 def _lookup_tokens(self, text):
     return lookup_tokens(
         text,
         tokenizer=self.tokenizer,
         vocab=self.vocab,
         bos_token=None,
         eos_token=EOS,
         max_seq_len=self.max_seq_len,
     )
Example #7
 def _lookup_tokens(self, text: str, seq_len: int) -> List[str]:
     return lookup_tokens(
         text,
         tokenizer=self.tokenizer,
         vocab=self.vocab,
         bos_token=self.vocab.eos_token,
         eos_token=self.vocab.eos_token,
         use_eos_token_for_bos=True,
         max_seq_len=seq_len,
     )
Example #8
 def _lookup_tokens(self, text: str, seq_len: int = None):
     # BoS token is added explicitly in numberize()
     return lookup_tokens(
         text,
         tokenizer=self.tokenizer,
         vocab=self.vocab,
         bos_token=None,
         eos_token=self.vocab.eos_token,
         max_seq_len=seq_len if seq_len else self.max_seq_len,
     )
Example #9
 def _lookup_tokens(self, text: str, seq_len: int = None):
     # BoS token is added explicitly in numberize(), -1 from max_seq_len
     max_seq_len = (seq_len or self.max_seq_len) - 1
     return lookup_tokens(
         text,
         tokenizer=self.tokenizer,
         vocab=self.vocab,
         bos_token=None,
         eos_token=self.vocab.eos_token,
         max_seq_len=max_seq_len,
     )
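
Examples #8 and #9 both defer the BOS token to numberize(); example #9 additionally reserves a slot for it by subtracting 1 from the length budget before calling lookup_tokens. A rough sketch of what such a numberize might look like; the method body and the accessor for the BOS id are assumptions, not code from the original project:

     def numberize(self, row):
         # _lookup_tokens above truncated to max_seq_len - 1, so prepending
         # BOS here keeps the total length within max_seq_len.
         tokens, _, _ = self._lookup_tokens(row["text"])
         bos_id = self.vocab.get_bos_index()  # assumed accessor for the BOS id
         return [bos_id] + tokens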
Example #10
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() +
                           [SpecialTokens.BOS, SpecialTokens.EOS])
        tokens, start_idx, end_idx = lookup_tokens(text,
                                                   tokenizer=tokenizer,
                                                   vocab=vocab,
                                                   bos_token=None,
                                                   eos_token=None)
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            bos_token=SpecialTokens.BOS,
            eos_token=SpecialTokens.EOS,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))