Example #1
 def test_tokenize_no_byte_offsets(self):
     tokenizer = Tokenizer()
     sentence = "Ordér mê å ćoƒfee"
     expected = [
         Token("ordér", 0, 5),
         Token("mê", 6, 8),
         Token("å", 9, 10),
         Token("ćoƒfee", 11, 17),
     ]
     tokens = tokenizer.tokenize(sentence)
     self.assertListEqual(expected, tokens)
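The expected tokens here are lowercased copies of the whitespace-separated words, with what look like exclusive character offsets. A standalone check of that reading (not part of the test above):

sentence = "Ordér mê å ćoƒfee"
assert sentence[0:5].lower() == "ordér"     # Token("ordér", 0, 5)
assert sentence[6:8].lower() == "mê"        # Token("mê", 6, 8)
assert sentence[11:17].lower() == "ćoƒfee"  # Token("ćoƒfee", 11, 17)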
Example #2
 def test_tokenize_use_byte_offsets(self):
     tokenizer = Tokenizer(use_byte_offsets=True)
     sentence = "Ordér mê å ćoƒfee"
     expected = [
         Token("ordér", 0, 6),
         Token("mê", 7, 10),
         Token("å", 11, 13),
         Token("ćoƒfee", 14, 22),
     ]
     tokens = tokenizer.tokenize(sentence)
     self.assertListEqual(expected, tokens)
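The same sentence with use_byte_offsets=True widens every span, because the accented characters occupy two bytes in UTF-8. A standalone check of that reading (not part of the test above):

sentence = "Ordér mê å ćoƒfee"
raw = sentence.encode("utf-8")
assert raw[0:6].decode("utf-8").lower() == "ordér"     # "é" takes 2 bytes
assert raw[14:22].decode("utf-8").lower() == "ćoƒfee"  # "ć" and "ƒ" take 2 bytes each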
Example #3
 def test_gpt2_bpe_tokenizer(self):
     text = "Prototype"
     expected = [Token("19703", 0, 4), Token("8690", 4, 9)]
     tokenizer = GPT2BPETokenizer.from_config(
         GPT2BPETokenizer.Config(
             bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
             bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
         ))
     tokens = tokenizer.tokenize(text)
     print(tokens)
     self.assertEqual(tokens, expected)
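The token values here appear to be GPT-2 BPE ids rendered as strings, and the offsets imply that "Prototype" is split after its fourth character. A standalone check of the offsets (not part of the test above):

text = "Prototype"
assert text[0:4] == "Prot"   # Token("19703", 0, 4)
assert text[4:9] == "otype"  # Token("8690", 4, 9)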
Example #4
 def test_wordpiece_tokenizer(self):
     text = "Marcó Lopᚠz"
     expected = [
         Token("m", 0, 1),
         Token("##ar", 1, 3),
         Token("##c", 3, 4),
         Token(value="##o", start=4, end=5),
         Token(value="[UNK]", start=6, end=11),
     ]
     tokenizer = WordPieceTokenizer.from_config(
         WordPieceTokenizer.Config(
             wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"))
     tokens = tokenizer.tokenize(text)
     print(tokens)
     self.assertEqual(tokens, expected)
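Here "##" marks a WordPiece continuation piece of the lowercased first word, while the second word collapses to a single [UNK] spanning characters 6 to 11, presumably because it is missing from the 1k-entry vocabulary. A standalone check of the offsets (not part of the test above):

text = "Marcó Lopᚠz"
assert text[0:1].lower() == "m" and text[1:3].lower() == "ar"
assert text[6:11] == text.split()[1]  # the [UNK] span covers the whole second word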
Example #5
 def test_input_text_truncation(self):
     sentence = "Testing out sentencepiece"
     expected = [
         Token(value="▁T", start=0, end=1),
         Token(value="est", start=1, end=4),
         Token(value="ing", start=4, end=7),
         Token(value="▁out", start=8, end=11),
     ]
     sp_tokenizer = SentencePieceTokenizer.from_config(
         SentencePieceTokenizer.Config(
             sp_model_path="pytext/data/test/data/sentencepiece.model",
             max_input_text_length=11,
         ))
     tokens = sp_tokenizer.tokenize(sentence)
     self.assertEqual(tokens, expected)
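The expected pieces stop at "▁out", which is consistent with max_input_text_length clipping the raw string to its first 11 characters before SentencePiece runs (an assumption about the option, not stated in the test itself):

sentence = "Testing out sentencepiece"
assert sentence[:11] == "Testing out"  # everything after character 11 is dropped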
Example #6
    def test_gpt2_bpe_tokenizer(self):
        tokenizer = GPT2BPETokenizer.from_config(
            GPT2BPETokenizer.Config(
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
            ))
        text_list = ["Prototype", " Prototype"]
        expected_list = [
            [Token("19703", 0, 4), Token("8690", 4, 9)],
            [Token("220", 0, 0),
             Token("19703", 1, 5),
             Token("8690", 5, 10)],
        ]

        for (text, expected) in zip(text_list, expected_list):
            tokens = tokenizer.tokenize(text)
            self.assertEqual(tokens, expected)
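With a leading space the tokenizer emits an extra token "220" (which appears to be the lone-space symbol in the GPT-2 vocabulary) with an empty (0, 0) span, and the remaining subword offsets shift right by one character. A standalone check of the shifted offsets (not part of the test above):

text = " Prototype"
assert text[1:5] == "Prot"    # Token("19703", 1, 5)
assert text[5:10] == "otype"  # Token("8690", 5, 10)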
Example #7
 def test_tokenize(self):
     sentence = "Testing out sentencepiece"
     expected = [
         Token(value="▁T", start=0, end=1),
         Token(value="est", start=1, end=4),
         Token(value="ing", start=4, end=7),
         Token(value="▁out", start=8, end=11),
         Token(value="▁sen", start=12, end=15),
         Token(value="t", start=15, end=16),
         Token(value="ence", start=16, end=20),
         Token(value="p", start=20, end=21),
         Token(value="i", start=21, end=22),
         Token(value="e", start=22, end=23),
         Token(value="ce", start=23, end=25),
     ]
     sp_tokenizer = SentencePieceTokenizer.from_config(
         SentencePieceTokenizer.Config(
             sp_model_path="pytext/data/test/data/sentencepiece.model"))
     tokens = sp_tokenizer.tokenize(sentence)
     self.assertEqual(tokens, expected)
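Without truncation the whole sentence is tokenized. The "▁" prefix marks a word boundary, and each span seems to cover only the visible characters, skipping the spaces between words (hence the gap between end=7 and start=8). A standalone check (not part of the test above):

sentence = "Testing out sentencepiece"
assert sentence[0:1] == "T"       # "▁T"
assert sentence[8:11] == "out"    # "▁out"
assert sentence[16:20] == "ence"  # "ence"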