def test_tokenize_no_byte_offsets(self):
    tokenizer = Tokenizer()
    sentence = "Ordér mê å ćoƒfee"
    # Character offsets: each accented character counts as a single position.
    expected = [
        Token("ordér", 0, 5),
        Token("mê", 6, 8),
        Token("å", 9, 10),
        Token("ćoƒfee", 11, 17),
    ]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, tokens)
def test_tokenize_use_byte_offsets(self):
    tokenizer = Tokenizer(use_byte_offsets=True)
    sentence = "Ordér mê å ćoƒfee"
    # Byte offsets: é, ê, å, ć, and ƒ each occupy two bytes in UTF-8.
    expected = [
        Token("ordér", 0, 6),
        Token("mê", 7, 10),
        Token("å", 11, 13),
        Token("ćoƒfee", 14, 22),
    ]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, tokens)
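# Illustrative sketch, not part of the original suite: the byte offsets above follow
# from UTF-8 widths, and a character span can be converted to a byte span by encoding
# the corresponding prefixes. The helper name below is hypothetical.
def _utf8_byte_span(sentence, char_start, char_end):
    # The length of the encoded prefix gives the byte position of each character boundary.
    return (
        len(sentence[:char_start].encode("utf-8")),
        len(sentence[:char_end].encode("utf-8")),
    )

assert _utf8_byte_span("Ordér mê å ćoƒfee", 0, 5) == (0, 6)      # "Ordér": é is 2 bytes
assert _utf8_byte_span("Ordér mê å ćoƒfee", 11, 17) == (14, 22)  # "ćoƒfee": ć and ƒ are 2 bytes each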
def test_gpt2_bpe_tokenizer(self):
    text = "Prototype"
    expected = [Token("19703", 0, 4), Token("8690", 4, 9)]
    tokenizer = GPT2BPETokenizer.from_config(
        GPT2BPETokenizer.Config(
            bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
        )
    )
    tokens = tokenizer.tokenize(text)
    print(tokens)
    self.assertEqual(tokens, expected)
def test_wordpiece_tokenizer(self):
    text = "Marcó Lopᚠz"
    # "Marcó" is lowercased and accent-stripped into wordpieces; the second word
    # contains a character not covered by the 1k vocabulary, so it maps to [UNK].
    expected = [
        Token("m", 0, 1),
        Token("##ar", 1, 3),
        Token("##c", 3, 4),
        Token("##o", 4, 5),
        Token("[UNK]", 6, 11),
    ]
    tokenizer = WordPieceTokenizer.from_config(
        WordPieceTokenizer.Config(
            wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
        )
    )
    tokens = tokenizer.tokenize(text)
    print(tokens)
    self.assertEqual(tokens, expected)
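# Illustrative sketch, not part of the original suite: WordPiece prefixes word-internal
# pieces with "##", so the pieces of "Marcó" above rejoin into the lowercased,
# accent-stripped word. The helper name below is hypothetical.
def _join_wordpieces(pieces):
    # Drop the "##" continuation marker and concatenate the pieces of a single word.
    return "".join(p[2:] if p.startswith("##") else p for p in pieces)

assert _join_wordpieces(["m", "##ar", "##c", "##o"]) == "marco"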
def test_input_text_truncation(self):
    sentence = "Testing out sentencepiece"
    # With max_input_text_length=11, only "Testing out" is tokenized.
    expected = [
        Token(value="▁T", start=0, end=1),
        Token(value="est", start=1, end=4),
        Token(value="ing", start=4, end=7),
        Token(value="▁out", start=8, end=11),
    ]
    sp_tokenizer = SentencePieceTokenizer.from_config(
        SentencePieceTokenizer.Config(
            sp_model_path="pytext/data/test/data/sentencepiece.model",
            max_input_text_length=11,
        )
    )
    tokens = sp_tokenizer.tokenize(sentence)
    self.assertEqual(tokens, expected)
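# Illustrative sketch, not part of the original suite: the expected pieces above stop at
# "▁out" because, assuming max_input_text_length truncates the raw text by character
# count, only the first 11 characters of the sentence reach the tokenizer.
assert "Testing out sentencepiece"[:11] == "Testing out"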
def test_gpt2_bpe_tokenizer(self):
    tokenizer = GPT2BPETokenizer.from_config(
        GPT2BPETokenizer.Config(
            bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
        )
    )
    text_list = ["Prototype", " Prototype"]
    expected_list = [
        [Token("19703", 0, 4), Token("8690", 4, 9)],
        [Token("220", 0, 0), Token("19703", 1, 5), Token("8690", 5, 10)],
    ]
    for text, expected in zip(text_list, expected_list):
        tokens = tokenizer.tokenize(text)
        self.assertEqual(tokens, expected)
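# Illustrative sketch, not part of the original suite: the GPT-2 BPE tokens carry
# string-valued vocabulary ids, while start/end are character offsets into the raw text;
# the leading space in " Prototype" becomes its own token ("220") with an empty span.
# A plain-Python check that the non-empty spans line up with the surface text:
_text = " Prototype"
for _start, _end, _surface in [(1, 5, "Prot"), (5, 10, "otype")]:
    assert _text[_start:_end] == _surface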
def test_tokenize(self):
    sentence = "Testing out sentencepiece"
    expected = [
        Token(value="▁T", start=0, end=1),
        Token(value="est", start=1, end=4),
        Token(value="ing", start=4, end=7),
        Token(value="▁out", start=8, end=11),
        Token(value="▁sen", start=12, end=15),
        Token(value="t", start=15, end=16),
        Token(value="ence", start=16, end=20),
        Token(value="p", start=20, end=21),
        Token(value="i", start=21, end=22),
        Token(value="e", start=22, end=23),
        Token(value="ce", start=23, end=25),
    ]
    sp_tokenizer = SentencePieceTokenizer.from_config(
        SentencePieceTokenizer.Config(
            sp_model_path="pytext/data/test/data/sentencepiece.model"
        )
    )
    tokens = sp_tokenizer.tokenize(sentence)
    self.assertEqual(tokens, expected)
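# Illustrative sketch, not part of the original suite: SentencePiece marks pieces that
# start a new word with "▁", so concatenating the expected pieces and mapping "▁" back
# to a space recovers the original sentence.
_pieces = ["▁T", "est", "ing", "▁out", "▁sen", "t", "ence", "p", "i", "e", "ce"]
assert "".join(_pieces).replace("▁", " ").strip() == "Testing out sentencepiece"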