def test_wordpiece_tokenizer(self): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") self.assertListEqual(tokenizer.tokenize(""), []) self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
def from_config(cls, config: Config):
    basic_tokenizer = create_component(ComponentType.TOKENIZER, config.basic_tokenizer)
    vocab = WordPieceTokenizer.load_vocab(config.wordpiece_vocab_path)
    wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")  # UNK is for compatibility with HF v0.5
    return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
def finish_deserializing(self):
    self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    if self.do_basic_tokenize:
        self.basic_tokenizer = BasicTokenizer(
            do_lower_case=self.do_lower_case,
            never_split=self.never_split,
            tokenize_chinese_chars=self.tokenize_chinese_chars,
        )
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
    super().finish_deserializing()
def test_wordpiece_tokenizer(self): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは" "ばんは", "##こん", "##にちは", "##ばんは"] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") self.assertListEqual(tokenizer.tokenize(""), []) self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"]) self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"]) self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])