def __init__(self, vocab_file, do_lower_case=True, vocab_override=None):
  super().__init__()
  self.vocab_file = vocab_file
  self.do_lower_case = do_lower_case
  # Load the vocabulary from disk unless an explicit mapping was supplied.
  if vocab_override is None:
    self.vocab = tokenization.load_vocab(vocab_file)
  else:
    self.vocab = vocab_override
  # Inverse mapping from id back to token.
  self.inv_vocab = {v: k for k, v in self.vocab.items()}
  self.basic_tokenizer = tokenization.BasicTokenizer(
      do_lower_case=do_lower_case)
  self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
      vocab=self.vocab)
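# NOTE: a minimal sketch (not part of the original excerpt) of how the two
# sub-tokenizers constructed above are typically chained, assuming the
# surrounding class follows the BERT FullTokenizer pattern: run basic
# tokenization first, then wordpiece tokenization on each resulting token.
def tokenize(self, text):
  split_tokens = []
  for token in self.basic_tokenizer.tokenize(text):
    for sub_token in self.wordpiece_tokenizer.tokenize(token):
      split_tokens.append(sub_token)
  return split_tokens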
def test_wordpiece_tokenizer(self):
  vocab_tokens = [
      "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
      "runn", "##ing"
  ]
  vocab = {}
  for (i, token) in enumerate(vocab_tokens):
    vocab[token] = i
  tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

  self.assertAllEqual(tokenizer.tokenize(""), [])
  self.assertAllEqual(
      tokenizer.tokenize("unwanted running"),
      ["un", "##want", "##ed", "runn", "##ing"])
  self.assertAllEqual(
      tokenizer.tokenize("unwantedX running"),
      ["[UNK]", "runn", "##ing"])
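# NOTE: a standalone sketch (not part of the original excerpt) of the greedy
# longest-match-first segmentation that the assertions above exercise: at each
# position, take the longest vocabulary entry (continuation pieces carry a
# "##" prefix) and fall back to [UNK] when no full segmentation exists. The
# function name and parameters here are illustrative, not library API; it
# operates on a single whitespace-free word.
def greedy_wordpiece(word, vocab, unk_token="[UNK]"):
  pieces = []
  start = 0
  while start < len(word):
    end = len(word)
    cur_piece = None
    # Shrink the candidate substring until it appears in the vocabulary.
    while start < end:
      substr = word[start:end]
      if start > 0:
        substr = "##" + substr
      if substr in vocab:
        cur_piece = substr
        break
      end -= 1
    if cur_piece is None:
      # No piece matched at this position, so the whole word is unknown.
      return [unk_token]
    pieces.append(cur_piece)
    start = end
  return pieces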