def __init__(self, vocab, max_seq_len, max_char_len, do_lower_case=False, cut_fine_grained=True):
    self.vocab = vocab
    self.max_seq_len = max_seq_len
    self.max_char_len = max_char_len
    # Word-level normalization/segmentation stage.
    self.__norm_tokenizer = NormTokenizer(
        do_lower_case=do_lower_case, cut_fine_grained=cut_fine_grained)
    # Subword stage: greedy longest-match wordpiece splitting against the vocab.
    self.__wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
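# Hedged sketch only, not the project's actual method: assuming NormTokenizer
# exposes a tokenize(text) method that returns word-level tokens, a tokenize
# method added to the same class could chain the two stages like this. The use
# of max_char_len to guard the wordpiece stage and max_seq_len to truncate the
# output is an assumption made for illustration.
def tokenize(self, text):
    output_tokens = []
    for word in self.__norm_tokenizer.tokenize(text):
        if len(word) > self.max_char_len:
            # Overly long words map straight to [UNK] instead of being split.
            output_tokens.append("[UNK]")
            continue
        output_tokens.extend(self.__wordpiece_tokenizer.tokenize(word))
    # Keep at most max_seq_len tokens.
    return output_tokens[:self.max_seq_len]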
def test_wordpiece_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing"
    ]

    vocab = {}
    for (i, token) in enumerate(vocab_tokens):
        vocab[token] = i
    tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

    self.assertAllEqual(tokenizer.tokenize(""), [])

    self.assertAllEqual(
        tokenizer.tokenize("unwanted running"),
        ["un", "##want", "##ed", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
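# For reference, a minimal standalone sketch of the greedy longest-match-first
# strategy that the test above exercises. It simplifies what
# tokenization.WordpieceTokenizer.tokenize does for a single word (the real
# implementation also whitespace-splits its input and caps word length via
# max_input_chars_per_word); "greedy_wordpiece" and "unk_token" are names
# introduced here purely for illustration.
def greedy_wordpiece(word, vocab, unk_token="[UNK]"):
    pieces = []
    start = 0
    while start < len(word):
        # Try the longest remaining substring first, shrinking it until a
        # match is found in the vocab ("##" marks a non-initial piece).
        end = len(word)
        cur_piece = None
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece
            if piece in vocab:
                cur_piece = piece
                break
            end -= 1
        if cur_piece is None:
            # No prefix of the remainder is in the vocab: the whole word
            # becomes a single [UNK], matching the "unwantedX" case above.
            return [unk_token]
        pieces.append(cur_piece)
        start = end
    return pieces

# With the vocab from the test above:
#   greedy_wordpiece("unwanted", vocab)  -> ["un", "##want", "##ed"]
#   greedy_wordpiece("unwantedX", vocab) -> ["[UNK]"]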
def __init__(self, vocab_file, do_lower_case=False):
    # token -> id mapping loaded from a one-token-per-line vocab file.
    self.vocab = tokenization.load_vocab(vocab_file)
    # Reverse mapping (id -> token) for converting ids back to text.
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
    self.do_lower_case = do_lower_case
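# Illustrative usage of the tokenization helpers this constructor relies on.
# "vocab.txt" is a placeholder path to a BERT-style vocabulary file (one token
# per line), and the exact import path depends on how the BERT tokenization
# module is vendored in this project.
import tokenization

vocab = tokenization.load_vocab("vocab.txt")      # OrderedDict: token -> id
inv_vocab = {v: k for k, v in vocab.items()}      # id -> token, for decoding
wordpiece = tokenization.WordpieceTokenizer(vocab=vocab)

# With the test vocab shown earlier, this yields
# ["un", "##want", "##ed", "runn", "##ing"].
tokens = wordpiece.tokenize("unwanted running")
token_ids = [vocab[t] for t in tokens]
decoded = [inv_vocab[i] for i in token_ids]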