Example #1
def __init__(self,
             vocab,
             max_seq_len,
             max_char_len,
             do_lower_case=False,
             cut_fine_grained=True):
    # Token -> id mapping plus sequence / character length limits.
    self.vocab = vocab
    self.max_seq_len = max_seq_len
    self.max_char_len = max_char_len
    # Normalizing pre-tokenizer (project-specific NormTokenizer), followed by
    # BERT's greedy wordpiece tokenizer over the supplied vocabulary.
    self.__norm_tokenizer = NormTokenizer(
        do_lower_case=do_lower_case, cut_fine_grained=cut_fine_grained)
    self.__wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
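
For context, here is a minimal sketch of the tokenize method such a wrapper would typically pair with this constructor, following the usual BERT FullTokenizer pattern (pre-tokenize, then wordpiece each token). The method itself and the NormTokenizer.tokenize call are assumptions; neither appears in the snippet above.

# Hypothetical companion method (same class as the constructor above);
# assumes NormTokenizer exposes a tokenize(text) -> list of str interface.
def tokenize(self, text):
    wordpieces = []
    for token in self.__norm_tokenizer.tokenize(text):
        # Break each normalized token into vocabulary wordpieces.
        wordpieces.extend(self.__wordpiece_tokenizer.tokenize(token))
    # Truncate to the configured maximum sequence length.
    return wordpieces[:self.max_seq_len]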
Example #2
    def test_wordpiece_tokenizer(self):
        # Toy vocabulary: "##"-prefixed entries are continuation wordpieces.
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing"
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

        # Empty input produces no tokens.
        self.assertAllEqual(tokenizer.tokenize(""), [])

        # Known words are split greedily into the longest matching pieces.
        self.assertAllEqual(tokenizer.tokenize("unwanted running"),
                            ["un", "##want", "##ed", "runn", "##ing"])

        # A word that cannot be covered by the vocabulary becomes [UNK].
        self.assertAllEqual(tokenizer.tokenize("unwantedX running"),
                            ["[UNK]", "runn", "##ing"])
Example #3
def __init__(self, vocab_file, do_lower_case=False):
    # Load the token -> id vocabulary from file (BERT's tokenization.py helper)
    # and build the inverse id -> token mapping for decoding.
    self.vocab = tokenization.load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
    self.do_lower_case = do_lower_case
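
Because the class keeps both vocab and inv_vocab, mapping wordpiece ids back to tokens is a plain dictionary lookup. A minimal sketch; the helper name is an assumption and is not part of the snippet above.

# Hypothetical helper for the class above: decode ids via the inverse vocabulary.
def convert_ids_to_tokens(self, ids):
    return [self.inv_vocab[i] for i in ids]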