Example #1
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"),
                             ["un", "##want", "##ed", "runn", "##ing"])

        self.assertListEqual(tokenizer.tokenize("unwantedX running"),
                             ["[UNK]", "runn", "##ing"])
Example #2
    def add_custom_vocab(self, custom_vocab_file):
        # Replace the active vocabulary with one loaded from a custom file.
        self.vocab = self._load_custom_vocab(custom_vocab_file)
        # Rebuild the id -> token mapping so decoding stays in sync with the new vocab.
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        # Recreate the wordpiece tokenizer on top of the new vocabulary.
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
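A hypothetical call site for add_custom_vocab might look like the following. It assumes the method is defined on a BertTokenizer-like subclass and that my_domain_vocab.txt is a one-wordpiece-per-line file; both names are illustrative and not part of the original example.

# Illustrative only: "DomainBertTokenizer" and "my_domain_vocab.txt" are assumed names.
tokenizer = DomainBertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_custom_vocab("my_domain_vocab.txt")
# The wordpiece tokenizer now resolves pieces against the custom vocabulary.
print(tokenizer.wordpiece_tokenizer.tokenize("unwanted"))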
Example #3
    def finish_deserializing(self):
        # Rebuild the id -> token mapping from the restored vocabulary.
        self.ids_to_tokens = OrderedDict([(ids, tok)
                                          for tok, ids in self.vocab.items()])
        # Recreate the helper tokenizers, which are not serialized directly.
        if self.do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=self.do_lower_case,
                never_split=self.never_split,
                tokenize_chinese_chars=self.tokenize_chinese_chars,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
Example #4
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 never_split=None,
                 never_split_chars=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 tokenize_chinese_chars=True,
                 **kwargs):
        """Constructs a BertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization before wordpiece.
            **never_split**: (`optional`) list of string
                List of tokens which will never be split during tokenization.
                Only has an effect when do_basic_tokenize=True.
            **tokenize_chinese_chars**: (`optional`) boolean (default True)
                Whether to tokenize Chinese characters.
                This should likely be deactivated for Japanese;
                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
        """
        # Forward the caller-supplied options to the base tokenizer class.
        super(BertTokenizer, self).__init__(vocab_file,
                                            do_lower_case=do_lower_case,
                                            do_basic_tokenize=do_basic_tokenize,
                                            never_split=never_split,
                                            never_split_chars=never_split_chars,
                                            unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            tokenize_chinese_chars=tokenize_chinese_chars,
                                            **kwargs)

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                never_split_chars=never_split_chars,
                tokenize_chinese_chars=tokenize_chinese_chars)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
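A minimal construction sketch for the class above, assuming a local vocab.txt in the one-wordpiece-per-line format expected by load_vocab:

# "vocab.txt" is an assumed local path, not part of the original example.
tokenizer = BertTokenizer("vocab.txt", do_lower_case=True)
tokens = tokenizer.tokenize("unwanted running")   # basic + wordpiece tokenization
ids = tokenizer.convert_tokens_to_ids(tokens)     # map pieces to vocabulary ids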