Example #1
from typing import List
import os

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer


def train_bert_tokenizer(sentences: List[str],
                         serialize_path: str,
                         vocab_size: int = 6000) -> BertTokenizer:
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )
    # Learn the WordPiece vocabulary directly from the in-memory sentences
    tokenizer.train_from_iterator(
        sentences,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=500,
        wordpieces_prefix="##",
    )

    # Save the files: first write out the vocab, then use BertTokenizer's save_pretrained
    tokenizer.save_model(serialize_path)
    bert_tokenizer = BertTokenizer.from_pretrained(serialize_path + os.sep +
                                                   "vocab.txt")
    bert_tokenizer.save_pretrained(serialize_path)
    os.rename(serialize_path + os.sep + "tokenizer_config.json",
              serialize_path + os.sep + "config.json")
    return bert_tokenizer
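
A minimal usage sketch for the helper above (not part of the original source): the sentence list, the output directory ./my_tokenizer, and the input file sentences.txt are placeholder assumptions.

# Hypothetical call of train_bert_tokenizer; file and directory names are illustrative only
with open("sentences.txt", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]
os.makedirs("./my_tokenizer", exist_ok=True)  # serialize_path must exist before save_model
bert_tokenizer = train_bert_tokenizer(sentences, "./my_tokenizer", vocab_size=6000)
print(bert_tokenizer.tokenize("WordPiece tokenization example"))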
Example #2
def training_WordPiece(self):
    # Train a WordPiece vocabulary on the MeCab-processed corpus files
    tokenizer = BertWordPieceTokenizer(vocab=None,
                                       clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=True,
                                       lowercase=True,
                                       wordpieces_prefix='##')
    tokenizer.train([
        os.path.join(self.corpus_dir_path, file_path)
        for file_path in os.listdir(self.corpus_dir_path)
        if 'mecab' in file_path
    ],
                    limit_alphabet=self.config['limit_alphabet'],
                    vocab_size=self.config['vocab_size'],
                    special_tokens=self.get_special_tokens())
    print('training WordPiece is finished!')
    # save_model with prefix='tokenizer' writes tokenizer-vocab.txt into tokenizer_path
    tokenizer.save_model(self.config['tokenizer_path'], prefix='tokenizer')
    print('tokenizer is saved in {}'.format(
        os.path.join(self.config['tokenizer_path'],
                     'tokenizer-vocab.txt')))
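
The vocabulary written above as tokenizer-vocab.txt can later be reloaded with transformers. A hedged sketch: tokenizer_path is a placeholder for self.config['tokenizer_path'], not a value from the original code.

import os
from transformers import BertTokenizer

tokenizer_path = "./tokenizer_out"  # placeholder for self.config['tokenizer_path']
# BertTokenizer accepts a plain vocab file; match lowercase=True used during training
bert_tokenizer = BertTokenizer(os.path.join(tokenizer_path, "tokenizer-vocab.txt"),
                               do_lower_case=True)
print(bert_tokenizer.tokenize("wordpiece training check"))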
Example #3
import glob

from tokenizers.implementations import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# Collect all corpus files and train a 50k-token WordPiece vocabulary on them
files = glob.glob("./corpus_for_tokenization/*.txt")

tokenizer.train(files,
                vocab_size=50000,
                min_frequency=3,
                show_progress=True,
                special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
                limit_alphabet=15000,
                wordpieces_prefix="##")
tokenizer.save_model("./")
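
A quick sanity check after training, sketched here as an assumption rather than part of the original example: encode a sample sentence with the freshly trained tokenizer and inspect the resulting wordpieces and ids.

# The Encoding returned by encode() exposes the tokens and their vocabulary ids
encoding = tokenizer.encode("A quick test of the new WordPiece vocabulary.")
print(encoding.tokens)
print(encoding.ids)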