def train_bert_tokenizer(
    sentences: List[str], serialize_path: str, vocab_size: int = 6000
) -> BertTokenizer:
    """Train a cased WordPiece tokenizer on *sentences* and persist it.

    Trains a ``BertWordPieceTokenizer`` from the given sentences, writes its
    vocab into *serialize_path*, reloads that vocab as a
    ``transformers.BertTokenizer``, and saves the full pretrained files.

    Args:
        sentences: Training corpus, one sentence per element.
        serialize_path: Existing directory to write the tokenizer files into.
        vocab_size: Target vocabulary size for training.

    Returns:
        The trained tokenizer reloaded as a ``BertTokenizer``.
        (Note: the original annotation claimed ``BertWordPieceTokenizer``,
        but the object returned is the ``BertTokenizer`` built below.)
    """
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )
    tokenizer.train_from_iterator(
        sentences,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=500,
        wordpieces_prefix="##",
    )
    # Save the files -- first write out the vocab, then use BertTokenizer's
    # save_pretrained so the standard HF tokenizer files are produced.
    tokenizer.save_model(serialize_path)
    bert_tokenizer = BertTokenizer.from_pretrained(
        os.path.join(serialize_path, "vocab.txt")
    )
    bert_tokenizer.save_pretrained(serialize_path)
    # save_pretrained writes tokenizer_config.json; downstream loading here
    # expects config.json, so rename it in place.
    os.rename(
        os.path.join(serialize_path, "tokenizer_config.json"),
        os.path.join(serialize_path, "config.json"),
    )
    return bert_tokenizer
def training_WordPiece(self):
    """Train a WordPiece tokenizer on the mecab-tokenized corpus and save it.

    Reads every file in ``self.corpus_dir_path`` whose name contains
    ``'mecab'``, trains a ``BertWordPieceTokenizer`` with the limits from
    ``self.config``, and writes ``tokenizer-vocab.txt`` into
    ``self.config['tokenizer_path']``.
    """
    tokenizer = BertWordPieceTokenizer(
        vocab=None,
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
        wordpieces_prefix='##',
    )
    # Only the mecab-pretokenized corpus files participate in training.
    corpus_files = [
        os.path.join(self.corpus_dir_path, name)
        for name in os.listdir(self.corpus_dir_path)
        if 'mecab' in name
    ]
    tokenizer.train(
        corpus_files,
        limit_alphabet=self.config['limit_alphabet'],
        vocab_size=self.config['vocab_size'],
        special_tokens=self.get_special_tokens(),
    )
    print('training WordPiece is finished!')
    # save_model with prefix='tokenizer' emits tokenizer-vocab.txt.
    tokenizer.save_model(self.config['tokenizer_path'], prefix='tokenizer')
    print('tokenizer is saved in {}'.format(
        os.path.join(self.config['tokenizer_path'], 'tokenizer-vocab.txt')))
import tokenizers
from transformers import BertTokenizer
import glob
from tokenizers.implementations import BertWordPieceTokenizer

# Train a cased WordPiece tokenizer over every .txt file in the corpus
# directory and write the resulting vocab file to the current directory.
# (The original constructed a default BertWordPieceTokenizer and immediately
# discarded it by reassignment; that dead instantiation is removed.)
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)
files = glob.glob("./corpus_for_tokenization/*.txt")
tokenizer.train(
    files,
    vocab_size=50000,
    min_frequency=3,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=15000,
    wordpieces_prefix="##",
)
tokenizer.save_model("./")