import glob
import os


def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    # Build a WordPiece tokenizer from scratch; unk_token must match the
    # special token added to the trainer below.
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    # Unicode-decompose, lowercase, then strip accents.
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"],  # note: no "[SEP]"
    )
    files = glob.glob(args.text_raw_files_pattern)
    # Since tokenizers 0.10 the files come first: train(files, trainer).
    bert_tokenizer.train(files, trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    # Model.save writes the vocab file(s) and returns their paths; reload the
    # model with the unk token, then serialize the whole pipeline to JSON.
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")
    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
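# --- Usage sketch (added for illustration; not part of the script above). ---
# Assumes main() was run with output_dir="output"; the path and the sample
# sentence are placeholders. Shows reloading the serialized tokenizer and
# inspecting an encoding.
from tokenizers import Tokenizer

loaded = Tokenizer.from_file("output/bert-tokenizer-kr.json")
encoding = loaded.encode("이것은 예시 문장입니다")  # illustrative Korean sample
print(encoding.tokens)  # WordPiece pieces; OOV fragments map to [UNK]
print(encoding.ids)     # ids into the 10,000-entry trained vocabulary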
import pickle

import pytest

from tokenizers.models import Model, WordPiece


# Excerpt from the tokenizers test suite: a method of the WordPiece model
# tests, where the bert_files fixture provides the path to a BERT vocab file.
def test_instantiate(self, bert_files):
    assert isinstance(WordPiece(), Model)
    assert isinstance(WordPiece(), WordPiece)

    vocab = {"a": 0, "b": 1, "ab": 2}
    assert isinstance(WordPiece(vocab), Model)
    assert isinstance(WordPiece(vocab), WordPiece)
    assert isinstance(WordPiece.from_file(bert_files["vocab"]), WordPiece)
    assert isinstance(pickle.loads(pickle.dumps(WordPiece(vocab))), WordPiece)

    # Deprecated calls in 0.9
    with pytest.deprecated_call():
        assert isinstance(WordPiece(bert_files["vocab"]), Model)

    with pytest.deprecated_call():
        assert isinstance(
            pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece
        )
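# --- Illustrative sketch (not from the test suite). ---
# The deprecated_call blocks above pin down an API change in tokenizers 0.9:
# passing a vocab filename straight to the constructor now warns, and
# WordPiece.from_file is the supported loader. "vocab.txt" is a placeholder
# path to an existing BERT-style vocab file.
from tokenizers.models import WordPiece

old_style = WordPiece("vocab.txt")                               # emits a DeprecationWarning
new_style = WordPiece.from_file("vocab.txt", unk_token="[UNK]")  # preferred since 0.9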
import fire

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path, output_dir='data/tokenizer/', vocab_size=30_000, min_frequency=3):
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
    )
    tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    files = [dataset_path]
    # tokenizers >= 0.10 expects the files first: train(files, trainer).
    tokenizer.train(files, trainer)

    # Save the raw vocab, reload it with the unk token, then serialize the
    # full pipeline (model + normalizer + pre-tokenizer) to a single JSON.
    files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*files, unk_token='[UNK]')
    tokenizer.save(f'{output_dir}tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
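# --- Usage sketch (illustrative; not part of the original file). ---
# python-fire turns the function signature into CLI flags, so training could
# be launched as (script and corpus names are placeholders):
#
#   python train_tokenizer.py --dataset_path=data/corpus.txt --vocab_size=30000
#
# and the result reloaded from the serialized JSON:
from tokenizers import Tokenizer

tok = Tokenizer.from_file('data/tokenizer/tokenizer.json')
print(tok.encode('Hello world').tokens)  # normalized, whitespace-split pieces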