def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 6000) -> Tokenizer:
    # Build a BERT-style WordPiece tokenizer: NFD-normalize, lowercase, strip
    # accents, pre-tokenize on whitespace/punctuation, then add [CLS]/[SEP]
    # through the post-processing template.
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        # The ids 1 and 2 match the positions of [CLS] and [SEP] in the
        # trainer's special_tokens list below.
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        # The low-level Tokenizer has no save_model(); save() serializes the
        # whole tokenizer (vocab + configuration) to a single JSON file.
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
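# %%
# A minimal usage sketch for the train_tokenizer helper above. The toy corpus,
# the sentence pair, and the small vocab_size are made up for illustration;
# any List[str] of raw sentences works the same way.
sample_sentences = [
    "Hello world, this is a tiny corpus.",
    "WordPiece needs a few sentences to learn useful subwords.",
]
wp_tokenizer = train_tokenizer(sample_sentences, vocab_size=100)
# Encoding a sentence pair shows the "[CLS] $A [SEP] $B:1 [SEP]:1" template at work.
encoded = wp_tokenizer.encode("Hello world", "Is this a tiny corpus?")
print("Encoded string: {}".format(encoded.tokens))
print("Type ids: {}".format(encoded.type_ids))
decoded = wp_tokenizer.decode(encoded.ids)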
print("Decoded string: {}".format(decoded)) # %% from tokenizers import ByteLevelBPETokenizer # tokenizer提供了一些经典tokenization算法的高级封装 # 譬如可以用`ByteLevelBPETokenizer`简单地重写上面的内容 # tokenizer = ByteLevelBPETokenizer() tokenizer.train(files=['data/big.txt'], vocab_size=25000, show_progress=True) SAVE_PATH = Path('tokenizers') PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model' if not PATH.exists(): PATH.mkdir(parents=True, exist_ok=True) tokenizer.save_model(str(PATH)) tokenizer = ByteLevelBPETokenizer(vocab_file=str(PATH / 'vocab.json'), merges_file=str(PATH / 'merges.txt')) encoded = \ tokenizer.encode("This is a simple input to be tokenized.") print("Encoded string: {}".format(encoded.tokens)) decoded = \ tokenizer.decode(encoded.ids) print("Decoded string: {}".format(decoded)) # %% # 与transformers搭配使用时,Encoding structure中有用的properties # - normalized_str