Example #1
from typing import List

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer


def train_tokenizer(sentences: List[str],
                    serialize_path: str = "",
                    vocab_size: int = 6000) -> Tokenizer:
    """Train a BERT-style WordPiece tokenizer on an in-memory list of sentences."""
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    # Normalize to NFD, lowercase, and strip accents before pre-tokenization.
    bert_tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    # Wrap single sentences and sentence pairs with [CLS]/[SEP], BERT-style.
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        # Tokenizer has no save_model(); save() serializes the full tokenizer to a JSON file.
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
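
A minimal usage sketch for the function above; the toy corpus, the test sentence, and the wordpiece-tokenizer.json path are illustrative assumptions, not part of the original example:

corpus = [
    "the quick brown fox jumps over the lazy dog",
    "tokenizers build subword vocabularies from raw text",
]
tok = train_tokenizer(corpus, serialize_path="wordpiece-tokenizer.json", vocab_size=200)
enc = tok.encode("a quick fox")
print(enc.tokens)  # wrapped as [CLS] ... [SEP] by the TemplateProcessing post-processor
print(enc.ids)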
Example #2
print("Decoded string: {}".format(decoded))

# %%
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
# The tokenizers library also provides high-level wrappers for the classic
# tokenization algorithms; for example, the code above can be rewritten much
# more simply with `ByteLevelBPETokenizer`.
#
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['data/big.txt'], vocab_size=25000, show_progress=True)

SAVE_PATH = Path('tokenizers')
PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model'
if not PATH.exists():
    PATH.mkdir(parents=True, exist_ok=True)

tokenizer.save_model(str(PATH))

tokenizer = ByteLevelBPETokenizer(vocab_file=str(PATH / 'vocab.json'),
                                  merges_file=str(PATH / 'merges.txt'))

encoded = tokenizer.encode("This is a simple input to be tokenized.")
print("Encoded string: {}".format(encoded.tokens))

decoded = tokenizer.decode(encoded.ids)
print("Decoded string: {}".format(decoded))

# %%
# Useful properties of the Encoding structure when working with transformers:
#   - normalized_str