def test_train_from_iterator(self):
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)

    output = tokenizer.encode("A sentence")
    assert output.tokens == ["a", "sentence"]
from tokenizers import BertWordPieceTokenizer


def load_from_dataset_bert_tokenizer(
    dataset_name="wikitext", dataset_config_name="wikitext-2-raw-v1", vocab_size=30000
):
    """
    Adapted from: https://github.com/huggingface/tokenizers/tree/master/bindings/python/examples

    If used frequently, save the trained model to avoid retraining on every call.
    tokenizers 0.10.0 is required to train from a dataset iterator; the stable
    releases of Hugging Face transformers and datasets do not support it yet.
    """
    from datasets import load_dataset

    tokenizer = BertWordPieceTokenizer(
        strip_accents=True,
        # the following arguments are all the defaults, listed for clarity
        clean_text=True,
        handle_chinese_chars=True,
        lowercase=True,
    )

    dataset = load_dataset(dataset_name, dataset_config_name)

    # Build an iterator that yields batches of raw text from the training split
    def batch_iterator():
        batch_length = 1000
        for i in range(0, len(dataset["train"]), batch_length):
            yield dataset["train"][i : i + batch_length]["text"]

    # Train the WordPiece model
    tokenizer.train_from_iterator(
        batch_iterator(),
        length=len(dataset["train"]),
        # the following arguments are all the defaults, listed for clarity
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )
    return tokenizer
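# A minimal usage sketch, assuming the function above plus the tokenizers and
# datasets packages. The output directory "wordpiece-wikitext2" and the prefix
# "bert-wordpiece" are illustrative names, not part of the original snippet;
# save_model writes the learned vocab.txt to the given directory.
if __name__ == "__main__":
    import os

    tokenizer = load_from_dataset_bert_tokenizer(
        dataset_name="wikitext",
        dataset_config_name="wikitext-2-raw-v1",
        vocab_size=30000,
    )
    os.makedirs("wordpiece-wikitext2", exist_ok=True)
    tokenizer.save_model("wordpiece-wikitext2", "bert-wordpiece")
    print(tokenizer.encode("A sentence trained from wikitext").tokens)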
from tokenizers import BertWordPieceTokenizer


def train_tokenizer(file_iterator):
    # Initialize an empty WordPiece tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )

    # And then train it from the iterator
    tokenizer.train_from_iterator(
        file_iterator,
        vocab_size=1000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    # Save the vocabulary files (`args` is the enclosing script's parsed CLI arguments)
    tokenizer.save_model(args.out, args.name)
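# A minimal driver sketch, assuming the `args` global that train_tokenizer reads
# comes from argparse as in the original script. The flag names (--files, --out,
# --name), the glob default, and the file_iterator helper are hypothetical,
# added only to make the example runnable end to end.
import argparse
import glob
import os


def file_iterator(paths, batch_size=1000):
    # Yield batches of raw lines so training never holds whole files in memory
    batch = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                batch.append(line.rstrip("\n"))
                if len(batch) >= batch_size:
                    yield batch
                    batch = []
    if batch:
        yield batch


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--files", default="data/*.txt", help="glob of training text files")
    parser.add_argument("--out", default="out", help="directory to write the vocab into")
    parser.add_argument("--name", default="bert-wordpiece", help="prefix for the saved vocab file")
    args = parser.parse_args()

    os.makedirs(args.out, exist_ok=True)
    train_tokenizer(file_iterator(glob.glob(args.files)))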