def test_cannot_train_different_model(self):
    tokenizer = Tokenizer(models.BPE())
    trainer = trainers.UnigramTrainer(show_progress=False)

    with pytest.raises(Exception, match="UnigramTrainer can only train a Unigram"):
        tokenizer.train([], trainer)
def test_can_modify(self):
    trainer = trainers.UnigramTrainer(
        vocab_size=12345,
        show_progress=False,
        special_tokens=["1", AddedToken("2", lstrip=True)],
        initial_alphabet=["a", "b", "c"],
    )

    assert trainer.vocab_size == 12345
    assert trainer.show_progress == False
    assert trainer.special_tokens == [
        AddedToken("1", normalized=False),
        AddedToken("2", lstrip=True, normalized=False),
    ]
    assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]

    # Modify these
    trainer.vocab_size = 20000
    assert trainer.vocab_size == 20000
    trainer.show_progress = True
    assert trainer.show_progress == True
    trainer.special_tokens = []
    assert trainer.special_tokens == []
    trainer.initial_alphabet = ["d", "z"]
    assert sorted(trainer.initial_alphabet) == ["d", "z"]
def train_from_iterator(
    self,
    iterator: Union[Iterator[str], Iterator[Iterator[str]]],
    vocab_size: int = 8000,
    show_progress: bool = True,
    special_tokens: List[Union[str, AddedToken]] = [],
    unk_token: Optional[str] = None,
):
    """
    Train the model using the given iterator

    Args:
        iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
            Any iterator over strings or lists of strings
        vocab_size (:obj:`int`):
            The size of the final vocabulary, including all tokens and alphabet.
        show_progress (:obj:`bool`):
            Whether to show progress bars while training.
        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.
        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.
    """
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        show_progress=show_progress,
        unk_token=unk_token,
    )

    self._tokenizer.train_from_iterator(iterator, trainer=trainer)
def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 8000,
    show_progress: bool = True,
    special_tokens: List[Union[str, AddedToken]] = [],
    unk_token: Optional[str] = None,
):
    """
    Train the model using the given files

    Args:
        files (:obj:`List[str]`):
            A list of paths to the files that we should use for training
        vocab_size (:obj:`int`):
            The size of the final vocabulary, including all tokens and alphabet.
        show_progress (:obj:`bool`):
            Whether to show progress bars while training.
        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.
        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.
    """
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        show_progress=show_progress,
        unk_token=unk_token,
    )

    if isinstance(files, str):
        files = [files]
    self._tokenizer.train(files, trainer=trainer)
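# Usage sketch for the two training entry points above. This assumes they belong
# to a wrapper along the lines of tokenizers' SentencePieceUnigramTokenizer; the
# class name, corpus, and parameter values here are illustrative, not prescribed
# by the snippets above.
from tokenizers import SentencePieceUnigramTokenizer

unigram_tokenizer = SentencePieceUnigramTokenizer()
corpus = ["Beautiful is better than ugly.", "Explicit is better than implicit."]
unigram_tokenizer.train_from_iterator(
    corpus, vocab_size=100, show_progress=False, unk_token="<unk>"
)
print(unigram_tokenizer.encode("Simple is better than complex.").tokens)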
def test_train_with_special_tokens(self):
    filename = "tests/data/dummy-unigram-special_tokens-train.txt"
    with open(filename, "w") as f:
        f.write(
            """
[CLS] The Zen of Python, by Tim Peters [SEP]
[CLS] Beautiful is better than ugly. [SEP]
[CLS] Explicit is better than implicit. [SEP]
[CLS] Simple is better than complex. [SEP]
[CLS] Complex is better than complicated. [SEP]
[CLS] Flat is better than nested. [SEP]
[CLS] Sparse is better than dense. [SEP]
[CLS] Readability counts. [SEP]
[CLS] Special cases aren't special enough to break the rules. [SEP]
[CLS] Although practicality beats purity. [SEP]
[CLS] Errors should never pass silently. [SEP]
[CLS] Unless explicitly silenced. [SEP]
[CLS] In the face of ambiguity, refuse the temptation to guess. [SEP]
[CLS] There should be one-- and preferably only one --obvious way to do it. [SEP]
[CLS] Although that way may not be obvious at first unless you're Dutch. [SEP]
[CLS] Now is better than never. [SEP]
[CLS] Although never is often better than *right* now. [SEP]
[CLS] If the implementation is hard to explain, it's a bad idea. [SEP]
[CLS] If the implementation is easy to explain, it may be a good idea. [SEP]
[CLS] Namespaces are one honking great idea -- let's do more of those! [SEP]
"""
        )

    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(
        show_progress=False,
        special_tokens=["[PAD]", "[SEP]", "[CLS]"],
        unk_token="[UNK]",
    )

    tokenizer.train([filename], trainer=trainer)

    assert tokenizer.encode("[CLS] This is a test [SEP]").tokens == [
        "[CLS]",
        " T",
        "h",
        "i",
        "s",
        " is ",
        "a",
        " ",
        "te",
        "s",
        "t ",
        "[SEP]",
    ]
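# A possible follow-up check, not part of the original test: special tokens
# passed to the trainer should land in the vocabulary and round-trip to stable
# ids, even though they never occur as ordinary words in the corpus.
for tok in ["[PAD]", "[SEP]", "[CLS]"]:
    tok_id = tokenizer.token_to_id(tok)
    assert tok_id is not None
    assert tokenizer.id_to_token(tok_id) == tok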
def train_from_iterator(
    self,
    iterator: Union[Iterator[str], Iterator[Iterator[str]]],
    vocab_size: int = 8000,
    show_progress: bool = True,
    special_tokens: List[Union[str, AddedToken]] = [],
):
    """Train the model using the given iterator"""
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        show_progress=show_progress,
    )

    self._tokenizer.train_from_iterator(iterator, trainer=trainer)
def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 8000,
    show_progress: bool = True,
    special_tokens: List[Union[str, AddedToken]] = [],
):
    """Train the model using the given files"""
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        show_progress=show_progress,
    )

    if isinstance(files, str):
        files = [files]
    self._tokenizer.train(files, trainer=trainer)
def train_from_iterator(
    self,
    iterator: Union[Iterator[str], Iterator[Iterator[str]]],
    vocab_size: int = 8000,
    show_progress: bool = True,
):
    """Train the model using the given iterator"""
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=self.special_tokens_list,
        show_progress=show_progress,
    )

    self._tokenizer.train_from_iterator(iterator, trainer=trainer)
    self.add_unk_id()
def get_tokenizer_trainer():
    # START init_tokenizer_trainer
    from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

    tokenizer = Tokenizer(models.Unigram())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.UnigramTrainer(
        vocab_size=20000,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    )
    # END init_tokenizer_trainer
    trainer.show_progress = False
    return tokenizer, trainer
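# Minimal usage sketch for the helper above. The two-sentence corpus is
# illustrative only; a vocab_size of 20000 assumes a much larger corpus, and the
# trainer will simply stop early with a smaller vocabulary on tiny inputs.
tokenizer, trainer = get_tokenizer_trainer()
corpus = ["Flat is better than nested.", "Sparse is better than dense."]
tokenizer.train_from_iterator(corpus, trainer=trainer)
print(tokenizer.encode("Readability counts.").ids)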
def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 8000,
    show_progress: bool = True,
):
    """Train the model using the given files"""
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=self.special_tokens_list,
        show_progress=show_progress,
    )

    if isinstance(files, str):
        files = [files]
    self._tokenizer.train(files, trainer=trainer)
    self.add_unk_id()
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer


ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)

tokenizer.train_from_iterator(sentences, trainer=trainer)
tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)
tokenizer.save('bert_out/test2')
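# A raw tokenizers.Tokenizer has no save_pretrained(); that method belongs to the
# transformers tokenizer classes. A minimal sketch (assumes the transformers
# package is installed) that wraps the trained tokenizer so it can be saved in
# the Hugging Face format at the 'bert_out/test' path used above:
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
hf_tokenizer.save_pretrained('bert_out/test')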