def test_get_set_components(self):
    toki = Tokenizer(models.BPE())
    toki.normalizer = normalizers.NFC()
    toki.pre_tokenizer = pre_tokenizers.ByteLevel()
    toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
    toki.decoder = decoders.ByteLevel()

    tokenizer = BaseTokenizer(toki)

    # The getters should expose the components set on the wrapped Tokenizer
    assert isinstance(tokenizer.model, models.BPE)
    assert isinstance(tokenizer.normalizer, normalizers.NFC)
    assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
    assert isinstance(tokenizer.post_processor, processors.BertProcessing)
    assert isinstance(tokenizer.decoder, decoders.ByteLevel)

    # The setters should replace each component in place
    tokenizer.model = models.Unigram()
    assert isinstance(tokenizer.model, models.Unigram)

    tokenizer.normalizer = normalizers.NFD()
    assert isinstance(tokenizer.normalizer, normalizers.NFD)

    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)

    tokenizer.post_processor = processors.ByteLevel()
    assert isinstance(tokenizer.post_processor, processors.ByteLevel)

    tokenizer.decoder = decoders.WordPiece()
    assert isinstance(tokenizer.decoder, decoders.WordPiece)
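# Minimal sketch of the imports the snippets in this section assume; these are
# the standard public entry points of the `tokenizers` package, although the
# original modules presumably place them at the top of their own files:
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
from tokenizers.implementations import BaseTokenizer
from tokenizers.models import BPE, WordLevel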
def converted(self) -> Tokenizer:
    tokenizer_info_str = "#version:"
    token_suffix = "</w>"

    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    # Drop the "#version:" header entry that some merges files carry as their first line
    if tokenizer_info_str in merges[0][0]:
        merges = merges[1:]

    tokenizer = Tokenizer(
        BPE(
            vocab,
            merges,
            dropout=None,
            unk_token=self.original_tokenizer.unk_token,
            end_of_word_suffix=token_suffix,
        )
    )

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False, strip_accents=False)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix)
    tokenizer.post_processor = processors.BertProcessing(
        sep=(self.original_tokenizer.sep_token, self.original_tokenizer.sep_token_id),
        cls=(self.original_tokenizer.cls_token, self.original_tokenizer.cls_token_id),
    )

    return tokenizer
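# Hedged usage sketch: a converter like the one above is usually not called by
# hand; the `Tokenizer` it returns can be wrapped into a fast tokenizer via
# `PreTrainedTokenizerFast(tokenizer_object=...)` from transformers. The
# converter class name and `slow_tokenizer` below are placeholders:
#
#     from transformers import PreTrainedTokenizerFast
#
#     backend = MyConverter(slow_tokenizer).converted()
#     fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend)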
def __init__(
    self,
    target_vocab,
):
    special_tokens = {
        "pad_token": "[PAD]",
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
    }

    vocab = {}
    vocab[special_tokens["pad_token"]] = 0
    tkn_idx = 1
    unused_ctr = 0

    # not sure whether that's relevant, but fill 1..99 and 104..999
    # with unused tokens to keep BERT's tokenizer style
    # as a result, one can easily identify special tokens:
    #   0 is padding
    #   1xx are other special tokens
    #   any four-digit tokens are actual payload
    fill_tokens = False
    if fill_tokens:
        while tkn_idx < 100:
            vocab[f"[unused{unused_ctr}]"] = tkn_idx
            tkn_idx += 1
            unused_ctr += 1

    for token in ["unk_token", "cls_token", "sep_token", "mask_token"]:
        vocab[special_tokens[token]] = tkn_idx
        tkn_idx += 1

    if fill_tokens:
        while tkn_idx < 1000:
            vocab[f"[unused{unused_ctr}]"] = tkn_idx
            tkn_idx += 1
            unused_ctr += 1

    for word in target_vocab:
        vocab[word] = tkn_idx
        tkn_idx += 1

    tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"]))
    tokenizer.add_special_tokens(list(special_tokens.values()))
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    sep_token_id = tokenizer.token_to_id(special_tokens["sep_token"])
    cls_token_id = tokenizer.token_to_id(special_tokens["cls_token"])
    tokenizer.post_processor = processors.BertProcessing(
        (special_tokens["sep_token"], sep_token_id),
        (special_tokens["cls_token"], cls_token_id),
    )

    parameters = special_tokens
    parameters["model"] = "WordLevel"
    super().__init__(tokenizer, parameters)

    tokenizer.save(PRETRAINED_TOKENIZER_FILE)
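# Hedged usage sketch, assuming this __init__ belongs to a BaseTokenizer
# subclass (hypothetically named WordLevelBertTokenizer here) and that
# PRETRAINED_TOKENIZER_FILE points to a writable .json path:
#
#     tokenizer = WordLevelBertTokenizer(target_vocab=["hello", "world"])
#     output = tokenizer.encode("hello world")
#     # BertProcessing wraps the sequence in [CLS] ... [SEP]
#     print(output.tokens)  # ['[CLS]', 'hello', 'world', '[SEP]']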