def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    # # Let the tokenizer know about special tokens if they are part of the vocab
    # if tokenizer.token_to_id(str(self.original_tokenizer.unk_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.unk_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.sep_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.sep_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.cls_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.cls_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.pad_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.pad_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.mask_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.mask_token)])

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
        pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def test_processing(self, roberta_files):
    tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

    # Keeps original offsets
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

    # Trims offsets when activated
    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
    tokenizer = Tokenizer(tokenizer)

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)),
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    replacement: str = "▁",
    add_prefix_space: bool = True,
    unk_token: Union[str, AddedToken] = "<unk>",
    eos_token: Union[str, AddedToken] = "</s>",
    pad_token: Union[str, AddedToken] = "<pad>",
):
    self.special_tokens = {
        "pad": {"id": 0, "token": pad_token},
        "eos": {"id": 1, "token": eos_token},
        "unk": {"id": 2, "token": unk_token},
    }

    self.special_tokens_list = [None] * len(self.special_tokens)
    for token_dict in self.special_tokens.values():
        self.special_tokens_list[token_dict["id"]] = token_dict["token"]

    tokenizer = Tokenizer(Unigram())

    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Nmt(),
            normalizers.NFKC(),
            normalizers.Replace(Regex(" {2,}"), " "),
            normalizers.Lowercase(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
            pre_tokenizers.Digits(individual_digits=True),
            pre_tokenizers.Punctuation(),
        ]
    )
    tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {self.special_tokens['eos']['token']}",
        special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
    )

    parameters = {
        "model": "SentencePieceUnigram",
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
def from_spm(filename: str):
    try:
        import sys

        sys.path.append(".")

        import sentencepiece_model_pb2 as model
    except Exception:
        raise Exception(
            "You don't seem to have the required protobuf file. In order to use this function you need to run "
            "`pip install protobuf` and "
            "`wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` "
            "so we can read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
        )

    m = model.ModelProto()
    m.ParseFromString(open(filename, "rb").read())

    precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
    vocab = [(piece.piece, piece.score) for piece in m.pieces]
    unk_id = m.trainer_spec.unk_id
    model_type = m.trainer_spec.model_type
    if model_type != 1:
        raise Exception(
            "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
        )

    replacement = "▁"
    add_prefix_space = True

    tokenizer = Tokenizer(Unigram(vocab, unk_id))
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Precompiled(precompiled_charsmap),
            normalizers.Replace(Regex(" {2,}"), " "),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )

    parameters = {
        "model": "SentencePieceUnigram",
    }

    obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
    BaseTokenizer.__init__(obj, tokenizer, parameters)
    return obj
def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    question = str(self.original_tokenizer.question_token)
    dot = "."
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id
    question_token_id = self.original_tokenizer.question_token_id
    dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".")

    if self.original_tokenizer.padding_side == "right":
        pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
    else:
        pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=pair,
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
            (question, question_token_id),
            (dot, dot_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def test_encode_add_special_tokens(self, roberta_files):
    tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    # Can encode with special tokens
    output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
    assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]

    # Can encode without special tokens
    output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
    assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(trainer, files)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")
    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                unk_token=unk_token,
                end_of_word_suffix=suffix,
            )
        )
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    # OpenAI normalization is the same as Bert
    normalizers += [BertNormalizer()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = BPEDecoder(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train(trainer, [filename])

    os.remove(filename)
    return tokenizer
def get_tokenizer_trainer():
    # START init_tokenizer_trainer
    from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

    tokenizer = Tokenizer(models.Unigram())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()  # fixed: the attribute is `decoder`, not `decoders`

    trainer = trainers.UnigramTrainer(
        vocab_size=20000,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    )
    # END init_tokenizer_trainer
    trainer.show_progress = False
    return tokenizer, trainer
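# Hypothetical usage sketch for the Unigram tokenizer/trainer pair above (not part of the
# original snippet): train from an in-memory corpus and encode a sentence. The
# `sample_corpus` list is an assumption used purely for illustration.
def demo_unigram_training():
    tokenizer, trainer = get_tokenizer_trainer()
    sample_corpus = ["hello world", "a tiny byte-level unigram example"]
    # train_from_iterator accepts any iterator of strings
    tokenizer.train_from_iterator(sample_corpus, trainer=trainer)
    encoding = tokenizer.encode("hello world")
    return encoding.tokens, encoding.ids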
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             add_prefix_space: bool = False,
             do_lowercase: bool = False,
             unicode_normalizer: Optional[str] = None,
             continuing_subword_prefix: Optional[str] = None,
             end_of_word_suffix: Optional[str] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or ""))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if do_lowercase:
        normalizers += [Lowercase.new()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence.new(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel.new()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab: Union[str, List],
    merges: Union[str, None],
    unk_token: str = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
    normalize: bool = True,
):
    if merges:
        n_model = "BPE"
        tokenizer = Tokenizer(
            BPE(
                vocab,  # type: ignore
                merges,
                unk_token=unk_token,
                fuse_unk=True,
            )
        )
    else:
        n_model = "Unigram"
        tokenizer = Tokenizer(Unigram(vocab, 1))  # type: ignore

    if normalize:
        tokenizer.normalizer = NFKC()

    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )

    parameters = {
        "model": f"SentencePiece{n_model}",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def main(args):
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞', 'b̤', 'b̥', 'c', 'd',
        'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z', 'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞',
        'f', 'fʲ', 'fʷ', 'fː', 'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥',
        'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː', 'k̟ʲ', 'k̟̚', 'k͡p̚', 'l',
        'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ', 'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː',
        'o̞', 'o̥', 'p', 'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː', 's',
        'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ', 'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ',
        'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚', 't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u',
        'uə', 'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y', 'z', 'zj', 'zʲ',
        'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m', 'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ',
        'ɓ', 'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤', 'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥', 'ɣ', 'ɣj',
        'ɤ', 'ɤɐ̞', 'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ', 'ɸ',
        'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ', 'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ',
        'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ', 'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ',
        'ә', 'ḁ'
    ]

    ipa1, ipa2, ipa3 = ipa0.copy(), ipa0.copy(), ipa0.copy()
    random.shuffle(ipa1)
    random.shuffle(ipa2)
    random.shuffle(ipa3)

    # randomly joined to form training data
    passage0 = ' '.join(ipa0)
    passage1 = ' '.join(ipa1)
    passage2 = ' '.join(ipa2)
    passage3 = ' '.join(ipa3)
    data = [passage0, passage1, passage2, passage3]

    # setup
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    # trainer = WordLevelTrainer(vocab_size=300, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.pre_tokenizer = Whitespace()

    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             add_prefix_space: bool = False):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.normalizer = NFKC.new()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel.new()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()

    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)

    if save_tokenizer:
        tokenizer.save(tokenizer_filename)

    return tokenizer
def __init__(
    self,
    vocab_file,
    sep_token="<sep>",
    cls_token="<cls>",
    pad_token="<pad>",
    mask_token="<mask>",
    unk_token="<unk>",  # added: `unk_token` was used below but missing from the signature; "<unk>" is an assumed default
    lowercase: bool = True,
):
    tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
    tokenizer.normalizer = Strip()
    tokenizer.pre_tokenizer = CharDelimiterSplit(" ")

    tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    unk_token = self.original_tokenizer.unk_token

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="</w>",
            fuse_unk=False,
            unk_token=str(unk_token),
        )
    )

    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.Split(
                Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""),
                behavior="removed",
                invert=True,
            ),
            pre_tokenizers.ByteLevel(add_prefix_space=False),
        ]
    )
    tokenizer.decoder = decoders.ByteLevel()

    # Hack to get both a ByteLevel decoder and a TemplateProcessing-style post-processor
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id),
        cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id),
        add_prefix_space=False,
        trim_offsets=False,
    )
    return tokenizer
def get_tokenizer_trainer():
    # START init_tokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    tokenizer = Tokenizer(BPE())
    # END init_tokenizer
    # START init_trainer
    from tokenizers.trainers import BpeTrainer

    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    # END init_trainer
    # START init_pretok
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer.pre_tokenizer = Whitespace()
    # END init_pretok
    return tokenizer, trainer
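# Hypothetical usage sketch for the quicktour-style BPE tokenizer/trainer above (not part of
# the original snippet). The wikitext file paths are an assumption; point them at any local
# plain-text files before running.
def demo_bpe_training():
    tokenizer, trainer = get_tokenizer_trainer()
    files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
    tokenizer.train(files, trainer)
    output = tokenizer.encode("Hello, y'all! How are you?")
    return output.tokens, output.ids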
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        )
    )

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=self.original_tokenizer.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

    return tokenizer
def test_train_parallelism_with_custom_pretokenizer(self, train_files):
    class GoodCustomPretok:
        def split(self, n, normalized):
            # Here we just test that we can return a List[NormalizedString], it
            # does not really make sense to return twice the same otherwise
            return [normalized, normalized]

        def pre_tokenize(self, pretok):
            pretok.split(self.split)

    custom = pre_tokenizers.PreTokenizer.custom(GoodCustomPretok())
    bpe_tokenizer = Tokenizer(models.BPE())
    bpe_tokenizer.normalizer = normalizers.Lowercase()
    bpe_tokenizer.pre_tokenizer = custom

    if "TOKENIZERS_PARALLELISM" in os.environ:
        del os.environ["TOKENIZERS_PARALLELISM"]

    trainer = trainers.BpeTrainer(special_tokens=["<unk>"], show_progress=False)
    bpe_tokenizer.train([train_files["small"]], trainer=trainer)
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
def build_hf_tokenizer(kmer_length: int, kmer_stride: int, alphabet: str,
                       unk_token: str = "?") -> Tokenizer:
    """Build a full huggingface tokenizer from the inputs.

    Note: takes the same arguments as KmerPreTokenizer.
    """
    kmer_pre = KmerPreTokenizer(
        kmer_length=kmer_length, kmer_stride=kmer_stride, alphabet=alphabet, unk_token=unk_token)
    tokenizer = Tokenizer(
        models.WordLevel(vocab=kmer_pre.kmer_tokenzier.vocab.stoi, unk_token=unk_token))
    tokenizer.pre_tokenizer = PreTokenizer.custom(kmer_pre)
    tokenizer.decoder = ByteLevel()
    return tokenizer
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            src,
            desc=colorful.render('<r>[*]</r> tokenize sentences with '
                                 '<g>WordPiece</g> model'),
            total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainders and write to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing

    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    from tokenizers import decoders  # import added: `decoders` was used below without an import

    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)

    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
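# Hypothetical usage sketch (not part of the original snippet): reload the tokenizer saved by
# train_wordpiece_bert() and run an encode/decode round trip. Assumes the same DIR_TOKENIZERS
# constant and save path used above.
def demo_load_and_encode():
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')
    enc = tok.encode("Welcome to the tokenizers library.")
    text = tok.decode(enc.ids)
    return enc.tokens, text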
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
    fuse_unk: Optional[bool] = False,
):
    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1",  # MPNet uses two [SEP] tokens
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    from .models.roformer.tokenization_utils import JiebaPreTokenizer

    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JiebaPreTokenizer(vocab))

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    unk_token = self.original_tokenizer.unk_token

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            unk_token=str(unk_token),
            end_of_word_suffix="</w>",
            fuse_unk=False,
        )
    )

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

    return tokenizer