def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             unk_token: str = "<unk>",
             replacement: str = "▁",
             add_prefix_space: bool = True,
             dropout: Optional[float] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.add_special_tokens([unk_token])

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab: Optional[str] = None,
    replacement: str = "▁",
    add_prefix_space: bool = True,
):
    if vocab is not None:
        # Let Unigram(..) fail if the vocab is invalid
        tokenizer = Tokenizer(Unigram(vocab))
    else:
        tokenizer = Tokenizer(Unigram())

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.Nmt(),
        normalizers.NFKC(),
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
    ])
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceUnigram",
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
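For reference, a minimal sketch of training this Unigram setup from scratch on the raw Tokenizer; the trainer arguments and the two-sentence corpus are illustrative, not taken from the snippet above.

from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC()])
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
])
# vocab_size and the unk token are example values
trainer = UnigramTrainer(vocab_size=1000, special_tokens=["<unk>"], unk_token="<unk>")
tokenizer.train_from_iterator(["hello world", "hello sentencepiece"], trainer=trainer)
print(tokenizer.encode("hello world").tokens)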
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
):
    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
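A hypothetical usage sketch, assuming this __init__ belongs to a SentencePieceBPETokenizer(BaseTokenizer) class and a recent tokenizers build where BPE accepts a merge list; the toy vocab and merges are made up.

vocab = {"<unk>": 0, "▁": 1, "a": 2, "b": 3, "ab": 4, "▁ab": 5}
merges = [("a", "b"), ("▁", "ab")]  # merge rules applied in order during BPE

tok = SentencePieceBPETokenizer(vocab=vocab, merges=merges)
# Metaspace turns "ab ab" into the pre-tokens ["▁ab", "▁ab"], and the
# two merge rules rebuild each piece from its characters.
print(tok.encode("ab ab").tokens)  # ['▁ab', '▁ab']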
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    no_consecutive_space: bool = True,
    dropout: Optional[float] = None,
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    separate_numbers: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
    special_chars: str = SPECIAL_CHARS,
    zh_norm: bool = True,
):
    # Note: separate_numbers, special_chars, zh_norm and no_consecutive_space
    # are not part of stock `tokenizers`; this snippet targets a patched build.
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(clean_text=clean_text,
                       handle_chinese_chars=handle_chinese_chars,
                       separate_numbers=separate_numbers,
                       strip_accents=strip_accents,
                       lowercase=lowercase,
                       special_chars=special_chars,
                       zh_norm=zh_norm),
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "no_consecutive_space": no_consecutive_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
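The fork-specific flags aside, the stock BertNormalizer half of that normalizer chain can be checked standalone:

from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(lowercase=True, strip_accents=True)
print(norm.normalize_str("Héllo World"))  # 'hello world'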
def from_spm(filename: str):
    try:
        import sys

        sys.path.append(".")

        import sentencepiece_model_pb2 as model
    except Exception:
        raise Exception(
            "You don't seem to have the required protobuf file. In order to use this "
            "function you need to run `pip install protobuf` and "
            "`wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` "
            "so we can read the internals of your spm file. "
            "`pip install sentencepiece` is not required."
        )

    m = model.ModelProto()
    with open(filename, "rb") as f:
        m.ParseFromString(f.read())

    precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
    vocab = [(piece.piece, piece.score) for piece in m.pieces]
    unk_id = m.trainer_spec.unk_id
    model_type = m.trainer_spec.model_type
    if model_type != 1:
        raise Exception(
            "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
        )

    data = {"unk_id": unk_id, "vocab": vocab}

    replacement = "▁"
    add_prefix_space = True

    # Dump the extracted vocab to a temporary JSON file that Unigram(..) can load.
    out_vocab_filename = f"{filename}.json"
    try:
        with open(out_vocab_filename, "w") as f:
            json.dump(data, f, indent=4)
        tokenizer = Tokenizer(Unigram(out_vocab_filename))
    finally:
        os.remove(out_vocab_filename)

    tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
    ])
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceUnigram",
    }

    # Build the wrapper without re-running __init__'s from-scratch setup.
    obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
    BaseTokenizer.__init__(obj, tokenizer, parameters)
    return obj
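Assuming from_spm is exposed as a static method on SentencePieceUnigramTokenizer, converting an existing SentencePiece unigram model looks like this (the model filename is illustrative):

# Prerequisite: sentencepiece_model_pb2.py is on sys.path (see the error
# message above for how to fetch it).
tok = SentencePieceUnigramTokenizer.from_spm("spiece.model")
print(tok.encode("hello world").tokens)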
def __init__(
    self,
    replacement: str = "▁",
    add_prefix_space: bool = True,
    unk_token: Union[str, AddedToken] = "<unk>",
    eos_token: Union[str, AddedToken] = "</s>",
    pad_token: Union[str, AddedToken] = "<pad>",
):
    self.special_tokens = {
        "pad": {"id": 0, "token": pad_token},
        "eos": {"id": 1, "token": eos_token},
        "unk": {"id": 2, "token": unk_token},
    }

    self.special_tokens_list = [None] * len(self.special_tokens)
    for token_dict in self.special_tokens.values():
        self.special_tokens_list[token_dict["id"]] = token_dict["token"]

    tokenizer = Tokenizer(Unigram())

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.Nmt(),
        normalizers.NFKC(),
        normalizers.Replace(Regex(" {2,}"), " "),
        normalizers.Lowercase(),
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
        pre_tokenizers.Digits(individual_digits=True),
        pre_tokenizers.Punctuation(),
    ])
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {self.special_tokens['eos']['token']}",
        special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
    )

    parameters = {
        "model": "SentencePieceUnigram",
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
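The TemplateProcessing step above is what appends the eos token to every encoded sequence. A minimal self-contained illustration on a toy WordLevel model (the vocab below is made up):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

toy = Tokenizer(WordLevel(
    {"<pad>": 0, "</s>": 1, "<unk>": 2, "hello": 3, "world": 4},
    unk_token="<unk>"))
toy.pre_tokenizer = Whitespace()
toy.post_processor = TemplateProcessing(
    single="$A </s>",              # same "$A <eos>" template as above
    special_tokens=[("</s>", 1)],  # the (token, id) pair must match the vocab
)
print(toy.encode("hello world").tokens)  # ['hello', 'world', '</s>']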
def converted(self) -> Tokenizer:
    tokenizer = self.tokenizer(self.proto)

    # Tokenizer assembly
    tokenizer.normalizer = self.normalizer(self.proto)

    replacement = "▁"
    add_prefix_space = True
    tokenizer.pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)
    post_processor = self.post_processor()
    if post_processor:
        tokenizer.post_processor = post_processor

    return tokenizer
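The converter hard-codes the SentencePiece Metaspace convention. A quick standalone check of what that pre-tokenizer/decoder pair does, independent of the converter class:

from tokenizers import pre_tokenizers, decoders

pre = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
pieces = [piece for piece, _ in pre.pre_tokenize_str("Hello world")]
print(pieces)  # ['▁Hello', '▁world']

dec = decoders.Metaspace(replacement="▁", add_prefix_space=True)
print(dec.decode(pieces))  # 'Hello world'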
def __init__(
    self,
    vocab: Union[str, List],
    merges: Union[str, None],
    unk_token: str = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
    normalize: bool = True,
):
    if merges:
        n_model = "BPE"
        tokenizer = Tokenizer(
            BPE(
                vocab,  # type: ignore
                merges,
                unk_token=unk_token,
                dropout=dropout,  # recorded in `parameters` below, so forward it to the model too
                fuse_unk=True,
            ))
    else:
        n_model = "Unigram"
        tokenizer = Tokenizer(Unigram(vocab, 1))  # type: ignore  # unk_id is hard-coded to 1

    if normalize:
        tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    parameters = {
        "model": f"SentencePiece{n_model}",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
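Hypothetical usage of the Unigram branch; the class name SentencePieceTokenizer and the toy (piece, score) vocab are made up. Because unk_id is hard-coded to 1, the unknown token has to sit at index 1 of the list:

unigram_vocab = [
    ("<pad>", 0.0),
    ("<unk>", 0.0),    # index 1, matching the hard-coded unk_id
    ("▁hello", -1.0),
    ("▁world", -1.5),
]
tok = SentencePieceTokenizer(vocab=unigram_vocab, merges=None)
print(tok.encode("hello world").tokens)  # ['▁hello', '▁world']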
def converted(self):
    tokenizer = self.tokenizer(self.proto)

    # Tokenizer assembly
    tokenizer.normalizer = self.normalizer(self.proto)

    replacement = "▁"
    add_prefix_space = True
    tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)
    post_processor = self.post_processor(tokenizer)
    if post_processor:
        tokenizer.post_processor = post_processor

    # TODO: what parameters should we give?
    parameters = {}

    return BaseTokenizer(tokenizer, parameters)
def __init__(self,
             vocab: Dict[str, int] = None,
             merges: List[Tuple[str, str]] = None,
             dropout: float = None,
             max_length: Optional[int] = 64) -> None:
    """Constructor

    Args:
        vocab (Dict[str, int]): A dictionary of string keys and their ids.
        merges (List[Tuple[str, str]]): A list of pairs of tokens.
        dropout (float): BPE dropout.
        max_length (int, optional): The max length at which to truncate.
            Defaults to `64`.
    """
    self.tokenizer = Tokenizer(
        BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
    self.tokenizer.normalizer = BertNormalizer()  # noqa
    self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
    self.tokenizer.decoder = decoders.Metaspace()  # noqa
    self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])
    self.tokenizer.enable_padding(pad_token=self.pad_token)
    self.tokenizer.enable_truncation(max_length)
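A standalone sketch of the padding/truncation mechanics this constructor enables, on a toy WordLevel vocab (illustrative only; note pad_id defaults to 0, which matches "<pad>" here):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

t = Tokenizer(WordLevel({"<pad>": 0, "<unk>": 1, "a": 2, "b": 3}, unk_token="<unk>"))
t.pre_tokenizer = Whitespace()
t.enable_padding(pad_token="<pad>")  # pad each batch to its longest member
t.enable_truncation(max_length=2)    # truncate anything longer than 2
out = t.encode_batch(["a b a", "a"])
print([e.tokens for e in out])  # [['a', 'b'], ['a', '<pad>']]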
import numpy as np
from pathlib import Path

from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.pre_tokenizers import ByteLevel, Metaspace, Whitespace
from tokenizers.trainers import (BpeTrainer, UnigramTrainer, WordLevelTrainer,
                                 WordPieceTrainer)
from transformers import PreTrainedTokenizerFast


def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Build a tokenizer using the HuggingFace `tokenizers` library. The pipeline is:

    - Model: the algorithm that tokenizes; the one mandatory component. Only
      four models are implemented (BPE, Unigram, WordLevel, WordPiece).
    - Normalizer: optional preprocessing applied before anything else.
    - Pre-Tokenizer: splits the input according to some rules.
    - Post-Processing: adds tokens around the output (mostly eos/bos tokens).
    - Decoder: reverses the pre-tokenization steps for proper decoding.
    - Trainer: the training algorithm matching the model.

    Note: some preprocessing might need to happen beforehand in previous
    functions (might be easier using pandas).

    Input
        dataset (iterable)       : a Python iterable over the training data
        token_model (str)        : algorithm to use for tokenization
        tknzr_file (str)         : filename to save the trained tokenizer to;
                                   overwrites a previously saved file
        vocab_size (int)         : size of the vocabulary to use
        vocab (list of str)      : models other than BPE can take an optional vocab as input
        max_input_chars_per_word : used for WordPiece
    Output
        tokenizer : HuggingFace Tokenizer object, the fully trained tokenizer
    """
    special_token_lst = [pad_token, bos_token, eos_token, mask_token, unk_token]

    # NFKC could be appended here if normalization is wanted
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        raise SystemExit(
            f'Error: token_model ({token_model}) not an algorithm in {VALID_TOKENIZATIONS}')

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence([fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        # pair=bos_token + " $A " + eos_token + " $B:1 " + eos_token + ":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning: overwriting previously saved tokenizer with the same filename ({tknzr_file}).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # The slow `PreTrainedTokenizer` base class cannot load a
        # `tokenizer_file`, so fall back to the fast wrapper here as well.
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
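A hypothetical end-to-end call; the dataset, filename, and token strings are all made up, and VALID_TOKENIZATIONS is assumed to be defined elsewhere in the module:

dataset = ["hello world", "the quick brown fox", "hello again"]
tknzr = train_custom_tokenizer(
    dataset=dataset,
    token_model='BPE',
    tknzr_file='custom_tokenizer.json',
    vocab_size=100,
    pad_token='<pad>',
    bos_token='<s>',
    eos_token='</s>',
    mask_token='<mask>',
    unk_token='<unk>',
)
# The post-processor wraps every sequence as '<s> ... </s>'.
print(tknzr('hello world')['input_ids'])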
def __init__(
    self,
    vocab: Union[str, List],
    merges: List[Tuple[str, str]],
    bos_token: str = "<s>",
    eos_token: str = "</s>",
    sep_token: str = "</s>",
    cls_token: str = "<s>",
    pad_token: str = "<pad>",
    unk_token: str = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
    normalize: bool = True,
):
    bpe = BPE(
        vocab=vocab,
        merges=merges,
        unk_token=unk_token,
        fuse_unk=True,
    )
    tokenizer = Tokenizer(bpe)
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    if normalize:
        tokenizer.normalizer = NFKC()

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }
    super().__init__(tokenizer, parameters)

    bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
    eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
    sep_token = AddedToken(sep_token, lstrip=False, rstrip=False)
    cls_token = AddedToken(cls_token, lstrip=False, rstrip=False)
    unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)
    pad_token = AddedToken(pad_token, lstrip=False, rstrip=False)

    self.add_special_tokens([
        bos_token,
        eos_token,
        sep_token,
        cls_token,
        unk_token,
        pad_token,
    ])
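Hypothetical usage showing the effect of fuse_unk=True; the class name and the toy vocab/merges are made up:

vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3,
         "▁": 4, "a": 5, "b": 6, "ab": 7, "▁ab": 8}
merges = [("a", "b"), ("▁", "ab")]
tok = SentencePieceBPETokenizer(vocab=vocab, merges=merges)

print(tok.encode("ab").tokens)   # ['▁ab']
# 'x', 'y', 'z' are all out-of-vocabulary; fuse_unk collapses the
# whole run into a single '<unk>' instead of three.
print(tok.encode("xyz").tokens)  # ['▁', '<unk>']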