from typing import Optional

from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence


def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                unk_token=unk_token,
                end_of_word_suffix=suffix,
            )
        )
    else:
        # No vocab/merges given: start from an empty model (e.g. for training).
        tokenizer = Tokenizer(BPE.empty())

    # NFKC-normalize and lowercase the input, split on whitespace, and strip
    # the end-of-word suffix back out when decoding.
    tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
    tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
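# A minimal usage sketch for the tokenizer configured above. Assumptions, not
# confirmed by the fragment: this __init__ belongs to the character-level BPE
# implementation shipped with huggingface/tokenizers (exposed in recent
# releases as CharBPETokenizer), and "vocab.json"/"merges.txt" are placeholder
# paths to a trained vocabulary. Note that the code above uses an older API
# (BPE.from_files / .new() constructors); recent releases build the same
# objects directly, e.g. BPE.from_file(...) and NFKC().
from tokenizers import CharBPETokenizer

char_bpe = CharBPETokenizer("vocab.json", "merges.txt")
encoded = char_bpe.encode("Hello, world!")
# Exact tokens depend on the trained merges; the NFKC + Lowercase normalizer
# means "Hello" and "hello" encode identically, and each word ends with the
# "</w>" suffix, e.g. ['hello</w>', ',</w>', 'world</w>', '!</w>'].
print(encoded.tokens)
print(char_bpe.decode(encoded.ids))  # the BPEDecoder strips the suffix again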
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: str = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
        )
    else:
        # No vocab/merges given: start from an empty model (e.g. for training).
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.add_special_tokens([unk_token])

    # NFKC-normalize, then replace spaces with the "▁" metaspace character so
    # word boundaries survive inside the tokens; the decoder reverses this.
    tokenizer.normalizer = NFKC.new()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    tokenizer.decoder = decoders.Metaspace.new(
        replacement=replacement, add_prefix_space=add_prefix_space
    )

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
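# A minimal usage sketch, under the same assumptions as above (the modern name
# for this implementation is SentencePieceBPETokenizer, and the vocab/merges
# paths are placeholders).
from tokenizers import SentencePieceBPETokenizer

sp_bpe = SentencePieceBPETokenizer("vocab.json", "merges.txt")
encoded = sp_bpe.encode("Hello world")
# With a trained vocab, tokens carry the metaspace marker, e.g.
# ['▁Hello', '▁world']; casing is preserved since only NFKC is applied.
print(encoded.tokens)
# Decoding turns "▁" back into spaces, so the round trip is lossless.
print(sp_bpe.decode(encoded.ids))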
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    add_prefix_space: bool = False,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
    else:
        # No vocab/merges given: start from an empty model (e.g. for training).
        tokenizer = Tokenizer(BPE.empty())

    # NFKC-normalize, then map the input onto the byte-level alphabet; every
    # byte has a token, so no unk_token is needed.
    tokenizer.normalizer = NFKC.new()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel.new()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
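# A minimal usage sketch (assuming the modern class name ByteLevelBPETokenizer
# and placeholder vocab/merges paths). Byte-level pre-tokenization maps every
# input byte to a printable unicode character, so any string can be encoded
# without an <unk> token and decoding is lossless.
from tokenizers import ByteLevelBPETokenizer

byte_bpe = ByteLevelBPETokenizer("vocab.json", "merges.txt")
encoded = byte_bpe.encode("Hello world")
print(encoded.tokens)  # e.g. ['Hello', 'Ġworld'] — 'Ġ' is the byte-level space
print(byte_bpe.decode(encoded.ids))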