def test_instantiate(self):
    bad = PreTokenizer.custom(TestCustomPreTokenizer.BadCustomPretok())
    good = PreTokenizer.custom(TestCustomPreTokenizer.GoodCustomPretok())
    assert isinstance(bad, PreTokenizer)
    assert isinstance(good, PreTokenizer)
    with pytest.raises(Exception, match="TypeError: pre_tokenize()"):
        bad.pre_tokenize_str("Hey there!")
    assert good.pre_tokenize_str("Hey there!") == [
        ("Hey there!", (0, 10)),
        ("Hey there!", (0, 10)),
    ]
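# The two helper classes referenced above are not shown in this snippet. Below is a
# minimal sketch that would satisfy the assertions; the names exist in the test, but
# the bodies are assumptions inferred from it: the "good" pre-tokenizer returns the
# whole input twice via pretok.split, while the "bad" one declares pre_tokenize()
# with the wrong signature, so calling it raises a TypeError mentioning
# "pre_tokenize()".
from tokenizers import NormalizedString, PreTokenizedString


class GoodCustomPretok:
    def split(self, i: int, normalized: NormalizedString):
        # Returning the input twice yields [("Hey there!", (0, 10)), ("Hey there!", (0, 10))]
        return [normalized, normalized]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.split)


class BadCustomPretok:
    def pre_tokenize(self):
        # Missing the `pretok` parameter: invoking it raises something like
        # "TypeError: pre_tokenize() takes 1 positional argument but 2 were given"
        pass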
def load_custom_tokenizer(tokenizer_file: str) -> Tokenizer:
    """
    Loading: load a Tokenizer from tokenizer.json and attach the custom PreTokenizer.
    """
    tokenizer = Tokenizer.from_file(tokenizer_file)
    # Overwrite the dummy Rust-based PreTokenizer (injected at save time) with the custom PreTokenizer.
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
    return tokenizer
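# A minimal usage sketch (assumes a tokenizer.json produced by train_custom_tokenizer
# below, and that MecabPreTokenizer plus its MeCab dependency are importable; the file
# path and example sentence are hypothetical).
tokenizer = load_custom_tokenizer("out/tokenizer.json")
encoding = tokenizer.encode("吾輩は猫である。")
print(encoding.tokens)  # subword tokens produced after MeCab pre-tokenization
print(encoding.ids)     # corresponding vocabulary ids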
def train_custom_tokenizer(files: List[str], tokenizer_file: str,
                           **kwargs) -> BertWordPieceTokenizer:
    """
    Training/saving: train and save a Tokenizer that uses the custom PreTokenizer.
    """
    tokenizer = BertWordPieceTokenizer(
        handle_chinese_chars=False,  # for japanese
        strip_accents=False,  # for japanese
    )
    tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(
        MecabPreTokenizer())
    # Learn the subword segmentation from the given set of corpus files.
    tokenizer.train(files, **kwargs)
    # Save the tokenizer configuration JSON, which contains the vocab as well as
    # preprocessing parameters.
    # NOTE: a custom PreTokenizer written in Python cannot be serialized, so a
    # Rust-based PreTokenizer is injected as a dummy before serializing.
    # The JSON records the dummy PreTokenizer, so the custom PreTokenizer has to be
    # set again at load time.
    tokenizer._tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.save(tokenizer_file)
    # (Optional) the .txt vocab file is saved as f"vocab-{filename}.txt"
    # (in case an external process needs it).
    filename = "wordpiece"
    model_files = tokenizer._tokenizer.model.save(
        str(pathlib.Path(tokenizer_file).parent), filename)
    return tokenizer
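# A minimal training sketch (assumes corpus.txt exists and MecabPreTokenizer is
# importable; the file names and hyperparameter values are hypothetical).
tokenizer = train_custom_tokenizer(
    files=["corpus.txt"],
    tokenizer_file="out/tokenizer.json",
    vocab_size=32000,   # forwarded to BertWordPieceTokenizer.train via **kwargs
    min_frequency=2,
)
# The saved JSON records the dummy BertPreTokenizer, so reload it with
# load_custom_tokenizer() (above) to get MeCab pre-tokenization back.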
def custom_tokenizer_from_pretrained(
        tokenizer_file_or_name: str,
        cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load BertWordPieceTokenizer from tokenizer.json.

    This is necessary due to the following reasons:
    - BertWordPieceTokenizer cannot load from tokenizer.json via .from_file() method
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPretokenizer is
      not a valid native PreTokenizer.
    """
    if os.path.exists(tokenizer_file_or_name):
        if not os.path.isdir(tokenizer_file_or_name):
            tokenizer_dir = os.path.dirname(tokenizer_file_or_name)
            pt_tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_dir,
                cache_dir=cache_dir,
            )
            # This is necessary for pt_tokenizer.save_pretrained(save_path)
            _tokenizer = Tokenizer.from_file(tokenizer_file_or_name)
            _tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
            pt_tokenizer._tokenizer = _tokenizer
        else:
            pt_tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_file_or_name,
                cache_dir=cache_dir,
            )
    else:
        # trf>=4.0.0: PreTrainedTokenizerFast by default
        # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
        pt_tokenizer = BertTokenizerFast.from_pretrained(
            tokenizer_file_or_name,
            cache_dir=cache_dir,
        )
    return pt_tokenizer
def load_janome_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = Sequence([
        Whitespace(),
        PreTokenizer.custom(JanomePreTokenizer()),
    ])
    tokenizer.decoder = Decoder.custom(JanomeDecoder())
    return tokenizer
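# JanomePreTokenizer and JanomeDecoder are defined elsewhere; below is a plausible
# sketch of the pre-tokenizer side only (an assumption, not the original
# implementation), using the janome package's wakati mode to split on morpheme
# boundaries. The custom decoder is omitted here.
from janome.tokenizer import Tokenizer as JanomeWakati
from tokenizers import NormalizedString, PreTokenizedString


class JanomePreTokenizer:
    def __init__(self):
        self.janome = JanomeWakati(wakati=True)

    def janome_split(self, i: int, normalized: NormalizedString):
        text = str(normalized)
        splits = []
        offset = 0
        # wakati mode yields surface strings; recover their character offsets
        # so we can slice the NormalizedString.
        for surface in self.janome.tokenize(text):
            start = text.index(surface, offset)
            stop = start + len(surface)
            offset = stop
            splits.append(normalized[start:stop])
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.janome_split)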
def test_camel_case(self):
    class CamelCasePretok:
        def get_state(self, c):
            if c.islower():
                return "lower"
            elif c.isupper():
                return "upper"
            elif c.isdigit():
                return "digit"
            else:
                return "rest"

        def split(self, n, normalized):
            i = 0
            # states = {"any", "lower", "upper", "digit", "rest"}
            state = "any"
            pieces = []
            for j, c in enumerate(normalized.normalized):
                c_state = self.get_state(c)
                if state == "any":
                    state = c_state
                if state != "rest" and state == c_state:
                    pass
                elif state == "upper" and c_state == "lower":
                    pass
                else:
                    pieces.append(normalized[i:j])
                    i = j
                state = c_state
            pieces.append(normalized[i:])
            return pieces

        def pre_tokenize(self, pretok):
            pretok.split(self.split)

    camel = PreTokenizer.custom(CamelCasePretok())

    assert camel.pre_tokenize_str("HeyThere!?-ThisIsLife") == [
        ("Hey", (0, 3)),
        ("There", (3, 8)),
        ("!", (8, 9)),
        ("?", (9, 10)),
        ("-", (10, 11)),
        ("This", (11, 15)),
        ("Is", (15, 17)),
        ("Life", (17, 21)),
    ]
def build_hf_tokenizer(kmer_length: int, kmer_stride: int, alphabet: str,
                       unk_token: str = "?") -> Tokenizer:
    """Build a full huggingface tokenizer from the inputs.

    Note: Same arguments taken as KmerPreTokenizer.
    """
    kmer_pre = KmerPreTokenizer(kmer_length=kmer_length,
                                kmer_stride=kmer_stride,
                                alphabet=alphabet,
                                unk_token=unk_token)
    tokenizer = Tokenizer(
        models.WordLevel(vocab=kmer_pre.kmer_tokenzier.vocab.stoi,
                         unk_token=unk_token))
    tokenizer.pre_tokenizer = PreTokenizer.custom(kmer_pre)
    tokenizer.decoder = ByteLevel()
    return tokenizer
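# A minimal usage sketch (assumes KmerPreTokenizer is importable from its own module;
# the DNA alphabet and example sequence are illustrative).
tokenizer = build_hf_tokenizer(kmer_length=3, kmer_stride=3, alphabet="ACGT")
encoding = tokenizer.encode("ACGTACGTACGT")
print(encoding.tokens)  # non-overlapping 3-mers of the input (stride 3)
print(encoding.ids)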
def load_pretrained_tokenizer(
        tokenizer_file: str,
        cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load BertWordPieceTokenizer from tokenizer.json.

    This is necessary due to the following reasons:
    - BertWordPieceTokenizer cannot load from tokenizer.json via .from_file() method
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPretokenizer is
      not a valid native PreTokenizer.
    """
    tokenizer = Tokenizer.from_file(tokenizer_file)
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())

    tokenizer_dir = os.path.dirname(tokenizer_file)
    pt_tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
        tokenizer_dir,
        cache_dir=cache_dir,
    )
    # This is necessary for pt_tokenizer.save_pretrained(save_path)
    pt_tokenizer._tokenizer = tokenizer  # ._tokenizer
    return pt_tokenizer
class CustomNormalizer:
    def normalize(self, normalized: NormalizedString):
        # Most of these can be replaced by a `Sequence` combining some provided Normalizers
        # (i.e. Sequence([NFKC(), Replace(Regex(r"\s+"), " "), Lowercase()])),
        # and that should be the preferred way. That being said, here is an example of the
        # kind of things that can be done here:
        normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input = "永和服装饰品有限公司"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input = "112233"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('1', (0, 1)), ('122', (1, 4)), ('3', (4, 5)), ('3', (5, 6))]

input = "1234 ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input)
print(tok.normalizer.normalize_str(input))
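# JiebaPreTokenizer and CustomDecoder are referenced above but not shown. Below is a
# plausible sketch of the pre-tokenizer (an assumption, based on the jieba package's
# tokenize() API, which yields (word, start, stop) tuples); the custom decoder is
# omitted because its required interface differs across tokenizers versions.
import jieba
from typing import List
from tokenizers import NormalizedString, PreTokenizedString


class JiebaPreTokenizer:
    def jieba_split(self, i: int,
                    normalized_string: NormalizedString) -> List[NormalizedString]:
        splits = []
        # jieba.tokenize yields (word, start, stop) over the input string,
        # so each word can be mapped back to a slice of the NormalizedString.
        for token, start, stop in jieba.tokenize(str(normalized_string)):
            splits.append(normalized_string[start:stop])
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.jieba_split)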
def load_jieba_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
    tokenizer.decoder = Decoder.custom(JiebaDecoder())
    return tokenizer
def __setstate__(self, d):
    self.__dict__ = d
    vocab = self.__dict__["_tokenizer"].get_vocab()
    self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(
        JiebaPreTokenizer(vocab))
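# A plausible __getstate__ counterpart (an assumption, not shown in this snippet):
# the Python-side custom PreTokenizer cannot be pickled, so it is swapped for a
# serializable native one before the state is captured, and __setstate__ above
# restores the custom JiebaPreTokenizer on unpickling.
from tokenizers.pre_tokenizers import BertPreTokenizer

def __getstate__(self):
    state = self.__dict__.copy()
    # NOTE: state["_tokenizer"] is the same object as self._tokenizer, so this also
    # replaces the live tokenizer's pre-tokenizer; re-attach the custom one if the
    # instance keeps being used after pickling.
    state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
    return state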