import re

from spacy.tokenizer import Tokenizer
from spacy.util import get_lang_class


def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that a custom tokenizer with not all functions defined or with
    empty properties can be serialized and deserialized correctly (see #2494,
    #4991)."""
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

    # test that empty/unset values are set correctly on deserialization
    tokenizer = get_lang_class("en")().tokenizer
    tokenizer.token_match = re.compile("test").match
    assert tokenizer.rules != {}
    assert tokenizer.token_match is not None
    assert tokenizer.url_match is not None
    tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer.rules == {}
    assert tokenizer.token_match is None
    assert tokenizer.url_match is None

    # an explicitly emptied rules dict should survive a serialization round trip
    tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
    tokenizer.rules = {}
    tokenizer_bytes = tokenizer.to_bytes()
    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
    assert tokenizer_reloaded.rules == {}
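# For reference, a minimal sketch of the same round trip via the file-based
# API (Tokenizer.to_disk/from_disk write and read the to_bytes() payload).
# Illustrative only, not part of the test suite; it assumes spaCy is
# installed and uses a temporary path chosen purely for this example.
import re
from pathlib import Path
from tempfile import TemporaryDirectory

import spacy
from spacy.tokenizer import Tokenizer


def roundtrip_tokenizer_via_disk():
    nlp = spacy.blank("en")
    tokenizer = Tokenizer(nlp.vocab, token_match=re.compile("test").match)
    with TemporaryDirectory() as tmp:
        path = Path(tmp) / "tokenizer"
        tokenizer.to_disk(path)
        restored = Tokenizer(nlp.vocab).from_disk(path)
    # the custom token_match survives; attributes never set come back as None/empty
    assert restored.token_match is not None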
import re

from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex


def custom_tokenizer(nlp):
    # treat a handful of quote and punctuation characters as infixes, while
    # keeping the language defaults for prefixes and suffixes
    infix_re = re.compile(r'''[?;‘’`“”"'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    tokenizer = Tokenizer(nlp.vocab)
    tokenizer.prefix_search = prefix_re.search
    tokenizer.suffix_search = suffix_re.search
    tokenizer.infix_finditer = infix_re.finditer
    tokenizer.token_match = None  # deliberately left unset (see the serialization test above)
    return tokenizer
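# A minimal usage sketch, assuming spaCy is installed; spacy.blank("en") is
# just a stand-in for whichever pipeline you actually load.
import spacy


def example_usage():
    nlp = spacy.blank("en")
    nlp.tokenizer = custom_tokenizer(nlp)
    doc = nlp("it is a “test”?string")
    print([t.text for t in doc])  # the quote/question-mark infixes now split tokens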