def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Round-trip custom tokenizers through ``to_bytes``/``from_bytes``.

    Covers tokenizers where only some functions are defined and tokenizers
    whose properties are empty (see #2494, #4991).
    """
    # A tokenizer with only a suffix search defined must serialize and load.
    partial = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    payload = partial.to_bytes()
    Tokenizer(en_vocab).from_bytes(payload)

    # Loading that payload into a populated tokenizer must reset the
    # empty/unset values instead of keeping the previous ones.
    populated = get_lang_class("en")().tokenizer
    populated.token_match = re.compile("test").match
    assert populated.rules != {}
    assert populated.token_match is not None
    assert populated.url_match is not None
    populated.from_bytes(payload)
    assert populated.rules == {}
    assert populated.token_match is None
    assert populated.url_match is None

    # Rules that were set and then cleared must still be empty after reload.
    special_cases = {"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}
    cleared = Tokenizer(en_vocab, rules=special_cases)
    cleared.rules = {}
    reloaded = Tokenizer(en_vocab).from_bytes(cleared.to_bytes())
    assert reloaded.rules == {}
def test_tokenizer_flush_specials(en_vocab):
    """Check that clearing ``rules`` stops a special case from applying."""

    def token_texts(doc):
        # Reduce a tokenized result to its list of token strings.
        return [token.text for token in doc]

    trailing_period = re.compile(r"[\.]$")
    special_cases = {"a a": [{"ORTH": "a a"}]}
    tokenizer1 = Tokenizer(
        en_vocab,
        suffix_search=trailing_period.search,
        rules=special_cases,
    )

    # With the special case present, "a a" is kept as a single token.
    assert token_texts(tokenizer1("a a.")) == ["a a", "."]

    # After the rules are emptied, the same text splits into separate tokens.
    tokenizer1.rules = {}
    assert token_texts(tokenizer1("a a.")) == ["a", "a", "."]
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991).
    """
    # NOTE(review): another function with this exact name appears earlier in
    # the visible source. If both live in the same module, this definition
    # replaces the earlier one and only this body is collected -- confirm
    # which one is intended and rename or remove the other.

    # Only suffix_search is defined; the round trip must still work.
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

    # The special case for "ABC." consists of two tokens, "ABC" and ".", so
    # each token needs its own dict. Writing both "ORTH" entries in a single
    # dict literal repeats the key and silently keeps only the last value.
    tokenizer = Tokenizer(
        en_vocab,
        rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]},
    )
    # Clear the rules again: the empty value must survive serialization.
    tokenizer.rules = {}
    tokenizer_bytes = tokenizer.to_bytes()
    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
    assert tokenizer_reloaded.rules == {}