Example #1
import re

from spacy.tokenizer import Tokenizer
from spacy.util import get_lang_class


# en_vocab and en_tokenizer are pytest fixtures from spaCy's test suite.
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991)."""
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

    # test that empty/unset values are set correctly on deserialization
    tokenizer = get_lang_class("en")().tokenizer
    tokenizer.token_match = re.compile("test").match
    assert tokenizer.rules != {}
    assert tokenizer.token_match is not None
    assert tokenizer.url_match is not None
    tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer.rules == {}
    assert tokenizer.token_match is None
    assert tokenizer.url_match is None

    tokenizer = Tokenizer(
        en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}
    )
    tokenizer.rules = {}
    tokenizer_bytes = tokenizer.to_bytes()
    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
    assert tokenizer_reloaded.rules == {}
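
A minimal standalone sketch of the same round trip outside the test suite, assuming only spaCy's public English/Tokenizer API (the "ABC." rule is reused from the example above):

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
tokenizer = Tokenizer(nlp.vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
tokenizer_bytes = tokenizer.to_bytes()

# Restore into a fresh Tokenizer that shares the same vocab.
restored = Tokenizer(nlp.vocab).from_bytes(tokenizer_bytes)
assert "ABC." in restored.rules  # the special-case rule survives the round trip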
Example #2
import re

from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex


def custom_tokenizer(nlp):
    # Custom infix pattern; prefixes and suffixes reuse the language defaults.
    infix_re = re.compile(r'''[?;‘’`“”"'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    tokenizer = Tokenizer(nlp.vocab)
    tokenizer.prefix_search = prefix_re.search
    tokenizer.suffix_search = suffix_re.search
    tokenizer.infix_finditer = infix_re.finditer
    tokenizer.token_match = None
    return tokenizer
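
A short usage sketch, not part of the original snippet: wiring the custom tokenizer into a pipeline, assuming spacy.blank is available; the input "foo;bar" is only an illustration.

import spacy

nlp = spacy.blank("en")
nlp.tokenizer = custom_tokenizer(nlp)

doc = nlp("foo;bar")
print([t.text for t in doc])  # ';' matches the custom infix pattern, so this yields ['foo', ';', 'bar']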