import re

from spacy.tokenizer import Tokenizer
from spacy.util import (
    compile_infix_regex,
    compile_prefix_regex,
    compile_suffix_regex,
)


def test_tokenizer_initial_special_case_explain(en_vocab):
    # "id" is covered by both a special case rule and token_match;
    # explain() must mirror whichever the tokenizer actually applies
    tokenizer = Tokenizer(
        en_vocab,
        token_match=re.compile("^id$").match,
        rules={
            "id": [{"ORTH": "i"}, {"ORTH": "d"}],
        },
    )
    tokens = [t.text for t in tokenizer("id")]
    explain_tokens = [t[1] for t in tokenizer.explain("id")]
    assert tokens == explain_tokens


def test_tokenizer_explain_special_matcher(en_vocab):
    # the special case "a." interacts with the suffix and infix patterns
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens


def test_tokenizer_infix_prefix(en_vocab):
    # an infix-only pattern at the start of the string should not be
    # split off as a prefix
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens


def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    prefixes = ["a(?=.)"]
    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("a10.")]
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
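

# A minimal sketch, not part of the original suite: every test above checks
# the same invariant, namely that the substrings reported by
# `Tokenizer.explain` line up one-to-one with the tokens the tokenizer
# actually produces. A hypothetical helper like this could factor that check
# out; the name `_assert_tokens_match_explain` is an assumption, not an
# existing spaCy utility.
def _assert_tokens_match_explain(tokenizer, text):
    tokens = [t.text for t in tokenizer(text)]
    # explain() yields (pattern, substring) tuples; index 1 is the substring
    explain_tokens = [t[1] for t in tokenizer.explain(text)]
    assert tokens == explain_tokens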