def init_tokenizer():
    from spacy.attrs import ORTH
    from spacy.lang.en import English

    # Alternative: load a full pipeline instead of a bare tokenizer, e.g.
    # spacy.load(os.environ.get('SPACY_MODEL', 'en'), disable=['tagger', 'ner'])
    # English().tokenizer is the spaCy v3 API; it replaces the removed
    # English().Defaults.create_tokenizer() call, resolving the old
    # compatibility TODO.
    tokenizer = English().tokenizer
    # Add special segmenting cases for the spaCy tokenizer.
    tokenizer.add_special_case('I.', [{ORTH: "I"}, {ORTH: "."}])
    # RESERVED_TOKENS: module-level list of placeholder strings to keep intact.
    for token in RESERVED_TOKENS:
        tokenizer.add_special_case(token, [{ORTH: token}])
    return tokenizer
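# Usage sketch (not from the original source). RESERVED_TOKENS is assumed to
# be a module-level list of placeholder strings, as the tests below suggest;
# the values here are hypothetical examples.
RESERVED_TOKENS = ["_MATH_", "_REF_"]


def demo_init_tokenizer():
    tokenizer = init_tokenizer()
    doc = tokenizer("I like _MATH_.")
    # The reserved token survives whole; only the trailing "." is split off.
    assert [t.text for t in doc] == ["I", "like", "_MATH_", "."]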
from spacy.attrs import ORTH
from spacy.lang.en import English


def test_tokenizer_special_cases_with_affixes_preserve_spacy():
    tokenizer = English().tokenizer
    # Reset all special cases.
    tokenizer.rules = {}

    # In-place modification (only merges).
    text = "''a'' "
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    assert tokenizer(text).text == text

    # Not in-place (splits and merges).
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    text = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(text).text == text
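# Sketch (not part of the test above) of what "splits and merges" means: the
# special case maps the single chunk "ab" to two tokens, yet Doc.text still
# round-trips the input exactly, because the trailing whitespace is carried
# by the last sub-token.
def demo_split_preserves_text():
    tok = English().tokenizer
    tok.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    doc = tok("x ab y")
    assert [t.text for t in doc] == ["x", "a", "b", "y"]
    assert doc.text == "x ab y"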
def test_issue1061():
    """Test special-case works after tokenizing. Was caching problem."""
    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    tokenizer = English().tokenizer
    doc = tokenizer(text)
    assert "MATH" in [w.text for w in doc]
    assert "_MATH_" not in [w.text for w in doc]

    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]

    # For sanity, check it works when pipeline is clean.
    tokenizer = English().tokenizer
    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]
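# Debugging sketch (assumes spaCy v3's Tokenizer.explain): explain() reports
# the rule that produced each token, which makes special-case handling
# visible. Once the special case is added, "_MATH_" should come from a
# SPECIAL pattern instead of being split by the "_" punctuation rules.
def demo_explain_special_case():
    tok = English().tokenizer
    tok.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    for pattern, chunk in tok.explain("but not _MATH_."):
        print(pattern, repr(chunk))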