def test_tokenizer_add_special_case_tag(text, tokens):
    """A special case should surface each token's ORTH and NORM attributes.

    NOTE(review): this function shares its name with the tag/POS test that
    follows it, so the later definition shadows this one and pytest never
    collects it — consider renaming (e.g. test_tokenizer_add_special_case).
    """
    vocabulary = Vocab()
    tokenizer = Tokenizer(vocabulary, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    first, second = doc[0], doc[1]
    assert first.text == tokens[0]["orth"]
    assert first.norm_ == tokens[0]["norm"]
    assert second.text == tokens[1]["orth"]
def test_tokenizer_add_special_case_tag(text, tokens):
    """A special case carrying a TAG should also set POS through the tag map."""
    tag_map = {"NN": {"pos": "NOUN"}}
    vocabulary = Vocab(tag_map=tag_map)
    tokenizer = Tokenizer(vocabulary, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    first, second = doc[0], doc[1]
    assert first.text == tokens[0]["orth"]
    assert first.tag_ == tokens[0]["tag"]
    # The tag map entry above maps the "NN" tag to the NOUN part of speech.
    assert first.pos_ == "NOUN"
    assert second.text == tokens[1]["orth"]
"""
Register a tokenizer special case so "us-east-1" splits into custom tokens.

References:
1. https://spacy.io/api/tokenizer
2. https://github.com/explosion/spaCy/issues/396
"""
import spacy
from spacy.attrs import ORTH, LEMMA
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_lg')

# add_special_case expects (string, list-of-token-dicts), and the ORTH values
# must concatenate to exactly the matched string.  The original code wrapped
# the token list in [{"us": [...]}] (not the expected format) and its ORTH
# values ("us" + "-east") did not cover the full string "us-east-1" — either
# mistake makes spaCy raise an error at registration time.
special_case = [
    {ORTH: "us"},
    {ORTH: "-east", LEMMA: "east"},
    {ORTH: "-1", LEMMA: "1"},
]

tokenizer = Tokenizer(nlp.vocab)
tokenizer.add_special_case("us-east-1", special_case)