Example No. 1
import pytest
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer

# `text` and `tokens` are supplied by a @pytest.mark.parametrize decorator
# omitted on the source page; each token dict carries "orth" and "norm".
def test_tokenizer_add_special_case_norm(text, tokens):
    vocab = Vocab()
    # A rule-less tokenizer: splits on whitespace and special cases only.
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].norm_ == tokens[0]["norm"]
    assert doc[1].text == tokens[1]["orth"]
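For reference, a minimal standalone sketch of the same behaviour; the input string and norms below are illustrative assumptions, not taken from the original test data:

from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(Vocab(), {}, None, None, None)
# Hypothetical special case: split "gonna" and attach a norm to each piece.
# The "orth" values must concatenate to exactly the matched string.
tokenizer.add_special_case("gonna", [{"orth": "gon", "norm": "going"},
                                     {"orth": "na", "norm": "to"}])
doc = tokenizer("gonna")
print([(t.text, t.norm_) for t in doc])  # [('gon', 'going'), ('na', 'to')]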
Example No. 2
import pytest
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer

# `text` and `tokens` are supplied by a @pytest.mark.parametrize decorator
# omitted on the source page. Note: passing tag_map to Vocab and setting
# "tag" in a special case is spaCy v2 behaviour; v3 special cases accept
# only ORTH and NORM.
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].tag_ == tokens[0]["tag"]
    assert doc[0].pos_ == "NOUN"
    assert doc[1].text == tokens[1]["orth"]
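A sketch of fixture data this test could run with; the concrete values are assumptions, and it needs spaCy v2, since v3 rejects TAG in special cases:

import pytest

# Hypothetical parametrization: the "orth" values must concatenate to the
# input text, and "NN" maps to pos NOUN via the tag_map above.
@pytest.mark.parametrize(
    "text,tokens",
    [("gimme", [{"orth": "gim", "tag": "NN"}, {"orth": "me"}])],
)
def test_tokenizer_add_special_case_tag(text, tokens):
    ...  # body as in the example above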
Example No. 3
"""

References:
1. https://spacy.io/api/tokenizer
1. https://github.com/explosion/spaCy/issues/396
"""
import spacy
from spacy.attrs import ORTH, LEMMA
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_lg')

# The ORTH values of a special case must concatenate to exactly the matched
# string, so "us-east-1" is declared as "us" + "-east" + "-1".
# Note: LEMMA in special cases works in spaCy v2; v3 allows only ORTH and NORM.
special_case = [{ORTH: "us"}, {ORTH: "-east", LEMMA: "east"}, {ORTH: "-1"}]
tokenizer = Tokenizer(nlp.vocab)
tokenizer.add_special_case("us-east-1", special_case)
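A quick check of the result (assumed usage, not part of the original snippet):

# The bare Tokenizer built from nlp.vocab has no affix rules, so it splits
# on whitespace plus the special cases registered above.
doc = tokenizer("deployed to us-east-1")
print([t.text for t in doc])  # ['deployed', 'to', 'us', '-east', '-1']
print(doc[3].lemma_)          # 'east' under spaCy v2 (v3 rejects LEMMA here)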