def test_match_preserved(matcher, EN):
    """Entities set by the matcher survive the tagger and entity recognizer."""
    # Baseline: tagging alone sets no entities on a fresh doc.
    untouched = EN.tokenizer('I like java')
    EN.tagger(untouched)
    assert len(untouched.ents) == 0
    # The matcher adds one entity; later pipeline components must keep it.
    doc = EN.tokenizer('I like java')
    matcher(doc)
    assert len(doc.ents) == 1
    EN.tagger(doc)
    EN.entity(doc)
    assert len(doc.ents) == 1
def test_match_zero_plus(matcher):
    """A '*'-quantified token spec matches any number of interior tokens."""
    quote_pattern = [
        {'ORTH': '"'},
        {'OP': '*', 'IS_PUNCT': False},
        {'ORTH': '"'},
    ]
    matcher.add('Quote', '', {}, [quote_pattern])
    words = 'He said , " some words " ...'.split()
    doc = Doc(matcher.vocab, words=words)
    assert len(matcher(doc)) == 1
def test_match_zero(matcher):
    """A '!'-quantified token spec requires the token to NOT match."""
    # Two consecutive non-punctuation tokens between the quotes.
    matcher.add('Quote', '', {}, [
        [
            {'ORTH': '"'},
            {'OP': '!', 'IS_PUNCT': True},
            {'OP': '!', 'IS_PUNCT': True},
            {'ORTH': '"'},
        ],
    ])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
    # Three interior tokens: the two-token pattern must not fire.
    doc = Doc(matcher.vocab, words='He said , " some three words " ...'.split())
    assert len(matcher(doc)) == 0
    # A pattern demanding three punctuation tokens also fails here.
    matcher.add('Quote', '', {}, [
        [
            {'ORTH': '"'},
            {'IS_PUNCT': True},
            {'IS_PUNCT': True},
            {'IS_PUNCT': True},
            {'ORTH': '"'},
        ],
    ])
    assert len(matcher(doc)) == 0
def test_matcher_segfault():
    """Regression test: adding patterns between matcher calls must not crash."""
    nlp = spacy.load('en', parser=False, entity=False)
    matcher = spacy.matcher.Matcher(nlp.vocab)
    content = u'''a b; c'''
    # Same add/match interleaving as the original crash report.
    keyed_specs = [
        ('1', [[{ORTH: 'a'}, {ORTH: 'b'}]]),
        ('2', [[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]]),
        ('3', [[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]]),
    ]
    for key, spec in keyed_specs:
        matcher.add(entity_key=key, label='TEST', attrs={}, specs=spec)
        matcher(nlp(content))
def test_match_preserved(EN):
    """Entities copied from matcher output survive the tagger and NER."""
    patterns = {
        'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
        'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
        'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
    }
    matcher = Matcher(EN.vocab, patterns)
    # Baseline: tagging alone sets no entities.
    plain = EN.tokenizer('I like java.')
    EN.tagger(plain)
    assert len(plain.ents) == 0
    # Attach the matcher's spans as entities, then run later components.
    doc = EN.tokenizer('I like java.')
    doc.ents += tuple(matcher(doc))
    assert len(doc.ents) == 1
    EN.tagger(doc)
    EN.entity(doc)
    assert len(doc.ents) == 1
def extract_phone_number(nlp_doc):
    """Return the text of the first phone-number-shaped span in *nlp_doc*.

    Returns None implicitly when nothing matches.
    NOTE(review): relies on a module-level `matcher` and re-registers the
    pattern on every call -- confirm that is intended.
    """
    # (ddd) ddd-ddd with an optional hyphen.
    pattern = [
        {'ORTH': '('},
        {'SHAPE': 'ddd'},
        {'ORTH': ')'},
        {'SHAPE': 'ddd'},
        {'ORTH': '-', 'OP': '?'},
        {'SHAPE': 'ddd'},
    ]
    matcher.add('PHONE_NUMBER', None, pattern)
    # Only the first match is reported, same as the original loop-return.
    for _match_id, start, end in matcher(nlp_doc):
        return nlp_doc[start:end].text
def test_match_multi(matcher, EN):
    """Two non-overlapping product patterns both match, in order."""
    tokens = EN('I like Google Now and java best')
    product = EN.vocab.strings['PRODUCT']
    assert matcher(tokens) == [(product, 2, 4), (product, 5, 6)]
def test_match_end(matcher, EN):
    """A pattern is found when it ends at the last token of the doc."""
    doc = EN('I like java')
    product = EN.vocab.strings['PRODUCT']
    assert matcher(doc) == [(product, 2, 3)]
def test_match_middle(matcher, EN):
    """A two-token pattern is found in the middle of the sentence."""
    doc = EN('I like Google Now best')
    product = EN.vocab.strings['PRODUCT']
    assert matcher(doc) == [(product, 2, 4)]
def test_no_match(matcher, EN):
    """No registered pattern occurs in the text, so the result is empty."""
    doc = EN('I like cheese')
    assert matcher(doc) == []
def test_match_start(matcher, EN):
    """A pattern is found when it starts at token 0."""
    doc = EN('JavaScript is good')
    product = EN.vocab.strings['PRODUCT']
    assert matcher(doc) == [(product, 0, 1)]
def test_match_multi(matcher):
    """Both product patterns are matched, reported in document order."""
    words = 'I like Google Now and java best'.split()
    doc = Doc(matcher.vocab, words=words)
    strings = doc.vocab.strings
    expected = [
        (strings['GoogleNow'], strings['PRODUCT'], 2, 4),
        (strings['Java'], strings['PRODUCT'], 5, 6),
    ]
    assert matcher(doc) == expected
# Usage example for the legacy (pre-2.x) spaCy Matcher API: register token
# patterns and entity metadata, then call the matcher on a Doc.
# NOTE(review): depends on an `nlp` pipeline object defined elsewhere.
from spacy.matcher import *
from spacy.attrs import *
from spacy.tokens import Doc

matcher = Matcher(nlp.vocab)
# Pattern: "hello", then any punctuation token, then "world".
matcher.add_pattern("HelloWorld",
                    [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
doc = nlp(u'Hello, world!')
matches = matcher(doc)

matcher.add_entity(
    "GoogleNow",  # Entity ID -- Helps you act on the match.
    {"ent_type": "PRODUCT", "wiki_en": "Google_Now"},  # Arbitrary attributes (optional)
)

matcher.add_pattern(
    "GoogleNow",  # Entity ID -- Created if doesn't exist.
    [  # The pattern is a list of *Token Specifiers*.
        {  # This Token Specifier matches tokens whose orth field is "Google"
            ORTH: "Google"
        },
        {  # This Token Specifier matches tokens whose orth field is "Now"
            ORTH: "Now"
        }
    ],
    label=None  # Can associate a label to the pattern-match, to handle it better.
)
def test_match_end(matcher):
    """The 'Java' pattern matches when it ends at the last token.

    Fix: pass the token list via the ``words=`` keyword (consistent with
    the sibling tests) instead of relying on its positional slot in
    ``Doc.__init__``.
    """
    doc = Doc(matcher.vocab, words=['I', 'like', 'java'])
    assert matcher(doc) == [(doc.vocab.strings['Java'],
                             doc.vocab.strings['PRODUCT'], 2, 3)]
def test_no_match(matcher):
    """No registered pattern occurs in the text, so the result is empty."""
    words = ['I', 'like', 'cheese', '.']
    assert matcher(Doc(matcher.vocab, words=words)) == []
def test_no_match(matcher):
    """No registered pattern occurs in the text, so the result is empty.

    Fix: pass the token list via the ``words=`` keyword (consistent with
    the sibling tests) instead of relying on its positional slot in
    ``Doc.__init__``.
    """
    doc = Doc(matcher.vocab, words=['I', 'like', 'cheese', '.'])
    assert matcher(doc) == []
def test_phrase_matcher():
    """PhraseMatcher finds a multi-token phrase given as an example Doc."""
    vocab = Vocab(lex_attr_getters=English.Defaults.lex_attr_getters)
    phrase = Doc(vocab, words='Google Now'.split())
    matcher = PhraseMatcher(vocab, [phrase])
    doc = Doc(vocab, words=['I', 'like', 'Google', 'Now', 'best'])
    assert len(matcher(doc)) == 1
def test_match_start(matcher):
    """The 'JS' pattern matches at token 0."""
    doc = Doc(matcher.vocab, words=['JavaScript', 'is', 'good'])
    strings = matcher.vocab.strings
    assert matcher(doc) == [(strings['JS'], strings['PRODUCT'], 0, 1)]
def test_match_end(matcher):
    """The 'Java' pattern matches when it ends at the last token."""
    doc = Doc(matcher.vocab, words=['I', 'like', 'java'])
    strings = doc.vocab.strings
    assert matcher(doc) == [(strings['Java'], strings['PRODUCT'], 2, 3)]
# Usage example for the legacy (pre-2.x) spaCy Matcher API.
# NOTE(review): depends on an `nlp` pipeline object defined elsewhere, and
# this snippet is cut off mid-pattern -- the final token specifier(s) and
# the closing of the `add_pattern` call are missing from this chunk.
from spacy.attrs import *
from spacy.tokens import Doc

matcher = Matcher(nlp.vocab)
# Pattern: "hello", then any punctuation token, then "world".
matcher.add_pattern("HelloWorld",
                    [{ LOWER: "hello" }, { IS_PUNCT: True }, { LOWER: "world" }])
doc = nlp(u'Hello, world!')
matches = matcher(doc)

matcher.add_entity(
    "GoogleNow",  # Entity ID -- Helps you act on the match.
    { "ent_type": "PRODUCT", "wiki_en": "Google_Now" },  # Arbitrary attributes (optional)
)

matcher.add_pattern(
    "GoogleNow",  # Entity ID -- Created if doesn't exist.
    [  # The pattern is a list of *Token Specifiers*.
        {  # This Token Specifier matches tokens whose orth field is "Google"
            ORTH: "Google"
        },
def test_match_middle(matcher):
    """The two-token 'GoogleNow' pattern matches mid-sentence.

    Fix: pass the token list via the ``words=`` keyword (consistent with
    the sibling tests) instead of relying on its positional slot in
    ``Doc.__init__``.
    """
    doc = Doc(matcher.vocab, words=['I', 'like', 'Google', 'Now', 'best'])
    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'],
                             doc.vocab.strings['PRODUCT'], 2, 4)]
def test_match_multi(matcher):
    """Both product patterns are matched, reported in document order.

    Fix: pass the token list via the ``words=`` keyword (consistent with
    the sibling tests) instead of relying on its positional slot in
    ``Doc.__init__``.
    """
    doc = Doc(matcher.vocab, words='I like Google Now and java best'.split())
    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'],
                             doc.vocab.strings['PRODUCT'], 2, 4),
                            (doc.vocab.strings['Java'],
                             doc.vocab.strings['PRODUCT'], 5, 6)]
def test_match_start(matcher):
    """The 'JS' pattern matches at token 0.

    Fix: pass the token list via the ``words=`` keyword (consistent with
    the sibling tests) instead of relying on its positional slot in
    ``Doc.__init__``.
    """
    doc = Doc(matcher.vocab, words=['JavaScript', 'is', 'good'])
    assert matcher(doc) == [(matcher.vocab.strings['JS'],
                             matcher.vocab.strings['PRODUCT'], 0, 1)]
def test_match_middle(matcher):
    """The two-token 'GoogleNow' pattern matches mid-sentence."""
    words = ['I', 'like', 'Google', 'Now', 'best']
    doc = Doc(matcher.vocab, words=words)
    strings = doc.vocab.strings
    assert matcher(doc) == [(strings['GoogleNow'], strings['PRODUCT'], 2, 4)]