Пример #1
0
def test_match_preserved(matcher, EN):
    """Entities produced by the matcher must survive tagging and NER."""
    # Without the matcher, tagging alone yields no entities.
    untouched = EN.tokenizer('I like java')
    EN.tagger(untouched)
    assert len(untouched.ents) == 0
    # The matcher adds one entity, which later pipeline steps must keep.
    matched = EN.tokenizer('I like java')
    matcher(matched)
    assert len(matched.ents) == 1
    EN.tagger(matched)
    EN.entity(matched)
    assert len(matched.ents) == 1
Пример #2
0
def test_match_zero_plus(matcher):
    """A '*' operator should absorb any run of non-punctuation tokens."""
    quoted_span = [
        {'ORTH': '"'},
        {'OP': '*', 'IS_PUNCT': False},
        {'ORTH': '"'},
    ]
    matcher.add('Quote', '', {}, [quoted_span])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
Пример #3
0
def test_match_zero(matcher):
    """Quoted spans match only when the token count fits the pattern."""
    # Two mandatory non-punctuation slots between the quote marks.
    matcher.add('Quote', '', {}, [
        [{'ORTH': '"'},
         {'OP': '!', 'IS_PUNCT': True},
         {'OP': '!', 'IS_PUNCT': True},
         {'ORTH': '"'}]])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
    # Three words between the quotes no longer fit the two slots.
    doc = Doc(matcher.vocab, words='He said , " some three words " ...'.split())
    assert len(matcher(doc)) == 0
    # Three literal punctuation slots still cannot match three plain words.
    matcher.add('Quote', '', {}, [
        [{'ORTH': '"'},
         {'IS_PUNCT': True},
         {'IS_PUNCT': True},
         {'IS_PUNCT': True},
         {'ORTH': '"'}]])
    assert len(matcher(doc)) == 0
Пример #4
0
def test_matcher_segfault():
    """Regression test: matching after incremental pattern additions must not crash."""
    nlp = spacy.load('en', parser=False, entity=False)
    matcher = spacy.matcher.Matcher(nlp.vocab)
    content = u'''a b; c'''
    # Add one entity at a time and re-run the matcher after each addition,
    # preserving the original add/match interleaving.
    keyed_specs = [
        ('1', [[{ORTH: 'a'}, {ORTH: 'b'}]]),
        ('2', [[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]]),
        ('3', [[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]]),
    ]
    for key, specs in keyed_specs:
        matcher.add(entity_key=key, label='TEST', attrs={}, specs=specs)
        matcher(nlp(content))
Пример #5
0
def test_match_preserved(EN):
    """Matcher-created entities must survive tagging and entity recognition."""
    product_patterns = {
        'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
        'GoogleNow':  ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
        'Java':       ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
    }
    matcher = Matcher(EN.vocab, product_patterns)
    # Baseline: tagging without matching produces no entities.
    doc = EN.tokenizer('I like java.')
    EN.tagger(doc)
    assert len(doc.ents) == 0
    # With matching: the entity is added and kept by downstream components.
    doc = EN.tokenizer('I like java.')
    doc.ents += tuple(matcher(doc))
    assert len(doc.ents) == 1
    EN.tagger(doc)
    EN.entity(doc)
    assert len(doc.ents) == 1
Пример #6
0
 def extract_phone_number(nlp_doc):
     """Return the text of the first phone-number-like span in *nlp_doc*.

     The pattern matches the shape ``(ddd) ddd-ddd``, with the hyphen
     optional (``'OP': '?'``).  Returns ``None`` when no span matches.

     NOTE(review): relies on a module-level ``matcher`` that is not defined
     in this function — confirm it exists at the call site.  Also note that
     calling this repeatedly re-adds the 'PHONE_NUMBER' pattern each time,
     which can accumulate duplicate patterns on some spaCy versions.
     """
     pattern = [
         {'ORTH': '('},
         {'SHAPE': 'ddd'},
         {'ORTH': ')'},
         {'SHAPE': 'ddd'},
         {'ORTH': '-', 'OP': '?'},
         {'SHAPE': 'ddd'},
     ]
     matcher.add('PHONE_NUMBER', None, pattern)
     matches = matcher(nlp_doc)
     # The original returned from inside the loop, i.e. only the FIRST
     # match was ever used; make that intent explicit and return None
     # (rather than falling off the end) when there are no matches.
     for match_id, start, end in matches:
         return nlp_doc[start:end].text
     return None
Пример #7
0
def test_match_multi(matcher, EN):
    """Multiple distinct patterns may match within one document."""
    doc = EN('I like Google Now and java best')
    product = EN.vocab.strings['PRODUCT']
    assert matcher(doc) == [(product, 2, 4), (product, 5, 6)]
Пример #8
0
def test_match_end(matcher, EN):
    """A match may end on the document's final token."""
    doc = EN('I like java')
    expected = [(EN.vocab.strings['PRODUCT'], 2, 3)]
    assert matcher(doc) == expected
Пример #9
0
def test_match_middle(matcher, EN):
    """A multi-token match in the middle of the document is found."""
    doc = EN('I like Google Now best')
    expected = [(EN.vocab.strings['PRODUCT'], 2, 4)]
    assert matcher(doc) == expected
Пример #10
0
def test_no_match(matcher, EN):
    """A document containing none of the pattern tokens matches nothing."""
    doc = EN('I like cheese')
    assert matcher(doc) == []
Пример #11
0
def test_match_start(matcher, EN):
    """A match may begin on the document's first token."""
    doc = EN('JavaScript is good')
    expected = [(EN.vocab.strings['PRODUCT'], 0, 1)]
    assert matcher(doc) == expected
Пример #12
0
def test_match_start(matcher, EN):
    """The 'JavaScript' pattern matches at token position 0."""
    parsed = EN('JavaScript is good')
    product = EN.vocab.strings['PRODUCT']
    assert matcher(parsed) == [(product, 0, 1)]
Пример #13
0
def test_match_multi(matcher):
    """Two separate patterns both fire, reported in document order."""
    doc = Doc(matcher.vocab, words='I like Google Now and java best'.split())
    strings = doc.vocab.strings
    expected = [
        (strings['GoogleNow'], strings['PRODUCT'], 2, 4),
        (strings['Java'], strings['PRODUCT'], 5, 6),
    ]
    assert matcher(doc) == expected
from spacy.matcher import *
from spacy.attrs import *
from spacy.tokens import Doc

matcher = Matcher(nlp.vocab)

# Token-level pattern: "hello" + any punctuation token + "world".
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])

doc = nlp(u'Hello, world!')

matches = matcher(doc)


# Register the entity ID first; arbitrary attributes (optional) may be
# attached to it so the match can be acted on later.
matcher.add_entity(
    "GoogleNow",
    {"ent_type": "PRODUCT", "wiki_en": "Google_Now"},
)

# A pattern is a list of token specifiers.  Each specifier constrains one
# token; the entity ID is created on demand if it does not yet exist.
matcher.add_pattern(
    "GoogleNow",
    [
        {ORTH: "Google"},  # token whose orth field is "Google"
        {ORTH: "Now"},     # token whose orth field is "Now"
    ],
    label=None,  # an optional label can be associated with the match
)
Пример #15
0
def test_match_end(matcher):
    """A match may end at the last token of the document."""
    doc = Doc(matcher.vocab, ['I', 'like', 'java'])
    strings = doc.vocab.strings
    assert matcher(doc) == [(strings['Java'], strings['PRODUCT'], 2, 3)]
Пример #16
0
def test_no_match(matcher):
    """No patterns fire on a document with none of the target tokens."""
    doc = Doc(matcher.vocab, words=['I', 'like', 'cheese', '.'])
    assert matcher(doc) == []
Пример #17
0
def test_no_match(matcher):
    """An unrelated document yields an empty match list."""
    unrelated = Doc(matcher.vocab, ['I', 'like', 'cheese', '.'])
    assert matcher(unrelated) == []
Пример #18
0
def test_phrase_matcher():
    """PhraseMatcher finds a multi-token phrase supplied as an example Doc."""
    vocab = Vocab(lex_attr_getters=English.Defaults.lex_attr_getters)
    phrase = Doc(vocab, words='Google Now'.split())
    matcher = PhraseMatcher(vocab, [phrase])
    doc = Doc(vocab, words=['I', 'like', 'Google', 'Now', 'best'])
    assert len(matcher(doc)) == 1
Пример #19
0
def test_match_start(matcher):
    """The 'JS' pattern matches a span anchored at token 0."""
    doc = Doc(matcher.vocab, words=['JavaScript', 'is', 'good'])
    strings = matcher.vocab.strings
    assert matcher(doc) == [(strings['JS'], strings['PRODUCT'], 0, 1)]
Пример #20
0
def test_match_end(matcher):
    """The 'Java' pattern matches up to and including the final token."""
    doc = Doc(matcher.vocab, words=['I', 'like', 'java'])
    strings = doc.vocab.strings
    assert matcher(doc) == [(strings['Java'], strings['PRODUCT'], 2, 3)]
Пример #21
0
from spacy.attrs import *
from spacy.tokens import Doc

matcher = Matcher(nlp.vocab)

# "hello" + punctuation + "world", matched on the lower-cased form.
matcher.add_pattern("HelloWorld", [
    {LOWER: "hello"},
    {IS_PUNCT: True},
    {LOWER: "world"},
])

doc = nlp(u'Hello, world!')

matches = matcher(doc)

# Register an entity ID so matches can be acted on; the attribute
# dictionary is arbitrary and optional.
matcher.add_entity(
    "GoogleNow",
    {"ent_type": "PRODUCT", "wiki_en": "Google_Now"},
)

matcher.add_pattern(
    "GoogleNow",  # Entity ID -- Created if doesn't exist.
    [  # The pattern is a list of *Token Specifiers*.
        {  # This Token Specifier matches tokens whose orth field is "Google"
            ORTH: "Google"
        },
Пример #22
0
def test_match_middle(matcher):
    """A multi-token pattern matches in the middle of the document."""
    doc = Doc(matcher.vocab, ['I', 'like', 'Google', 'Now', 'best'])
    strings = doc.vocab.strings
    assert matcher(doc) == [(strings['GoogleNow'], strings['PRODUCT'], 2, 4)]
Пример #23
0
def test_no_match(matcher, EN):
    """Matching an unrelated sentence returns an empty list."""
    parsed = EN('I like cheese')
    assert matcher(parsed) == []
Пример #24
0
def test_match_multi(matcher):
    """Both the 'GoogleNow' and 'Java' patterns fire on one document."""
    doc = Doc(matcher.vocab, 'I like Google Now and java best'.split())
    strings = doc.vocab.strings
    expected = [
        (strings['GoogleNow'], strings['PRODUCT'], 2, 4),
        (strings['Java'], strings['PRODUCT'], 5, 6),
    ]
    assert matcher(doc) == expected
Пример #25
0
def test_match_start(matcher):
    """A match anchored at the first token is reported."""
    doc = Doc(matcher.vocab, ['JavaScript', 'is', 'good'])
    strings = matcher.vocab.strings
    assert matcher(doc) == [(strings['JS'], strings['PRODUCT'], 0, 1)]
Пример #26
0
def test_match_middle(matcher):
    """The 'GoogleNow' pattern matches tokens 2 through 4 (exclusive end)."""
    doc = Doc(matcher.vocab, words=['I', 'like', 'Google', 'Now', 'best'])
    strings = doc.vocab.strings
    assert matcher(doc) == [(strings['GoogleNow'], strings['PRODUCT'], 2, 4)]