Example #1
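Shared setup: the snippets in this listing are pytest-style examples collected from spaCy's Matcher tests and from the spikex project. `en_vocab` and `doc` are pytest fixtures, and names such as `pattern`, `text`, `re_pattern`, `cmp`, and `bad` arrive via `pytest.mark.parametrize`. A minimal sketch of the imports the snippets appear to assume (spaCy v3-style):

import re
from unittest.mock import Mock

import pytest
import spacy
from spacy.errors import MatchPatternError
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token

# Assumed helper for the version checks in Example #29; not in the original code.
spacy_version = int(spacy.__version__.split(".")[0])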
def test_matcher_match_zero_plus(en_vocab):
    words = 'He said , " some words " ...'.split()
    pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
    matcher = Matcher(en_vocab)
    matcher.add("Quote", [pattern])
    doc = Doc(en_vocab, words=words)
    assert len(matcher(doc)) == 1
Example #2
def test_operator_combos(en_vocab):
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(en_vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", [pattern])
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)
Example #3
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=text.split(" "))
    matcher.add("Rule", [pattern])
    assert len(matcher) == 1
    matches = matcher(doc)
    assert len(matches) == 1
Example #4
def test_matcher_any_token_operator(en_vocab):
    """Test that patterns with "any token" {} work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "test"}, {"OP": "*"}]])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 1
    assert matches[0] == "test hello world"
Example #5
def test_matcher_operator_shadow(en_vocab):
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["a", "b", "c"])
    pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
    matcher.add("A.C", [pattern])
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
Example #6
def test_matcher_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"REGEX": r"\bUS\d+\b"}]
    matcher.add("REGEX", [pattern])
    text = "This is a test for a regex, US12345."
    doc = Doc(en_vocab, words=text.split())
    matches = matcher(doc)
    assert matches == [(14188318820720882904, 7, 8)]  # (rule-name hash, start, end)
Example #7
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, [pattern])
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)
Example #8
def test_matcher_callback(en_vocab):
    mock = Mock()
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    matcher.add("Rule", [pattern], on_match=mock)
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    matches = matcher(doc)
    mock.assert_called_once_with(matcher, doc, 0, matches)
Example #9
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistant with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, [pattern])
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match
Example #10
def test_matcher_compare_length(en_vocab, cmp, bad):
    matcher = Matcher(en_vocab)
    pattern = [{"LENGTH": {cmp: 2}}]
    matcher.add("LENGTH_COMPARE", [pattern])
    doc = Doc(en_vocab, words=["a", "aa", "aaa"])
    matches = matcher(doc)
    assert len(matches) == len(doc) - len(bad)
    doc = Doc(en_vocab, words=bad)
    matches = matcher(doc)
    assert len(matches) == 0
Example #11
def test_matcher_regex_shape(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
    matcher.add("NON_ALPHA", [pattern])
    doc = Doc(en_vocab, words=["99", "problems", "!"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #12
def test_matcher_orth_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
    matcher.add("A_OR_AN", [pattern])
    doc = Doc(en_vocab, words=["an", "a", "hi"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #13
def test_matcher_set_value(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["an", "a"]}}]
    matcher.add("A_OR_AN", [pattern])
    doc = Doc(en_vocab, words=["an", "a", "apple"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #14
@pytest.fixture
def matcher(en_vocab):
    rules = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, patterns)
    return matcher
Example #15
def test_matcher_set_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
    matcher.add("DET_HOUSE", [pattern])
    doc = Doc(en_vocab, words=["In", "a", "house"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["my", "house"])
    matches = matcher(doc)
    assert len(matches) == 1
Example #16
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", [pattern])
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #17
def test_matcher_extension_attribute(en_vocab):
    matcher = Matcher(en_vocab)
    get_is_fruit = lambda token: token.text in ("apple", "banana")
    Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
    pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
    matcher.add("HAVING_FRUIT", [pattern])
    doc = Doc(en_vocab, words=["an", "apple"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["an", "aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Example #18
def test_matcher_match_one_plus(matcher, en_vocab):
    control = Matcher(en_vocab)
    control.add("BasicPhilippe", [[{"ORTH": "Philippe"}]])
    doc = Doc(en_vocab, words=["Philippe", "Philippe"])
    m = control(doc)
    assert len(m) == 2
    pattern = [
        {"ORTH": "Philippe", "OP": "1"},
        {"ORTH": "Philippe", "OP": "+"},
    ]
    matcher.add("KleenePhilippe", [pattern])
    m = matcher(doc)
    assert len(m) == 1
Example #19
def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    assert len(matcher) == 0
    matcher.add("Rule", [pattern])
    assert len(matcher) == 1
    matcher.remove("Rule")
    assert "Rule" not in matcher
    matcher.add("Rule", [pattern])
    assert "Rule" in matcher
    on_match, patterns = matcher.get("Rule")
    assert len(patterns[0])
Example #20
def test_matcher_empty_dict(en_vocab):
    """Test matcher allows empty token specs, meaning match on any token."""
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["a", "b", "c"])
    matcher.add("A.C", [[{"ORTH": "a"}, {}, {"ORTH": "c"}]])
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
    matcher = Matcher(en_vocab)
    matcher.add("A.", [[{"ORTH": "a"}, {}]])
    matches = matcher(doc)
    assert matches[0][1:] == (0, 2)
Example #21
def test_matcher_from_usage_docs(en_vocab):
    text = "Wow 😀 This is really cool! 😂 😂"
    doc = Doc(en_vocab, words=text.split(" "))
    pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
    pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        if match_id == 2686646543460454932:  # hash of the "HAPPY" rule name
            doc.sentiment += 0.1
        span = doc[start:end]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span)
        token = doc[start]
        token.vocab[token.text].norm_ = "happy emoji"

    matcher = Matcher(en_vocab)
    matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
    matcher(doc)
    assert doc.sentiment != 0
    assert doc[1].norm_ == "happy emoji"
Example #22
def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
    matcher = Matcher(en_vocab)
    if n_min_errors > 0:
        with pytest.raises(ValueError):
            matcher.add("TEST", [pattern])
    elif n_errors == 0:
        matcher.add("TEST", [pattern])
Example #23
def test_pattern_errors(en_vocab):
    matcher = Matcher(en_vocab)
    # lowercase keys like "text" and nested "regex" are normalized to uppercase
    matcher.add("TEST1", [[{"text": {"regex": "regex"}}]])
    # error if subpattern attribute isn't recognized and processed
    with pytest.raises(MatchPatternError):
        matcher.add("TEST2", [[{"TEXT": {"XX": "xx"}}]])
Example #24
File: abbrs.py Project: gpucce/spikex
def _find_matches_for(filtered: Iterable[Tuple[Span, Span]],
                      doc: Doc) -> Iterable[Tuple[Span, Set[Span]]]:
    form2other = {}
    matches = []
    global_matcher = Matcher(doc.vocab)
    for (long_candidate, short_candidate) in filtered:
        abbr = find_abbreviation(long_candidate, short_candidate)
        # We look for abbreviations, so...
        if abbr is None:
            continue
        long_form, short_form = abbr
        # Look for each new abbreviation globally to find lone ones
        for form, other in ((long_form, short_form), (short_form, long_form)):
            form2other.setdefault(form, other)
            pattern = [{"TEXT": t.text} for t in form]
            global_matcher.add(form.text, [pattern])
    seen = set()
    # Search for lone abbreviations globally
    for key, start, end in global_matcher(doc):
        other = None
        text = doc.vocab.strings[key]
        for f, o in form2other.items():
            if f.text != text or f.start > start:
                continue
            other = o
            if f.start == start:
                break
        if other is None:
            continue
        form = doc[start:end]
        # Short form should be the shortest
        match = (other, form) if len(form) < len(other) else (form, other)
        # Don't add duplicates
        key = "/".join([str(el.start) for el in match])
        if key in seen:
            continue
        seen.add(key)
        matches.append(match)
    yield from sorted(matches, key=lambda x: x[0].start)
Example #25
def test_matcher_pattern_validation(en_vocab, pattern):
    matcher = Matcher(en_vocab, validate=True)
    with pytest.raises(MatchPatternError):
        matcher.add("TEST", [pattern])
Example #26
def test_matcher_valid_callback(en_vocab):
    """Test that on_match can only be None or callable."""
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
    matcher(Doc(en_vocab, words=["test"]))
Example #27
@pytest.fixture
def matcher(en_vocab):
    return Matcher(en_vocab)
Example #28
File: abbrs.py Project: gpucce/spikex
class AbbrX:
    """
    *Strongly based on scispacy's AbbreviationDetector*.
    Detect abbreviations which are acronyms or by using the algorithm in
    "A simple algorithm for identifying abbreviation definitions in biomedical
    text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbrs` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """
    def __init__(self, vocab) -> None:
        Doc.set_extension("abbrs", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        self._matcher = Matcher(vocab)
        self._matcher.add(
            "abbrs",
            [
                # Pattern for abbreviations not enclosed in brackets
                # here we limit to alpha chars only as it could
                # get many exceptions
                [{
                    "IS_ALPHA": True,
                    "IS_UPPER": True,
                    "LENGTH": {
                        ">": 1
                    }
                }],
                # Pattern for abbreviations enclosed in brackets
                # here we try to allow non alpha chars too as it is
                # the more likely standard way to introduce an abbreviation
                [
                    {
                        "TEXT": {
                            "IN": ["(", "["]
                        },
                        "OP": "+"
                    },
                    {
                        "OP": "+"
                    },
                    {
                        "TEXT": {
                            "IN": [")", "]"]
                        },
                        "OP": "+"
                    },
                ],
            ],
        )

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = _filter_matches(dummy_matches, doc)
        # use the module-level _find_matches_for helper (see Example #24)
        abbrs = list(_find_matches_for(filtered, doc))

        if not abbrs:
            return span, set()
        return abbrs[0]

    def __call__(self, doc: Doc) -> Doc:
        matches = self._matcher(doc)
        # strip leading/trailing punctuation (e.g. brackets) from each match
        matches_no_punct = {
            (
                key,
                start + (1 if doc[start].is_punct else 0),
                end - (1 if doc[end - 1].is_punct else 0),
            )
            for key, start, end in matches
        }
        filtered = _filter_matches(matches_no_punct, doc)
        occurrences = _find_matches_for(filtered, doc)

        for long_form, short_form in occurrences:
            short_form._.long_form = long_form
            doc._.abbrs.append(short_form)
        return doc
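For context, a hypothetical usage sketch of AbbrX follows; the pipeline name and the sample sentence are illustrative assumptions, not part of the original source:

# Hypothetical usage, assuming an installed English pipeline:
import spacy

nlp = spacy.load("en_core_web_sm")
abbrx = AbbrX(nlp.vocab)
doc = abbrx(nlp("The World Health Organization (WHO) was founded in 1948."))
for short_form in doc._.abbrs:
    print(short_form.text, "->", short_form._.long_form.text)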
Example #29
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    if spacy_version >= 3:
        doc2[0].set_morph("Feat=Val")
    else:
        doc1.is_parsed = True
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
    with pytest.raises(ValueError):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # errors can be suppressed if desired
    matcher(doc2, allow_missing=True)
    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
        if spacy_version < 3:
            doc2.is_tagged = True
        matcher(doc2)
        with pytest.raises(ValueError):
            matcher(doc1)
        with pytest.raises(ValueError):
            matcher(doc3)
    # TEXT/ORTH only require tokens
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TEXT": "a"}]])
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)