示例#1
0
def test_regex_match(doc_setup):
    """Exercise RegexMatchSpan and RegexMatchEach on the fixture document.

    Covers, in order: rejection of an unknown keyword option, rejection of
    input that is not a span mention, a plain match, and the ``ignore_case``
    and ``sep`` options.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def matched_spans(m):
        # Candidates are regenerated on every call so each check is independent.
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # A wrong option name ("regex" instead of "rgx") should raise an exception.
    with pytest.raises(Exception):
        RegexMatchSpan(regex=r"apple")

    # Both matcher flavours must raise ValueError when apply() is handed
    # something other than TemporarySpanMention objects.
    span_matcher = RegexMatchSpan(rgx=r"apple")
    with pytest.raises(ValueError):
        list(span_matcher.apply(doc.sentences[0].words))

    each_matcher = RegexMatchEach(rgx=r"apple")
    with pytest.raises(ValueError):
        list(each_matcher.apply(doc.sentences[0].words))

    # On proper candidates RegexMatchEach is expected to keep only "apple".
    assert matched_spans(each_matcher) == {"apple"}

    # ignore_case=False: the capitalised pattern is expected to match nothing.
    case_sensitive = RegexMatchEach(rgx=r"Apple", ignore_case=False)
    assert not list(case_sensitive.apply(space.apply(doc)))

    # sep option: with sep=" " the pattern "isapple" is expected to select
    # the two-word span "is apple".
    sep_matcher = RegexMatchSpan(rgx=r"isapple", sep=" ")
    assert matched_spans(sep_matcher) == {"is apple"}
示例#2
0
def test_inverse(doc_setup):
    """Check the spans produced by Inverse wrapped around a RegexMatchSpan.

    The child matcher is built twice (``longest_match_only`` False, then
    True). For each child, Inverse is checked with both of its own
    ``longest_match_only`` settings. Finally, constructing Inverse with zero
    or two children must raise ValueError.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)
    tc: TemporarySpanMention

    # Sanity check: the full candidate space for the fixture document.
    all_spans = {tc.get_span() for tc in space.apply(doc)}
    assert all_spans == {"This", "is", "apple", "This is", "is apple"}

    # Expected Inverse output; the same for both child configurations below.
    inverse_all = {"This", "is", "This is"}
    inverse_longest = {"This is"}

    # Child with longest_match_only=False: expected to keep both spans that
    # involve "apple".
    child = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    child_spans = {tc.get_span() for tc in child.apply(space.apply(doc))}
    assert child_spans == {"apple", "is apple"}

    # Inverse with longest_match_only=False.
    inverted = Inverse(child, longest_match_only=False)
    inverted_spans = {tc.get_span() for tc in inverted.apply(space.apply(doc))}
    assert inverted_spans == inverse_all

    # Inverse with longest_match_only=True.
    inverted = Inverse(child, longest_match_only=True)
    inverted_spans = {tc.get_span() for tc in inverted.apply(space.apply(doc))}
    assert inverted_spans == inverse_longest

    # Child with longest_match_only=True: expected to keep only "is apple".
    child = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=True
    )
    child_spans = {tc.get_span() for tc in child.apply(space.apply(doc))}
    assert child_spans == {"is apple"}

    # The Inverse-level longest_match_only=False is the one in effect.
    inverted = Inverse(child, longest_match_only=False)
    inverted_spans = {tc.get_span() for tc in inverted.apply(space.apply(doc))}
    assert inverted_spans == inverse_all

    # The Inverse-level longest_match_only=True is the one in effect.
    inverted = Inverse(child, longest_match_only=True)
    inverted_spans = {tc.get_span() for tc in inverted.apply(space.apply(doc))}
    assert inverted_spans == inverse_longest

    # Inverse requires exactly one child: none is an error ...
    with pytest.raises(ValueError):
        Inverse()

    # ... and so are two.
    with pytest.raises(ValueError):
        Inverse(child, child)
示例#3
0
def test_union(doc_setup):
    """Check Union over two RegexMatchSpan children on the fixture document.

    Each child is verified on its own first. The Union is then checked with
    ``longest_match_only`` False and True, and an unknown keyword option is
    expected to be rejected.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)
    tc: TemporarySpanMention

    # Sanity check: the full candidate space for the fixture document.
    all_spans = {tc.get_span() for tc in space.apply(doc)}
    assert all_spans == {"This", "is", "apple", "This is", "is apple"}

    # First child: the "apple" pattern, expected to keep "apple" and "is apple".
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    apple_spans = {tc.get_span() for tc in apple_matcher.apply(space.apply(doc))}
    assert apple_spans == {"apple", "is apple"}

    # Second child: the lower-case "this" pattern, expected to keep the
    # capitalised "This" spans (i.e. matching is case insensitive here).
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    this_spans = {tc.get_span() for tc in this_matcher.apply(space.apply(doc))}
    assert this_spans == {"This", "This is"}

    # Union with longest_match_only=False: everything either child keeps.
    either = Union(apple_matcher, this_matcher, longest_match_only=False)
    either_spans = {tc.get_span() for tc in either.apply(space.apply(doc))}
    assert either_spans == {"apple", "is apple", "This", "This is"}

    # Union with longest_match_only=True: the children's own
    # longest_match_only=False is ignored and only the longest spans remain.
    either = Union(apple_matcher, this_matcher, longest_match_only=True)
    either_spans = {tc.get_span() for tc in either.apply(space.apply(doc))}
    assert either_spans == {"is apple", "This is"}

    # An unsupported option name ("long_match_only") should raise an exception.
    with pytest.raises(Exception):
        Union(apple_matcher, this_matcher, long_match_only=False)
示例#4
0
def test_union(caplog, doc_setup):
    """Check Union over two RegexMatchSpan children on the fixture document.

    Each child is verified on its own first; the Union is then checked with
    ``longest_match_only`` False and True. The ``caplog`` fixture is requested
    but not used in the body.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def collect(mentions):
        """Return the set of span strings for an iterable of mentions."""
        tc: TemporarySpanMention
        return {tc.get_span() for tc in mentions}

    def run(m):
        """Apply matcher *m* to freshly generated candidates."""
        return collect(m.apply(space.apply(doc)))

    # Sanity check: the full candidate space for the fixture document.
    assert collect(space.apply(doc)) == {
        "This",
        "is",
        "apple",
        "This is",
        "is apple",
    }

    # First child: the "apple" pattern, expected to keep "apple" and "is apple".
    by_apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert run(by_apple) == {"apple", "is apple"}

    # Second child: the lower-case "this" pattern, expected to keep the
    # capitalised "This" spans (i.e. matching is case insensitive here).
    by_this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert run(by_this) == {"This", "This is"}

    # Union with longest_match_only=False: everything either child keeps.
    combined = Union(by_apple, by_this, longest_match_only=False)
    assert run(combined) == {"apple", "is apple", "This", "This is"}

    # Union with longest_match_only=True: the children's own
    # longest_match_only=False is ignored and only the longest spans remain.
    combined = Union(by_apple, by_this, longest_match_only=True)
    assert run(combined) == {"is apple", "This is"}
示例#5
0
def test_intersect(doc_setup):
    """Check Intersect over RegexMatchSpan children on the fixture document.

    Candidates are n-grams of up to three words. Each child is verified on
    its own, then Intersect is checked for two different children, for a
    child paired with itself, and with ``longest_match_only=True``.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=3)
    tc: TemporarySpanMention

    # First child: the "apple" pattern.
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    apple_spans = {tc.get_span() for tc in apple_matcher.apply(space.apply(doc))}
    assert apple_spans == {"apple", "is apple", "This is apple"}

    # Second child: the lower-case "this" pattern, expected to keep the
    # capitalised "This" spans (i.e. matching is case insensitive here).
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    this_spans = {tc.get_span() for tc in this_matcher.apply(space.apply(doc))}
    assert this_spans == {"This", "This is", "This is apple"}

    # Two different children: only the span both of them keep survives.
    both = Intersect(apple_matcher, this_matcher, longest_match_only=False)
    both_spans = {tc.get_span() for tc in both.apply(space.apply(doc))}
    assert both_spans == {"This is apple"}

    # A child intersected with itself yields that child's own result.
    both = Intersect(apple_matcher, apple_matcher, longest_match_only=False)
    both_spans = {tc.get_span() for tc in both.apply(space.apply(doc))}
    assert both_spans == {"apple", "is apple", "This is apple"}

    # longest_match_only=True on Intersect overrides the children's setting.
    both = Intersect(apple_matcher, apple_matcher, longest_match_only=True)
    both_spans = {tc.get_span() for tc in both.apply(space.apply(doc))}
    assert both_spans == {"This is apple"}