def test_regex_match(doc_setup):
    """Exercise RegexMatchSpan and RegexMatchEach on the fixture document.

    Checks, in order: an unknown keyword is rejected, applying a matcher to
    plain words raises ``ValueError``, a basic match, ``ignore_case=False``,
    and the ``sep`` option.
    """
    document = doc_setup
    ngrams = MentionNgrams(n_min=1, n_max=2)

    def matched_spans(candidate_matcher):
        # Candidates are regenerated on every call so each check is independent.
        return {
            mention.get_span()
            for mention in candidate_matcher.apply(ngrams.apply(document))
        }

    # A wrong option name ("regex" instead of "rgx") should raise an exception.
    with pytest.raises(Exception):
        RegexMatchSpan(regex=r"apple")

    # Both matcher flavours must raise when fed the words of a sentence
    # rather than TemporarySpanMention candidates.
    for matcher_cls in (RegexMatchSpan, RegexMatchEach):
        matcher = matcher_cls(rgx=r"apple")
        with pytest.raises(ValueError):
            list(matcher.apply(document.sentences[0].words))

    # ``matcher`` is now the RegexMatchEach instance; check that it works.
    assert matched_spans(matcher) == {"apple"}

    # With ignore_case=False, the capitalised pattern matches nothing.
    case_sensitive = RegexMatchEach(rgx=r"Apple", ignore_case=False)
    assert not list(case_sensitive.apply(ngrams.apply(document)))

    # With sep=" ", the pattern "isapple" is expected to match "is apple".
    separated = RegexMatchSpan(rgx=r"isapple", sep=" ")
    assert matched_spans(separated) == {"is apple"}
def test_inverse(doc_setup):
    """Check Inverse against a RegexMatchSpan child matcher.

    Verifies that ``longest_match_only`` set on the Inverse itself decides the
    result whichever value the child was built with, and that Inverse accepts
    exactly one child matcher.
    """
    document = doc_setup
    ngrams = MentionNgrams(n_min=1, n_max=2)

    def spans_of(applier):
        # ``applier`` is anything with ``apply(document_or_candidates)``.
        mention: TemporarySpanMention
        return {mention.get_span() for mention in applier}

    def matched(candidate_matcher):
        return spans_of(candidate_matcher.apply(ngrams.apply(document)))

    # Sanity check: every 1- and 2-gram the mention space produces.
    assert spans_of(ngrams.apply(document)) == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    non_apple_all = {"This is", "This", "is"}
    non_apple_longest = {"This is"}

    # Child matches any span that contains "apple", keeping every match.
    child = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched(child) == {"is apple", "apple"}

    # Inverting it yields the remaining spans ...
    assert matched(Inverse(child, longest_match_only=False)) == non_apple_all
    # ... and only the longest of them when longest_match_only=True.
    assert matched(Inverse(child, longest_match_only=True)) == non_apple_longest

    # Same child pattern, but the child itself keeps only the longest match.
    child = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=True
    )
    assert matched(child) == {"is apple"}

    # The Inverse's own longest_match_only=False is the setting in effect.
    assert matched(Inverse(child, longest_match_only=False)) == non_apple_all
    # The Inverse's own longest_match_only=True is the setting in effect.
    assert matched(Inverse(child, longest_match_only=True)) == non_apple_longest

    # Inverse must raise when no child matcher is provided.
    with pytest.raises(ValueError):
        Inverse()

    # Inverse must raise when two child matchers are provided.
    with pytest.raises(ValueError):
        Inverse(child, child)
def test_union(doc_setup):
    """Check the Union of two RegexMatchSpan matchers.

    NOTE(review): a second function named ``test_union`` is defined later in
    this file. That later definition rebinds the name, so pytest only collects
    it and this body does not run. Consider renaming one of the two.
    """
    document = doc_setup
    ngrams = MentionNgrams(n_min=1, n_max=2)

    def spans_of(mentions):
        mention: TemporarySpanMention
        return {mention.get_span() for mention in mentions}

    def matched(candidate_matcher):
        # Candidates are regenerated for every matcher under test.
        return spans_of(candidate_matcher.apply(ngrams.apply(document)))

    # Sanity check: every 1- and 2-gram the mention space produces.
    assert spans_of(ngrams.apply(document)) == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # Any span that contains "apple".
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched(apple_matcher) == {"is apple", "apple"}

    # Spans matching "this" case-insensitively (search=False, full_match=False).
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched(this_matcher) == {"This is", "This"}

    # The union keeps everything either child matched.
    keep_all = Union(apple_matcher, this_matcher, longest_match_only=False)
    assert matched(keep_all) == {"is apple", "apple", "This is", "This"}

    # The children's longest_match_only=False is ignored: the Union's own
    # longest_match_only=True decides the result.
    keep_longest = Union(apple_matcher, this_matcher, longest_match_only=True)
    assert matched(keep_longest) == {"This is", "is apple"}

    # An unsupported option ("long_match_only") should raise an exception.
    with pytest.raises(Exception):
        Union(apple_matcher, this_matcher, long_match_only=False)
def test_union(caplog, doc_setup):
    """Test the Union matcher, including rejection of unsupported options.

    NOTE(review): an earlier function in this file is also named
    ``test_union``. This later definition rebinds the name, so it is the one
    pytest collects. The earlier definition's unsupported-option check is
    repeated at the end of this body so that it still runs. Consider renaming
    one of the two functions.

    The ``caplog`` fixture is requested but not used in the body; it is kept
    so the signature stays unchanged.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    tc: TemporarySpanMention
    # Sanity check: every 1- and 2-gram the mention space produces.
    assert {tc.get_span() for tc in space.apply(doc)} == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # Match any span that contains "apple".
    matcher0 = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert {tc.get_span() for tc in matcher0.apply(space.apply(doc))} == {
        "is apple",
        "apple",
    }

    # Match spans against "this", case-insensitively.
    matcher1 = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert {tc.get_span() for tc in matcher1.apply(space.apply(doc))} == {
        "This is",
        "This",
    }

    # The union keeps everything either child matched.
    matcher = Union(matcher0, matcher1, longest_match_only=False)
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each child matcher is ignored; the Union's wins.
    matcher = Union(matcher0, matcher1, longest_match_only=True)
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {
        "This is",
        "is apple",
    }

    # Unsupported option should raise an exception. This check exists in the
    # earlier, shadowed test_union and would otherwise never run.
    with pytest.raises(Exception):
        Union(matcher0, matcher1, long_match_only=False)
def test_intersect(doc_setup):
    """Check the Intersect of RegexMatchSpan matchers over 1- to 3-grams.

    Covers the intersection of two different matchers, of a matcher with
    itself, and the effect of ``longest_match_only=True`` on the Intersect.
    """
    document = doc_setup
    ngrams = MentionNgrams(n_min=1, n_max=3)

    def matched(candidate_matcher):
        # Candidates are regenerated for every matcher under test.
        mention: TemporarySpanMention
        return {
            mention.get_span()
            for mention in candidate_matcher.apply(ngrams.apply(document))
        }

    # Any span that contains "apple".
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    apple_spans = {"This is apple", "is apple", "apple"}
    assert matched(apple_matcher) == apple_spans

    # Spans matching "this" case-insensitively (search=False, full_match=False).
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched(this_matcher) == {"This is apple", "This is", "This"}

    # Only the full trigram satisfies both matchers.
    both = Intersect(apple_matcher, this_matcher, longest_match_only=False)
    assert matched(both) == {"This is apple"}

    # Intersecting a matcher with itself gives that matcher's own result.
    same = Intersect(apple_matcher, apple_matcher, longest_match_only=False)
    assert matched(same) == apple_spans

    # longest_match_only=True on the Intersect overrides the child matchers.
    longest = Intersect(apple_matcher, apple_matcher, longest_match_only=True)
    assert matched(longest) == {"This is apple"}