def test_ancestors(doc_setup, mention_ids, output_common_ancestor, output_lcad):
    """Test common_ancestor and lowest_common_ancestor_depth on 1-gram mentions.

    ``mention_ids`` picks the mentions (by index) that are passed to the two
    helpers; when it is empty, every extracted mention is used instead.
    ``output_common_ancestor`` and ``output_lcad`` are the expected results.
    """
    doc = doc_setup

    # Extract unigram mentions; there must be exactly one per word.
    ngram_space = MentionNgrams(n_min=1, n_max=1)
    mentions = list(ngram_space.apply(doc))
    word_total = sum(1 for sent in doc.sentences for _ in sent.words)
    assert len(mentions) == word_total

    # Each of the first eight mentions must come from the expected sentence.
    expected_sentence_texts = (
        "test1",
        "test2",
        "test3",
        "test4",
        "test5",
        "test6",
        "test7",
        "test8 test9",
    )
    for position, sentence_text in enumerate(expected_sentence_texts):
        assert mentions[position].sentence.text == sentence_text

    # The last sentence holds two words, hence two separate mentions.
    assert mentions[7].get_span() == "test8"
    assert mentions[8].get_span() == "test9"

    # Narrow down to the requested mentions, or keep them all.
    if len(mention_ids) > 0:
        test_mentions = [mentions[i] for i in mention_ids]
    else:
        test_mentions = mentions

    # Common ancestor of the selected mentions.
    overall_common_ancestor = common_ancestor(test_mentions)
    assert overall_common_ancestor == output_common_ancestor

    # Depth of the lowest common ancestor of the selected mentions.
    overall_lowest_common_ancestor_depth = lowest_common_ancestor_depth(
        test_mentions
    )
    assert overall_lowest_common_ancestor_depth == output_lcad
def test_regex_match(doc_setup):
    """Test the RegexMatchSpan and RegexMatchEach matchers."""
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=2)

    # A wrong option name ("regex" rather than "rgx") should raise an exception.
    with pytest.raises(Exception):
        RegexMatchSpan(regex=r"apple")

    # Applying a matcher to plain words (not TemporarySpanMention) must fail.
    span_matcher = RegexMatchSpan(rgx=r"apple")
    with pytest.raises(ValueError):
        list(span_matcher.apply(doc.sentences[0].words))

    each_matcher = RegexMatchEach(rgx=r"apple")
    with pytest.raises(ValueError):
        list(each_matcher.apply(doc.sentences[0].words))

    # RegexMatchEach applied to proper span mentions.
    each_spans = {
        tc.get_span() for tc in each_matcher.apply(ngram_space.apply(doc))
    }
    assert each_spans == {"apple"}

    # ignore_case option: with it disabled, "Apple" yields no mention.
    case_sensitive_matcher = RegexMatchEach(rgx=r"Apple", ignore_case=False)
    assert list(case_sensitive_matcher.apply(ngram_space.apply(doc))) == []

    # sep option
    sep_matcher = RegexMatchSpan(rgx=r"isapple", sep=" ")
    sep_spans = {
        tc.get_span() for tc in sep_matcher.apply(ngram_space.apply(doc))
    }
    assert sep_spans == {"is apple"}
def test_inverse(doc_setup):
    """Test inverse matcher."""
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=2)
    tc: TemporarySpanMention

    def matched_spans(matcher):
        """Collect the span text of every mention the matcher lets through."""
        return {tc.get_span() for tc in matcher.apply(ngram_space.apply(doc))}

    # All 1- and 2-gram spans available in the document.
    all_spans = {tc.get_span() for tc in ngram_space.apply(doc)}
    assert all_spans == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # Child matcher for "apple" with longest_match_only=False
    matcher0 = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched_spans(matcher0) == {
        "is apple",
        "apple",
    }

    # Inverting it yields the remaining spans.
    inverse_all = Inverse(matcher0, longest_match_only=False)
    assert matched_spans(inverse_all) == {
        "This is",
        "This",
        "is",
    }

    # Inverse with longest_match_only=True
    inverse_longest = Inverse(matcher0, longest_match_only=True)
    assert matched_spans(inverse_longest) == {"This is"}

    # Child matcher for "apple" with longest_match_only=True
    matcher0 = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=True
    )
    assert matched_spans(matcher0) == {"is apple"}

    # longest_match_only=False on Inverse is in effect.
    inverse_all = Inverse(matcher0, longest_match_only=False)
    assert matched_spans(inverse_all) == {
        "This is",
        "This",
        "is",
    }

    # longest_match_only=True on Inverse is in effect.
    inverse_longest = Inverse(matcher0, longest_match_only=True)
    assert matched_spans(inverse_longest) == {"This is"}

    # Inverse must reject a missing child matcher.
    with pytest.raises(ValueError):
        Inverse()

    # Inverse must reject two child matchers.
    with pytest.raises(ValueError):
        Inverse(matcher0, matcher0)
def test_figure_matcher(doc_setup):
    """Test matchers for figures."""
    doc = doc_setup

    # Attach two dummy figures to the document.
    Figure(id=2, document=doc)
    Figure(id=3, document=doc)
    assert len(doc.figures) == 2

    figure_space = MentionFigures()
    assert len(list(figure_space.apply(doc))) == 2

    # Matcher that accepts only the figure whose id is 2.
    matcher = LambdaFunctionFigureMatcher(func=lambda tf: tf.figure.id == 2)

    # Exactly one figure mention passes, and it is figure 2.
    accepted = list(matcher.apply(figure_space.apply(doc)))
    assert len(accepted) == 1
    accepted_ids = {tf.figure.id for tf in matcher.apply(figure_space.apply(doc))}
    assert accepted_ids == {2}

    # The keyword arg should be "func", so "function" is rejected.
    with pytest.raises(Exception):
        LambdaFunctionFigureMatcher(function=lambda tf: tf.figure.id == 2)

    # LambdaFunctionFigureMatcher only supports TemporaryFigureMention,
    # so feeding it n-gram mentions must raise.
    ngram_space = MentionNgrams(n_min=1, n_max=2)
    with pytest.raises(ValueError):
        list(matcher.apply(ngram_space.apply(doc)))
def test_lambda_function_matcher(doc_setup):
    """Test LambdaFunctionMatcher matcher."""
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=1)

    # A lambda that accepts everything lets every unigram through.
    matcher = LambdaFunctionMatcher(func=lambda x: True)
    accepted_spans = {
        tc.get_span() for tc in matcher.apply(ngram_space.apply(doc))
    }
    assert accepted_spans == {
        "This",
        "is",
        "apple",
    }

    # Applying the matcher to plain words (not TemporarySpanMention) must fail.
    with pytest.raises(ValueError):
        list(matcher.apply(doc.sentences[0].words))

    # Constructing the matcher without a func must fail.
    with pytest.raises(Exception):
        LambdaFunctionMatcher()
def test_get_ngrams_that_match_in_string(doc_setup):
    """Test that ngrams equal in string to the mention's own span are returned."""
    doc = doc_setup
    sentence: Sentence = doc.sentences[0]

    # The first sentence is expected to be visual but not tabular.
    assert sentence.is_visual()
    assert not sentence.is_tabular()

    # Extract unigram mentions; there must be exactly one per word.
    ngram_space = MentionNgrams(n_min=1, n_max=1)
    mentions = list(ngram_space.apply(doc))
    word_total = sum(1 for sent in doc.sentences for _ in sent.words)
    assert len(mentions) == word_total

    # Pick "is" from the apple sentence that matches "is" in the orange sentence.
    mention = mentions[1]
    assert mention.get_span() == "is"

    # The "is" in the orange sentence must still show up among the results.
    horizontal_ngrams = list(get_horz_ngrams(mention, from_sentence=False))
    assert "is" in horizontal_ngrams
def test_dictionary_match(doc_setup):
    """Test DictionaryMatch matcher."""
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=1)

    def matched_spans(matcher):
        """Collect the span text of every mention the matcher lets through."""
        return {tc.get_span() for tc in matcher.apply(ngram_space.apply(doc))}

    # Dictionary given as a list of str; "this" is expected to match "This".
    assert matched_spans(DictionaryMatch(d=["this"])) == {"This"}

    # Constructing the matcher without a dictionary must fail.
    with pytest.raises(Exception):
        DictionaryMatch()

    # TODO: test with plural words
    stemming_matcher = DictionaryMatch(d=["is"], stemmer=PorterStemmer())
    assert matched_spans(stemming_matcher) == {"is"}

    # Applying the matcher to plain words (not TemporarySpanMention) must fail.
    plain_matcher = DictionaryMatch(d=["this"])
    with pytest.raises(ValueError):
        list(plain_matcher.apply(doc.sentences[0].words))
def test_get_vert_ngrams(doc_setup):
    """Test if get_vert_ngrams works."""
    doc = doc_setup
    sentence: Sentence = doc.sentences[0]

    # The first sentence is expected to be visual but not tabular.
    assert sentence.is_visual()
    assert not sentence.is_tabular()

    # Extract unigram mentions; there must be exactly one per word.
    ngram_space = MentionNgrams(n_min=1, n_max=1)
    mentions = list(ngram_space.apply(doc))
    word_total = sum(1 for sent in doc.sentences for _ in sent.words)
    assert len(mentions) == word_total

    # Pick the "apple" span mention.
    mention = mentions[2]
    assert mention.get_span() == "apple"

    # Called with default options; the expected ngrams come from another
    # sentence, in this exact order.
    vertical_ngrams = list(get_vert_ngrams(mention))
    assert vertical_ngrams == ["where", "is", "banaba", "?"]
def test_cancat(doc_setup):
    """Test Concat matcher."""
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=2)

    def matched_spans(matcher):
        """Collect the span text of every mention the matcher lets through."""
        return {tc.get_span() for tc in matcher.apply(ngram_space.apply(doc))}

    # Both child matchers share the same regex options.
    regex_options = dict(search=False, full_match=False, longest_match_only=False)
    # Child matcher for "this"
    matcher0 = RegexMatchSpan(rgx=r"this", **regex_options)
    # Child matcher for "is"
    matcher1 = RegexMatchSpan(rgx=r"is", **regex_options)

    concat = Concat(matcher0, matcher1)
    assert matched_spans(concat) == {"This is"}

    # Applying the matcher to plain words (not TemporarySpanMention) must fail.
    with pytest.raises(ValueError):
        list(concat.apply(doc.sentences[0].words))

    # A Concat whose number of child matchers is not 2 fails when applied.
    single_child = Concat(matcher0)
    with pytest.raises(ValueError):
        list(single_child.apply(ngram_space.apply(doc)))

    # left_required=False
    left_optional = Concat(matcher0, matcher1, left_required=False)
    assert matched_spans(left_optional) == {
        "This is",
        "is apple",
    }

    # right_required=False
    right_optional = Concat(matcher0, matcher1, right_required=False)
    assert matched_spans(right_optional) == {"This is"}

    # permutations=False: children in swapped order match nothing.
    swapped_strict = Concat(matcher1, matcher0, permutations=False)
    assert set(swapped_strict.apply(ngram_space.apply(doc))) == set()

    # permutations=True: children in swapped order still match.
    swapped_permuted = Concat(matcher1, matcher0, permutations=True)
    assert matched_spans(swapped_permuted) == {"This is"}
def test_union(doc_setup):
    """Test union matcher.

    NOTE(review): another ``test_union`` is defined later in this file; the
    later definition rebinds the name, so this body is shadowed at import
    time. Consider renaming one of the two.
    """
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=2)
    tc: TemporarySpanMention

    def matched_spans(matcher):
        """Collect the span text of every mention the matcher lets through."""
        return {tc.get_span() for tc in matcher.apply(ngram_space.apply(doc))}

    # All 1- and 2-gram spans available in the document.
    all_spans = {tc.get_span() for tc in ngram_space.apply(doc)}
    assert all_spans == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # Child matcher for "apple"
    matcher0 = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched_spans(matcher0) == {
        "is apple",
        "apple",
    }

    # Child matcher for "this" (case insensitive)
    matcher1 = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched_spans(matcher1) == {
        "This is",
        "This",
    }

    # The union yields what either child yields.
    union_all = Union(matcher0, matcher1, longest_match_only=False)
    assert matched_spans(union_all) == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each matcher is ignored.
    union_longest = Union(matcher0, matcher1, longest_match_only=True)
    assert matched_spans(union_longest) == {
        "This is",
        "is apple",
    }

    # Unsupported option should raise an exception
    with pytest.raises(Exception):
        Union(matcher0, matcher1, long_match_only=False)
def test_union(caplog, doc_setup):
    """Test union matcher.

    This is the definition that remains bound to ``test_union`` (an earlier
    function of the same name in this file is rebound by it), so it also
    carries the unsupported-option check to make sure that check is run.

    ``caplog`` is accepted as a fixture but is not used in the body.
    """
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)
    tc: TemporarySpanMention

    # All 1- and 2-gram spans available in the document.
    assert {tc.get_span() for tc in space.apply(doc)} == {
        "This is",
        "is apple",
        "This",
        "is",
        "apple",
    }

    # Child matcher for "apple"
    matcher0 = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert {tc.get_span() for tc in matcher0.apply(space.apply(doc))} == {
        "is apple",
        "apple",
    }

    # Child matcher for "this" (case insensitive)
    matcher1 = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert {tc.get_span() for tc in matcher1.apply(space.apply(doc))} == {
        "This is",
        "This",
    }

    # The union yields what either child yields.
    matcher = Union(matcher0, matcher1, longest_match_only=False)
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each matcher is ignored.
    matcher = Union(matcher0, matcher1, longest_match_only=True)
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {
        "This is",
        "is apple",
    }

    # Unsupported option should raise an exception
    with pytest.raises(Exception):
        Union(matcher0, matcher1, long_match_only=False)
def test_intersect(doc_setup):
    """Test intersect matcher."""
    doc = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=3)
    tc: TemporarySpanMention

    def matched_spans(matcher):
        """Collect the span text of every mention the matcher lets through."""
        return {tc.get_span() for tc in matcher.apply(ngram_space.apply(doc))}

    # Child matcher for "apple"
    matcher0 = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert matched_spans(matcher0) == {
        "This is apple",
        "is apple",
        "apple",
    }

    # Child matcher for "this" (case insensitive)
    matcher1 = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert matched_spans(matcher1) == {
        "This is apple",
        "This is",
        "This",
    }

    # Intersection of matcher0 and matcher1
    both = Intersect(matcher0, matcher1, longest_match_only=False)
    assert matched_spans(both) == {"This is apple"}

    # Intersection of matcher0 and matcher0
    same_twice = Intersect(matcher0, matcher0, longest_match_only=False)
    assert matched_spans(same_twice) == {
        "This is apple",
        "is apple",
        "apple",
    }

    # longest_match_only=True overrides that of child matchers.
    same_twice_longest = Intersect(matcher0, matcher0, longest_match_only=True)
    assert matched_spans(same_twice_longest) == {"This is apple"}
def test_get_horz_ngrams(doc_setup):
    """Test if get_horz_ngrams works."""
    doc = doc_setup
    sentence: Sentence = doc.sentences[0]

    # The first sentence is expected to be visual but not tabular.
    assert sentence.is_visual()
    assert not sentence.is_tabular()

    # Extract unigram mentions; there must be exactly one per word.
    ngram_space = MentionNgrams(n_min=1, n_max=1)
    mentions = list(ngram_space.apply(doc))
    word_total = sum(1 for sent in doc.sentences for _ in sent.words)
    assert len(mentions) == word_total

    # Pick the "apple" span mention.
    mention = mentions[2]
    assert mention.get_span() == "apple"

    # Default options (from_sentence=True, ie ngrams from all aligned
    # Sentences but its Sentence)
    default_ngrams = list(get_horz_ngrams(mention))
    assert default_ngrams == ["that", "is", "orange", "."]

    # from_sentence=False (ie all aligned ngrams but itself)
    assert mention.get_span() == "apple"
    aligned_ngrams = list(get_horz_ngrams(mention, from_sentence=False))
    assert aligned_ngrams == ["this", "is", ".", "that", "is", "orange", "."]

    # attrib="lemmas"
    lemma_ngrams = list(get_horz_ngrams(mention, attrib="lemmas"))
    assert lemma_ngrams == ["that", "be", "orange", "."]

    # attrib="pos_tags"
    pos_ngrams = list(get_horz_ngrams(mention, attrib="pos_tags"))
    assert pos_ngrams == ["dt", "vbz", "jj", "."]

    # lower=False keeps the original casing.
    cased_ngrams = list(get_horz_ngrams(mention, lower=False, from_sentence=False))
    assert cased_ngrams == ["This", "is", ".", "That", "is", "orange", "."]

    # Pick the "This" span mention.
    mention = mentions[0]
    assert mention.get_span() == "This"
    aligned_ngrams = list(get_horz_ngrams(mention, from_sentence=False))
    assert aligned_ngrams == ["is", "apple", ".", "that", "is", "orange", "."]

    # n_max=2 adds the 2-grams to the result.
    bigram_ngrams = list(get_horz_ngrams(mention, n_max=2, from_sentence=False))
    assert bigram_ngrams == [
        "is apple",
        "apple.",
        "is",
        "apple",
        ".",
        "that is",
        "is orange",
        "orange.",
        "that",
        "is",
        "orange",
        ".",
    ]
def test_ner_matchers():
    """Test different ner type matchers.

    Builds a three-sentence document, overrides its NER tags by hand, and
    checks that each NER-based matcher extracts the expected span.
    """
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join(
        [
            "Tim Cook was born in USA in 1960.",
            "He is the CEO of Apple.",
            "He sold 100 million of iPhone.",
        ]
    )

    # Split the text into sentences and attach each one to the document.
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)

    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = [
        "O",
        "O",
        "CARDINAL",
        "CARDINAL",
        "O",
        "MISC",
        "O",
    ]

    # The length of words and that of ner_tags should match for every
    # sentence that was tagged by hand, including the third one, which the
    # NumberMatcher and MiscMatcher assertions below rely on.
    for index in range(3):
        tagged_sentence = doc.sentences[index]
        assert len(tagged_sentence.words) == len(tagged_sentence.ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {
        "100 million"
    }

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert {tc.get_span() for tc in matcher.apply(space.apply(doc))} == {"iPhone"}