def doc_setup(): """Set up a document.""" parser_udf = get_parser_udf() doc = Document(id=1, name="test", stable_id="1::document:0:0") doc.text = """<html> <body> <h1>test1</h1> <h2>test2</h2> <div> <h3>test3</h3> <table> <tr> <td>test4</td> <td>test5</td> </tr> </table> <table> <tr> <td>test6</td> <td>test7</td> </tr> </table> </div> <p>test8 test9</p> </body> </html>""" doc = parser_udf.apply(doc) return doc
def doc_setup():
    """Set up a document with a single sentence."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
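
# A quick way to see what split_sentences() yields is to inspect a single
# item. This is a minimal sketch: "text" and "words" are assumed to be
# among the keys of the parts dict (they map onto Sentence constructor
# fields), but the exact set of keys depends on the SpacyParser version.
def inspect_split_sentences_parts():
    """Illustrative: each yielded dict carries Sentence field values."""
    lingual_parser = SpacyParser("en")
    parts = next(iter(lingual_parser.split_sentences("This is apple")))
    assert "text" in parts  # assumed key
    assert "words" in parts  # assumed key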
def test_parser_skips_and_flattens(caplog):
    """Test if ``Parser`` skips/flattens elements."""
    caplog.set_level(logging.INFO)
    parser_udf = get_parser_udf()

    # Test if the parser skips comments
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "<html><body>Hello!<!-- comment --></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello!"

    # Test if the parser skips blacklisted elements
    doc = Document(id=2, name="test2", stable_id="2::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello!"

    # Test if the parser flattens elements
    doc = Document(id=3, name="test3", stable_id="3::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello, world!"

    # Now repeat with a different blacklist and flatten list
    parser_udf = get_parser_udf(blacklist=["meta"], flatten=["word"])

    # Test if the parser keeps an element that is no longer blacklisted
    doc = Document(id=4, name="test4", stable_id="4::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "alert('Hello');"
    assert doc.sentences[1].text == "Hello!"

    # Test if the parser skips blacklisted elements
    doc = Document(id=5, name="test5", stable_id="5::document:0:0")
    doc.text = "<html><head><meta name='keywords'></head><body>Hello!</body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello!"

    # Test if the parser no longer flattens elements outside the flatten list
    doc = Document(id=6, name="test6", stable_id="6::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello,"
    assert doc.sentences[1].text == "world!"

    # Test if the parser flattens elements in the flatten list
    doc = Document(id=7, name="test7", stable_id="7::document:0:0")
    doc.text = "<html><body><word>Hello, </word><word>world!</word></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello, world!"
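
# The flatten behavior exercised above boils down to lxml's strip_tags,
# which removes the listed tags while keeping their text. The following
# self-contained sketch uses plain lxml, independent of Fonduer's Parser:
def demo_flatten_with_lxml():
    """Show that stripping <span> and <br> merges their text into the parent."""
    import lxml.etree
    import lxml.html

    root = lxml.html.fromstring(
        "<html><body><span>Hello, <br>world!</span></body></html>"
    )
    # Remove the tags themselves; their text and tails are preserved.
    lxml.etree.strip_tags(root, "span", "br")
    assert (
        lxml.etree.tostring(root, encoding="unicode")
        == "<html><body>Hello, world!</body></html>"
    )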
def doc_setup(): """Set up document.""" doc = Document(id=1, name="test", stable_id="1::document:0:0") doc.text = "This is apple. That is orange. Where is banaba? I like Apple." lingual_parser = SpacyParser("en") # Split sentences for parts in lingual_parser.split_sentences(doc.text): parts["document"] = doc Sentence(**parts) # Enrich sentences for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences): pass # Pick one sentence and add visual information # so that all the words get aligned horizontally. sentence: Sentence = doc.sentences[0] sentence.page = [1, 1, 1, 1] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] # Assume the 2nd sentence is horizontally aligned with 1st. sentence: Sentence = doc.sentences[1] sentence.page = [1, 1, 1, 1] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [40, 50, 60, 70] sentence.right = [50, 60, 70, 80] # Assume the 3rd sentence is vertically aligned with 1st. sentence: Sentence = doc.sentences[2] sentence.page = [1, 1, 1, 1] sentence.top = [10, 10, 10, 10] sentence.bottom = [20, 20, 20, 20] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] # Assume the 4th sentence is in 2nd page. sentence: Sentence = doc.sentences[3] sentence.page = [2, 2, 2, 2] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] return doc
def parse(self, document: Document, text: str) -> Iterator[Sentence]:
    """Depth-first search over the provided tree.

    Implemented as an iterative procedure. The structure of the state
    needed to parse each node is also defined in this function.

    :param document: the Document context
    :param text: the structured text of the document (e.g. HTML)
    :return: a *generator* of Sentences.
    """
    stack = []
    root = lxml.html.fromstring(text)

    # Flatten the children of nodes whose tags are in the 'flatten' list
    if self.flatten:
        lxml.etree.strip_tags(root, self.flatten)

    # Strip comments
    lxml.etree.strip_tags(root, lxml.etree.Comment)

    # Assign the text, now stripped of the 'flatten' tags, to the document
    document.text = lxml.etree.tostring(root, encoding="unicode")

    # This dictionary contains the global state necessary to parse a
    # document and each context element. This reflects the relationships
    # defined in parser/models and holds the state necessary to create
    # the respective Contexts within the document.
    state = {
        "visited": set(),
        "parent": {},  # map of parent[child] = node used to discover child
        "context": {},  # track the Context of each node (context['td'] = Cell)
        "root": root,
        "document": document,
        "section": {"idx": 0},
        "paragraph": {"idx": 0},
        "figure": {"idx": 0},
        "caption": {"idx": 0},
        "table": {"idx": 0},
        "sentence": {"idx": 0, "abs_offset": 0},
    }
    # NOTE: Currently the helper functions directly manipulate the state
    # rather than returning a modified copy.

    # Iterative depth-first search
    stack.append(root)

    state["parent"][root] = document
    state["context"][root] = document

    tokenized_sentences: List[Sentence] = []
    while stack:
        node = stack.pop()
        if node not in state["visited"]:
            state["visited"].add(node)  # mark as visited

            # Process the node
            if self.lingual:
                tokenized_sentences.extend(self._parse_node(node, state))
            else:
                yield from self._parse_node(node, state)

            # NOTE: This reversed() order ensures that the iterative DFS
            # matches the order that a recursive DFS implementation
            # would produce.
            for child in reversed(node):
                # Skip nodes that are blacklisted
                if self.blacklist and child.tag in self.blacklist:
                    continue

                stack.append(child)

                # Store the parent of the node, which is either the parent
                # Context or, if the parent did not create a Context,
                # the node's parent Context.
                state["parent"][child] = (
                    state["context"][node]
                    if node in state["context"]
                    else state["parent"][node]
                )

    if self.lingual:
        yield from self.lingual_parser.enrich_sentences_with_NLP(
            tokenized_sentences
        )
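
# The reversed() push order in parse() is what makes the explicit-stack DFS
# visit nodes in the same preorder a recursive DFS would produce. A minimal,
# self-contained sketch over a hypothetical tree of (label, children) tuples
# demonstrates the invariant:
def preorder_iterative(node, children):
    """Return nodes in recursive-DFS preorder using an explicit stack."""
    order, stack = [], [node]
    while stack:
        n = stack.pop()
        order.append(n)
        # Push children in reverse so the first child is popped (visited) first.
        for child in reversed(children(n)):
            stack.append(child)
    return order


# Usage: "a" has children "b" (with child "d") and "c"; preorder is a, b, d, c.
_tree = ("a", [("b", [("d", [])]), ("c", [])])
assert [n[0] for n in preorder_iterative(_tree, lambda n: n[1])] == ["a", "b", "d", "c"]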
def test_ner_matchers():
    """Test the different NER-type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join(
        [
            "Tim Cook was born in USA in 1960.",
            "He is the CEO of Apple.",
            "He sold 100 million of iPhone.",
        ]
    )
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)

    # Manually attach ner_tags, as the result from spaCy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON", "PERSON", "O", "O", "O", "GPE", "O", "DATE", "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = ["O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"]

    # The lengths of words and ner_tags should match.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(
        tc.get_span() for tc in matcher.apply(space.apply(doc))
    ) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"iPhone"}
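
# Each matcher above accepts a span only when every token carries the
# corresponding NER tag. A matcher for a tag without a built-in wrapper can
# be composed the same way; the class below is hypothetical (spaCy's PRODUCT
# tag has no dedicated matcher here) and assumes Fonduer's RegexMatchEach
# API, the pattern the built-in matchers follow.
from fonduer.candidates.matchers import RegexMatchEach


class ProductMatcher(RegexMatchEach):
    """Hypothetical matcher for spans whose tokens are all tagged PRODUCT."""

    def __init__(self, *children, **kwargs):
        kwargs["attrib"] = "ner_tags"  # match against NER tags, not raw words
        kwargs["rgx"] = "PRODUCT"
        super().__init__(*children, **kwargs)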