def test_pdf_word_list_is_sorted():
    """Test if pdf_word_list is sorted as expected.

    no_image_unsorted.html is originally created from pdf_simple/no_image.pdf,
    but the order of html elements like block and word has been changed to see
    if pdf_word_list is sorted as expected.
    """
    docs_path = "tests/data/html_simple/no_image_unsorted.html"
    pdf_path = "dummy_path"
    visual_linker = VisualLinker(pdf_path=pdf_path)
    with open(docs_path) as f:
        soup = BeautifulSoup(f, "html.parser")
    page = soup.find_all("page")[0]
    pdf_word_list, coordinate_map = visual_linker._coordinates_from_HTML(page, 1)

    # Check if words are sorted by block top
    assert set([content for (_, content) in pdf_word_list[:2]]) == {"Sample", "HTML"}
    # Check if words are sorted by top
    assert [content for (_, content) in pdf_word_list[2:7]] == [
        "This",
        "is",
        "an",
        "html",
        "that",
    ]
    # Check if words are sorted by left (#449)
    assert [content for (_, content) in pdf_word_list[:2]] == ["Sample", "HTML"]
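
# The three assertions above encode the expected ordering: words sorted by
# block top first, then by word top, then by left (see #449). A minimal
# self-contained sketch of that tie-breaking sort; the (top, left, word)
# tuples below are illustrative only and not tied to VisualLinker internals.
def _sorted_by_top_then_left(words):
    """Sort (top, left, word) tuples by top, breaking ties by left."""
    return [w for (_, _, w) in sorted(words, key=lambda t: (t[0], t[1]))]


assert _sorted_by_top_then_left(
    [(10, 120, "HTML"), (10, 40, "Sample"), (50, 40, "This")]
) == ["Sample", "HTML", "This"]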
def __init__(self, structural, blacklist, flatten, lingual, strip,
             replacements, tabular, visual, pdf_path, language, **kwargs):
    """
    :param visual: boolean, if True visual features are used in the model
    :param pdf_path: directory where PDFs are saved; if a pdf file is not
        found, it will be created from the html document and saved in that
        directory
    :param replacements: a list of (_pattern_, _replace_) tuples where
        _pattern_ is a regex and _replace_ is a character string. All
        occurrences of _pattern_ in the text will be replaced by _replace_.
    """
    super(ParserUDF, self).__init__(**kwargs)

    # structural (html) setup
    self.structural = structural
    self.blacklist = blacklist if isinstance(blacklist, list) else [blacklist]
    self.flatten = flatten if isinstance(flatten, list) else [flatten]

    # lingual setup
    self.language = language
    self.strip = strip
    self.replacements = []
    for (pattern, replace) in replacements:
        self.replacements.append((re.compile(pattern, flags=re.UNICODE), replace))
    self.lingual = lingual
    self.lingual_parser = Spacy(self.language)
    if self.lingual_parser.has_tokenizer_support():
        self.tokenize_and_split_sentences = self.lingual_parser.split_sentences
        self.lingual_parser.load_lang_model()
    else:
        self.tokenize_and_split_sentences = SimpleTokenizer().parse

    if self.lingual:
        if self.lingual_parser.has_NLP_support():
            self.enrich_tokenized_sentences_with_nlp = (
                self.lingual_parser.enrich_sentences_with_NLP
            )
        else:
            logger.warning(
                "Lingual mode will be turned off, "
                "as spacy doesn't provide support for this "
                "language ({})".format(self.language)
            )
            self.lingual = False

    # tabular setup
    self.tabular = tabular

    # visual setup
    self.visual = visual
    if self.visual:
        self.pdf_path = pdf_path
        self.vizlink = VisualLinker()
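
# The replacements parameter above compiles each pattern once and applies it
# with re.sub during parsing. A minimal sketch of that behavior; the patterns
# below are illustrative, not the parser's defaults.
import re

replacements = [("[\u2010-\u2015]", "-"), ("\u00a0", " ")]
compiled = [(re.compile(p, flags=re.UNICODE), r) for (p, r) in replacements]

text = "non\u2013breaking\u00a0dash"
for rgx, repl in compiled:
    text = rgx.sub(repl, text)
assert text == "non-breaking dash"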
def test_visual_linker_not_affected_by_order_of_sentences():
    """Test if visual_linker result is not affected by the order of sentences."""
    docs_path = "tests/data/html/2N6427.html"
    pdf_path = "tests/data/pdf/2N6427.pdf"

    # Initialize preprocessor, parser, visual_linker.
    # Note that parser is initialized with `visual=False` and that visual_linker
    # will be used to attach "visual" information to sentences after parsing.
    preprocessor = HTMLDocPreprocessor(docs_path)
    parser_udf = get_parser_udf(
        structural=True, lingual=False, tabular=True, visual=False
    )
    visual_linker = VisualLinker(pdf_path=pdf_path)

    doc = parser_udf.apply(next(preprocessor.__iter__()))
    # Sort sentences by sentence.position
    doc.sentences = sorted(doc.sentences, key=attrgetter("position"))
    sentences0 = [
        sent for sent in visual_linker.link(doc.name, doc.sentences, pdf_path)
    ]
    # Sort again in case visual_linker.link changes the order
    sentences0 = sorted(sentences0, key=attrgetter("position"))

    doc = parser_udf.apply(next(preprocessor.__iter__()))
    # Shuffle
    random.shuffle(doc.sentences)
    sentences1 = [
        sent for sent in visual_linker.link(doc.name, doc.sentences, pdf_path)
    ]
    # Sort sentences by sentence.position
    sentences1 = sorted(sentences1, key=attrgetter("position"))

    # This should hold as both lists of sentences are sorted by their position
    assert all(
        sent0.position == sent1.position
        for (sent0, sent1) in zip(sentences0, sentences1)
    )

    # The following assertion should hold if the visual_linker result is not
    # affected by the order of sentences.
    assert all(
        sent0.left == sent1.left for (sent0, sent1) in zip(sentences0, sentences1)
    )
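
# The pattern above (run, shuffle the input, re-run, then align both outputs
# on a stable key before comparing a second attribute) is a generic way to
# assert order-invariance. A minimal sketch of that pattern; `fn`,
# `align_key`, and `value_key` are hypothetical placeholders, not parser API.
import random
from operator import attrgetter


def is_order_invariant(fn, items, align_key, value_key):
    """Align fn's outputs by `align_key`, then compare `value_key` values."""
    out0 = sorted(fn(list(items)), key=attrgetter(align_key))
    shuffled = list(items)
    random.shuffle(shuffled)
    out1 = sorted(fn(shuffled), key=attrgetter(align_key))
    values = attrgetter(value_key)
    return [values(x) for x in out0] == [values(x) for x in out1]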
class ParserUDF(UDF):
    def __init__(
        self,
        structural,
        blacklist,
        flatten,
        flatten_delim,
        lingual,
        strip,
        replacements,
        tabular,
        visual,
        pdf_path,
        lingual_parser,
        **kwargs
    ):
        """
        :param visual: boolean, if True visual features are used in the model
        :param pdf_path: directory where PDFs are saved; if a pdf file is not
            found, it will be created from the html document and saved in that
            directory
        :param replacements: a list of (_pattern_, _replace_) tuples where
            _pattern_ is a regex and _replace_ is a character string. All
            occurrences of _pattern_ in the text will be replaced by _replace_.
        """
        super(ParserUDF, self).__init__(**kwargs)

        # structural (html) setup
        self.structural = structural
        self.blacklist = blacklist if isinstance(blacklist, list) else [blacklist]
        self.flatten = flatten if isinstance(flatten, list) else [flatten]
        self.flatten_delim = flatten_delim

        # lingual setup
        self.lingual = lingual
        self.strip = strip
        self.replacements = []
        for (pattern, replace) in replacements:
            self.replacements.append((re.compile(pattern, flags=re.UNICODE), replace))
        if self.lingual:
            self.lingual_parser = lingual_parser
            self.lingual_parse = self.lingual_parser.parse
        else:
            self.lingual_parse = SimpleTokenizer().parse

        # tabular setup
        self.tabular = tabular

        # visual setup
        self.visual = visual
        if self.visual:
            self.pdf_path = pdf_path
            self.vizlink = VisualLinker()

    def apply(self, x, **kwargs):
        # The document is the Document model. Text is its string representation.
        document, text = x
        if self.visual:
            if not self.pdf_path:
                logger.error("Visual parsing failed: pdf_path is required")
            for _ in self.parse(document, text):
                pass
            # Add visual attributes
            filename = self.pdf_path + document.name
            missing_pdf = (
                not os.path.isfile(self.pdf_path)
                and not os.path.isfile(filename + ".pdf")
                and not os.path.isfile(filename + ".PDF")
                and not os.path.isfile(filename)
            )
            if missing_pdf:
                logger.error("Visual parsing failed: pdf files are required")
            yield from self.vizlink.parse_visual(
                document.name, document.sentences, self.pdf_path
            )
        else:
            yield from self.parse(document, text)

    def _parse_table(self, node, state):
        """Parse a table node.

        :param node: The lxml table node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if not self.tabular:
            logger.error("Called _parse_table without tabular activated.")
            return state
        if node.tag == "table":
            table_idx = state["table"]["idx"]
            stable_id = "{}::{}:{}".format(
                state["document"].name, "table", state["table"]["idx"]
            )
            # Create the Table in the DB
            parts = {}
            parts["document"] = state["document"]
            parts["stable_id"] = stable_id
            parts["position"] = table_idx
            parent = state["parent"][node]
            if isinstance(parent, Cell):
                parts["section"] = parent.table.section
            elif isinstance(parent, Section):
                parts["section"] = parent
            else:
                raise NotImplementedError("Table is not within a Section or Cell")
            state["context"][node] = Table(**parts)

            # Local state for each table. This is required to support nested
            # tables
            state["table"][table_idx] = {
                "grid": defaultdict(int),
                "cell_pos": 0,
                "row_idx": -1,
                "col_idx": 0,
            }

            # Increment table counter
            state["table"]["idx"] += 1
        elif node.tag == "tr":
            if not isinstance(state["parent"][node], Table):
                raise NotImplementedError("Table row parent must be a Table.")
            state["table"][state["parent"][node].position]["col_idx"] = 0
            state["table"][state["parent"][node].position]["row_idx"] += 1
        elif node.tag in ["td", "th"]:
            if not isinstance(state["parent"][node], Table):
                raise NotImplementedError("Cell parent must be a Table.")
            if not state["table"][state["parent"][node].position]["row_idx"] >= 0:
                raise NotImplementedError("Table cell encountered before a table row.")

            # calculate row_start/col_start
            while state["table"][state["parent"][node].position]["grid"][
                (
                    state["table"][state["parent"][node].position]["row_idx"],
                    state["table"][state["parent"][node].position]["col_idx"],
                )
            ]:  # while a cell on the grid is occupied, keep moving
                state["table"][state["parent"][node].position]["col_idx"] += 1
            col_start = state["table"][state["parent"][node].position]["col_idx"]
            row_start = state["table"][state["parent"][node].position]["row_idx"]

            # calculate row_end/col_end
            row_end = row_start
            if "rowspan" in node.attrib:
                row_end += int(node.get("rowspan")) - 1
            col_end = col_start
            if "colspan" in node.attrib:
                col_end += int(node.get("colspan")) - 1

            # update grid with occupied cells
            for r, c in itertools.product(
                list(range(row_start, row_end + 1)), list(range(col_start, col_end + 1))
            ):
                state["table"][state["parent"][node].position]["grid"][(r, c)] = 1

            # construct cell
            parts = defaultdict(list)
            parts["document"] = state["document"]
            parts["table"] = state["parent"][node]
            parts["row_start"] = row_start
            parts["row_end"] = row_end
            parts["col_start"] = col_start
            parts["col_end"] = col_end
            parts["position"] = state["table"][state["parent"][node].position][
                "cell_pos"
            ]
            stable_id = "{}::{}:{}:{}:{}".format(
                parts["document"].name,
                "cell",
                parts["table"].position,
                row_start,
                col_start,
            )
            parts["stable_id"] = stable_id
            # Create the Cell in the DB
            state["context"][node] = Cell(**parts)

            # Update position
            state["table"][state["parent"][node].position]["col_idx"] += 1
            state["table"][state["parent"][node].position]["cell_pos"] += 1
        return state

    def _parse_figure(self, node, state):
        """Parse the figure node.

        :param node: The lxml img node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["img", "figure"]:
            return state

        # Process the figure
        stable_id = "{}::{}:{}".format(
            state["document"].name, "figure", state["figure"]["idx"]
        )

        # img within a Figure gets processed in the parent Figure
        if node.tag == "img" and isinstance(state["parent"][node], Figure):
            return state

        # NOTE: We currently do NOT support nested figures.
        if not isinstance(state["parent"][node], Section):
            logger.warning("Figure is nested within {}".format(state["parent"][node]))
            return state

        # If processing a raw img
        if node.tag == "img":
            # Create the Figure entry in the DB
            state["context"][node] = Figure(
                document=state["document"],
                section=state["parent"][node],
                stable_id=stable_id,
                position=state["figure"]["idx"],
                url=node.get("src"),
            )
        elif node.tag == "figure":
            # Pull the image from a child img node, if one exists
            imgs = [child for child in node if child.tag == "img"]
            if len(imgs) > 1:
                logger.warning("Figure contains multiple images.")
                # Right now we don't support multiple URLs in the Figure context.
                # As a workaround, just ignore the outer Figure and allow
                # processing of the individual images. We ignore the
                # accompanying figcaption by marking it as visited.
                captions = [child for child in node if child.tag == "figcaption"]
                state["visited"].update(captions)
                return state
            img = imgs[0]
            state["visited"].add(img)
            # Create the Figure entry in the DB
            state["context"][node] = Figure(
                document=state["document"],
                section=state["parent"][node],
                stable_id=stable_id,
                position=state["figure"]["idx"],
                url=img.get("src"),
            )
        state["figure"]["idx"] += 1
        return state

    def _parse_sentence(self, paragraph, node, state):
        """Parse the Sentences of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        text = state["paragraph"]["text"]
        field = state["paragraph"]["field"]

        # Lingual Parse
        document = state["document"]
        for parts in self.lingual_parse(document, text):
            (_, _, _, char_end) = split_stable_id(parts["stable_id"])
            parts["document"] = document
            parts["position"] = state["sentence"]["idx"]
            abs_sentence_offset_end = (
                state["sentence"]["abs_offset"]
                + parts["char_offsets"][-1]
                + len(parts["words"][-1])
            )
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            if self.structural:
                context_node = node.getparent() if field == "tail" else node
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = [
                    "=".join(x) for x in list(context_node.attrib.items())
                ]

                # Extending html style attribute with the styles
                # from inline style class for the element.
                cur_style_index = None
                for index, attr in enumerate(parts["html_attrs"]):
                    if attr.find("style") >= 0:
                        cur_style_index = index
                        break
                styles = state["root"].find("head").find("style")
                if styles is not None:
                    for x in list(context_node.attrib.items()):
                        if x[0] == "class":
                            exp = r"(." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                            r = re.compile(exp, re.DOTALL)
                            if r.search(styles.text) is not None:
                                if cur_style_index is not None:
                                    parts["html_attrs"][cur_style_index] += (
                                        r.search(styles.text)
                                        .group(3)
                                        .replace("\r", "")
                                        .replace("\n", "")
                                        .replace("\t", "")
                                    )
                                else:
                                    parts["html_attrs"].extend(
                                        [
                                            "style="
                                            + re.sub(
                                                r"\s{1,}",
                                                " ",
                                                r.search(styles.text)
                                                .group(3)
                                                .replace("\r", "")
                                                .replace("\n", "")
                                                .replace("\t", "")
                                                .strip(),
                                            )
                                        ]
                                    )
                            break
            if self.tabular:
                parts["position"] = state["sentence"]["idx"]

                # If tabular, consider own Context first in case a Cell
                # was just created. Otherwise, defer to the parent.
                parent = paragraph
                if isinstance(parent, Paragraph):
                    parts["section"] = parent.section
                    parts["paragraph"] = parent
                    if parent.cell:
                        parts["table"] = parent.cell.table
                        parts["cell"] = parent.cell
                        parts["row_start"] = parent.cell.row_start
                        parts["row_end"] = parent.cell.row_end
                        parts["col_start"] = parent.cell.col_start
                        parts["col_end"] = parent.cell.col_end
                else:
                    raise NotImplementedError("Sentence parent must be Paragraph.")
            yield Sentence(**parts)
            state["sentence"]["idx"] += 1

    def _parse_paragraph(self, node, state):
        """Parse a Paragraph of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        # Both Paragraphs will share the same parent
        parent = (
            state["context"][node]
            if node in state["context"]
            else state["parent"][node]
        )
        for field in ["text", "tail"]:
            text = getattr(node, field)
            text = text.strip() if text and self.strip else text

            # Skip if "" or None
            if not text:
                continue

            # Run RegEx replacements
            for (rgx, replace) in self.replacements:
                text = rgx.sub(replace, text)

            # Process the Paragraph
            stable_id = "{}::{}:{}".format(
                state["document"].name, "paragraph", state["paragraph"]["idx"]
            )
            parts = {}
            parts["stable_id"] = stable_id
            parts["document"] = state["document"]
            parts["position"] = state["paragraph"]["idx"]
            if isinstance(parent, Caption):
                if parent.table:
                    parts["section"] = parent.table.section
                elif parent.figure:
                    parts["section"] = parent.figure.section
                parts["caption"] = parent
            elif isinstance(parent, Cell):
                parts["section"] = parent.table.section
                parts["cell"] = parent
            elif isinstance(parent, Section):
                parts["section"] = parent
            else:
                raise NotImplementedError(
                    "Paragraph parent must be Section, Caption, or Cell"
                )
            # Create the Paragraph entry in the DB
            paragraph = Paragraph(**parts)
            state["paragraph"]["idx"] += 1
            state["paragraph"]["text"] = text
            state["paragraph"]["field"] = field

            # Parse the Sentences in the Paragraph
            yield from self._parse_sentence(paragraph, node, state)
        return state

    def _parse_section(self, node, state):
        """Parse a Section of the node.

        Note that this implementation currently just creates a single Section
        for a document.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag != "html":
            return state

        # Add a Section
        stable_id = "{}::{}:{}".format(
            state["document"].name, "section", state["section"]["idx"]
        )
        state["context"][node] = Section(
            document=state["document"],
            stable_id=stable_id,
            position=state["section"]["idx"],
        )
        state["section"]["idx"] += 1
        return state

    def _parse_caption(self, node, state):
        """Parse a Caption of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["caption", "figcaption"]:  # captions used in Tables
            return state

        # Add a Caption
        parent = state["parent"][node]
        stable_id = "{}::{}:{}".format(
            state["document"].name, "caption", state["caption"]["idx"]
        )
        if isinstance(parent, Table):
            state["context"][node] = Caption(
                document=state["document"],
                table=parent,
                figure=None,
                stable_id=stable_id,
                position=state["caption"]["idx"],
            )
        elif isinstance(parent, Figure):
            state["context"][node] = Caption(
                document=state["document"],
                table=None,
                figure=parent,
                stable_id=stable_id,
                position=state["caption"]["idx"],
            )
        else:
            raise NotImplementedError("Caption must be a child of Table or Figure.")
        state["caption"]["idx"] += 1
        return state

    def _parse_node(self, node, state):
        """Entry point for parsing all node types.

        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :rtype: a *generator* of Sentences
        """
        # Processing on entry of node
        state = self._parse_section(node, state)

        state = self._parse_figure(node, state)

        if self.tabular:
            state = self._parse_table(node, state)

        state = self._parse_caption(node, state)

        yield from self._parse_paragraph(node, state)

    def parse(self, document, text):
        """Depth-first search over the provided tree.

        Implemented as an iterative procedure. The structure of the state
        needed to parse each node is also defined in this function.

        :param document: the Document context
        :param text: the structured text of the document (e.g. HTML)
        :rtype: a *generator* of Sentences.
        """
        stack = []
        root = lxml.html.fromstring(text)
        document.text = text

        # flattens children of node that are in the 'flatten' list
        if self.flatten:
            lxml.etree.strip_tags(root, self.flatten)

        # This dictionary contains the global state necessary to parse a
        # document and each context element. This reflects the relationships
        # defined in parser/models. This contains the state necessary to create
        # the respective Contexts within the document.
        state = {
            "visited": set(),
            "parent": {},  # map of parent[child] = node used to discover child
            "context": {},  # track the Context created by each node (context['td'] = Cell)
            "root": root,
            "document": document,
            "section": {"idx": 0},
            "paragraph": {"idx": 0},
            "figure": {"idx": 0},
            "caption": {"idx": 0},
            "table": {"idx": 0},
            "sentence": {"idx": 0, "abs_offset": 0},
        }
        # NOTE: Currently the helper functions directly manipulate the state
        # rather than returning a modified copy.

        # Iterative Depth-First Search
        stack.append(root)
        state["parent"][root] = document
        state["context"][root] = document
        while stack:
            node = stack.pop()
            if node not in state["visited"]:
                state["visited"].add(node)  # mark as visited
                # Process
                yield from self._parse_node(node, state)

                # NOTE: This reversed() order is to ensure that the iterative
                # DFS matches the order that would be produced by a recursive
                # DFS implementation.
                for child in reversed(node):
                    # Skip nodes that are comments or blacklisted
                    if child.tag is lxml.etree.Comment or (
                        self.blacklist and child.tag in self.blacklist
                    ):
                        continue
                    stack.append(child)

                    # store the parent of the node, which is either the parent
                    # Context, or if the parent did not create a Context, then
                    # use the node's parent Context.
                    state["parent"][child] = (
                        state["context"][node]
                        if node in state["context"]
                        else state["parent"][node]
                    )
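# _parse_table above places each <td>/<th> on a virtual grid: it scans right
# past occupied columns, then marks every (row, col) covered by rowspan/colspan
# as taken. A minimal self-contained sketch of that placement logic;
# place_cells is illustrative only and not part of the parser.
import itertools
from collections import defaultdict


def place_cells(rows):
    """rows: list of rows; each cell is a (rowspan, colspan) tuple.

    Return a list of (row_start, row_end, col_start, col_end) per cell.
    """
    grid = defaultdict(int)
    spans = []
    for row_idx, row in enumerate(rows):
        col_idx = 0
        for rowspan, colspan in row:
            # while a cell on the grid is occupied, keep moving
            while grid[(row_idx, col_idx)]:
                col_idx += 1
            row_end = row_idx + rowspan - 1
            col_end = col_idx + colspan - 1
            for r, c in itertools.product(
                range(row_idx, row_end + 1), range(col_idx, col_end + 1)
            ):
                grid[(r, c)] = 1
            spans.append((row_idx, row_end, col_idx, col_end))
            col_idx += 1
    return spans


# A 2x2 table whose first cell spans both rows: the lone cell in the second
# row is pushed to column 1 because column 0 is already occupied.
assert place_cells([[(2, 1), (1, 1)], [(1, 1)]]) == [
    (0, 1, 0, 0),
    (0, 0, 1, 1),
    (1, 1, 1, 1),
]
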
class ParserUDF(UDF):
    """Parser UDF class."""

    def __init__(
        self,
        structural: bool,
        blacklist: Union[str, List[str]],
        flatten: Union[str, List[str]],
        lingual: bool,
        lingual_parser: Optional[LingualParser],
        strip: bool,
        replacements: List[Tuple[str, str]],
        tabular: bool,
        visual: bool,
        vizlink: Optional[VisualLinker],
        pdf_path: Optional[str],
        language: Optional[str],
        **kwargs: Any,
    ) -> None:
        """Initialize Parser UDF.

        :param visual: boolean, if True visual features are used in the model
        :param pdf_path: directory where PDFs are saved; if a pdf file is not
            found, it will be created from the html document and saved in that
            directory
        :param replacements: a list of (_pattern_, _replace_) tuples where
            _pattern_ is a regex and _replace_ is a character string. All
            occurrences of _pattern_ in the text will be replaced by _replace_.
        """
        super().__init__(**kwargs)

        # structural (html) setup
        self.structural = structural
        self.blacklist = blacklist if isinstance(blacklist, list) else [blacklist]
        self.flatten = flatten if isinstance(flatten, list) else [flatten]

        # lingual setup
        self.language = language
        self.strip = strip
        self.replacements: List[Tuple[Pattern, str]] = []
        for (pattern, replace) in replacements:
            self.replacements.append((re.compile(pattern, flags=re.UNICODE), replace))
        self.lingual = lingual
        if lingual_parser:
            self.lingual_parser = lingual_parser
        else:
            self.lingual_parser = SpacyParser(self.language)
        # Fallback to SimpleParser if a tokenizer is not supported.
        if not self.lingual_parser.has_tokenizer_support():
            self.lingual_parser = SimpleParser()
        if self.lingual and not self.lingual_parser.has_NLP_support():
            logger.warning(
                f"Lingual mode will be turned off, "
                f"as spacy doesn't provide support for this "
                f"language ({self.language})"
            )
            self.lingual = False

        # tabular setup
        self.tabular = tabular

        # visual setup
        self.visual = visual
        self.vizlink = vizlink
        if self.visual:
            self.pdf_path = pdf_path
            if not self.vizlink:
                # Use the provided pdf_path if present
                if not self.pdf_path:
                    warnings.warn(
                        "Visual parsing failed: pdf_path is required. "
                        + "Proceeding without visual parsing.",
                        RuntimeWarning,
                    )
                    self.visual = False
                else:
                    self.vizlink = VisualLinker(pdf_path)

    def apply(  # type: ignore
        self, document: Document, pdf_path: Optional[str] = None, **kwargs: Any
    ) -> Document:
        """Parse the text of a Document instance.

        :param document: document to parse.
        :param pdf_path: path of a pdf file that the document is visually
            linked with.
        """
        try:
            [y for y in self.parse(document, document.text)]
            if self.visual:
                # Use the provided pdf_path if present
                self.pdf_path = pdf_path if pdf_path else self.pdf_path
                if not self.vizlink.is_linkable(document.name, self.pdf_path):
                    warnings.warn(
                        (
                            f"Visual parse failed. "
                            f"{self.pdf_path + document.name} not a PDF. "
                            f"Proceeding without visual parsing."
                        ),
                        RuntimeWarning,
                    )
                else:
                    # Add visual attributes
                    [
                        y
                        for y in self.vizlink.link(
                            document.name, document.sentences, self.pdf_path
                        )
                    ]
            return document
        except Exception as e:
            warnings.warn(
                (
                    f"Document {document.name} not added to database, "
                    f"because of parse error: \n{e}"
                )
            )

    def _parse_table(self, node: HtmlElement, state: Dict[str, Any]) -> Dict[str, Any]:
        """Parse a table node.

        :param node: The lxml table node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if not self.tabular:
            logger.error("Called _parse_table without tabular activated.")
            return state
        if node.tag == "table":
            table_idx = state["table"]["idx"]
            stable_id = f"{state['document'].name}::table:{state['table']['idx']}"

            # Set name for Table
            name = node.attrib["name"] if "name" in node.attrib else None

            # Create the Table in the DB
            parts = {}
            parts["document"] = state["document"]
            parts["stable_id"] = stable_id
            parts["name"] = name
            parts["position"] = table_idx
            parent = state["parent"][node]
            if isinstance(parent, Cell):
                parts["section"] = parent.table.section
            elif isinstance(parent, Section):
                parts["section"] = parent
            else:
                raise NotImplementedError("Table is not within a Section or Cell")
            state["context"][node] = Table(**parts)

            # Local state for each table. This is required to support nested
            # tables
            state["table"][table_idx] = {
                "grid": defaultdict(int),
                "cell_pos": 0,
                "row_idx": -1,
                "col_idx": 0,
            }

            # Increment table counter
            state["table"]["idx"] += 1
        elif node.tag == "tr":
            if not isinstance(state["parent"][node], Table):
                raise NotImplementedError("Table row parent must be a Table.")
            state["table"][state["parent"][node].position]["col_idx"] = 0
            state["table"][state["parent"][node].position]["row_idx"] += 1
        elif node.tag in ["td", "th"]:
            if not isinstance(state["parent"][node], Table):
                raise NotImplementedError("Cell parent must be a Table.")
            if not state["table"][state["parent"][node].position]["row_idx"] >= 0:
                raise NotImplementedError("Table cell encountered before a table row.")

            # calculate row_start/col_start
            while state["table"][state["parent"][node].position]["grid"][
                (
                    state["table"][state["parent"][node].position]["row_idx"],
                    state["table"][state["parent"][node].position]["col_idx"],
                )
            ]:  # while a cell on the grid is occupied, keep moving
                state["table"][state["parent"][node].position]["col_idx"] += 1
            col_start = state["table"][state["parent"][node].position]["col_idx"]
            row_start = state["table"][state["parent"][node].position]["row_idx"]

            # calculate row_end/col_end
            row_end = row_start
            if "rowspan" in node.attrib:
                try:
                    row_end += int(node.get("rowspan")) - 1
                except ValueError:
                    logger.error(f"Rowspan has invalid value: '{node.get('rowspan')}'")
            col_end = col_start
            if "colspan" in node.attrib:
                try:
                    col_end += int(node.get("colspan")) - 1
                except ValueError:
                    logger.error(f"Colspan has invalid value: '{node.get('colspan')}'")

            # update grid with occupied cells
            for r, c in itertools.product(
                list(range(row_start, row_end + 1)), list(range(col_start, col_end + 1))
            ):
                state["table"][state["parent"][node].position]["grid"][(r, c)] = 1

            # Set name for Cell
            name = node.attrib["name"] if "name" in node.attrib else None

            # construct cell
            parts = defaultdict(list)
            parts["document"] = state["document"]
            parts["name"] = name
            parts["table"] = state["parent"][node]
            parts["row_start"] = row_start
            parts["row_end"] = row_end
            parts["col_start"] = col_start
            parts["col_end"] = col_end
            parts["position"] = state["table"][state["parent"][node].position][
                "cell_pos"
            ]
            stable_id = (
                f"{parts['document'].name}::cell:"
                f"{parts['table'].position}:{row_start}:{col_start}"
            )
            parts["stable_id"] = stable_id
            # Create the Cell in the DB
            state["context"][node] = Cell(**parts)

            # Update position
            state["table"][state["parent"][node].position]["col_idx"] += 1
            state["table"][state["parent"][node].position]["cell_pos"] += 1
        return state

    def _parse_figure(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse the figure node.

        :param node: The lxml img node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["img", "figure"]:
            return state

        # Process the Figure
        stable_id = f"{state['document'].name}::figure:{state['figure']['idx']}"

        # Set name for Figure
        name = node.attrib["name"] if "name" in node.attrib else None

        # img within a Figure gets processed in the parent Figure
        if node.tag == "img" and isinstance(state["parent"][node], Figure):
            return state

        # NOTE: We currently do NOT support nested figures.
        parts: Dict[str, Any] = {}
        parent = state["parent"][node]
        if isinstance(parent, Section):
            parts["section"] = parent
        elif isinstance(parent, Cell):
            parts["section"] = parent.table.section
            parts["cell"] = parent
        else:
            logger.warning(f"Figure is nested within {state['parent'][node]}")
            return state
        parts["document"] = state["document"]
        parts["stable_id"] = stable_id
        parts["name"] = name
        parts["position"] = state["figure"]["idx"]

        # If processing a raw img
        if node.tag == "img":
            # Create the Figure entry in the DB
            parts["url"] = node.get("src")
            state["context"][node] = Figure(**parts)
        elif node.tag == "figure":
            # Pull the image from a child img node, if one exists
            imgs = [child for child in node if child.tag == "img"]
            # In case no child img node exists
            if len(imgs) == 0:
                logger.warning("No image found in Figure.")
                return state
            if len(imgs) > 1:
                logger.warning("Figure contains multiple images.")
                # Right now we don't support multiple URLs in the Figure context.
                # As a workaround, just ignore the outer Figure and allow
                # processing of the individual images. We ignore the
                # accompanying figcaption by marking it as visited.
                captions = [child for child in node if child.tag == "figcaption"]
                state["visited"].update(captions)
                return state
            img = imgs[0]
            state["visited"].add(img)
            # Create the Figure entry in the DB
            parts["url"] = img.get("src")
            state["context"][node] = Figure(**parts)
        state["figure"]["idx"] += 1
        return state

    def _parse_sentence(
        self, paragraph: Paragraph, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Parse the Sentences of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        text = state["paragraph"]["text"]
        field = state["paragraph"]["field"]

        # Set name for Sentence
        name = node.attrib["name"] if "name" in node.attrib else None

        # Lingual Parse
        document = state["document"]
        for parts in self.lingual_parser.split_sentences(text):
            abs_offset = state["sentence"]["abs_offset"]
            parts["abs_char_offsets"] = [
                char_offset + abs_offset for char_offset in parts["char_offsets"]
            ]
            parts["document"] = document
            # NOTE: Why do we overwrite this from the spacy parse?
            parts["position"] = state["sentence"]["idx"]
            abs_sentence_offset_end = (
                state["sentence"]["abs_offset"]
                + parts["char_offsets"][-1]
                + len(parts["words"][-1])
            )
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            parts["name"] = name
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            if self.structural:
                context_node = node.getparent() if field == "tail" else node
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = [
                    "=".join(x) for x in list(context_node.attrib.items())
                ]

                # Extending html style attribute with the styles
                # from inline style class for the element.
                cur_style_index = None
                for index, attr in enumerate(parts["html_attrs"]):
                    if attr.find("style") >= 0:
                        cur_style_index = index
                        break
                head = state["root"].find("head")
                styles = None
                if head is not None:
                    styles = head.find("style")
                if styles is not None:
                    for x in list(context_node.attrib.items()):
                        if x[0] == "class":
                            exp = r"(." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                            r = re.compile(exp, re.DOTALL)
                            if r.search(styles.text) is not None:
                                if cur_style_index is not None:
                                    parts["html_attrs"][cur_style_index] += (
                                        r.search(styles.text)
                                        .group(3)
                                        .replace("\r", "")
                                        .replace("\n", "")
                                        .replace("\t", "")
                                    )
                                else:
                                    parts["html_attrs"].extend(
                                        [
                                            "style="
                                            + re.sub(
                                                r"\s{1,}",
                                                " ",
                                                r.search(styles.text)
                                                .group(3)
                                                .replace("\r", "")
                                                .replace("\n", "")
                                                .replace("\t", "")
                                                .strip(),
                                            )
                                        ]
                                    )
                            break
            parts["position"] = state["sentence"]["idx"]

            # If tabular, consider own Context first in case a Cell
            # was just created. Otherwise, defer to the parent.
            parent = paragraph
            if isinstance(parent, Paragraph):
                parts["section"] = parent.section
                parts["paragraph"] = parent
                if parent.cell:  # if a Cell exists, self.tabular is also always True
                    parts["table"] = parent.cell.table
                    parts["cell"] = parent.cell
                    parts["row_start"] = parent.cell.row_start
                    parts["row_end"] = parent.cell.row_end
                    parts["col_start"] = parent.cell.col_start
                    parts["col_end"] = parent.cell.col_end
            else:
                raise NotImplementedError("Sentence parent must be Paragraph.")
            yield Sentence(**parts)
            state["sentence"]["idx"] += 1

    def _parse_paragraph(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Parse a Paragraph of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        # Both Paragraphs will share the same parent
        parent = (
            state["context"][node]
            if node in state["context"]
            else state["parent"][node]
        )

        # Set name for Paragraph
        name = node.attrib["name"] if "name" in node.attrib else None

        for field in ["text", "tail"]:
            text = getattr(node, field)
            text = text.strip() if text and self.strip else text

            # Skip if "" or None
            if not text:
                continue

            # Run RegEx replacements
            for (rgx, replace) in self.replacements:
                text = rgx.sub(replace, text)

            # Process the Paragraph
            stable_id = (
                f"{state['document'].name}::paragraph:{state['paragraph']['idx']}"
            )
            parts = {}
            parts["stable_id"] = stable_id
            parts["name"] = name
            parts["document"] = state["document"]
            parts["position"] = state["paragraph"]["idx"]
            if isinstance(parent, Caption):
                if parent.table:
                    parts["section"] = parent.table.section
                elif parent.figure:
                    parts["section"] = parent.figure.section
                parts["caption"] = parent
            elif isinstance(parent, Cell):
                parts["section"] = parent.table.section
                parts["cell"] = parent
            elif isinstance(parent, Section):
                parts["section"] = parent
            elif isinstance(parent, Figure):  # occurs with text in the tail of an img
                parts["section"] = parent.section
            elif isinstance(parent, Table):  # occurs with text in the tail of a table
                parts["section"] = parent.section
            else:
                raise NotImplementedError(
                    f"Para '{text}' parent must be Section, Caption, or Cell, "
                    f"not {parent}"
                )
            # Create the entry in the DB
            paragraph = Paragraph(**parts)
            state["paragraph"]["idx"] += 1
            state["paragraph"]["text"] = text
            state["paragraph"]["field"] = field

            # Parse the Sentences in the Paragraph
            yield from self._parse_sentence(paragraph, node, state)

    def _parse_section(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse a Section of the node.

        Note that this implementation creates a Section from the document's
        `html` node and from each `section` node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["html", "section"]:
            return state

        # Add a Section
        stable_id = f"{state['document'].name}::section:{state['section']['idx']}"

        # Set name for Section
        name = node.attrib["name"] if "name" in node.attrib else None

        state["context"][node] = Section(
            document=state["document"],
            name=name,
            stable_id=stable_id,
            position=state["section"]["idx"],
        )
        state["section"]["idx"] += 1
        return state

    def _parse_caption(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Parse a Caption of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        if node.tag not in ["caption", "figcaption"]:  # captions used in Tables
            return state

        # Add a Caption
        parent = state["parent"][node]
        stable_id = f"{state['document'].name}::caption:{state['caption']['idx']}"

        # Set name for Caption
        name = node.attrib["name"] if "name" in node.attrib else None

        if isinstance(parent, Table):
            state["context"][node] = Caption(
                document=state["document"],
                table=parent,
                figure=None,
                stable_id=stable_id,
                name=name,
                position=state["caption"]["idx"],
            )
        elif isinstance(parent, Figure):
            state["context"][node] = Caption(
                document=state["document"],
                table=None,
                figure=parent,
                stable_id=stable_id,
                name=name,
                position=state["caption"]["idx"],
            )
        else:
            raise NotImplementedError("Caption must be a child of Table or Figure.")
        state["caption"]["idx"] += 1
        return state

    def _parse_node(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Entry point for parsing all node types.

        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: a *generator* of Sentences
        """
        # Processing on entry of node
        state = self._parse_section(node, state)

        state = self._parse_figure(node, state)

        if self.tabular:
            state = self._parse_table(node, state)

        state = self._parse_caption(node, state)

        yield from self._parse_paragraph(node, state)

    def parse(self, document: Document, text: str) -> Iterator[Sentence]:
        """Depth-first search over the provided tree.

        Implemented as an iterative procedure. The structure of the state
        needed to parse each node is also defined in this function.

        :param document: the Document context
        :param text: the structured text of the document (e.g. HTML)
        :return: a *generator* of Sentences.
        """
        stack = []
        root = lxml.html.fromstring(text)

        # flattens children of node that are in the 'flatten' list
        if self.flatten:
            lxml.etree.strip_tags(root, self.flatten)

        # Strip comments
        lxml.etree.strip_tags(root, lxml.etree.Comment)

        # Assign the text, which was stripped of the 'flatten'-tags, to the document
        document.text = lxml.etree.tostring(root, encoding="unicode")

        # This dictionary contains the global state necessary to parse a
        # document and each context element. This reflects the relationships
        # defined in parser/models. This contains the state necessary to create
        # the respective Contexts within the document.
        state = {
            "visited": set(),
            "parent": {},  # map of parent[child] = node used to discover child
            "context": {},  # track the Context of each node (context['td'] = Cell)
            "root": root,
            "document": document,
            "section": {"idx": 0},
            "paragraph": {"idx": 0},
            "figure": {"idx": 0},
            "caption": {"idx": 0},
            "table": {"idx": 0},
            "sentence": {"idx": 0, "abs_offset": 0},
        }
        # NOTE: Currently the helper functions directly manipulate the state
        # rather than returning a modified copy.

        # Iterative Depth-First Search
        stack.append(root)
        state["parent"][root] = document
        state["context"][root] = document
        tokenized_sentences: List[Sentence] = []
        while stack:
            node = stack.pop()
            if node not in state["visited"]:
                state["visited"].add(node)  # mark as visited
                # Process
                if self.lingual:
                    tokenized_sentences += [y for y in self._parse_node(node, state)]
                else:
                    yield from self._parse_node(node, state)

                # NOTE: This reversed() order is to ensure that the iterative
                # DFS matches the order that would be produced by a recursive
                # DFS implementation.
                for child in reversed(node):
                    # Skip nodes that are blacklisted
                    if self.blacklist and child.tag in self.blacklist:
                        continue
                    stack.append(child)

                    # store the parent of the node, which is either the parent
                    # Context, or if the parent did not create a Context, then
                    # use the node's parent Context.
                    state["parent"][child] = (
                        state["context"][node]
                        if node in state["context"]
                        else state["parent"][node]
                    )
        if self.lingual:
            yield from self.lingual_parser.enrich_sentences_with_NLP(
                tokenized_sentences
            )
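

# A minimal usage sketch for the class above. Constructing ParserUDF directly
# and the argument values shown are assumptions for illustration (in practice
# a UDF runner builds these instances); `document` is assumed to be a Document
# whose raw HTML is already stored in document.text.
udf = ParserUDF(
    structural=True,
    blacklist=["style", "script"],
    flatten=["span", "br"],
    lingual=False,
    lingual_parser=None,
    strip=True,
    replacements=[("[\u2010-\u2015]", "-")],
    tabular=True,
    visual=False,
    vizlink=None,
    pdf_path=None,
    language="en",
)
doc = udf.apply(document)  # parses document.text; returns the same Document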