def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
    """Build an ``ocrx_block`` div for a box that is neither table nor figure.

    :param tag: label stored in the ``pdftotree`` attribute of the div
    :param box: bbox of the block, unpacked as (top, left, bottom, right)
    :param page_num: key of the page in ``self.elems``
    :return: DOM element holding one ``ocrx_line`` span per text line, each
        of which holds one ``ocrx_word`` span per word
    """
    doc = self.doc
    block = doc.createElement("div")
    block.setAttribute("class", "ocrx_block")
    block.setAttribute("pdftotree", tag)  # for backward-compatibility
    y0, x0, y1, x1 = [int(v) for v in box]
    # The title lists the coordinates as left, top, right, bottom.
    block.setAttribute("title", f"bbox {x0} {y0} {x1} {y1}")

    lines: List[LTTextLine] = get_mentions_within_bbox(
        box, self.elems[page_num].mentions
    )
    lines.sort(key=cmp_to_key(reading_order))
    for line in lines:
        line_span = doc.createElement("span")
        line_span.setAttribute("class", "ocrx_line")
        line_span.setAttribute("title", bbox2str(line.bbox))
        block.appendChild(line_span)

        for word in self.get_word_boundaries(line):
            # Each word is (text, top, left, bottom, right).
            w_top, w_left, w_bottom, w_right = [int(v) for v in word[1:]]
            word_span = doc.createElement("span")
            word_span.setAttribute("class", "ocrx_word")
            word_span.setAttribute(
                "title", f"bbox {w_left} {w_top} {w_right} {w_bottom}"
            )
            # No need to escape text here as minidom will do.
            word_span.appendChild(doc.createTextNode(word[0]))
            line_span.appendChild(word_span)
    return block
def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
    """Recognize a table using tabula and return a DOM element.

    Only the first table reported by tabula for the given area is rendered.

    :param table: bbox for a table (top,left,bottom,right)
    :param page_num: 1-based page number
    :return: DOM element for a table, or None if tabula recognizes no table
    """
    logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
    loglevel = logging.getLogger("pdftotree").getEffectiveLevel()
    table_json = tabula.read_pdf(
        self.pdf_file,
        pages=page_num,
        area=table,
        output_format="json",
        # Let tabula print its own messages only when we log at DEBUG or lower.
        silent=loglevel > logging.DEBUG,
    )
    logger.debug(f"Tabula recognized {len(table_json)} table(s).")
    if len(table_json) == 0:
        return None

    doc = self.doc
    recognized = table_json[0]
    table_element = doc.createElement("table")
    table_element.setAttribute("class", "ocr_table")
    t_top = int(recognized["top"])
    t_left = int(recognized["left"])
    t_bottom = int(recognized["bottom"])
    t_right = int(recognized["right"])
    table_element.setAttribute("title", f"bbox {t_left} {t_top} {t_right} {t_bottom}")

    page_mentions = self.elems[page_num].mentions
    for row in recognized["data"]:
        row_element = doc.createElement("tr")
        table_element.appendChild(row_element)
        for cell in row:
            # It is not explicitly stated anywhere but tabula seems to use the cell
            # bbox to represent that of cell itself rather than that of text inside.
            # Note: bbox could be [0, 0, 0, 0] if tabula recognizes no text inside.
            box: List[float] = [
                cell["top"],
                cell["left"],
                cell["top"] + cell["height"],
                cell["left"] + cell["width"],
            ]
            cell_element = doc.createElement("td")
            row_element.appendChild(cell_element)
            lines = get_mentions_within_bbox(box, page_mentions)
            if len(lines) == 0:
                # Empty cells stay in the row but get neither title nor content.
                continue
            c_top, c_left, c_bottom, c_right = [int(v) for v in box]
            cell_element.setAttribute(
                "title", f"bbox {c_left} {c_top} {c_right} {c_bottom}"
            )
            lines.sort(key=cmp_to_key(reading_order))
            for line in lines:
                line_span = doc.createElement("span")
                cell_element.appendChild(line_span)
                line_span.setAttribute("class", "ocrx_line")
                line_span.setAttribute("title", bbox2str(line.bbox))
                for word in self.get_word_boundaries(line):
                    # Each word is indexed as (text, top, left, bottom, right).
                    w_top, w_left, w_bottom, w_right = [
                        int(word[k]) for k in range(1, 5)
                    ]
                    word_span = doc.createElement("span")
                    line_span.appendChild(word_span)
                    word_span.setAttribute("class", "ocrx_word")
                    word_span.setAttribute(
                        "title", f"bbox {w_left} {w_top} {w_right} {w_bottom}"
                    )
                    # No need to escape text here as minidom will do.
                    word_span.appendChild(doc.createTextNode(word[0]))
    return table_element
def get_html_tree(self) -> str:
    """Render every page in ``self.elems`` as one HTML document string.

    The document has a ``head`` with three ``ocr-*`` meta tags and a ``body``
    with one ``ocr_page`` div per page. Each box in ``self.tree`` becomes a
    ``table``, a ``figure`` or an ``ocrx_block`` div depending on its tag.

    Images embedded in figures are exported to a temporary directory that is
    removed before this method returns. As a side effect ``self.doc`` is
    replaced with the newly created document.

    :return: the document serialized with ``Document.toprettyxml()``
    """
    doc = Document()
    self.doc = doc
    html = doc.createElement("html")
    doc.appendChild(html)
    head = doc.createElement("head")
    html.appendChild(head)
    # meta
    meta_tags = (
        ("ocr-system", f"Converted from PDF by pdftotree {__version__}"),
        ("ocr-capabilities", "ocr_page ocr_table ocrx_block ocrx_line ocrx_word"),
        ("ocr-number-of-pages", f"{len(self.elems)}"),
    )
    for meta_name, meta_content in meta_tags:
        meta = doc.createElement("meta")
        head.appendChild(meta)
        meta.setAttribute("name", meta_name)
        meta.setAttribute("content", meta_content)
    # body
    body = doc.createElement("body")
    html.appendChild(body)

    # Images are temporarily saved here; the context manager removes the
    # directory and its contents even if rendering fails half-way.
    with tempfile.TemporaryDirectory() as dirname:
        imagewriter = ImageWriter(dirname)
        for page_num in self.elems:  # 1-based
            boxes: List[Tuple[str, float, float, float, float]] = []
            page_tree = self.tree[page_num]
            for clust in page_tree:
                for (_pnum, _pwidth, _pheight, top, left, bottom, right) in page_tree[
                    clust
                ]:
                    boxes.append(
                        (clust.lower().replace(" ", "_"), top, left, bottom, right)
                    )

            page = doc.createElement("div")
            page.setAttribute("class", "ocr_page")
            page.setAttribute("id", f"page_{page_num}")
            width = int(self.elems[page_num].layout.width)
            height = int(self.elems[page_num].layout.height)
            page.setAttribute(
                "title",
                f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
            )
            body.appendChild(page)

            # TODO: We need to detect columns and sort accordingly.
            boxes.sort(key=cmp_to_key(column_order))

            for box in boxes:
                if box[0] == "table":
                    table = box[1:]  # bbox
                    table_element = self.get_html_table(table, page_num)
                    # get_html_table returns None when tabula recognizes no
                    # table; appending None would raise inside minidom.
                    if table_element is None:
                        logger.warning(
                            f"No table recognized at page: {page_num} "
                            f"and area: {table}; skipping."
                        )
                        continue
                    page.appendChild(table_element)
                elif box[0] == "figure":
                    page.appendChild(
                        self._get_html_figure(box, page_num, imagewriter, dirname)
                    )
                else:
                    element = self.get_html_others(box[0], box[1:], page_num)
                    page.appendChild(element)
    return doc.toprettyxml()


def _get_html_figure(self, box, page_num, imagewriter, dirname) -> Element:
    """Build a ``figure`` element embedding the images found inside *box*.

    :param box: (tag, top, left, bottom, right) tuple of the figure
    :param page_num: key of the page in ``self.elems``
    :param imagewriter: object whose ``export_image`` saves an image under
        *dirname* and returns the file name
    :param dirname: directory the exported image files are read back from
    :return: DOM ``figure`` element with one ``img`` child per embedded image
    """
    doc = self.doc
    # The full tuple (tag included) is passed on unchanged, as before.
    elems = get_mentions_within_bbox(box, self.elems[page_num].figures)
    fig_element = doc.createElement("figure")
    top, left, bottom, right = [int(i) for i in box[1:]]
    fig_element.setAttribute("title", f"bbox {left} {top} {right} {bottom}")
    for elem in elems:
        for img in elem:
            if not isinstance(img, LTImage):
                continue
            filename = imagewriter.export_image(img)
            # Decide on the media type first so that files which are going
            # to be skipped are never read and encoded.
            if filename.endswith("jpg"):
                mediatype = "jpeg"
            elif filename.endswith("bmp"):
                mediatype = "bmp"
            else:
                logger.info(f"Skipping an unknown type image: {filename}.")
                continue
            logger.info(f"Embedding a known type image: {filename}.")
            with open(os.path.join(dirname, filename), "rb") as f:
                encoded = b64encode(f.read()).decode("ascii")
            img_element = doc.createElement("img")
            fig_element.appendChild(img_element)
            img_element.setAttribute("title", bbox2str(img.bbox))
            img_element.setAttribute(
                "src", f"data:image/{mediatype};base64,{encoded}"
            )
    return fig_element