def get_html_others(self, box, page_num):
    """Collect the text inside ``box`` on a page as space-separated strings.

    Mentions of page ``page_num`` that ``get_mentions_within_bbox`` reports
    for ``box`` are visited in ``reading_order``. For each mention, every
    character returned by ``get_char_boundaries`` contributes its text and
    its four coordinates (items 1-4 of the character tuple), and every word
    returned by ``get_word_boundaries`` contributes its text. Characters
    whose text starts with a control character (``\\x00``-``\\x1F``) are
    skipped.

    :param box: bounding box passed to ``get_mentions_within_bbox``
    :param page_num: key of the page in ``self.elems``
    :return: tuple ``(node_html, char_html, top_html, left_html,
        bottom_html, right_html)``; the first two are HTML-escaped, and each
        entry in every string is followed by a single space
    """
    delim = " "
    word_parts = []
    char_parts = []
    tops = []
    lefts = []
    bottoms = []
    rights = []

    mentions = get_mentions_within_bbox(box, self.elems[page_num].mentions)
    mentions.sort(key=cmp_to_key(reading_order))

    for mention in mentions:
        for entry in self.get_char_boundaries(mention):
            # The regex needs the encoded form on Python 2 and the plain
            # text on Python 3.
            if six.PY2:
                probe = entry[0].encode("utf-8")
            elif six.PY3:
                probe = entry[0]
            if re.match(r"[\x00-\x1F]", probe):
                continue
            char_parts.append(entry[0] + delim)
            tops.append(str(entry[1]) + delim)
            lefts.append(str(entry[2]) + delim)
            bottoms.append(str(entry[3]) + delim)
            rights.append(str(entry[4]) + delim)

        for token in self.get_word_boundaries(mention):
            word_parts.append(token[0] + " ")

    # Escape special HTML chars once, on the fully assembled text.
    return (
        html.escape("".join(word_parts)),
        html.escape("".join(char_parts)),
        "".join(tops),
        "".join(lefts),
        "".join(bottoms),
        "".join(rights),
    )
def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
    """Build an ``ocrx_block`` ``<div>`` for a non-table, non-figure region.

    The block contains one ``ocrx_line`` ``<span>`` per mention found inside
    ``box`` (in ``reading_order``), and each line contains one ``ocrx_word``
    ``<span>`` per word returned by ``get_word_boundaries``.

    :param tag: region label, stored in the ``pdftotree`` attribute
    :param box: region bbox as (top, left, bottom, right)
    :param page_num: key of the page in ``self.elems``
    :return: the populated ``<div>`` DOM element
    """
    element = self.doc.createElement("div")
    element.setAttribute("class", "ocrx_block")
    element.setAttribute("pdftotree", tag)  # for backward-compatibility
    top, left, bottom, right = [int(x) for x in box]
    # ``title`` is written in left/top/right/bottom order.
    element.setAttribute("title", f"bbox {left} {top} {right} {bottom}")

    elems: List[LTTextLine] = get_mentions_within_bbox(
        box, self.elems[page_num].mentions
    )
    elems.sort(key=cmp_to_key(reading_order))
    for elem in elems:
        line_element = self.doc.createElement("span")
        element.appendChild(line_element)
        line_element.setAttribute("class", "ocrx_line")
        line_element.setAttribute(
            "title",
            f"bbox {int(elem.x0)} {int(elem.y0)} {int(elem.x1)} {int(elem.y1)}",
        )
        for word in self.get_word_boundaries(elem):
            w_top, w_left, w_bottom, w_right = [int(x) for x in word[1:]]
            word_element = self.doc.createElement("span")
            line_element.appendChild(word_element)
            word_element.setAttribute("class", "ocrx_word")
            word_element.setAttribute(
                "title", f"bbox {w_left} {w_top} {w_right} {w_bottom}"
            )
            # Do not html.escape() here: minidom escapes text nodes when the
            # document is serialized, so escaping first would emit
            # "&amp;amp;" for a literal "&".
            word_element.appendChild(self.doc.createTextNode(word[0]))
    return element
def get_html_table(self, table, page_num) -> Optional[Element]:
    """Recognize a table with tabula and build a ``<table>`` DOM element.

    Only the first table returned by tabula (``table_json[0]``) is rendered.
    Each cell becomes a ``<td>`` holding one ``ocrx_word`` ``<span>`` per
    word of the mentions found inside the cell's bbox.

    :param table: table bbox; each item is stringified and passed to tabula
        as ``area``
    :param page_num: page passed to tabula and used as key in ``self.elems``
    :return: the ``<table>`` element, or ``None`` if tabula found no table
    """
    table_str = [str(i) for i in table]
    table_json = tabula.read_pdf(
        self.pdf_file, pages=page_num, area=table_str, output_format="json"
    )
    if len(table_json) == 0:
        # Return explicitly instead of reaching ``return table_element``
        # with the name never bound.
        return None

    table_element = self.doc.createElement("table")
    for row in table_json[0]["data"]:
        row_element = self.doc.createElement("tr")
        table_element.appendChild(row_element)
        for column in row:
            col_element = self.doc.createElement("td")
            row_element.appendChild(col_element)
            box = [
                column["top"],
                column["left"],
                column["top"] + column["height"],
                column["left"] + column["width"],
            ]
            elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
            elems.sort(key=cmp_to_key(reading_order))
            for elem in elems:
                for word in self.get_word_boundaries(elem):
                    top = int(word[1])
                    left = int(word[2])
                    bottom = int(word[3])
                    right = int(word[4])
                    word_element = self.doc.createElement("span")
                    col_element.appendChild(word_element)
                    word_element.setAttribute("class", "ocrx_word")
                    word_element.setAttribute(
                        "title", f"bbox {left} {top} {right} {bottom}"
                    )
                    # No html.escape(): minidom escapes text nodes on
                    # serialization, so escaping here would double-escape.
                    word_element.appendChild(self.doc.createTextNode(word[0]))
    return table_element
def get_html_table(self, table, page_num):
    """Recognize a table with tabula and render it as an HTML string.

    Only the first table returned by tabula is rendered. Every cell becomes
    a ``<td>`` whose ``char``/``top``/``left``/``bottom``/``right``
    attributes list, space-separated, the characters found inside the cell
    and their coordinates, and whose content is the cell's words.
    Characters and words whose text starts with a control character
    (``\\x00``-``\\x1F``) are skipped.

    :param table: table bbox; each item is stringified and passed to tabula
        as ``area``
    :param page_num: page passed to tabula and used as key in ``self.elems``
    :return: the ``<table>...</table>`` markup, or ``""`` if tabula found
        no table
    """
    area = [str(coord) for coord in table]
    table_json = tabula.read_pdf(
        self.pdf_file, pages=page_num, area=area, output_format="json"
    )
    if len(table_json) == 0:
        return ""

    delim = " "
    pieces = ["<table>"]
    for row in table_json[0]["data"]:
        pieces.append("<tr>")
        for cell in row:
            cell_box = [
                cell["top"],
                cell["left"],
                cell["top"] + cell["height"],
                cell["left"] + cell["width"],
            ]
            char_parts = []
            tops = []
            lefts = []
            bottoms = []
            rights = []
            word_parts = []

            mentions = get_mentions_within_bbox(
                cell_box, self.elems[page_num].mentions
            )
            mentions.sort(key=cmp_to_key(reading_order))
            for mention in mentions:
                for entry in self.get_char_boundaries(mention):
                    if six.PY2:
                        probe = entry[0].encode("utf-8")
                    else:
                        probe = entry[0]
                    if re.match(r"[\x00-\x1F]", probe):
                        continue
                    # Single quotes delimit the attribute values below, so
                    # they are swapped for double quotes in the char list.
                    char_parts.append(entry[0].replace("'", '"') + delim)
                    tops.append(str(entry[1]) + delim)
                    lefts.append(str(entry[2]) + delim)
                    bottoms.append(str(entry[3]) + delim)
                    rights.append(str(entry[4]) + delim)

                for token in self.get_word_boundaries(mention):
                    if six.PY2:
                        probe = token[0].encode("utf-8")
                    elif six.PY3:
                        probe = token[0]
                    if re.match(r"[\x00-\x1F]", probe):
                        continue
                    word_parts.append(token[0] + delim)

            # Escape special HTML chars once per cell.
            word_td = html.escape("".join(word_parts))
            char_html = html.escape("".join(char_parts))
            pieces.append(
                "<td char='" + char_html
                + "', top='" + "".join(tops)
                + "', left='" + "".join(lefts)
                + "', bottom='" + "".join(bottoms)
                + "', right='" + "".join(rights)
                + "'>" + word_td.strip() + "</td>"
            )
        pieces.append("</tr>")
    pieces.append("</table>")
    return "".join(pieces)
def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
    """Run tabula on a region and turn the first recognized table into DOM.

    The resulting ``<table>`` (class ``ocr_table``) has one ``<tr>`` per
    row and one ``<td>`` per cell. A cell with at least one mention inside
    its bbox gets a ``title`` attribute and one ``ocrx_line`` ``<span>`` per
    mention, each holding one ``ocrx_word`` ``<span>`` per word. A cell
    without mentions is left as an empty ``<td>``.

    :param table: bbox for a table (top,left,bottom,right)
    :param page_num: 1-based page number
    :return: DOM element for the table, or ``None`` when tabula returns
        nothing
    """
    logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
    table_json = tabula.read_pdf(
        self.pdf_file, pages=page_num, area=table, output_format="json"
    )
    logger.debug(f"Tabula recognized {len(table_json)} table(s).")
    if not len(table_json):
        return None

    first_table = table_json[0]
    top, left, bottom, right = (
        int(first_table[side]) for side in ("top", "left", "bottom", "right")
    )
    table_element = self.doc.createElement("table")
    table_element.setAttribute("class", "ocr_table")
    table_element.setAttribute("title", f"bbox {left} {top} {right} {bottom}")

    page_mentions = self.elems[page_num].mentions
    for row in first_table["data"]:
        row_element = self.doc.createElement("tr")
        table_element.appendChild(row_element)
        for cell in row:
            # Tabula does not state it explicitly, but the bbox it reports
            # appears to be that of the cell itself, not of the text in it.
            # Note: bbox could be [0, 0, 0, 0] if tabula recognizes no text
            # inside.
            box: List[float] = [
                cell["top"],
                cell["left"],
                cell["top"] + cell["height"],
                cell["left"] + cell["width"],
            ]
            cell_element = self.doc.createElement("td")
            row_element.appendChild(cell_element)

            elems = get_mentions_within_bbox(box, page_mentions)
            if not elems:
                continue
            cell_element.setAttribute(
                "title",
                f"bbox {int(box[1])} {int(box[0])} {int(box[3])} {int(box[2])}",
            )
            elems.sort(key=cmp_to_key(reading_order))
            for elem in elems:
                line_element = self.doc.createElement("span")
                cell_element.appendChild(line_element)
                line_element.setAttribute("class", "ocrx_line")
                line_title = " ".join(["bbox", *(str(int(v)) for v in elem.bbox)])
                line_element.setAttribute("title", line_title)
                for word in self.get_word_boundaries(elem):
                    top, left, bottom, right = (
                        int(word[1]),
                        int(word[2]),
                        int(word[3]),
                        int(word[4]),
                    )
                    word_element = self.doc.createElement("span")
                    line_element.appendChild(word_element)
                    word_element.setAttribute("class", "ocrx_word")
                    word_element.setAttribute(
                        "title", f"bbox {left} {top} {right} {bottom}"
                    )
                    # The raw text goes in unescaped; minidom escapes it
                    # when the document is serialized.
                    word_element.appendChild(self.doc.createTextNode(word[0]))
    return table_element
def get_html_tree(self) -> str:
    """Render all pages as an hOCR-style XML document and return it as text.

    The document has a ``<head>`` with three ``ocr-*`` ``<meta>`` entries
    and a ``<body>`` with one ``ocr_page`` ``<div>`` per key of
    ``self.elems``. The regions in ``self.tree[page_num]`` are sorted with
    ``column_order`` and rendered by label: ``table`` regions through
    ``get_html_table``, ``figure`` regions as ``<figure>`` with embedded
    base64 images, and everything else through ``get_html_others``.

    Side effect: ``self.doc`` is replaced by the new document.

    :return: the document serialized with ``toprettyxml()``
    """
    doc = Document()
    self.doc = doc
    root = doc.createElement("html")
    doc.appendChild(root)
    head = doc.createElement("head")
    root.appendChild(head)
    # meta
    meta_entries = (
        ("ocr-system", f"Converted from PDF by pdftotree {__version__}"),
        ("ocr-capabilities", "ocr_page ocr_table ocrx_block ocrx_line ocrx_word"),
        ("ocr-number-of-pages", f"{len(self.elems.keys())}"),
    )
    for name, content in meta_entries:
        meta = doc.createElement("meta")
        head.appendChild(meta)
        meta.setAttribute("name", name)
        meta.setAttribute("content", content)
    # body
    body = doc.createElement("body")
    root.appendChild(body)

    # Images are written to a temp folder only long enough to be read back
    # and embedded; the context manager removes the folder afterwards.
    with tempfile.TemporaryDirectory() as dirname:
        imagewriter = ImageWriter(dirname)
        for page_num in self.elems.keys():  # 1-based
            boxes: List[Tuple[str, float, float, float, float]] = []
            for clust in self.tree[page_num]:
                tag = clust.lower().replace(" ", "_")
                for (
                    _pnum,
                    _pwidth,
                    _pheight,
                    top,
                    left,
                    bottom,
                    right,
                ) in self.tree[page_num][clust]:
                    boxes.append((tag, top, left, bottom, right))

            page = doc.createElement("div")
            page.setAttribute("class", "ocr_page")
            page.setAttribute("id", f"page_{page_num}")
            width = int(self.elems[page_num].layout.width)
            height = int(self.elems[page_num].layout.height)
            page.setAttribute(
                "title",
                f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
            )
            body.appendChild(page)

            # TODO: We need to detect columns and sort accordingly.
            boxes.sort(key=cmp_to_key(column_order))

            for box in boxes:
                if box[0] == "table":
                    table = box[1:]  # bbox
                    table_element = self.get_html_table(table, page_num)
                    if table_element is None:
                        # get_html_table may find no table in the region;
                        # appendChild(None) would raise.
                        logger.info(
                            f"No table recognized at page: {page_num} "
                            f"and area: {table}."
                        )
                        continue
                    page.appendChild(table_element)
                elif box[0] == "figure":
                    # NOTE(review): the whole 5-tuple (label included) is
                    # passed as bbox here, unlike the other calls - confirm
                    # get_mentions_within_bbox tolerates the leading label.
                    figures = get_mentions_within_bbox(
                        box, self.elems[page_num].figures
                    )
                    fig_element = doc.createElement("figure")
                    page.appendChild(fig_element)
                    top, left, bottom, right = [int(i) for i in box[1:]]
                    fig_element.setAttribute(
                        "title", f"bbox {left} {top} {right} {bottom}"
                    )
                    self._embed_figure_images(
                        fig_element, figures, imagewriter, dirname
                    )
                else:
                    element = self.get_html_others(box[0], box[1:], page_num)
                    page.appendChild(element)
    return doc.toprettyxml()

def _embed_figure_images(self, fig_element, figures, imagewriter, dirname):
    """Append an ``<img>`` with a base64 data URI for each jpg/bmp image."""
    for figure in figures:
        for img in figure:
            if not isinstance(img, LTImage):
                continue
            filename = imagewriter.export_image(img)
            # Decide the media type before reading, so files of unknown
            # type are never loaded.
            if filename.endswith("jpg"):
                mediatype = "jpeg"
            elif filename.endswith("bmp"):
                mediatype = "bmp"
            else:
                logger.info(f"Skipping an unknown type image: {filename}.")
                continue
            with open(os.path.join(dirname, filename), "rb") as f:
                encoded = b64encode(f.read()).decode("ascii")
            logger.info(f"Embedding a known type image: {filename}.")
            img_element = self.doc.createElement("img")
            fig_element.appendChild(img_element)
            img_element.setAttribute("title", bbox2str(img.bbox))
            img_element.setAttribute(
                "src", f"data:image/{mediatype};base64,{encoded}"
            )