예제 #1
0
    def get_html_others(self, box, page_num):
        """Collect word and character text/coordinates for one region.

        The mentions of page ``page_num`` that fall inside ``box`` are
        visited in reading order. Every character that is not matched by
        the control-character pattern contributes its text and its four
        coordinates; every word contributes its text.

        :param box: bbox handed to ``get_mentions_within_bbox``
        :param page_num: key into ``self.elems`` for the page to read
        :return: tuple ``(node_html, char_html, top_html, left_html,
            bottom_html, right_html)``; each is a string of space-terminated
            items, and the two text strings are HTML-escaped
        """
        sep = " "
        word_parts = []
        char_parts = []
        top_parts = []
        left_parts = []
        bottom_parts = []
        right_parts = []

        elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
        elems.sort(key=cmp_to_key(reading_order))
        for elem in elems:
            for char in self.get_char_boundaries(elem):
                if six.PY2:
                    temp = char[0].encode("utf-8")
                elif six.PY3:
                    temp = char[0]
                # Skip anything the control-character pattern matches.
                if re.match(r"[\x00-\x1F]", temp):
                    continue
                char_parts.append(char[0] + sep)
                top_parts.append(str(char[1]) + sep)
                left_parts.append(str(char[2]) + sep)
                bottom_parts.append(str(char[3]) + sep)
                right_parts.append(str(char[4]) + sep)
            for word in self.get_word_boundaries(elem):
                word_parts.append(word[0] + " ")

        # Only the two text strings need HTML escaping; the rest are numbers.
        node_html = html.escape("".join(word_parts))
        char_html = html.escape("".join(char_parts))
        return (
            node_html,
            char_html,
            "".join(top_parts),
            "".join(left_parts),
            "".join(bottom_parts),
            "".join(right_parts),
        )
예제 #2
0
    def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
        """Build an hOCR-style block element for one labelled region.

        The mentions of page ``page_num`` that fall inside ``box`` are
        emitted in reading order as ``span.ocrx_line`` children, each holding
        one ``span.ocrx_word`` per word.

        :param tag: region label, stored in the ``pdftotree`` attribute
        :param box: bbox of the region as (top, left, bottom, right)
        :param page_num: key into ``self.elems`` for the page to read
        :return: the ``div.ocrx_block`` element for the region
        """
        element = self.doc.createElement("div")
        element.setAttribute("class", "ocrx_block")
        element.setAttribute("pdftotree", tag)  # for backward-compatibility
        top, left, bottom, right = [int(x) for x in box]
        element.setAttribute("title", f"bbox {left} {top} {right} {bottom}")
        elems: List[LTTextLine] = get_mentions_within_bbox(
            box, self.elems[page_num].mentions
        )
        elems.sort(key=cmp_to_key(reading_order))
        for elem in elems:
            line_element = self.doc.createElement("span")
            element.appendChild(line_element)
            line_element.setAttribute("class", "ocrx_line")
            line_element.setAttribute(
                "title",
                f"bbox {int(elem.x0)} {int(elem.y0)} {int(elem.x1)} {int(elem.y1)}",
            )
            for word in self.get_word_boundaries(elem):
                # Separate names so the block-level bbox values are not shadowed.
                w_top, w_left, w_bottom, w_right = [int(x) for x in word[1:]]

                word_element = self.doc.createElement("span")
                line_element.appendChild(word_element)
                word_element.setAttribute("class", "ocrx_word")
                word_element.setAttribute(
                    "title", f"bbox {w_left} {w_top} {w_right} {w_bottom}"
                )
                # The raw text is inserted as-is: minidom escapes text nodes
                # when the document is serialized, so escaping here as well
                # would emit "&amp;amp;" for a literal "&".
                word_element.appendChild(self.doc.createTextNode(word[0]))
        return element
예제 #3
0
    def get_html_table(self, table, page_num) -> "Optional[Element]":
        """Recognize a table with tabula and return it as a DOM element.

        Only the first table reported by tabula is used. Each cell becomes a
        ``td`` holding one ``span.ocrx_word`` per word of the mentions that
        fall inside the cell's bbox, in reading order.

        :param table: bbox of the table; every entry is stringified and
            passed to tabula as ``area``
        :param page_num: page passed to tabula and used as key into
            ``self.elems``
        :return: the ``table`` element, or ``None`` when tabula finds no table
        """
        table_str = [str(i) for i in table]
        table_json = tabula.read_pdf(
            self.pdf_file, pages=page_num, area=table_str, output_format="json"
        )
        # Without this guard ``table_element`` would be unbound at the return
        # statement and raise UnboundLocalError.
        if len(table_json) == 0:
            return None

        table_element = self.doc.createElement("table")
        for row in table_json[0]["data"]:
            row_element = self.doc.createElement("tr")
            table_element.appendChild(row_element)
            for column in row:
                col_element = self.doc.createElement("td")
                row_element.appendChild(col_element)
                box = [
                    column["top"],
                    column["left"],
                    column["top"] + column["height"],
                    column["left"] + column["width"],
                ]
                elems = get_mentions_within_bbox(box, self.elems[page_num].mentions)
                elems.sort(key=cmp_to_key(reading_order))
                for elem in elems:
                    for word in self.get_word_boundaries(elem):
                        top = int(word[1])
                        left = int(word[2])
                        bottom = int(word[3])
                        right = int(word[4])

                        word_element = self.doc.createElement("span")
                        col_element.appendChild(word_element)
                        word_element.setAttribute("class", "ocrx_word")
                        word_element.setAttribute(
                            "title", f"bbox {left} {top} {right} {bottom}"
                        )
                        # The raw text is inserted as-is: minidom escapes text
                        # nodes on serialization, so escaping here as well
                        # would double-escape characters such as "&".
                        word_element.appendChild(self.doc.createTextNode(word[0]))
        return table_element
예제 #4
0
 def get_html_table(self, table, page_num):
     """Recognize a table with tabula and render it as an HTML string.

     Only the first table reported by tabula is used. For each cell, the
     mentions inside the cell's bbox are visited in reading order; the
     ``<td>`` attributes carry the cell's characters and their coordinates
     as space-terminated items, and the ``<td>`` text is the cell's words.
     Characters and words matched by the control-character pattern are
     dropped, and single quotes in characters are replaced by double quotes
     before escaping.

     :param table: bbox of the table; every entry is stringified and passed
         to tabula as ``area``
     :param page_num: page passed to tabula and used as key into
         ``self.elems``
     :return: ``"<table>...</table>"``, or ``""`` when tabula returns nothing
     """
     area = [str(coord) for coord in table]
     table_json = tabula.read_pdf(
         self.pdf_file, pages=page_num, area=area, output_format="json"
     )
     if len(table_json) == 0:
         return ""

     sep = " "
     control_char = re.compile(r"[\x00-\x1F]")
     html_parts = ["<table>"]
     for row in table_json[0]["data"]:
         html_parts.append("<tr>")
         for column in row:
             cell_top = column["top"]
             cell_left = column["left"]
             box = [
                 cell_top,
                 cell_left,
                 cell_top + column["height"],
                 cell_left + column["width"],
             ]
             glyphs = []
             tops = []
             lefts = []
             bottoms = []
             rights = []
             tokens = []
             elems = get_mentions_within_bbox(
                 box, self.elems[page_num].mentions)
             elems.sort(key=cmp_to_key(reading_order))
             for elem in elems:
                 for char in self.get_char_boundaries(elem):
                     if six.PY2:
                         temp = char[0].encode("utf-8")
                     else:
                         temp = char[0]
                     if control_char.match(temp):
                         continue
                     # Single quotes would terminate the attribute value.
                     glyphs.append(char[0].replace("'", '"') + sep)
                     tops.append(str(char[1]) + sep)
                     lefts.append(str(char[2]) + sep)
                     bottoms.append(str(char[3]) + sep)
                     rights.append(str(char[4]) + sep)
                 for word in self.get_word_boundaries(elem):
                     if six.PY2:
                         temp = word[0].encode("utf-8")
                     elif six.PY3:
                         temp = word[0]
                     if control_char.match(temp):
                         continue
                     tokens.append(word[0] + sep)
             # Escape special HTML chars in the two text payloads.
             word_td = html.escape("".join(tokens))
             char_html = html.escape("".join(glyphs))
             html_parts.append("".join([
                 "<td char='", char_html,
                 "', top='", "".join(tops),
                 "', left='", "".join(lefts),
                 "', bottom='", "".join(bottoms),
                 "', right='", "".join(rights),
                 "'>", word_td.strip(), "</td>",
             ]))
         html_parts.append("</tr>")
     html_parts.append("</table>")
     return "".join(html_parts)
예제 #5
0
    def get_html_table(self, table: List[float],
                       page_num) -> Optional[Element]:
        """Run tabula on a region and convert the first table to DOM nodes.

        :param table: bbox for a table (top,left,bottom,right)
        :param page_num: 1-based page number
        :return: ``table.ocr_table`` element, or ``None`` when tabula
            recognizes no table in the region
        """
        logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
        table_json = tabula.read_pdf(
            self.pdf_file,
            pages=page_num,
            area=table,
            output_format="json",
        )
        logger.debug(f"Tabula recognized {len(table_json)} table(s).")
        if len(table_json) == 0:
            return None

        recognized = table_json[0]
        table_element = self.doc.createElement("table")
        table_element.setAttribute("class", "ocr_table")
        top, left, bottom, right = (
            int(recognized["top"]),
            int(recognized["left"]),
            int(recognized["bottom"]),
            int(recognized["right"]),
        )
        table_element.setAttribute("title",
                                   f"bbox {left} {top} {right} {bottom}")

        for row in recognized["data"]:
            row_element = self.doc.createElement("tr")
            table_element.appendChild(row_element)
            for cell in row:
                # Tabula does not document it, but the bbox it reports appears
                # to be that of the cell itself, not of the text inside it.
                # Note: it can be [0, 0, 0, 0] when no text was recognized.
                cell_top, cell_left = cell["top"], cell["left"]
                box: List[float] = [
                    cell_top,
                    cell_left,
                    cell_top + cell["height"],
                    cell_left + cell["width"],
                ]
                # The <td> is always emitted, even when it ends up empty.
                cell_element = self.doc.createElement("td")
                row_element.appendChild(cell_element)
                elems = get_mentions_within_bbox(box,
                                                 self.elems[page_num].mentions)
                if not elems:
                    continue
                cell_element.setAttribute(
                    "title",
                    f"bbox {int(box[1])} {int(box[0])} {int(box[3])} {int(box[2])}",
                )
                elems.sort(key=cmp_to_key(reading_order))
                for elem in elems:
                    line_element = self.doc.createElement("span")
                    cell_element.appendChild(line_element)
                    line_element.setAttribute("class", "ocrx_line")
                    line_title = " ".join(
                        ["bbox", *(str(int(coord)) for coord in elem.bbox)])
                    line_element.setAttribute("title", line_title)
                    for word in self.get_word_boundaries(elem):
                        top, left, bottom, right = (
                            int(word[1]),
                            int(word[2]),
                            int(word[3]),
                            int(word[4]),
                        )
                        word_element = self.doc.createElement("span")
                        line_element.appendChild(word_element)
                        word_element.setAttribute("class", "ocrx_word")
                        word_element.setAttribute(
                            "title", f"bbox {left} {top} {right} {bottom}")
                        # minidom escapes text nodes itself, so pass raw text.
                        text_node = self.doc.createTextNode(word[0])
                        word_element.appendChild(text_node)
        return table_element
예제 #6
0
    def get_html_tree(self) -> str:
        """Serialize the extracted layout as an hOCR-style XML document.

        A fresh DOM document is created and stored on ``self.doc`` so that
        ``get_html_table`` and ``get_html_others`` create their nodes in it.
        The ``head`` receives the ``ocr-*`` meta tags; the ``body`` receives
        one ``div.ocr_page`` per key of ``self.elems``, filled with the
        page's boxes sorted by ``column_order``:

        * ``table`` boxes are rendered by ``get_html_table`` and skipped when
          it returns ``None``;
        * ``figure`` boxes become ``figure`` elements whose jpg/bmp images
          are embedded as base64 data URIs;
        * every other box is rendered by ``get_html_others``.

        :return: the document as produced by ``Document.toprettyxml``
        """
        doc = Document()
        self.doc = doc
        root = doc.createElement("html")
        doc.appendChild(root)
        head = doc.createElement("head")
        root.appendChild(head)
        # meta
        meta_entries = (
            ("ocr-system", f"Converted from PDF by pdftotree {__version__}"),
            ("ocr-capabilities",
             "ocr_page ocr_table ocrx_block ocrx_line ocrx_word"),
            ("ocr-number-of-pages", f"{len(self.elems.keys())}"),
        )
        for meta_name, meta_content in meta_entries:
            meta = doc.createElement("meta")
            head.appendChild(meta)
            meta.setAttribute("name", meta_name)
            meta.setAttribute("content", meta_content)
        # body
        body = doc.createElement("body")
        root.appendChild(body)

        # Images are written to disk only so they can be read back and
        # inlined; the scratch directory is removed when the block exits
        # instead of being left behind.
        with tempfile.TemporaryDirectory() as dirname:
            imagewriter = ImageWriter(dirname)
            for page_num in self.elems.keys():  # 1-based
                boxes: List[Tuple[str, float, float, float, float]] = []
                for clust in self.tree[page_num]:
                    clust_tag = clust.lower().replace(" ", "_")
                    for (_pnum, _pwidth, _pheight, top, left, bottom,
                         right) in self.tree[page_num][clust]:
                        boxes.append((clust_tag, top, left, bottom, right))
                page = doc.createElement("div")
                page.setAttribute("class", "ocr_page")
                page.setAttribute("id", f"page_{page_num}")
                width = int(self.elems[page_num].layout.width)
                height = int(self.elems[page_num].layout.height)
                page.setAttribute(
                    "title",
                    f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
                )
                body.appendChild(page)
                # TODO: We need to detect columns and sort accordingly.
                boxes.sort(key=cmp_to_key(column_order))

                for box in boxes:
                    if box[0] == "table":
                        table = box[1:]  # bbox
                        table_element = self.get_html_table(table, page_num)
                        # get_html_table may return None when no table is
                        # recognized; appending None would crash.
                        if table_element is not None:
                            page.appendChild(table_element)
                    elif box[0] == "figure":
                        # NOTE(review): the whole tuple, including the tag,
                        # is passed here while other call sites pass a
                        # 4-element bbox - confirm the helper accepts both.
                        figures = get_mentions_within_bbox(
                            box, self.elems[page_num].figures)
                        fig_element = doc.createElement("figure")
                        page.appendChild(fig_element)
                        top, left, bottom, right = [int(i) for i in box[1:]]
                        fig_element.setAttribute(
                            "title", f"bbox {left} {top} {right} {bottom}")
                        for img in [i for fig in figures for i in fig]:
                            if not isinstance(img, LTImage):
                                continue
                            filename = imagewriter.export_image(img)
                            if filename.endswith("jpg"):
                                mediatype = "jpeg"
                            elif filename.endswith("bmp"):
                                mediatype = "bmp"
                            else:
                                logger.info(
                                    f"Skipping an unknown type image: {filename}.")
                                continue
                            logger.info(
                                f"Embedding a known type image: {filename}.")
                            # Read the file only once it is known to be used.
                            with open(os.path.join(dirname, filename),
                                      "rb") as f:
                                encoded = b64encode(f.read()).decode("ascii")
                            img_element = doc.createElement("img")
                            fig_element.appendChild(img_element)
                            img_element.setAttribute("title",
                                                     bbox2str(img.bbox))
                            img_element.setAttribute(
                                "src",
                                f"data:image/{mediatype};base64,{encoded}")
                    else:
                        element = self.get_html_others(box[0], box[1:],
                                                       page_num)
                        page.appendChild(element)
        return doc.toprettyxml()