예제 #1
0
    def get_html_others(self, tag: str, box: List[float],
                        page_num: int) -> Element:
        """Build an ``ocrx_block`` div for a region that is not a table/figure.

        The page's text lines that ``get_mentions_within_bbox`` selects for
        ``box`` are sorted with ``reading_order`` and emitted as ``ocrx_line``
        spans, each holding one ``ocrx_word`` span per word boundary.

        :param tag: value stored in the ``pdftotree`` attribute of the block
        :param box: bbox of the block (top,left,bottom,right)
        :param page_num: key of the page in ``self.elems``
        :return: DOM element for the block
        """
        doc = self.doc
        block = doc.createElement("div")
        block.setAttribute("class", "ocrx_block")
        block.setAttribute("pdftotree", tag)  # for backward-compatibility
        # The box is stored as (top, left, bottom, right) but the title
        # attribute is written as "left top right bottom".
        box_top, box_left, box_bottom, box_right = map(int, box)
        block.setAttribute(
            "title", f"bbox {box_left} {box_top} {box_right} {box_bottom}")

        lines: List[LTTextLine] = get_mentions_within_bbox(
            box, self.elems[page_num].mentions)
        lines.sort(key=cmp_to_key(reading_order))
        for line in lines:
            line_span = doc.createElement("span")
            line_span.setAttribute("class", "ocrx_line")
            line_span.setAttribute("title", bbox2str(line.bbox))
            block.appendChild(line_span)
            for word in self.get_word_boundaries(line):
                # word[0] is the text; the remaining four items are its bbox.
                w_top, w_left, w_bottom, w_right = map(int, word[1:])
                word_span = doc.createElement("span")
                word_span.setAttribute("class", "ocrx_word")
                word_span.setAttribute(
                    "title", f"bbox {w_left} {w_top} {w_right} {w_bottom}")
                # No manual escaping: minidom takes care of the text node.
                word_span.appendChild(doc.createTextNode(word[0]))
                line_span.appendChild(word_span)
        return block
예제 #2
0
    def get_html_table(self, table: List[float],
                       page_num: int) -> Optional[Element]:
        """Recognize a table using tabula and return a DOM element.

        Only the first table tabula reports for the area is rendered. Every
        cell becomes a ``td``; cells that contain text additionally get a
        ``title`` bbox and nested ``ocrx_line`` / ``ocrx_word`` spans.

        :param table: bbox for a table (top,left,bottom,right)
        :param page_num: 1-based page number
        :return: DOM element for a table, or ``None`` if tabula found none
        """
        logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
        loglevel = logging.getLogger("pdftotree").getEffectiveLevel()
        table_json = tabula.read_pdf(
            self.pdf_file,
            pages=page_num,
            area=table,
            output_format="json",
            # Keep tabula quiet unless pdftotree itself is logging at DEBUG.
            silent=loglevel > logging.DEBUG,
        )
        logger.debug(f"Tabula recognized {len(table_json)} table(s).")
        if not table_json:
            return None
        first_table = table_json[0]
        table_element = self.doc.createElement("table")
        table_element.setAttribute("class", "ocr_table")
        top = int(first_table["top"])
        left = int(first_table["left"])
        bottom = int(first_table["bottom"])
        right = int(first_table["right"])
        table_element.setAttribute("title",
                                   f"bbox {left} {top} {right} {bottom}")
        for row in first_table["data"]:
            row_element = self.doc.createElement("tr")
            table_element.appendChild(row_element)
            for cell in row:
                # It is not explicitly stated anywhere but tabula seems to use the cell
                # bbox to represent that of cell itself rather than that of text inside.
                # Note: bbox could be [0, 0, 0, 0] if tabula recognizes no text inside.
                box: List[float] = [
                    cell["top"],
                    cell["left"],
                    cell["top"] + cell["height"],
                    cell["left"] + cell["width"],
                ]
                cell_element = self.doc.createElement("td")
                row_element.appendChild(cell_element)
                elems = get_mentions_within_bbox(box,
                                                 self.elems[page_num].mentions)
                if not elems:
                    # Keep the empty <td> so the column layout is preserved,
                    # but give it neither a bbox title nor children.
                    continue
                cell_element.setAttribute(
                    "title",
                    f"bbox {int(box[1])} {int(box[0])} {int(box[3])} {int(box[2])}",
                )
                elems.sort(key=cmp_to_key(reading_order))
                for elem in elems:
                    line_element = self.doc.createElement("span")
                    cell_element.appendChild(line_element)
                    line_element.setAttribute("class", "ocrx_line")
                    line_element.setAttribute("title", bbox2str(elem.bbox))
                    for word in self.get_word_boundaries(elem):
                        # word is (text, top, left, bottom, right); distinct
                        # names avoid clobbering the table-level coordinates.
                        w_top = int(word[1])
                        w_left = int(word[2])
                        w_bottom = int(word[3])
                        w_right = int(word[4])

                        word_element = self.doc.createElement("span")
                        line_element.appendChild(word_element)
                        word_element.setAttribute("class", "ocrx_word")
                        word_element.setAttribute(
                            "title",
                            f"bbox {w_left} {w_top} {w_right} {w_bottom}")
                        # No need to escape text here as minidom will do.
                        word_element.appendChild(
                            self.doc.createTextNode(word[0]))
        return table_element
예제 #3
0
    def get_html_tree(self) -> str:
        """Render the extracted document as an hOCR-style HTML string.

        A fresh minidom ``Document`` is created and also stored on
        ``self.doc``, because the per-block builders create their nodes
        through it. The head receives the ``ocr-*`` meta tags; the body
        receives one ``ocr_page`` div per page, filled with that page's
        tables, figures and other blocks in ``column_order``.

        :return: the document serialized with ``toprettyxml()``
        """
        doc = Document()
        self.doc = doc
        html = doc.createElement("html")
        doc.appendChild(html)
        head = doc.createElement("head")
        html.appendChild(head)
        # meta
        for name, content in (
            ("ocr-system", f"Converted from PDF by pdftotree {__version__}"),
            ("ocr-capabilities",
             "ocr_page ocr_table ocrx_block ocrx_line ocrx_word"),
            ("ocr-number-of-pages", f"{len(self.elems)}"),
        ):
            meta = doc.createElement("meta")
            head.appendChild(meta)
            meta.setAttribute("name", name)
            meta.setAttribute("content", content)
        # body
        body = doc.createElement("body")
        html.appendChild(body)
        # Images are exported to a scratch folder before being inlined as
        # base64; the context manager removes the folder once we are done.
        with tempfile.TemporaryDirectory() as dirname:
            imagewriter = ImageWriter(dirname)
            for page_num in self.elems:  # 1-based
                boxes: List[Tuple[str, float, float, float, float]] = []
                for clust, regions in self.tree[page_num].items():
                    tag = clust.lower().replace(" ", "_")
                    # Each region also carries page number/width/height,
                    # which are not needed here.
                    boxes.extend(
                        (tag, top, left, bottom, right)
                        for (_, _, _, top, left, bottom, right) in regions)
                page = doc.createElement("div")
                page.setAttribute("class", "ocr_page")
                page.setAttribute("id", f"page_{page_num}")
                width = int(self.elems[page_num].layout.width)
                height = int(self.elems[page_num].layout.height)
                page.setAttribute(
                    "title",
                    f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
                )
                body.appendChild(page)
                # TODO: We need to detect columns and sort accordingly.
                boxes.sort(key=cmp_to_key(column_order))

                for box in boxes:
                    if box[0] == "table":
                        table_element = self.get_html_table(box[1:], page_num)
                        # get_html_table returns None when tabula finds no
                        # table; appendChild(None) would raise.
                        if table_element is not None:
                            page.appendChild(table_element)
                    elif box[0] == "figure":
                        page.appendChild(self._get_html_figure(
                            box, page_num, imagewriter, dirname))
                    else:
                        page.appendChild(
                            self.get_html_others(box[0], box[1:], page_num))
        return doc.toprettyxml()

    def _get_html_figure(self, box: Tuple[str, float, float, float, float],
                         page_num: int, imagewriter: "ImageWriter",
                         dirname: str) -> Element:
        """Build a ``figure`` element embedding the images found in ``box``.

        :param box: (tag, top, left, bottom, right) of the figure region
        :param page_num: key of the page in ``self.elems``
        :param imagewriter: writer used to export each image into ``dirname``
        :param dirname: folder the exported image files are read back from
        :return: DOM element for the figure
        """
        # NOTE(review): the whole tuple (tag included) is passed as the bbox,
        # exactly as before; confirm get_mentions_within_bbox expects that.
        figures = get_mentions_within_bbox(box, self.elems[page_num].figures)
        fig_element = self.doc.createElement("figure")
        top, left, bottom, right = [int(i) for i in box[1:]]
        fig_element.setAttribute(
            "title", f"bbox {left} {top} {right} {bottom}")
        for figure in figures:
            for img in figure:
                if not isinstance(img, LTImage):
                    continue
                filename = imagewriter.export_image(img)
                # Decide the media type first so that files we are going to
                # skip anyway are never read.
                if filename.endswith("jpg"):
                    mediatype = "jpeg"
                elif filename.endswith("bmp"):
                    mediatype = "bmp"
                else:
                    logger.info(
                        f"Skipping an unknown type image: {filename}.")
                    continue
                with open(os.path.join(dirname, filename), "rb") as f:
                    encoded = b64encode(f.read()).decode("ascii")
                logger.info(f"Embedding a known type image: {filename}.")
                img_element = self.doc.createElement("img")
                fig_element.appendChild(img_element)
                img_element.setAttribute("title", bbox2str(img.bbox))
                img_element.setAttribute(
                    "src", f"data:image/{mediatype};base64,{encoded}")
        return fig_element