예제 #1
0
    def parse_pdf_complex(self, pdffile, intermediatedir):
        """Read a PDF and sort the text boxes of each page into contexts.

        A ``PDFReader`` loads the file, then every page it yields gets a
        fresh ``Page``. Each text box on the page is printed (font, then
        geometry/font/text) and routed by heuristics: boxes in type larger
        than 12 go to a ``Heading`` context, everything else to a
        ``Paragraph`` context. Finally the reader's median box width is
        printed.

        :param pdffile: forwarded as the first argument to
            ``PDFReader.read`` (presumably the path of the PDF -- confirm).
        :param intermediatedir: forwarded as the second argument to
            ``PDFReader.read``.
        :returns: None.
        """
        pdf = PDFReader()
        pdf.read(pdffile, intermediatedir)
        # NOTE(review): the Page objects built below are never collected
        # or returned; the original bound an unused ``res = CompoundElement``
        # (the class itself, not an instance), which suggests the pages
        # were meant to be accumulated somewhere -- confirm intent.
        for cnt, srcpage in enumerate(pdf, start=1):
            # Page is a wonderful and magical class. Read the comments
            # to find out exactly how awesome it is.
            tgtpage = Page(ordinal=cnt)
            # TODO: use magic to find the bounding box of actual page
            # content. 510 is a rough cutoff that might not be
            # appropriate for all page layouts.
            boxes = srcpage.boundingbox(right=510)
            for box in boxes:
                # Fetch the font once; it is needed for both the debug
                # output and the heading heuristic.
                font = box.getfont()
                print(font)
                print("    [%dx%d][%dx%d][%s@%s] %s" %
                      (box.top, box.left, box.bottom, box.right,
                       font['family'], font['size'], str(box)))
                # Heuristic: If something is in large type, it's a heading.
                if int(font['size']) > 12:
                    # NOTE(review): ``ctx`` is not defined anywhere in this
                    # method; presumably it should be the page's current
                    # context -- confirm against the Page class.
                    if (isinstance(ctx, Heading)
                            and vertical_space(box, boxes.previous()) > 10):
                        # Page.new closes the current context and
                        # creates a new context of the given class
                        tgtpage.new(Heading)

                    # Heading is a DimensionedElement with top,
                    # left, width, height props. Page.set creates a new
                    # context, but only if needed.
                    tgtpage.set(Heading)

                    # calls the current context's append() method. If
                    # it's a DimensionedElement (it should be), its
                    # implementation of append() expands the bounding
                    # box as new stuff is added (provided they have
                    # top/left+width/height attribs)
                    tgtpage.write(box)

                    continue

                # add more heuristics here...

                # Last resort: Everything that is not something else is a Paragraph
                tgtpage.set(Paragraph)
                if horizontal_diff(box, boxes.previous()) > 0:  # maybe something like 4-5
                    tgtpage.new(Paragraph)
                if vertical_space(box, boxes.previous()) > 5:
                    tgtpage.new(Paragraph)

        print(pdf.median_box_width(threshold=0))
예제 #2
0
 def parse_pdf(self, pdffile, intermediatedir):
     """Create a ``PDFReader``, have it read the file, and return it.

     :param pdffile: forwarded as the first argument to ``PDFReader.read``.
     :param intermediatedir: forwarded as the second argument to
         ``PDFReader.read``.
     :returns: the ``PDFReader`` instance after ``read`` has been called.
     """
     reader = PDFReader()
     reader.read(pdffile, intermediatedir)
     return reader