예제 #1
0
    def parse_pdf(self, filename, intermediatedir, basefile):
        """Read ``filename`` into a PDFReader, retrying with OCR if it is empty.

        Args:
            filename: Path of the source document. A name that does not end
                in ``.pdf`` (compared case-insensitively) makes the first
                read request ``convert_to_pdf=True``.
            intermediatedir: Passed to PDFReader as its ``workdir``.
            basefile: Document identifier; combined with
                ``self.document_type`` it selects the text decoder.

        Returns:
            The PDFReader, with ``src`` set to ``filename`` on every page.
        """
        # By default, don't create and manage PDF background files
        # (takes forever, we don't use them yet)
        keep_xml = "bz2" if self.config.compress == "bz2" else True

        # This is just a list of documents known to use a different
        # encoding scheme. FIXME: try to find out whether all Ds documents
        # should use the (non-decoding) BaseTextDecoder
        alternate_decoders = {
            (self.PROPOSITION, "1997/98:44"): (OffsetDecoder20, "Datalagskommittén"),
            (self.DS, "2004:46"): (BaseTextDecoder, None),
        }
        # Fall back to the detecting decoder for every document not listed.
        decoding_class, decoder_arg = alternate_decoders.get(
            (self.document_type, basefile), (DetectingDecoder, None))

        pdf = PDFReader(filename=filename,
                        workdir=intermediatedir,
                        images=self.config.pdfimages,
                        convert_to_pdf=not filename.lower().endswith(".pdf"),
                        keep_xml=keep_xml,
                        textdecoder=decoding_class(decoder_arg))
        if pdf.is_empty():
            # Pass filename as a logging argument so the message is only
            # formatted when the record is actually emitted.
            self.log.warning("PDF file %s had no textcontent, trying OCR", filename)
            # NOTE(review): the OCR retry passes neither convert_to_pdf nor
            # textdecoder -- confirm that is intended for non-PDF sources.
            pdf = PDFReader(filename=filename,
                            workdir=intermediatedir,
                            images=self.config.pdfimages,
                            keep_xml=keep_xml,
                            ocr_lang="swe")
        # Record on each page which file it was read from.
        for page in pdf:
            page.src = filename
        return pdf
예제 #2
0
    def parse_pdf(self, filename, intermediatedir, basefile):
        """Read ``filename`` into a PDFReader, retrying with OCR if it is empty.

        Args:
            filename: Path of the source document. A name that does not end
                in ``.pdf`` (compared case-insensitively) makes the first
                read request ``convert_to_pdf=True``.
            intermediatedir: Passed to PDFReader as its ``workdir``.
            basefile: Document identifier; combined with
                ``self.document_type`` it selects the text decoder.

        Returns:
            The PDFReader, with ``src`` set to ``filename`` on every page.
        """
        # By default, don't create and manage PDF background files
        # (takes forever, we don't use them yet)
        keep_xml = "bz2" if self.config.compress == "bz2" else True

        # This is just a list of documents known to use a different
        # encoding scheme. FIXME: try to find out whether all Ds documents
        # should use the (non-decoding) BaseTextDecoder
        alternate_decoders = {
            (self.PROPOSITION, "1997/98:44"): (OffsetDecoder20, "Datalagskommittén"),
            (self.DS, "2004:46"): (BaseTextDecoder, None),
        }
        # Fall back to the detecting decoder for every document not listed.
        decoding_class, decoder_arg = alternate_decoders.get(
            (self.document_type, basefile), (DetectingDecoder, None))

        pdf = PDFReader(filename=filename,
                        workdir=intermediatedir,
                        images=self.config.pdfimages,
                        convert_to_pdf=not filename.lower().endswith(".pdf"),
                        keep_xml=keep_xml,
                        textdecoder=decoding_class(decoder_arg))
        if pdf.is_empty():
            # Pass filename as a logging argument so the message is only
            # formatted when the record is actually emitted.
            self.log.warning("PDF file %s had no textcontent, trying OCR", filename)
            # NOTE(review): the OCR retry passes neither convert_to_pdf nor
            # textdecoder -- confirm that is intended for non-PDF sources.
            pdf = PDFReader(filename=filename,
                            workdir=intermediatedir,
                            images=self.config.pdfimages,
                            keep_xml=keep_xml,
                            ocr_lang="swe")
        # Record on each page which file it was read from.
        for page in pdf:
            page.src = filename
        return pdf
예제 #3
0
    def parse_pdf_complex(self, pdffile, intermediatedir):
        """Sketch of a layout-aware parse: sort text boxes into headings and paragraphs.

        Reads ``pdffile`` with PDFReader (``intermediatedir`` is handed to
        ``PDFReader.read``), builds one Page per source page and routes each
        text box to a Heading or Paragraph context based on font size and
        spacing. Prints debug information for every box and, at the end, the
        reader's median box width. Nothing is returned.
        """
        pdf = PDFReader()
        pdf.read(pdffile, intermediatedir)
        for ordinal, srcpage in enumerate(pdf, start=1):
            # Page is a wonderful and magical class. Read the comments
            # to find out exactly how awesome it is.
            tgtpage = Page(ordinal=ordinal)
            # TODO: use magic to find the bounding box of actual page
            # content. 510 is a rough cutoff that might not be
            # appropriate for all page layouts.
            boxes = srcpage.boundingbox(right=510)
            for box in boxes:
                # Look the font up once per box instead of once per use.
                font = box.getfont()
                print((font))
                print(("    [%dx%d][%dx%d][%s@%s] %s" %
                      (box.top, box.left, box.bottom, box.right, font['family'], font['size'], str(box))))
                # Heuristic: If something is in large type, it's a heading.
                if int(font['size']) > 12:
                    # NOTE(review): `ctx` is not defined anywhere in this
                    # method, so this line raises NameError as written. It
                    # presumably should be the page's current context --
                    # confirm how Page exposes it.
                    if isinstance(ctx, Heading):
                        if vertical_space(box, boxes.previous()) > 10:
                            # Page.new closes the current context and
                            # creates a new context of the given class
                            tgtpage.new(Heading)

                    # Heading is a DimensionedElement with top,
                    # left, width, height props. Page.set creates a new
                    # context, but only if needed.
                    tgtpage.set(Heading)

                    # Calls the current context's append() method. If
                    # it's a DimensionedElement (it should be), its
                    # implementation of append() expands the bounding
                    # box as new stuff is added (provided they have
                    # top/left+width/height attribs).
                    tgtpage.write(box)

                    continue

                # add more heuristics here...

                # Last resort: Everything that is not something else is a Paragraph
                tgtpage.set(Paragraph)
                if horizontal_diff(box, boxes.previous()) > 0:  # maybe something like 4-5
                    tgtpage.new(Paragraph)
                if vertical_space(box, boxes.previous()) > 5:
                    tgtpage.new(Paragraph)

        print((pdf.median_box_width(threshold=0)))
예제 #4
0
 def parse_pdf(self, pdffile, intermediatedir):
     """Create a PDFReader, have it read ``pdffile`` and return the reader.

     ``intermediatedir`` is forwarded unchanged as the second argument of
     ``PDFReader.read``.
     """
     reader = PDFReader()
     reader.read(pdffile, intermediatedir)
     return reader