def parse_pdf(self, filename, intermediatedir, basefile):
    """Open ``filename`` with a PDFReader, retrying with OCR if it has no text.

    The text decoder handed to the reader is chosen from a small table
    keyed on ``(self.document_type, basefile)``; documents not listed
    there get a ``DetectingDecoder``.

    :param filename: path of the file to read. If it does not end in
        ``.pdf`` (case-insensitively), ``convert_to_pdf=True`` is passed
        to the reader.
    :param intermediatedir: passed to the reader as ``workdir``.
    :param basefile: document identifier, used only to look up an
        alternate decoder.
    :returns: the PDFReader instance, with ``src`` set to ``filename``
        on every page it yields.
    """
    # By default, don't create and manage PDF backgrounds files
    # (takes forever, we don't use them yet)
    keep_xml = "bz2" if self.config.compress == "bz2" else True

    # This is just a list of known different encoding schemes.
    # FIXME: try to find out whether all Ds documents should use the
    # (non-decoding) BaseTextDecoder
    alternate_decoders = {
        (self.PROPOSITION, "1997/98:44"): (OffsetDecoder20,
                                           "Datalagskommittén"),
        (self.DS, "2004:46"): (BaseTextDecoder, None),
    }
    decoding_class, decoder_arg = alternate_decoders.get(
        (self.document_type, basefile), (DetectingDecoder, None))

    convert_to_pdf = not filename.lower().endswith(".pdf")
    pdf = PDFReader(filename=filename,
                    workdir=intermediatedir,
                    images=self.config.pdfimages,
                    convert_to_pdf=convert_to_pdf,
                    keep_xml=keep_xml,
                    textdecoder=decoding_class(decoder_arg))
    if pdf.is_empty():
        # Lazy %-args: the logger formats the message only if it is emitted.
        self.log.warning("PDF file %s had no textcontent, trying OCR",
                         filename)
        # NOTE(review): unlike the first attempt, this call passes neither
        # convert_to_pdf nor textdecoder -- confirm that this is intended
        # for non-PDF input files.
        pdf = PDFReader(filename=filename,
                        workdir=intermediatedir,
                        images=self.config.pdfimages,
                        keep_xml=keep_xml,
                        ocr_lang="swe")
    # Record on every page which file it was read from.
    for page in pdf:
        page.src = filename
    return pdf
def parse_pdf_complex(self, pdffile, intermediatedir):
    """Experimental: sort the text boxes of a PDF into headings and paragraphs.

    Reads ``pdffile`` with a PDFReader and, for every page, walks the
    boxes returned by ``boundingbox(right=510)``. A box whose font size
    is above 12 is written to a Heading context on a new Page object;
    any other box opens a Paragraph context. Font and geometry of every
    box, and finally the median box width of the document, are printed
    to stdout.

    NOTE(review): the Page objects built here are neither collected nor
    returned, and boxes are never written to Paragraph contexts, so the
    method currently only has diagnostic value.

    :param pdffile: passed as first argument to ``PDFReader.read``.
    :param intermediatedir: passed as second argument to
        ``PDFReader.read``.
    :returns: None
    """
    pdf = PDFReader()
    pdf.read(pdffile, intermediatedir)
    for cnt, srcpage in enumerate(pdf, start=1):
        # Page is a wonderful and magical class. Read the comments
        # to find out exactly how awesome it is.
        tgtpage = Page(ordinal=cnt)
        # Class of the context most recently opened on tgtpage, tracked
        # locally (None until the first box has been handled).
        current_ctx = None
        # TODO: use magic to find the bounding box of actual page
        # content. 510 is a rough cutoff that might not be
        # appropriate for all page layouts.
        boxes = srcpage.boundingbox(right=510)
        for box in boxes:
            font = box.getfont()
            print(font)
            print(" [%dx%d][%dx%d][%s@%s] %s" %
                  (box.top, box.left, box.bottom, box.right,
                   font['family'], font['size'], str(box)))
            # Heuristic: If something is in large type, it's a heading.
            if int(font['size']) > 12:
                if (current_ctx is Heading and
                        vertical_space(box, boxes.previous()) > 10):
                    # Page.new is meant to close the current context
                    # and create a new context of the given class.
                    tgtpage.new(Heading)
                # Heading is meant to be a DimensionedElement with top,
                # left, width, height props. Page.set is meant to create
                # a new context, but only if needed.
                tgtpage.set(Heading)
                current_ctx = Heading
                # Page.write is meant to call the current context's
                # append() method. If that is a DimensionedElement (it
                # should be), its append() is expected to expand the
                # bounding box as new stuff is added (provided it has
                # top/left + width/height attributes).
                tgtpage.write(box)
                continue

            # add more heuristics here...

            # Last resort: Everything that is not something else is a
            # Paragraph
            tgtpage.set(Paragraph)
            current_ctx = Paragraph
            # maybe something like 4-5
            if horizontal_diff(box, boxes.previous()) > 0:
                tgtpage.new(Paragraph)
            if vertical_space(box, boxes.previous()) > 5:
                tgtpage.new(Paragraph)
    print(pdf.median_box_width(threshold=0))
def parse_pdf(self, pdffile, intermediatedir):
    """Create a PDFReader, let it read ``pdffile``, and return it.

    :param pdffile: passed as first argument to ``PDFReader.read``.
    :param intermediatedir: passed as second argument to
        ``PDFReader.read``.
    :returns: the PDFReader instance after ``read`` has been called.
    """
    # NOTE(review): a three-argument parse_pdf (with ``basefile``) is
    # defined earlier in this file. If both belong to the same class,
    # this later definition is the one that takes effect -- confirm.
    reader = PDFReader()
    reader.read(pdffile, intermediatedir)
    return reader