def parse_pdf_complex(self, pdffile, intermediatedir):
    """Sketch of a heuristic, layout-driven PDF parser.

    Reads ``pdffile`` with ``PDFReader`` (passing ``intermediatedir``
    through to ``PDFReader.read``), walks the text boxes of every page
    and sorts them into ``Heading`` or ``Paragraph`` contexts on a
    ``Page`` object, based on font size and on the spacing relative to
    the previous box. Debug information about every box, and finally
    the median box width of the document, is printed to stdout.

    :param pdffile: passed as first argument to ``PDFReader.read``
    :param intermediatedir: passed as second argument to
                            ``PDFReader.read``
    :returns: None
    """
    pdf = PDFReader()
    pdf.read(pdffile, intermediatedir)
    # NOTE(review): each tgtpage below is built and then dropped --
    # nothing is collected or returned. The previous revision bound an
    # unused ``res = CompoundElement`` (the class, not an instance),
    # which suggests the pages were meant to be gathered -- confirm.
    for cnt, srcpage in enumerate(pdf, start=1):
        # Page is a wonderful and magical class. Read the comments
        # to find out exactly how awesome it is.
        tgtpage = Page(ordinal=cnt)
        # TODO: use magic to find the bounding box of actual page
        # content. 510 is a rough cutoff that might not be
        # appropriate for all page layouts.
        boxes = srcpage.boundingbox(right=510)
        for box in boxes:
            # Look the font up once; it is needed for both the debug
            # output and the heading heuristic.
            font = box.getfont()
            print(font)
            print(" [%dx%d][%dx%d][%s@%s] %s" %
                  (box.top, box.left, box.bottom, box.right,
                   font['family'], font['size'], str(box)))
            # Heuristic: If something is in large type, it's a heading.
            if int(font['size']) > 12:
                # NOTE(review): ``ctx`` is not bound anywhere in this
                # function; presumably it should be the current
                # context of tgtpage -- confirm against the Page API.
                if isinstance(ctx, Heading):
                    if vertical_space(box, boxes.previous()) > 10:
                        # Page.new closes the current context and
                        # creates a new context of the given class
                        tgtpage.new(Heading)

                # Heading is a DimensionedElement with top,
                # left, width, height props. Page.set creates a new
                # context, but only if needed.
                tgtpage.set(Heading)

                # calls the current context's append() method. If
                # it's a DimensionedElement (it should be), its
                # implementation of append() expands the bounding
                # box as new stuff is added (provided they have
                # top/left+width/height attribs)
                tgtpage.write(box)
                continue

            # add more heuristics here...

            # Last resort: Everything that is not something else is a
            # Paragraph
            # NOTE(review): unlike the heading branch, the box is never
            # written to the page here -- confirm whether a
            # tgtpage.write(box) is missing.
            tgtpage.set(Paragraph)
            if horizontal_diff(box, boxes.previous()) > 0:  # maybe something like 4-5
                tgtpage.new(Paragraph)
            if vertical_space(box, boxes.previous()) > 5:
                tgtpage.new(Paragraph)

    print(pdf.median_box_width(threshold=0))
def parse_document_from_soup(self, soup, doc):
    """Build one example document part and return it serialized.

    A ``Page`` holding a single string is created with a fixed ordinal
    and URI and with a graph obtained from ``self.make_graph()`` as its
    metadata. A ``Describer`` then records the part's RDF type and its
    ``dcterms:identifier`` in that metadata. The ``soup`` and ``doc``
    arguments are not used by this implementation.

    :returns: the result of ``lxml.etree.tostring`` applied to the
              part's ``as_xhtml`` representation
    """
    from ferenda.elements import Page
    from ferenda import Describer
    graph = self.make_graph()
    content = ["This is a part of a document"]
    docpart = Page(content,
                   ordinal=42,
                   uri="http://example.org/doc#42",
                   meta=graph)
    describer = Describer(docpart.meta, docpart.uri)
    describer.rdftype(self.ns['bibo'].DocumentPart)
    # the dcterms:identifier for a document part is often whatever
    # would be the preferred way to cite that part in another
    # document
    identifier_predicate = self.ns['dcterms'].identifier
    describer.value(identifier_predicate, "Doc:4711, p 42")
    # end part
    from lxml import etree
    xhtml_tree = docpart.as_xhtml("http://example.org/doc")
    return etree.tostring(xhtml_tree)