def parse(self, doc):
    """Populate ``doc`` from the best available downloaded source file.

    Sets ``doc.uri`` and records basic metadata, then chooses a source
    in this order of preference:

    1. a downloaded ``.pdf`` file,
    2. a PDF converted (with the configured ``soffice`` binary) from a
       downloaded ``.doc``, ``.docx``, ``.wpd`` or ``.rtf`` file,
    3. text extracted from the downloaded HTML file.

    PDF sources are handed to ``self.parse_from_pdfreader``, text
    sources to ``self.parse_from_textreader``.

    :param doc: the document object to fill in; its ``basefile``
                attribute selects which downloaded files are used.
    """
    doc.uri = self.canonical_uri(doc.basefile)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    self.infer_triples(d, doc.basefile)

    # prefer PDF or Word files over the plaintext-containing HTML files
    # FIXME: PDF or Word files are now stored as attachments
    pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')
    wordfiles = tuple(
        self.generic_path(doc.basefile, 'downloaded', suffix)
        for suffix in ('.doc', '.docx', '.wpd', '.rtf'))

    # There is deliberately no ``break``: if several candidates exist,
    # the last one in the tuple wins (same as before).
    wordfile = None
    for candidate in wordfiles:
        if os.path.exists(candidate):
            wordfile = candidate

    # if we lack a .pdf file, use Open/LibreOffice to convert any
    # .wpd or .doc file to .pdf first
    if wordfile and not os.path.exists(pdffile):
        intermediate_pdf = self.generic_path(
            doc.basefile, "intermediate", ".pdf")
        if not os.path.exists(intermediate_pdf):
            # NOTE(review): wordfile is interpolated unquoted, so a path
            # containing spaces would presumably break the command --
            # confirm how util.runcmd executes cmdline.
            cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                self.config.get('soffice', 'soffice'),
                os.path.dirname(intermediate_pdf),
                wordfile)
            self.log.debug(
                "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
            # The return value is not needed; require_success=True is
            # passed so that a failing conversion is not ignored.
            util.runcmd(cmdline, require_success=True)
        # From here on, treat the converted file as "the" PDF.
        pdffile = intermediate_pdf

    if os.path.exists(pdffile):
        self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
        # The '.foo' suffix is a dummy: only the directory part of the
        # generated path is used.
        intermediate_dir = os.path.dirname(
            self.generic_path(doc.basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
    else:
        downloaded_path = self.downloaded_path(doc.basefile)
        intermediate_path = self.generic_path(
            doc.basefile, 'intermediate', '.txt')
        self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                                              downloaded_path,
                                              intermediate_path))
        if not os.path.exists(intermediate_path):
            # Read the HTML inside a context manager so the file handle
            # is closed deterministically (it used to be leaked).
            with codecs.open(downloaded_path,
                             encoding="iso-8859-1") as fp:
                html = fp.read()
            # The extracted text is cached as UTF-8 in the intermediate
            # file so later runs can skip the extraction.
            util.writefile(intermediate_path,
                           util.extract_text(html, '<pre>', '</pre>'),
                           encoding="utf-8")
        textreader = TextReader(intermediate_path, encoding="utf-8")
        self.parse_from_textreader(textreader, doc)
class Read(unittest.TestCase):
    """Checks that PDFReader reads the bundled sample PDF correctly."""

    def setUp(self):
        # Show full diffs on assertion failures.
        self.maxDiff = None
        # Scratch directory handed to PDFReader.read(); removed in
        # tearDown.
        self.datadir = tempfile.mkdtemp()
        self.reader = PDFReader()

    def tearDown(self):
        shutil.rmtree(self.datadir)

    def test_basic(self):
        sample_pdf = "test/files/pdfreader/sample.pdf"
        workdir = self.datadir
        try:
            self.reader.read(sample_pdf, workdir)
        except errors.ExternalCommandError:
            # Fall back on the checked-in intermediate files: copy them
            # into the scratch directory (renamed from "index*" to
            # "sample*") and retry the read.
            for name in os.listdir("test/files/pdfreader/intermediate"):
                renamed = name.replace("index", "sample")
                source = "test/files/pdfreader/intermediate/%s" % name
                shutil.copy(source, workdir + os.sep + renamed)
            self.reader.read(sample_pdf, workdir)

        # The sample consists of a single page.
        self.assertEqual(len(self.reader), 1)

        # first page, first box
        first_box_text = str(self.reader[0][0])
        self.assertEqual("Document title ", first_box_text)
        self.assertEqual(318, self.reader.median_box_width())

        first_page = self.reader[0]
        self.assertEqual(
            "Page 1 (892 x 1263): 'Document title This is a simple documen...'",
            str(first_page))

        # an uncropped doc should have two textboxes
        uncropped = list(first_page.boundingbox())
        self.assertEqual(2, len(uncropped))

        # a smaller bounding box yields just one
        restricted = list(first_page.boundingbox(190, 130, 230, 460))
        self.assertEqual(1, len(restricted))

        # cropping it with the same dimensions should also result in
        # just one box -- the bottom one
        first_page.crop(190, 130, 230, 460)
        remaining = list(first_page.boundingbox())
        self.assertEqual(1, len(remaining))

        bottom_box = remaining[0]
        self.assertEqual("This is a simple document in PDF format. ",
                         str(bottom_box))
        expected_font = {'color': '#000000',
                         'size': '16',
                         'id': '1',
                         'family': 'Times'}
        self.assertEqual(expected_font, bottom_box.getfont())

        # this box should have four text elements, tagged in this order
        self.assertEqual(4, len(bottom_box))
        for position, expected_tag in enumerate((None, "i", "ib", None)):
            self.assertEqual(expected_tag, bottom_box[position].tag)
def pdfreader_from_basefile(self, basefile):
    """Return a PDFReader that has read the downloaded file for basefile.

    :param basefile: identifier passed to ``self.store`` to locate the
                     downloaded file and the intermediate directory.
    :returns: the PDFReader instance, after ``read()`` has been called.
    """
    store = self.store
    source_path = store.downloaded_path(basefile)
    # Only the directory part of the intermediate path is needed; it is
    # passed to PDFReader.read() as its second argument.
    workdir = os.path.dirname(store.intermediate_path(basefile))
    reader = PDFReader()
    reader.read(source_path, workdir)
    return reader