Пример #1
0
    def parse(self, doc):
        doc.uri = self.canonical_uri(doc.basefile)
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        self.infer_triples(d, doc.basefile)

        # prefer PDF or Word files over the plaintext-containing HTML files
        # FIXME: PDF or Word files are now stored as attachments

        pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')

        wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                     self.generic_path(doc.basefile, 'downloaded', '.docx'),
                     self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                     self.generic_path(doc.basefile, 'downloaded', '.rtf'))
        wordfile = None
        for f in wordfiles:
            if os.path.exists(f):
                wordfile = f

        # if we lack a .pdf file, use Open/LibreOffice to convert any
        # .wpd or .doc file to .pdf first
        if (wordfile
                and not os.path.exists(pdffile)):
            intermediate_pdf = self.generic_path(
                doc.basefile, "intermediate", ".pdf")
            if not os.path.exists(intermediate_pdf):
                cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (self.config.get('soffice', 'soffice'),
                                                                             os.path.dirname(
                                                                                 intermediate_pdf),
                                                                             wordfile)
                self.log.debug(
                    "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
                (ret, stdout, stderr) = util.runcmd(
                    cmdline, require_success=True)
            pdffile = intermediate_pdf

        if os.path.exists(pdffile):
            self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
            intermediate_dir = os.path.dirname(
                self.generic_path(doc.basefile, 'intermediate', '.foo'))
            self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
            pdfreader = PDFReader()
            pdfreader.read(pdffile, intermediate_dir)
            self.parse_from_pdfreader(pdfreader, doc)
        else:
            downloaded_path = self.downloaded_path(doc.basefile)
            intermediate_path = self.generic_path(
                doc.basefile, 'intermediate', '.txt')
            self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                           downloaded_path, intermediate_path))
            if not os.path.exists(intermediate_path):
                html = codecs.open(
                    downloaded_path, encoding="iso-8859-1").read()
                util.writefile(intermediate_path, util.extract_text(
                    html, '<pre>', '</pre>'), encoding="utf-8")
            textreader = TextReader(intermediate_path, encoding="utf-8")
            self.parse_from_textreader(textreader, doc)
Пример #2
0
class Read(unittest.TestCase):
    def setUp(self):
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        self.reader = PDFReader()
        
    def tearDown(self):
        shutil.rmtree(self.datadir)

    def test_basic(self):
        try:
            self.reader.read("test/files/pdfreader/sample.pdf",
                             self.datadir)
        except errors.ExternalCommandError:
            for fname in os.listdir("test/files/pdfreader/intermediate"):
                to = fname.replace("index", "sample")
                shutil.copy("test/files/pdfreader/intermediate/%s" % fname,
                             self.datadir + os.sep + to)
            self.reader.read("test/files/pdfreader/sample.pdf",
                             self.datadir)
        self.assertEqual(len(self.reader), 1)
        # first page, first box
        title = str(self.reader[0][0])
        self.assertEqual("Document title ", title)

        self.assertEqual(318, self.reader.median_box_width())

        page = self.reader[0]
        self.assertEqual("Page 1 (892 x 1263): 'Document title  This is a simple documen...'", str(page))

        # an uncropped doc should have two textboxes
        self.assertEqual(2, len(list(page.boundingbox())))

        # a smaller bounding box yields just one
        self.assertEqual(1,
                         len(list(page.boundingbox(190, 130, 230, 460))))

        # cropping it with the same dimensions
        page.crop(190, 130, 230, 460)

        # should also result in just one box -- the bottom one
        boxes = list(page.boundingbox())
        self.assertEqual(1, len(boxes))

        box = boxes[0]

        self.assertEqual("This is a simple document in PDF format. ", str(box))
        self.assertEqual({'color': '#000000',
                          'size': '16',
                          'id': '1',
                          'family': 'Times'}, box.getfont())
                         

        # this box should have four text elements
        self.assertEqual(4, len(box))
        self.assertEqual(None, box[0].tag)
        self.assertEqual("i", box[1].tag)
        self.assertEqual("ib", box[2].tag)
        self.assertEqual(None, box[3].tag)
Пример #3
0
    def pdfreader_from_basefile(self, basefile):
        pdffile = self.store.downloaded_path(basefile)
        # Convoluted way of getting the directory of the intermediate
        # xml + png files that PDFReader will create

        intermediate_dir = os.path.dirname(self.store.intermediate_path(basefile))
        pdf = PDFReader()
        pdf.read(pdffile, intermediate_dir)
        return pdf