Exemplo n.º 1
0
 def extract_body(self, fp, basefile):
     """Load ``fp`` into a ``StreamingPDFReader`` and tag every page's source.

     The parser name handed to the reader is ``"ocr"`` when
     ``self.config.ocr`` is truthy and ``"xml"`` otherwise.

     :param fp: object passed straight to ``StreamingPDFReader.read``.
     :param basefile: accepted for interface compatibility; not used here.
     :returns: the populated ``StreamingPDFReader`` instance.
     """
     pdf = StreamingPDFReader()
     if self.config.ocr:
         parser_name = "ocr"
     else:
         parser_name = "xml"
     pdf.read(fp, parser=parser_name)
     # Every page is labelled with the same fixed source name.
     for pg in pdf:
         pg.src = "index.pdf"  # FIXME: don't hardcode the filename
     return pdf
Exemplo n.º 2
0
 def extract_body(self, fp, basefile):
     """Read ``fp`` with a ``StreamingPDFReader`` and return the reader.

     ``self.config.ocr`` selects the parser: ``"ocr"`` when truthy,
     ``"xml"`` when falsy. After reading, each page yielded by the reader
     gets its ``src`` attribute set to ``"index.pdf"``.

     :param fp: object forwarded to ``StreamingPDFReader.read``.
     :param basefile: unused by this implementation.
     :returns: the ``StreamingPDFReader`` that was filled from ``fp``.
     """
     document = StreamingPDFReader()
     use_ocr = self.config.ocr
     document.read(fp, parser="ocr" if use_ocr else "xml")
     for current_page in document:
         # FIXME: don't hardcode the filename
         current_page.src = "index.pdf"
     return document
Exemplo n.º 3
0
    def extract_body(self, fp, basefile):
        """Read the main document plus all downloaded PDF attachments.

        The main stream ``fp`` is read first. Then every downloaded
        attachment whose name ends in ``.pdf`` is processed in sorted name
        order: its intermediate file is created with ``self.convert_pdf``
        if it does not exist yet, otherwise the existing intermediate is
        opened, and its content is appended to the main reader.

        :param fp: object passed to ``StreamingPDFReader.read`` for the
            main document.
        :param basefile: identifier used for all ``self.store`` lookups.
        :returns: a ``StreamingPDFReader`` holding the main document
            followed by the attachments.
        """
        pdf_suffix = ".pdf"
        reader = StreamingPDFReader()
        parser = "ocr" if self.config.ocr else "xml"
        intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
        if self.config.compress:
            intermediate_suffix += "." + self.config.compress
        reader.read(fp, parser=parser)
        for attachment in sorted(
                self.store.list_attachments(basefile, "downloaded")):
            if not attachment.endswith(pdf_suffix):
                continue
            downloaded_path = self.store.downloaded_path(basefile,
                                                         attachment=attachment)
            # Swap only the trailing extension. str.replace() would also
            # rewrite any ".pdf" that occurs earlier in the attachment name.
            iattachment = attachment[:-len(pdf_suffix)] + intermediate_suffix
            intermediate_path = self.store.intermediate_path(
                basefile, attachment=iattachment)
            # A separate name keeps the ``fp`` parameter from being rebound.
            if not os.path.exists(intermediate_path):
                attachment_fp = self.convert_pdf(downloaded_path,
                                                 intermediate_path)
            else:
                attachment_fp = self.store.open_intermediate(
                    basefile, attachment=iattachment)
            # Attachments use the same intermediate format as the main
            # document, so they are read with the same parser.
            # NOTE(review): relies on read() returning a reader that
            # supports ``+=`` -- confirm against StreamingPDFReader.
            reader += StreamingPDFReader().read(attachment_fp, parser=parser)

        for page in reader:
            page.src = "index.pdf"  # FIXME: don't hardcode the filename
        return reader
Exemplo n.º 4
0
    def extract_body(self, fp, basefile):
        """Combine the main document and its PDF attachments into one reader.

        ``fp`` is read into a fresh ``StreamingPDFReader``. Each downloaded
        attachment with a ``.pdf`` extension (visited in sorted order) is
        then converted via ``self.convert_pdf`` when no intermediate file
        exists on disk, or opened from the store when one does, and the
        result is appended to the reader.

        :param fp: object passed to ``StreamingPDFReader.read`` for the
            main document.
        :param basefile: identifier used for all ``self.store`` lookups.
        :returns: the ``StreamingPDFReader`` with main document and
            attachments, every page having ``src`` set to ``"index.pdf"``.
        """
        pdf_suffix = ".pdf"
        reader = StreamingPDFReader()
        parser = "ocr" if self.config.ocr else "xml"
        intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
        if self.config.compress:
            intermediate_suffix += "." + self.config.compress
        reader.read(fp, parser=parser)
        for attachment in sorted(self.store.list_attachments(basefile, "downloaded")):
            if not attachment.endswith(pdf_suffix):
                continue
            downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
            # Replace just the final ".pdf"; the previous str.replace() call
            # also changed ".pdf" occurrences in the middle of the name.
            iattachment = attachment[:-len(pdf_suffix)] + intermediate_suffix
            intermediate_path = self.store.intermediate_path(basefile, attachment=iattachment)
            # Use a dedicated local so the ``fp`` argument is not overwritten.
            if not os.path.exists(intermediate_path):
                attachment_fp = self.convert_pdf(downloaded_path, intermediate_path)
            else:
                attachment_fp = self.store.open_intermediate(basefile, attachment=iattachment)
            # Same intermediate format as the main document, hence the same
            # parser. NOTE(review): assumes read() returns a reader usable
            # with ``+=`` -- confirm against StreamingPDFReader.
            reader += StreamingPDFReader().read(attachment_fp, parser=parser)

        for page in reader:
            page.src = "index.pdf"  # FIXME: don't hardcode the filename
        return reader