def downloaded_to_intermediate(self, basefile, attachment=None):
    """Run only the conversion step of the PDF handling for ``basefile``.

    The downloaded file is first converted without OCR. If the reader
    raises ``PDFFileIsEmpty``, a warning is logged and the conversion is
    attempted once more with ``ocr_lang`` set to ``"swe"``.

    :param basefile: identifier of the document to convert
    :param attachment: optional attachment name, handed on to
                       ``self.store.downloaded_path``
    :returns: the result of ``StreamingPDFReader.convert``
    """
    source = self.store.downloaded_path(basefile, attachment=attachment)
    # Everything except ocr_lang is identical for both attempts.
    convert_args = {
        'filename': source,
        'workdir': os.path.dirname(self.store.intermediate_path(basefile)),
        'images': self.config.pdfimages,
        # flag is set whenever the downloaded file lacks a .pdf suffix
        'convert_to_pdf': not source.endswith(".pdf"),
        'keep_xml': "bz2" if self.config.compress == "bz2" else True,
    }
    reader = StreamingPDFReader()
    try:
        return reader.convert(ocr_lang=None, **convert_args)
    except PDFFileIsEmpty:
        self.log.warning("%s: %s was empty, attempting OCR" %
                         (basefile, source))
        # "swe" is a reasonable guess for the language
        return reader.convert(ocr_lang="swe", **convert_args)
def lazy_downloaded_to_intermediate(basefile):
    """Return a file-like object holding the content to parse for ``basefile``.

    This function takes no ``self`` parameter but uses ``self``; it relies
    on ``self`` being available from the enclosing scope.

    If no ``index.pdf`` attachment exists on disk, the fallbacks are, in
    order: the ``index.html`` attachment (returned as an open file that
    the caller must close), the text of the ``dokument/html`` element of
    the downloaded XML file, and finally a stub HTML document
    ("Dokumenttext saknas").

    Otherwise the PDF is converted with ``StreamingPDFReader``. If that
    raises ``PDFFileIsEmpty`` or ``ExternalCommandError``, the conversion
    is retried with ``ocr_lang="swe"``.

    :param basefile: identifier of the document to convert
    :returns: an open file, a ``StringIO``, or the result of
              ``StreamingPDFReader.convert``
    :raises errors.ParseError: if the intermediate file is larger than
                               20 * 1024 * 1024 bytes after conversion
    """
    downloaded_path = self.store.downloaded_path(
        basefile, attachment="index.pdf")
    downloaded_path_html = self.store.downloaded_path(
        basefile, attachment="index.html")
    if not os.path.exists(downloaded_path):
        if os.path.exists(downloaded_path_html):
            # attempt to parse HTML instead
            return open(downloaded_path_html)
        # just grab the HTML from the XML file itself...
        # NOTE(review): assumes the root element has a <dokument> child;
        # if find("dokument") returns None this raises AttributeError.
        tree = etree.parse(self.store.downloaded_path(basefile))
        html = tree.getroot().find("dokument").find("html")
        if html is not None:
            return StringIO(html.text)
        return StringIO("<html><h1>Dokumenttext saknas</h1></html>")

    intermediate_path = self.store.intermediate_path(basefile)
    intermediate_dir = os.path.dirname(intermediate_path)
    convert_to_pdf = not downloaded_path.endswith(".pdf")
    keep_xml = "bz2" if self.config.compress == "bz2" else True
    reader = StreamingPDFReader()
    try:
        res = reader.convert(filename=downloaded_path,
                             workdir=intermediate_dir,
                             images=self.config.pdfimages,
                             convert_to_pdf=convert_to_pdf,
                             keep_xml=keep_xml)
    except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
        if isinstance(e, errors.ExternalCommandError):
            self.log.debug("%s: PDF file conversion failed: %s" %
                           (basefile, str(e).split("\n")[0]))
            # if PDF file conversion fails, it'll probably fail
            # again when we try OCR, but maybe there will
            # exist a cached intermediate file that allows us
            # to get data without even looking at the PDF file
            # again.
        elif isinstance(e, errors.PDFFileIsEmpty):
            self.log.debug("%s: PDF had no textcontent, trying OCR" %
                           basefile)
        res = reader.convert(filename=downloaded_path,
                             workdir=intermediate_dir,
                             images=self.config.pdfimages,
                             convert_to_pdf=convert_to_pdf,
                             keep_xml=keep_xml,
                             ocr_lang="swe")
        # The OCR conversion may leave an intermediate file under a
        # different name than the plain conversion would (e.g. a
        # .hocr.html.bz2 suffix instead of .xml.bz2), so ask the store
        # again before measuring the file -- otherwise the size check
        # below may look at a stale or nonexistent path.
        intermediate_path = self.store.intermediate_path(basefile)

    # Measure once and reuse the value for both the test and the message.
    size = os.path.getsize(intermediate_path)
    if size > 20 * 1024 * 1024:
        raise errors.ParseError(
            "%s: %s (after conversion) is just too damn big (%s Mbytes)" %
            (basefile, intermediate_path, size / (1024 * 1024)))
    return res
def lazy_downloaded_to_intermediate(basefile):
    """Produce the parseable content for ``basefile`` as a file-like object.

    ``self`` is not a parameter here; it is taken from the enclosing
    scope.

    Without an ``index.pdf`` attachment on disk, the content comes from
    (in order of preference) the ``index.html`` attachment, the
    ``dokument/html`` element of the downloaded XML file, or a stub HTML
    document. With a PDF, ``StreamingPDFReader`` performs the conversion;
    on ``PDFFileIsEmpty`` or ``ExternalCommandError`` it is run again with
    ``ocr_lang="swe"``.

    :param basefile: identifier of the document to convert
    :returns: an open file, a ``StringIO``, or the result of
              ``StreamingPDFReader.convert``
    :raises errors.ParseError: if the intermediate file exceeds
                               20*1024*1024 bytes after conversion
    """
    pdf_file = self.store.downloaded_path(basefile, attachment="index.pdf")
    html_file = self.store.downloaded_path(basefile, attachment="index.html")

    if not os.path.exists(pdf_file):
        if os.path.exists(html_file):
            # attempt to parse HTML instead
            return open(html_file)
        # just grab the HTML from the XML file itself...
        root = etree.parse(self.store.downloaded_path(basefile)).getroot()
        embedded = root.find("dokument").find("html")
        if embedded is None:
            return StringIO("<html><h1>Dokumenttext saknas</h1></html>")
        return StringIO(embedded.text)

    intermediate_path = self.store.intermediate_path(basefile)
    # Arguments common to the plain attempt and the OCR retry.
    convert_args = {
        'filename': pdf_file,
        'workdir': os.path.dirname(intermediate_path),
        'images': self.config.pdfimages,
        'convert_to_pdf': not pdf_file.endswith(".pdf"),
        'keep_xml': "bz2" if self.config.compress == "bz2" else True,
    }
    reader = StreamingPDFReader()
    try:
        res = reader.convert(**convert_args)
    except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as exc:
        if isinstance(exc, errors.ExternalCommandError):
            # if PDF file conversion fails, it'll probably fail
            # again when we try OCR, but maybe there will
            # exist a cached intermediate file that allows us
            # to get data without even looking at the PDF file
            # again.
            self.log.debug("%s: PDF file conversion failed: %s" %
                           (basefile, str(exc).split("\n")[0]))
        else:
            # the only other exception caught here is PDFFileIsEmpty
            self.log.debug("%s: PDF had no textcontent, trying OCR" %
                           basefile)
        res = reader.convert(ocr_lang="swe", **convert_args)
        # now the intermediate path endswith .hocr.html.bz2, not .xml.bz2
        intermediate_path = self.store.intermediate_path(basefile)

    if os.path.getsize(intermediate_path) > 20 * 1024 * 1024:
        raise errors.ParseError(
            "%s: %s (after conversion) is just too damn big (%s Mbytes)" %
            (basefile,
             intermediate_path,
             os.path.getsize(intermediate_path) / (1024 * 1024)))
    return res
def convert_pdf(self, downloaded_path, intermediate_path):
    """Convert ``downloaded_path`` with ``StreamingPDFReader``.

    The directory containing ``intermediate_path`` is used as the
    reader's working directory. ``ocr_lang="swe"`` is passed only when
    ``self.config.ocr`` is truthy.

    :param downloaded_path: path of the file to convert
    :param intermediate_path: path of the intermediate file; only its
                              directory part is used here
    :returns: the result of ``StreamingPDFReader.convert``
    """
    if self.config.compress == "bz2":
        keep_xml = "bz2"
    else:
        keep_xml = True
    options = dict(filename=downloaded_path,
                   workdir=os.path.dirname(intermediate_path),
                   images=self.config.pdfimages,
                   keep_xml=keep_xml)
    if self.config.ocr:
        options.update(ocr_lang='swe')
    return StreamingPDFReader().convert(**options)
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Convert the downloaded file for ``basefile`` with ``StreamingPDFReader``.

    The working directory is the directory of the store's intermediate
    path for ``basefile``. ``ocr_lang="swe"`` is added to the call only
    when ``self.config.ocr`` is truthy.

    :param basefile: identifier of the document to convert
    :param attachment: optional attachment name, handed on to
                       ``self.store.downloaded_path``
    :returns: the result of ``StreamingPDFReader.convert``
    """
    workdir = os.path.dirname(self.store.intermediate_path(basefile))
    if self.config.compress == "bz2":
        keep_xml = "bz2"
    else:
        keep_xml = True
    # an empty dict means the ocr_lang argument is not passed at all
    ocr_options = {'ocr_lang': 'swe'} if self.config.ocr else {}
    return StreamingPDFReader().convert(
        filename=self.store.downloaded_path(basefile, attachment=attachment),
        workdir=workdir,
        images=self.config.pdfimages,
        keep_xml=keep_xml,
        **ocr_options)
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Run only the conversion step of the PDF handling for ``basefile``.

    The downloaded file is first converted without OCR. If the reader
    raises ``PDFFileIsEmpty``:

    * when ``self.config.ocr`` is truthy, the conversion is retried with
      ``ocr_lang="swe"``;
    * otherwise a small placeholder pdf2xml document, wrapped in a
      ``BytesIO`` named ``dummy.xml``, is returned.

    ``self.config.legacytesseract`` is passed as ``legacy_tesseract`` to
    both conversion attempts.

    :param basefile: identifier of the document to convert
    :param attachment: optional attachment name, handed on to
                       ``self.store.downloaded_path``
    :returns: the result of ``StreamingPDFReader.convert``, or the
              placeholder ``BytesIO``
    """
    # force just the conversion part of the PDF handling
    downloaded_path = self.store.downloaded_path(basefile,
                                                 attachment=attachment)
    intermediate_dir = os.path.dirname(self.store.intermediate_path(basefile))
    # Shared by both attempts, so the OCR retry gets exactly the same
    # settings as the first try -- in particular legacy_tesseract, which
    # was previously passed only to the non-OCR attempt.
    # NOTE(review): presumably legacy_tesseract only matters when OCR is
    # actually performed -- confirm against StreamingPDFReader.convert.
    convert_kwargs = {
        'filename': downloaded_path,
        'workdir': intermediate_dir,
        'images': self.config.pdfimages,
        'convert_to_pdf': not downloaded_path.endswith(".pdf"),
        'keep_xml': "bz2" if self.config.compress == "bz2" else True,
        'legacy_tesseract': self.config.legacytesseract,
    }
    reader = StreamingPDFReader()
    try:
        return reader.convert(ocr_lang=None, **convert_kwargs)
    except PDFFileIsEmpty:
        if self.config.ocr:
            self.log.warning("%s: %s was empty, attempting OCR" %
                             (basefile, downloaded_path))
            # "swe" is a reasonable guess for the language
            return reader.convert(ocr_lang="swe", **convert_kwargs)
        self.log.warning("%s: %s was empty, returning placeholder" %
                         (basefile, downloaded_path))
        # A bytes literal may only contain ASCII characters on Python 3,
        # so the "o with diaeresis" in "Avgorandetext" is written as its
        # UTF-8 byte sequence (\xc3\xb6).
        fp = BytesIO(b"""<pdf2xml>
<page number="1" position="absolute" top="0" left="0" height="1029" width="701">
    <fontspec id="0" size="12" family="TimesNewRomanPSMT" color="#000000"/>
    <text top="67" left="77" width="287" height="26" font="0">[Avg\xc3\xb6randetext saknas]</text>
</page>
</pdf2xml>""")
        fp.name = "dummy.xml"
        return fp
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Hand the downloaded file for ``basefile`` to ``StreamingPDFReader``.

    The reader works in the directory of the store's intermediate path
    for ``basefile``. ``ocr_lang`` is set to ``'swe'`` only when
    ``self.config.ocr`` is truthy; otherwise it is not passed.

    :param basefile: identifier of the document to convert
    :param attachment: optional attachment name, handed on to
                       ``self.store.downloaded_path``
    :returns: the result of ``StreamingPDFReader.convert``
    """
    target_dir = os.path.dirname(self.store.intermediate_path(basefile))
    compress_bz2 = self.config.compress == "bz2"
    convert_args = dict(
        filename=self.store.downloaded_path(basefile, attachment=attachment),
        workdir=target_dir,
        images=self.config.pdfimages,
        keep_xml="bz2" if compress_bz2 else True)
    if self.config.ocr:
        convert_args['ocr_lang'] = 'swe'
    return StreamingPDFReader().convert(**convert_args)
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Run only the conversion step of the PDF handling for ``basefile``.

    The downloaded file is first converted without OCR. If the reader
    raises ``PDFFileIsEmpty``:

    * when ``self.config.ocr`` is truthy, the conversion is retried with
      ``ocr_lang="swe"``;
    * otherwise a small placeholder pdf2xml document, wrapped in a
      ``BytesIO`` named ``dummy.xml``, is returned.

    :param basefile: identifier of the document to convert
    :param attachment: optional attachment name, handed on to
                       ``self.store.downloaded_path``
    :returns: the result of ``StreamingPDFReader.convert``, or the
              placeholder ``BytesIO``
    """
    # force just the conversion part of the PDF handling
    downloaded_path = self.store.downloaded_path(basefile,
                                                 attachment=attachment)
    intermediate_path = self.store.intermediate_path(basefile)
    intermediate_dir = os.path.dirname(intermediate_path)
    convert_to_pdf = not downloaded_path.endswith(".pdf")
    keep_xml = "bz2" if self.config.compress == "bz2" else True
    reader = StreamingPDFReader()
    try:
        return reader.convert(filename=downloaded_path,
                              workdir=intermediate_dir,
                              images=self.config.pdfimages,
                              convert_to_pdf=convert_to_pdf,
                              keep_xml=keep_xml,
                              ocr_lang=None)
    except PDFFileIsEmpty:
        if self.config.ocr:
            self.log.warning("%s: %s was empty, attempting OCR" %
                             (basefile, downloaded_path))
            return reader.convert(filename=downloaded_path,
                                  workdir=intermediate_dir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=convert_to_pdf,
                                  keep_xml=keep_xml,
                                  ocr_lang="swe")  # reasonable guess
        self.log.warning("%s: %s was empty, returning placeholder" %
                         (basefile, downloaded_path))
        # A bytes literal may only contain ASCII characters on Python 3,
        # so the "o with diaeresis" in "Avgorandetext" is written as its
        # UTF-8 byte sequence (\xc3\xb6).
        fp = BytesIO(b"""<pdf2xml>
<page number="1" position="absolute" top="0" left="0" height="1029" width="701">
    <fontspec id="0" size="12" family="TimesNewRomanPSMT" color="#000000"/>
    <text top="67" left="77" width="287" height="26" font="0">[Avg\xc3\xb6randetext saknas]</text>
</page>
</pdf2xml>""")
        fp.name = "dummy.xml"
        return fp