def extract_body(self, fp, basefile):
    """Parse ``fp`` with a ``StreamingPDFReader`` and return the reader.

    The ``"ocr"`` parser is used when ``self.config.ocr`` is truthy,
    otherwise the ``"xml"`` parser. ``basefile`` is not used here.
    """
    pdf = StreamingPDFReader()
    if self.config.ocr:
        parser_name = "ocr"
    else:
        parser_name = "xml"
    pdf.read(fp, parser=parser_name)
    # FIXME: don't hardcode the filename
    for pdf_page in pdf:
        pdf_page.src = "index.pdf"
    return pdf
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Run only the conversion step of the PDF handling.

    Converts the downloaded file for ``basefile`` (optionally a named
    ``attachment``) into the directory that holds the intermediate file,
    and returns whatever ``StreamingPDFReader.convert`` returns.

    If the first conversion raises ``PDFFileIsEmpty``, the conversion is
    retried with OCR enabled (``ocr_lang="swe"``).
    """
    source_path = self.store.downloaded_path(basefile, attachment=attachment)
    work_dir = os.path.dirname(self.store.intermediate_path(basefile))
    if self.config.compress == "bz2":
        keep_xml = "bz2"
    else:
        keep_xml = True
    # Arguments shared by the plain attempt and the OCR retry.
    shared_args = dict(
        filename=source_path,
        workdir=work_dir,
        images=self.config.pdfimages,
        # Anything that is not already a PDF must be converted to one first.
        convert_to_pdf=not source_path.endswith(".pdf"),
        keep_xml=keep_xml,
    )
    converter = StreamingPDFReader()
    try:
        return converter.convert(ocr_lang=None, **shared_args)
    except PDFFileIsEmpty:
        self.log.warning("%s: %s was empty, attempting OCR" %
                         (basefile, source_path))
        # "swe" is a reasonable guess for the OCR language
        return converter.convert(ocr_lang="swe", **shared_args)
def extract_body(self, fp, basefile):
    """Load ``fp`` into a ``StreamingPDFReader`` and hand the reader back.

    ``self.config.ocr`` selects the parser: ``"ocr"`` when truthy,
    ``"xml"`` otherwise. The ``basefile`` argument is unused.
    """
    document = StreamingPDFReader()
    parser_kind = "xml"
    if self.config.ocr:
        parser_kind = "ocr"
    document.read(fp, parser=parser_kind)
    for current_page in document:
        # FIXME: don't hardcode the filename
        current_page.src = "index.pdf"
    return document
def extract_body(self, fp, basefile):
    """Read the main intermediate file plus every downloaded PDF attachment.

    ``fp`` is parsed first. Then, for each downloaded attachment of
    ``basefile`` whose name ends in ".pdf" (in sorted order), the matching
    intermediate file is opened -- or created through ``self.convert_pdf``
    when it does not exist yet -- parsed, and appended to the reader.

    The parser is ``"ocr"`` when ``self.config.ocr`` is truthy, otherwise
    ``"xml"``.

    Returns the combined ``StreamingPDFReader``.
    """
    pdf_suffix = ".pdf"
    reader = StreamingPDFReader()
    parser = "ocr" if self.config.ocr else "xml"
    intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
    if self.config.compress:
        intermediate_suffix += "." + self.config.compress
    reader.read(fp, parser=parser)
    for attachment in sorted(self.store.list_attachments(basefile,
                                                         "downloaded")):
        if not attachment.endswith(pdf_suffix):
            continue
        # Swap only the trailing extension; str.replace() would also
        # rewrite a ".pdf" that occurs earlier in the attachment name.
        iattachment = attachment[:-len(pdf_suffix)] + intermediate_suffix
        intermediate_path = self.store.intermediate_path(
            basefile, attachment=iattachment)
        if os.path.exists(intermediate_path):
            attachment_fp = self.store.open_intermediate(
                basefile, attachment=iattachment)
        else:
            downloaded_path = self.store.downloaded_path(
                basefile, attachment=attachment)
            attachment_fp = self.convert_pdf(downloaded_path,
                                             intermediate_path)
        # Attachment intermediates are selected by the same OCR setting as
        # the main file, so they must be parsed with the same parser.
        reader += StreamingPDFReader().read(attachment_fp, parser=parser)
    for page in reader:
        page.src = "index.pdf"  # FIXME: don't hardcode the filename
    return reader
def convert_pdf(self, downloaded_path, intermediate_path):
    """Convert ``downloaded_path`` with ``StreamingPDFReader.convert``.

    The directory part of ``intermediate_path`` is used as the working
    directory. OCR (``ocr_lang='swe'``) is requested only when
    ``self.config.ocr`` is truthy. Returns the result of ``convert``.
    """
    if self.config.compress == "bz2":
        keep_xml = "bz2"
    else:
        keep_xml = True
    converter = StreamingPDFReader()
    convert_args = dict(
        filename=downloaded_path,
        workdir=os.path.dirname(intermediate_path),
        images=self.config.pdfimages,
        keep_xml=keep_xml,
    )
    if self.config.ocr:
        convert_args.update(ocr_lang='swe')
    return converter.convert(**convert_args)
def lazy_downloaded_to_intermediate(basefile):
    """Return a file-like object holding the document text for ``basefile``.

    Uses ``self`` from the enclosing scope.

    * If no ``index.pdf`` has been downloaded, fall back to the downloaded
      ``index.html`` (returned as an open file that the caller must close),
      then to the ``dokument/html`` element of the downloaded XML file, and
      finally to a fixed placeholder document.
    * Otherwise convert the PDF. If that raises ``PDFFileIsEmpty`` or
      ``ExternalCommandError``, retry with OCR (``ocr_lang="swe"``).

    Raises ``errors.ParseError`` when the intermediate file is larger than
    20 MiB after conversion.
    """
    downloaded_path = self.store.downloaded_path(
        basefile, attachment="index.pdf")
    downloaded_path_html = self.store.downloaded_path(
        basefile, attachment="index.html")
    if not os.path.exists(downloaded_path):
        if os.path.exists(downloaded_path_html):
            # attempt to parse HTML instead
            return open(downloaded_path_html)
        # just grab the HTML from the XML file itself...
        tree = etree.parse(self.store.downloaded_path(basefile))
        html = tree.getroot().find("dokument").find("html")
        if html is not None:
            return StringIO(html.text)
        return StringIO("<html><h1>Dokumenttext saknas</h1></html>")

    intermediate_dir = os.path.dirname(self.store.intermediate_path(basefile))
    # Arguments shared by the plain attempt and the OCR retry.
    convert_args = {
        "filename": downloaded_path,
        "workdir": intermediate_dir,
        "images": self.config.pdfimages,
        "convert_to_pdf": not downloaded_path.endswith(".pdf"),
        "keep_xml": "bz2" if self.config.compress == "bz2" else True,
    }
    reader = StreamingPDFReader()
    try:
        res = reader.convert(**convert_args)
    except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
        if isinstance(e, errors.ExternalCommandError):
            self.log.debug("%s: PDF file conversion failed: %s" %
                           (basefile, str(e).split("\n")[0]))
            # If PDF file conversion fails, it will probably fail again
            # when we try OCR, but maybe there is a cached intermediate
            # file that lets us get data without looking at the PDF file
            # again.
        elif isinstance(e, errors.PDFFileIsEmpty):
            self.log.debug("%s: PDF had no textcontent, trying OCR" %
                           basefile)
        res = reader.convert(ocr_lang="swe", **convert_args)

    # Ask the store again instead of reusing the pre-conversion path.
    # NOTE(review): after the OCR fallback the intermediate file presumably
    # has a different name than the plain XML one -- confirm against the
    # store implementation.
    intermediate_path = self.store.intermediate_path(basefile)
    size = os.path.getsize(intermediate_path)
    if size > 20 * 1024 * 1024:
        raise errors.ParseError(
            "%s: %s (after conversion) is just too damn big (%s Mbytes)" %
            (basefile, intermediate_path, size / (1024 * 1024)))
    return res
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Convert the downloaded file for ``basefile`` and return the result.

    The file (optionally a named ``attachment``) is passed to
    ``StreamingPDFReader.convert`` with the intermediate file's directory
    as working directory. OCR (``ocr_lang='swe'``) is requested only when
    ``self.config.ocr`` is truthy.
    """
    work_dir = os.path.dirname(self.store.intermediate_path(basefile))
    if self.config.compress == "bz2":
        keep_xml = "bz2"
    else:
        keep_xml = True
    converter = StreamingPDFReader()
    convert_args = dict(
        filename=self.store.downloaded_path(basefile, attachment=attachment),
        workdir=work_dir,
        images=self.config.pdfimages,
        keep_xml=keep_xml,
    )
    if self.config.ocr:
        convert_args.update(ocr_lang='swe')
    return converter.convert(**convert_args)
def extract_body(self, fp, basefile):
    """Parse ``fp`` into a ``StreamingPDFReader`` and return it.

    Each page's ``src`` is set to ``<canonical uri>/sid<page number>.png``.

    Raises ``DocumentRemovedError`` when the reader reports itself empty.
    """
    # If the file name suggests an hOCR HTML file rather than a PDF2XML
    # file, use the alternate parser. FIXME: there ought to be a cleaner
    # way than guessing based on the filename.
    if ".hocr." in util.name_from_fp(fp):
        parser_name = "ocr"
    else:
        parser_name = "xml"
    pdf = StreamingPDFReader().read(fp, parser=parser_name)
    uri_prefix = self.canonical_uri(basefile)
    for pdf_page in pdf:
        pdf_page.src = "%s/sid%s.png" % (uri_prefix, pdf_page.number)
    if not pdf.is_empty():
        return pdf
    raise DocumentRemovedError(dummyfile=self.store.parsed_path(basefile))
def lazy_downloaded_to_intermediate(basefile):
    """Return a file-like object holding the document text for ``basefile``.

    Uses ``self`` from the enclosing scope.

    Without a downloaded ``index.pdf`` the fallbacks are, in order: the
    downloaded ``index.html`` (returned as an open file), the
    ``dokument/html`` element of the downloaded XML file, and a fixed
    placeholder document. With a PDF, the file is converted; when that
    raises ``PDFFileIsEmpty`` or ``ExternalCommandError`` the conversion is
    retried with OCR (``ocr_lang="swe"``).

    Raises ``errors.ParseError`` when the intermediate file is larger than
    20 MiB after conversion.
    """
    pdf_path = self.store.downloaded_path(basefile, attachment="index.pdf")
    html_path = self.store.downloaded_path(basefile, attachment="index.html")
    if not os.path.exists(pdf_path):
        if os.path.exists(html_path):
            # attempt to parse HTML instead
            return open(html_path)
        # just grab the HTML from the XML file itself...
        xml_tree = etree.parse(self.store.downloaded_path(basefile))
        html_node = xml_tree.getroot().find("dokument").find("html")
        if html_node is None:
            return StringIO("<html><h1>Dokumenttext saknas</h1></html>")
        return StringIO(html_node.text)

    intermediate_path = self.store.intermediate_path(basefile)
    if self.config.compress == "bz2":
        keep_xml = "bz2"
    else:
        keep_xml = True
    # Arguments shared by the plain attempt and the OCR retry.
    shared_args = dict(
        filename=pdf_path,
        workdir=os.path.dirname(intermediate_path),
        images=self.config.pdfimages,
        convert_to_pdf=not pdf_path.endswith(".pdf"),
        keep_xml=keep_xml,
    )
    converter = StreamingPDFReader()
    try:
        result = converter.convert(**shared_args)
    except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as exc:
        if isinstance(exc, errors.ExternalCommandError):
            first_line = str(exc).split("\n")[0]
            self.log.debug("%s: PDF file conversion failed: %s" %
                           (basefile, first_line))
            # If PDF file conversion fails, it will probably fail again
            # when we try OCR, but maybe there is a cached intermediate
            # file that lets us get data without looking at the PDF file
            # again.
        elif isinstance(exc, errors.PDFFileIsEmpty):
            self.log.debug("%s: PDF had no textcontent, trying OCR" % basefile)
        result = converter.convert(ocr_lang="swe", **shared_args)
        # now the intermediate path ends with .hocr.html.bz2, not .xml.bz2
        intermediate_path = self.store.intermediate_path(basefile)
    size_limit = 20*1024*1024
    if os.path.getsize(intermediate_path) > size_limit:
        message = "%s: %s (after conversion) is just too damn big (%s Mbytes)" % (
            basefile, intermediate_path,
            os.path.getsize(intermediate_path) / (1024*1024))
        raise errors.ParseError(message)
    return result
def extract_body(self, fp, basefile):
    """Read ``fp`` with ``StreamingPDFReader`` and return the reader.

    Every page gets a ``src`` of the form
    ``<canonical uri>/sid<page number>.png``.

    Raises ``DocumentRemovedError`` if the reader's ``is_empty()`` is true.
    """
    # The parser is guessed from the file name: ".hocr." means an hOCR
    # HTML file, anything else is treated as PDF2XML. FIXME: there ought
    # to be a cleaner way than guessing based on the filename.
    source_name = util.name_from_fp(fp)
    parser_kind = "xml"
    if ".hocr." in source_name:
        parser_kind = "ocr"
    document = StreamingPDFReader().read(fp, parser=parser_kind)
    base = self.canonical_uri(basefile)
    for current_page in document:
        current_page.src = "%s/sid%s.png" % (base, current_page.number)
    if document.is_empty():
        parsed = self.store.parsed_path(basefile)
        raise DocumentRemovedError(dummyfile=parsed)
    return document
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Convert the downloaded file for ``basefile`` via ``StreamingPDFReader``.

    The working directory is the directory of the intermediate file for
    ``basefile``. ``ocr_lang='swe'`` is added to the conversion arguments
    only when ``self.config.ocr`` is truthy. Returns whatever
    ``StreamingPDFReader.convert`` returns.
    """
    target_dir = os.path.dirname(self.store.intermediate_path(basefile))
    xml_mode = True
    if self.config.compress == "bz2":
        xml_mode = "bz2"
    pdf_converter = StreamingPDFReader()
    options = {}
    options['filename'] = self.store.downloaded_path(basefile,
                                                     attachment=attachment)
    options['workdir'] = target_dir
    options['images'] = self.config.pdfimages
    options['keep_xml'] = xml_mode
    if self.config.ocr:
        options['ocr_lang'] = 'swe'
    return pdf_converter.convert(**options)
def extract_body(self, fp, basefile):
    """Return the document body as a PDF reader or a BeautifulSoup tree.

    If ``index.pdf`` has been downloaded for ``basefile``, the incoming
    ``fp`` is replaced by ``self.parse_open(basefile)`` and parsed with
    ``StreamingPDFReader``; every page's ``src`` is set to the PDF path.

    Otherwise ``fp`` is treated as HTML and returned as a
    ``BeautifulSoup`` object.

    Raises ``errors.DocumentRemovedError`` when the HTML content is exactly
    the "Propositionen ej utgiven" marker.
    """
    pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
    # fp can be a pointer to a hocr file, a pdf2xml file, a html file or
    # a StringIO object containing html taken from index.xml
    if os.path.exists(pdffile):
        fp = self.parse_open(basefile)
        parser = "ocr" if ".hocr." in util.name_from_fp(fp) else "xml"
        reader = StreamingPDFReader().read(fp, parser=parser)
        for page in reader:
            page.src = pdffile
        return reader

    # fp points to a HTML file, which we can use directly.
    # fp will be a raw bitstream of a latin-1 file.
    try:
        filename = util.name_from_fp(fp)
    except ValueError:
        self.log.debug("%s: Loading placeholder soup" % (basefile))
    else:
        self.log.debug("%s: Loading soup from %s" % (basefile, filename))
    text = fp.read()
    never_published = "Propositionen ej utgiven"
    if isinstance(text, bytes):
        # A binary stream yields bytes, which never compare equal to a
        # str on Python 3; the marker is pure ASCII so encoding is safe.
        never_published = never_published.encode("ascii")
    if text == never_published:
        raise errors.DocumentRemovedError("%s was never published" % basefile)
    return BeautifulSoup(text, "lxml")
def extract_body(self, fp, basefile):
    """Read the main intermediate file plus every downloaded PDF attachment.

    ``fp`` is parsed first. Then, for each downloaded attachment of
    ``basefile`` whose name ends in ".pdf" (in sorted order), the matching
    intermediate file is opened -- or created through ``self.convert_pdf``
    when it does not exist yet -- parsed, and appended to the reader.

    The parser is ``"ocr"`` when ``self.config.ocr`` is truthy, otherwise
    ``"xml"``.

    Returns the combined ``StreamingPDFReader``.
    """
    pdf_suffix = ".pdf"
    reader = StreamingPDFReader()
    parser = "ocr" if self.config.ocr else "xml"
    intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
    if self.config.compress:
        intermediate_suffix += "." + self.config.compress
    reader.read(fp, parser=parser)
    pdf_attachments = [
        name for name in sorted(self.store.list_attachments(basefile,
                                                            "downloaded"))
        if name.endswith(pdf_suffix)
    ]
    for attachment in pdf_attachments:
        # Swap only the trailing extension; str.replace() would also
        # rewrite a ".pdf" that occurs earlier in the attachment name.
        iattachment = attachment[:-len(pdf_suffix)] + intermediate_suffix
        intermediate_path = self.store.intermediate_path(
            basefile, attachment=iattachment)
        if os.path.exists(intermediate_path):
            attachment_fp = self.store.open_intermediate(
                basefile, attachment=iattachment)
        else:
            downloaded_path = self.store.downloaded_path(
                basefile, attachment=attachment)
            attachment_fp = self.convert_pdf(downloaded_path,
                                             intermediate_path)
        # Attachment intermediates are selected by the same OCR setting as
        # the main file, so they must be parsed with the same parser.
        reader += StreamingPDFReader().read(attachment_fp, parser=parser)
    for page in reader:
        page.src = "index.pdf"  # FIXME: don't hardcode the filename
    return reader
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Run only the conversion step of the PDF handling.

    Converts the downloaded file for ``basefile`` (optionally a named
    ``attachment``) and returns the result of
    ``StreamingPDFReader.convert``.

    If the conversion raises ``PDFFileIsEmpty``:

    * with ``self.config.ocr`` truthy, the conversion is retried with OCR
      (``ocr_lang="swe"``);
    * otherwise a ``BytesIO`` named ``dummy.xml`` is returned, holding a
      one-page pdf2xml placeholder document.
    """
    downloaded_path = self.store.downloaded_path(basefile,
                                                 attachment=attachment)
    intermediate_dir = os.path.dirname(self.store.intermediate_path(basefile))
    # Arguments shared by the plain attempt and the OCR retry. The retry
    # gets the same legacy_tesseract setting as the first attempt, so the
    # configured value is not silently dropped for the OCR run.
    convert_args = {
        "filename": downloaded_path,
        "workdir": intermediate_dir,
        "images": self.config.pdfimages,
        "convert_to_pdf": not downloaded_path.endswith(".pdf"),
        "keep_xml": "bz2" if self.config.compress == "bz2" else True,
        "legacy_tesseract": self.config.legacytesseract,
    }
    reader = StreamingPDFReader()
    try:
        return reader.convert(ocr_lang=None, **convert_args)
    except PDFFileIsEmpty:
        if self.config.ocr:
            self.log.warning("%s: %s was empty, attempting OCR" %
                             (basefile, downloaded_path))
            # "swe" is a reasonable guess for the OCR language
            return reader.convert(ocr_lang="swe", **convert_args)
        self.log.warning("%s: %s was empty, returning placeholder" %
                         (basefile, downloaded_path))
        # Bytes literals may only contain ASCII characters, so the "o with
        # diaeresis" is written as an XML character reference, which an XML
        # parser decodes to the same character.
        fp = BytesIO(
            b'<pdf2xml> '
            b'<page number="1" position="absolute" top="0" left="0" '
            b'height="1029" width="701"> '
            b'<fontspec id="0" size="12" family="TimesNewRomanPSMT" '
            b'color="#000000"/> '
            b'<text top="67" left="77" width="287" height="26" font="0">'
            b'[Avg&#246;randetext saknas]</text> '
            b'</page> '
            b'</pdf2xml>')
        fp.name = "dummy.xml"
        return fp
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Run only the conversion step of the PDF handling.

    Converts the downloaded file for ``basefile`` (optionally a named
    ``attachment``) and returns the result of
    ``StreamingPDFReader.convert``.

    If the conversion raises ``PDFFileIsEmpty``:

    * with ``self.config.ocr`` truthy, the conversion is retried with OCR
      (``ocr_lang="swe"``);
    * otherwise a ``BytesIO`` named ``dummy.xml`` is returned, holding a
      one-page pdf2xml placeholder document.
    """
    downloaded_path = self.store.downloaded_path(basefile,
                                                 attachment=attachment)
    intermediate_dir = os.path.dirname(self.store.intermediate_path(basefile))
    # Arguments shared by the plain attempt and the OCR retry.
    convert_args = {
        "filename": downloaded_path,
        "workdir": intermediate_dir,
        "images": self.config.pdfimages,
        "convert_to_pdf": not downloaded_path.endswith(".pdf"),
        "keep_xml": "bz2" if self.config.compress == "bz2" else True,
    }
    reader = StreamingPDFReader()
    try:
        return reader.convert(ocr_lang=None, **convert_args)
    except PDFFileIsEmpty:
        if self.config.ocr:
            self.log.warning("%s: %s was empty, attempting OCR" %
                             (basefile, downloaded_path))
            # "swe" is a reasonable guess for the OCR language
            return reader.convert(ocr_lang="swe", **convert_args)
        self.log.warning("%s: %s was empty, returning placeholder" %
                         (basefile, downloaded_path))
        # Bytes literals may only contain ASCII characters, so the "o with
        # diaeresis" is written as an XML character reference, which an XML
        # parser decodes to the same character.
        fp = BytesIO(
            b'<pdf2xml> '
            b'<page number="1" position="absolute" top="0" left="0" '
            b'height="1029" width="701"> '
            b'<fontspec id="0" size="12" family="TimesNewRomanPSMT" '
            b'color="#000000"/> '
            b'<text top="67" left="77" width="287" height="26" font="0">'
            b'[Avg&#246;randetext saknas]</text> '
            b'</page> '
            b'</pdf2xml>')
        fp.name = "dummy.xml"
        return fp