Exemplo n.º 1
0
 def extract_body(self, fp, basefile):
     """Read ``fp`` into a ``StreamingPDFReader`` and return the reader.

     The parser handed to the reader is "ocr" when ``self.config.ocr``
     is truthy and "xml" otherwise. Every page of the result gets its
     ``src`` attribute set to "index.pdf". ``basefile`` is accepted but
     not used by this implementation.
     """
     pdf = StreamingPDFReader()
     if self.config.ocr:
         parser_name = "ocr"
     else:
         parser_name = "xml"
     pdf.read(fp, parser=parser_name)
     for pg in pdf:
         # FIXME: don't hardcode the filename
         pg.src = "index.pdf"
     return pdf
Exemplo n.º 2
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Convert the downloaded file for ``basefile`` and return the result.

     The conversion is first attempted without OCR. If that raises
     ``PDFFileIsEmpty``, a warning is logged and the conversion is
     repeated with ``ocr_lang="swe"``. Whatever
     ``StreamingPDFReader.convert`` returns is passed on to the caller.

     The working directory is the directory of the store's intermediate
     path for ``basefile``; files whose downloaded path does not end in
     ".pdf" are flagged with ``convert_to_pdf=True``.
     """
     # force just the conversion part of the PDF handling
     source_path = self.store.downloaded_path(basefile,
                                              attachment=attachment)
     workdir = os.path.dirname(self.store.intermediate_path(basefile))
     needs_pdf_conversion = not source_path.endswith(".pdf")
     if self.config.compress == "bz2":
         xml_mode = "bz2"
     else:
         xml_mode = True
     converter = StreamingPDFReader()

     def run_conversion(lang):
         # Both attempts share every argument except the OCR language.
         return converter.convert(filename=source_path,
                                  workdir=workdir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=needs_pdf_conversion,
                                  keep_xml=xml_mode,
                                  ocr_lang=lang)

     try:
         return run_conversion(None)
     except PDFFileIsEmpty:
         self.log.warning("%s: %s was empty, attempting OCR" %
                          (basefile, source_path))
         return run_conversion("swe")  # reasonable guess
Exemplo n.º 3
0
 def extract_body(self, fp, basefile):
     """Load ``fp`` through a ``StreamingPDFReader`` and hand it back.

     ``self.config.ocr`` selects the parser: "ocr" if set, else "xml".
     Each page's ``src`` is overwritten with the fixed name "index.pdf".
     The ``basefile`` argument is not consulted here.
     """
     document = StreamingPDFReader()
     chosen_parser = "xml"
     if self.config.ocr:
         chosen_parser = "ocr"
     document.read(fp, parser=chosen_parser)
     for current_page in document:
         # FIXME: don't hardcode the filename
         current_page.src = "index.pdf"
     return document
Exemplo n.º 4
0
    def extract_body(self, fp, basefile):
        """Read the main document plus all downloaded PDF attachments.

        ``fp`` is read first, using the "ocr" parser when
        ``self.config.ocr`` is truthy and the "xml" parser otherwise.
        Then, in sorted order, every downloaded attachment whose name
        ends in ".pdf" is either converted with ``self.convert_pdf``
        (when no intermediate file exists for it yet) or opened from the
        intermediate store, read with the same parser, and added to the
        main reader with ``+=``.

        Returns the combined ``StreamingPDFReader``; every page has its
        ``src`` set to "index.pdf".
        """
        reader = StreamingPDFReader()
        parser = "ocr" if self.config.ocr else "xml"
        intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
        if self.config.compress:
            intermediate_suffix += "." + self.config.compress
        reader.read(fp, parser=parser)
        pdf_attachments = sorted(
            x for x in self.store.list_attachments(basefile, "downloaded")
            if x.endswith(".pdf"))
        for attachment in pdf_attachments:
            downloaded_path = self.store.downloaded_path(basefile,
                                                         attachment=attachment)
            # Swap only the trailing ".pdf"; str.replace would also
            # rewrite a ".pdf" that occurs earlier in the name.
            iattachment = attachment[:-len(".pdf")] + intermediate_suffix
            intermediate_path = self.store.intermediate_path(
                basefile, attachment=iattachment)
            if not os.path.exists(intermediate_path):
                attachment_fp = self.convert_pdf(downloaded_path,
                                                 intermediate_path)
            else:
                attachment_fp = self.store.open_intermediate(
                    basefile, attachment=iattachment)
            # The attachment's intermediate format follows the same
            # config.ocr switch as the main file, so it needs the same
            # parser rather than the reader's default.
            reader += StreamingPDFReader().read(attachment_fp, parser=parser)

        for page in reader:
            page.src = "index.pdf"  # FIXME: don't hardcode the filename
        return reader
Exemplo n.º 5
0
 def convert_pdf(self, downloaded_path, intermediate_path):
     """Run ``StreamingPDFReader.convert`` on ``downloaded_path``.

     The directory part of ``intermediate_path`` is used as the working
     directory. ``keep_xml`` is "bz2" when ``self.config.compress`` equals
     "bz2" and ``True`` otherwise. When ``self.config.ocr`` is truthy the
     conversion is additionally given ``ocr_lang='swe'``.

     Returns whatever ``convert`` returns.
     """
     workdir = os.path.dirname(intermediate_path)
     if self.config.compress == "bz2":
         xml_mode = "bz2"
     else:
         xml_mode = True
     converter = StreamingPDFReader()
     options = dict(filename=downloaded_path,
                    workdir=workdir,
                    images=self.config.pdfimages,
                    keep_xml=xml_mode)
     if self.config.ocr:
         options.update(ocr_lang='swe')
     return converter.convert(**options)
Exemplo n.º 6
0
 def lazy_downloaded_to_intermediate(basefile):
     """Produce the intermediate representation for ``basefile``.

     This function relies on ``self`` from the enclosing scope.

     When no downloaded "index.pdf" exists:
       * an existing "index.html" is opened and returned; otherwise
       * the ``dokument/html`` element of the downloaded XML file is
         looked up and its text returned in a ``StringIO``, or a
         placeholder HTML document if that element is missing.

     When the PDF exists it is converted with ``StreamingPDFReader``.
     If that raises ``errors.PDFFileIsEmpty`` or
     ``errors.ExternalCommandError``, the conversion is retried with
     ``ocr_lang="swe"``.

     Returns the open file / ``StringIO`` described above, or the result
     of ``StreamingPDFReader.convert``.

     Raises ``errors.ParseError`` if the intermediate file is larger
     than 20 MiB after conversion.
     """
     downloaded_path = self.store.downloaded_path(
         basefile, attachment="index.pdf")
     downloaded_path_html = self.store.downloaded_path(
         basefile, attachment="index.html")
     if not os.path.exists(downloaded_path):
         if os.path.exists(downloaded_path_html):
             # attempt to parse HTML instead
             return open(downloaded_path_html)
         # just grab the HTML from the XML file itself...
         tree = etree.parse(self.store.downloaded_path(basefile))
         html = tree.getroot().find("dokument").find("html")
         if html is not None:
             return StringIO(html.text)
         return StringIO("<html><h1>Dokumenttext saknas</h1></html>")
     intermediate_path = self.store.intermediate_path(basefile)
     intermediate_dir = os.path.dirname(intermediate_path)
     convert_to_pdf = not downloaded_path.endswith(".pdf")
     keep_xml = "bz2" if self.config.compress == "bz2" else True
     reader = StreamingPDFReader()
     try:
         res = reader.convert(filename=downloaded_path,
                              workdir=intermediate_dir,
                              images=self.config.pdfimages,
                              convert_to_pdf=convert_to_pdf,
                              keep_xml=keep_xml)
     except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
         if isinstance(e, errors.ExternalCommandError):
             self.log.debug("%s: PDF file conversion failed: %s" %
                            (basefile, str(e).split("\n")[0]))
             # if PDF file conversion fails, it'll probably fail
             # again when we try OCR, but maybe there will
             # exist a cached intermediate file that allows us
             # to get data without even looking at the PDF file
             # again.
         else:
             self.log.debug("%s: PDF had no textcontent, trying OCR" %
                            basefile)
         res = reader.convert(filename=downloaded_path,
                              workdir=intermediate_dir,
                              images=self.config.pdfimages,
                              convert_to_pdf=convert_to_pdf,
                              keep_xml=keep_xml,
                              ocr_lang="swe")
         # The OCR run may leave its output under a different file name
         # than the path computed before conversion, so ask the store
         # again; otherwise the size check below would look at a path
         # that was never written.
         # NOTE(review): assumes the store resolves the intermediate
         # path from what exists on disk -- confirm.
         intermediate_path = self.store.intermediate_path(basefile)
     intermediate_size = os.path.getsize(intermediate_path)
     if intermediate_size > 20 * 1024 * 1024:
         raise errors.ParseError(
             "%s: %s (after conversion) is just too damn big (%s Mbytes)"
             % (basefile, intermediate_path,
                intermediate_size / (1024 * 1024)))
     return res
Exemplo n.º 7
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Convert the downloaded file for ``basefile`` and return the result.

     The downloaded path (optionally for ``attachment``) is passed to
     ``StreamingPDFReader.convert`` with the directory of the store's
     intermediate path as working directory. ``keep_xml`` is "bz2" when
     ``self.config.compress`` equals "bz2", else ``True``. If
     ``self.config.ocr`` is truthy, ``ocr_lang='swe'`` is added.
     """
     workdir = os.path.dirname(self.store.intermediate_path(basefile))
     if self.config.compress == "bz2":
         xml_mode = "bz2"
     else:
         xml_mode = True
     converter = StreamingPDFReader()
     source_path = self.store.downloaded_path(basefile,
                                              attachment=attachment)
     options = dict(filename=source_path,
                    workdir=workdir,
                    images=self.config.pdfimages,
                    keep_xml=xml_mode)
     if self.config.ocr:
         options['ocr_lang'] = 'swe'
     return converter.convert(**options)
Exemplo n.º 8
0
 def extract_body(self, fp, basefile):
     """Read ``fp`` into a ``StreamingPDFReader`` and return it.

     The "ocr" parser is used when the name of ``fp`` contains ".hocr.",
     otherwise the "xml" parser. Each page's ``src`` is set to
     ``<canonical uri>/sid<page number>.png``.

     Raises ``DocumentRemovedError`` (with the parsed path of
     ``basefile`` as ``dummyfile``) when the reader reports itself empty.
     """
     # If we can assume that the fp is a hOCR HTML file and not a
     # PDF2XML file, use the alternate parser. FIXME: There ought to be
     # a cleaner way than guessing based on filename
     if ".hocr." in util.name_from_fp(fp):
         parser_name = "ocr"
     else:
         parser_name = "xml"
     document = StreamingPDFReader().read(fp, parser=parser_name)
     uri_prefix = self.canonical_uri(basefile)
     for pg in document:
         pg.src = "%s/sid%s.png" % (uri_prefix, pg.number)
     if not document.is_empty():
         return document
     raise DocumentRemovedError(dummyfile=self.store.parsed_path(basefile))
Exemplo n.º 9
0
 def lazy_downloaded_to_intermediate(basefile):
     """Produce the intermediate representation for ``basefile``.

     Uses ``self`` from the enclosing scope. Without a downloaded
     "index.pdf" the function returns, in order of preference, the
     opened "index.html", a ``StringIO`` with the text of the
     ``dokument/html`` element of the downloaded XML file, or a
     ``StringIO`` with a placeholder HTML document.

     With a PDF present it returns the result of
     ``StreamingPDFReader.convert``, retrying with ``ocr_lang="swe"``
     when the first attempt raises ``errors.PDFFileIsEmpty`` or
     ``errors.ExternalCommandError``.

     Raises ``errors.ParseError`` when the intermediate file exceeds
     20 MiB after conversion.
     """
     pdf_path = self.store.downloaded_path(basefile,
                                           attachment="index.pdf")
     html_path = self.store.downloaded_path(basefile,
                                            attachment="index.html")
     if not os.path.exists(pdf_path):
         if os.path.exists(html_path):
             # attempt to parse HTML instead
             return open(html_path)
         # just grab the HTML from the XML file itself...
         xml_doc = etree.parse(self.store.downloaded_path(basefile))
         html = xml_doc.getroot().find("dokument").find("html")
         if html is None:
             return StringIO("<html><h1>Dokumenttext saknas</h1></html>")
         return StringIO(html.text)

     intermediate_path = self.store.intermediate_path(basefile)
     workdir = os.path.dirname(intermediate_path)
     needs_pdf_conversion = not pdf_path.endswith(".pdf")
     if self.config.compress == "bz2":
         xml_mode = "bz2"
     else:
         xml_mode = True
     converter = StreamingPDFReader()

     def run_conversion(**extra):
         # Shared arguments for the plain attempt and the OCR retry.
         return converter.convert(filename=pdf_path,
                                  workdir=workdir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=needs_pdf_conversion,
                                  keep_xml=xml_mode,
                                  **extra)

     try:
         res = run_conversion()
     except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
         if isinstance(e, errors.ExternalCommandError):
             first_line = str(e).split("\n")[0]
             self.log.debug("%s: PDF file conversion failed: %s" % (basefile, first_line))
             # if PDF file conversion fails, it'll probably fail
             # again when we try OCR, but maybe there will
             # exist a cached intermediate file that allows us
             # to get data without even looking at the PDF file
             # again.
         else:
             self.log.debug("%s: PDF had no textcontent, trying OCR" % basefile)
         res = run_conversion(ocr_lang="swe")
         # now the intermediate path ends with .hocr.html.bz2, not .xml.bz2
         intermediate_path = self.store.intermediate_path(basefile)
     if os.path.getsize(intermediate_path) > 20*1024*1024:
         raise errors.ParseError("%s: %s (after conversion) is just too damn big (%s Mbytes)" %
                                 (basefile, intermediate_path,
                                  os.path.getsize(intermediate_path) / (1024*1024)))
     return res
Exemplo n.º 10
0
 def extract_body(self, fp, basefile):
     """Parse ``fp`` with ``StreamingPDFReader`` and return the reader.

     A ".hocr." fragment in the name of ``fp`` selects the "ocr" parser;
     any other name selects "xml". Page ``src`` values are built from
     the canonical URI of ``basefile`` and the page number.

     Raises ``DocumentRemovedError`` if the reader is empty, passing the
     parsed path of ``basefile`` as ``dummyfile``.
     """
     # If we can assume that the fp is a hOCR HTML file and not a
     # PDF2XML file, use the alternate parser. FIXME: There ought to be
     # a cleaner way than guessing based on filename
     source_name = util.name_from_fp(fp)
     chosen_parser = "ocr" if ".hocr." in source_name else "xml"
     pages = StreamingPDFReader().read(fp, parser=chosen_parser)
     base = self.canonical_uri(basefile)
     for current in pages:
         current.src = "%s/sid%s.png" % (base, current.number)
     if pages.is_empty():
         raise DocumentRemovedError(
             dummyfile=self.store.parsed_path(basefile))
     return pages
Exemplo n.º 11
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Run the PDF conversion for ``basefile`` and return its result.

     ``StreamingPDFReader.convert`` receives the downloaded path (for
     ``attachment`` if given), the directory of the intermediate path as
     ``workdir``, ``self.config.pdfimages`` as ``images`` and a
     ``keep_xml`` of "bz2" or ``True`` depending on
     ``self.config.compress``. ``ocr_lang='swe'`` is only supplied when
     ``self.config.ocr`` is truthy.
     """
     target_dir = os.path.dirname(self.store.intermediate_path(basefile))
     xml_setting = True
     if self.config.compress == "bz2":
         xml_setting = "bz2"
     pdf_reader = StreamingPDFReader()
     convert_args = {}
     convert_args['filename'] = self.store.downloaded_path(
         basefile, attachment=attachment)
     convert_args['workdir'] = target_dir
     convert_args['images'] = self.config.pdfimages
     convert_args['keep_xml'] = xml_setting
     if self.config.ocr:
         convert_args['ocr_lang'] = 'swe'
     return pdf_reader.convert(**convert_args)
Exemplo n.º 12
0
 def extract_body(self, fp, basefile):
     """Return the document body as a PDF reader or a BeautifulSoup tree.

     If a downloaded "index.pdf" exists for ``basefile``, the passed
     ``fp`` is replaced by ``self.parse_open(basefile)``, which is read
     with ``StreamingPDFReader`` ("ocr" parser when the file name
     contains ".hocr.", else "xml"). Every page's ``src`` is set to the
     path of the downloaded PDF and the reader is returned.

     Otherwise ``fp`` is treated as HTML and returned parsed with
     ``BeautifulSoup(text, "lxml")``.

     Raises ``errors.DocumentRemovedError`` when the HTML content is
     exactly the marker text "Propositionen ej utgiven".
     """
     pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
     # fp can now be a pointer to a hocr file, a pdf2xml file,
     # a html file or a StringIO object containing html taken
     # from index.xml
     if os.path.exists(pdffile):
         fp = self.parse_open(basefile)
         parser = "ocr" if ".hocr." in util.name_from_fp(fp) else "xml"
         reader = StreamingPDFReader().read(fp, parser=parser)
         for page in reader:
             page.src = pdffile
         return reader

     # fp points to a HTML file, which we can use directly.
     # fp will be a raw bitstream of a latin-1 file.
     try:
         filename = util.name_from_fp(fp)
         self.log.debug("%s: Loading soup from %s" %
                        (basefile, filename))
     except ValueError:
         self.log.debug("%s: Loading placeholder soup" % (basefile))
     text = fp.read()
     marker = "Propositionen ej utgiven"
     # fp may yield bytes (raw file) or text (StringIO); a bytes value
     # never compares equal to a text literal on Python 3, so compare
     # against the marker in the matching type.
     if isinstance(text, bytes):
         never_published = text == marker.encode("latin-1")
     else:
         never_published = text == marker
     if never_published:
         raise errors.DocumentRemovedError("%s was never published" %
                                           basefile)
     return BeautifulSoup(text, "lxml")
Exemplo n.º 13
0
    def extract_body(self, fp, basefile):
        """Read the main document and all downloaded PDF attachments.

        ``fp`` is read with the "ocr" parser when ``self.config.ocr`` is
        truthy, else with "xml". Each downloaded attachment ending in
        ".pdf" is then handled in sorted order: it is converted through
        ``self.convert_pdf`` if its intermediate file does not exist yet,
        or opened from the intermediate store if it does, read with the
        same parser and added to the main reader with ``+=``.

        Returns the combined ``StreamingPDFReader`` with every page's
        ``src`` set to "index.pdf".
        """
        reader = StreamingPDFReader()
        parser = "ocr" if self.config.ocr else "xml"
        intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
        if self.config.compress:
            intermediate_suffix += "." + self.config.compress
        reader.read(fp, parser=parser)
        attachments = sorted(self.store.list_attachments(basefile, "downloaded"))
        for attachment in attachments:
            if not attachment.endswith(".pdf"):
                continue
            downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
            # Replace just the ".pdf" extension; str.replace would also
            # touch a ".pdf" appearing earlier in the attachment name.
            iattachment = attachment[:-len(".pdf")] + intermediate_suffix
            intermediate_path = self.store.intermediate_path(basefile, attachment=iattachment)
            if not os.path.exists(intermediate_path):
                attachment_fp = self.convert_pdf(downloaded_path, intermediate_path)
            else:
                attachment_fp = self.store.open_intermediate(basefile, attachment=iattachment)
            # Attachments follow the same config.ocr switch as the main
            # file, so they must be read with the same parser instead of
            # the reader's default.
            reader += StreamingPDFReader().read(attachment_fp, parser=parser)

        for page in reader:
            page.src = "index.pdf"  # FIXME: don't hardcode the filename
        return reader
Exemplo n.º 14
0
    def downloaded_to_intermediate(self, basefile, attachment=None):
        # force just the conversion part of the PDF handling
        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)
        intermediate_path = self.store.intermediate_path(basefile)
        intermediate_dir = os.path.dirname(intermediate_path)
        ocr_lang = None
        convert_to_pdf = not downloaded_path.endswith(".pdf")
        keep_xml = "bz2" if self.config.compress == "bz2" else True
        reader = StreamingPDFReader()
        try:
            return reader.convert(filename=downloaded_path,
                                  workdir=intermediate_dir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=convert_to_pdf,
                                  keep_xml=keep_xml,
                                  ocr_lang=ocr_lang,
                                  legacy_tesseract=self.config.legacytesseract)
        except PDFFileIsEmpty as e:
            if self.config.ocr:
                self.log.warning("%s: %s was empty, attempting OCR" %
                                 (basefile, downloaded_path))
                ocr_lang = "swe"  # reasonable guess
                return reader.convert(filename=downloaded_path,
                                      workdir=intermediate_dir,
                                      images=self.config.pdfimages,
                                      convert_to_pdf=convert_to_pdf,
                                      keep_xml=keep_xml,
                                      ocr_lang=ocr_lang)
            else:
                self.log.warning("%s: %s was empty, returning placeholder" %
                                 (basefile, downloaded_path))
                fp = BytesIO(b"""<pdf2xml>
                <page number="1" position="absolute" top="0" left="0" height="1029" width="701">
	        <fontspec id="0" size="12" family="TimesNewRomanPSMT" color="#000000"/>
                <text top="67" left="77" width="287" height="26" font="0">[Avg&#246;randetext saknas]</text>
                </page>
                </pdf2xml>""")
                fp.name = "dummy.xml"
                return fp
Exemplo n.º 15
0
    def downloaded_to_intermediate(self, basefile, attachment=None):
        # force just the conversion part of the PDF handling
        downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
        intermediate_path = self.store.intermediate_path(basefile)
        intermediate_dir = os.path.dirname(intermediate_path)
        ocr_lang = None
        convert_to_pdf = not downloaded_path.endswith(".pdf")
        keep_xml = "bz2" if self.config.compress == "bz2" else True
        reader = StreamingPDFReader()
        try:
            return reader.convert(filename=downloaded_path,
                                  workdir=intermediate_dir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=convert_to_pdf,
                                  keep_xml=keep_xml,
                                  ocr_lang=ocr_lang)
        except PDFFileIsEmpty as e:
            if self.config.ocr:
                self.log.warning("%s: %s was empty, attempting OCR" % (basefile, downloaded_path))
                ocr_lang = "swe" # reasonable guess
                return reader.convert(filename=downloaded_path,
                                      workdir=intermediate_dir,
                                      images=self.config.pdfimages,
                                      convert_to_pdf=convert_to_pdf,
                                      keep_xml=keep_xml,
                                      ocr_lang=ocr_lang)
            else:
                self.log.warning("%s: %s was empty, returning placeholder" % (basefile, downloaded_path))
                fp = BytesIO(b"""<pdf2xml>
                <page number="1" position="absolute" top="0" left="0" height="1029" width="701">
	        <fontspec id="0" size="12" family="TimesNewRomanPSMT" color="#000000"/>
                <text top="67" left="77" width="287" height="26" font="0">[Avg&#246;randetext saknas]</text>
                </page>
                </pdf2xml>""")
                fp.name = "dummy.xml"
                return fp