Пример #1
0
def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via Tesseract.

    The OCR result is looked up in, and stored to, ``Cache`` keyed on the
    raw data and the normalised language list, so repeated calls with the
    same input skip the OCR step.

    Returns the recognised text, or ``None`` when the image cannot be
    opened (unknown format, or flagged as a decompression bomb).

    Raises ``IngestorException`` if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # The format string previously ended in a bare '%', which is an
        # incomplete conversion: the log call failed to format and the
        # reason was never recorded.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_image(img)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    # Normalise a missing result to an empty string before decoding.
    text = extractor.get_text() or ''
    text = text.decode(encoding="UTF-8")
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
Пример #2
0
def extract_pdf(path, languages=None):
    """
    Pull the per-page content out of a PDF document.

    The PDF is dumped to XML by the configured `pdftohtml` binary into a
    scratch directory; every ``page`` element of that XML is then handed to
    `extract_page` together with the scratch directory and the languages.
    The scratch directory is removed again before returning or raising.

    Returns a dict with a single ``pages`` key holding the list of
    `extract_page` results. Raises ``IngestorException`` when the
    conversion produced no XML file.
    """
    work_dir = make_tempdir()
    try:
        xml_path = os.path.join(work_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        command = [
            get_config('PDFTOHTML_BIN'),
            '-xml', '-hidden', '-q', '-nodrm',
            path, xml_path,
        ]
        subprocess.call(command)

        # The converter's exit status is not inspected; the presence of the
        # output file is the success criterion.
        if not os.path.exists(xml_path):
            raise IngestorException("Could not convert PDF to XML: %s" % path)

        with open(xml_path, 'r') as xml_file:
            markup = string_value(xml_file.read())
        # Drop the encoding declaration before handing the text to lxml.
        markup = markup.replace('encoding="UTF-8"', '')
        lenient_parser = etree.XMLParser(recover=True, remove_comments=True)
        tree = etree.fromstring(markup, parser=lenient_parser)
        log.debug("Parsed XML: %r", path)

        return {
            'pages': [
                extract_page(path, work_dir, page_el, languages)
                for page_el in tree.findall('./page')
            ]
        }
    finally:
        remove_tempdir(work_dir)
Пример #3
0
    def ingest(self, meta, local_path):
        """Ingest an HTML file by cleaning it and converting it to PDF.

        The document's ``<title>`` and ``<meta name="description">`` are
        copied to ``meta.title`` / ``meta.summary`` when those fields are
        not already set. The tree is then passed through ``self.cleaner``,
        written to a temporary ``.htm`` file, converted with
        ``self.generate_pdf_version`` and handed to
        ``self.extract_pdf_alternative``.

        Raises ``IngestorException`` if no PDF file was produced.
        """
        fh, out_path = mkstemp(suffix='.htm')
        os.close(fh)
        # Everything after the temp file exists runs inside the try block so
        # that a read or parse failure no longer leaves the file behind.
        try:
            with open(local_path, 'rb') as fh:
                doc = html.fromstring(fh.read())

            if not meta.has('title'):
                title = doc.findtext('.//title')
                if title is not None:
                    meta.title = title.strip()

            if not meta.has('summary'):
                summary = doc.find('.//meta[@name="description"]')
                if summary is not None and summary.get('content'):
                    meta.summary = summary.get('content')

            self.cleaner(doc)

            with open(out_path, 'w') as fh:
                fh.write(etree.tostring(doc))

            pdf_path = self.generate_pdf_version(out_path)
            if pdf_path is None or not os.path.isfile(pdf_path):
                # Format the message explicitly; passing ``meta`` as a second
                # positional argument left the '%r' placeholder unfilled.
                raise IngestorException(
                    "Could not convert document: %r" % (meta,))
            self.extract_pdf_alternative(meta, pdf_path)
        finally:
            if os.path.isfile(out_path):
                os.unlink(out_path)
Пример #4
0
def ingest_file(source_id, meta, file_path, move=False):
    """Archive a local file and queue it for ingestion.

    ``meta.source_path`` is set to ``file_path`` when not already present.
    The file is stored through ``get_archive().archive_file`` (``move`` is
    forwarded unchanged) and an ``ingest`` task is queued with the source
    id and the resulting metadata.

    Any exception, including the one raised for a missing file, is handed
    to ``Ingestor.handle_exception`` instead of being raised here.
    """
    try:
        if not os.path.isfile(file_path):
            # Interpolate the path into the message; it was previously
            # passed as a separate argument and never formatted.
            raise IngestorException("No such file: %r" % (file_path,))
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(source_id, meta.data)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)
Пример #5
0
def ingest_file(collection_id, meta, file_path, move=False):
    """Archive a local file and queue it for ingestion into a collection.

    ``meta.source_path`` is set to ``file_path`` when not already present.
    The file is stored through ``get_archive().archive_file`` (``move`` is
    forwarded unchanged) and an ``ingest`` task is queued with the
    collection id and ``meta.to_attr_dict()``.

    Any exception, including the one raised for a missing file, is handed
    to ``Ingestor.handle_exception`` instead of being raised here. The
    database session is removed in every case.
    """
    try:
        if not os.path.isfile(file_path):
            # Interpolate the path into the message; it was previously
            # passed as a separate argument and never formatted.
            raise IngestorException("No such file: %r" % (file_path,))
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(collection_id, meta.to_attr_dict())
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
Пример #6
0
 def handle_html(self, meta, html_path):
     """Convert an HTML file to PDF via wkhtmltopdf and extract the result.

     The converter is run with JavaScript, outlines and images disabled.
     The temporary PDF is passed to ``self.extract_pdf_alternative`` and
     removed afterwards, whether or not the conversion succeeded.

     Raises ``IngestorException`` if no PDF file was produced.
     """
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [
             wkhtmltopdf, '--disable-javascript', '--no-outline',
             '--no-images', '--quiet', html_path, out_path
         ]
         # The exit status is ignored; the existence of the output file is
         # what decides success.
         subprocess.call(args)
         if not os.path.isfile(out_path):
             # Format the message explicitly; passing ``meta`` as a second
             # positional argument left the '%r' placeholder unfilled.
             raise IngestorException(
                 "Could not convert document: %r" % (meta,))
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)
Пример #7
0
def extract_image_data(data, languages=None):
    """Run OCR over a binary string of image data and return the text.

    Results are cached in ``Cache`` under the raw data and the normalised
    language list; a cached value is returned without running OCR again.

    Raises ``IngestorException`` if ``TESSDATA_PREFIX`` is not configured.
    """
    prefix = get_config('TESSDATA_PREFIX')
    if prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    lang_codes = get_languages_iso3(languages)
    cached = Cache.get_ocr(data, lang_codes)
    if cached is not None:
        return cached

    image = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    ocr = Tesseract(prefix, lang=lang_codes)
    ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    recognised = ocr.ocr_image(image)
    log.debug('OCR done: %s, %s characters extracted',
              lang_codes, len(recognised))
    Cache.set_ocr(data, lang_codes, recognised)
    return recognised
Пример #8
0
 def ingest(self, meta, local_path):
     """Ingest an image by converting it to a PDF and processing that.

     ``meta.title`` is set to ``meta.file_name``. The image is converted
     with the configured ``CONVERT_BIN`` at 300 dpi, fitted to an A4 page,
     then passed to ``self.store_pdf`` and ``self.extract_pdf``. The
     temporary PDF is removed afterwards in every case.

     Raises ``IngestorException`` if no PDF file was produced.
     """
     # Create the temp file before entering the try block: if mkstemp
     # itself fails, the cleanup below would otherwise hit an unbound
     # ``pdf_path`` and mask the real error.
     fh, pdf_path = mkstemp(suffix='.pdf')
     os.close(fh)
     try:
         meta.title = meta.file_name
         convert = get_config('CONVERT_BIN')
         args = [
             convert, local_path, '-density', '300', '-define',
             'pdf:fit-page=A4', pdf_path
         ]
         # The exit status is ignored; the existence of the output file is
         # what decides success.
         subprocess.call(args)
         if not os.path.isfile(pdf_path):
             raise IngestorException("Could not convert image: %r" % meta)
         self.store_pdf(meta, pdf_path)
         self.extract_pdf(meta, pdf_path)
     finally:
         if os.path.isfile(pdf_path):
             os.unlink(pdf_path)
Пример #9
0
 def generate_pdf_alternative(self, meta, local_path):
     """Convert LibreOffice-supported documents to PDF.

     Runs the configured ``SOFFICE_BIN`` headlessly with a throw-away user
     installation directory, writing into a fresh output directory.

     Returns the path of the first file found in the output directory.
     That directory is left in place on success, so the caller owns the
     returned file. On any failure the output directory is removed.

     Raises ``IngestorException`` if the conversion produced no file; any
     exception from ``subprocess.call`` (e.g. on timeout) propagates.
     """
     work_dir = six.text_type(mkdtemp())
     instance_dir = six.text_type(mkdtemp())
     try:
         soffice = get_config('SOFFICE_BIN')
         # NOTE(review): the double quotes are part of the argument value;
         # with a list-style call no shell strips them - confirm that
         # soffice accepts the option in this form.
         instance_path = u'"-env:UserInstallation=file://%s"' % instance_dir
         args = [soffice, '--convert-to', 'pdf', '--nofirststartwizard',
                 instance_path, '--norestore', '--nologo', '--nodefault',
                 '--nolockcheck', '--invisible', '--outdir', work_dir,
                 '--headless', string_value(local_path)]
         subprocess.call(args, timeout=CONVERT_TIMEOUT)
         out_files = os.listdir(work_dir)
         if out_files:
             return os.path.join(work_dir, out_files[0])
         raise IngestorException("Could not convert document: %r" % meta)
     except Exception:
         # Nothing is returned on this path, so nobody else can clean up
         # the output directory; remove it here instead of leaking it.
         shutil.rmtree(work_dir, ignore_errors=True)
         raise
     finally:
         shutil.rmtree(instance_dir)
Пример #10
0
def ingest_file(collection_id,
                meta,
                file_path,
                move=False,
                queue=WORKER_QUEUE,
                routing_key=WORKER_ROUTING_KEY):
    """Archive a local file and queue it for ingestion into a collection.

    ``meta.source_path`` is set to ``file_path`` when not already present.
    The file is stored through ``get_archive().archive_file`` (``move`` is
    forwarded unchanged) and an ``ingest`` task is submitted on the given
    ``queue`` / ``routing_key``.

    Any exception, including the one raised for a missing file, is handed
    to ``Ingestor.handle_exception`` instead of being raised here. The
    database session is removed in every case.
    """
    # the queue and routing key arguments are a workaround to
    # expedite user uploads over long-running batch imports.
    try:
        if not os.path.isfile(file_path):
            # Interpolate the path into the message; it was previously
            # passed as a separate argument and never formatted.
            raise IngestorException("No such file: %r" % (file_path,))
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue,
                           routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
Пример #11
0
def ingest_url(collection_id, metadata, url):
    """Download a URL, archive the response body and dispatch ingestion.

    The body is streamed to a temporary file in 1024-byte chunks. The
    final response URL is stored as ``meta.source_url`` when not already
    set, and the response headers as ``meta.headers``. The file is then
    archived with ``move=True`` and passed to ``Ingestor.dispatch``.

    Any exception, including the one raised for an HTTP status of 400 or
    above, is handed to ``Ingestor.handle_exception`` instead of being
    raised here. The database session is removed in every case.
    """
    meta = Metadata(data=metadata)
    # Bound before the try block so the cleanup below can tell whether a
    # temp file was ever created.
    tmp_path = None
    try:
        fh, tmp_path = mkstemp()
        os.close(fh)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # iter_content yields raw bytes, so the file must be opened in
        # binary mode; text mode can corrupt or reject the payload.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        try:
            # If the download or archiving failed, the temp file is still
            # on disk; remove it instead of leaking it.
            if tmp_path is not None and os.path.isfile(tmp_path):
                os.unlink(tmp_path)
        finally:
            db.session.remove()