Python _extract_image_page примеры использования

Язык программирования: Python

Пространство имен/Пакет: aleph.ingest.tesseract

Метод/Функция: _extract_image_page

Примеров на hotexamples.com: 3

Python _extract_image_page — найдено 3 примера. Это лучшие примеры кода на Python для aleph.ingest.tesseract._extract_image_page, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page yields no text (or fewer than three characters),
    that page is sent through OCR instead.

    :param path: file system path of the PDF document to read.
    :param languages: passed through unchanged to ``_extract_image_page``
        when a page needs OCR.
    :returns: a dict with a ``'pages'`` list holding one entry per page,
        plus one key for each usable entry of the last document info
        dictionary (keys lower-cased and stripped).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Everything after the device is created runs under try/finally so
        # the device is closed even when parsing or OCR raises.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if doc.info:
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # Skip the reserved 'pages' key, empty values and
                    # unresolved object references.
                    if k != 'pages' and v is not None \
                            and '<PDFObjRef:' not in v:
                        # NOTE(review): v has already been passed through
                        # string_value above; the second call looks
                        # redundant - confirm string_value is idempotent
                        # before removing it.
                        result[k] = string_value(v)

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Best effort: a page that fails to parse falls
                    # through to OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    # Page numbers are reported and passed on 1-based.
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()

Пример #2

Показать файл

Файл: pdf.py Проект: nivertech/aleph

def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page yields no text (or fewer than three characters),
    that page is sent through OCR instead.

    :param path: file system path of the PDF document to read.
    :param languages: passed through unchanged to ``_extract_image_page``
        when a page needs OCR.
    :returns: a dict with a ``'pages'`` list holding one entry per page,
        plus one key for each usable entry of the last document info
        dictionary (keys lower-cased and stripped).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Guard all work done with the device so that it is always closed,
        # including when document parsing or OCR raises.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if doc.info:
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # Ignore the reserved 'pages' key, empty values and
                    # unresolved object references.
                    if k != 'pages' and v is not None \
                            and '<PDFObjRef:' not in v:
                        # NOTE(review): string_value was already applied to
                        # v above; this second call is probably redundant -
                        # verify idempotence before dropping it.
                        result[k] = string_value(v)

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Best effort: parsing failures are logged and the page
                    # is handed to OCR below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    # Page numbers are reported and passed on 1-based.
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            device.close()

Пример #3

Показать файл

Файл: pdf.py Проект: CodeForAfrica/aleph

def _convert_page(interpreter, page, device, page_no, path, languages):
    """Return the stripped text of a single PDF page.

    Text is gathered from the ``LTTextBox`` / ``LTTextLine`` objects of the
    page layout. OCR is requested when an ``LTImage`` covers more than 70%
    of the page area, or when processing the page raises. The OCR result is
    only appended if the ``OCR_PDF_PAGES`` config value is truthy.

    :param interpreter: object whose ``process_page`` renders ``page``.
    :param page: the page to process.
    :param device: object whose ``get_result`` returns the page layout.
    :param page_no: page number used for logging and for OCR.
    :param path: path of the PDF, used for logging and for OCR.
    :param languages: passed through to ``_extract_image_page``.
    :returns: the collected text fragments joined by newlines, stripped.
    """
    fragments = []
    needs_ocr = False
    try:
        interpreter.process_page(page)
        layout = device.get_result()

        for text_obj in _find_objects(layout._objs, (LTTextBox, LTTextLine)):
            raw = text_obj.get_text()
            if raw is None:
                continue
            cleaned = raw.strip()
            if cleaned:
                fragments.append(cleaned)

        # Flag the page for OCR when an image fills more than 70% of it.
        page_area = float(layout.width * layout.height)
        for image_obj in _find_objects(layout._objs, LTImage):
            image_area = float(image_obj.width * image_obj.height)
            if image_area / page_area > 0.7:
                needs_ocr = True

    except Exception as ex:
        # Deliberately broad: any failure while reading the layout is
        # logged and the page falls back to OCR.
        log.exception(ex)
        needs_ocr = True

    if needs_ocr:
        if get_config("OCR_PDF_PAGES"):
            log.info("Using OCR for %r, p.%s", path, page_no)
            ocr_text = _extract_image_page(path, page_no, languages)
            fragments.append(ocr_text)

    text = "\n".join(fragments)
    log.debug("Extracted %d characters of text from %r, p.%s", len(text), path, page_no)
    return text.strip()