示例#1
0
def test_ocr_rotated_small_angle():
    """Check that OCR of the ``rotated_small_angle.pdf`` fixture finds the word 'rotated'.

    The first extracted page image is OCRed into a PDF, the text is extracted
    from that PDF and the test asserts that it contains ``'rotated'``.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # The three resources are opened left-to-right and released in reverse order,
    # exactly as with nested ``with`` statements.
    with extract_page_images(source_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocred_pdf, \
            extract_text_and_structure(ocred_pdf) as extracted:
        # The extractor is expected to produce exactly four values; only the
        # plain text is inspected here.
        text, _structure, _s, _d = extracted
        assert 'rotated' in text
示例#2
0
def p():
    """Ad-hoc debugging helper: OCR extracted page images and copy the artifacts.

    For every image produced by ``extract_page_images`` the image itself and
    the OCRed PDF built from it are copied to ``'..'``; the text and structure
    are then extracted from the OCRed PDF but not used any further.
    """
    source_fn = '..'
    with extract_page_images(source_fn, 54, 54) as page_images:
        for page_image in page_images:
            # Keep a copy of the raw page image for manual inspection.
            shutil.copy(page_image, '..')
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                shutil.copy(ocred_pdf, '..')
                # The results are intentionally discarded; the call is kept so
                # the extraction step is still exercised.
                _text, _struct = extract_text_and_structure(ocred_pdf)
示例#3
0
def test_ocr_page():
    """OCR every page image of ``ocr1.pdf`` and check that known phrases are recognized.

    Each page image is OCRed to a PDF, its text is extracted with pdfminer and
    the per-page texts are concatenated (each one prefixed with a newline).
    Double spaces are collapsed to single ones before the phrases are checked.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    page_texts = []
    with extract_page_images(source_pdf) as page_images:
        for page_image in page_images:
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                page_texts.append('\n' + extract_text_pdfminer(ocred_pdf))

    txt = ''.join(page_texts).replace('  ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in txt
示例#4
0
def p2():
    """Ad-hoc debugging helper: OCR one page block of a local PDF and merge the text layer.

    The PDF is split into page blocks, the block at index 49 is rendered to
    OCR images at 300 dpi, the image stored under key ``1`` is OCRed into a
    glyph-less text-only PDF with orientation detection enabled, and that PDF
    is merged into the original document. Finally the page block file is
    copied to a fixed output path.
    """
    import shutil
    from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
    from text_extraction_system.ocr.ocr import ocr_page_to_pdf

    source_pdf = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C.pdf'
    page = 1  # NOTE(review): not referenced anywhere below

    with split_pdf_to_page_blocks(source_pdf) as block_fns:
        block_fn = block_fns[49]
        with extract_page_ocr_images(block_fn, 1, 1, dpi=300) as images:
            page_image = images.get(1)
            with ocr_page_to_pdf(page_image,
                                 glyphless_text_only=True,
                                 tesseract_page_orientation_detection=True) as ocred_page_pdf, \
                    merge_pdf_pages(source_pdf,
                                    single_page_merge_num_file_rotate=(1, ocred_page_pdf, None)) as final_pdf:
                # NOTE(review): the merged result ``final_pdf`` is never used; the
                # un-merged page block is what gets copied. Confirm whether the
                # intention was to copy ``final_pdf`` instead.
                shutil.copy(block_fn,
                            '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C__00050.ocred.pdf')
def test_table_ocr():
    """Check that one table is detected in an OCRed image and no warning is emitted.

    ``table1.png`` is OCRed into a PDF, the layout of its first page is read
    and passed to ``extract_tables``. The test passes when exactly one table
    is returned and ``warnings.warn`` was never called during the run.
    """
    from unittest.mock import patch

    fn = os.path.join(data_dir, 'table1.png')

    # Patch ``warnings.warn`` only for the duration of the test body. Assigning
    # the mock to ``warnings.warn`` directly would leave it in place for every
    # test that runs afterwards in the same process.
    with patch.object(warnings, 'warn') as warn_mock:
        # Imported while the patch is active so that import-time warnings are
        # recorded by the mock as well.
        from text_extraction_system.ocr.ocr import ocr_page_to_pdf

        with ocr_page_to_pdf(fn) as pdf_fn:
            with open(pdf_fn, 'rb') as ocred_in_file:
                ocred_page_layout = data_extract.get_first_page_layout(
                    ocred_in_file)
                camelot_tables = extract_tables(1, ocred_page_layout, fn)

    # The mock keeps its call history after the patch has been undone.
    assert len(camelot_tables) == 1
    warn_mock.assert_not_called()
示例#6
0
def process_pdf_page(
    pdf_fn: str,
    page_num: int,
    ocr_enabled: bool = True,
    ocr_language: str = None,
    ocr_timeout_sec: int = 60,
    pdf_password: str = None
) -> Generator[PDFPageProcessingResults, None, None]:
    """Decide whether a PDF page needs OCR and, if so, produce its OCR text layer.

    Exactly one ``PDFPageProcessingResults`` is yielded:

    * ``page_requires_ocr=True`` together with ``ocred_page_fn`` when OCR is
      enabled and a "no-text" image could be extracted from the page;
    * ``page_requires_ocr=False`` otherwise.

    The OCRed file is yielded from inside the ``ocr_page_to_pdf`` context, so it
    is only guaranteed to exist while the generator is suspended at that yield
    (presumably this function is wrapped as a context manager - confirm at the
    definition site).

    :param pdf_fn: path to the PDF file; its page 1 is the one processed.
    :param page_num: NOTE(review): not used in the body.
    :param ocr_enabled: when false, no OCR is attempted.
    :param ocr_language: passed to ``ocr_page_to_pdf`` as ``language``.
    :param ocr_timeout_sec: passed to ``ocr_page_to_pdf`` as ``timeout``.
    :param pdf_password: passed to ``extract_page_ocr_images``.
    """
    # NOTE(review): the opened handle is never read; it is kept open for the
    # whole duration of the processing.
    with open(pdf_fn, 'rb') as in_file:
        if not ocr_enabled:
            yield PDFPageProcessingResults(page_requires_ocr=False)
            return

        # Try to extract the "no-text" image of the page: everything is removed
        # from the page except the images which do not overlap any text element.
        # OCRing only this image avoids duplicating text the PDF already has.
        with extract_page_ocr_images(pdf_fn,
                                     start_page=1,
                                     end_page=1,
                                     pdf_password=pdf_password,
                                     dpi=DPI,
                                     reset_page_rotation=False) as image_fns:
            no_text_image_fn = image_fns.get(1) if image_fns else None
            if no_text_image_fn:
                # The result is a text-based PDF with glyph-less text only,
                # meant to be merged in front of the original page layout.
                with ocr_page_to_pdf(
                        page_image_fn=no_text_image_fn,
                        language=ocr_language,
                        timeout=ocr_timeout_sec,
                        glyphless_text_only=True,
                        tesseract_page_orientation_detection=True
                ) as text_layer_pdf_fn:
                    # Only the transparent text layer is returned, not a merged
                    # page: the final step merges these layers in front of the
                    # pages of the original PDF to keep its small size and its
                    # structure/bookmarks.
                    yield PDFPageProcessingResults(
                        page_requires_ocr=True,
                        ocred_page_fn=text_layer_pdf_fn)
                    return

        # OCR was enabled but there was nothing to OCR on this page.
        yield PDFPageProcessingResults(page_requires_ocr=False)
from text_extraction_system.ocr.ocr import ocr_page_to_pdf
from text_extraction_system.pdf.pdf import merge_pdf_pages, log
import shutil
from text_extraction_system.commons.tests.commons import default_settings
from logging import DEBUG

# Ad-hoc debugging script: OCR a single page image into a glyph-less text-only
# PDF, merge it into the matching one-page PDF and copy the merged result into
# the ``ocred/`` directory. Debug logging of the PDF module is switched on first.
with default_settings():
    log.setLevel(DEBUG)
    # Both resources are acquired in order and released in reverse order,
    # exactly as with two nested ``with`` statements.
    with ocr_page_to_pdf('/home/mikhail/lexpredict/misc/ocr_complicated1/page_no_text_00034.png',
                         glyphless_text_only=True) as fn, \
            merge_pdf_pages('/home/mikhail/lexpredict/misc/ocr_complicated1/ocr_complicated1_0034.pdf',
                            single_page_merge_num_file_rotate=(1, fn, None)) as fn1:
        shutil.copy(fn1, '/home/mikhail/lexpredict/misc/ocr_complicated1/ocred/')