示例#1
0
def test_compare_image_extraction_performance():
    """Benchmark whole-document vs. page-by-page image extraction.

    This is not a real test but a small manual benchmark: it compares how much
    slower the page-to-image conversion is when pdf2image is run once per
    page instead of once for all pages of the document.

    The benchmark is switched off by the unconditional ``return`` below so it
    does not slow down the regular test run; remove that ``return`` to execute
    it locally.
    """
    # disabling to avoid slowing down the tests too much
    return

    pdf_fn = os.path.join(data_dir, 'tables2.pdf')

    # Pass 1: convert every page of the document with a single call.
    start = time.time()
    with extract_page_images(pdf_fn) as image_file_names:
        page_num = len(image_file_names)
        print(f'Extracted {page_num} images')
    all_pages_at_once_seconds = time.time() - start

    # Pass 2: split into one-page blocks and convert each block separately.
    # The timer is started after the split so only the conversion is measured.
    page_num = 0
    with split_pdf_to_page_blocks(pdf_fn, pages_per_block=1) as page_pdf_fns:
        start = time.time()
        for page_fn in page_pdf_fns:
            with extract_page_images(page_fn) as _:
                page_num += 1
        all_pages_separately_seconds = time.time() - start

    print(f'All pages at once time: {all_pages_at_once_seconds:.3f}s\n'
          f'All pages separately time: {all_pages_separately_seconds:.3f}s')
    # The per-page approach is expected to be at least twice as slow.
    assert all_pages_separately_seconds > 2 * all_pages_at_once_seconds
示例#2
0
def test_split_pdf_text():
    """Check that each 4-page block of the 9-page PDF holds the right text.

    The document is split into blocks of 4 pages; the text extracted from the
    first three blocks must contain the marker sentences of their own pages
    and must not contain selected markers belonging to other blocks.
    """
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(src_pdf, 4) as block_files:
        # Text of blocks 0, 1 and 2, extracted in that order.
        texts = [str(extract_text_pdfminer(block_files[idx]))
                 for idx in range(3)]

        # Position k holds the index of the block that must contain page k+1.
        must_contain = (0, 0, 0, 0, 1, 1, 1, 1, 2)
        for page, block_idx in enumerate(must_contain, start=1):
            assert f'This is page {page}.' in texts[block_idx]

        # Position k holds the index of a block that must NOT contain page k+1.
        must_not_contain = (1, 2, 1, 2, 0, 2, 0, 2, 1)
        for page, block_idx in enumerate(must_not_contain, start=1):
            assert f'This is page {page}.' not in texts[block_idx]

        assert len(block_files) == 3
示例#3
0
def test_split_pdf1():
    """Splitting the 9-page PDF by 3 must give three blocks of 3 pages each."""
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(src_pdf, 3) as block_files:
        assert len(block_files) == 3
        for block_fn in block_files:
            # Open every produced block and count its pages.
            with pikepdf.open(block_fn) as block_pdf:
                assert len(block_pdf.pages) == 3
示例#4
0
def test_split_pdf_file_names5():
    """Block file names must be derived from a custom ``page_block_base_name``.

    With base name ``qwerty.pdf`` and 3 pages per block, each block file is
    expected to be named ``qwerty_<first page>_<last page>.pdf`` with 4-digit,
    1-based page numbers.
    """
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    expected_names = (
        'qwerty_0001_0003.pdf',
        'qwerty_0004_0006.pdf',
        'qwerty_0007_0009.pdf',
    )
    with split_pdf_to_page_blocks(
            src_pdf, 3, page_block_base_name='qwerty.pdf') as block_files:
        for idx, expected_name in enumerate(expected_names):
            assert os.path.basename(block_files[idx]) == expected_name
示例#5
0
def test_split_pdf_file_names6():
    """A block size larger than the document must yield one block.

    The 9-page PDF is split with 11 pages per block. The test expects a single
    block whose base name is the original ``pdf_9_pages.pdf`` even though a
    custom ``page_block_base_name`` ('aaa.pdf') was supplied.
    """
    fn = os.path.join(data_dir, 'pdf_9_pages.pdf')
    # No scratch directory is needed here: the previous unused
    # tempfile.mkdtemp() call only left an orphaned directory behind per run.
    with split_pdf_to_page_blocks(
            fn, 11, page_block_base_name='aaa.pdf') as block_files:
        assert len(block_files) == 1
        assert os.path.basename(block_files[0]) == 'pdf_9_pages.pdf'
示例#6
0
def test_split_pdf2():
    """Splitting the 9-page PDF by 4 must give blocks of 4, 4 and 1 pages."""
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    expected_page_counts = (4, 4, 1)
    with split_pdf_to_page_blocks(src_pdf, 4) as block_files:
        assert len(block_files) == 3
        # The last block only holds the single remaining page.
        for idx, expected_pages in enumerate(expected_page_counts):
            with pikepdf.open(block_files[idx]) as block_pdf:
                assert len(block_pdf.pages) == expected_pages
示例#7
0
def p2():
    """Ad-hoc debugging helper for OCR of one block of a local PDF.

    Splits a hard-coded local PDF into blocks, takes the block at index 49,
    renders it for OCR at 300 dpi, OCRs the rendered image into a glyphless
    text-only PDF with Tesseract page orientation detection enabled, merges
    that OCR layer into the original document and finally copies the selected
    block file to a hard-coded destination.

    All paths are developer-machine specific; this is not meant to run in CI.
    """
    from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
    from text_extraction_system.ocr.ocr import ocr_page_to_pdf
    import shutil
    orig_pdf_fn = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C.pdf'

    with split_pdf_to_page_blocks(orig_pdf_fn) as page_fns:
        # Index 49 is the 50th block, matching the "__00050" in the output name.
        page_fn = page_fns[49]
        with extract_page_ocr_images(page_fn, 1, 1, dpi=300) as images:
            with ocr_page_to_pdf(images.get(1),
                                 glyphless_text_only=True,
                                 tesseract_page_orientation_detection=True) as ocred_page_pdf:  # type: str
                with merge_pdf_pages(orig_pdf_fn, single_page_merge_num_file_rotate=(1, ocred_page_pdf, None)) as final_pdf:
                    # NOTE(review): `final_pdf` is never used; the file copied
                    # to the ".ocred.pdf" destination is the original split
                    # block `page_fn`, not the merged result - confirm intent.
                    shutil.copy(page_fn, '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C__00050.ocred.pdf')
def process_pdf(pdf_fn: str, req: RequestMetadata,
                webdav_client: WebDavClient):
    """Split a PDF into single-page files and schedule their processing.

    Steps, all performed while the split page files are available:

    1. Create the per-request WebDAV folders for pages to process, OCRed
       pages and page tables.
    2. Resolve the OCR language from ``req.doc_language``.
    3. Upload every page file and build one ``process_pdf_page_task``
       signature per page (page numbers start at 1).
    4. Launch a chord of the page tasks with ``finish_pdf_processing`` as the
       callback and ``ocr_error_callback`` linked as its error handler.
    5. Register the ids of the chord result and of the children of its parent
       for the request.

    :param pdf_fn: path of the PDF file to process.
    :param req: metadata of the request the document belongs to.
    :param webdav_client: client used for folder creation, uploads and
        task-id registration.
    """
    log.info(f'{req.original_file_name} | Pre-processing PDF document')
    log.info(
        f'{req.original_file_name} | Splitting to pages to parallelize processing...'
    )
    with split_pdf_to_page_blocks(pdf_fn, pages_per_block=1) as pdf_page_fns:
        # Remote folders used by this request.
        webdav_client.mkdir(f'{req.request_id}/{pages_for_processing}')
        webdav_client.mkdir(f'{req.request_id}/{pages_ocred}')
        webdav_client.mkdir(f'{req.request_id}/{pages_tables}')

        # Translate the requested document language into the Tesseract form.
        converter = LanguageConverter()
        language, _locale_code = converter.get_language_and_locale_code(
            req.doc_language)
        ocr_language = converter.convert_language_to_tesseract_view(language)

        # Upload each page and prepare its processing task (1-based numbers).
        task_signatures = []
        for page_number, local_page_fn in enumerate(pdf_page_fns, start=1):
            pdf_page_base_fn = os.path.basename(local_page_fn)
            webdav_client.upload_file(
                f'{req.request_id}/{pages_for_processing}/{pdf_page_base_fn}',
                local_page_fn)
            page_signature = process_pdf_page_task.s(
                req.request_id, req.original_file_name, pdf_page_base_fn,
                page_number, ocr_language,
                req.request_callback_info.log_extra)
            task_signatures.append(page_signature)

        log.info(
            f'{req.original_file_name} | Scheduling {len(task_signatures)} sub-tasks...'
        )
        request_callback_info_dict = req.request_callback_info.to_dict()

        # Page tasks form the chord header; the finishing task is its body.
        page_tasks_chord = chord(task_signatures)
        finish_signature = finish_pdf_processing.s(
            req.request_id, req.original_file_name,
            request_callback_info_dict)
        finish_with_errback = finish_signature.set(link_error=[
            ocr_error_callback.s(req.request_id, request_callback_info_dict)
        ])
        chord_result = page_tasks_chord(finish_with_errback)

        # Record the chord result id and the ids of its parent's children.
        register_task_id(webdav_client, req.request_id, chord_result.id)
        for child_result in chord_result.parent.children:
            register_task_id(webdav_client, req.request_id, child_result.id)
示例#9
0
def test_split_pdf_file_names1():
    """Default block names must carry the 1-based page range of each block.

    For 4 pages per block the expected suffix is ``_<first>_<last>``; the
    final single-page block is expected to carry only ``_<page>``.
    """
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    expected_names = (
        'pdf_9_pages_0001_0004.pdf',
        'pdf_9_pages_0005_0008.pdf',
        'pdf_9_pages_0009.pdf',
    )
    with split_pdf_to_page_blocks(src_pdf, 4) as block_files:
        for idx, expected_name in enumerate(expected_names):
            assert os.path.basename(block_files[idx]) == expected_name
示例#10
0
def test_split_pdf_file_names4():
    """A block size above the page count must return one file named as the source."""
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(src_pdf, 11) as block_files:
        assert len(block_files) == 1
        only_block = block_files[0]
        # Only the tail of the path is checked, not its directory.
        assert only_block.endswith('pdf_9_pages.pdf')
示例#11
0
def test_split_pdf_file_names3():
    """Single-page blocks must be named ``<base>_<4-digit page number>.pdf``."""
    src_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    # (index into the block list, expected base file name)
    expectations = (
        (0, 'pdf_9_pages_0001.pdf'),
        (1, 'pdf_9_pages_0002.pdf'),
        (-1, 'pdf_9_pages_0009.pdf'),
    )
    with split_pdf_to_page_blocks(src_pdf, 1) as block_files:
        for idx, expected_name in expectations:
            assert os.path.basename(block_files[idx]) == expected_name
import shutil

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks

# Ad-hoc script: split a local PDF into blocks of 1 page and save one of them.
# The paths are specific to the author's machine.
pdf_fn = '/home/mikhail/lexpredict/misc/angles/wrong_angle6.pdf'
with split_pdf_to_page_blocks(pdf_fn, 1) as pages:
    # Index 96 is the 97th block. The copy is done inside the `with` block -
    # presumably the split files only exist while the context is open; confirm.
    shutil.copy(pages[96], '/home/mikhail/lexpredict/misc/angles/20210504/')
示例#13
0
from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks
from shutil import copy

# Ad-hoc script: split a local PDF and copy every resulting block file into a
# folder. No block size is passed, so the function's default applies -
# presumably one page per block, judging by the `page_fns` name; confirm.
with split_pdf_to_page_blocks(
        '/home/mikhail/lexpredict/misc/ocr_complicated1.pdf') as page_fns:
    for p in page_fns:
        # Destination ends with '/', i.e. it is treated as a directory target.
        copy(p, '/home/mikhail/lexpredict/misc/ocr_complicated1/')