예제 #1
0
def test_compare_image_extraction_performance():
    """Benchmark whole-document vs. per-page image extraction (disabled).

    This is not a real test. It measures how much slower the page-to-image
    conversion becomes when ``extract_page_images`` is invoked once per
    single-page PDF instead of once for the whole document.
    """
    # Disabled on purpose so the regular test run is not slowed down.
    # Everything below this return is unreachable and is kept only for
    # manual benchmarking.
    return

    source_pdf = os.path.join(data_dir, 'tables2.pdf')

    # Pass 1: convert the whole document with a single call.
    started_at = time.time()
    with extract_page_images(source_pdf) as whole_doc_images:
        page_num = len(whole_doc_images)
        print(f'Extracted {page_num} images')
    all_pages_at_once_seconds = time.time() - started_at

    # Pass 2: split the document first, then convert every piece separately.
    # The clock is started after the split so only the conversion is timed.
    # NOTE(review): page_dir is not defined in the visible code -- confirm it
    # exists at module level before re-enabling this benchmark.
    with split_pdf_to_page_blocks(source_pdf, page_dir) as single_page_pdfs:
        started_at = time.time()
        for single_page_pdf in single_page_pdfs:
            with extract_page_images(single_page_pdf):
                pass
        all_pages_separately_seconds = time.time() - started_at

    print(f'All pages at once time: {all_pages_at_once_seconds:.3f}s\n'
          f'All pages separately time: {all_pages_separately_seconds:.3f}s')
    assert all_pages_separately_seconds > 2 * all_pages_at_once_seconds
예제 #2
0
def test_ocr_rotated_small_angle():
    """OCR a slightly rotated page and check the word 'rotated' is found.

    The pipeline is: PDF -> page image -> OCR'ed PDF -> extracted text.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # The two positional 1s presumably restrict extraction to page 1 only
    # (start/end page) -- TODO confirm against extract_page_images.
    with extract_page_images(source_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocr_pdf, \
            extract_text_and_structure(ocr_pdf) as (text, _structure, _s, _d):
        assert 'rotated' in text
예제 #3
0
def p():
    """Scratch helper: dump one page image and its OCR'ed PDF for inspection.

    The paths are placeholders ('..'); both the extracted page image and the
    OCR result are copied to the placeholder destination.
    """
    source_pdf = '..'
    with extract_page_images(source_pdf, 54, 54) as page_images:
        for page_image in page_images:
            shutil.copy(page_image, '..')
            with ocr_page_to_pdf(page_image) as ocr_pdf:
                shutil.copy(ocr_pdf, '..')
                # NOTE(review): here the result is unpacked directly into two
                # values, while test_ocr_rotated_small_angle uses
                # extract_text_and_structure as a context manager yielding
                # four values -- confirm which API version this targets.
                _text, _struct = extract_text_and_structure(ocr_pdf)
예제 #4
0
def test_extract_images():
    """Check extracted page images are non-trivial PNGs and get cleaned up.

    Inside the context every image must be a '.png' file larger than 5 bytes;
    after leaving the context the directories that held them must be gone.
    """
    pdf_path = os.path.join(data_dir, 'ocr1.pdf')
    image_dirs = set()
    with extract_page_images(pdf_path) as image_paths:
        for image_path in image_paths:
            _stem, extension = os.path.splitext(image_path)
            assert os.path.getsize(image_path) > 5
            assert extension == '.png'
            image_dirs.add(os.path.dirname(image_path))
    # The context manager has exited: its temporary directories must not
    # exist any more.
    for image_dir in image_dirs:
        assert not os.path.exists(image_dir)
예제 #5
0
def test_ocr_page():
    """OCR every page of 'ocr1.pdf' and look for known license phrases."""
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    page_texts = []
    with extract_page_images(source_pdf) as page_images:
        for page_image in page_images:
            with ocr_page_to_pdf(page_image) as ocr_pdf:
                page_texts.append(extract_text_pdfminer(ocr_pdf))

    # Each page's text is prefixed with a newline, then double spaces are
    # replaced by single ones before matching.
    combined = ''.join('\n' + page_text for page_text in page_texts)
    combined = combined.replace('  ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in combined
예제 #6
0
def extract_tables_from_pdf_file(
        pdf_fn: str,
        pdfminer_advanced_detection: bool = False) -> List[CamelotTable]:
    """Collect the tables detected on every page of a PDF file.

    Page images are rendered once (at dpi=71) and each page layout produced by
    ``iterate_pages`` is paired with the image at the same zero-based index
    before being handed to ``extract_tables``.

    :param pdf_fn: path to the PDF file.
    :param pdfminer_advanced_detection: forwarded to ``iterate_pages`` as
        ``use_advanced_detection``.
    :return: the tables from all pages in page order, or ``None`` when no
        table was found (note that the annotation does not mention ``None``).
    """
    found: List[CamelotTable] = []
    with extract_page_images(pdf_fn=pdf_fn, dpi=71) as image_fns:
        page_layouts = iterate_pages(
            pdf_fn, use_advanced_detection=pdfminer_advanced_detection)
        for page_index, page_layout in enumerate(page_layouts):
            page_tables = extract_tables(
                page_index, page_layout, image_fns[page_index])
            if page_tables:
                found.extend(page_tables)
    # Callers receive None rather than an empty list when nothing was found.
    return found if found else None
예제 #7
0
def test_calc_covers_bitmap():
    """Check calc_covers against hand-measured image/text areas.

    The image-to-text cover ratio computed for the first page must be within
    10% of the ratio measured visually in a graphic editor.
    """
    pdf_path = os.path.join(data_dir, 'one_page_big_bitmap.pdf')

    # The extracted images themselves are not used; the extraction context is
    # only kept open around the layout analysis.
    with extract_page_images(pdf_path,
                             start_page=1,
                             end_page=1,
                             pdf_password=''), \
            open(pdf_path, 'rb') as pdf_stream:
        first_page_layout = get_first_page_layout(pdf_stream)
        text_cover, image_cover = calc_covers(first_page_layout)
        assert image_cover > 0
        assert text_cover > 0

        # Visual sizes of the image and of the text, measured by hand in a
        # graphic editor.
        measured_image_area = 772 * 509
        measured_text_area = 772 * 128
        expected_ratio = measured_image_area / measured_text_area

        actual_ratio = image_cover / text_cover
        relative_error = abs(expected_ratio - actual_ratio) / expected_ratio
        assert relative_error < 0.1
예제 #8
0
def test_angle3_dilated_rows():
    """Check the dilated-rows rotation detector on a slightly rotated page."""
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(source_pdf, 1, 1) as page_images:
        first_page_image = page_images[0]
        detected_angle = detect_rotation_dilated_rows(
            first_page_image, pre_calculated_orientation=None)
        # int() truncates toward zero, so the detected angle must lie in
        # (-2, -1].
        assert int(detected_angle) == -1
예제 #9
0
def test_angle3():
    """Check determine_skew on the first page of a slightly rotated PDF."""
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(source_pdf, 1, 1) as page_images:
        first_page_image = page_images[0]
        skew_angle = determine_skew(first_page_image)
        # int() truncates toward zero, so the skew must lie in (-3, -2].
        assert int(skew_angle) == -2
예제 #10
0
import os
import shutil

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks, extract_page_images
from text_extraction_system.commons.tests.commons import default_settings

# Scratch script: render the pages of a sample PDF to images and copy them
# into the 'data' directory that sits next to this file.
_data_dir = os.path.join(os.path.dirname(__file__), 'data')
fn = os.path.join(_data_dir, 'table-based-text_noocr.pdf')
# Destination is the data directory path with a trailing '/'.
_copy_target = _data_dir + '/'

with default_settings(), extract_page_images(fn) as pages:
    # The images are copied inside the context, while they are still
    # available from extract_page_images.
    for pfn in pages:
        shutil.copy(pfn, _copy_target)