Python extract_page_images 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: text_extraction_system.pdf.pdf

메소드/함수: extract_page_images

hotexamples.com에서의 예제들: 10

Python extract_page_images - 10개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python text_extraction_system.pdf.pdf.extract_page_images의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def test_compare_image_extraction_performance():
    """Benchmark whole-document page rendering against page-by-page rendering.

    This is a measurement helper rather than a real test: it is meant to show
    how much slower the page-to-image conversion gets when pdf2image is run
    once per page instead of once for all pages.

    The unconditional ``return`` right below keeps the benchmark switched off
    so the regular test run stays fast; everything after it is currently
    unreachable and only runs if that ``return`` is removed.
    """
    # Disabled on purpose: the benchmark would slow the test suite down too much.
    return

    source_pdf = os.path.join(data_dir, 'tables2.pdf')

    # Variant 1: one call for the whole document.
    started_at = time.time()
    with extract_page_images(source_pdf) as rendered_pages:
        page_num = len(rendered_pages)
        print(f'Extracted {page_num} images')
    all_pages_at_once_seconds = time.time() - started_at

    # Variant 2: split the document first, then one call per piece.
    # The timer starts only after the split, so splitting is not measured.
    # NOTE(review): page_dir is not defined inside this function - presumably
    # a module-level name; confirm before re-enabling the benchmark.
    page_num = 0
    with split_pdf_to_page_blocks(source_pdf, page_dir) as single_page_pdfs:
        started_at = time.time()
        for single_page_pdf in single_page_pdfs:
            with extract_page_images(single_page_pdf) as _rendered:
                page_num += 1
        all_pages_separately_seconds = time.time() - started_at

    print(f'All pages at once time: {all_pages_at_once_seconds:.3f}s\n'
          f'All pages separately time: {all_pages_separately_seconds:.3f}s')
    # Per-page rendering is expected to cost more than twice the one-shot run.
    assert 2 * all_pages_at_once_seconds < all_pages_separately_seconds

예제 #2

파일 보기

def test_ocr_rotated_small_angle():
    """Check that OCR still recovers text from a slightly rotated page.

    Renders the requested page range (1, 1) of the sample PDF to an image,
    runs OCR on the first image, extracts the text from the resulting PDF and
    expects the word 'rotated' to be present.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # One flat ``with`` instead of a pyramid; managers are still entered left
    # to right and exited in reverse order.
    with extract_page_images(source_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocred_pdf, \
            extract_text_and_structure(ocred_pdf) as extracted:
        # Only the plain text is checked; the other three items are ignored.
        txt, _txt_struct, _s, _d = extracted
        assert 'rotated' in txt

예제 #3

파일 보기

def p():
    """Scratch helper: render page range (54, 54), OCR each image, keep copies.

    Every rendered image and every OCR-ed PDF is copied to the '..' location
    before text extraction is invoked on the OCR-ed PDF. The '..' paths look
    like placeholders left in the snippet.
    """
    source_pdf = '..'
    with extract_page_images(source_pdf, 54, 54) as rendered_images:
        for rendered_image in rendered_images:
            # Keep a copy of the raw page image for manual inspection.
            shutil.copy(rendered_image, '..')
            with ocr_page_to_pdf(rendered_image) as ocred_pdf:
                # Keep a copy of the OCR result as well.
                shutil.copy(ocred_pdf, '..')
                # NOTE(review): the result is unpacked as a 2-tuple and never
                # used; other callers use extract_text_and_structure as a
                # context manager - confirm which API version this targets.
                _text, _struct = extract_text_and_structure(ocred_pdf)

예제 #4

파일 보기

def test_extract_images():
    """Verify page images are PNG files and are cleaned up afterwards.

    Inside the context every image must be larger than 5 bytes and carry a
    '.png' extension; once the context has exited, none of the directories
    that held the images may exist any more.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    image_dirs = set()
    with extract_page_images(source_pdf) as images:
        for image_path in images:
            assert os.path.getsize(image_path) > 5
            _stem, extension = os.path.splitext(image_path)
            assert extension == '.png'
            # Remember where the image lived so cleanup can be checked later.
            image_dirs.add(os.path.dirname(image_path))
    # Leaving the context is expected to remove the image directories.
    assert all(not os.path.exists(image_dir) for image_dir in image_dirs)

예제 #5

파일 보기

def test_ocr_page():
    """OCR every page of the sample PDF and look for known phrases.

    Each page image is OCR-ed into a PDF, its text is extracted with the
    pdfminer-based helper, and the newline-prefixed page texts are
    concatenated. Double spaces are collapsed once before the checks.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    page_texts = []
    with extract_page_images(source_pdf) as page_images:
        for page_image in page_images:
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                page_texts.append('\n' + extract_text_pdfminer(ocred_pdf))
    txt = ''.join(page_texts).replace('  ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in txt

예제 #6

파일 보기

def extract_tables_from_pdf_file(
        pdf_fn: str,
        pdfminer_advanced_detection: bool = False) -> List[CamelotTable]:
    """Collect the tables detected on every page of a PDF file.

    The document is rendered to page images at dpi=71, and each page layout
    produced by ``iterate_pages`` is handed to ``extract_tables`` together
    with its zero-based page index and the image at that same index.

    :param pdf_fn: path of the PDF file to process.
    :param pdfminer_advanced_detection: forwarded to ``iterate_pages`` as
        ``use_advanced_detection``.
    :return: all tables found, in page order. NOTE: despite the annotation,
        ``None`` is returned (not an empty list) when no table was found.
    """
    found_tables: List[CamelotTable] = []
    with extract_page_images(pdf_fn=pdf_fn, dpi=71) as image_fns:
        layouts = iterate_pages(
            pdf_fn, use_advanced_detection=pdfminer_advanced_detection)
        for page_index, page_layout in enumerate(layouts):
            # Images and layouts are paired purely by position.
            tables_on_page: List[CamelotTable] = extract_tables(
                page_index, page_layout, image_fns[page_index])
            if tables_on_page:
                found_tables.extend(tables_on_page)
    return found_tables or None

예제 #7

파일 보기

def test_calc_covers_bitmap():
    """Compare calc_covers' image/text ratio with a hand-measured one.

    Both values returned by ``calc_covers`` for the first page layout must be
    positive, and the ratio of the second to the first must be within 10% of
    the ratio measured by hand in a graphic editor.
    """
    file_path = os.path.join(data_dir, 'one_page_big_bitmap.pdf')

    # The rendered images themselves are not inspected by this test.
    with extract_page_images(file_path,
                             start_page=1,
                             end_page=1,
                             pdf_password='') as _image_fns, \
            open(file_path, 'rb') as in_file:
        page_layout = get_first_page_layout(in_file)
        text_cover, image_cover = calc_covers(page_layout)
        assert image_cover > 0
        assert text_cover > 0

        # Visual sizes of the image and the text, measured in a graphic editor.
        measured_image_area = 772 * 509
        measured_text_area = 772 * 128
        measured_ratio = measured_image_area / measured_text_area

        computed_ratio = image_cover / text_cover
        relative_error = abs(measured_ratio - computed_ratio) / measured_ratio
        assert relative_error < 0.1

예제 #8

파일 보기

def test_angle3_dilated_rows():
    """Expect the dilated-rows detector to report -1 for the rotated sample.

    The detected angle is truncated with ``int()`` before the comparison.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(source_pdf, 1, 1) as page_images:
        first_page_image = page_images[0]
        # No pre-computed orientation is supplied to the detector.
        detected = detect_rotation_dilated_rows(
            first_page_image, pre_calculated_orientation=None)
        assert int(detected) == -1

예제 #9

파일 보기

def test_angle3():
    """Expect determine_skew to report -2 for the rotated sample.

    The detected angle is truncated with ``int()`` before the comparison.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(source_pdf, 1, 1) as page_images:
        first_page_image = page_images[0]
        detected = determine_skew(first_page_image)
        assert int(detected) == -2

예제 #10

파일 보기

import os
import shutil

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks, extract_page_images
from text_extraction_system.commons.tests.commons import default_settings

# Developer script: render every page of the sample PDF to an image and copy
# those images into the 'data' directory that sits next to this file.
_data_dir = os.path.join(os.path.dirname(__file__), 'data')
fn = os.path.join(_data_dir, 'table-based-text_noocr.pdf')

# The trailing slash marks the copy destination as a directory.
_copy_target = _data_dir + '/'

with default_settings(), extract_page_images(fn) as pages:
    # Copy inside the context, while the rendered images are still available.
    for pfn in pages:
        shutil.copy(pfn, _copy_target)