Python extract_page_images示例

编程语言: Python

命名空间/包名称: text_extraction_system.pdf.pdf

方法/功能: extract_page_images

hotexamples.com的示例: 10

Python extract_page_images - 共找到10个示例。这些是从开源项目中提取的、最受好评的 text_extraction_system.pdf.pdf.extract_page_images 真实Python代码示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

def test_compare_image_extraction_performance():
    """Benchmark whole-document vs. page-by-page image extraction.

    This is a timing comparison rather than a real test: it measures how
    much slower page-to-image conversion gets when it is run once per page
    (the original author's note mentions pdf2image) instead of once for the
    whole document.

    The benchmark is switched off by the unconditional ``return`` below, so
    everything after it is currently unreachable.
    """
    # Disabled on purpose so the regular test run is not slowed down.
    return

    pdf_fn = os.path.join(data_dir, 'tables2.pdf')

    # Pass 1: render every page of the document in a single call.
    started_at = time.time()
    with extract_page_images(pdf_fn) as image_file_names:
        page_num = len(image_file_names)
        print(f'Extracted {page_num} images')
    all_pages_at_once_seconds = time.time() - started_at

    # Pass 2: split the document first, then render each piece on its own.
    # The timer starts after the split so only the rendering is measured.
    page_num = 0
    with split_pdf_to_page_blocks(pdf_fn, page_dir) as page_pdf_fns:
        started_at = time.time()
        for single_page_pdf in page_pdf_fns:
            with extract_page_images(single_page_pdf) as _image_file_names:
                page_num += 1
        all_pages_separately_seconds = time.time() - started_at

    print(f'All pages at once time: {all_pages_at_once_seconds:.3f}s\n'
          f'All pages separately time: {all_pages_separately_seconds:.3f}s')
    # Per-page rendering is expected to take more than twice as long.
    assert 2 * all_pages_at_once_seconds < all_pages_separately_seconds

示例#2

显示文件

def test_ocr_rotated_small_angle():
    """OCR the slightly rotated sample page and check its text is recognized.

    The page image is produced by ``extract_page_images(fn, 1, 1)``
    (presumably first page through first page -- confirm against the
    function's signature), OCRed into a PDF, and the text extracted from
    that PDF must contain the word ``rotated``.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # One flat ``with`` instead of three nested ones; managers are entered
    # left to right and released in reverse, exactly like the nested form.
    with extract_page_images(source_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocred_pdf, \
            extract_text_and_structure(ocred_pdf) as (text, _struct, _s, _d):
        assert 'rotated' in text

示例#3

显示文件

def p():
    """Scratch helper: dump the image and the OCRed PDF of one page.

    Calls ``extract_page_images`` with ``54, 54`` for the placeholder path
    ``'..'``, copies each produced image and its OCRed PDF to ``'..'``, and
    finally runs text/structure extraction on the OCRed PDF.
    """
    source = '..'
    with extract_page_images(source, 54, 54) as page_images:
        for page_image in page_images:
            shutil.copy(page_image, '..')
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                shutil.copy(ocred_pdf, '..')
                # NOTE(review): the result is unpacked into two values here,
                # while another snippet in this listing uses
                # extract_text_and_structure as a context manager yielding
                # four values -- confirm which API version this targets.
                _text, _struct = extract_text_and_structure(ocred_pdf)

示例#4

显示文件

def test_extract_images():
    """Check extracted page images are PNG files that get cleaned up.

    While the context is open every image must be larger than 5 bytes and
    carry a ``.png`` extension; once the context has exited, none of the
    directories that held the images may exist any more.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    image_dirs = set()
    with extract_page_images(source_pdf) as image_paths:
        for image_path in image_paths:
            assert os.path.getsize(image_path) > 5
            _stem, extension = os.path.splitext(image_path)
            assert extension == '.png'
            # Remember the containing directory for the cleanup check below.
            image_dirs.add(os.path.dirname(image_path))
    # Leaving the context manager must have removed the image directories.
    for image_dir in image_dirs:
        assert not os.path.exists(image_dir)

示例#5

显示文件

def test_ocr_page():
    """OCR every page of ``ocr1.pdf`` and look for known phrases in the text.

    Each page image is OCRed into a PDF, the text of that PDF is read with
    ``extract_text_pdfminer``, and the per-page texts are concatenated (each
    prefixed with a newline). Double spaces are collapsed once before the
    phrase checks.
    """
    source_pdf = os.path.join(data_dir, 'ocr1.pdf')
    chunks = []
    with extract_page_images(source_pdf) as page_images:
        for page_image in page_images:
            with ocr_page_to_pdf(page_image) as ocred_pdf:
                chunks.append('\n' + extract_text_pdfminer(ocred_pdf))
    recognized = ''.join(chunks).replace('  ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in recognized

示例#6

显示文件

def extract_tables_from_pdf_file(
        pdf_fn: str,
        pdfminer_advanced_detection: bool = False) -> List[CamelotTable]:
    """Collect the tables found on every page of a PDF file.

    Page images are rendered with ``dpi=71`` and each page layout yielded by
    ``iterate_pages`` is paired, by zero-based position, with the image at
    the same index before being handed to ``extract_tables``.

    Args:
        pdf_fn: Path of the PDF file to process.
        pdfminer_advanced_detection: Forwarded to ``iterate_pages`` as
            ``use_advanced_detection``.

    Returns:
        The tables of all pages in page order, or ``None`` when no table
        was found at all (note that the annotation only mentions the list).

    Raises:
        IndexError: If ``iterate_pages`` yields more pages than there are
            rendered page images.
    """
    found: List[CamelotTable] = []
    with extract_page_images(pdf_fn=pdf_fn, dpi=71) as image_fns:
        layouts = iterate_pages(
            pdf_fn, use_advanced_detection=pdfminer_advanced_detection)
        for page_num, ltpage in enumerate(layouts):
            page_tables: List[CamelotTable] = extract_tables(
                page_num, ltpage, image_fns[page_num])
            if page_tables:
                found += page_tables
    # An empty result is reported as None rather than as an empty list.
    return found or None

示例#7

显示文件

def test_calc_covers_bitmap():
    """Compare ``calc_covers`` output with hand-measured areas.

    ``calc_covers`` returns two values for the first page layout of
    ``one_page_big_bitmap.pdf``; both must be positive, and the ratio of the
    second to the first must be within 10% of the image-to-text area ratio
    measured by hand.
    """
    file_path = os.path.join(data_dir, 'one_page_big_bitmap.pdf')

    # The extracted images themselves are not inspected; the context is
    # only held open while the layout is analysed.
    with extract_page_images(file_path,
                             start_page=1,
                             end_page=1,
                             pdf_password='') as _image_fns, \
            open(file_path, 'rb') as pdf_stream:
        first_page = get_first_page_layout(pdf_stream)
        text_cover, image_cover = calc_covers(first_page)
        assert image_cover > 0
        assert text_cover > 0

        # Visual sizes of the image and of the text, measured in a graphic
        # editor by the original author.
        measured_image_area = 772 * 509
        measured_text_area = 772 * 128
        expected_ratio = measured_image_area / measured_text_area

        actual_ratio = image_cover / text_cover
        relative_error = abs(expected_ratio - actual_ratio) / expected_ratio
        assert relative_error < 0.1

示例#8

显示文件

def test_angle3_dilated_rows():
    """Check ``detect_rotation_dilated_rows`` on the slightly rotated sample.

    The detected angle, truncated with ``int()``, must equal ``-1``.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(source_pdf, 1, 1) as page_images:
        first_page_image = page_images[0]
        detected = detect_rotation_dilated_rows(
            first_page_image, pre_calculated_orientation=None)
        assert int(detected) == -1

示例#9

显示文件

def test_angle3():
    """Check ``determine_skew`` on the slightly rotated sample page.

    The skew angle, truncated with ``int()``, must equal ``-2``.
    """
    source_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(source_pdf, 1, 1) as page_images:
        skew = determine_skew(page_images[0])
        assert int(skew) == -2

示例#10

显示文件

import os
import shutil

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks, extract_page_images
from text_extraction_system.commons.tests.commons import default_settings

# Render the pages of the sample PDF to images and copy each image into the
# ``data`` directory that sits next to this script.
_data_dir = os.path.join(os.path.dirname(__file__), 'data')
fn = os.path.join(_data_dir, 'table-based-text_noocr.pdf')

# The page images are only used inside this block; they are copied out while
# both context managers are still open.
with default_settings(), extract_page_images(fn) as pages:
    for pfn in pages:
        # The trailing slash makes the destination read as a directory.
        shutil.copy(pfn, str(_data_dir) + '/')