def test_compare_image_extraction_performance():
    """Benchmark one-shot versus per-page image extraction (currently disabled).

    This is not a real test. It is a small helper for comparing how much
    slower the page-to-image conversion becomes when pdf2image is run once
    per page instead of once for all pages.
    """
    # Disabled to avoid slowing down the test suite too much. Everything
    # below this return is unreachable and kept only for manual benchmarking.
    return

    source_pdf = os.path.join(data_dir, 'tables2.pdf')

    # Variant 1: a single extraction call over the whole document.
    began = time.time()
    with extract_page_images(source_pdf) as image_file_names:
        page_num = len(image_file_names)
        print(f'Extracted {page_num} images')
    all_pages_at_once_seconds = time.time() - began

    # Variant 2: one extraction call per block produced by
    # split_pdf_to_page_blocks (presumably one PDF per page - confirm).
    page_num = 0
    with split_pdf_to_page_blocks(source_pdf, page_dir) as page_pdf_fns:
        began = time.time()
        for single_page_pdf in page_pdf_fns:
            with extract_page_images(single_page_pdf):
                page_num += 1
        all_pages_separately_seconds = time.time() - began

    print(f'All pages at once time: {all_pages_at_once_seconds:.3f}s\n'
          f'All pages separately time: {all_pages_separately_seconds:.3f}s')
    assert all_pages_separately_seconds > 2 * all_pages_at_once_seconds
def test_ocr_rotated_small_angle():
    """OCR a slightly rotated PDF page and check the expected word is found.

    The first extracted page image is OCR-ed into a PDF, the text of that
    PDF is extracted, and the word 'rotated' must appear in it.
    """
    rotated_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    # The positional 1, 1 are presumably start/end page - confirm against
    # extract_page_images' signature.
    with extract_page_images(rotated_pdf, 1, 1) as page_images, \
            ocr_page_to_pdf(page_images[0]) as ocred_pdf, \
            extract_text_and_structure(ocred_pdf) as (text, _struct, _s, _d):
        assert 'rotated' in text
def p():
    """Scratch helper: extract page images, OCR each one and copy the results.

    Both the input path and the copy destinations are the placeholder '..'.
    Every extracted image is copied, OCR-ed into a PDF, the PDF is copied as
    well, and text/structure extraction is run on it.
    """
    fn = '..'
    with extract_page_images(fn, 54, 54) as rendered_images:
        for png_path in rendered_images:
            shutil.copy(png_path, '..')
            with ocr_page_to_pdf(png_path) as ocred_pdf:
                shutil.copy(ocred_pdf, '..')
                # The extraction result is not used afterwards; the call is
                # kept for its own sake, along with the two-value unpacking.
                _text, _struct = extract_text_and_structure(ocred_pdf)
def test_extract_images():
    """Check extracted page images are non-trivial PNGs and are cleaned up.

    While the context is open every image must be larger than 5 bytes and
    carry a '.png' extension. Once the context has exited, none of the
    directories that held the images may still exist.
    """
    pdf_path = os.path.join(data_dir, 'ocr1.pdf')
    image_dirs = set()
    with extract_page_images(pdf_path) as images:
        for image in images:
            assert os.path.getsize(image) > 5
            _stem, extension = os.path.splitext(image)
            assert extension == '.png'
            image_dirs.add(os.path.dirname(image))

    # Leaving the context is expected to remove the image directories.
    assert not any(os.path.exists(image_dir) for image_dir in image_dirs)
def test_ocr_page():
    """OCR every page of 'ocr1.pdf' and check known phrases are recognised."""
    pdf_path = os.path.join(data_dir, 'ocr1.pdf')

    page_texts = []
    with extract_page_images(pdf_path) as image_fns:
        for image in image_fns:
            with ocr_page_to_pdf(image) as pdf_fn:
                page_texts.append(extract_text_pdfminer(pdf_fn))

    # Each page's text is preceded by a newline, as in a running concatenation.
    txt = ''.join('\n' + page_text for page_text in page_texts)
    # NOTE(review): as written this replaces a space with a space; confirm
    # whether the first argument was meant to be a double or non-breaking space.
    txt = txt.replace(' ', ' ')

    expected_phrases = (
        'each Contributor hereby grants to You',
        'You may add Your own',
        'Submission of Contributions',
        'END OF TERMS AND CONDITIONS',
    )
    for phrase in expected_phrases:
        assert phrase in txt
def extract_tables_from_pdf_file(
        pdf_fn: str,
        pdfminer_advanced_detection: bool = False) -> List[CamelotTable]:
    """Collect the tables detected on every page of a PDF file.

    Page images are extracted at dpi=71, then each page yielded by
    ``iterate_pages`` is passed to ``extract_tables`` together with its
    zero-based page index and the image at the same index.

    Args:
        pdf_fn: path of the PDF file to process.
        pdfminer_advanced_detection: forwarded to ``iterate_pages`` as
            ``use_advanced_detection``.

    Returns:
        The list of tables found across all pages, or ``None`` (not an empty
        list) when no tables were found.
    """
    found: List[CamelotTable] = []
    with extract_page_images(pdf_fn=pdf_fn, dpi=71) as image_fns:
        layouts = iterate_pages(
            pdf_fn, use_advanced_detection=pdfminer_advanced_detection)
        for page_num, ltpage in enumerate(layouts):
            page_tables: List[CamelotTable] = extract_tables(
                page_num, ltpage, image_fns[page_num])
            if page_tables:
                found.extend(page_tables)
    return found or None
def test_calc_covers_bitmap():
    """Check calc_covers' image/text ratio against a hand-measured ratio.

    The two cover values returned for the first page layout of
    'one_page_big_bitmap.pdf' must both be positive, and their ratio must be
    within 10% of the ratio measured visually.
    """
    file_path = os.path.join(data_dir, 'one_page_big_bitmap.pdf')
    with extract_page_images(file_path,
                             start_page=1,
                             end_page=1,
                             pdf_password='') as image_fns, \
            open(file_path, 'rb') as in_file:
        first_layout = get_first_page_layout(in_file)
        text_cover, image_cover = calc_covers(first_layout)
        assert image_cover > 0
        assert text_cover > 0

        # I measured visual size of image and text in a graphic editor
        measured_image_area = 772 * 509
        measured_text_area = 772 * 128
        measured_ratio = measured_image_area / measured_text_area

        computed_ratio = image_cover / text_cover
        relative_error = abs(measured_ratio - computed_ratio) / measured_ratio
        assert relative_error < 0.1
def test_angle3_dilated_rows():
    """Check detect_rotation_dilated_rows gives an angle truncating to -1."""
    rotated_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(rotated_pdf, 1, 1) as page_images:
        first_image = page_images[0]
        detected = detect_rotation_dilated_rows(
            first_image, pre_calculated_orientation=None)
        assert int(detected) == -1
def test_angle3():
    """Check determine_skew gives an angle truncating to -2 for the rotated sample."""
    rotated_pdf = os.path.join(data_dir, 'rotated_small_angle.pdf')
    with extract_page_images(rotated_pdf, 1, 1) as page_images:
        first_image = page_images[0]
        skew = determine_skew(first_image)
        assert int(skew) == -2
import os
import shutil

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks, extract_page_images
from text_extraction_system.commons.tests.commons import default_settings

# Script: extract the page images of a sample PDF and copy each of them into
# the 'data' directory that sits next to this file.
fn = os.path.join(os.path.dirname(__file__), 'data', 'table-based-text_noocr.pdf')

with default_settings(), extract_page_images(fn) as pages:
    for pfn in pages:
        # The trailing '/' makes the destination explicitly a directory.
        shutil.copy(pfn, os.path.join(os.path.dirname(__file__), 'data') + '/')