def test_compare_image_extraction_performance():
    """Compare page-image extraction for a whole PDF vs. page by page.

    This is not a real test but a small benchmark showing how much slower
    the page-to-image conversion gets when pdf2image is run once per page
    instead of once for all pages.
    """
    # Disabled to avoid slowing down the test suite too much; drop this early
    # return to run the comparison manually.
    return

    pdf_fn = os.path.join(data_dir, 'tables2.pdf')

    # Pass 1: convert all pages of the document in a single call.
    start = time.time()
    with extract_page_images(pdf_fn) as image_file_names:
        page_num = len(image_file_names)
        print(f'Extracted {page_num} images')
    all_pages_at_once_seconds = time.time() - start

    # Pass 2: split into single-page PDFs and convert them one by one.
    # The second argument of split_pdf_to_page_blocks is the number of pages
    # per block, so one page per block gives the per-page comparison.
    page_num = 0
    with split_pdf_to_page_blocks(pdf_fn, pages_per_block=1) as page_pdf_fns:
        # The timer starts after splitting, so only the image conversion
        # is measured.
        start = time.time()
        for page_fn in page_pdf_fns:
            with extract_page_images(page_fn) as _image_file_names:
                page_num += 1
        all_pages_separately_seconds = time.time() - start

    print(f'All pages at once time: {all_pages_at_once_seconds:.3f}s\n'
          f'All pages separately time: {all_pages_separately_seconds:.3f}s')
    assert all_pages_separately_seconds > 2 * all_pages_at_once_seconds
def test_split_pdf_text():
    """Split the 9-page sample into 4-page blocks and check the text of each.

    Every page marker must appear in the block that should hold it and be
    absent from one other block; three blocks are expected in total.
    """
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(source_pdf, 4) as block_files:
        # Index explicitly (no slicing) so that a missing block still raises
        # IndexError here.
        texts = [str(extract_text_pdfminer(block_files[idx]))
                 for idx in range(3)]

        # (page number, index of the block that must contain the marker,
        #  index of the block that must NOT contain it)
        expectations = (
            (1, 0, 1),
            (2, 0, 2),
            (3, 0, 1),
            (4, 0, 2),
            (5, 1, 0),
            (6, 1, 2),
            (7, 1, 0),
            (8, 1, 2),
            (9, 2, 1),
        )

        for page, holder, _ in expectations:
            assert f'This is page {page}.' in texts[holder]

        for page, _, stranger in expectations:
            assert f'This is page {page}.' not in texts[stranger]

        assert len(block_files) == 3
def test_split_pdf1():
    """Splitting the 9-page sample by 3 must give three blocks of 3 pages."""
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(source_pdf, 3) as block_files:
        assert len(block_files) == 3
        for block_path in block_files:
            # Open each block to count the pages it really contains.
            with pikepdf.open(block_path) as block_pdf:
                assert len(block_pdf.pages) == 3
def test_split_pdf_file_names5():
    """Block files must be named after the custom base name and page range."""
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    expected_names = (
        'qwerty_0001_0003.pdf',
        'qwerty_0004_0006.pdf',
        'qwerty_0007_0009.pdf',
    )
    with split_pdf_to_page_blocks(
            source_pdf, 3, page_block_base_name='qwerty.pdf') as block_files:
        for position, expected in enumerate(expected_names):
            assert os.path.basename(block_files[position]) == expected
def test_split_pdf_file_names6():
    """Check the block name when the block size exceeds the page count.

    With 11 pages per block the sample PDF yields a single block, and that
    block is expected to keep the source file name ('pdf_9_pages.pdf')
    rather than the requested base name 'aaa.pdf'.
    """
    fn = os.path.join(data_dir, 'pdf_9_pages.pdf')
    # No scratch directory is created: nothing in this test needs one, and
    # tempfile.mkdtemp() leaves its directory behind unless it is removed
    # explicitly.
    with split_pdf_to_page_blocks(
            fn, 11, page_block_base_name='aaa.pdf') as block_files:
        assert len(block_files) == 1
        assert os.path.basename(block_files[0]) == 'pdf_9_pages.pdf'
def test_split_pdf2():
    """Splitting the 9-page sample by 4 must give blocks of 4, 4 and 1 pages."""
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    expected_page_counts = (4, 4, 1)
    with split_pdf_to_page_blocks(source_pdf, 4) as block_files:
        assert len(block_files) == 3
        for position, expected_pages in enumerate(expected_page_counts):
            with pikepdf.open(block_files[position]) as block_pdf:
                assert len(block_pdf.pages) == expected_pages
def p2():
    """Scratch helper: OCR one page of a hard-coded local PDF.

    Splits the document into page blocks, renders the block at index 49 to
    OCR images, OCRs the image stored under key 1 into a PDF, merges that
    PDF into the source document and finally copies a file next to the
    source. All paths are hard-coded to a developer machine.
    """
    import shutil
    from text_extraction_system.pdf.pdf import merge_pdf_pages, split_pdf_to_page_blocks
    from text_extraction_system.ocr.ocr import ocr_page_to_pdf

    source_pdf = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C.pdf'
    copy_target = '/home/mikhail/lexpredict/misc/angles/A2A3E26061E43CD60156598713530D98C__00050.ocred.pdf'
    page = 1  # not read anywhere below

    with split_pdf_to_page_blocks(source_pdf) as page_fns:
        # Index 49 is the 50th block, matching the '__00050' in the target name.
        page_fn = page_fns[49]
        with extract_page_ocr_images(page_fn, 1, 1, dpi=300) as images:
            first_image = images.get(1)
            with ocr_page_to_pdf(first_image,
                                 glyphless_text_only=True,
                                 tesseract_page_orientation_detection=True) as ocred_page_pdf:  # type: str
                merge_spec = (1, ocred_page_pdf, None)
                with merge_pdf_pages(source_pdf,
                                     single_page_merge_num_file_rotate=merge_spec) as final_pdf:
                    # NOTE(review): the un-merged page file is copied here and
                    # `final_pdf` is never used, although the target name ends
                    # in '.ocred.pdf' -- confirm this is intended.
                    shutil.copy(page_fn, copy_target)
def process_pdf(pdf_fn: str, req: RequestMetadata, webdav_client: WebDavClient):
    """Split a PDF into single-page files and schedule one sub-task per page.

    Each page file is uploaded to WebDAV under the request's
    ``pages_for_processing`` folder and gets its own
    ``process_pdf_page_task`` signature. The signatures are run as a chord
    whose callback is ``finish_pdf_processing`` (with ``ocr_error_callback``
    linked as its error handler). The id of the chord result and of every
    result in ``c.parent.children`` is registered via ``register_task_id``.

    :param pdf_fn: path of the PDF file to process
    :param req: metadata of the request this document belongs to
    :param webdav_client: client used for creating folders and uploading pages
    """
    file_name = req.original_file_name
    request_id = req.request_id

    log.info(f'{file_name} | Pre-processing PDF document')
    log.info(f'{file_name} | Splitting to pages to parallelize processing...')

    with split_pdf_to_page_blocks(pdf_fn, pages_per_block=1) as pdf_page_fns:
        # Working folders of the request on the WebDAV storage.
        for folder in (pages_for_processing, pages_ocred, pages_tables):
            webdav_client.mkdir(f'{request_id}/{folder}')

        # The OCR language is resolved once and shared by all page tasks.
        converter = LanguageConverter()
        language, _locale_code = converter.get_language_and_locale_code(
            req.doc_language)
        ocr_language = converter.convert_language_to_tesseract_view(language)

        task_signatures = []
        # Page numbers passed to the sub-tasks are 1-based.
        for page_number, pdf_page_fn in enumerate(pdf_page_fns, start=1):
            pdf_page_base_fn = os.path.basename(pdf_page_fn)
            webdav_client.upload_file(
                f'{request_id}/{pages_for_processing}/{pdf_page_base_fn}',
                pdf_page_fn)
            page_task = process_pdf_page_task.s(
                request_id,
                file_name,
                pdf_page_base_fn,
                page_number,
                ocr_language,
                req.request_callback_info.log_extra)
            task_signatures.append(page_task)

        log.info(f'{file_name} | Scheduling {len(task_signatures)} sub-tasks...')

        request_callback_info_dict = req.request_callback_info.to_dict()
        finish_signature = finish_pdf_processing.s(
            request_id, file_name, request_callback_info_dict)
        error_signature = ocr_error_callback.s(
            request_id, request_callback_info_dict)
        c = chord(task_signatures)(
            finish_signature.set(link_error=[error_signature]))

        register_task_id(webdav_client, request_id, c.id)
        for child_result in c.parent.children:
            register_task_id(webdav_client, request_id, child_result.id)
def test_split_pdf_file_names1():
    """Default block names must carry the source name and the page range.

    With 4 pages per block the last block holds a single page and its name
    carries only that page number.
    """
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    expected_names = (
        'pdf_9_pages_0001_0004.pdf',
        'pdf_9_pages_0005_0008.pdf',
        'pdf_9_pages_0009.pdf',
    )
    with split_pdf_to_page_blocks(source_pdf, 4) as block_files:
        for position, expected in enumerate(expected_names):
            assert os.path.basename(block_files[position]) == expected
def test_split_pdf_file_names4():
    """A block size above the page count must yield one block named like the source."""
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    with split_pdf_to_page_blocks(source_pdf, 11) as block_files:
        block_count = len(block_files)
        assert block_count == 1
        only_block = block_files[0]
        assert only_block.endswith('pdf_9_pages.pdf')
def test_split_pdf_file_names3():
    """Single-page blocks must be named with the source name and page number."""
    source_pdf = os.path.join(data_dir, 'pdf_9_pages.pdf')
    # (index into the block list, expected file name); -1 is the last block.
    expected_by_index = (
        (0, 'pdf_9_pages_0001.pdf'),
        (1, 'pdf_9_pages_0002.pdf'),
        (-1, 'pdf_9_pages_0009.pdf'),
    )
    with split_pdf_to_page_blocks(source_pdf, 1) as block_files:
        for position, expected in expected_by_index:
            assert os.path.basename(block_files[position]) == expected
import shutil

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks

# Scratch snippet: split a local PDF into single-page blocks and copy the
# block at index 96 (the 97th one) into a dated folder for inspection.
pdf_fn = '/home/mikhail/lexpredict/misc/angles/wrong_angle6.pdf'
target_dir = '/home/mikhail/lexpredict/misc/angles/20210504/'

with split_pdf_to_page_blocks(pdf_fn, 1) as pages:
    picked_page = pages[96]
    shutil.copy(picked_page, target_dir)
from shutil import copy

from text_extraction_system.pdf.pdf import split_pdf_to_page_blocks

# Scratch snippet: split a local PDF with the default block size and copy
# every resulting block file into a folder for inspection.
source_pdf = '/home/mikhail/lexpredict/misc/ocr_complicated1.pdf'
output_dir = '/home/mikhail/lexpredict/misc/ocr_complicated1/'

with split_pdf_to_page_blocks(source_pdf) as page_fns:
    for page_fn in page_fns:
        copy(page_fn, output_dir)