def test_pdf_to_jpeg():
    """Check that PDFToJPEG yields exactly three in-memory images for the sample PDF."""
    file = get_file("tests/test_sample/2003.00744v1_image_pdf.pdf")
    try:
        count = 0
        for image_file in PDFToJPEG().iterate_transform(file):
            count += 1
            try:
                assert isinstance(image_file, io.BytesIO)
            finally:
                # Release each page buffer as soon as it has been checked,
                # the same way main() does while iterating pages.
                image_file.close()
    finally:
        # Close the source handle even when an assertion above fails.
        file.close()
    assert count == 3
def main(input_path: str, output_path: str, verbose: bool) -> None:
    """Run OCR over ``input_path`` and write the recognised text to ``output_path``.

    A ``pdf`` input is expanded into page images with ``PDFToJPEG`` and each
    page is passed through ``ocr_precess``; any other input is passed through
    ``ocr_precess`` once. The collected results are handed to ``write_file``.

    Args:
        input_path: Path of the file to read. Checked by
            ``validation_input_path`` against ``SUPPORT_INPUT_EXTENSIONS``.
        output_path: Path of the text file to write. Checked by
            ``validation_output_path`` against ``SUPPORT_OUTPUT_EXTENSIONS``.
        verbose: Forwarded to ``setup_logger``.
    """
    logger = setup_logger(verbose)
    logger.info("start ocr process")

    # The extension is the text after the last dot; comparison is case-sensitive.
    input_extension = input_path.split(".")[-1]
    validation_input_path(input_path, SUPPORT_INPUT_EXTENSIONS)
    validation_output_path(output_path, SUPPORT_OUTPUT_EXTENSIONS)

    lines = []
    input_file = get_file(input_path)
    try:
        if input_extension == "pdf":
            logger.info("start converting pdf to string")
            # Initialised up front so the summary log below is well-defined
            # even when the PDF yields no pages.
            page_count = 0
            pages = PDFToJPEG().iterate_transform(input_file)
            for page_count, page_image in enumerate(pages, 1):
                try:
                    lines.append(
                        ocr_precess(
                            page_image,
                            PreProcessor(RGB_BORDER),
                            OCRTesseractProcessor(),
                            PostProcessor(),
                        )
                    )
                finally:
                    # Release the page buffer even if OCR fails on this page.
                    page_image.close()
            # NOTE(review): logged once after the loop with the total number
            # of pages processed - confirm this matches the intended placement.
            logger.info(f"finish converting pdf to string, at {page_count} page")
        else:
            logger.info(f"start converting {input_extension} to string")
            lines.append(
                ocr_precess(
                    input_file,
                    PreProcessor(RGB_BORDER),
                    OCRTesseractProcessor(),
                    PostProcessor(),
                )
            )
            logger.info(f"finish converting {input_extension} to string")
    finally:
        # Always release the input handle, including on OCR errors.
        input_file.close()

    logger.info("start converting string to text file")
    write_file(lines, output_path)
    # Previously this repeated the "start ..." message after the write finished.
    logger.info("finish converting string to text file")
    logger.info("finish ocr processing")
def test_ocr_processor():
    """Check that the full ``ocr_precess`` pipeline returns a string for the sample PNG."""
    file = get_file("tests/test_sample/82251504.png")
    try:
        fixed_text = ocr_precess(file, PreProcessor(100), OCRTesseractProcessor(), PostProcessor())
    finally:
        # Close the input handle once processing is done, as main() does.
        file.close()
    assert isinstance(fixed_text, str)
def test_pre_processor():
    """Check that ``PreProcessor.run`` returns an ``io.BytesIO`` for the sample PNG."""
    file = get_file("tests/test_sample/82251504.png")
    try:
        new_file = PreProcessor(100).run(file)
        # Assert before closing the input in case the result shares the handle.
        assert isinstance(new_file, io.BytesIO)
    finally:
        file.close()
def test_ocr_tesseract_processor():
    """Check that ``OCRTesseractProcessor.run`` returns a string for the sample PNG."""
    file = get_file("tests/test_sample/82251504.png")
    try:
        text = OCRTesseractProcessor().run(file)
    finally:
        # Close the input handle whether or not OCR succeeds.
        file.close()
    assert isinstance(text, str)