示例#1
0
def test_pdf_to_jpeg():
    """Check that PDFToJPEG yields three ``io.BytesIO`` objects for the sample PDF."""
    file = get_file("tests/test_sample/2003.00744v1_image_pdf.pdf")
    count = 0
    try:
        for image_file in PDFToJPEG().iterate_transform(file):
            count += 1
            assert isinstance(image_file, io.BytesIO)
            # Release each yielded buffer once it has been checked.
            image_file.close()
    finally:
        # Close the source handle even when an assertion above fails.
        file.close()
    assert count == 3
示例#2
0
def main(input_path: str, output_path: str, verbose: bool) -> None:
    """Run OCR over an input file and write the recognised text to ``output_path``.

    When the input extension is ``pdf`` the file is passed through
    ``PDFToJPEG().iterate_transform`` and every yielded item is OCR'd
    separately; any other input is OCR'd directly as a single file.

    Args:
        input_path: Path of the file to read. Validated by
            ``validation_input_path`` against ``SUPPORT_INPUT_EXTENSIONS``.
        output_path: Path of the file to write. Validated by
            ``validation_output_path`` against ``SUPPORT_OUTPUT_EXTENSIONS``.
        verbose: Forwarded to ``setup_logger``.
    """
    logger = setup_logger(verbose)
    logger.info("start ocr process")

    # The extension is whatever follows the last dot (compared case-sensitively).
    input_extension = input_path.split(".")[-1]
    validation_input_path(input_path, SUPPORT_INPUT_EXTENSIONS)
    validation_output_path(output_path, SUPPORT_OUTPUT_EXTENSIONS)

    def run_ocr(source):
        # Fresh processor instances are built for every call, exactly as the
        # two original call sites did.
        return ocr_precess(source, PreProcessor(RGB_BORDER), OCRTesseractProcessor(), PostProcessor())

    lines = []
    input_file = get_file(input_path)
    try:
        if input_extension == "pdf":
            logger.info("start converting pdf to string")
            for i, page in enumerate(PDFToJPEG().iterate_transform(input_file), 1):
                try:
                    lines.append(run_ocr(page))
                finally:
                    # Release the page buffer even if OCR fails on it.
                    page.close()
                logger.info(f"finish converting pdf to string, at {i} page")
        else:
            logger.info(f"start converting {input_extension} to string")
            lines.append(run_ocr(input_file))
            logger.info(f"finish converting {input_extension} to string")
    finally:
        # Always release the input handle, including on errors.
        input_file.close()

    logger.info("start converting string to text file")
    write_file(lines, output_path)
    # Previously this repeated the "start ..." message after the write had finished.
    logger.info("finish converting string to text file")
    logger.info("finish ocr processing")
def test_ocr_processor():
    """Check that the full ``ocr_precess`` pipeline returns a string for the sample PNG."""
    file = get_file("tests/test_sample/82251504.png")
    try:
        fixed_text = ocr_precess(file, PreProcessor(100), OCRTesseractProcessor(), PostProcessor())
        assert isinstance(fixed_text, str)
    finally:
        # Close the sample file regardless of the test outcome.
        file.close()
def test_pre_processor():
    """Check that ``PreProcessor(100).run`` returns an ``io.BytesIO`` for the sample PNG."""
    file = get_file("tests/test_sample/82251504.png")
    try:
        new_file = PreProcessor(100).run(file)
        assert isinstance(new_file, io.BytesIO)
    finally:
        # Close the sample file regardless of the test outcome.
        file.close()
def test_ocr_tesseract_processor():
    """Check that ``OCRTesseractProcessor().run`` returns a string for the sample PNG."""
    file = get_file("tests/test_sample/82251504.png")
    try:
        text = OCRTesseractProcessor().run(file)
        assert isinstance(text, str)
    finally:
        # Close the sample file regardless of the test outcome.
        file.close()