def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
                                      resources, outpdf):
    """Check that --force-ocr yields a square-resolution, full-page image.

    The input 'aspect.pdf' must start out with differing x and y
    resolution. After running with --force-ocr the output page must have
    equal x/y resolution, keep the input page size, and carry a first image
    whose physical size matches the page size.
    """
    from math import isclose

    source_pdf = resources / 'aspect.pdf'

    # Precondition: the input image is non-square resolution
    in_pageinfo = pdf_get_all_pageinfo(str(source_pdf))
    assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']

    # --force-ocr means forced conversion to square resolution
    check_ocrmypdf(
        source_pdf, outpdf,
        '--force-ocr',
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    out_pageinfo = pdf_get_all_pageinfo(str(outpdf))
    page_before = in_pageinfo[0]
    page_after = out_pageinfo[0]

    # Resolution should now be equal
    assert page_after['xres'] == page_after['yres']

    # Page size should match input page size
    assert isclose(page_before['width_inches'], page_after['width_inches'])
    assert isclose(page_before['height_inches'], page_after['height_inches'])

    # Because the page was rasterized to produce a new image, that image
    # should occupy the entire page
    first_image = page_after['images'][0]
    image_w_inches = first_image['width'] / first_image['dpi_w']
    image_h_inches = first_image['height'] / first_image['dpi_h']
    assert isclose(page_after['width_inches'], image_w_inches)
    assert isclose(page_after['height_inches'], image_h_inches)
def test_rotated_skew_timeout():
    """Verify page dimensions after deskew combined with a tesseract timeout.

    This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation
    matrix.

    This tests for a bug where the combination of preprocessing and a
    tesseract timeout produced a page whose dimensions did not match the
    original's.
    """
    source = _infile('rotated_skew.pdf')
    before = pdf_get_all_pageinfo(source)[0]

    assert before['height_pixels'] < before['width_pixels'], \
        "Expected the input page to be landscape"
    assert before['rotate'] == 90, "Expected a rotated page"

    ocr_args = ('--pdf-renderer', 'hocr',
                '--deskew',
                '--tesseract-timeout', '0')
    result_pdf = check_ocrmypdf(
        'rotated_skew.pdf', 'test_rotated_skew.pdf', *ocr_args)

    after = pdf_get_all_pageinfo(result_pdf)[0]

    assert after['height_pixels'] > after['width_pixels'], \
        "Expected the output page to be portrait"
    assert after['rotate'] == 0, \
        "Expected no page rotation for output"

    # Width and height must have swapped places between input and output
    in_size = (before['width_pixels'], before['height_pixels'])
    out_size_swapped = (after['height_pixels'], after['width_pixels'])
    assert in_size == out_size_swapped, \
        "Expected page rotation to be baked in"
def test_rotated_skew_timeout(resources, outpdf):
    """Verify page dimensions after deskew combined with a tesseract timeout.

    This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation
    matrix.

    This tests for a bug where the combination of preprocessing and a
    tesseract timeout produced a page whose dimensions did not match the
    original's.
    """
    source = str(resources / 'rotated_skew.pdf')
    before = pdf_get_all_pageinfo(source)[0]

    assert before['height_pixels'] < before['width_pixels'], \
        "Expected the input page to be landscape"
    assert before['rotate'] == 90, "Expected a rotated page"

    ocr_args = ('--pdf-renderer', 'hocr',
                '--deskew',
                '--tesseract-timeout', '0')
    result_pdf = check_ocrmypdf(source, outpdf, *ocr_args)

    after = pdf_get_all_pageinfo(str(result_pdf))[0]

    assert after['height_pixels'] > after['width_pixels'], \
        "Expected the output page to be portrait"
    assert after['rotate'] == 0, \
        "Expected no page rotation for output"

    # Width and height must have swapped places between input and output
    in_size = (before['width_pixels'], before['height_pixels'])
    out_size_swapped = (after['height_pixels'], after['width_pixels'])
    assert in_size == out_size_swapped, \
        "Expected page rotation to be baked in"
def test_non_square_resolution(renderer, spoof_tesseract_cache):
    """Check that a non-square-resolution input keeps its x/y resolution."""
    # Precondition: the input image is non-square resolution
    pages_before = pdf_get_all_pageinfo(_infile('aspect.pdf'))
    assert pages_before[0]['xres'] != pages_before[0]['yres']

    out_name = 'aspect_%s.pdf' % renderer
    check_ocrmypdf(
        'aspect.pdf', out_name,
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    pages_after = pdf_get_all_pageinfo(_outfile(out_name))

    # Confirm resolution was kept the same on both axes
    for axis in ('xres', 'yres'):
        assert pages_before[0][axis] == pages_after[0][axis]
def test_single_page_inline_image():
    """Expect NotImplementedError when inspecting a PDF with an inline image.

    Builds a one-page PDF whose only content is an 8x8 monochrome image
    drawn inline, then checks that pdf_get_all_pageinfo raises.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')
    canvas = Canvas(pdf_path, pagesize=(8 * 72, 6 * 72))

    with NamedTemporaryFile() as png_file:
        # 8x8 one-bit image with the main diagonal set
        bitmap = Image.new('1', (8, 8), 0)
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        # Draw image in a 72x72 pt or 1"x1" area
        canvas.drawInlineImage(png_file.name, 0, 0, width=72, height=72)

        canvas.showPage()
        canvas.save()

    with pytest.raises(NotImplementedError):
        pageinfo.pdf_get_all_pageinfo(pdf_path)
def test_non_square_resolution(renderer, spoof_tesseract_cache,
                               resources, outpdf):
    """Check that a non-square-resolution input keeps its x/y resolution."""
    # Precondition: the input image is non-square resolution
    pages_before = pdf_get_all_pageinfo(str(resources / 'aspect.pdf'))
    assert pages_before[0]['xres'] != pages_before[0]['yres']

    check_ocrmypdf(
        resources / 'aspect.pdf', outpdf,
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    pages_after = pdf_get_all_pageinfo(str(outpdf))

    # Confirm resolution was kept the same on both axes
    for axis in ('xres', 'yres'):
        assert pages_before[0][axis] == pages_after[0][axis]
def test_single_page_inline_image():
    """Expect NotImplementedError when inspecting a PDF with an inline image.

    Builds a one-page PDF whose only content is an 8x8 monochrome image
    drawn inline, then checks that pdf_get_all_pageinfo raises.
    """
    target = os.path.join(TEST_OUTPUT, "image-mono-inline.pdf")
    page_size = (8 * 72, 6 * 72)
    doc = Canvas(target, pagesize=page_size)

    with NamedTemporaryFile() as scratch:
        # 8x8 one-bit image with the main diagonal set
        picture = Image.new("1", (8, 8), 0)
        for k in range(8):
            picture.putpixel((k, k), 1)
        picture.save(scratch.name, format="PNG")

        # Draw image in a 72x72 pt or 1"x1" area
        doc.drawInlineImage(scratch.name, 0, 0, width=72, height=72)

        doc.showPage()
        doc.save()

    with pytest.raises(NotImplementedError):
        pageinfo.pdf_get_all_pageinfo(target)
def test_single_page_image():
    """Inspect a one-page PDF holding a single 8x8 mono image drawn at 1"x1".

    The page must report no text, exactly one image, width 8, 8 bits per
    component, and 8 DPI on both axes.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')
    canvas = Canvas(pdf_path, pagesize=(72, 72))

    with NamedTemporaryFile() as png_file:
        # 8x8 one-bit image with the main diagonal set
        bitmap = Image.new('1', (8, 8), 0)
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        # Draw image in a 72x72 pt or 1"x1" area
        canvas.drawImage(png_file.name, 0, 0, width=72, height=72)

        canvas.showPage()
        canvas.save()

    all_pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(all_pages) == 1

    only_page = all_pages[0]
    assert not only_page['has_text']
    assert len(only_page['images']) == 1

    image_info = only_page['images'][0]
    assert image_info['width'] == 8
    # NOTE: the colorspace check (pdfimage['color'] == 'gray') is disabled
    # in this variant of the test.

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert image_info['bpc'] == 8

    # DPI in a 1"x1" is the image width
    assert image_info['dpi_w'] == 8
    assert image_info['dpi_h'] == 8
def test_force_ocr(spoof_tesseract_cache):
    """Run with -f on 'graph_ocred.pdf' and expect text on the first page."""
    forced_pdf = check_ocrmypdf(
        'graph_ocred.pdf', 'test_force.pdf',
        '-f',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(forced_pdf)[0]
    assert first_page['has_text']
def test_single_page_image():
    """Inspect a PDF that img2pdf built from a single 8x8 mono PNG at 8 DPI.

    The page must report no text, exactly one image, width 8, 'gray'
    colorspace, 8 bits per component, and 8 DPI on both axes.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile() as png_file:
        # 8x8 one-bit image with the main diagonal set
        bitmap = Image.new('1', (8, 8), 0)
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        converted = img2pdf.convert([png_file.name], dpi=8)

        with open(pdf_path, 'wb') as pdf_out:
            pdf_out.write(converted)

    all_pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(all_pages) == 1

    only_page = all_pages[0]
    assert not only_page['has_text']
    assert len(only_page['images']) == 1

    image_info = only_page['images'][0]
    assert image_info['width'] == 8
    assert image_info['color'] == 'gray'

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert image_info['bpc'] == 8

    # DPI in a 1"x1" is the image width
    assert image_info['dpi_w'] == 8
    assert image_info['dpi_h'] == 8
def test_jpeg():
    """Check that the first image of 'c02-22.pdf' is reported as JPEG."""
    pdf_path = resource_filename(req, 'tests/resources/c02-22.pdf')
    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]
    first_image = first_page['images'][0]
    assert first_image['enc'] == 'jpeg'
def test_jpeg():
    """Check that the first image of 'c02-22.pdf' is reported as JPEG."""
    resource_path = resource_filename(req, "tests/resources/c02-22.pdf")
    pages = pageinfo.pdf_get_all_pageinfo(resource_path)
    encoding = pages[0]["images"][0]["enc"]
    assert encoding == "jpeg"
def test_single_page_image():
    """Inspect a PDF that img2pdf built from a single 8x8 mono PNG at 8 DPI.

    The page must report no text, exactly one image, width 8, "gray"
    colorspace, 8 bits per component, and 8 DPI on both axes.
    """
    target = os.path.join(TEST_OUTPUT, "image-mono.pdf")

    with NamedTemporaryFile() as scratch:
        # 8x8 one-bit image with the main diagonal set
        picture = Image.new("1", (8, 8), 0)
        for k in range(8):
            picture.putpixel((k, k), 1)
        picture.save(scratch.name, format="PNG")

        payload = img2pdf.convert([scratch.name], dpi=8)

        with open(target, "wb") as sink:
            sink.write(payload)

    report = pageinfo.pdf_get_all_pageinfo(target)
    assert len(report) == 1

    sole_page = report[0]
    assert not sole_page["has_text"]
    assert len(sole_page["images"]) == 1

    img = sole_page["images"][0]
    assert img["width"] == 8
    assert img["color"] == "gray"

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert img["bpc"] == 8

    # DPI in a 1"x1" is the image width
    assert img["dpi_w"] == 8
    assert img["dpi_h"] == 8
def test_jpeg():
    """Check that the first image of 'c02-22.pdf' is reported as JPEG."""
    pdf_path = _make_input('c02-22.pdf')
    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]
    first_image = first_page['images'][0]
    assert first_image['enc'] == 'jpeg'
def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
    """Run with -f on 'graph_ocred.pdf' and expect text on the first page."""
    forced_pdf = check_ocrmypdf(
        resources / 'graph_ocred.pdf', outpdf,
        '-f',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(forced_pdf)[0]
    assert first_page['has_text']
def test_single_page_image(outdir):
    """Inspect a PDF that img2pdf built from a single 8x8 mono PNG at 8 DPI.

    The PNG and PDF are both written under *outdir*. The page must report
    no text, exactly one image, width 8, 'gray' colorspace, and a DPI
    within 1e-5 of 8 on both axes.
    """
    pdf_path = outdir / 'image-mono.pdf'
    png_path = outdir / 'tmp.png'

    # 8x8 one-bit image with the main diagonal set
    bitmap = Image.new('1', (8, 8), 0)
    for i in range(8):
        bitmap.putpixel((i, i), 1)
    bitmap.save(str(png_path), format='PNG')

    # Lay the image out at 8 DPI horizontally and vertically
    dpi_spec = (img2pdf.ImgSize.dpi, 8)
    layout = img2pdf.get_layout_fun(None, (dpi_spec, dpi_spec),
                                    None, None, None)

    png_bytes = png_path.read_bytes()
    pdf_path.write_bytes(
        img2pdf.convert(
            png_bytes, producer="img2pdf", with_pdfrw=False,
            layout_fun=layout))

    all_pages = pageinfo.pdf_get_all_pageinfo(str(pdf_path))
    assert len(all_pages) == 1

    only_page = all_pages[0]
    assert not only_page['has_text']
    assert len(only_page['images']) == 1

    image_info = only_page['images'][0]
    assert image_info['width'] == 8
    assert image_info['color'] == 'gray'

    # DPI in a 1"x1" is the image width
    assert abs(image_info['dpi_w'] - 8) < 1e-5
    assert abs(image_info['dpi_h'] - 8) < 1e-5
def test_skip_big(spoof_tesseract_cache):
    """Run with --skip-big 10 on 'enormous.pdf'; first page must lack text."""
    result_pdf = check_ocrmypdf(
        'enormous.pdf', 'test_enormous.pdf',
        '--skip-big', '10',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert not first_page['has_text']
def test_jpeg(resources, outdir):
    """Check encoding and horizontal DPI of the first image in 'c02-22.pdf'.

    The image must be reported as JPEG with a horizontal DPI within 1e-5
    of 150.

    *outdir* is not used in the body; it is kept so the signature stays
    unchanged.
    """
    filename = resources / 'c02-22.pdf'

    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    pdfimage = pdfinfo[0]['images'][0]

    assert pdfimage['enc'] == 'jpeg'
    # abs() is required: without it, any DPI below 150 would pass.
    assert abs(pdfimage['dpi_w'] - 150) < 1e-5
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
    """Run with --skip-big 10 on 'enormous.pdf'; first page must lack text."""
    result_pdf = check_ocrmypdf(
        resources / 'enormous.pdf', outpdf,
        '--skip-big', '10',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(str(result_pdf))[0]
    assert not first_page['has_text']
def test_jpeg():
    """Check encoding and horizontal DPI of the first image in 'c02-22.pdf'.

    The image must be reported as JPEG with a horizontal DPI within 1e-5
    of 150.
    """
    filename = _make_input('c02-22.pdf')

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)
    pdfimage = pdfinfo[0]['images'][0]

    assert pdfimage['enc'] == 'jpeg'
    # abs() is required: without it, any DPI below 150 would pass.
    assert abs(pdfimage['dpi_w'] - 150) < 1e-5
def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
    """Check the first output image of 'jbig2.pdf' is still JBIG2-encoded.

    Runs with --output-type pdf and the hocr renderer.
    """
    options = ('--output-type', 'pdf',
               '--pdf-renderer', 'hocr')
    result_pdf = check_ocrmypdf(
        resources / 'jbig2.pdf', outpdf, *options,
        env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(str(result_pdf))[0]
    assert first_page['images'][0]['enc'] == 'jbig2'
def check_oversample(renderer):
    """Run --oversample 300 on 'skew.pdf' and check the resulting x resolution.

    The first page's 'xres' is printed and must be within 1 of 300.
    """
    out_name = 'test_oversample_%s.pdf' % renderer
    result_pdf = check_ocrmypdf(
        'skew.pdf', out_name,
        '--oversample', '300',
        '--pdf-renderer', renderer)

    xres = pdf_get_all_pageinfo(result_pdf)[0]['xres']
    print(xres)
    assert abs(xres - 300) < 1
def test_jbig2_passthrough(spoof_tesseract_cache):
    """Check the first output image of 'jbig2.pdf' is still JBIG2-encoded.

    Runs with --output-type pdf and the hocr renderer.
    """
    options = ('--output-type', 'pdf',
               '--pdf-renderer', 'hocr')
    result_pdf = check_ocrmypdf(
        'jbig2.pdf', 'jbig2_out.pdf', *options,
        env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert first_page['images'][0]['enc'] == 'jbig2'
def test_oversample(spoof_tesseract_cache, renderer):
    """Run --oversample 350 with -f on 'skew.pdf' and check the x resolution.

    The first page's 'xres' is printed and must be within 1 of 350.
    """
    out_name = 'test_oversample_%s.pdf' % renderer
    result_pdf = check_ocrmypdf(
        'skew.pdf', out_name,
        '--oversample', '350',
        '-f',
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    xres = pdf_get_all_pageinfo(result_pdf)[0]['xres']
    print(xres)
    assert abs(xres - 350) < 1
def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
    """Run --oversample 350 with -f on 'skew.pdf' and check the x resolution.

    The first page's 'xres' is printed and must be within 1 of 350.
    """
    result_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outpdf,
        '--oversample', '350',
        '-f',
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    xres = pdf_get_all_pageinfo(str(result_pdf))[0]['xres']
    print(xres)
    assert abs(xres - 350) < 1
def check_oversample(renderer):
    """Run --oversample 300 on 'skew.pdf' and check the resulting x resolution.

    The first page's 'xres' is printed and must be within 1 of 300.
    """
    target_name = 'test_oversample_%s.pdf' % renderer
    produced = check_ocrmypdf(
        'skew.pdf', target_name,
        '--oversample', '300',
        '--pdf-renderer', renderer)

    first_page = pdf_get_all_pageinfo(produced)[0]
    print(first_page['xres'])
    assert abs(first_page['xres'] - 300) < 1
def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
    """Checks for a Decimal quantize error with high DPI, etc.

    After processing '2400dpi.pdf', the first image of the first output
    page must report equal horizontal and vertical DPI, both 2400.
    """
    check_ocrmypdf(
        resources / '2400dpi.pdf', outpdf,
        env=spoof_tesseract_cache)

    first_image = pdf_get_all_pageinfo(outpdf)[0]['images'][0]

    assert first_image['dpi_w'] == first_image['dpi_h']
    assert first_image['dpi_w'] == 2400
def test_skip_pages_does_not_replicate(
        ensure_tess4, resources, basename, outdir):
    """Check each output page holds one image and keeps its input width.

    Runs the tess4 renderer with --force-ocr and a tesseract timeout of 0
    on the resource named by *basename*, writing to a same-named file in
    *outdir*.
    """
    source_pdf = resources / basename
    result_pdf = outdir / basename

    check_ocrmypdf(
        source_pdf, result_pdf,
        '--pdf-renderer', 'tess4',
        '--force-ocr',
        '--tesseract-timeout', '0',
        env=ensure_tess4
    )

    pages_in = pageinfo.pdf_get_all_pageinfo(str(source_pdf))
    pages_out = pageinfo.pdf_get_all_pageinfo(str(result_pdf))

    for out_page in pages_out:
        assert len(out_page['images']) == 1, "skipped page was replicated"

    # Compare page widths position by position, driven by the input pages
    for index, in_page in enumerate(pages_in):
        assert pages_out[index]['width_inches'] == in_page['width_inches']
def test_content_preservation(ensure_tess4, resources, outpdf):
    """Check the first output page of 'masks.pdf' still has several images.

    Runs the tess4 renderer with a tesseract timeout of 0.
    """
    source_pdf = resources / 'masks.pdf'

    check_ocrmypdf(
        source_pdf, outpdf,
        '--pdf-renderer', 'tess4',
        '--tesseract-timeout', '0',
        env=ensure_tess4
    )

    first_page = pageinfo.pdf_get_all_pageinfo(str(outpdf))[0]
    image_count = len(first_page['images'])
    assert image_count > 1, "masked were rasterized"
def test_single_page_text():
    """Inspect a generated one-page PDF that contains only a line of text.

    The page must be reported as having text and no images.
    """
    target = os.path.join(TEST_OUTPUT, "text.pdf")
    doc = Canvas(target, pagesize=(8 * 72, 6 * 72))

    # One line of 12 pt Helvetica starting at (1", 3")
    line = doc.beginText()
    line.setFont("Helvetica", 12)
    line.setTextOrigin(1 * 72, 3 * 72)
    line.textLine(
        "Methink'st thou art a general offence and every"
        " man should beat thee.")
    doc.drawText(line)

    doc.showPage()
    doc.save()

    report = pageinfo.pdf_get_all_pageinfo(target)
    assert len(report) == 1

    sole_page = report[0]
    assert sole_page["has_text"]
    assert len(sole_page["images"]) == 0
def test_single_page_inline_image():
    """Inspect a one-page PDF whose only content is an inline 8x8 mono image.

    The image is drawn into a 1"x1" area, so its horizontal DPI must be
    within 1e-5 of 8. Its color must not be '-' and its width must be 8.
    """
    filename = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')
    pdf = Canvas(filename, pagesize=(8 * 72, 6 * 72))

    with NamedTemporaryFile() as im_tmp:
        # 8x8 one-bit image with the main diagonal set
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')

        # Draw image in a 72x72 pt or 1"x1" area
        pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)

        pdf.showPage()
        pdf.save()

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)
    print(pdfinfo)

    pdfimage = pdfinfo[0]['images'][0]
    # abs() is required: without it, any DPI below 8 would pass.
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
    assert pdfimage['color'] != '-'
    assert pdfimage['width'] == 8
def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
    """Check the sidecar text has one form feed between each pair of pages.

    Runs on 'multipage.pdf' with --skip-text and --sidecar, then compares
    the form feed count in the sidecar with the input's page count.
    """
    source_pdf = resources / 'multipage.pdf'
    sidecar = outpdf + '.txt'

    check_ocrmypdf(
        source_pdf, outpdf,
        '--skip-text',
        '--sidecar', sidecar,
        env=spoof_tesseract_cache)

    num_pages = len(pdf_get_all_pageinfo(str(source_pdf)))

    with open(sidecar, 'r') as sidecar_file:
        ocr_text = sidecar_file.read()

    # There should be a formfeed between each pair of pages, so the count
    # of formfeeds is the page count less one
    formfeeds = ocr_text.count('\f')
    assert formfeeds == num_pages - 1, \
        "Sidecar page count does not match PDF page count"
def test_single_page_inline_image(outdir):
    """Inspect a one-page PDF whose only content is an inline 8x8 mono image.

    The PDF is written under *outdir*. The image is drawn into a 1"x1"
    area, so its horizontal DPI must be within 1e-5 of 8. Its color must
    not be '-' and its width must be 8.
    """
    filename = outdir / 'image-mono-inline.pdf'
    pdf = Canvas(str(filename), pagesize=(8 * 72, 6 * 72))

    with NamedTemporaryFile() as im_tmp:
        # 8x8 one-bit image with the main diagonal set
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')

        # Draw image in a 72x72 pt or 1"x1" area
        pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)

        pdf.showPage()
        pdf.save()

    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    print(pdfinfo)

    pdfimage = pdfinfo[0]['images'][0]
    # abs() is required: without it, any DPI below 8 would pass.
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
    assert pdfimage['color'] != '-'
    assert pdfimage['width'] == 8
def test_single_page_text(outdir):
    """Inspect a generated one-page PDF that contains only a line of text.

    The PDF is written under *outdir*; the page must be reported as having
    text and no images.
    """
    pdf_path = outdir / 'text.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(8 * 72, 6 * 72))

    # One line of 12 pt Helvetica starting at (1", 3")
    text_obj = canvas.beginText()
    text_obj.setFont('Helvetica', 12)
    text_obj.setTextOrigin(1 * 72, 3 * 72)
    text_obj.textLine(
        "Methink'st thou art a general offence and every"
        " man should beat thee.")
    canvas.drawText(text_obj)

    canvas.showPage()
    canvas.save()

    all_pages = pageinfo.pdf_get_all_pageinfo(str(pdf_path))
    assert len(all_pages) == 1

    only_page = all_pages[0]
    assert only_page['has_text']
    assert len(only_page['images']) == 0
def test_single_page_image():
    """Inspect a PDF that img2pdf built from a single 8x8 mono PNG at 8 DPI.

    The page must report no text, exactly one image, width 8, 'gray'
    colorspace, 8 bits per component, and a DPI within 1e-5 of 8 on both
    axes.
    """
    pdf_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile(mode='wb+', suffix='.png') as png_file:
        # 8x8 one-bit image with the main diagonal set
        bitmap = Image.new('1', (8, 8), 0)
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_file.name, format='PNG')

        # Lay the image out at 8 DPI horizontally and vertically
        dpi_spec = (img2pdf.ImgSize.dpi, 8)
        layout = img2pdf.get_layout_fun(None, (dpi_spec, dpi_spec),
                                        None, None, None)

        # Read the PNG back through the temp file handle from the start
        png_file.seek(0)
        png_bytes = png_file.read()

        converted = img2pdf.convert(
            png_bytes, producer="img2pdf", with_pdfrw=False,
            layout_fun=layout)

        with open(pdf_path, 'wb') as pdf_out:
            pdf_out.write(converted)

    all_pages = pageinfo.pdf_get_all_pageinfo(pdf_path)
    assert len(all_pages) == 1

    only_page = all_pages[0]
    assert not only_page['has_text']
    assert len(only_page['images']) == 1

    image_info = only_page['images'][0]
    assert image_info['width'] == 8
    assert image_info['color'] == 'gray'

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert image_info['bpc'] == 8

    # DPI in a 1"x1" is the image width
    assert abs(image_info['dpi_w'] - 8) < 1e-5
    assert abs(image_info['dpi_h'] - 8) < 1e-5
def test_single_page_image():
    """Inspect a PDF that img2pdf built from a single 8x8 mono PNG at 8 DPI.

    The page must report no text, exactly one image, width 8, 'gray'
    colorspace, 8 bits per component, and exactly 8 DPI on both axes.
    """
    target = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile(mode='wb+', suffix='.png') as scratch:
        # 8x8 one-bit image with the main diagonal set
        picture = Image.new('1', (8, 8), 0)
        for k in range(8):
            picture.putpixel((k, k), 1)
        picture.save(scratch.name, format='PNG')

        # Lay the image out at 8 DPI horizontally and vertically
        per_axis = (img2pdf.ImgSize.dpi, 8)
        layout = img2pdf.get_layout_fun(None, (per_axis, per_axis),
                                        None, None, None)

        # Read the PNG back through the temp file handle from the start
        scratch.seek(0)
        raw_png = scratch.read()

        payload = img2pdf.convert(
            raw_png,
            producer="img2pdf",
            with_pdfrw=False,
            layout_fun=layout)

        with open(target, 'wb') as sink:
            sink.write(payload)

    report = pageinfo.pdf_get_all_pageinfo(target)
    assert len(report) == 1

    sole_page = report[0]
    assert not sole_page['has_text']
    assert len(sole_page['images']) == 1

    img = sole_page['images'][0]
    assert img['width'] == 8
    assert img['color'] == 'gray'

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert img['bpc'] == 8

    # DPI in a 1"x1" is the image width
    assert img['dpi_w'] == 8
    assert img['dpi_h'] == 8
def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,
                               resources, image, outpdf):
    """Check that piping an image through ocrmypdf keeps its compression.

    The resource named by *image* is fed on stdin to the command in
    *ocrmypdf_exec* with --image-dpi 150 and --output-type pdf. The first
    image of the resulting PDF must:

    * not be JPEG-encoded when the input is a .png;
    * be JPEG-encoded when the input is a .jpg;
    * be 'rgb' when the input mode starts with RGB/BGR, or 'gray' when it
      starts with L.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed, so close the image file right away instead
    # of leaving the handle open for the rest of the test.
    with Image.open(input_file) as im:
        image_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi', '150',
            '--output-type', 'pdf',
            '-', output_file
        ]
        p = Popen(p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
                  stdin=input_stream, env=spoof_tesseract_noop)
        _, err = p.communicate()

    # Show the captured stderr if the subprocess did not exit cleanly
    assert p.returncode == ExitCode.ok, err

    pdfinfo = pdf_get_all_pageinfo(output_file)
    pdfimage = pdfinfo[0]['images'][0]

    if input_file.endswith('.png'):
        assert pdfimage['enc'] != 'jpeg', \
            "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage['enc'] == 'jpeg', \
            "Lossy compression changed to lossless!"

    if image_mode.startswith(('RGB', 'BGR')):
        assert pdfimage['color'] == 'rgb', \
            "Colorspace changed"
    elif image_mode.startswith('L'):
        assert pdfimage['color'] == 'gray', \
            "Colorspace changed"
def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` for the first page of *pdf*.

    *pdf* is converted with ``str()`` before being passed to
    ``pageinfo.pdf_get_all_pageinfo``.
    """
    from ocrmypdf import pageinfo

    first_page = pageinfo.pdf_get_all_pageinfo(str(pdf))[0]
    width = first_page['width_inches']
    height = first_page['height_inches']
    return (width, height)
def test_ocr_timeout(renderer):
    """Run 'skew.pdf' with --tesseract-timeout 1.0; first page must lack text."""
    out_name = 'test_timeout_%s.pdf' % renderer
    result_pdf = check_ocrmypdf(
        'skew.pdf', out_name,
        '--tesseract-timeout', '1.0')

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert not first_page['has_text']
def test_force_ocr():
    """Run with -f on 'graph_ocred.pdf' and expect text on the first page."""
    forced_pdf = check_ocrmypdf(
        'graph_ocred.pdf', 'test_force.pdf',
        '-f')
    first_page = pdf_get_all_pageinfo(forced_pdf)[0]
    assert first_page['has_text']
def test_skip_big():
    """Run with --skip-big 10 on 'enormous.pdf'; first page must lack text."""
    out = check_ocrmypdf(
        'enormous.pdf', 'test_enormous.pdf',
        '--skip-big', '10')

    pdfinfo = pdf_get_all_pageinfo(out)
    # Truthiness check instead of `== False`, matching the other variants
    # of this test.
    assert not pdfinfo[0]['has_text']
def test_form_xobject(resources):
    """Check the first image found in 'formxobject.pdf' has width 50."""
    pdf_path = resources / 'formxobject.pdf'
    first_page = pageinfo.pdf_get_all_pageinfo(str(pdf_path))[0]
    first_image = first_page['images'][0]
    assert first_image['width'] == 50
def test_no_contents(resources):
    """Check 'no_contents.pdf' reports no images and no text on page one."""
    filename = resources / 'no_contents.pdf'

    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))

    assert len(pdfinfo[0]['images']) == 0
    # Truthiness check instead of `== False`, per PEP 8.
    assert not pdfinfo[0]['has_text']