def test_remove_background(resources, outdir):
    """Check that ``--remove-background`` yields pure white and pure black.

    The source JPEG is first verified *not* to span the full 0..255 range in
    all three bands; after processing and re-rendering the PDF, every band
    is expected to span exactly 0..255.
    """
    full_range = ((0, 255), (0, 255), (0, 255))
    source_image = resources / 'congress.jpg'

    # Precondition: the input must not already contain pure white/black,
    # otherwise the final assertion would prove nothing.
    with Image.open(source_image) as im:
        assert im.getextrema() != full_range

    cleaned_pdf = check_ocrmypdf(
        source_image,
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi',
        '150',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Render page 1 back to a 24-bit RGB PNG so the pixel range can be read.
    rendered_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        cleaned_pdf,
        rendered_png,
        raster_device='png16m',
        raster_dpi=Resolution(100, 100),
        pageno=1,
    )

    # Postcondition: the output image should contain pure white and black.
    with Image.open(rendered_png) as im:
        assert im.getextrema() == full_range
def test_deskew(resources, outdir):
    """Run OCRmyPDF with ``-d`` on a skewed page and confirm it came out level.

    The deskewed PDF is rasterized to a monochrome PNG and the residual skew
    angle reported by ``Pix.find_skew`` must lie strictly within +/-0.5.
    """
    straightened_pdf = check_ocrmypdf(
        resources / 'skew.pdf',
        outdir / 'skew.pdf',
        '-d',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Render the result as an image again so the skew can be measured on
    # the pixels actually produced.
    rendered_png = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        straightened_pdf,
        rendered_png,
        raster_device='pngmono',
        raster_dpi=Resolution(150, 150),
        pageno=1,
    )

    page_pix = Pix.open(rendered_png)
    skew_angle, _skew_confidence = page_pix.find_skew()

    # Echo the measured angle; it shows up in the captured output on failure.
    print(skew_angle)
    assert abs(skew_angle) < 0.5, "Deskewing failed"
def rasterize(pdf, pageno, png):
    """Render one page of *pdf* to *png* unless *png* is already on disk.

    Args:
        pdf: Path of the PDF to render.
        pageno: Page number forwarded to ``ghostscript.rasterize_pdf``.
        png: Destination path; must provide ``exists()``.

    Returns:
        None. When *png* already exists its path is printed and nothing is
        rendered.
    """
    if not png.exists():
        # Fixed settings: monochrome PNG at 100x100 DPI with no rotation.
        ghostscript.rasterize_pdf(
            pdf,
            png,
            raster_device='pngmono',
            raster_dpi=Resolution(100, 100),
            pageno=pageno,
            rotation=0,
        )
        return

    # An earlier call already produced this file; just report it.
    print(png)
def test_mono_not_inverted(resources, outdir):
    """Optimize a monochrome PDF at level 3 and verify it was not inverted.

    The optimized file is rendered to a small grayscale PNG and the top-left
    pixel is expected to be 255 (white background).
    """
    optimized_pdf = outdir / 'out.pdf'
    preview_png = outdir / 'im.png'

    opt.main(resources / '2400dpi.pdf', optimized_pdf, level=3)

    # 10x10 DPI keeps the preview small; only one corner pixel is inspected.
    rasterize_pdf(
        optimized_pdf,
        preview_png,
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(preview_png)) as im:
        corner_pixel = im.getpixel((0, 0))
        assert corner_pixel == 255, "Expected white background"
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
):
    """Rasterize a PDF page by delegating to ``ghostscript.rasterize_pdf``.

    All arguments are forwarded unchanged: *input_file* and *output_file*
    positionally, the remaining ones by keyword.

    Returns:
        *output_file*, exactly as it was passed in.
    """
    render_options = {
        'raster_device': raster_device,
        'raster_dpi': raster_dpi,
        'pageno': pageno,
        'page_dpi': page_dpi,
        'rotation': rotation,
        'filter_vector': filter_vector,
    }
    ghostscript.rasterize_pdf(input_file, output_file, **render_options)
    return output_file
def test_rasterize_size(francais, outdir):
    """Verify ``rasterize_pdf`` honours the requested pixel size and page DPI.

    The raster DPI is chosen per axis as target pixels divided by page size
    in inches, so the rendered image should be exactly 50x30 pixels, while
    the DPI recorded in the PNG should be the separately forced ``page_dpi``.
    """
    path, pdf = francais
    media_box = pdf.pages[0].MediaBox
    width_pts, height_pts = media_box[2], media_box[3]

    # Entries 2 and 3 are used directly as width/height below, which relies
    # on the first two entries (the box origin) being zero.
    assert media_box[0] == media_box[1] == 0

    # Points to inches: 72 points per inch.
    width_in = width_pts / Decimal(72)
    height_in = height_pts / Decimal(72)

    target_width, target_height = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)
    rendered_png = outdir / 'out.png'

    rasterize_pdf(
        path,
        rendered_png,
        raster_device='pngmono',
        raster_dpi=Resolution(target_width / width_in, target_height / height_in),
        page_dpi=forced_dpi,
    )

    with Image.open(rendered_png) as im:
        assert im.size == (target_width, target_height)
        assert im.info['dpi'] == forced_dpi
def test_rasterize_rotated(francais, outdir, caplog):
    """Verify size and DPI are swapped when rasterizing with ``rotation=90``.

    Same setup as a 50x30 pixel render with a forced page DPI, but with a
    90 degree rotation: the resulting image is expected to be 30x50 pixels
    and its recorded DPI to have the x and y components exchanged.
    """
    path, pdf = francais
    media_box = pdf.pages[0].MediaBox
    width_pts, height_pts = media_box[2], media_box[3]

    # Entries 2 and 3 are used directly as width/height below, which relies
    # on the first two entries (the box origin) being zero.
    assert media_box[0] == media_box[1] == 0

    # Points to inches: 72 points per inch.
    width_in = width_pts / Decimal(72)
    height_in = height_pts / Decimal(72)

    target_width, target_height = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)
    rendered_png = outdir / 'out.png'

    # Capture debug-level log records emitted during rasterization.
    caplog.set_level(logging.DEBUG)
    rasterize_pdf(
        path,
        rendered_png,
        raster_device='pngmono',
        raster_dpi=Resolution(target_width / width_in, target_height / height_in),
        page_dpi=forced_dpi,
        rotation=90,
    )

    with Image.open(rendered_png) as im:
        # A quarter turn exchanges the axes of both the size and the DPI.
        assert im.size == (target_height, target_width)
        assert im.info['dpi'] == (forced_dpi[1], forced_dpi[0])