def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Check that pages are not duplicated when OCR is forced with a zero timeout.

    Runs OCRmyPDF on ``resources / basename`` with the sandwich renderer,
    ``--force-ocr`` and ``--tesseract-timeout 0``. It then verifies that every
    output page holds exactly one image and that each input page keeps its
    width in the output.

    NOTE(review): ``basename`` is presumably supplied by a parametrize
    decorator that is not visible in this chunk -- confirm in the full file.
    """
    source_pdf = resources / basename
    result_pdf = outdir / basename

    cli_options = (
        '--pdf-renderer', 'sandwich',
        '--force-ocr',
        '--tesseract-timeout', '0',
    )
    check_ocrmypdf(source_pdf, result_pdf, *cli_options)

    source_info = pdfinfo.PdfInfo(source_pdf)
    result_info = pdfinfo.PdfInfo(result_pdf)

    for result_page in result_info:
        image_count = len(result_page.images)
        assert image_count == 1, "skipped page was replicated"

    # Walk the input's page indices so each input page is compared with the
    # output page at the same position.
    page_total = len(source_info)
    for index in range(page_total):
        assert result_info[index].width_inches == source_info[index].width_inches
def test_pages_issue700(monkeypatch, resources):
    """Check that PdfInfo raises InputFileError when pdfminer yields no pages.

    ``PDFPage.get_pages`` is patched to return an empty iterator. A detailed
    analysis of ``cardinal.pdf`` must then raise ``InputFileError`` with a
    message matching "pdfminer".
    """

    def empty_page_iterator(*args, **kwargs):
        # Replacement for PDFPage.get_pages: accepts anything, yields nothing.
        return iter(())

    monkeypatch.setattr(PDFPage, 'get_pages', empty_page_iterator)

    analysis_options = dict(detailed_analysis=True, progbar=False, max_workers=1)
    with pytest.raises(InputFileError, match="pdfminer"):
        pdfinfo.PdfInfo(resources / 'cardinal.pdf', **analysis_options)
def test_content_preservation(ensure_tess4, resources, outpdf):
    """Check that the sandwich renderer leaves the page's images separate.

    Runs OCRmyPDF on ``masks.pdf`` with a zero Tesseract timeout, passing
    ``ensure_tess4`` as the environment. The first output page must still
    contain more than one image; otherwise the masks were rasterized.
    """
    source_pdf = resources / 'masks.pdf'
    cli_options = ['--pdf-renderer', 'sandwich', '--tesseract-timeout', '0']
    check_ocrmypdf(source_pdf, outpdf, *cli_options, env=ensure_tess4)

    first_page = pdfinfo.PdfInfo(outpdf)[0]
    assert len(first_page.images) > 1, "masks were rasterized"
def test_single_page_inline_image(eight_by_eight, outdir):
    """Check PdfInfo's report for a single inline image drawn at 1"x1".

    Draws the ``eight_by_eight`` fixture as an inline image into a 72x72 pt
    box, then asserts that PdfInfo reports it with 8 dpi horizontally, a gray
    colorspace and a width of 8.

    NOTE(review): ``eight_by_eight`` is presumably an 8x8 pixel image --
    confirm against the fixture definition.
    """
    pdf_path = outdir / 'image-mono-inline.pdf'
    page_size = (8 * 72, 6 * 72)
    canvas = Canvas(str(pdf_path), pagesize=page_size)

    # Draw image in a 72x72 pt or 1"x1" area
    canvas.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    print(info)

    inline_image = info[0].images[0]
    assert isclose(inline_image.dpi.x, 8)
    assert inline_image.color == Colorspace.gray
    assert inline_image.width == 8
def test_image_scale0(resources, outpdf):
    """Check DPI reporting for an XObject drawn with an all-zero matrix.

    Converts the first page of ``cmyk.pdf`` to a form XObject, copies it into
    a new 72x72 pt single-page PDF, and draws it under the transformation
    ``0 0 0 0 0 0 cm``. PdfInfo must then report a non-finite DPI for the
    image and ``Resolution(0, 0)`` for the page.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as source:
        form_xobject = pikepdf.Page(source.pages[0]).as_form_xobject()

        # The source PDF stays open until the new PDF has been saved, since
        # the copied object originates from it.
        target = pikepdf.Pdf.new()
        target.add_blank_page(page_size=(72, 72))
        target_page = pikepdf.Page(target.pages[0])
        objname = target_page.add_resource(
            target.copy_foreign(form_xobject),
            pikepdf.Name.XObject,
            pikepdf.Name.Im0,
        )
        print(objname)

        content = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        target.pages[0].Contents = pikepdf.Stream(target, content)
        target.save(outpdf)

    info = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1
    )
    assert not info.pages[0]._images[0].dpi.is_finite
    assert info.pages[0].dpi == Resolution(0, 0)
def test_single_page_inline_image(outdir):
    """Check PdfInfo's report for an 8x8 1-bit inline image drawn at 1"x1".

    Builds an 8x8 mode-``'1'`` image with its main diagonal set and draws it
    inline into a 72x72 pt box. It then asserts that PdfInfo reports 8 dpi
    horizontally, a gray colorspace and a width of 8.
    """
    pdf_path = outdir / 'image-mono-inline.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(8 * 72, 6 * 72))

    mono = Image.new('1', (8, 8), 0)
    for offset in range(8):
        # Set the pixels along the main diagonal.
        mono.putpixel((offset, offset), 1)

    # Draw image in a 72x72 pt or 1"x1" area
    canvas.drawInlineImage(mono, 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    print(info)

    inline_image = info[0].images[0]
    assert isclose(inline_image.dpi.x, 8)
    assert inline_image.color == Colorspace.gray
    assert inline_image.width == 8
def test_dpi_needed(image, text, vector, result, rgb_image, outdir):
    """Check the square DPI chosen for a page with optional content kinds.

    Builds a 5"x5" page that optionally contains an image, a text string
    and/or an ellipse, depending on the ``image``, ``text`` and ``vector``
    flags. Both ``_pipeline.get_canvas_square_dpi`` and
    ``_pipeline.get_page_square_dpi`` must return ``result`` for that page.

    NOTE(review): the flag/result combinations presumably come from a
    parametrize decorator not visible in this chunk -- confirm.
    """
    pdf_path = outdir / 'dpi.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(5 * inch, 5 * inch))

    if image:
        canvas.drawImage(
            rgb_image, 1 * inch, 1 * inch, width=1 * inch, height=1 * inch
        )
    if text:
        canvas.drawString(1 * inch, 4 * inch, "Actual text")
    if vector:
        canvas.ellipse(3 * inch, 3 * inch, 4 * inch, 4 * inch)
    canvas.showPage()
    canvas.save()

    # Stand-in options object; only ``oversample`` is set explicitly.
    options = Mock()
    options.oversample = DUMMY_OVERSAMPLE_RESOLUTION[0]

    first_page = pdfinfo.PdfInfo(pdf_path)[0]
    assert _pipeline.get_canvas_square_dpi(first_page, options) == result
    assert _pipeline.get_page_square_dpi(first_page, options) == result
def test_single_page_text(outdir):
    """Check PdfInfo's report for a one-page PDF that contains only text.

    Writes a single Helvetica 12 pt text line onto one page, then asserts
    that PdfInfo sees one page, that the page has text, and that it has no
    images.
    """
    pdf_path = outdir / 'text.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(8 * 72, 6 * 72))

    text_object = canvas.beginText()
    text_object.setFont('Helvetica', 12)
    text_object.setTextOrigin(1 * 72, 3 * 72)
    text_object.textLine(
        "Methink'st thou art a general offence and every"
        " man should beat thee."
    )
    canvas.drawText(text_object)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    assert len(info) == 1

    only_page = info[0]
    assert only_page.has_text
    assert len(only_page.images) == 0
def test_single_page_inline_image(outdir): filename = outdir / 'image-mono-inline.pdf' pdf = Canvas(str(filename), pagesize=(8 * 72, 6 * 72)) with NamedTemporaryFile() as im_tmp: im = Image.new('1', (8, 8), 0) for n in range(8): im.putpixel((n, n), 1) im.save(im_tmp.name, format='PNG') # Draw image in a 72x72 pt or 1"x1" area pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72) pdf.showPage() pdf.save() pdf = pdfinfo.PdfInfo(filename) print(pdf) pdfimage = pdf[0].images[0] assert isclose(pdfimage.xres, 8) assert pdfimage.color == Colorspace.rgb # reportlab produces color image assert pdfimage.width == 8
def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` for the first page of *pdf*.

    *pdf* is passed straight to ``pdfinfo.PdfInfo``; ``pdfinfo`` is imported
    locally from ``ocrmypdf``.
    """
    from ocrmypdf import pdfinfo

    first_page = pdfinfo.PdfInfo(pdf)[0]
    return first_page.width_inches, first_page.height_inches
def test_vector(resources):
    """Check that ``vector.pdf`` reports vector content and no text on page 0."""
    info = pdfinfo.PdfInfo(resources / 'vector.pdf')
    assert info[0].has_vector
    assert not info[0].has_text
def test_ocr_detection(resources):
    """Check that ``graph_ocred.pdf`` reports text and no vector content on page 0."""
    info = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    assert not info[0].has_vector
    assert info[0].has_text
def test_oversized_page(resources):
    """Check that the first image of ``poster.pdf`` is reported as oversized.

    The product of the image's ``width`` and ``xres`` must exceed 200.
    """
    info = pdfinfo.PdfInfo(resources / 'poster.pdf')
    first_image = info[0].images[0]
    scaled_width = first_image.width * first_image.xres
    assert scaled_width > 200, "this is supposed to be oversized"
def test_no_contents(resources):
    """Check that a page without contents reports no images and no text.

    Loads ``no_contents.pdf`` and asserts that its first page has an empty
    image list and a falsy ``has_text``.
    """
    info = pdfinfo.PdfInfo(resources / 'no_contents.pdf')
    assert len(info[0].images) == 0
    # Truthiness check instead of ``== False`` (PEP 8), matching the
    # ``assert not ...has_text`` form used by the other tests.
    assert not info[0].has_text
def test_form_xobject(resources):
    """Check that the first image found in ``formxobject.pdf`` has a width of 50."""
    info = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    first_image = info[0].images[0]
    assert first_image.width == 50
def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` for the first page of *pdf*.

    *pdf* is passed straight to ``pdfinfo.PdfInfo``.
    """
    first_page = pdfinfo.PdfInfo(pdf)[0]
    return first_page.width_inches, first_page.height_inches
def test_corrupt_font_detection(resources, testfile):
    """Check that detailed analysis flags corrupt text on the first page.

    NOTE(review): ``testfile`` is presumably supplied by a parametrize
    decorator not visible in this chunk -- confirm in the full file.
    """
    info = pdfinfo.PdfInfo(resources / testfile, detailed_analysis=True)
    first_page = info[0]
    assert first_page.has_corrupt_text