def test_text_filter(resources, outdir):
    """Strip all Tj (show text) operators from a page and verify no text remains.

    Uses the external ``pdftotext`` tool both to confirm the input file
    actually contains extractable text and to confirm the output does not.
    """
    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'

    # Ensure the test PDF has text we can find.
    proc = run(
        ['pdftotext', str(input_pdf), '-'], check=True, stdout=PIPE, encoding='utf-8'
    )
    assert proc.stdout.strip() != '', "Need input test file that contains text"

    pdf = Pdf.open(input_pdf)
    page = pdf.pages[0]

    # Rebuild the content stream, dropping every text-showing Tj operator.
    keep = [
        (operands, command)
        for operands, command in parse_content_stream(page)
        if command != Operator('Tj')
    ]
    new_stream = Stream(pdf, keep)  # pylint: disable=no-member
    page['/Contents'] = new_stream
    page['/Rotate'] = 90
    pdf.save(outdir / 'notext.pdf', True)

    proc = run(
        ['pdftotext', str(outdir / 'notext.pdf'), '-'],
        check=True,
        stdout=PIPE,
        encoding='utf-8',
    )
    assert proc.stdout.strip() == '', "Expected text to be removed"
def test_oddwidth_grayscale(bits, check_pixels):
    """Extract a tiny grayscale image whose bit depth is not byte-aligned."""
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(108, 72))

    # 3x2 image packed into three bytes; how the bits split depends on `bits`.
    imobj = Stream(
        pdf,
        bytes([0b00011011, 0b11011000, 0b00000001]),
        BitsPerComponent=bits,
        ColorSpace=Name.DeviceGray,
        Width=3,
        Height=2,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    pdf.pages[0].Contents = Stream(pdf, b'108 0 0 72 0 0 cm /Im0 Do')
    pdf.pages[0].Resources = Dictionary(XObject=Dictionary(Im0=imobj))

    pim = PdfImage(pdf.pages[0].Resources.XObject.Im0)
    assert pim.mode == 'L'
    assert pim.bits_per_component == bits

    buffer = BytesIO()
    pim.extract_to(stream=buffer)
    buffer.seek(0)
    im = Image.open(buffer)
    assert im.mode == 'L'
    assert im.size == (3, 2)
    # pdf.save(f'oddbit_{bits}.pdf')
    for check_x, check_y, val in check_pixels:
        assert im.getpixel((check_x, check_y)) == val
def test_page_contents_add(resources, outdir):
    """Prepending and appending content streams to a page saves cleanly."""
    pdf = Pdf.open(resources / 'graph.pdf')
    rotate_prolog = Stream(pdf, b"q 0.707 -0.707 0.707 0.707 0 0 cm")
    restore_epilog = Stream(pdf, b"Q")

    page = pdf.pages[0]
    page.page_contents_add(rotate_prolog, True)   # prepend
    page.page_contents_add(restore_epilog, False)  # append
    pdf.save(outdir / 'out.pdf')
def test_create_pdf(outdir):
    """Assemble a complete one-page PDF (font, RGB image, content) from scratch."""
    pdf = Pdf.new()

    # Helvetica is a standard Type 1 font, so no embedding is required.
    font = pdf.make_indirect(
        Object.parse(
            b"""<<
                /Type /Font
                /Subtype /Type1
                /Name /F1
                /BaseFont /Helvetica
                /Encoding /WinAnsiEncoding
            >>"""
        )
    )

    width, height = 100, 100
    image_data = b"\xff\x7f\x00" * (width * height)  # one solid RGB color
    image = Stream(pdf, image_data)
    image.stream_dict = Object.parse(
        b"""<<
            /Type /XObject
            /Subtype /Image
            /ColorSpace /DeviceRGB
            /BitsPerComponent 8
            /Width 100
            /Height 100
        >>"""
    )

    resources = {
        '/Font': {'/F1': font},
        '/XObject': {'/Im1': image},
    }
    mediabox = [0, 0, 612, 792]
    stream = b"""
        BT /F1 24 Tf 72 720 Td (Hi there) Tj ET
        q 144 0 0 144 234 324 cm /Im1 Do Q
    """
    contents = Stream(pdf, stream)

    page_dict = {
        '/Type': Name('/Page'),
        '/MediaBox': mediabox,
        '/Contents': contents,
        '/Resources': resources,
    }
    page = pdf.make_indirect(page_dict)
    pdf.pages.append(page)
    pdf.save(outdir / 'hi.pdf')
def test_stream_dict_oneshot():
    """Stream dict entries may be passed as kwargs or as a dict at creation."""
    pdf = pikepdf.new()
    via_kwargs = Stream(pdf, b'12345', One=1, Two=2)
    via_dict = Stream(pdf, b'67890', {'/Three': 3, '/Four': 4})
    via_make_stream = pdf.make_stream(b'abcdef', One=1, Two=2)

    assert via_kwargs.One == 1
    assert via_kwargs.read_bytes() == b'12345'
    assert via_dict.Three == 3
    assert via_make_stream.One == 1
def _from_pil_image(cls, *, pdf, page, name, image):  # pragma: no cover
    """Insert a PIL image into a PDF (rudimentary)

    Args:
        pdf (pikepdf.Pdf): the PDF to attach the image to
        page (pikepdf.Object): the page to attach the image to
        name (str or pikepdf.Name): the name to set the image
        image (PIL.Image.Image): the image to insert
    """
    imstream = Stream(pdf, image.tobytes())
    imstream.Type = Name('/XObject')
    imstream.Subtype = Name('/Image')
    # NOTE(review): modes other than RGB/1/L leave ColorSpace unset — presumably
    # callers only pass these modes; confirm before widening use.
    if image.mode == 'RGB':
        imstream.ColorSpace = Name('/DeviceRGB')
    elif image.mode in ('1', 'L'):
        imstream.ColorSpace = Name('/DeviceGray')
    # 1-bit for bilevel images, 8 bits per component otherwise.
    imstream.BitsPerComponent = 1 if image.mode == '1' else 8
    imstream.Width = image.width
    imstream.Height = image.height
    page.Resources.XObject[name] = imstream
    return cls(imstream)
def test_stream_data_equal(self):
    """Streams compare by content; differing data or dict entries break equality."""
    pdf_a = pikepdf.new()
    stream_a = Stream(pdf_a, b'abc')

    pdf_b = pikepdf.new()
    stream_b = Stream(pdf_b, b'abc')
    stream_b_longer = Stream(pdf_b, b'abcdef')

    assert stream_a == stream_b
    assert stream_b_longer != stream_b

    # Adding a stream-dict key makes otherwise-identical streams unequal.
    stream_b.stream_dict.SomeData = 1
    assert stream_b != stream_a
def test_page_contents_add(resources, outdir):
    """Adding a rotation prolog and a restore epilog to a page saves cleanly."""
    pdf = Pdf.open(resources / 'graph.pdf')
    rotate45 = PdfMatrix().rotated(45)
    prolog = Stream(pdf, b'q ' + rotate45.encode() + b' cm')
    epilog = Stream(pdf, b'Q')

    page = pdf.pages[0]
    page.page_contents_add(prolog, True)   # prepend
    page.page_contents_add(epilog, False)  # append
    pdf.save(outdir / 'out.pdf')
def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
    """Write raw pixels into a PDF image, reload the file, and extract with PIL."""
    pdf = Pdf.new()

    image = Stream(pdf, pixeldata * (w * h))
    image.Type = Name('/XObject')
    image.Subtype = Name('/Image')
    image.ColorSpace = Name(cs)
    image.BitsPerComponent = bpc
    image.Width = w
    image.Height = h

    contents = Stream(pdf, b'q 100 0 0 100 0 0 cm /Im1 Do Q')
    page_dict = {
        '/Type': Name('/Page'),
        '/MediaBox': [0, 0, 100, 100],
        '/Contents': contents,
        '/Resources': {'/XObject': {'/Im1': image}},
    }
    pdf.pages.append(pdf.make_indirect(page_dict))

    outfile = outdir / f'test{w}{h}{cs[1:]}{bpc}.pdf'
    # Disable compression so stored stream bytes equal the raw pixel data.
    pdf.save(
        outfile, compress_streams=False, stream_decode_level=StreamDecodeLevel.none
    )

    with Pdf.open(outfile) as p2:
        pim = PdfImage(p2.pages[0].Resources.XObject['/Im1'])

        assert pim.bits_per_component == bpc
        assert pim.colorspace == cs
        assert pim.width == w
        assert pim.height == h
        if cs == '/DeviceRGB':
            assert pim.mode == 'RGB'
        elif cs == '/DeviceGray' and bpc == 8:
            assert pim.mode == 'L'
        elif cs == '/DeviceCMYK':
            assert pim.mode == 'CMYK'
        elif bpc == 1:
            assert pim.mode == '1'
        assert not pim.palette
        assert pim.filters == []
        assert pim.read_bytes() == pixeldata

        outstream = BytesIO()
        pim.extract_to(stream=outstream)
        outstream.seek(0)
        im = Image.open(outstream)
        assert pim.mode == im.mode
def test_page_contents_add(graph, outdir):
    """page_contents_add works on pages and rejects non-Page objects."""
    pdf = graph
    rotate45 = PdfMatrix().rotated(45)
    prolog = Stream(pdf, b'q ' + rotate45.encode() + b' cm')
    epilog = Stream(pdf, b'Q')

    pdf.pages[0].page_contents_add(prolog, True)
    pdf.pages[0].page_contents_add(epilog, False)
    pdf.save(outdir / 'out.pdf')

    # An arbitrary Array is not a page dictionary.
    with pytest.raises(TypeError, match="Not a Page"):
        Array([42]).page_contents_add(prolog)
def test_page_labels():
    """Page labels: roman numerals for pages 0-1, then 'Prefix-42' style onward."""
    p = Pdf.new()
    d = Dictionary(Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary())
    for n in range(5):
        p.pages.append(d)
        p.pages[n].Contents = Stream(p, b"BT (Page %s) Tj ET" % str(n).encode())

    p.Root.PageLabels = p.make_indirect(
        Dictionary(
            Nums=Array(
                [
                    0,  # new label rules begin at index 0
                    Dictionary(S=Name.r),  # use lowercase roman numerals, until...
                    2,  # new label rules begin at index 2
                    Dictionary(
                        S=Name.D, St=42, P='Prefix-'
                    ),  # label pages as 'Prefix-42', 'Prefix-43', ...
                ]
            )
        )
    )

    expected = ['i', 'ii', 'Prefix-42', 'Prefix-43', 'Prefix-44']
    for n, want in enumerate(expected):
        assert Page(p.pages[n]).label == want
def strip_invisible_text(pdf, page):
    """Rewrite a page's content stream with invisible text objects removed.

    A text object (BT..ET) whose text render mode is 3 (invisible, per the
    PDF specification) is dropped entirely; all other content is kept.
    Mutates ``page.Contents`` in place.
    """
    stream = []           # operators to keep
    in_text_obj = False   # True while between BT and ET
    render_mode = 0       # current Tr mode within the text object
    text_objects = []     # buffered operators for the current text object

    rich_page = Page(page)
    rich_page.contents_coalesce()  # merge multiple content streams into one
    for operands, operator in parse_content_stream(page, ''):
        if not in_text_obj:
            if operator == Operator('BT'):
                in_text_obj = True
                render_mode = 0  # each BT resets to the default render mode
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == Operator('Tr'):
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == Operator('ET'):
                in_text_obj = False
                # Keep the buffered text object only if it was visible.
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    content_stream = unparse_content_stream(stream)
    page.Contents = Stream(pdf, content_stream)
def test_docinfo_problems(sandwich, invalid_creationdate):
    # Ghostscript-produced XMP packet; CreatorTool ends with an invalid XML
    # character sequence, which exercises the metadata repair paths.
    sandwich.Root.Metadata = Stream(
        sandwich,
        b"""
<?xpacket begin='\xc3\xaf\xc2\xbb\xc2\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
<?adobe-xap-filters esc="CRLF"?>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' xmlns:iX='http://ns.adobe.com/iX/1.0/'>
<rdf:Description rdf:about='uuid:873a76ba-4819-11f4-0000-5c5716666531' xmlns:pdf='http://ns.adobe.com/pdf/1.3/' pdf:Producer='GPL Ghostscript 9.26'/>
<rdf:Description rdf:about='uuid:873a76ba-4819-11f4-0000-5c5716666531' xmlns:xmp='http://ns.adobe.com/xap/1.0/'><xmp:ModifyDate>2019-01-04T00:44:42-08:00</xmp:ModifyDate>
<xmp:CreateDate>2019-01-04T00:44:42-08:00</xmp:CreateDate>
<xmp:CreatorTool>Acrobat 4.0 Scan Plug-in for Windows&#0;</xmp:CreatorTool></rdf:Description>
<rdf:Description rdf:about='uuid:873a76ba-4819-11f4-0000-5c5716666531' xmlns:xapMM='http://ns.adobe.com/xap/1.0/mm/' xapMM:DocumentID='uuid:873a76ba-4819-11f4-0000-5c5716666531'/>
<rdf:Description rdf:about='uuid:873a76ba-4819-11f4-0000-5c5716666531' xmlns:dc='http://purl.org/dc/elements/1.1/' dc:format='application/pdf'><dc:title><rdf:Alt><rdf:li xml:lang='x-default'>Untitled</rdf:li></rdf:Alt></dc:title></rdf:Description>
</rdf:RDF>
</x:xmpmeta>
""",
    )
    meta = sandwich.open_metadata()
    meta._load()  # File has invalid XML sequence &#0;

    with meta:
        # Copying an unconvertible docinfo value warns by default...
        with pytest.warns(UserWarning) as warned:
            meta.load_from_docinfo(invalid_creationdate.docinfo)
        assert 'could not be copied' in warned[0].message.args[0]
        # ...and raises when raise_failure is requested.
        with pytest.raises(ValueError):
            meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True)

    # Writing an unparseable date through the XMP interface also warns.
    with pytest.warns(UserWarning) as warned:
        with meta as xmp:
            xmp['xmp:CreateDate'] = 'invalid date'
    assert 'could not be updated' in warned[0].message.args[0]
def test_no_x_xmpmeta(trivial):
    # XMP packet whose rdf:RDF root is NOT wrapped in the usual <x:xmpmeta>
    # element; the metadata handler must still find the RDF root and allow edits.
    trivial.Root.Metadata = Stream(
        trivial,
        b"""
<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:xmp="http://ns.adobe.com/xap/1.0/">
  <rdf:Description rdf:about=""
      xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
      xmlns:xmp="http://ns.adobe.com/xap/1.0/">
    <pdfaid:part>1</pdfaid:part>
    <pdfaid:conformance>A</pdfaid:conformance>
    <xmp:CreatorTool>Simple Scan 3.30.2</xmp:CreatorTool>
    <xmp:CreateDate>2019-02-05T07:08:46+01:00</xmp:CreateDate>
    <xmp:ModifyDate>2019-02-05T07:08:46+01:00</xmp:ModifyDate>
    <xmp:MetadataDate>2019-02-05T07:08:46+01:00</xmp:MetadataDate>
  </rdf:Description>
</rdf:RDF>
<?xpacket end="w"?>
""".strip(),
    )

    with trivial.open_metadata() as xmp:
        assert xmp._get_rdf_root() is not None
        xmp['pdfaid:part'] = '2'
    assert xmp['pdfaid:part'] == '2'
def test_empty_xmpmeta(trivial):
    # An x:xmpmeta wrapper with no rdf:RDF inside must open without error.
    trivial.Root.Metadata = Stream(
        trivial,
        b"""<?xpacket begin="" id=""?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="">
</x:xmpmeta>
<?xpacket end=""?>
""",
    )
    with trivial.open_metadata() as xmp:
        pass
def test_remove_unreferenced(resources, outdir):
    """Blanking a page's contents then pruning resources should shrink the file."""
    source = resources / 'sandwich.pdf'
    saved_before = outdir / 'out1.pdf'
    saved_after = outdir / 'out2.pdf'

    pdf = Pdf.open(source)
    # Replace the page content so its resources become unreferenced.
    pdf.pages[0].Contents = Stream(pdf, b' ')
    pdf.save(saved_before)

    pdf.remove_unreferenced_resources()
    pdf.save(saved_after)

    assert saved_after.stat().st_size < saved_before.stat().st_size
def test_wrong_xml(enron1): enron1.Root.Metadata = Stream( enron1, b""" <test><xml>This is valid xml but not valid XMP</xml></test> """.strip(), ) meta = enron1.open_metadata() with pytest.raises(ValueError, match='not XMP'): with meta: pass with pytest.raises(ValueError, match='not XMP'): meta['pdfaid:part']
def test_wrong_xml(sandwich): sandwich.Root.Metadata = Stream( sandwich, b""" <test><xml>This is valid xml but not valid XMP</xml></test> """.strip(), ) meta = sandwich.open_metadata(strict=True) with pytest.raises(ValueError, match='not XMP'): with meta: pass with pytest.raises(ValueError, match='not XMP'): meta['pdfaid:part']
def test_palette_nonrgb(base, hival, palette, expect_type):
    """The palette property reports the base colorspace type and raw palette bytes."""
    pdf = pikepdf.new()
    indexed_image = Stream(
        pdf,
        b'\x00\x01\x02\x03' * 4,
        BitsPerComponent=8,
        ColorSpace=Array([Name.Indexed, base, hival, palette]),
        Width=16,
        Height=1,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    assert pikepdf.PdfImage(indexed_image).palette == (expect_type, palette)
def test_present_bug_empty_tags(trivial):
    # XMP with an empty <rdf:li/> inside dc:creator: syncing to docinfo must
    # not produce an /Author entry from the empty element.
    trivial.Root.Metadata = Stream(
        trivial,
        b"""
<?xpacket begin='\xc3\xaf\xc2\xbb\xc2\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
<?adobe-xap-filters esc="CRLF"?>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' xmlns:iX='http://ns.adobe.com/iX/1.0/'>
<rdf:Description rdf:about=""><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/"><rdf:Seq><rdf:li/></rdf:Seq></dc:creator></rdf:Description>
</rdf:RDF>
</x:xmpmeta>
""",
    )
    with trivial.open_metadata(update_docinfo=True) as meta:
        pass
    assert Name.Author not in trivial.docinfo
def test_issue_162(trivial, author):
    # Regression test: existing XMP has dc:creator as a bare attribute; merging
    # a docinfo /Author must warn about merging elements yet still succeed.
    trivial.Root.Metadata = Stream(
        trivial,
        b"""
<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:xmp="http://ns.adobe.com/xap/1.0/"
    xmlns:dc="http://purl.org/dc/elements/1.1/">
  <rdf:Description rdf:about=""
      xmlns:dc="http://purl.org/dc/elements/1.1/"
      dc:creator="Foo"></rdf:Description>
</rdf:RDF>
<?xpacket end="w"?>""",
    )
    with trivial.open_metadata() as m:
        docinfo = pikepdf.Dictionary(Author=author)
        with pytest.warns(UserWarning, match=r'Merging elements'):
            m.load_from_docinfo(docinfo, raise_failure=True)
        assert m['dc:creator'] == [author]
def test_palette_nonrgb(base, hival, bits, palette, expect_type, expect_mode):
    """Indexed images with non-RGB base colorspaces report palette and extract."""
    pdf = pikepdf.new()
    indexed_image = Stream(
        pdf,
        b'\x00\x01\x02\x03' * 16,
        BitsPerComponent=bits,
        ColorSpace=Array([Name.Indexed, base, hival, palette]),
        Width=16,
        Height=4,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    pim = pikepdf.PdfImage(indexed_image)
    assert pim.palette == (expect_type, palette)
    pim.extract_to(stream=BytesIO())
    # To view images:
    # pim.extract_to(fileprefix=f'palette_nonrgb_{expect_type}_{bits}')
    assert pim.mode == expect_mode
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1 we do several JBIG2 images at once
    and build a symbol dictionary that will span several pages. Each JBIG2
    image must refer to its symbol dictionary. If too many pages shared the
    same dictionary JBIG2 encoding becomes more expensive and less efficient.
    The default value of 10 was determined through testing. Currently this
    must be lossy encoding since jbig2enc does not support refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this must be lossless JBIG2.
    """
    jbig2_globals_dict: Optional[Dictionary]

    # Run the (possibly parallel) jbig2enc conversions; outputs land under `root`.
    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')
        if jbig2_symfile.exists():
            # Symbolic mode: wrap the shared symbol dictionary as JBIG2Globals.
            jbig2_globals_data = jbig2_symfile.read_bytes()
            jbig2_globals = Stream(pike, jbig2_globals_data)
            jbig2_globals_dict = Dictionary(JBIG2Globals=jbig2_globals)
        elif options.jbig2_page_group_size == 1:
            # Generic mode: each image stands alone; no globals stream needed.
            jbig2_globals_dict = None
        else:
            # Group size > 1 implies a symbol file must exist; its absence
            # means the encoder failed to produce expected output.
            raise FileNotFoundError(jbig2_symfile)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            # Encoded images are named group<g>.<n> alongside the .sym file.
            jbig2_im_file = root / (prefix + f'.{n:04d}')
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            # Replace the image stream in place with the JBIG2-encoded data.
            im_obj.write(jbig2_im_data, filter=Name.JBIG2Decode,
                         decode_parms=jbig2_globals_dict)
def test_dict_or_array_dict():
    """DecodeParms supplied as a one-element array of dicts is unpacked properly."""
    pdf = pikepdf.new()
    imobj = Stream(
        pdf,
        b'dummy',
        BitsPerComponent=1,
        ColorSpace=Name.DeviceGray,
        DecodeParms=Array(
            [
                Dictionary(BlackIs1=False, Columns=16, K=-1),
            ]
        ),
        Filter=Array([Name.CCITTFaxDecode]),
        Height=16,
        Width=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    pim = pikepdf.PdfImage(imobj)
    # Check that array of dict is unpacked properly
    assert pim.decode_parms[0].K == -1
def test_xxe(trivial, outdir):
    # XML External Entity (XXE) attack: the metadata declares an external
    # entity pointing at a local file. The XMP parser must NOT resolve it,
    # so the file's contents must never appear in the parsed metadata.
    secret = outdir / 'secret.txt'
    secret.write_text("This is a secret")

    trivial.Root.Metadata = Stream(
        trivial,
        b"""\
<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
<!DOCTYPE rdf:RDF [<!ENTITY xxe SYSTEM "file://%s">]>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='Image'>
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <note>
            <to>&xxe;</to>
            <from>xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx</from>
        </note>
    </rdf:RDF>
</x:xmpmeta>
<?xpacket end='w'?>
"""
        % os.fsencode(secret),
    )

    with trivial.open_metadata() as m:
        assert 'This is a secret' not in str(m)
def stream_object():
    """Fixture: an empty Stream attached to a fresh Pdf."""
    owner = pikepdf.new()
    return Stream(owner, b'')
def test_stream_isinstance():
    """A Stream instance is both a Stream and a (base) Object."""
    owner = pikepdf.new()
    candidate = Stream(owner, b'xyz')
    assert isinstance(candidate, Stream)
    assert isinstance(candidate, Object)
def test_stream_no_dangling_stream_on_failure():
    """A failed Stream() constructor must not leave a partial object behind."""
    p = pikepdf.new()
    count_before = len(p.objects)
    # A list is not a valid stream dictionary, so construction fails.
    with pytest.raises(AttributeError):
        Stream(p, b'3.14159', ['Not a mapping object'])
    assert len(p.objects) == count_before, "A dangling object was created"
def abcxyz_stream():
    """Fixture: a Stream holding the bytes b'abcxyz'."""
    owner = pikepdf.new()
    return Stream(owner, b'abcxyz')
def test_stream_bad_params():
    """Stream requires data; omitting it raises TypeError mentioning 'data'."""
    owner = pikepdf.new()
    with pytest.raises(TypeError, match='data'):
        Stream(owner)