def test_stream_data_equal(self):
    """Check stream equality across documents and against differing data/dicts."""
    first_pdf = pikepdf.new()
    first = Stream(first_pdf, b'abc')

    second_pdf = pikepdf.new()
    second = Stream(second_pdf, b'abc')
    longer = Stream(second_pdf, b'abcdef')

    # Identical bytes held by two different documents are expected to match.
    assert first == second
    # Different bytes in the same document are expected to differ.
    assert longer != second

    # Adding a key to one stream dictionary must break the equality.
    second.stream_dict.SomeData = 1
    assert second != first
def test_random_valid_docinfo(docinfo):
    """Load a docinfo mapping into XMP metadata and verify the result is XML.

    :param docinfo: mapping convertible to a ``pikepdf.Dictionary``
    """
    document = pikepdf.new()
    with document.open_metadata() as meta:
        # raise_failure=True turns any entry that cannot be copied into an error.
        meta.load_from_docinfo(pikepdf.Dictionary(docinfo), raise_failure=True)
        # The serialized metadata must be parseable.
        ET.fromstring(str(meta))
def test_report_file_size(tmp_path, caplog):
    """Exercise the log messages emitted by ``vd.report_output_file_size``.

    :param tmp_path: directory in which the input/output PDFs are written
    :param caplog: log capture whose ``text`` is inspected after each call
    """
    in_ = tmp_path / 'a.pdf'
    out = tmp_path / 'b.pdf'
    pdf = pikepdf.new()
    for target in (in_, out):
        pdf.save(target)

    # Input and output saved from the same document: nothing should be logged.
    opts = make_opts(output_type='pdf')
    vd.report_output_file_size(opts, in_, out)
    assert caplog.text == ''
    caplog.clear()

    # Make the output carry twice as much filler data as the input.
    waste_of_space = b'Dummy' * 5000
    pdf.Root.Dummy = waste_of_space
    pdf.save(in_)
    pdf.Root.Dummy2 = waste_of_space + waste_of_space
    pdf.save(out)

    # (jbig2enc reported available?, text expected in the log)
    scenarios = (
        (True, 'No reason'),
        (False, 'optional dependency'),
    )
    for jbig2_available, expected_text in scenarios:
        with patch(
            'ocrmypdf._validation.jbig2enc.available',
            return_value=jbig2_available,
        ), patch('ocrmypdf._validation.pngquant.available', return_value=True):
            vd.report_output_file_size(opts, in_, out)
        assert expected_text in caplog.text
        caplog.clear()

    # With optimize=0 the message should mention that optimization is disabled.
    opts = make_opts(in_, out, optimize=0, output_type='pdf')
    vd.report_output_file_size(opts, in_, out)
    assert 'disabled' in caplog.text
    caplog.clear()
def test_oddwidth_grayscale(bits, check_pixels):
    """Extract a 3x2 grayscale image and compare selected pixels.

    :param bits: value used for the image's ``BitsPerComponent``
    :param check_pixels: iterable of ``(x, y, expected_value)`` triples
    """
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(108, 72))

    raw_rows = bytes([0b00011011, 0b11011000, 0b00000001])
    imobj = Stream(
        pdf,
        raw_rows,
        BitsPerComponent=bits,
        ColorSpace=Name.DeviceGray,
        Width=3,
        Height=2,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    page = pdf.pages[0]
    page.Contents = Stream(pdf, b'108 0 0 72 0 0 cm /Im0 Do')
    page.Resources = Dictionary(XObject=Dictionary(Im0=imobj))

    pim = PdfImage(page.Resources.XObject.Im0)
    assert pim.mode == 'L'
    assert pim.bits_per_component == bits

    buffer = BytesIO()
    pim.extract_to(stream=buffer)
    buffer.seek(0)
    extracted = Image.open(buffer)
    assert extracted.mode == 'L'
    assert extracted.size == (3, 2)

    # For manual inspection: pdf.save(f'oddbit_{bits}.pdf')
    for x, y, expected in check_pixels:
        assert extracted.getpixel((x, y)) == expected
def fetch_attachment_entel(self, filename: str, date: datetime) -> None:
    """Re-save a password-protected Entel attachment as ``<YYYY>_<MM>.pdf``.

    The attachment ``self.path_pdf + filename`` is first renamed to a staging
    name ``<MM>_<YYYY>_<filename>``, then opened with the password and its
    pages are copied into a fresh PDF saved as ``<YYYY>_<MM>.pdf``. The
    staging file is always removed afterwards. A failure during conversion is
    logged and swallowed (best effort), so no output file is guaranteed.

    :param filename: name of the downloaded attachment inside ``self.path_pdf``
    :param date: date whose month and year are used to build the file names
    :return: None
    """
    import logging

    month = date.strftime("%m")
    year = date.strftime("%Y")
    staged_path = self.path_pdf + month + "_" + year + "_" + filename
    output_path = self.path_pdf + year + "_" + month + ".pdf"

    os.rename(self.path_pdf + filename, staged_path)
    try:
        # Context managers close both documents before the staging file is
        # removed; deleting a file that is still open fails on some platforms.
        with pikepdf.open(staged_path, password='******') as init_pdf:
            with pikepdf.new() as new_pdf:
                new_pdf.pages.extend(init_pdf.pages)
                new_pdf.save(output_path)
    except Exception:
        # Best effort: keep going, but leave a trace instead of failing silently.
        logging.getLogger(__name__).exception(
            "Could not convert Entel attachment %s", staged_path
        )
    finally:
        os.remove(staged_path)
def main():
    """Spoof command-line entry point driven entirely by ``sys.argv``.

    Recognized invocations (checked in this order):

    * ``--version`` / ``--list-langs`` as first argument: print fixed text.
    * ``--print-parameters`` as second-to-last argument: print a parameter.
    * ``hocr`` as second-to-last argument: write ``<output>.hocr`` from
      ``HOCR_TEMPLATE`` using the input image size, plus an empty
      ``<output>.txt``.
    * ``pdf`` as second-to-last argument: write ``<output>.pdf`` (a blank page
      when ``textonly_pdf=1`` is present, otherwise the image converted with
      img2pdf) plus an empty ``<output>.txt``.
    * ``stdout`` as last argument: print fixed orientation/script info.

    Anything else, including being called with no arguments at all, prints a
    complaint and exits with status 1. Successful paths exit with status 0.
    """
    # Without this guard ``sys.argv[1]`` would raise IndexError instead of
    # reporting the unsupported invocation.
    if len(sys.argv) < 2:
        print("Spoof doesn't understand arguments", file=sys.stderr)
        print(sys.argv, file=sys.stderr)
        sys.exit(1)

    if sys.argv[1] == '--version':
        print(VERSION_STRING, file=sys.stderr)
        sys.exit(0)
    elif sys.argv[1] == '--list-langs':
        print('List of available languages (1):\neng', file=sys.stderr)
        sys.exit(0)
    elif sys.argv[-2] == '--print-parameters':
        print("Some parameters", file=sys.stderr)
        print("textonly_pdf\t1\tSome help text")
        sys.exit(0)
    elif sys.argv[-2] == 'hocr':
        inputf, output = sys.argv[-4], sys.argv[-3]
        with Image.open(inputf) as im, open(
            output + '.hocr', 'w', encoding='utf-8'
        ) as f:
            w, h = im.size
            f.write(HOCR_TEMPLATE.format(str(w), str(h)))
        with open(output + '.txt', 'w') as f:
            f.write('')
    elif sys.argv[-2] == 'pdf':
        inputf, output = sys.argv[-4], sys.argv[-3]
        if 'textonly_pdf=1' in sys.argv:
            with Image.open(inputf) as im:
                dpi = im.info['dpi']
                # Pixel size / dpi gives inches; 72 points per inch.
                pagesize = im.size[0] / dpi[0], im.size[1] / dpi[1]
                ptsize = pagesize[0] * 72, pagesize[1] * 72

            pdf_out = pikepdf.new()
            pdf_out.add_blank_page(page_size=ptsize)
            pdf_out.save(Path(output).with_suffix('.pdf'), static_id=True)
            Path(output).with_suffix('.txt').write_text('')
        else:
            pdf_bytes = img2pdf.convert([inputf], dpi=300)
            with open(output + '.pdf', 'wb') as f:
                f.write(pdf_bytes)
            with open(output + '.txt', 'w') as f:
                f.write('')
    elif sys.argv[-1] == 'stdout':
        print(
            """Orientation: 0
Orientation in degrees: 0
Orientation confidence: 100.00
Script: 1
Script confidence: 100.00""",
            file=sys.stderr,
        )
    else:
        print("Spoof doesn't understand arguments", file=sys.stderr)
        print(sys.argv, file=sys.stderr)
        sys.exit(1)

    sys.exit(0)
def generate_pdf(input_file, output_pdf, output_text, options):
    """Write a blank one-page PDF sized like ``input_file`` and an empty text file.

    :param input_file: image readable by ``Image.open``; must carry ``dpi`` info
    :param output_pdf: destination passed to ``Pdf.save``
    :param output_text: path-like object with ``write_text``; receives ``''``
    :param options: accepted for interface compatibility; not used here
    """
    with Image.open(input_file) as image:
        resolution = image.info['dpi']
        # Pixels / dpi gives inches; multiply by 72 to get PDF points.
        width_pt = (image.size[0] / resolution[0]) * 72
        height_pt = (image.size[1] / resolution[1]) * 72

    blank = pikepdf.new()
    blank.add_blank_page(page_size=(width_pt, height_pt))
    blank.save(output_pdf, static_id=True)
    output_text.write_text('')
def test_stream_dict_oneshot():
    """Create streams whose dictionary entries are supplied at construction."""
    pdf = pikepdf.new()

    # Dictionary entries given as keyword arguments.
    from_kwargs = Stream(pdf, b'12345', One=1, Two=2)
    # Dictionary entries given as a mapping with explicit '/' keys.
    from_mapping = Stream(pdf, b'67890', {'/Three': 3, '/Four': 4})
    # Same keyword form through the Pdf.make_stream helper.
    from_helper = pdf.make_stream(b'abcdef', One=1, Two=2)

    assert from_kwargs.One == 1
    assert from_kwargs.read_bytes() == b'12345'
    assert from_mapping.Three == 3
    assert from_helper.One == 1
def test_random_docinfo(docinfo):
    """Load an arbitrary docinfo mapping; accept only the known failure modes.

    :param docinfo: mapping convertible to a ``pikepdf.Dictionary``
    """
    document = pikepdf.new()
    with document.open_metadata() as meta:
        source = pikepdf.Dictionary(docinfo)
        try:
            meta.load_from_docinfo(source, raise_failure=True)
        except ValueError as exc:
            # Only these two explanations are acceptable for a rejection.
            message = str(exc)
            assert 'could not be copied to XMP' in message or '/Dummy' in message
        else:
            # On success the serialized metadata must be parseable.
            ET.fromstring(str(meta))
def PDF_Merge(filePath1, filePath2, savePath):
    """Merge two PDF files and write the result to ``savePath``.

    The pages of ``filePath1`` come first, followed by those of ``filePath2``.

    :param filePath1: path of the first PDF
    :param filePath2: path of the second PDF
    :param savePath: destination path, including the file name
    :return: None
    """
    with pikepdf.new() as pdf, \
            pikepdf.open(filePath1) as pdf1, \
            pikepdf.open(filePath2) as pdf2:
        pdf.pages.extend(pdf1.pages)
        pdf.pages.extend(pdf2.pages)
        # Save while both sources are still open: copied pages may still
        # depend on data held by the source documents (NOTE(review): confirm
        # against the pikepdf page-copying documentation).
        pdf.save(savePath)
def test_palette_nonrgb(base, hival, palette, expect_type):
    """Check the palette reported for an Indexed image over a non-RGB base.

    :param base: base colorspace placed in the Indexed array
    :param hival: highest valid palette index
    :param palette: palette lookup data
    :param expect_type: base type expected in ``PdfImage.palette``
    """
    pdf = pikepdf.new()
    indexed_cs = Array([Name.Indexed, base, hival, palette])
    pixel_data = b'\x00\x01\x02\x03' * 4  # 16 one-byte samples
    imobj = Stream(
        pdf,
        pixel_data,
        BitsPerComponent=8,
        ColorSpace=indexed_cs,
        Width=16,
        Height=1,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    image = pikepdf.PdfImage(imobj)
    assert image.palette == (expect_type, palette)
def test_palette_nonrgb(base, hival, bits, palette, expect_type, expect_mode):
    """Check palette, extraction and mode of an Indexed non-RGB image.

    :param base: base colorspace placed in the Indexed array
    :param hival: highest valid palette index
    :param bits: value used for ``BitsPerComponent``
    :param palette: palette lookup data
    :param expect_type: base type expected in ``PdfImage.palette``
    :param expect_mode: expected ``PdfImage.mode``
    """
    pdf = pikepdf.new()
    indexed_cs = Array([Name.Indexed, base, hival, palette])
    pixel_data = b'\x00\x01\x02\x03' * 16
    imobj = Stream(
        pdf,
        pixel_data,
        BitsPerComponent=bits,
        ColorSpace=indexed_cs,
        Width=16,
        Height=4,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    image = pikepdf.PdfImage(imobj)
    assert image.palette == (expect_type, palette)

    # Extraction must succeed; the output itself is discarded.
    image.extract_to(stream=BytesIO())
    # To view images:
    # image.extract_to(fileprefix=f'palette_nonrgb_{expect_type}_{bits}')
    assert image.mode == expect_mode
def split_pdf_to_page_blocks(
    src_pdf_fn: str,
    pages_per_block: int = 1,
    page_block_base_name: Optional[str] = None,
) -> Generator[List[str], None, None]:
    """Split a PDF into consecutive blocks of pages stored in temporary files.

    The generator yields exactly one list of file paths:

    * ``[]`` when the source has no pages;
    * ``[src_pdf_fn]`` when the source has fewer pages than one block;
    * otherwise one temporary PDF per block, named via ``build_block_fn``
      with the zero-based first and last page index of the block.

    The temporary directory is deleted when the generator is resumed or
    closed, so the yielded files must be consumed before that.

    :param src_pdf_fn: path of the PDF to split
    :param pages_per_block: maximum number of pages per block (>= 1)
    :param page_block_base_name: base name handed to ``build_block_fn``;
        defaults to the base name of ``src_pdf_fn``
    :raises ValueError: if ``pages_per_block`` is smaller than 1
    """
    # Reject sizes that would otherwise surface as ZeroDivisionError or
    # produce nonsensical blocks.
    if pages_per_block < 1:
        raise ValueError(
            f'pages_per_block must be >= 1, got {pages_per_block!r}'
        )

    with pikepdf.open(src_pdf_fn) as pdf:
        total_pages = len(pdf.pages)
        if total_pages < 1:
            yield []
            return
        if total_pages < pages_per_block:
            yield [src_pdf_fn]
            return

        if not page_block_base_name:
            page_block_base_name = os.path.basename(src_pdf_fn)

        temp_dir = mkdtemp()
        try:
            res: List[str] = []
            for page_start in range(0, total_pages, pages_per_block):
                page_end = min(page_start + pages_per_block, total_pages) - 1
                out_fn = os.path.join(
                    temp_dir,
                    build_block_fn(str(page_block_base_name), page_start, page_end),
                )
                # The context manager closes the block PDF even if saving fails.
                with pikepdf.new() as out_pdf:
                    for page_index in range(page_start, page_end + 1):
                        out_pdf.pages.append(pdf.pages[page_index])
                    out_pdf.save(out_fn)
                res.append(out_fn)
            yield res
        finally:
            shutil.rmtree(temp_dir)
def test_failed_add_page_cleanup():
    """A rejected ``pages.append`` must not leave a live object or drop one."""
    pdf = pikepdf.new()
    not_a_page = pikepdf.Dictionary(Type=pikepdf.Name.NotAPage)
    objects_before = len(pdf.objects)

    with pytest.raises(TypeError, match="only pages can be inserted"):
        pdf.pages.append(not_a_page)
    assert len(pdf.pages) == 0

    # If we fail to add a new page, we expect one new null object handle to
    # be added (since QPDF does not remove the object outright)
    assert len(pdf.objects) == objects_before + 1, "QPDF semantics changed"
    assert pdf.objects[-1] is None, "Left a stale object behind without deleting"

    # But we'd better not delete an existing object...
    owned = pdf.make_indirect(pikepdf.Dictionary(Type=pikepdf.Name.StillNotAPage))
    with pytest.raises(TypeError, match="only pages can be inserted"):
        pdf.pages.append(owned)
    assert len(pdf.pages) == 0
    assert owned.same_owner_as(pdf.Root)
def test_dict_or_array_dict():
    """``decode_parms`` must expose a dictionary wrapped in a one-item array."""
    pdf = pikepdf.new()
    fax_parms = Dictionary(
        BlackIs1=False,
        Columns=16,
        K=-1,
    )
    imobj = Stream(
        pdf,
        b'dummy',
        BitsPerComponent=1,
        ColorSpace=Name.DeviceGray,
        DecodeParms=Array([fax_parms]),
        Filter=Array([Name.CCITTFaxDecode]),
        Height=16,
        Width=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    image = pikepdf.PdfImage(imobj)
    # Check that array of dict is unpacked properly
    first_parms = image.decode_parms[0]
    assert first_parms.K == -1
def merge(pdf_streams, names, outpath, first_page):
    """Append all inputs into one PDF, label its pages and save it.

    :param pdf_streams: inputs passed one by one to ``append_pdf``
    :param names: names matching ``pdf_streams`` position by position
    :param outpath: destination passed to ``Pdf.save``
    :param first_page: number assigned to the first page after the leading
        "contents" pages
    """
    output = pikepdf.new()

    inputs = tqdm.tqdm(
        zip(pdf_streams, names), total=len(names), desc="Merging PDFs"
    )
    pgcounts = [append_pdf(output, stream, name) for stream, name in inputs]

    # add page numbering
    # Count the pages of the leading run of inputs recognised by is_contents.
    amount_of_contents = 0
    for page_count, name in zip(pgcounts, names):
        if not is_contents(name):
            break
        amount_of_contents += page_count

    if amount_of_contents != 0:
        # Style /r for the leading pages, then /D starting at first_page.
        front_matter_label = {"/S": pikepdf.Name("/r")}
        body_label = {"/S": pikepdf.Name("/D"), "/St": first_page}
        output.Root.PageLabels = {
            "/Nums": [0, front_matter_label, amount_of_contents, body_label]
        }

    with tqdm.tqdm(total=100, desc="Writing PDF") as pbar:
        reported = 0

        def update(percent):
            # The callback receives an absolute value; tqdm wants the delta.
            nonlocal reported
            delta = percent - reported
            reported = percent
            pbar.update(delta)

        output.save(outpath, progress=update)
def test_repr_circular(self):
    """``repr`` of a self-referencing object must terminate and use ``.get_object``."""
    with pikepdf.new() as pdf:
        root = pdf.Root
        root.Circular = pdf.make_indirect(Dictionary())
        root.Circular.Parent = pdf.make_indirect(Dictionary())
        # Re-point Parent at the object itself to close the cycle.
        root.Circular.Parent = pdf.make_indirect(root.Circular)
        text = repr(root.Circular)
        assert '.get_object' in text
def test_stream_no_dangling_stream_on_failure():
    """A failed ``Stream`` construction must not add an object to the PDF."""
    pdf = pikepdf.new()
    objects_before = len(pdf.objects)

    # A list is not a mapping, so building the stream dictionary fails.
    with pytest.raises(AttributeError):
        Stream(pdf, b'3.14159', ['Not a mapping object'])

    assert len(pdf.objects) == objects_before, "A dangling object was created"
def test_stream_bad_params():
    """Constructing a ``Stream`` without data raises a ``TypeError`` naming 'data'."""
    owner = pikepdf.new()
    with pytest.raises(TypeError, match='data'):
        Stream(owner)
def test_identical_streams_equal(self):
    """A stream equals itself and differs from an unrelated stream."""
    pdf = pikepdf.new()
    numbers = Stream(pdf, b'12345', One=1, Two=2)
    other = Stream(pdf, b'67890', {'/Three': 3, '/Four': 4})

    # Deliberate self-comparison: exercises the equality operator itself.
    assert numbers == numbers
    assert numbers != other
def abcxyz_stream():
    """Return a ``Stream`` holding ``b'abcxyz'`` in a newly created PDF."""
    owner = pikepdf.new()
    return Stream(owner, b'abcxyz')
def test_stream_isinstance():
    """A ``Stream`` instance is both a ``Stream`` and an ``Object``."""
    owner = pikepdf.new()
    candidate = Stream(owner, b'xyz')
    for expected_type in (Stream, Object):
        assert isinstance(candidate, expected_type)
def stream_object():
    """Return an empty ``Stream`` owned by a newly created PDF."""
    owner = pikepdf.new()
    empty = Stream(owner, b'')
    return empty
def test_devicen():
    """Build DeviceN images (plain and indexed) and check how PdfImage sees them."""
    # Manually construct a 2"x1" document with a DeviceN
    # colorspace that defines a single "spot" color channel named
    # "Black". Define a conversion to standard CMYK that assigns
    # C=0 M=0 Y=0 and lets black through. The result should appear as a
    # gradient from white (top left) to black (bottom right) in the
    # left cell, and black to white in the right cell.
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(144, 72))

    # Postscript function to map X -> CMYK={0, 0, 0, X}
    # Explanation:
    #   X is implicitly on the stack
    #   0 0 0 <- load three zeros on to stack
    #       stack contains: X 0 0 0
    #   4 -1 roll <- roll stack 4 elements -1 times, meaning the order is reversed
    #       stack contains: 0 0 0 X
    # pikepdf currently does not interpret tint transformation functions. This
    # is done so that the output test file can be checked in a PDF viewer.
    tint_transform_k_to_cmyk = b'{0 0 0 4 -1 roll}'
    tint_function = Stream(
        pdf,
        tint_transform_k_to_cmyk,
        FunctionType=4,
        Domain=[0.0, 1.0],
        Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
    )
    cs = Array(
        [Name.DeviceN, Array([Name.Black]), Name.DeviceCMYK, tint_function]
    )

    def check_pim(imobj, idx):
        # Shared expectations for both images; idx says whether it is indexed.
        pim = pikepdf.PdfImage(imobj)
        assert pim.mode == 'DeviceN'
        assert pim.is_device_n
        assert not pim.is_separation
        assert pim.indexed == idx
        assert repr(pim)
        with pytest.raises(pikepdf.models.image.HifiPrintImageNotTranscodableError):
            pim.extract_to(stream=BytesIO())

    ramp = bytes(range(0, 256))  # 16x16 samples, one byte each

    imobj0 = Stream(
        pdf,
        ramp,
        BitsPerComponent=8,
        ColorSpace=cs,
        Width=16,
        Height=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    check_pim(imobj0, idx=False)

    reversed_palette = bytes(range(255, -1, -1))
    imobj1 = Stream(
        pdf,
        ramp,
        BitsPerComponent=8,
        ColorSpace=Array([Name.Indexed, cs, 255, reversed_palette]),
        Width=16,
        Height=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    check_pim(imobj1, idx=True)

    page = pdf.pages[0]
    page.Contents = Stream(
        pdf, b'72 0 0 72 0 0 cm /Im0 Do 1 0 0 1 1 0 cm /Im1 Do'
    )
    page.Resources = Dictionary(XObject=Dictionary(Im0=imobj0, Im1=imobj1))
def test_random_image(bpc, width, height, colorspace, imbytes, tmp_path_factory):
    """Extract a randomly parametrized image and compare it with ``pdfimages``.

    :param bpc: bits per component of the image
    :param width: image width in pixels
    :param height: image height in pixels
    :param colorspace: colorspace name placed in the image dictionary
    :param imbytes: raw image data; zero-padded when shorter than width*height
    :param tmp_path_factory: factory providing the output directory
    """
    pdf = pikepdf.new()
    pdfw = 18 * width
    pdfh = 18 * height
    pdf.add_blank_page(page_size=(pdfw, pdfh))

    if len(imbytes) < width * height:
        bytes_per_sample = 2 if bpc == 16 else 1
        imbytes = imbytes + bytes(width * height * bytes_per_sample)

    imobj = Stream(
        pdf,
        imbytes,
        BitsPerComponent=bpc,
        ColorSpace=colorspace,
        Width=width,
        Height=height,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    page = pdf.pages[0]
    page.Contents = Stream(pdf, b'%f 0 0 %f 0 0 cm /Im0 Do' % (pdfw, pdfh))
    page.Resources = Dictionary(XObject=Dictionary(Im0=imobj))

    pim = PdfImage(page.Resources.XObject.Im0)
    bio = BytesIO()

    # Several failure modes are legitimate for random input; each handler
    # returns for the tolerated cases and re-raises anything else.
    try:
        result_extension = pim.extract_to(stream=bio)
        assert result_extension in ('.png', '.tiff')
    except ValueError as exc:
        message = str(exc)
        if 'not enough image data' in message:
            return
        if 'buffer is not large enough' in message:
            if colorspace == Name.DeviceCMYK:
                ncomps = 4
            elif colorspace == Name.DeviceRGB:
                ncomps = 3
            else:
                ncomps = 1
            # The complaint is only valid if the data really is too short.
            assert ceil(bpc / 8) * width * height * ncomps > len(imbytes)
            return
        raise
    except PIL.UnidentifiedImageError:
        if len(imbytes) == 0:
            return
        raise
    except UnsupportedImageTypeError:
        if (colorspace in (Name.DeviceRGB, Name.DeviceCMYK) and bpc < 8) or bpc == 16:
            return
        raise

    bio.seek(0)
    im = Image.open(bio)
    assert im.mode == pim.mode
    assert im.size == pim.size

    outprefix = f'{width}x{height}x{im.mode}-'
    tmpdir = tmp_path_factory.mktemp(outprefix)
    pdf.save(tmpdir / 'pdf.pdf')

    # We don't have convenient CMYK checking tools
    if im.mode == 'CMYK':
        return

    im.save(tmpdir / 'pikepdf.png')
    Path(tmpdir / 'imbytes.bin').write_bytes(imbytes)

    command = [
        'pdfimages',
        '-png',
        fspath('pdf.pdf'),
        fspath('pdfimage'),  # omit suffix
    ]
    run(command, cwd=fspath(tmpdir), check=True)

    outpng = tmpdir / 'pdfimage-000.png'
    assert outpng.exists()
    im_roundtrip = Image.open(outpng)
    assert im.size == im_roundtrip.size

    # An empty bounding box of the difference means the images are identical.
    diff = ImageChops.difference(im, im_roundtrip)
    assert not diff.getbbox()
def test_add_unowned_page():  # issue 174
    """Appending a page dictionary not yet owned by any PDF must not raise."""
    pdf = pikepdf.new()
    unowned_page = pikepdf.Dictionary(Type=pikepdf.Name.Page)
    pdf.pages.append(unowned_page)
def test_closed_anon_pdf():
    """``filename`` of an in-memory PDF must change once the PDF is closed."""
    pdf = pikepdf.new()
    description_while_open = pdf.filename
    pdf.close()
    description_after_close = pdf.filename
    assert description_after_close != description_while_open
import pikepdf

# Script: replace one page of the destination PDF with one page taken from
# the source PDF and save the result under temp_folder.

input_path_source = r'/home/jfgcisneros/Desktop/DC-260-D91FC21183_all.pdf'
input_path_destination = r'/home/jfgcisneros/Desktop/G-28.pdf'

# Page numbers are 1-based here; they are converted to 0-based indexes below.
page_to_replace_source = 4
page_to_be_replaced_destination = 3

# Not used by the script at present.
output_path = r'/home/jfgcisneros/Desktop/test.pdf'
temp_folder = r'/home/jfgcisneros/Desktop/temp/'

filename = 'G-28_Signed.pdf'
path = temp_folder + filename

# Keep the source, the intermediate copy and the destination open until the
# result has been written, so the copied page is never read from a closed PDF.
with pikepdf.open(input_path_source) as pdf_source, \
        pikepdf.new() as dst, \
        pikepdf.open(input_path_destination) as pdf_destination:
    dst.pages.append(pdf_source.pages[page_to_replace_source - 1])
    pdf_destination.pages[page_to_be_replaced_destination - 1] = dst.pages[0]
    pdf_destination.save(path)
def test_new(outdir):
    """A freshly created, empty PDF can be saved.

    :param outdir: directory (path-like supporting ``/``) receiving the file
    """
    target = outdir / 'new-empty.pdf'
    empty_pdf = pikepdf.new()
    empty_pdf.save(target)
def test_separation():
    """Build Separation images (plain and indexed) and check how PdfImage sees them."""
    # Manually construct a 2"x1" document with a Separation
    # colorspace that defines a single "spot" color channel named
    # "LogoGreen". Define a conversion to standard CMYK that assigns
    # CMYK equivalents. Copied example from PDF RM.
    # LogoGreen is a teal-ish green. First panel is white to full green,
    # second is green to full white. RGB ~= (31, 202, 113)
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(144, 72))

    # pikepdf does not interpret this - it is for the PDF viewer
    # Explanation:
    #   X is implicitly loaded to stack
    #   dup: X X
    #   0.84 mul: X 0.84X
    #   exch: 0.84X X
    #   0.00: 0.84X X 0.00
    #   exch: 0.84X 0.00 X
    #   dup: 0.84X 0.00 X X
    #   0.44 mul: 0.84X 0.00 X 0.44X
    #   exch: 0.84X 0.00 0.44X X
    #   0.21mul: 0.84X 0.00 0.44X 0.21X
    # X -> {0.84X, 0, 0.44X, 0.21X}
    tint_transform_logogreen_to_cmyk = b''' { dup 0.84 mul exch 0.00 exch dup 0.44 mul exch 0.21 mul } '''
    tint_function = Stream(
        pdf,
        tint_transform_logogreen_to_cmyk,
        FunctionType=4,
        Domain=[0.0, 1.0],
        Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
    )
    cs = Array(
        [Name.Separation, Name.LogoGreen, Name.DeviceCMYK, tint_function]
    )

    def check_pim(imobj, idx):
        # Shared expectations for both images; idx says whether it is indexed.
        pim = pikepdf.PdfImage(imobj)
        assert pim.mode == 'Separation'
        assert pim.is_separation
        assert not pim.is_device_n
        assert pim.indexed == idx
        assert repr(pim)
        with pytest.raises(pikepdf.models.image.HifiPrintImageNotTranscodableError):
            pim.extract_to(stream=BytesIO())

    ramp = bytes(range(0, 256))  # 16x16 samples, one byte each

    imobj0 = Stream(
        pdf,
        ramp,
        BitsPerComponent=8,
        ColorSpace=cs,
        Width=16,
        Height=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    check_pim(imobj0, idx=False)

    reversed_palette = bytes(range(255, -1, -1))
    imobj1 = Stream(
        pdf,
        ramp,
        BitsPerComponent=8,
        ColorSpace=Array([Name.Indexed, cs, 255, reversed_palette]),
        Width=16,
        Height=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    check_pim(imobj1, idx=True)

    page = pdf.pages[0]
    page.Contents = Stream(
        pdf, b'72 0 0 72 0 0 cm /Im0 Do 1 0 0 1 1 0 cm /Im1 Do'
    )
    page.Resources = Dictionary(XObject=Dictionary(Im0=imobj0, Im1=imobj1))