def test_remove_text(input_path, ignore_byte_string_object):
    """
    Check that text can be removed from a page and the result written out.

    The first page of ``input_path`` (resolved against ``RESOURCE_ROOT``) is
    inserted into a fresh writer, ``remove_text`` is applied and the document
    is written to a scratch file. Nothing is asserted about the output; the
    test only checks that no exception is raised.

    ``input_path`` is a file name relative to ``RESOURCE_ROOT``;
    ``ignore_byte_string_object`` is forwarded unchanged to
    ``PdfWriter.remove_text``. Both are supplied by pytest (the
    parametrization is not visible in this part of the file).
    """
    pdf_path = os.path.join(RESOURCE_ROOT, input_path)

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    page = reader.pages[0]
    writer.insert_page(page, 0)
    writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)

    # Write the result to a scratch file in the current working directory.
    tmp_filename = "dont_commit_writer_removed_text.pdf"
    try:
        with open(tmp_filename, "wb") as output_stream:
            writer.write(output_stream)
    finally:
        # Clean up even when writing fails, so no stray PDF is left behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
def test_remove_images(input_path, ignore_byte_string_object):
    """
    Check that images can be removed from a page and the result re-read.

    The first page of ``input_path`` (resolved against ``RESOURCE_ROOT``) is
    inserted into a fresh writer, ``remove_images`` is applied and the
    document is written to a scratch file. The file is then opened again with
    ``PdfReader``; for ``side-by-side-subfig.pdf`` the test additionally
    asserts that the page text is still extractable after image removal.

    ``input_path`` is a file name relative to ``RESOURCE_ROOT``;
    ``ignore_byte_string_object`` is forwarded unchanged to
    ``PdfWriter.remove_images``. Both are supplied by pytest (the
    parametrization is not visible in this part of the file).
    """
    pdf_path = os.path.join(RESOURCE_ROOT, input_path)

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    page = reader.pages[0]
    writer.insert_page(page, 0)
    writer.remove_images(ignore_byte_string_object=ignore_byte_string_object)

    # Write the result to a scratch file in the current working directory.
    tmp_filename = "dont_commit_writer_removed_image.pdf"
    try:
        with open(tmp_filename, "wb") as output_stream:
            writer.write(output_stream)

        # Read the file back to make sure the written document is parseable.
        with open(tmp_filename, "rb") as input_stream:
            reader = PdfReader(input_stream)
            if input_path == "side-by-side-subfig.pdf":
                extracted_text = reader.pages[0].extract_text()
                assert "Lorem ipsum dolor sit amet" in extracted_text
    finally:
        # Clean up even when writing or the assertion fails, so no stray PDF
        # is left behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
def test_writer_operations():
    """
    This test just checks if the operation throws an exception.

    This should be done way more thoroughly: It should be checked if the
    output is as expected.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    pdf_outline_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")

    reader = PdfReader(pdf_path)
    reader_outline = PdfReader(pdf_outline_path)

    writer = PdfWriter()
    page = reader.pages[0]

    # A blank page without explicit dimensions cannot be added to an empty
    # writer; the raised error is expected to carry no arguments.
    with pytest.raises(PageSizeNotDefinedError) as exc:
        writer.add_blank_page()
    assert exc.value.args == ()

    writer.insert_page(page, 1)
    writer.insert_page(reader_outline.pages[0], 0)
    writer.add_bookmark_destination(page)
    writer.remove_links()
    writer.add_bookmark_destination(page)

    # One parent bookmark, then one child per fit type.
    bm = writer.add_bookmark(
        "A bookmark", 0, None, (255, 0, 15), True, True, "/FitBV", 10
    )
    writer.add_bookmark(
        "The XYZ fit", 0, bm, (255, 0, 15), True, True, "/XYZ", 10, 20, 3
    )
    writer.add_bookmark(
        "The FitH fit", 0, bm, (255, 0, 15), True, True, "/FitH", 10
    )
    writer.add_bookmark(
        "The FitV fit", 0, bm, (255, 0, 15), True, True, "/FitV", 10
    )
    writer.add_bookmark(
        "The FitR fit", 0, bm, (255, 0, 15), True, True, "/FitR", 10, 20, 30, 40
    )
    writer.add_bookmark("The FitB fit", 0, bm, (255, 0, 15), True, True, "/FitB")
    writer.add_bookmark(
        "The FitBH fit", 0, bm, (255, 0, 15), True, True, "/FitBH", 10
    )
    writer.add_bookmark(
        "The FitBV fit", 0, bm, (255, 0, 15), True, True, "/FitBV", 10
    )

    # Pages already exist now, so a blank page without dimensions is accepted.
    writer.add_blank_page()
    writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100]))
    writer.add_link(2, 1, RectangleObject([0, 0, 100, 100]))

    assert writer._get_page_layout() is None
    writer._set_page_layout("/SinglePage")
    assert writer._get_page_layout() == "/SinglePage"

    assert writer._get_page_mode() is None
    writer.set_page_mode("/UseNone")
    assert writer._get_page_mode() == "/UseNone"

    writer.insert_blank_page(width=100, height=100)
    writer.insert_blank_page()  # without parameters

    # TODO: This gives "KeyError: '/Contents'" - is that a bug?
    # writer.removeImages()

    writer.add_metadata({"author": "Martin Thoma"})

    writer.add_attachment("foobar.gif", b"foobarcontent")

    # Write the result to a scratch file in the current working directory.
    tmp_path = "dont_commit_writer.pdf"
    try:
        with open(tmp_path, "wb") as output_stream:
            writer.write(output_stream)
    finally:
        # Clean up even when writing fails, so no stray PDF is left behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def test_remove_text_all_operators(ignore_byte_string_object):
    """
    Check ``remove_text`` on a content stream using every text-showing form.

    A minimal single-page PDF is assembled in memory whose content stream
    uses the ``Tj``, ``"`` and ``'`` operators. The page is inserted into a
    fresh writer, ``remove_text`` is applied and the document is written to a
    scratch file. Nothing is asserted about the output; the test only checks
    that no exception is raised.

    ``ignore_byte_string_object`` is forwarded unchanged to
    ``PdfWriter.remove_text`` and is supplied by pytest (the parametrization
    is not visible in this part of the file).
    """
    stream = (
        b"BT "
        b"/F0 36 Tf "
        b"50 706 Td "
        b"36 TL "
        b"(The Tj operator) Tj "
        b'1 2 (The double quote operator) " '
        b"(The single quote operator) ' "
        b"ET"
    )
    # Template for the whole file; the remaining %-placeholders (stream
    # length, xref offsets, startxref) are filled in below.
    pdf_data = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Length %d >>\n"
        b"stream\n" + (b"%s\n" % stream) + b"endstream\n"
        b"endobj\n"
        b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 6\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Root 6 0 R /Size 6 >>\n"
        b"startxref\n%d\n"
        b"%%%%EOF"
    )
    # Offsets are looked up in the *unformatted* template. Its leading "%%"
    # collapses to a single "%" once %-formatting is applied, which moves
    # everything after it one byte to the left; hence the -1 correction.
    # NOTE(review): the startxref value below is not corrected, and the
    # expanding placeholders shift later offsets again, so the table is only
    # approximate - presumably why the reader is opened with strict=False.
    startx_correction = -1
    pdf_data = pdf_data % (
        len(stream),
        pdf_data.find(b"1 0 obj") + startx_correction,
        pdf_data.find(b"2 0 obj") + startx_correction,
        pdf_data.find(b"3 0 obj") + startx_correction,
        pdf_data.find(b"4 0 obj") + startx_correction,
        pdf_data.find(b"5 0 obj") + startx_correction,
        pdf_data.find(b"6 0 obj") + startx_correction,
        pdf_data.find(b"xref"),
    )
    # Dump the generated document to stdout.
    print(pdf_data.decode())
    pdf_stream = BytesIO(pdf_data)

    reader = PdfReader(pdf_stream, strict=False)
    writer = PdfWriter()

    page = reader.pages[0]
    writer.insert_page(page, 0)
    writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)

    # Write the result to a scratch file in the current working directory.
    tmp_filename = "dont_commit_writer_removed_text.pdf"
    try:
        with open(tmp_filename, "wb") as output_stream:
            writer.write(output_stream)
    finally:
        # Clean up even when writing fails, so no stray PDF is left behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)