def testFileLoad(self):
    """
    Test loading and parsing of a file. Extract the text of the first page
    of ``crazyones.pdf`` and compare it to the expected output stored in
    ``crazyones.txt``.

    Expected outcome: file loads, text matches expected.
    """
    with open(join(TEST_DATA_ROOT, 'crazyones.pdf'), 'rb') as inputfile:
        # Load PDF file from file
        r = PdfFileReader(inputfile)

        # Release the reader even if extraction or the assertion fails;
        # otherwise a failing test would skip the close() call entirely.
        try:
            page1 = r.getPage(0)

            # Retrieve the expected text; read as bytes so that it can be
            # compared against the UTF-8 encoded extraction below.
            with open(
                join(self.localDataRoot, 'crazyones.txt'), 'rb'
            ) as pdftextFile:
                pdftext = pdftextFile.read()

            # Newlines are stripped from the extracted text before encoding
            page1Text = (
                page1.extractText().replace('\n', '').encode('utf-8')
            )

            # Compare the text of the PDF to a known source
            self.assertEqual(
                pdftext, page1Text,
                msg='PDF extracted text differs from expected value.'
                    '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                    % (pdftext, page1Text)
            )
        finally:
            r.close()
def testJpegImage(self):
    """
    Test loading and parsing of a file. Extract the ``/Im4`` image object
    from the first page of ``jpeg.pdf`` and compare its hex-encoded data to
    the expected text stored in ``jpeg.txt``.

    Expected outcome: file loads, image matches expected.
    """
    with open(join(TEST_DATA_ROOT, 'jpeg.pdf'), 'rb') as inputfile:
        # Load PDF file from file
        r = PdfFileReader(inputfile)

        # Release the reader even if extraction or the assertion fails;
        # otherwise a failing test would skip the close() call entirely.
        try:
            # Retrieve the expected hex text of the image
            with open(
                join(self.localDataRoot, 'jpeg.txt'), 'r'
            ) as pdftextFile:
                imagetext = pdftextFile.read()

            page1 = r.getPage(0)
            xObject = page1['/Resources']['/XObject'].getObject()
            data = xObject['/Im4'].getData()

            # Hex-encode once; the value is needed both for the comparison
            # and for the failure message.
            extractedtext = binascii.hexlify(data).decode()

            # Compare the image data of the PDF to a known source
            self.assertEqual(
                extractedtext, imagetext,
                msg='PDF extracted image differs from expected value.'
                    '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                    % (imagetext, extractedtext)
            )
        finally:
            r.close()
def main():
    """
    Demonstrate a handful of reader/writer features on a user-supplied PDF.

    Command line: ``argv[1]`` is the input PDF path (required) and
    ``argv[2]`` is an optional output path (defaults to
    ``PyPDF-Features-Output.pdf``). If any of the ``FLAG_HELP`` flags is
    present, the usage text is printed and the program exits with status 0;
    if no input path is given, usage is printed and the exit status is 1.

    The input must have at least five pages. The output document contains
    page 1 unchanged, page 2 rotated clockwise, page 3 rotated
    counter-clockwise, page 4 merged with a watermark page and page 5 with
    its media box halved; a print-on-open Javascript action is added and the
    document is encrypted before it is written.
    """
    pagesRequired = 5
    output = "PyPDF-Features-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    elif len(argv) < 2:
        print(USAGE)
        exit(1)

    inputpath = argv[1].strip()
    filename = basename(inputpath)

    if len(argv) > 2:
        output = argv[2].strip()

    # The input file is opened in a ``with`` block so that its handle is
    # released on every exit path, including ``exit(1)`` and exceptions.
    with open(inputpath, "rb") as inputfile:
        # We can instantiate a PdfFileReader/Writer by giving in a stream
        # object or a path string
        reader = PdfFileReader(inputfile)
        writer = PdfFileWriter(output)

        try:
            # Check that the PDF file has the required number of pages
            if reader.numPages < pagesRequired:
                print(
                    "We require a document with %d pages at least, %s has %d"
                    % (pagesRequired, filename, reader.numPages),
                    file=stderr,
                )
                exit(1)

            print("'%s' has %d pages... OK" % (filename, reader.numPages))

            # Add page 1 from reader to output document, unchanged
            writer.addPage(reader.getPage(0))

            # Add page 2 from reader, but rotated clockwise 90 degrees
            writer.addPage(reader.getPage(1).rotateClockwise(90))

            # Add page 3 from reader, rotated the other way:
            writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
            # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))

            # Add page 4 from reader, but first add a watermark from another
            # PDF. The watermark file stays open until the output has been
            # written, in case the merged page content is read lazily from
            # its stream (NOTE(review): not verifiable from this file).
            page4 = reader.getPage(3)
            watermarkpath = join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf")

            with open(watermarkpath, "rb") as watermarkfile:
                watermark = PdfFileReader(watermarkfile)
                page4.mergePage(watermark.getPage(0))
                writer.addPage(page4)

                # Add page 5 from reader, but crop it to half size:
                page5 = reader.getPage(4)
                page5.mediaBox.upperRight = (
                    page5.mediaBox.getUpperRight_x() / 2,
                    page5.mediaBox.getUpperRight_y() / 2,
                )
                writer.addPage(page5)

                # Add some Javascript to launch the print window on opening
                # this PDF. The password dialog may prevent the print dialog
                # from being shown. Comment out the encryption lines, if
                # that's the case, to try this out.
                writer.addJS(
                    "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
                )

                # Encrypt your new PDF and add a password
                password = "******"
                writer.encrypt(password)

                # Finally, write the resulting PDF document to ``output``
                writer.write()

            print("Output successfully written to", output)
        finally:
            # Always release reader and writer, also on early exit or error
            reader.close()
            writer.close()