Пример #1
0
    def testFileLoad(self):
        """
        Load ``crazyones.pdf``, extract the text of its first page and compare
        it against the reference text stored in ``crazyones.txt``.

        The extracted text has its newlines removed and is encoded as UTF-8
        before being compared with the raw bytes of the reference file.
        Expected outcome: the file loads and both byte strings are equal.
        """
        pdfPath = join(TEST_DATA_ROOT, 'crazyones.pdf')
        expectedPath = join(self.localDataRoot, 'crazyones.txt')
        failureTemplate = (
            'PDF extracted text differs from expected value.'
            '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
        )

        with open(pdfPath, 'rb') as pdfStream:
            # Parse the PDF straight from the open binary stream
            reader = PdfFileReader(pdfStream)
            firstPage = reader.getPage(0)

            # Reference text, read as bytes
            with open(expectedPath, 'rb') as expectedStream:
                expected = expectedStream.read()

            rawText = firstPage.extractText()
            extracted = rawText.replace('\n', '').encode('utf-8')

            # Compare the extracted text to the known-good source
            self.assertEqual(
                expected,
                extracted,
                msg=failureTemplate % (expected, extracted))

            reader.close()
Пример #2
0
    def testJpegImage(self):
        """
        Load ``jpeg.pdf``, fetch the data of the ``/Im4`` XObject found in the
        resources of its first page and compare the hexadecimal rendering of
        that data against the contents of ``jpeg.txt``. Expected outcome: the
        file loads and the hex strings are equal.
        """
        with open(join(TEST_DATA_ROOT, 'jpeg.pdf'), 'rb') as pdfStream:
            # Parse the PDF straight from the open binary stream
            reader = PdfFileReader(pdfStream)

            # Reference hexadecimal dump of the image, read as text
            with open(join(self.localDataRoot, 'jpeg.txt'),
                      'r') as hexFile:
                expectedHex = hexFile.read()

            firstPage = reader.getPage(0)
            resources = firstPage['/Resources']
            xObjects = resources['/XObject'].getObject()
            imageData = xObjects['/Im4'].getData()
            # Hex-encode once; used both for the comparison and the message
            actualHex = binascii.hexlify(imageData).decode()

            # Compare the extracted image data to the known-good source
            self.assertEqual(
                actualHex,
                expectedHex,
                msg='PDF extracted image differs from expected value.'
                '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n' %
                (expectedHex, actualHex))

            reader.close()
Пример #3
0
    def testXRefTableObjects(self):
        """
        Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect
        references from the XRef-Table *only* have been loaded as expected.
        Objects from the free entries list are included as well in the test.

        This case tests the part of ``PdfFileReader.objects()`` responsible for
        generating the Cross-Reference Table entries too.
        """
        self.maxDiff = None
        inputFiles = ("jpeg.pdf", "Seige_of_Vicksburg_Sample_OCR.pdf",
                      "SF424_page2.pdf")

        for filename in inputFiles:
            filepath = join(TEST_DATA_ROOT, filename)
            xtablepath = join(self.localDataRoot, filename)
            r = PdfFileReader(filepath)

            try:
                # (id, gen, byte offset) tuples, as seen by the reader
                actualItems = sorted(
                    (ref.idnum, ref.generation,
                     r._xrefTable[ref.generation][ref.idnum][0])
                    for ref in r.objects(PdfFileReader.R_XTABLE, True)
                )
            finally:
                # Release the reader even if iterating or the lookup fails
                r.close()

            # We artificially read the XRef Table entries that we know belong
            # to filepath; they have the same (id, gen, byte offset) layout.
            expItems = sorted(self._parseXRefTable(xtablepath, (0, 1, 2)))

            self.assertListEqual(expItems, actualItems,
                                 "Differences found in " + filename)
Пример #4
0
    def testIsObjectFree(self):
        """
        Tests the ``PdfFileReader.isObjectFree()`` method.

        For every reference yielded by ``PdfFileReader.objects()`` for the
        XRef Table, the ``(id, generation, isObjectFree)`` triple is compared
        against the entries parsed by ``_parseXRefTable()`` from the fixture
        of the same name.
        """
        # TO-DO Find PDF files that feature free-entry lists. We are checking
        # isObjectFree() only against used items.
        pdfNames = (
            "jpeg.pdf",
            "Seige_of_Vicksburg_Sample_OCR.pdf",
            "SF424_page2.pdf",
        )

        for pdfName in pdfNames:
            xtablePath = join(self.localDataRoot, pdfName)
            reader = PdfFileReader(join(TEST_DATA_ROOT, pdfName))
            expected = self._parseXRefTable(xtablePath, (0, 1, 3))
            observed = [
                # This is where isObjectFree() gets invoked
                (ref.idnum, ref.generation, reader.isObjectFree(ref))
                for ref in reader.objects(PdfFileReader.R_XTABLE, True)
            ]

            reader.close()

            self.assertListEqual(sorted(expected), sorted(observed))
Пример #5
0
    def testObjectIds(self):
        """
        Tests ``ObjectStream.objectIds``.

        For each known Object Stream, the object is fetched through an
        ``IndirectObject`` reference and its ``objectIds`` are compared, in
        order, against the expected sequence of IDs.
        """
        expResults = (
            (8, 3, 10, 2, 1, 11, 13, 15, 4, 19, 5, 20, 6, 21, 17),
            (644, 642, 646, 647, 648, 122, 119, 120, 121, 124, 179, 232, 327,
             467, 478, 519, 568, 573, 580, 586, 592, 598, 603, 611, 616, 623,
             629, 634),
        )
        # Files we know to have Object Streams within
        inputData = (
            # (filename, id, generation number)
            ("crazyones.pdf", 9, 0),
            ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", 645, 0),
        )

        for expIds, (filename, idnum, generation) in zip(expResults, inputData):
            # Same fixture root as the other reader tests in this file
            filepath = join(TEST_DATA_ROOT, filename)
            r = PdfFileReader(filepath)
            ref = IndirectObject(idnum, generation, r)
            objStm = r.getObject(ref)

            r.close()

            self.assertIsInstance(objStm, ObjectStream)
            self.assertTupleEqual(tuple(expIds), tuple(objStm.objectIds))
Пример #6
0
    def testReadXRefStreamCompressedObjects(self):
        """
        Targets the same objects as ``testXRefStreamObjects()``, but rather
        than checking that the list of items read equals the expected one, it
        verifies that the *contents* of the objects are identical.

        This method does **not** test ``PdfFileReader.objects()`` as two of the
        previous test cases did.
        """
        self.maxDiff = None

        for pdfName in ("crazyones.pdf", ):
            fixturePath = join(self.localDataRoot, pdfName)
            reader = PdfFileReader(join(TEST_DATA_ROOT, pdfName))
            # Both lists hold (object ID, PdfObject) tuples
            expected = []

            with open(fixturePath, "rb") as fixture:
                for row in fixture:
                    # Only process non-blank lines that are not "%" comments
                    if row and not row.isspace() and not row.startswith(b"%"):
                        rawId, rawOffset, rawObject = row.split(b" ", 2)
                        objectId = int(rawId)
                        # The offset is not compared, but it must be an integer
                        int(rawOffset)

                        with BytesIO(rawObject) as objectStream:
                            parsed = readObject(objectStream, reader)

                        expected.append((objectId, parsed))

            # We deal exclusively with compressed objects (type 2, from Table
            # 18 of ISO 32000 reference, 2008) whose generation number is 0
            observed = [
                (objectId, IndirectObject(objectId, 0, reader).getObject())
                for objectId, entry in reader._xrefStm.items()
                if entry[0] == 2
            ]

            reader.close()
            # Order both lists by object ID only
            expected.sort(key=lambda pair: pair[0])
            observed.sort(key=lambda pair: pair[0])

            self.assertListEqual(expected, observed)
Пример #7
0
    def testXTableAgainstXStream(self):
        """
        In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That
        Do Not Support Compressed Reference Streams", the standard describes a
        means of crafting PDF files designed for versions 1.5+ that can be
        opened nevertheless by readers that support older versions.

        This test case verifies that all the items hidden by the XRef Table in
        non-conforming readers are *all and exactly* loaded into the XRef
        Stream by readers that support PDF 1.5+.
        """
        self.maxDiff = None
        # TO-DO Possibly add a few other files to this test case
        inputFiles = ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", )

        for filename in inputFiles:
            filepath = join(self.localDataRoot, filename)
            # Maps item ID -> remaining parsed columns of its XRef Table entry
            expItems = {
                e[0]: e[1:]
                for e in self._parseXRefTable(filepath, (0, 2, 3))
            }
            r = PdfFileReader(join(TEST_DATA_ROOT, filename))

            try:
                actualItems = sorted(
                    r.objects(PdfFileReader.R_XSTREAM, True),
                    key=lambda e: e.idnum,
                )
                expKeys = sorted(expItems)
                actualKeys = [a.idnum for a in actualItems]

                self.assertListEqual(expKeys, actualKeys,
                                     "Lists of item IDs are not identical")

                for e, a in zip(expKeys, actualItems):
                    self.assertEqual(
                        e, a.idnum, "Items ID does not correspond")

                    # If an item is a compressed object (type 2) in the XRef
                    # Stream, ensure then that it is marked free in the XRef
                    # Table.
                    if r._xrefStm[a.idnum][0] in (2, ):
                        self.assertTrue(
                            expItems[e][-1],
                            "Item %d should be hid by the XRef Table, but it "
                            "was not." % e,
                        )
            finally:
                # Close only after the last access to the reader's state, and
                # do so even when an assertion above fails.
                r.close()
Пример #8
0
    def testXRefStreamObjects(self):
        """
        Like ``PdfReaderTestCases.testXRefTableObjects()``, except that it
        tests objects referenced by the Cross-Reference Stream.
        The second part of ``PdfFileReader.objects()`` (dealing with XStream
        objects) is invoked and implicitly tested.
        """
        for pdfName in ("crazyones.pdf", ):
            fixturePath = join(self.localDataRoot, pdfName)
            reader = PdfFileReader(join(TEST_DATA_ROOT, pdfName))
            # Two lists of three-field tuples as explained by Table 18
            expected = []
            observed = []

            with open(fixturePath, "r") as fixture:
                for row in fixture:
                    # Only process non-blank lines that are not "%" comments
                    if row and not row.isspace() and not row.startswith("%"):
                        entryType, second, third = map(int, row.split())
                        expected.append((entryType, second, third))

            for ref in reader.objects(PdfFileReader.R_XSTREAM, True):
                entry = reader._xrefStm[ref.idnum]
                entryType = entry[0]

                if entryType in {0, 1}:
                    # Third field must match the reference's generation
                    self.assertEqual(entry[2], ref.generation)
                elif entryType == 2:
                    # Type 2 entries are expected to have generation 0
                    self.assertEqual(ref.generation, 0)

                observed.append(entry)

            reader.close()
            observed.sort()
            expected.sort()

            self.assertListEqual(
                expected,
                observed,
                "Didn't correctly read the Cross-Reference Stream",
            )
Пример #9
0
def main():
    """
    Demonstrate a handful of PdfFileReader/PdfFileWriter features.

    Reads the PDF given as first command-line argument (which must have at
    least five pages) and writes a new document to the path given as second
    argument, or to ``PyPDF-Features-Output.pdf`` when none is given. The
    output contains page 1 unchanged, page 2 rotated clockwise, page 3 rotated
    counter-clockwise, page 4 merged with a watermark page, and page 5 with
    its media box halved; it also carries a print-on-open Javascript action
    and is encrypted.

    Exits with status 0 after printing the usage text when a help flag is
    given, and with status 1 when no input path is given or the input has too
    few pages.
    """
    pagesRequired = 5
    output = "PyPDF-Features-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    elif len(argv) < 2:
        print(USAGE)
        exit(1)

    inputpath = argv[1].strip()
    filename = basename(inputpath)

    if len(argv) > 2:
        output = argv[2].strip()

    inputStream = open(inputpath, "rb")
    watermarkStream = None
    reader = None
    writer = None
    watermark = None

    try:
        # We can instantiate a PdfFileReader/Writer by giving in a stream
        # object or a path string
        reader = PdfFileReader(inputStream)
        writer = PdfFileWriter(output)

        # Check that the PDF file has the required number of pages
        if reader.numPages < pagesRequired:
            print(
                "We require a document with %d pages at least, %s has %d"
                % (pagesRequired, filename, reader.numPages),
                file=stderr,
            )
            exit(1)
        else:
            print("'%s' has %d pages... OK" % (filename, reader.numPages))

        # Add page 1 from reader to output document, unchanged
        writer.addPage(reader.getPage(0))

        # Add page 2 from reader, but rotated clockwise 90 degrees
        writer.addPage(reader.getPage(1).rotateClockwise(90))

        # Add page 3 from reader, rotated the other way:
        writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
        # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))

        # Add page 4 from reader, but first add a watermark from another PDF.
        # The watermark source stays open until the document is written.
        page4 = reader.getPage(3)
        watermarkStream = open(
            join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf"), "rb"
        )
        watermark = PdfFileReader(watermarkStream)
        page4.mergePage(watermark.getPage(0))
        writer.addPage(page4)

        # Add page 5 from reader, but crop it to half size:
        page5 = reader.getPage(4)
        page5.mediaBox.upperRight = (
            page5.mediaBox.getUpperRight_x() / 2,
            page5.mediaBox.getUpperRight_y() / 2,
        )
        writer.addPage(page5)

        # Add some Javascript to launch the print window on opening this PDF.
        # The password dialog may prevent the print dialog from being shown.
        # Comment the encrypted lines, if that's the case, to try this out
        writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

        # Encrypt your new PDF and add a password
        password = "******"
        writer.encrypt(password)

        # Finally, write the resulting PDF document to ``output``
        writer.write()

        print("Output successfully written to", output)
    finally:
        # Release every handle, also on the early exit(1) above or on error.
        # Closing an already-closed file object is a no-op, so it is safe to
        # close the raw streams whether or not the readers closed them.
        if reader is not None:
            reader.close()
        if writer is not None:
            writer.close()
        if watermark is not None:
            watermark.close()
        if watermarkStream is not None:
            watermarkStream.close()
        inputStream.close()