示例#1
0
    def getNamedDestRoot(self):
        root = self.getObject(self._root)

        if '/Names' in root and isinstance(root['/Names'], DictionaryObject):
            names = root['/Names']
            idnum = self._objects.index(names) + 1
            namesRef = IndirectObject(idnum, 0, self)
            assert namesRef.getObject() == names
            if '/Dests' in names and isinstance(names['/Dests'],
                                                DictionaryObject):
                dests = names['/Dests']
                idnum = self._objects.index(dests) + 1
                destsRef = IndirectObject(idnum, 0, self)
                assert destsRef.getObject() == dests
                if '/Names' in dests:
                    nd = dests['/Names']
                else:
                    nd = ArrayObject()
                    dests[NameObject('/Names')] = nd
            else:
                dests = DictionaryObject()
                destsRef = self._addObject(dests)
                names[NameObject('/Dests')] = destsRef
                nd = ArrayObject()
                dests[NameObject('/Names')] = nd
        else:
            names = DictionaryObject()
            namesRef = self._addObject(names)
            root[NameObject('/Names')] = namesRef
            dests = DictionaryObject()
            destsRef = self._addObject(dests)
            names[NameObject('/Dests')] = destsRef
            nd = ArrayObject()
            dests[NameObject('/Names')] = nd
        return nd
示例#2
0
 def __init__(self):
     self._header = b_("%PDF-1.3")
     self._objects = []  # array of indirect objects
     # The root of our page tree node.
     pages = DictionaryObject()
     pages.update({
         NameObject("/Type"): NameObject("/Pages"),
         NameObject("/Count"): NumberObject(0),
         NameObject("/Kids"): ArrayObject()
     })
     self._pages = self._addObject(pages)
     # info object
     info = DictionaryObject()
     info.update({
         NameObject("/Producer"):
         createStringObject(
             u"Python PDF Library - http://pybrary.net/pyPdf/")
     })
     self._info = self._addObject(info)
     # root object
     root = DictionaryObject()
     root.update({
         NameObject("/Type"): NameObject("/Catalog"),
         NameObject("/Pages"): self._pages
     })
     self._root = self._addObject(root)
示例#3
0
 def _mergeResources(res1, res2, resource):
     newRes = DictionaryObject()
     newRes.update(res1.get(resource, DictionaryObject()).getObject())
     page2Res = res2.get(resource, DictionaryObject()).getObject()
     renameRes = {}
     for key in page2Res.keys():
         if key in newRes and newRes[key] != page2Res[key]:
             newname = NameObject(key + "renamed")
             renameRes[key] = newname
             newRes[newname] = page2Res[key]
         elif key not in newRes:
             newRes[key] = page2Res.raw_get(key)
     return newRes, renameRes
示例#4
0
 def _readInlineImage(self, stream):
     # begin reading just after the "BI" - begin image
     # first read the dictionary of settings.
     settings = DictionaryObject()
     while True:
         tok = readNonWhitespace(stream)
         stream.seek(-1, 1)
         if tok == "I":
             # "ID" - begin of image data
             break
         key = readObject(stream, self.pdf)
         tok = readNonWhitespace(stream)
         stream.seek(-1, 1)
         value = readObject(stream, self.pdf)
         settings[key] = value
     # left at beginning of ID
     tmp = stream.read(3)
     assert tmp[:2] == "ID"
     data = ""
     while True:
         tok = stream.read(1)
         if tok == "E":
             next = stream.read(1)
             if next == "I":
                 break
             else:
                 stream.seek(-1, 1)
                 data += tok
         else:
             data += tok
     readNonWhitespace(stream)
     stream.seek(-1, 1)
     return {"settings": settings, "data": data}
示例#5
0
    def addBookmark(self, title, pagenum, parent=None):
        """
        Add a bookmark to the pdf, using the specified title and pointing at
        the specified page number. A parent can be specified to make this a
        nested bookmark below the parent.
        """
        pageRef = self.getObject(self._pages)['/Kids'][pagenum]
        action = DictionaryObject()
        action.update({
            NameObject('/D'):
            ArrayObject([pageRef,
                         NameObject('/FitH'),
                         NumberObject(826)]),
            NameObject('/S'):
            NameObject('/GoTo')
        })
        actionRef = self._addObject(action)

        outlineRef = self.getOutlineRoot()
        if parent is None:
            parent = outlineRef
        bookmark = TreeObject()
        bookmark.update({
            NameObject('/A'): actionRef,
            NameObject('/Title'): createStringObject(title)
        })
        bookmarkRef = self._addObject(bookmark)

        parent = parent.getObject()
        parent.addChild(bookmarkRef, self)
        return bookmarkRef
示例#6
0
 def add(self, title, pagenum):
     pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
     action = DictionaryObject()
     action.update({NameObject('/D'): ArrayObject([pageRef,
                                                   NameObject('/FitH'),
                                                   NumberObject(826)]),
                    NameObject('/S'): NameObject('/GoTo')})
     actionRef = self.pdf._addObject(action)
     bookmark = TreeObject()
     bookmark.update({NameObject('/A'): actionRef,
                      NameObject('/Title'): createStringObject(title)})
     self.pdf._addObject(bookmark)
     self.tree.addChild(bookmark)
示例#7
0
    def addNamedDestination(self, title, pagenum):
        pageRef = self.getObject(self._pages)['/Kids'][pagenum]
        dest = DictionaryObject()
        dest.update({
            NameObject('/D'):
            ArrayObject([pageRef,
                         NameObject('/FitH'),
                         NumberObject(826)]),
            NameObject('/S'):
            NameObject('/GoTo')
        })
        destRef = self._addObject(dest)
        nd = self.getNamedDestRoot()

        nd.extend([title, destRef])
        return destRef
示例#8
0
    def createBlankPage(pdf=None, width=None, height=None):
        page = PageObject(pdf)

        # Creates a new page (cf PDF Reference  7.7.3.3)
        page.__setitem__(NameObject('/Type'), NameObject('/Page'))
        page.__setitem__(NameObject('/Parent'), NullObject())
        page.__setitem__(NameObject('/Resources'), DictionaryObject())
        if width is None or height is None:
            if pdf is not None and pdf.getNumPages() > 0:
                lastpage = pdf.getPage(pdf.getNumPages() - 1)
                width = lastpage.mediaBox.getWidth()
                height = lastpage.mediaBox.getHeight()
            else:
                raise utils.PageSizeNotDefinedError()
        page.__setitem__(NameObject('/MediaBox'),
                         RectangleObject([0, 0, width, height]))

        return page
示例#9
0
 def addBookmarkDict(self, bookmark, parent=None):
     bookmarkObj = TreeObject()
     for k, v in bookmark.items():
         bookmarkObj[NameObject(str(k))] = v
     bookmarkObj.update(bookmark)
     if '/A' in bookmark:
         action = DictionaryObject()
         for k, v in bookmark['/A'].items():
             action[NameObject(str(k))] = v
         actionRef = self._addObject(action)
         bookmarkObj['/A'] = actionRef
     bookmarkRef = self._addObject(bookmarkObj)
     outlineRef = self.getOutlineRoot()
     if parent is None:
         parent = outlineRef
     parent = parent.getObject()
     parent.addChild(bookmarkRef, self)
     return bookmarkRef
示例#10
0
 def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
     import time
     import random
     if owner_pwd is None:
         owner_pwd = user_pwd
     if use_128bit:
         V = 2
         rev = 3
         keylen = 128 / 8
     else:
         V = 1
         rev = 2
         keylen = 40 / 8
     # permit everything:
     P = -1
     O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
     ID_1 = md5(repr(time.time())).digest()
     ID_2 = md5(repr(random.random())).digest()
     self._ID = ArrayObject(
         (ByteStringObject(ID_1), ByteStringObject(ID_2)))
     if rev == 2:
         U, key = _alg34(user_pwd, O, P, ID_1)
     else:
         assert rev == 3
         U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
     encrypt = DictionaryObject()
     encrypt[NameObject("/Filter")] = NameObject("/Standard")
     encrypt[NameObject("/V")] = NumberObject(V)
     if V == 2:
         encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
     encrypt[NameObject("/R")] = NumberObject(rev)
     encrypt[NameObject("/O")] = ByteStringObject(O)
     encrypt[NameObject("/U")] = ByteStringObject(U)
     encrypt[NameObject("/P")] = NumberObject(P)
     self._encrypt = self._addObject(encrypt)
     self._encrypt_key = key
示例#11
0
    def _mergePage(self,
                   page2,
                   page2transformation=None,
                   ctm=None,
                   expand=False):
        # First we work on merging the resource dictionaries.  This allows us
        # to find out what symbols in the content streams we might need to
        # rename.
        newResources = DictionaryObject()
        rename = {}
        originalResources = self["/Resources"].getObject()
        page2Resources = page2["/Resources"].getObject()

        for res in [
                "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern",
                "/Shading", "/Properties"
        ]:
            new, newrename = PageObject._mergeResources(
                originalResources, page2Resources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)
        # Combine /ProcSet sets.
        newResources[NameObject("/ProcSet")] = ArrayObject(
            frozenset(
                originalResources.get("/ProcSet",
                                      ArrayObject()).getObject()).union(
                                          frozenset(
                                              page2Resources.get(
                                                  "/ProcSet",
                                                  ArrayObject()).getObject())))
        newContentArray = ArrayObject()
        originalContent = self.getContents()
        if originalContent is not None:
            newContentArray.append(
                PageObject._pushPopGS(originalContent, self.pdf))
        page2Content = page2.getContents()
        if page2Content is not None:
            if page2transformation is not None:
                page2Content = page2transformation(page2Content)
            page2Content = PageObject._contentStreamRename(
                page2Content, rename, self.pdf)
            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
            newContentArray.append(page2Content)
        # if expanding the page to fit a new page,
        # calculate the new media box size
        if expand:
            corners1 = [
                self.mediaBox.getLowerLeft_x().as_numeric(),
                self.mediaBox.getLowerLeft_y().as_numeric(),
                self.mediaBox.getUpperRight_x().as_numeric(),
                self.mediaBox.getUpperRight_y().as_numeric()
            ]
            corners2 = [
                page2.mediaBox.getLowerLeft_x().as_numeric(),
                page2.mediaBox.getLowerLeft_y().as_numeric(),
                page2.mediaBox.getUpperLeft_x().as_numeric(),
                page2.mediaBox.getUpperLeft_y().as_numeric(),
                page2.mediaBox.getUpperRight_x().as_numeric(),
                page2.mediaBox.getUpperRight_y().as_numeric(),
                page2.mediaBox.getLowerRight_x().as_numeric(),
                page2.mediaBox.getLowerRight_y().as_numeric()
            ]
            if ctm is not None:
                new_x = map(
                    lambda i: ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] +
                    ctm[4], range(0, 8, 2))
                new_y = map(
                    lambda i: ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] +
                    ctm[5], range(0, 8, 2))
            else:
                new_x = corners2[0:8:2]
                new_y = corners2[1:8:2]
            lowerleft = [min(new_x), min(new_y)]
            upperright = [max(new_x), max(new_y)]
            lowerleft = [
                min(corners1[0], lowerleft[0]),
                min(corners1[1], lowerleft[1])
            ]
            upperright = [
                max(corners1[2], upperright[0]),
                max(corners1[3], upperright[1])
            ]

            self.mediaBox.setLowerLeft(lowerleft)
            self.mediaBox.setUpperRight(upperright)

        self[NameObject('/Contents')] = ContentStream(newContentArray,
                                                      self.pdf)
        self[NameObject('/Resources')] = newResources
示例#12
0
    def write(self, stream):
        externalReferenceMap = {}

        # PDF objects sometimes have circular references to their /Page objects
        # inside their object tree (for example, annotations).  Those will be
        # indirect references to objects that we've recreated in this PDF.  To
        # address this problem, PageObject's store their original object
        # reference number, and we add it to the external reference map before
        # we sweep for indirect references.  This forces self-page-referencing
        # trees to reference the correct new object location, rather than
        # copying in a new copy of the page object.
        for objIndex in xrange(len(self._objects)):
            obj = self._objects[objIndex]
            if isinstance(obj, PageObject) and obj.indirectRef is not None:
                data = obj.indirectRef
                externalReferenceMap.setdefault(data.pdf, {})
                externalReferenceMap[data.pdf].setdefault(data.generation, {})
                externalReferenceMap[data.pdf][data.generation][data.idnum] = \
                    IndirectObject(objIndex + 1, 0, self)

        self.stack = []
        self._sweepIndirectReferences(externalReferenceMap, self._root)
        del self.stack

        # Begin writing:
        object_positions = []
        stream.write(self._header + b_("\n"))
        for i in range(len(self._objects)):
            idnum = (i + 1)
            obj = self._objects[i]
            object_positions.append(stream.tell())
            stream.write(b_(str(idnum) + " 0 obj\n"))
            key = None
            if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
                pack1 = struct.pack("<i", i + 1)[:3]
                pack2 = struct.pack("<i", 0)[:2]
                key = self._encrypt_key + pack1 + pack2
                assert len(key) == (len(self._encrypt_key) + 5)
                md5_hash = md5(key).digest()
                key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
            if obj is not None:
                obj.writeToStream(stream, key)
                stream.write(b_("\nendobj\n"))

        # xref table
        xref_location = stream.tell()
        stream.write(b_("xref\n"))
        stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
        stream.write(b_("%010d %05d f \n" % (0, 65535)))
        for offset in object_positions:
            stream.write(b_("%010d %05d n \n" % (offset, 0)))

        # trailer
        stream.write(b_("trailer\n"))
        trailer = DictionaryObject()
        trailer.update({
            NameObject("/Size"):
            NumberObject(len(self._objects) + 1),
            NameObject("/Root"):
            self._root,
            NameObject("/Info"):
            self._info
        })
        if hasattr(self, "_ID"):
            trailer[NameObject("/ID")] = self._ID
        if hasattr(self, "_encrypt"):
            trailer[NameObject("/Encrypt")] = self._encrypt
        trailer.writeToStream(stream, None)

        # eof
        stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
示例#13
0
    def read(self, stream):
        # start at the end:
        stream.seek(-1, 2)
        line = b_('')
        while not line:
            line = self.readNextEndLine(stream)
        if line[:5] != b_("%%EOF"):
            raise utils.PdfReadError, "EOF marker not found"
        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != b_("startxref"):
            raise utils.PdfReadError, "startxref not found"

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while 1:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == b_("x"):
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != b_("ref"):
                    raise utils.PdfReadError, "xref table read error"
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                # check if the first time looking at the xref table
                firsttime = True
                while True:
                    num = readObject(stream, self)
                    if firsttime and num != 0:
                        self.xrefIndex = num
                        warnings.warn(
                            "Xref table not zero-indexed. ID "
                            "numbers for objects will %sbe "
                            "corrected." % ("" if not self.strict else "not "),
                            utils.PdfReadWarning)
                        # if table not zero indexed, could be due to
                        # error from when PDF was created
                        # which will lead to mismatched indices later on
                    firsttime = False
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes (as of PDF 1.7). However, some files have
                        # 21-byte entries (or more) due to the use of \r\n
                        # (CRLF) EOL's. Detect that case, and adjust the line
                        # until it does not begin with a \r (CR) or \n (LF).
                        while line[0] in b_("\x0D\x0A"):
                            stream.seek(-20 + 1, 1)
                            line = stream.read(20)
                        # On the other hand, some malformed PDF files
                        # use a single character EOL without a preceeding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in b_("0123456789t"):
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(b_(" "))
                        offset, generation = int(offset), int(generation)
                        self.xref.setdefault(generation, {})
                        if num in self.xref[generation]:
                            # It really seems like we should allow the last
                            # xref table in the file to override previous
                            # ones. Since we read the file backwards, assume
                            # any existing key is already set correctly.
                            pass
                        else:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != b_("trailer"):
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                for key, value in newTrailer.items():
                    self.trailer.setdefault(key, value)
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get("/Index",
                                           [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        for i in range(len(entrySizes)):
                            d = streamData.read(entrySizes[i])
                            di = convertToInt(d, entrySizes[i])
                            if i == 0:
                                xref_type = di
                            elif i == 1:
                                if xref_type == 0:
                                    # next_free_object = di
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # next_generation = di
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if not num in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if not num in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find(b_("xref"))
                if xref_loc != -1:
                    startxref -= (10 - xref_loc)
                    continue
                else:
                    # no xref table found at specified location
                    assert False
                    break
        # if not zero-indexed, verify that the table is correct
        # change it if necessary
        if self.xrefIndex and not self.strict:
            loc = stream.tell()
            for gen in self.xref:
                if gen == 65535:
                    continue
                for id in self.xref[gen]:
                    stream.seek(self.xref[gen][id], 0)
                    pid, pgen = self.readObjectHeader(stream)
                    if pid == id - self.xrefIndex:
                        self._zeroXref(gen)
                        break
                    # if not, then either it's just plain wrong,
                    # or the non-zero-index is actually correct
            stream.seek(loc, 0)  # return to where it was