def getNamedDestRoot(self):
    """
    Return the ``/Names`` array found at ``/Names`` -> ``/Dests`` ->
    ``/Names`` below the document root, creating every missing level.

    Levels that have to be created are registered with ``_addObject`` and
    linked in through the reference it returns. Levels that already exist
    are looked up in ``self._objects`` and checked with an ``assert``.

    :return: the array of named destinations (existing or freshly created).
    """
    catalog = self.getObject(self._root)

    if not ('/Names' in catalog
            and isinstance(catalog['/Names'], DictionaryObject)):
        # Nothing usable under /Names: build the whole chain from scratch.
        nameDict = DictionaryObject()
        catalog[NameObject('/Names')] = self._addObject(nameDict)
        destDict = DictionaryObject()
        nameDict[NameObject('/Dests')] = self._addObject(destDict)
        destArray = ArrayObject()
        destDict[NameObject('/Names')] = destArray
        return destArray

    nameDict = catalog['/Names']
    # Object numbers are 1-based positions in self._objects.
    nameDictRef = IndirectObject(self._objects.index(nameDict) + 1, 0, self)
    assert nameDictRef.getObject() == nameDict

    if not ('/Dests' in nameDict
            and isinstance(nameDict['/Dests'], DictionaryObject)):
        # /Names exists but has no usable /Dests: create that level only.
        destDict = DictionaryObject()
        nameDict[NameObject('/Dests')] = self._addObject(destDict)
        destArray = ArrayObject()
        destDict[NameObject('/Names')] = destArray
        return destArray

    destDict = nameDict['/Dests']
    destDictRef = IndirectObject(self._objects.index(destDict) + 1, 0, self)
    assert destDictRef.getObject() == destDict

    if '/Names' in destDict:
        return destDict['/Names']

    destArray = ArrayObject()
    destDict[NameObject('/Names')] = destArray
    return destArray
def __init__(self):
    """
    Set up an empty document: the header line, the list of indirect
    objects, and the three objects every document starts with (page tree
    root, info dictionary, catalog), registered in that order.
    """
    self._header = b_("%PDF-1.3")
    # Array of indirect objects.
    self._objects = []

    # The root of our page tree node.
    pageTreeEntries = {
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject()
    }
    pageTree = DictionaryObject()
    pageTree.update(pageTreeEntries)
    self._pages = self._addObject(pageTree)

    # Info object.
    infoEntries = {
        NameObject("/Producer"): createStringObject(
            u"Python PDF Library - http://pybrary.net/pyPdf/")
    }
    infoDict = DictionaryObject()
    infoDict.update(infoEntries)
    self._info = self._addObject(infoDict)

    # Root object (the catalog), pointing at the page tree.
    catalogEntries = {
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages
    }
    catalog = DictionaryObject()
    catalog.update(catalogEntries)
    self._root = self._addObject(catalog)
def _mergeResources(res1, res2, resource):
    """
    Merge one resource category of two resource dictionaries.

    :param res1: resource dictionary whose entries are kept as they are.
    :param res2: resource dictionary whose entries are merged in.
    :param resource: name of the category to merge (e.g. ``"/Font"``).
    :return: ``(merged, renamed)`` where ``merged`` is a new
        DictionaryObject and ``renamed`` maps each key of ``res2`` that
        clashed with a different value in ``res1`` to the new name
        (``key + "renamed"``) under which its value was stored.
    """
    merged = DictionaryObject()
    merged.update(res1.get(resource, DictionaryObject()).getObject())
    incoming = res2.get(resource, DictionaryObject()).getObject()
    renamed = {}

    for key in incoming.keys():
        if key not in merged:
            # Only present in res2: copy the raw (unresolved) entry over.
            merged[key] = incoming.raw_get(key)
            continue
        if merged[key] != incoming[key]:
            # Same name, different value: keep both under distinct names.
            alias = NameObject(key + "renamed")
            renamed[key] = alias
            merged[alias] = incoming[key]

    return merged, renamed
def _readInlineImage(self, stream):
    """
    Read an inline image from ``stream``.

    Reading begins just after the "BI" (begin image) operator: first the
    dictionary of settings up to "ID", then the image data up to the "EI"
    marker.

    :param stream: seekable stream positioned just after "BI".
    :return: ``{"settings": DictionaryObject, "data": str}``.
    :raises utils.PdfReadError: if the stream ends before "EI" is found
        (previously this case looped forever).
    """
    # First read the dictionary of settings.
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        if tok == "I":
            # "ID" - begin of image data
            break
        key = readObject(stream, self.pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, self.pdf)
        settings[key] = value

    # Left at beginning of "ID"; the third character read here is dropped.
    tmp = stream.read(3)
    assert tmp[:2] == "ID"

    # Collect the data in a list and join once instead of repeated "+=".
    chunks = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise utils.PdfReadError(
                "Unexpected end of stream in inline image data")
        if tok == "E":
            following = stream.read(1)
            if following == "I":
                break
            if not following:
                # Seeking back here would re-read the same "E" endlessly.
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image data")
            # Not the end marker: un-read the byte and keep the "E" as data.
            stream.seek(-1, 1)
        chunks.append(tok)
    data = "".join(chunks)

    readNonWhitespace(stream)
    stream.seek(-1, 1)
    return {"settings": settings, "data": data}
def addBookmark(self, title, pagenum, parent=None):
    """
    Add a bookmark with the given title that jumps to the given page.

    :param title: text of the bookmark.
    :param pagenum: index into the page tree's ``/Kids`` array.
    :param parent: reference to the bookmark this one is nested under;
        when None the bookmark is attached to the outline root.
    :return: the reference of the newly added bookmark object.
    """
    targetPage = self.getObject(self._pages)['/Kids'][pagenum]

    gotoEntries = {
        NameObject('/D'): ArrayObject(
            [targetPage, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo')
    }
    gotoAction = DictionaryObject()
    gotoAction.update(gotoEntries)
    gotoRef = self._addObject(gotoAction)

    # Always fetched, even when an explicit parent was supplied.
    outlineRoot = self.getOutlineRoot()
    owner = outlineRoot if parent is None else parent

    entry = TreeObject()
    entry.update({
        NameObject('/A'): gotoRef,
        NameObject('/Title'): createStringObject(title)
    })
    entryRef = self._addObject(entry)

    owner.getObject().addChild(entryRef, self)
    return entryRef
def add(self, title, pagenum):
    """
    Create a bookmark titled ``title`` that jumps to page ``pagenum`` of
    ``self.pdf`` and attach it to ``self.tree``.

    :param title: text of the bookmark.
    :param pagenum: index into the page tree's ``/Kids`` array.
    """
    writer = self.pdf
    targetPage = writer.getObject(writer._pages)['/Kids'][pagenum]

    gotoEntries = {
        NameObject('/D'): ArrayObject(
            [targetPage, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo')
    }
    gotoAction = DictionaryObject()
    gotoAction.update(gotoEntries)
    gotoRef = writer._addObject(gotoAction)

    entry = TreeObject()
    entry.update({
        NameObject('/A'): gotoRef,
        NameObject('/Title'): createStringObject(title)
    })
    writer._addObject(entry)
    # NOTE(review): addBookmark/addBookmarkDict call addChild(ref, pdf); here
    # the object itself is passed and no pdf - confirm against self.tree's API.
    self.tree.addChild(entry)
def addNamedDestination(self, title, pagenum):
    """
    Add a named destination called ``title`` that points at a page.

    A ``/GoTo`` action targeting the page is registered as a new object,
    and the pair (name, reference) is appended to the array returned by
    ``getNamedDestRoot``.

    :param title: name of the destination; a value that is not already a
        writable PDF object is wrapped with ``createStringObject``.
    :param pagenum: index into the page tree's ``/Kids`` array.
    :return: the reference of the newly added destination object.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]
    dest = DictionaryObject()
    dest.update({
        NameObject('/D'): ArrayObject(
            [pageRef, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo')
    })
    destRef = self._addObject(dest)

    nd = self.getNamedDestRoot()
    # The name ends up inside a PDF array, so store it as a PDF string the
    # same way addBookmark wraps its title. Values that already look like
    # PDF objects (they can write themselves) are left untouched.
    if not hasattr(title, "writeToStream"):
        title = createStringObject(title)
    nd.extend([title, destRef])
    return destRef
def createBlankPage(pdf=None, width=None, height=None):
    """
    Build a new blank page.

    If ``width`` or ``height`` is None, both are taken from the media box
    of the last page of ``pdf``.

    :param pdf: document the page belongs to; also the source of the
        default size.
    :param width: page width.
    :param height: page height.
    :return: the new PageObject.
    :raises utils.PageSizeNotDefinedError: if a size is missing and ``pdf``
        is None or has no pages.
    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference 7.7.3.3)
    page[NameObject('/Type')] = NameObject('/Page')
    page[NameObject('/Parent')] = NullObject()
    page[NameObject('/Resources')] = DictionaryObject()

    sizeMissing = width is None or height is None
    if sizeMissing:
        if pdf is None or not pdf.getNumPages() > 0:
            raise utils.PageSizeNotDefinedError()
        lastBox = pdf.getPage(pdf.getNumPages() - 1).mediaBox
        width = lastBox.getWidth()
        height = lastBox.getHeight()

    page[NameObject('/MediaBox')] = RectangleObject([0, 0, width, height])
    return page
def addBookmarkDict(self, bookmark, parent=None):
    """
    Add a bookmark described by a dictionary.

    Every entry of ``bookmark`` is copied into a new TreeObject under a
    NameObject key. If the dictionary has an ``/A`` entry, that action is
    copied into its own object and the bookmark's ``/A`` entry is replaced
    by a reference to it.

    :param bookmark: mapping describing the bookmark.
    :param parent: reference to the bookmark this one is nested under;
        when None the bookmark is attached to the outline root.
    :return: the reference of the newly added bookmark object.
    """
    bookmarkObj = TreeObject()
    for k, v in bookmark.items():
        bookmarkObj[NameObject(str(k))] = v
    # NOTE(review): this looks redundant after the loop above - confirm
    # against DictionaryObject.update before removing it.
    bookmarkObj.update(bookmark)

    if '/A' in bookmark:
        action = DictionaryObject()
        for k, v in bookmark['/A'].items():
            action[NameObject(str(k))] = v
        actionRef = self._addObject(action)
        # Use a NameObject key like every other entry written here, not a
        # raw str.
        bookmarkObj[NameObject('/A')] = actionRef

    bookmarkRef = self._addObject(bookmarkObj)

    # Always fetched, even when an explicit parent was supplied.
    outlineRef = self.getOutlineRoot()
    if parent is None:
        parent = outlineRef

    parent = parent.getObject()
    parent.addChild(bookmarkRef, self)
    return bookmarkRef
def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
    """
    Set up encryption of the document with the standard security handler.

    Stores the encryption dictionary (``self._encrypt``), the key
    (``self._encrypt_key``) and a freshly generated document ID
    (``self._ID``) on the writer.

    :param user_pwd: the user password.
    :param owner_pwd: the owner password; defaults to ``user_pwd``.
    :param use_128bit: when True use /V 2, /R 3 with a 16-byte key,
        otherwise /V 1, /R 2 with a 5-byte key.
    """
    import time
    import random

    if owner_pwd is None:
        owner_pwd = user_pwd
    if use_128bit:
        V = 2
        rev = 3
        # Floor division keeps the key length an int under true division.
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # md5() needs bytes on Python 3; b_() is the helper this file already
    # uses to build byte strings.
    ID_1 = md5(b_(repr(time.time()))).digest()
    ID_2 = md5(b_(repr(random.random()))).digest()
    self._ID = ArrayObject(
        (ByteStringObject(ID_1), ByteStringObject(ID_2)))
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)

    encrypt = DictionaryObject()
    encrypt[NameObject("/Filter")] = NameObject("/Standard")
    encrypt[NameObject("/V")] = NumberObject(V)
    if V == 2:
        encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt[NameObject("/R")] = NumberObject(rev)
    encrypt[NameObject("/O")] = ByteStringObject(O)
    encrypt[NameObject("/U")] = ByteStringObject(U)
    encrypt[NameObject("/P")] = NumberObject(P)
    self._encrypt = self._addObject(encrypt)
    self._encrypt_key = key
def _mergePage(self, page2, page2transformation=None, ctm=None,
               expand=False):
    """
    Merge the content and resources of ``page2`` into this page.

    :param page2: the page to merge into this one.
    :param page2transformation: optional callable applied to the content
        of ``page2`` before it is renamed and appended.
    :param ctm: optional 6-element transformation matrix; only used to
        transform the corners of ``page2``'s media box when ``expand`` is
        true.
    :param expand: when true, grow this page's media box so that it also
        covers the (transformed) media box of ``page2``.
    """
    # First we work on merging the resource dictionaries. This allows us
    # to find out what symbols in the content streams we might need to
    # rename.
    newResources = DictionaryObject()
    rename = {}
    originalResources = self["/Resources"].getObject()
    page2Resources = page2["/Resources"].getObject()

    for res in ("/ExtGState", "/Font", "/XObject", "/ColorSpace",
                "/Pattern", "/Shading", "/Properties"):
        new, newrename = PageObject._mergeResources(
            originalResources, page2Resources, res)
        if new:
            newResources[NameObject(res)] = new
            rename.update(newrename)

    # Combine /ProcSet sets.
    newResources[NameObject("/ProcSet")] = ArrayObject(
        frozenset(
            originalResources.get("/ProcSet", ArrayObject()).getObject()
        ).union(
            frozenset(
                page2Resources.get("/ProcSet", ArrayObject()).getObject())))

    newContentArray = ArrayObject()

    originalContent = self.getContents()
    if originalContent is not None:
        newContentArray.append(
            PageObject._pushPopGS(originalContent, self.pdf))

    page2Content = page2.getContents()
    if page2Content is not None:
        if page2transformation is not None:
            page2Content = page2transformation(page2Content)
        page2Content = PageObject._contentStreamRename(
            page2Content, rename, self.pdf)
        page2Content = PageObject._pushPopGS(page2Content, self.pdf)
        newContentArray.append(page2Content)

    # If expanding the page to fit a new page, calculate the new media
    # box size.
    if expand:
        corners1 = [
            self.mediaBox.getLowerLeft_x().as_numeric(),
            self.mediaBox.getLowerLeft_y().as_numeric(),
            self.mediaBox.getUpperRight_x().as_numeric(),
            self.mediaBox.getUpperRight_y().as_numeric()
        ]
        # All four corners of page2 as a flat [x0, y0, x1, y1, ...] list.
        corners2 = [
            page2.mediaBox.getLowerLeft_x().as_numeric(),
            page2.mediaBox.getLowerLeft_y().as_numeric(),
            page2.mediaBox.getUpperLeft_x().as_numeric(),
            page2.mediaBox.getUpperLeft_y().as_numeric(),
            page2.mediaBox.getUpperRight_x().as_numeric(),
            page2.mediaBox.getUpperRight_y().as_numeric(),
            page2.mediaBox.getLowerRight_x().as_numeric(),
            page2.mediaBox.getLowerRight_y().as_numeric()
        ]
        if ctm is not None:
            # Lists, not map(): each sequence is consumed by both min()
            # and max() below, which a one-shot iterator cannot serve.
            new_x = [
                ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4]
                for i in range(0, 8, 2)
            ]
            new_y = [
                ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5]
                for i in range(0, 8, 2)
            ]
        else:
            new_x = corners2[0:8:2]
            new_y = corners2[1:8:2]

        lowerleft = [min(new_x), min(new_y)]
        upperright = [max(new_x), max(new_y)]
        lowerleft = [
            min(corners1[0], lowerleft[0]),
            min(corners1[1], lowerleft[1])
        ]
        upperright = [
            max(corners1[2], upperright[0]),
            max(corners1[3], upperright[1])
        ]

        self.mediaBox.setLowerLeft(lowerleft)
        self.mediaBox.setUpperRight(upperright)

    self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
    self[NameObject('/Resources')] = newResources
def write(self, stream):
    """
    Write the collected objects, the cross-reference table and the trailer
    to ``stream``.

    :param stream: object with ``write`` and ``tell`` methods that
        receives the output.
    """
    externalReferenceMap = {}

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations). Those will be
    # indirect references to objects that we've recreated in this PDF. To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references. This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for position, candidate in enumerate(self._objects):
        if (not isinstance(candidate, PageObject)
                or candidate.indirectRef is None):
            continue
        source = candidate.indirectRef
        byGeneration = externalReferenceMap.setdefault(source.pdf, {})
        byIdnum = byGeneration.setdefault(source.generation, {})
        byIdnum[source.idnum] = IndirectObject(position + 1, 0, self)

    self.stack = []
    self._sweepIndirectReferences(externalReferenceMap, self._root)
    del self.stack

    # Begin writing:
    object_positions = []
    stream.write(self._header + b_("\n"))
    for idnum, obj in enumerate(self._objects, 1):
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
            # Per-object key: the document key plus 3 bytes of the object
            # number and 2 bytes of the generation (0), hashed with md5.
            objectSalt = (struct.pack("<i", idnum)[:3]
                          + struct.pack("<i", 0)[:2])
            key = self._encrypt_key + objectSalt
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        if obj is not None:
            obj.writeToStream(stream, key)
        stream.write(b_("\nendobj\n"))

    # xref table
    xref_location = stream.tell()
    entryCount = len(self._objects) + 1
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % (entryCount)))
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info
    })
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if hasattr(self, "_encrypt"):
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
def read(self, stream):
    """
    Read the cross-reference information and trailer(s) of a PDF file.

    Starting from the end of ``stream``, locates the ``%%EOF`` marker and
    the ``startxref`` entry, then follows every cross-reference section
    (classic tables and cross-reference streams) through the ``/Prev``
    links. Fills ``self.xref``, ``self.xref_objStm`` and ``self.trailer``.

    :param stream: seekable stream holding the PDF file.
    :raises utils.PdfReadError: if the EOF marker, the ``startxref``
        keyword or a cross-reference table cannot be found.
    """
    # start at the end:
    stream.seek(-1, 2)
    line = b_('')
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != b_("%%EOF"):
        raise utils.PdfReadError("EOF marker not found")

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    startxref = int(line)
    line = self.readNextEndLine(stream)
    if line[:9] != b_("startxref"):
        raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while 1:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == b_("x"):
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != b_("ref"):
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            # check if the first time looking at the xref table
            firsttime = True
            while True:
                num = readObject(stream, self)
                if firsttime and num != 0:
                    self.xrefIndex = num
                    warnings.warn(
                        "Xref table not zero-indexed. ID "
                        "numbers for objects will %sbe "
                        "corrected." % ("" if not self.strict else "not "),
                        utils.PdfReadWarning)
                    # if table not zero indexed, could be due to
                    # error from when PDF was created
                    # which will lead to mismatched indices later on
                firsttime = False
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)

                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes (as of PDF 1.7). However, some files have
                    # 21-byte entries (or more) due to the use of \r\n
                    # (CRLF) EOL's. Detect that case, and adjust the line
                    # until it does not begin with a \r (CR) or \n (LF).
                    while line[0] in b_("\x0D\x0A"):
                        stream.seek(-20 + 1, 1)
                        line = stream.read(20)

                    # On the other hand, some malformed PDF files
                    # use a single character EOL without a preceding
                    # space. Detect that case, and seek the stream
                    # back one character. (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in b_("0123456789t"):
                        stream.seek(-1, 1)

                    offset, generation = line[:16].split(b_(" "))
                    offset, generation = int(offset), int(generation)
                    self.xref.setdefault(generation, {})
                    if num in self.xref[generation]:
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        pass
                    else:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != b_("trailer"):
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            # Entries from trailers read earlier take precedence.
            for key, value in newTrailer.items():
                self.trailer.setdefault(key, value)
            if "/Prev" in newTrailer:
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get(
                "/Index", [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    # Each entry has one field per /W width: the entry
                    # type, then two fields whose meaning depends on it.
                    for i in range(len(entrySizes)):
                        d = streamData.read(entrySizes[i])
                        di = convertToInt(d, entrySizes[i])
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            if xref_type == 0:
                                # next_free_object = di
                                pass
                            elif xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            if xref_type == 0:
                                # next_generation = di
                                pass
                            elif xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 0:
                        pass
                    elif xref_type == 1:
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref. Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find(b_("xref"))
            if xref_loc != -1:
                startxref -= (10 - xref_loc)
                continue
            # No xref table found at the specified location. Raise rather
            # than assert, so the failure also surfaces when asserts are
            # disabled.
            raise utils.PdfReadError(
                "Could not find xref table at specified location")

    # if not zero-indexed, verify that the table is correct
    # change it if necessary
    if self.xrefIndex and not self.strict:
        loc = stream.tell()
        for gen in self.xref:
            if gen == 65535:
                continue
            for objId in self.xref[gen]:
                stream.seek(self.xref[gen][objId], 0)
                pid, pgen = self.readObjectHeader(stream)
                if pid == objId - self.xrefIndex:
                    self._zeroXref(gen)
                    break
                # if not, then either it's just plain wrong,
                # or the non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was