def _readInlineImage(self, stream):
    """Parse an inline image whose ``BI`` operator was just consumed.

    The stream must be positioned immediately after ``BI``.  First the
    key/value settings are read up to the ``ID`` operator, then the raw
    image bytes are collected up to the terminating ``EI``.

    :param stream: seekable content stream positioned just after ``BI``.
    :return: dict with ``"settings"`` (a DictionaryObject) and ``"data"``
        (the raw image data).
    :raises utils.PdfReadError: if the stream ends before ``ID``/``EI`` is
        found, or the ``ID`` marker is missing.
    """
    # first read the dictionary of settings.
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # End of stream: without this guard we would rewind and try to
            # parse the previous byte as an object.
            raise utils.PdfReadError(
                "Unexpected end of stream in inline image settings")
        stream.seek(-1, 1)
        if tok == "I":
            # "ID" - begin of image data
            break
        key = readObject(stream, self.pdf)
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, self.pdf)
        settings[key] = value

    # left at beginning of ID; consume "ID" plus the single separator byte
    marker = stream.read(3)
    if marker[:2] != "ID":
        raise utils.PdfReadError("Inline image data marker (ID) not found")

    # NOTE(review): the first "E" followed by "I" ends the data, so binary
    # image data that happens to contain that byte pair is cut short.
    chunks = []
    while True:
        tok = stream.read(1)
        if not tok:
            # A truncated image used to spin here forever because read()
            # keeps returning an empty string at end of stream.
            raise utils.PdfReadError(
                "Unexpected end of stream in inline image data")
        if tok == "E":
            following = stream.read(1)
            if following == "I":
                break
            if following:
                # Not the end marker: put the peeked byte back.  When
                # nothing was read (EOF) there is nothing to put back.
                stream.seek(-1, 1)
        chunks.append(tok)
    data = "".join(chunks)

    # leave the stream positioned on the next non-whitespace character
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    return {"settings": settings, "data": data}
def __parseContentStream(self, stream):
    """Split a content stream into ``(operands, operator)`` tuples.

    Reads ``stream`` from the beginning and appends one tuple per operator
    to ``self.operations``.  Inline images (``BI`` ... ``EI``) are parsed by
    ``_readInlineImage`` and stored as ``(image_dict, "INLINE IMAGE")``.

    :param stream: seekable file-like object holding the content stream.
    """
    stream.seek(0, 0)
    operands = []
    while True:
        peek = readNonWhitespace(stream)
        if peek == '':
            break
        stream.seek(-1, 1)
        if peek.isalpha() or peek == "'" or peek == '"':
            # An operator: accumulate characters up to whitespace, a
            # delimiter, or the end of the stream.
            operator = ""
            while True:
                tok = stream.read(1)
                if tok == '':
                    # Test EOF first: an empty string must never reach the
                    # delimiter test (it is "in" any string) nor trigger
                    # the rewind below.
                    break
                if tok.isspace() or tok in NameObject.delimiterCharacters:
                    stream.seek(-1, 1)
                    break
                operator += tok
            if operator == "BI":
                # begin inline image - a completely different parsing
                # mechanism is required
                assert operands == []
                ii = self._readInlineImage(stream)
                self.operations.append((ii, "INLINE IMAGE"))
            else:
                self.operations.append((operands, operator))
                operands = []
        elif peek == '%':
            # If we encounter a comment in the content stream, we have to
            # handle it here.  Typically, readObject will handle
            # encountering a comment -- but readObject assumes that
            # following the comment must be the object we're trying to
            # read.  In this case, it could be an operator instead.
            # Also stop on '' so that a comment running to the end of the
            # stream without a line terminator cannot loop forever.
            while peek not in ('\r', '\n', ''):
                peek = stream.read(1)
        else:
            operands.append(readObject(stream, None))
def read(self, stream):
    """Load the cross-reference data and trailer of a PDF from ``stream``.

    Working backwards from the end of the file, this locates the ``%%EOF``
    marker and the ``startxref`` offset, then follows every cross-reference
    section (classic tables and PDF 1.5+ cross-reference streams, chained
    through ``/Prev``).  It fills ``self.xref`` (generation -> id -> byte
    offset), ``self.xref_objStm`` (id -> [object stream number, index]) and
    ``self.trailer``.  Entries that are already present are never
    overwritten, so the section read first wins.

    :param stream: seekable file-like object containing the PDF.
    :raises utils.PdfReadError: if the EOF marker, the ``startxref``
        keyword or a cross-reference section cannot be found.
    """
    # start at the end and skip empty trailing lines
    stream.seek(-1, 2)
    line = b_('')
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != b_("%%EOF"):
        raise utils.PdfReadError("EOF marker not found")

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    startxref = int(line)
    line = self.readNextEndLine(stream)
    if line[:9] != b_("startxref"):
        raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while True:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == b_("x"):
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != b_("ref"):
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            # check if the first time looking at the xref table
            firsttime = True
            while True:
                num = readObject(stream, self)
                if firsttime and num != 0:
                    # if table not zero indexed, could be due to
                    # error from when PDF was created
                    # which will lead to mismatched indices later on
                    self.xrefIndex = num
                    warnings.warn(
                        "Xref table not zero-indexed. ID "
                        "numbers for objects will %sbe "
                        "corrected." % ("" if not self.strict else "not "),
                        utils.PdfReadWarning)
                firsttime = False
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)

                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes (as of PDF 1.7). However, some files have
                    # 21-byte entries (or more) due to the use of \r\n
                    # (CRLF) EOL's. Detect that case, and adjust the line
                    # until it does not begin with a \r (CR) or \n (LF).
                    while line[0] in b_("\x0D\x0A"):
                        stream.seek(-20 + 1, 1)
                        line = stream.read(20)

                    # On the other hand, some malformed PDF files
                    # use a single character EOL without a preceding
                    # space.  Detect that case, and seek the stream
                    # back one character.  (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in b_("0123456789t"):
                        stream.seek(-1, 1)

                    offset, generation = line[:16].split(b_(" "))
                    offset, generation = int(offset), int(generation)
                    self.xref.setdefault(generation, {})
                    if num in self.xref[generation]:
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        pass
                    else:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != b_("trailer"):
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                self.trailer.setdefault(key, value)
            if "/Prev" in newTrailer:
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    # Each entry is len(entrySizes) fields whose widths
                    # come from /W; field 0 selects the entry type.
                    for i in range(len(entrySizes)):
                        d = streamData.read(entrySizes[i])
                        di = convertToInt(d, entrySizes[i])
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            if xref_type == 0:
                                # next_free_object = di
                                pass
                            elif xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            if xref_type == 0:
                                # next_generation = di
                                pass
                            elif xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 0:
                        pass
                    elif xref_type == 1:
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find(b_("xref"))
            if xref_loc != -1:
                startxref -= (10 - xref_loc)
                continue
            # No xref table found at the specified location.  Raise
            # explicitly: an ``assert False`` is stripped under ``-O`` and
            # would silently leave the reader with an empty xref.
            raise utils.PdfReadError(
                "Could not find xref table at specified location")

    # if not zero-indexed, verify that the table is correct
    # change it if necessary
    # NOTE(review): self.xrefIndex is only assigned above for a non-zero
    # first index; presumably it is initialised elsewhere -- confirm.
    if self.xrefIndex and not self.strict:
        loc = stream.tell()
        for gen in self.xref:
            if gen == 65535:
                continue
            for objid in self.xref[gen]:
                stream.seek(self.xref[gen][objid], 0)
                pid, pgen = self.readObjectHeader(stream)
                if pid == objid - self.xrefIndex:
                    self._zeroXref(gen)
                    break
                # if not, then either it's just plain wrong,
                # or the non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was
def getObject(self, indirectReference):
    """Resolve ``indirectReference`` to the object it points to.

    Lookup order: the ``self.resolvedObjects`` cache, then object streams
    (``self.xref_objStm``, generation 0 only), then the byte offset recorded
    in ``self.xref``.  Objects read from the file are decrypted when the
    document is encrypted and then cached.

    :param indirectReference: object exposing ``idnum`` and ``generation``.
    :return: the resolved object, or ``None`` (after a warning) when the
        reference is not present in the cross-reference data.
    :raises utils.PdfReadError: if the object header found at the recorded
        offset carries a different object ID.
    :raises Exception: if the file is encrypted and no key is available.
    """
    ref_id = indirectReference.idnum
    ref_gen = indirectReference.generation

    retval = self.resolvedObjects.get(ref_gen, {}).get(ref_id, None)
    if retval is not None:
        return retval

    if ref_gen == 0 and ref_id in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[ref_id]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for i in range(objStm['/N']):
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # remember the position in the index, jump to the object
            # itself, then come back for the next index pair
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            # setdefault: the generation-0 cache may not exist yet
            self.resolvedObjects.setdefault(0, {})[objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][ref_id]

    # .get(): an unknown generation is reported like an unknown object
    # instead of escaping as a KeyError
    offsets = self.xref.get(ref_gen, {})
    if ref_id not in offsets:
        warnings.warn(
            "Object %d %d not defined." % (ref_id, ref_gen),
            utils.PdfReadWarning)
        return None

    start = offsets[ref_id]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)
    # Plain comparison rather than try/assert: asserts are removed under
    # ``python -O``, which would disable this check entirely.
    if idnum != ref_id:
        if not self.xrefIndex:
            # some other problem
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does not "
                " match actual (%d %d)."
                % (ref_id, ref_gen, idnum, generation))
        if self.strict:
            # Xref table probably had bad indexes due to not
            # being zero-indexed
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does "
                "not match actual (%d %d); xref "
                "table not zero-indexed."
                % (ref_id, ref_gen, idnum, generation))
        # non-strict: should not happen since the xref table is corrected
        # in non-strict mode; carry on with what was found
    assert generation == ref_gen
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here: the per-object key is the MD5 of the
        # file key followed by the low 3 bytes of the object number and
        # the low 2 bytes of the generation, truncated to at most 16 bytes
        pack1 = struct.pack("<i", ref_id)[:3]
        pack2 = struct.pack("<i", ref_gen)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval
def read(self, stream):
    """Load the cross-reference data and trailer of a PDF from ``stream``.

    Working backwards from the end of the file, this locates the ``%%EOF``
    marker and the ``startxref`` offset, then follows every cross-reference
    section (classic tables and PDF 1.5+ cross-reference streams, chained
    through ``/Prev``).  It fills ``self.xref`` (generation -> id -> byte
    offset), ``self.xref_objStm`` (id -> [object stream number, index]) and
    ``self.trailer``.  Entries that are already present are never
    overwritten, so the section read first wins.

    :param stream: seekable file-like object containing the PDF.
    :raises utils.PdfReadError: if the EOF marker, the ``startxref``
        keyword or a cross-reference section cannot be found.
    """
    # start at the end and skip empty trailing lines
    stream.seek(-1, 2)
    line = b_('')
    while not line:
        line = self.readNextEndLine(stream)
    if line[:5] != b_("%%EOF"):
        raise utils.PdfReadError("EOF marker not found")

    # find startxref entry - the location of the xref table
    line = self.readNextEndLine(stream)
    startxref = int(line)
    line = self.readNextEndLine(stream)
    if line[:9] != b_("startxref"):
        raise utils.PdfReadError("startxref not found")

    # read all cross reference tables and their trailers
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while True:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == b_("x"):
            # standard cross-reference table
            ref = stream.read(4)
            if ref[:3] != b_("ref"):
                raise utils.PdfReadError("xref table read error")
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            # check if the first time looking at the xref table
            firsttime = True
            while True:
                num = readObject(stream, self)
                if firsttime and num != 0:
                    # if table not zero indexed, could be due to
                    # error from when PDF was created
                    # which will lead to mismatched indices later on
                    self.xrefIndex = num
                    warnings.warn(
                        "Xref table not zero-indexed. ID "
                        "numbers for objects will %sbe "
                        "corrected." % ("" if not self.strict else "not "),
                        utils.PdfReadWarning)
                firsttime = False
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)

                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes (as of PDF 1.7). However, some files have
                    # 21-byte entries (or more) due to the use of \r\n
                    # (CRLF) EOL's. Detect that case, and adjust the line
                    # until it does not begin with a \r (CR) or \n (LF).
                    while line[0] in b_("\x0D\x0A"):
                        stream.seek(-20 + 1, 1)
                        line = stream.read(20)

                    # On the other hand, some malformed PDF files
                    # use a single character EOL without a preceding
                    # space.  Detect that case, and seek the stream
                    # back one character.  (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in b_("0123456789t"):
                        stream.seek(-1, 1)

                    offset, generation = line[:16].split(b_(" "))
                    offset, generation = int(offset), int(generation)
                    self.xref.setdefault(generation, {})
                    if num in self.xref[generation]:
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        pass
                    else:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                trailertag = stream.read(7)
                if trailertag != b_("trailer"):
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            for key, value in newTrailer.items():
                self.trailer.setdefault(key, value)
            if "/Prev" in newTrailer:
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    # Each entry is len(entrySizes) fields whose widths
                    # come from /W; field 0 selects the entry type.
                    for i in range(len(entrySizes)):
                        d = streamData.read(entrySizes[i])
                        di = convertToInt(d, entrySizes[i])
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            if xref_type == 0:
                                # next_free_object = di
                                pass
                            elif xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            if xref_type == 0:
                                # next_generation = di
                                pass
                            elif xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 0:
                        pass
                    elif xref_type == 1:
                        if generation not in self.xref:
                            self.xref[generation] = {}
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        if num not in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if key in xrefstream and key not in self.trailer:
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if "/Prev" in xrefstream:
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            xref_loc = tmp.find(b_("xref"))
            if xref_loc != -1:
                startxref -= (10 - xref_loc)
                continue
            # No xref table found at the specified location.  Raise
            # explicitly: an ``assert False`` is stripped under ``-O`` and
            # would silently leave the reader with an empty xref.
            raise utils.PdfReadError(
                "Could not find xref table at specified location")

    # if not zero-indexed, verify that the table is correct
    # change it if necessary
    # NOTE(review): self.xrefIndex is only assigned above for a non-zero
    # first index; presumably it is initialised elsewhere -- confirm.
    if self.xrefIndex and not self.strict:
        loc = stream.tell()
        for gen in self.xref:
            if gen == 65535:
                continue
            for objid in self.xref[gen]:
                stream.seek(self.xref[gen][objid], 0)
                pid, pgen = self.readObjectHeader(stream)
                if pid == objid - self.xrefIndex:
                    self._zeroXref(gen)
                    break
                # if not, then either it's just plain wrong,
                # or the non-zero-index is actually correct
        stream.seek(loc, 0)  # return to where it was
def getObject(self, indirectReference):
    """Resolve ``indirectReference`` to the object it points to.

    Lookup order: the ``self.resolvedObjects`` cache, then object streams
    (``self.xref_objStm``, generation 0 only), then the byte offset recorded
    in ``self.xref``.  Objects read from the file are decrypted when the
    document is encrypted and then cached.

    :param indirectReference: object exposing ``idnum`` and ``generation``.
    :return: the resolved object, or ``None`` (after a warning) when the
        reference is not present in the cross-reference data.
    :raises utils.PdfReadError: if the object header found at the recorded
        offset carries a different object ID.
    :raises Exception: if the file is encrypted and no key is available.
    """
    ref_id = indirectReference.idnum
    ref_gen = indirectReference.generation

    retval = self.resolvedObjects.get(ref_gen, {}).get(ref_id, None)
    if retval is not None:
        return retval

    if ref_gen == 0 and ref_id in self.xref_objStm:
        # indirect reference to object in object stream
        # read the entire object stream into memory
        stmnum, idx = self.xref_objStm[ref_id]
        objStm = IndirectObject(stmnum, 0, self).getObject()
        assert objStm['/Type'] == '/ObjStm'
        assert idx < objStm['/N']
        streamData = StringIO(objStm.getData())
        for i in range(objStm['/N']):
            objnum = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            offset = NumberObject.readFromStream(streamData)
            readNonWhitespace(streamData)
            streamData.seek(-1, 1)
            # remember the position in the index, jump to the object
            # itself, then come back for the next index pair
            t = streamData.tell()
            streamData.seek(objStm['/First'] + offset, 0)
            obj = readObject(streamData, self)
            # setdefault: the generation-0 cache may not exist yet
            self.resolvedObjects.setdefault(0, {})[objnum] = obj
            streamData.seek(t, 0)
        return self.resolvedObjects[0][ref_id]

    # .get(): an unknown generation is reported like an unknown object
    # instead of escaping as a KeyError
    offsets = self.xref.get(ref_gen, {})
    if ref_id not in offsets:
        warnings.warn(
            "Object %d %d not defined." % (ref_id, ref_gen),
            utils.PdfReadWarning)
        return None

    start = offsets[ref_id]
    self.stream.seek(start, 0)
    idnum, generation = self.readObjectHeader(self.stream)
    # Plain comparison rather than try/assert: asserts are removed under
    # ``python -O``, which would disable this check entirely.
    if idnum != ref_id:
        if not self.xrefIndex:
            # some other problem
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does not "
                " match actual (%d %d)."
                % (ref_id, ref_gen, idnum, generation))
        if self.strict:
            # Xref table probably had bad indexes due to not
            # being zero-indexed
            raise utils.PdfReadError(
                "Expected object ID (%d %d) does "
                "not match actual (%d %d); xref "
                "table not zero-indexed."
                % (ref_id, ref_gen, idnum, generation))
        # non-strict: should not happen since the xref table is corrected
        # in non-strict mode; carry on with what was found
    assert generation == ref_gen
    retval = readObject(self.stream, self)

    # override encryption is used for the /Encrypt dictionary
    if not self._override_encryption and self.isEncrypted:
        # if we don't have the encryption key:
        if not hasattr(self, '_decryption_key'):
            raise Exception("file has not been decrypted")
        # otherwise, decrypt here: the per-object key is the MD5 of the
        # file key followed by the low 3 bytes of the object number and
        # the low 2 bytes of the generation, truncated to at most 16 bytes
        pack1 = struct.pack("<i", ref_id)[:3]
        pack2 = struct.pack("<i", ref_gen)[:2]
        key = self._decryption_key + pack1 + pack2
        assert len(key) == (len(self._decryption_key) + 5)
        md5_hash = md5(key).digest()
        key = md5_hash[:min(16, len(self._decryption_key) + 5)]
        retval = self._decryptObject(retval, key)

    self.cacheIndirectObject(generation, idnum, retval)
    return retval