Пример #1
0
 def _readInlineImage(self, stream):
     """Read an inline image whose ``BI`` operator was just consumed.

     Key/value pairs are read with ``readObject`` until the ``ID``
     operator is reached; after that, characters are collected one at a
     time until the two-character sequence ``EI`` is seen.

     Returns a dict with two entries: ``"settings"`` (a DictionaryObject
     holding the key/value pairs) and ``"data"`` (the collected image
     data as a string).

     Raises utils.PdfReadError if the stream ends before ``ID`` or ``EI``
     is found (previously this looped forever).

     NOTE(review): the first ``EI`` pair found in the data ends the image,
     even if it is part of the binary payload -- confirm whether callers
     need stricter end-of-image detection.
     """
     # first read the dictionary of settings.
     settings = DictionaryObject()
     while True:
         tok = readNonWhitespace(stream)
         if not tok:
             # end of stream: there is nothing to step back over and no
             # "ID" operator will ever show up
             raise utils.PdfReadError(
                 "Unexpected end of stream in inline image dictionary")
         stream.seek(-1, 1)
         if tok == "I":
             # "ID" - begin of image data
             break
         key = readObject(stream, self.pdf)
         # skip whitespace between key and value; only step back when a
         # character was actually consumed
         if readNonWhitespace(stream):
             stream.seek(-1, 1)
         value = readObject(stream, self.pdf)
         settings[key] = value
     # left at beginning of ID; consume "ID" and the one character after it
     marker = stream.read(3)
     assert marker[:2] == "ID"
     # collect pieces in a list and join once instead of repeated "+="
     pieces = []
     while True:
         tok = stream.read(1)
         if not tok:
             raise utils.PdfReadError(
                 "Unexpected end of stream in inline image data")
         if tok == "E":
             following = stream.read(1)
             if following == "I":
                 # "EI" - end of image data
                 break
             # a lone "E" belongs to the data; put the following character
             # back (if there was one) so it is examined on its own
             if following:
                 stream.seek(-1, 1)
         pieces.append(tok)
     # skip whitespace after "EI"; at end of stream nothing was consumed,
     # so stepping back would land on the "I" of "EI"
     if readNonWhitespace(stream):
         stream.seek(-1, 1)
     return {"settings": settings, "data": "".join(pieces)}
Пример #2
0
 def _readInlineImage(self, stream):
     """Read an inline image whose ``BI`` operator was just consumed.

     Key/value pairs are read with ``readObject`` until the ``ID``
     operator is reached; after that, characters are collected one at a
     time until the two-character sequence ``EI`` is seen.

     Returns a dict with two entries: ``"settings"`` (a DictionaryObject
     holding the key/value pairs) and ``"data"`` (the collected image
     data as a string).

     Raises utils.PdfReadError if the stream ends before ``ID`` or ``EI``
     is found (previously this looped forever).

     NOTE(review): the first ``EI`` pair found in the data ends the image,
     even if it is part of the binary payload -- confirm whether callers
     need stricter end-of-image detection.
     """
     # first read the dictionary of settings.
     settings = DictionaryObject()
     while True:
         tok = readNonWhitespace(stream)
         if not tok:
             # end of stream: there is nothing to step back over and no
             # "ID" operator will ever show up
             raise utils.PdfReadError(
                 "Unexpected end of stream in inline image dictionary")
         stream.seek(-1, 1)
         if tok == "I":
             # "ID" - begin of image data
             break
         key = readObject(stream, self.pdf)
         # skip whitespace between key and value; only step back when a
         # character was actually consumed
         if readNonWhitespace(stream):
             stream.seek(-1, 1)
         value = readObject(stream, self.pdf)
         settings[key] = value
     # left at beginning of ID; consume "ID" and the one character after it
     marker = stream.read(3)
     assert marker[:2] == "ID"
     # collect pieces in a list and join once instead of repeated "+="
     pieces = []
     while True:
         tok = stream.read(1)
         if not tok:
             raise utils.PdfReadError(
                 "Unexpected end of stream in inline image data")
         if tok == "E":
             following = stream.read(1)
             if following == "I":
                 # "EI" - end of image data
                 break
             # a lone "E" belongs to the data; put the following character
             # back (if there was one) so it is examined on its own
             if following:
                 stream.seek(-1, 1)
         pieces.append(tok)
     # skip whitespace after "EI"; at end of stream nothing was consumed,
     # so stepping back would land on the "I" of "EI"
     if readNonWhitespace(stream):
         stream.seek(-1, 1)
     return {"settings": settings, "data": "".join(pieces)}
Пример #3
0
 def __parseContentStream(self, stream):
     """Split a content stream into operations stored on ``self.operations``.

     The stream is rewound and read to its end.  Objects are gathered as
     operands until an operator is met, at which point the pair
     ``(operands, operator)`` is appended to ``self.operations``.  An
     inline image (``BI`` operator) is appended as
     ``(image_dict, "INLINE IMAGE")`` where ``image_dict`` is the value
     returned by ``_readInlineImage``.
     """
     stream.seek(0, 0)
     operands = []
     while True:
         peek = readNonWhitespace(stream)
         if peek == '':
             break
         stream.seek(-1, 1)
         if peek.isalpha() or peek == "'" or peek == '"':
             # read the operator name up to whitespace, a delimiter or
             # the end of the stream
             chars = []
             while True:
                 tok = stream.read(1)
                 if tok == '':
                     # end of stream: test this first so that we never
                     # step back when nothing was read
                     break
                 if tok.isspace() or tok in NameObject.delimiterCharacters:
                     stream.seek(-1, 1)
                     break
                 chars.append(tok)
             operator = "".join(chars)
             if operator == "BI":
                 # begin inline image - a completely different parsing
                 # mechanism is required, of course... thanks buddy...
                 assert operands == []
                 ii = self._readInlineImage(stream)
                 self.operations.append((ii, "INLINE IMAGE"))
             else:
                 self.operations.append((operands, operator))
                 operands = []
         elif peek == '%':
             # If we encounter a comment in the content stream, we have to
             # handle it here.  Typically, readObject will handle
             # encountering a comment -- but readObject assumes that
             # following the comment must be the object we're trying to
             # read.  In this case, it could be an operator instead.
             # The empty string (end of stream) also ends the comment;
             # without it a trailing comment with no EOL loops forever.
             while peek not in ('\r', '\n', ''):
                 peek = stream.read(1)
         else:
             operands.append(readObject(stream, None))
Пример #4
0
 def __parseContentStream(self, stream):
     """Split a content stream into operations stored on ``self.operations``.

     The stream is rewound and read to its end.  Objects are gathered as
     operands until an operator is met, at which point the pair
     ``(operands, operator)`` is appended to ``self.operations``.  An
     inline image (``BI`` operator) is appended as
     ``(image_dict, "INLINE IMAGE")`` where ``image_dict`` is the value
     returned by ``_readInlineImage``.
     """
     stream.seek(0, 0)
     operands = []
     while True:
         peek = readNonWhitespace(stream)
         if peek == '':
             break
         stream.seek(-1, 1)
         if peek.isalpha() or peek == "'" or peek == '"':
             # read the operator name up to whitespace, a delimiter or
             # the end of the stream
             chars = []
             while True:
                 tok = stream.read(1)
                 if tok == '':
                     # end of stream: test this first so that we never
                     # step back when nothing was read
                     break
                 if tok.isspace() or tok in NameObject.delimiterCharacters:
                     stream.seek(-1, 1)
                     break
                 chars.append(tok)
             operator = "".join(chars)
             if operator == "BI":
                 # begin inline image - a completely different parsing
                 # mechanism is required, of course... thanks buddy...
                 assert operands == []
                 ii = self._readInlineImage(stream)
                 self.operations.append((ii, "INLINE IMAGE"))
             else:
                 self.operations.append((operands, operator))
                 operands = []
         elif peek == '%':
             # If we encounter a comment in the content stream, we have to
             # handle it here.  Typically, readObject will handle
             # encountering a comment -- but readObject assumes that
             # following the comment must be the object we're trying to
             # read.  In this case, it could be an operator instead.
             # The empty string (end of stream) also ends the comment;
             # without it a trailing comment with no EOL loops forever.
             while peek not in ('\r', '\n', ''):
                 peek = stream.read(1)
         else:
             operands.append(readObject(stream, None))
Пример #5
0
    def read(self, stream):
        """Load the cross-reference data and trailer from ``stream``.

        Reading starts at the end of the file: the ``%%EOF`` marker and the
        ``startxref`` offset are located first, then every cross-reference
        section (classic table or PDF 1.5+ cross-reference stream) is
        followed through its ``/Prev`` entry.  Results are stored in:

        * ``self.xref`` -- generation -> {object number -> byte offset}
        * ``self.xref_objStm`` -- object number -> [stream number, index]
        * ``self.trailer`` -- merged trailer entries; sections read first
          take precedence

        Raises utils.PdfReadError when the EOF marker, the ``startxref``
        keyword or a cross-reference section cannot be found.
        """
        # start at the end:
        stream.seek(-1, 2)
        line = b_('')
        while not line:
            line = self.readNextEndLine(stream)
        if line[:5] != b_("%%EOF"):
            raise utils.PdfReadError("EOF marker not found")
        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != b_("startxref"):
            raise utils.PdfReadError("startxref not found")

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while True:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == b_("x"):
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != b_("ref"):
                    raise utils.PdfReadError("xref table read error")
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                # check if the first time looking at the xref table
                firsttime = True
                while True:
                    num = readObject(stream, self)
                    if firsttime and num != 0:
                        self.xrefIndex = num
                        warnings.warn("Xref table not zero-indexed. ID "
                                      "numbers for objects will %sbe "
                                      "corrected." %
                                      ("" if not self.strict else "not "),
                                      utils.PdfReadWarning)
                        # if table not zero indexed, could be due to
                        # error from when PDF was created
                        # which will lead to mismatched indices later on
                    firsttime = False
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes (as of PDF 1.7). However, some files have
                        # 21-byte entries (or more) due to the use of \r\n
                        # (CRLF) EOL's. Detect that case, and adjust the line
                        # until it does not begin with a \r (CR) or \n (LF).
                        while line[0] in b_("\x0D\x0A"):
                            stream.seek(-20 + 1, 1)
                            line = stream.read(20)
                        # On the other hand, some malformed PDF files
                        # use a single character EOL without a preceding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in b_("0123456789t"):
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(b_(" "))
                        offset, generation = int(offset), int(generation)
                        self.xref.setdefault(generation, {})
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != b_("trailer"):
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                for key, value in newTrailer.items():
                    self.trailer.setdefault(key, value)
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get("/Index",
                                           [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        for i, width in enumerate(entrySizes):
                            d = streamData.read(width)
                            di = convertToInt(d, width)
                            if i == 0:
                                # The PDF specification says a zero-width
                                # type field is absent from the stream and
                                # defaults to type 1.
                                xref_type = di if width else 1
                            elif i == 1:
                                if xref_type == 0:
                                    # next_free_object = di
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # next_generation = di
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if num not in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if num not in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find(b_("xref"))
                if xref_loc == -1:
                    # no xref table found at specified location; raise a
                    # real error instead of an assert, which is stripped
                    # when Python runs with -O
                    raise utils.PdfReadError(
                        "Could not find xref table at specified location")
                startxref -= (10 - xref_loc)
        # if not zero-indexed, verify that the table is correct
        # change it if necessary
        if self.xrefIndex and not self.strict:
            loc = stream.tell()
            for gen in self.xref:
                if gen == 65535:
                    continue
                for objnum in self.xref[gen]:
                    stream.seek(self.xref[gen][objnum], 0)
                    pid, _ = self.readObjectHeader(stream)
                    if pid == objnum - self.xrefIndex:
                        self._zeroXref(gen)
                        break
                    # if not, then either it's just plain wrong,
                    # or the non-zero-index is actually correct
            stream.seek(loc, 0)  # return to where it was
Пример #6
0
    def getObject(self, indirectReference):
        """Return the object that ``indirectReference`` points to.

        Lookup order:

        1. the ``self.resolvedObjects`` cache;
        2. an object stream, when the object number is listed in
           ``self.xref_objStm`` (every object of that stream is read and
           cached under generation 0);
        3. the byte offset recorded in ``self.xref``, after which the
           object is read from ``self.stream``, decrypted if the file is
           encrypted, and cached.

        Returns None (after emitting a utils.PdfReadWarning) when the
        object is not listed in the cross-reference data.

        Raises utils.PdfReadError when the object header found at the
        recorded offset carries a different object number, except in
        non-strict mode with a non-zero-indexed xref table.  Raises
        Exception when the file is encrypted and has not been decrypted.
        """
        retval = self.resolvedObjects.get(indirectReference.generation,
                                          {}).get(indirectReference.idnum,
                                                  None)
        if retval is not None:
            return retval
        if indirectReference.generation == 0 \
                and indirectReference.idnum in self.xref_objStm:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self.xref_objStm[indirectReference.idnum]
            objStm = IndirectObject(stmnum, 0, self).getObject()
            assert objStm['/Type'] == '/ObjStm'
            assert idx < objStm['/N']
            streamData = StringIO(objStm.getData())
            for i in range(objStm['/N']):
                objnum = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                offset = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                # remember the position in the index part, jump to the
                # object itself, then come back for the next index pair
                t = streamData.tell()
                streamData.seek(objStm['/First'] + offset, 0)
                obj = readObject(streamData, self)
                # create the generation-0 cache on demand so that a
                # missing entry does not raise KeyError
                self.resolvedObjects.setdefault(0, {})[objnum] = obj
                streamData.seek(t, 0)
            return self.resolvedObjects[0][indirectReference.idnum]
        # .get() so that a generation never seen in any xref section is
        # reported as "not defined" instead of raising KeyError
        if indirectReference.idnum \
                not in self.xref.get(indirectReference.generation, {}):
            warnings.warn("Object %d %d not defined." % (
                indirectReference.idnum, indirectReference.generation),
                utils.PdfReadWarning)
            return None
        start = self.xref[indirectReference.generation][
            indirectReference.idnum]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        # explicit comparison rather than assert/except AssertionError,
        # which is silently disabled when Python runs with -O
        if idnum != indirectReference.idnum:
            if self.xrefIndex:
                # Xref table probably had bad indexes due to not
                # being zero-indexed
                if self.strict:
                    raise utils.PdfReadError(
                        "Expected object ID (%d %d) does "
                        "not match actual (%d %d); xref "
                        "table not zero-indexed." % (
                            indirectReference.idnum,
                            indirectReference.generation,
                            idnum,
                            generation))
                # non-strict mode: should not happen since the xref table
                # is corrected in non-strict mode; carry on regardless
            else:  # some other problem
                raise utils.PdfReadError("Expected object ID (%d %d) does not "
                                         " match actual (%d %d)." % (
                                             indirectReference.idnum,
                                             indirectReference.generation,
                                             idnum, generation))
        assert generation == indirectReference.generation
        retval = readObject(self.stream, self)
        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            # per-object key: file key + low 3 bytes of the object number
            # + low 2 bytes of the generation, hashed with MD5
            pack1 = struct.pack("<i", indirectReference.idnum)[:3]
            pack2 = struct.pack("<i", indirectReference.generation)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)

        self.cacheIndirectObject(generation, idnum, retval)
        return retval
Пример #7
0
    def read(self, stream):
        """Load the cross-reference data and trailer from ``stream``.

        Reading starts at the end of the file: the ``%%EOF`` marker and the
        ``startxref`` offset are located first, then every cross-reference
        section (classic table or PDF 1.5+ cross-reference stream) is
        followed through its ``/Prev`` entry.  Results are stored in:

        * ``self.xref`` -- generation -> {object number -> byte offset}
        * ``self.xref_objStm`` -- object number -> [stream number, index]
        * ``self.trailer`` -- merged trailer entries; sections read first
          take precedence

        Raises utils.PdfReadError when the EOF marker, the ``startxref``
        keyword or a cross-reference section cannot be found.
        """
        # start at the end:
        stream.seek(-1, 2)
        line = b_('')
        while not line:
            line = self.readNextEndLine(stream)
        if line[:5] != b_("%%EOF"):
            raise utils.PdfReadError("EOF marker not found")
        # find startxref entry - the location of the xref table
        line = self.readNextEndLine(stream)
        startxref = int(line)
        line = self.readNextEndLine(stream)
        if line[:9] != b_("startxref"):
            raise utils.PdfReadError("startxref not found")

        # read all cross reference tables and their trailers
        self.xref = {}
        self.xref_objStm = {}
        self.trailer = DictionaryObject()
        while True:
            # load the xref table
            stream.seek(startxref, 0)
            x = stream.read(1)
            if x == b_("x"):
                # standard cross-reference table
                ref = stream.read(4)
                if ref[:3] != b_("ref"):
                    raise utils.PdfReadError("xref table read error")
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                # check if the first time looking at the xref table
                firsttime = True
                while True:
                    num = readObject(stream, self)
                    if firsttime and num != 0:
                        self.xrefIndex = num
                        warnings.warn(
                            "Xref table not zero-indexed. ID "
                            "numbers for objects will %sbe "
                            "corrected." % ("" if not self.strict else "not "),
                            utils.PdfReadWarning)
                        # if table not zero indexed, could be due to
                        # error from when PDF was created
                        # which will lead to mismatched indices later on
                    firsttime = False
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    size = readObject(stream, self)
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    cnt = 0
                    while cnt < size:
                        line = stream.read(20)
                        # It's very clear in section 3.4.3 of the PDF spec
                        # that all cross-reference table lines are a fixed
                        # 20 bytes (as of PDF 1.7). However, some files have
                        # 21-byte entries (or more) due to the use of \r\n
                        # (CRLF) EOL's. Detect that case, and adjust the line
                        # until it does not begin with a \r (CR) or \n (LF).
                        while line[0] in b_("\x0D\x0A"):
                            stream.seek(-20 + 1, 1)
                            line = stream.read(20)
                        # On the other hand, some malformed PDF files
                        # use a single character EOL without a preceding
                        # space.  Detect that case, and seek the stream
                        # back one character.  (0-9 means we've bled into
                        # the next xref entry, t means we've bled into the
                        # text "trailer"):
                        if line[-1] in b_("0123456789t"):
                            stream.seek(-1, 1)
                        offset, generation = line[:16].split(b_(" "))
                        offset, generation = int(offset), int(generation)
                        self.xref.setdefault(generation, {})
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        if num not in self.xref[generation]:
                            self.xref[generation][num] = offset
                        cnt += 1
                        num += 1
                    readNonWhitespace(stream)
                    stream.seek(-1, 1)
                    trailertag = stream.read(7)
                    if trailertag != b_("trailer"):
                        # more xrefs!
                        stream.seek(-7, 1)
                    else:
                        break
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                newTrailer = readObject(stream, self)
                for key, value in newTrailer.items():
                    self.trailer.setdefault(key, value)
                if "/Prev" in newTrailer:
                    startxref = newTrailer["/Prev"]
                else:
                    break
            elif x.isdigit():
                # PDF 1.5+ Cross-Reference Stream
                stream.seek(-1, 1)
                idnum, generation = self.readObjectHeader(stream)
                xrefstream = readObject(stream, self)
                assert xrefstream["/Type"] == "/XRef"
                self.cacheIndirectObject(generation, idnum, xrefstream)
                streamData = StringIO(xrefstream.getData())
                idx_pairs = xrefstream.get("/Index",
                                           [0, xrefstream.get("/Size")])
                entrySizes = xrefstream.get("/W")
                for num, size in self._pairs(idx_pairs):
                    cnt = 0
                    while cnt < size:
                        for i, width in enumerate(entrySizes):
                            d = streamData.read(width)
                            di = convertToInt(d, width)
                            if i == 0:
                                # The PDF specification says a zero-width
                                # type field is absent from the stream and
                                # defaults to type 1.
                                xref_type = di if width else 1
                            elif i == 1:
                                if xref_type == 0:
                                    # next_free_object = di
                                    pass
                                elif xref_type == 1:
                                    byte_offset = di
                                elif xref_type == 2:
                                    objstr_num = di
                            elif i == 2:
                                if xref_type == 0:
                                    # next_generation = di
                                    pass
                                elif xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 0:
                            pass
                        elif xref_type == 1:
                            if generation not in self.xref:
                                self.xref[generation] = {}
                            if num not in self.xref[generation]:
                                self.xref[generation][num] = byte_offset
                        elif xref_type == 2:
                            if num not in self.xref_objStm:
                                self.xref_objStm[num] = [objstr_num, obstr_idx]
                        cnt += 1
                        num += 1
                trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
                for key in trailerKeys:
                    if key in xrefstream and key not in self.trailer:
                        self.trailer[NameObject(key)] = xrefstream.raw_get(key)
                if "/Prev" in xrefstream:
                    startxref = xrefstream["/Prev"]
                else:
                    break
            else:
                # bad xref character at startxref.  Let's see if we can find
                # the xref table nearby, as we've observed this error with an
                # off-by-one before.
                stream.seek(-11, 1)
                tmp = stream.read(20)
                xref_loc = tmp.find(b_("xref"))
                if xref_loc == -1:
                    # no xref table found at specified location; raise a
                    # real error instead of an assert, which is stripped
                    # when Python runs with -O
                    raise utils.PdfReadError(
                        "Could not find xref table at specified location")
                startxref -= (10 - xref_loc)
        # if not zero-indexed, verify that the table is correct
        # change it if necessary
        if self.xrefIndex and not self.strict:
            loc = stream.tell()
            for gen in self.xref:
                if gen == 65535:
                    continue
                for objnum in self.xref[gen]:
                    stream.seek(self.xref[gen][objnum], 0)
                    pid, _ = self.readObjectHeader(stream)
                    if pid == objnum - self.xrefIndex:
                        self._zeroXref(gen)
                        break
                    # if not, then either it's just plain wrong,
                    # or the non-zero-index is actually correct
            stream.seek(loc, 0)  # return to where it was
Пример #8
0
    def getObject(self, indirectReference):
        """Return the object that ``indirectReference`` points to.

        Lookup order:

        1. the ``self.resolvedObjects`` cache;
        2. an object stream, when the object number is listed in
           ``self.xref_objStm`` (every object of that stream is read and
           cached under generation 0);
        3. the byte offset recorded in ``self.xref``, after which the
           object is read from ``self.stream``, decrypted if the file is
           encrypted, and cached.

        Returns None (after emitting a utils.PdfReadWarning) when the
        object is not listed in the cross-reference data.

        Raises utils.PdfReadError when the object header found at the
        recorded offset carries a different object number, except in
        non-strict mode with a non-zero-indexed xref table.  Raises
        Exception when the file is encrypted and has not been decrypted.
        """
        retval = self.resolvedObjects.get(indirectReference.generation,
                                          {}).get(indirectReference.idnum,
                                                  None)
        if retval is not None:
            return retval
        if indirectReference.generation == 0 \
                and indirectReference.idnum in self.xref_objStm:
            # indirect reference to object in object stream
            # read the entire object stream into memory
            stmnum, idx = self.xref_objStm[indirectReference.idnum]
            objStm = IndirectObject(stmnum, 0, self).getObject()
            assert objStm['/Type'] == '/ObjStm'
            assert idx < objStm['/N']
            streamData = StringIO(objStm.getData())
            for i in range(objStm['/N']):
                objnum = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                offset = NumberObject.readFromStream(streamData)
                readNonWhitespace(streamData)
                streamData.seek(-1, 1)
                # remember the position in the index part, jump to the
                # object itself, then come back for the next index pair
                t = streamData.tell()
                streamData.seek(objStm['/First'] + offset, 0)
                obj = readObject(streamData, self)
                # create the generation-0 cache on demand so that a
                # missing entry does not raise KeyError
                self.resolvedObjects.setdefault(0, {})[objnum] = obj
                streamData.seek(t, 0)
            return self.resolvedObjects[0][indirectReference.idnum]
        # .get() so that a generation never seen in any xref section is
        # reported as "not defined" instead of raising KeyError
        if indirectReference.idnum \
                not in self.xref.get(indirectReference.generation, {}):
            warnings.warn(
                "Object %d %d not defined." %
                (indirectReference.idnum, indirectReference.generation),
                utils.PdfReadWarning)
            return None
        start = self.xref[indirectReference.generation][
            indirectReference.idnum]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        # explicit comparison rather than assert/except AssertionError,
        # which is silently disabled when Python runs with -O
        if idnum != indirectReference.idnum:
            if self.xrefIndex:
                # Xref table probably had bad indexes due to not
                # being zero-indexed
                if self.strict:
                    raise utils.PdfReadError(
                        "Expected object ID (%d %d) does "
                        "not match actual (%d %d); xref "
                        "table not zero-indexed." %
                        (indirectReference.idnum, indirectReference.generation,
                         idnum, generation))
                # non-strict mode: should not happen since the xref table
                # is corrected in non-strict mode; carry on regardless
            else:  # some other problem
                raise utils.PdfReadError(
                    "Expected object ID (%d %d) does not "
                    " match actual (%d %d)." %
                    (indirectReference.idnum, indirectReference.generation,
                     idnum, generation))
        assert generation == indirectReference.generation
        retval = readObject(self.stream, self)
        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise Exception("file has not been decrypted")
            # otherwise, decrypt here...
            # per-object key: file key + low 3 bytes of the object number
            # + low 2 bytes of the generation, hashed with MD5
            pack1 = struct.pack("<i", indirectReference.idnum)[:3]
            pack2 = struct.pack("<i", indirectReference.generation)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)

        self.cacheIndirectObject(generation, idnum, retval)
        return retval