Пример #1
0
 def readFromStream(stream, pdf):
     """Read a name token from ``stream`` and wrap it in a NameObject.

     The first byte must equal ``NameObject.surfix``; the remainder is
     consumed up to the next match of ``NameObject.delimiterPattern``.
     The raw bytes are decoded as UTF-8, falling back to GBK.

     :param stream: readable stream positioned at the start of the name.
     :param pdf: reader object; only its ``strict`` flag is consulted.
     :return: a NameObject built from the decoded text, or from the raw
         bytes when decoding fails and ``pdf.strict`` is false.
     :raises PdfReadError: if the leading byte is wrong, or if the name
         cannot be decoded and ``pdf.strict`` is true.
     """
     lead = stream.read(1)
     if lead != NameObject.surfix:
         raise PdfReadError("name read error")
     raw = lead + utils.readUntilRegex(stream,
                                       NameObject.delimiterPattern,
                                       ignore_eof=True)
     codec_errors = (UnicodeEncodeError, UnicodeDecodeError)
     try:
         try:
             text = raw.decode('utf-8')
         except codec_errors:
             text = raw.decode('gbk')
         return NameObject(text)
     except codec_errors:
         # Name objects should represent irregular characters
         # with a '#' followed by the symbol's hex number
         if pdf.strict:
             raise PdfReadError("Illegal character in Name Object")
         warnings.warn("Illegal character in Name Object",
                       utils.PdfReadWarning)
         return NameObject(raw)
Пример #2
0
 def readFromStream(stream, pdf):
     """Read an indirect reference (``<id> <generation> R``) from ``stream``.

     :param stream: readable stream positioned at the first byte of the
         object number.
     :param pdf: passed through unchanged to the IndirectObject.
     :return: ``IndirectObject(int(id), int(generation), pdf)``.
     :raises PdfStreamError: if the stream ends while either number is
         being read.
     :raises PdfReadError: if the next non-whitespace byte after the
         generation number is not ``R``.
     """
     def _read_field(skip_leading_space):
         # Collect bytes up to the next whitespace byte.  Leading
         # whitespace ends the field immediately unless
         # ``skip_leading_space`` is set, in which case it is ignored.
         collected = b_("")
         while True:
             ch = stream.read(1)
             if not ch:
                 raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
             if not ch.isspace():
                 collected += ch
             elif collected or not skip_leading_space:
                 return collected

     idnum = _read_field(False)
     generation = _read_field(True)
     marker = readNonWhitespace(stream)
     if marker != b_("R"):
         raise PdfReadError(
             "Error reading indirect object reference at byte %s" %
             utils.hexStr(stream.tell()))
     return IndirectObject(int(idnum), int(generation), pdf)
Пример #3
0
 def decode(self):
     """ Decode the LZW code sequence delivered by ``self.nextCode()``.

     algorithm derived from:
     http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
     and the PDFReference

     Decoded fragments are gathered in a list and joined once at the
     end, so the output is not rebuilt on every code.

     :return: the concatenation of every decoded dictionary entry.
     :raises PdfReadError: if ``nextCode()`` returns -1 before the stop
         code has been seen.
     """
     cW = self.CLEARDICT
     pieces = []
     while True:
         pW = cW
         cW = self.nextCode()
         if cW == -1:
             raise PdfReadError("Missed the stop code in LZWDecode!")
         if cW == self.STOP:
             break
         if cW == self.CLEARDICT:
             self.resetDict()
             continue
         if pW == self.CLEARDICT:
             # First code after a clear: emitted as-is, no new entry.
             pieces.append(self.dict[cW])
             continue
         previous = self.dict[pW]
         if cW < self.dictlen:
             # Known code: emit it, new entry is previous + its first symbol.
             current = self.dict[cW]
             pieces.append(current)
             entry = previous + current[0]
         else:
             # Code not in the table yet: it must be previous + previous[0].
             entry = previous + previous[0]
             pieces.append(entry)
         self.dict[self.dictlen] = entry
         self.dictlen += 1
         # Widen the code size just before the table outgrows it (max 12).
         if (self.dictlen >= (1 << self.bitspercode) - 1
                 and self.bitspercode < 12):
             self.bitspercode += 1
     return "".join(pieces)
Пример #4
0
    def decode(data, decodeParms):
        """
        Decompress flate-encoded data and undo the optional predictor.

        :param data: flate-encoded data.
        :param decodeParms: a dictionary of decode parameters, or an array
            of such dictionaries; only the "/Predictor" entry and the
            ``LZW.COLUMNS`` entry are consulted.
        :return: the flate-decoded data.
        :raises PdfReadError: if the predictor is neither 1 (none) nor a
            PNG predictor (10-15).
        """
        data = decompress(data)
        predictor = 1
        # The /Columns param. has 1 as the default value; see ISO 32000,
        # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8
        columns = 1

        if decodeParms:
            try:
                from PyPDF2.generic import ArrayObject
                if isinstance(decodeParms, ArrayObject):
                    # An array offers no .get(), so both settings are
                    # gathered here from whichever members define them.
                    for decodeParm in decodeParms:
                        if '/Predictor' in decodeParm:
                            predictor = decodeParm['/Predictor']
                        if LZW.COLUMNS in decodeParm:
                            columns = decodeParm[LZW.COLUMNS]
                else:
                    predictor = decodeParms.get("/Predictor", 1)
                    columns = decodeParms.get(LZW.COLUMNS, 1)
            except AttributeError:
                pass  # Usually an array with a null object was read

        # predictor 1 == no predictor
        if predictor == 1:
            return data
        # PNG prediction:
        if 10 <= predictor <= 15:
            return FlateDecode._decode_png_prediction(data, columns)
        # unsupported predictor
        raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)
Пример #5
0
 def _decode_png_prediction(data, columns):
     """Reverse PNG row filtering on ``data``.

     Each row is one filter-type byte followed by ``columns`` data bytes.
     The filter byte is removed and the data bytes are reconstructed
     from the byte to the left and/or the row above, according to the
     filter type (0 none, 1 left, 2 up, 3 average, 4 Paeth).  The left
     neighbour is always taken one byte back.

     :param data: filtered bytes; length must be a multiple of
         ``columns + 1``.
     :param columns: number of data bytes per row.
     :return: the contents of the output buffer holding the unfiltered rows.
     :raises PdfReadError: if the data does not divide into whole rows
         or a row uses a filter type other than 0-4.
     """
     output = StringIO()
     # PNG prediction can vary from row to row
     rowlength = columns + 1
     # Explicit check instead of ``assert``, which disappears under -O.
     if len(data) % rowlength != 0:
         raise PdfReadError("Image data is not rectangular")
     prev_rowdata = (0,) * rowlength
     for row in range(len(data) // rowlength):
         start = row * rowlength
         rowdata = [ord_(x) for x in data[start:start + rowlength]]
         filterByte = rowdata[0]
         if filterByte == 0:
             pass
         elif filterByte == 1:
             # Index 1 has no left neighbour, so reconstruction starts at 2.
             for i in range(2, rowlength):
                 rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
         elif filterByte == 2:
             for i in range(1, rowlength):
                 rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
         elif filterByte == 3:
             for i in range(1, rowlength):
                 left = rowdata[i - 1] if i > 1 else 0
                 # Integer floor of the mean of left and up.
                 average = (left + prev_rowdata[i]) // 2
                 rowdata[i] = (rowdata[i] + average) % 256
         elif filterByte == 4:
             for i in range(1, rowlength):
                 left = rowdata[i - 1] if i > 1 else 0
                 up = prev_rowdata[i]
                 up_left = prev_rowdata[i - 1] if i > 1 else 0
                 paeth = paethPredictor(left, up, up_left)
                 rowdata[i] = (rowdata[i] + paeth) % 256
         else:
             # unsupported PNG filter
             raise PdfReadError("Unsupported PNG filter %r" % filterByte)
         prev_rowdata = rowdata
         output.write(''.join([chr(x) for x in rowdata[1:]]))
     return output.getvalue()
Пример #6
0
 def decode(data, decodeParms):
     """Decompress flate-encoded ``data`` and undo a PNG predictor if set.

     :param data: flate-encoded data.
     :param decodeParms: a dictionary of decode parameters, or an array
         of such dictionaries; only "/Predictor" and ``LZW.COLUMNS`` are
         read.
     :return: the decoded data.
     :raises PdfReadError: if the predictor is neither 1 (none) nor in
         the PNG range 10-15.
     """
     data = decompress(data)
     predictor = 1
     # /Columns defaults to 1 when absent (ISO 32000, §7.4.4.3, Table 8),
     # so a missing entry must not raise.
     columns = 1
     if decodeParms:
         try:
             from PyPDF2.generic import ArrayObject
             if isinstance(decodeParms, ArrayObject):
                 # An array cannot be indexed by key; read both settings
                 # from whichever member dictionaries carry them.
                 for decodeParm in decodeParms:
                     if '/Predictor' in decodeParm:
                         predictor = decodeParm['/Predictor']
                     if LZW.COLUMNS in decodeParm:
                         columns = decodeParm[LZW.COLUMNS]
             else:
                 predictor = decodeParms.get("/Predictor", 1)
                 if LZW.COLUMNS in decodeParms:
                     columns = decodeParms[LZW.COLUMNS]
         except AttributeError:
             pass  # usually an array with a null object was read
     # predictor 1 == no predictor
     if predictor != 1:
         # PNG prediction:
         if 10 <= predictor <= 15:
             data = FlateDecode._decode_png_prediction(data, columns)
         else:
             # unsupported predictor
             raise PdfReadError("Unsupported flatedecode predictor %r" %
                                predictor)
     return data
Пример #7
0
 def readFromStream(stream):
     """Read a boolean keyword from ``stream``.

     Four bytes are read.  ``true`` yields ``BooleanObject(True)``;
     ``fals`` yields ``BooleanObject(False)`` after one further byte has
     been consumed (that byte is not inspected).

     :raises PdfReadError: if the four bytes are neither keyword prefix.
     """
     head = stream.read(4)
     if head == b_("true"):
         return BooleanObject(True)
     if head != b_("fals"):
         raise PdfReadError('Could not read Boolean object')
     stream.read(1)  # skip the byte following "fals"
     return BooleanObject(False)
Пример #8
0
 def __init__(self, stream: ContentStream) -> None:
     """Parse the XML held by ``stream`` and locate its RDF root element.

     :param stream: object whose ``get_data()`` result is handed to
         ``parseString``.
     :raises PdfReadError: if the XML parser reports an ``ExpatError``;
         the parser error is attached as the cause.
     """
     self.stream = stream
     try:
         data = self.stream.get_data()
         doc_root: Document = parseString(data)
     except ExpatError as e:
         # Chain explicitly so the parser failure stays on the traceback.
         raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
     # First element named "RDF" in the RDF namespace.
     self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
         RDF_NAMESPACE, "RDF")[0]
     self.cache: Dict[Any, Any] = {}
Пример #9
0
def test_DictionaryObject_read_from_stream_stream_stream_valid(
        strict, length, should_fail):
    """Parse a dictionary followed by a stream with a declared /Length.

    The body always ends by raising ``PdfReadError("__ALLGOOD__")``, so
    ``pytest.raises`` captures either that sentinel (parsing and the
    checks succeeded) or a genuine read error.  The final assertion
    requires that exactly one of "expected to fail" and "sentinel seen"
    is true.
    """
    raw = b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length
    stream = BytesIO(raw)

    class Tst:  # to replace pdf
        strict = True

    pdf = Tst()
    pdf.strict = strict
    with pytest.raises(PdfReadError) as caught:
        parsed = DictionaryObject.read_from_stream(stream, pdf)
        # TODO: What should happen with the stream?
        assert parsed == {"/S": "/GoTo"}
        if length in (6, 10):
            assert b"BT /F1" in parsed._StreamObject__data
        raise PdfReadError("__ALLGOOD__")
    outcome = caught.value
    print(outcome)
    reached_end = outcome.args[0] == "__ALLGOOD__"
    assert should_fail ^ reached_end
Пример #10
0
 def readFromStream(stream, pdf):
     """Read an array (``[ ... ]``) from ``stream``.

     :param stream: seekable stream positioned at the opening bracket.
     :param pdf: passed through to ``readObject`` for every element.
     :return: an ArrayObject holding the elements in reading order.
     :raises PdfReadError: if the first byte is not ``[``.
     """
     items = ArrayObject()
     opener = stream.read(1)
     if opener != b_("["):
         raise PdfReadError("Could not read array")
     while True:
         # Advance past whitespace, then step back onto the first
         # non-whitespace byte.
         ch = stream.read(1)
         while ch.isspace():
             ch = stream.read(1)
         stream.seek(-1, 1)
         # A closing bracket ends the array; it stays consumed.
         if stream.read(1) == b_("]"):
             return items
         stream.seek(-1, 1)
         # Anything else is the start of the next element.
         items.append(readObject(stream, pdf))
Пример #11
0
        def decode(self):
            """
            Decode the LZW codes returned by ``self.nextCode()``.

            TIFF 6.0 specification explains in sufficient details the steps to
            implement the LZW encode() and decode() algorithms.

            algorithm derived from:
            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
            and the PDFReference

            Fragments are appended to a list and joined once, instead of
            growing a string with ``+=`` on every code.

            NOTE(review): the result is joined with a text-literal
            separator (``""``), so it has that literal's type; an earlier
            docstring declared ``bytes`` - confirm against the dictionary
            entries built by ``resetDict``.

            :return: the concatenation of all decoded dictionary entries.
            :raises PdfReadError: if ``nextCode()`` yields -1 before the
                stop code.
            """
            cW = self.CLEARDICT
            fragments = []
            emit = fragments.append
            while True:
                pW = cW
                cW = self.nextCode()
                if cW == -1:
                    raise PdfReadError("Missed the stop code in LZWDecode!")
                if cW == self.STOP:
                    break
                elif cW == self.CLEARDICT:
                    self.resetDict()
                elif pW == self.CLEARDICT:
                    # First code after a clear: output only, no new entry.
                    emit(self.dict[cW])
                else:
                    if cW < self.dictlen:
                        emit(self.dict[cW])
                        p = self.dict[pW] + self.dict[cW][0]
                    else:
                        # Code not yet in the table: previous + its first
                        # symbol.
                        p = self.dict[pW] + self.dict[pW][0]
                        emit(p)
                    # Both branches register the new entry identically.
                    self.dict[self.dictlen] = p
                    self.dictlen += 1
                    if (self.dictlen >= (1 << self.bitspercode) - 1 and
                            self.bitspercode < 12):
                        self.bitspercode += 1
            return "".join(fragments)
Пример #12
0
    def __init__(self, title, page, typ, *args):
        """Build a destination dictionary.

        ``/Title``, ``/Page`` and ``/Type`` are always stored.  The
        positional ``args`` are then unpacked according to ``typ``:

        * ``/XYZ``: left, top, zoom
        * ``TF.FIT_R``: left, bottom, right, top
        * ``TF.FIT_H`` / ``TF.FIT_BH``: top
        * ``TF.FIT_V`` / ``TF.FIT_BV``: left
        * ``TF.FIT`` / ``TF.FIT_B``: nothing is read from ``args``

        :raises PdfReadError: if ``typ`` is none of the above.
        :raises ValueError: if ``args`` has the wrong number of values
            for ``typ`` (from tuple unpacking).
        """
        DictionaryObject.__init__(self)
        self[NameObject("/Title")] = title
        self[NameObject("/Page")] = page
        self[NameObject("/Type")] = typ

        from PyPDF2.constants import TypArguments as TA
        from PyPDF2.constants import TypFitArguments as TF

        # from table 8.2 of the PDF 1.7 reference.
        if typ == "/XYZ":
            left, top, zoom = args
            self[NameObject(TA.LEFT)] = left
            self[NameObject(TA.TOP)] = top
            self[NameObject("/Zoom")] = zoom
        elif typ == TF.FIT_R:
            left, bottom, right, top = args
            self[NameObject(TA.LEFT)] = left
            self[NameObject(TA.BOTTOM)] = bottom
            self[NameObject(TA.RIGHT)] = right
            self[NameObject(TA.TOP)] = top
        elif typ in (TF.FIT_H, TF.FIT_BH):
            (top,) = args
            self[NameObject(TA.TOP)] = top
        elif typ in (TF.FIT_V, TF.FIT_BV):
            (left,) = args
            self[NameObject(TA.LEFT)] = left
        elif typ not in (TF.FIT, TF.FIT_B):
            raise PdfReadError("Unknown Destination Type: %r" % typ)
Пример #13
0
 def decode(data, decodeParms):
     """Decompress flate-encoded ``data`` and undo a PNG predictor if set.

     With a PNG predictor (10-15), each row of the decompressed data is
     one filter-type byte followed by ``columns`` data bytes; the filter
     byte is dropped and the data bytes are reconstructed from the byte
     to the left and/or the row above.

     :param data: flate-encoded data.
     :param decodeParms: a dictionary of decode parameters, or an array
         of such dictionaries; only "/Predictor" and ``LZW.COLUMNS`` are
         read.
     :return: the decoded data.
     :raises PdfReadError: if the predictor is unsupported, the data
         does not divide into whole rows, or a row uses a filter type
         other than 0-4.
     """
     data = decompress(data)
     predictor = 1
     # /Columns defaults to 1 when absent (ISO 32000, §7.4.4.3, Table 8),
     # so a missing entry must not raise.
     columns = 1
     if decodeParms:
         try:
             from PyPDF2.generic import ArrayObject
             if isinstance(decodeParms, ArrayObject):
                 # An array cannot be indexed by key; read both settings
                 # from whichever member dictionaries carry them.
                 for decodeParm in decodeParms:
                     if '/Predictor' in decodeParm:
                         predictor = decodeParm['/Predictor']
                     if LZW.COLUMNS in decodeParm:
                         columns = decodeParm[LZW.COLUMNS]
             else:
                 predictor = decodeParms.get("/Predictor", 1)
                 if LZW.COLUMNS in decodeParms:
                     columns = decodeParms[LZW.COLUMNS]
         except AttributeError:
             pass  # usually an array with a null object was read
     # predictor 1 == no predictor
     if predictor == 1:
         return data
     if not 10 <= predictor <= 15:
         # unsupported predictor
         raise PdfReadError("Unsupported flatedecode predictor %r" %
                            predictor)
     # PNG prediction:
     output = StringIO()
     # PNG prediction can vary from row to row
     rowlength = columns + 1
     # Explicit check instead of ``assert``, which disappears under -O.
     if len(data) % rowlength != 0:
         raise PdfReadError("Image data is not rectangular")
     prev_rowdata = (0, ) * rowlength
     for row in range(len(data) // rowlength):
         start = row * rowlength
         rowdata = [ord_(x) for x in data[start:start + rowlength]]
         filterByte = rowdata[0]
         if filterByte == 0:
             pass
         elif filterByte == 1:
             # Index 1 has no left neighbour, so reconstruction starts at 2.
             for i in range(2, rowlength):
                 rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
         elif filterByte == 2:
             for i in range(1, rowlength):
                 rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
         elif filterByte == 3:
             for i in range(1, rowlength):
                 left = rowdata[i - 1] if i > 1 else 0
                 # Integer floor of the mean of left and up.
                 average = (left + prev_rowdata[i]) // 2
                 rowdata[i] = (rowdata[i] + average) % 256
         elif filterByte == 4:
             for i in range(1, rowlength):
                 left = rowdata[i - 1] if i > 1 else 0
                 up = prev_rowdata[i]
                 up_left = prev_rowdata[i - 1] if i > 1 else 0
                 paeth = paethPredictor(left, up, up_left)
                 rowdata[i] = (rowdata[i] + paeth) % 256
         else:
             # unsupported PNG filter
             raise PdfReadError("Unsupported PNG filter %r" %
                                filterByte)
         prev_rowdata = rowdata
         output.write(''.join([chr(x) for x in rowdata[1:]]))
     return output.getvalue()
Пример #14
0
 def setData(self, data):
     """Always refuse: ``data`` is ignored and an error is raised.

     :raises PdfReadError: unconditionally.
     """
     reason = "Creating EncodedStreamObject is not currently supported"
     raise PdfReadError(reason)
Пример #15
0
    def readFromStream(stream, pdf):
        """Read a dictionary, and the stream that may follow it, from ``stream``.

        The input must start with ``<<``.  Key/value pairs are read with
        ``readObject`` until a ``>`` token is met.  If the ``stream``
        keyword follows, the number of bytes given by the dictionary's
        length entry (``SA.LENGTH``) is read as stream data and an
        ``endstream`` marker is expected afterwards.

        :param stream: seekable stream positioned at the opening ``<<``.
        :param pdf: reader object; its ``strict`` flag decides whether a
            repeated key is an error or a warning, and its ``getObject``
            resolves an indirect length.
        :return: ``StreamObject.initializeFromDictionary(data)`` when
            stream data was read, otherwise a ``DictionaryObject`` filled
            with the parsed entries.
        :raises PdfReadError: if the input does not start with ``<<``, a
            key is repeated in strict mode, or ``endstream`` cannot be found.
        :raises PdfStreamError: if the input ends inside the dictionary.
        :raises AssertionError: if the ``stream`` keyword is not followed
            by an end-of-line byte, or the length entry is missing.
        """
        debug = False
        tmp = stream.read(2)
        if tmp != b_("<<"):
            raise PdfReadError(
                "Dictionary read error at byte %s: stream must begin with '<<'"
                % utils.hexStr(stream.tell()))
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_('\x00'):
                # NUL bytes between entries are skipped.
                continue
            elif tok == b_('%'):
                # Step back onto the '%' so the comment skipper sees it.
                stream.seek(-1, 1)
                skipOverComment(stream)
                continue
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)

            if debug: print(("Tok:", tok))
            if tok == b_(">"):
                # Consume one more byte (presumably the second '>' of the
                # closing '>>'; it is not checked) and stop.
                stream.read(1)
                break
            # Not the end: step back and read the key from its first byte.
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            # Skip whitespace before the value, then step back onto it.
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            # NOTE(review): the duplicate test is truthiness-based, so a key
            # already holding a falsy value is overwritten without any
            # error or warning - confirm this is intended.
            if not data.get(key):
                data[key] = value
            elif pdf.strict:
                # multiple definitions of key not permitted
                raise PdfReadError(
                    "Multiple definitions in dictionary at byte %s for key %s" \
                    % (utils.hexStr(stream.tell()), key))
            else:
                # Non-strict: warn and keep the first value (no assignment).
                warnings.warn(
                    "Multiple definitions in dictionary at byte %s for key %s" \
                    % (utils.hexStr(stream.tell()), key), PdfReadWarning)

        # Remember where the dictionary ended so the position can be
        # restored if no stream follows.
        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == b_('s') and stream.read(5) == b_('tream'):
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == b_(' '):
                eol = stream.read(1)
            assert eol in (b_("\n"), b_("\r"))
            if eol == b_("\r"):
                # read \n after
                if stream.read(1) != b_('\n'):
                    # Lone '\r': give the byte back, it belongs to the data.
                    stream.seek(-1, 1)
            # this is a stream object, not a dictionary
            assert SA.LENGTH in data
            length = data[SA.LENGTH]
            if debug: print(data)
            if isinstance(length, IndirectObject):
                # Resolving the reference may move the stream position, so
                # save it and restore it afterwards.
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            if debug: print("here")
            # if debug: print(binascii.hexlify(data["__streamdata__"]))
            # The next non-whitespace byte plus the following eight bytes
            # must spell "endstream".
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != b_("endstream"):
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == b_("endstream"):
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
                    raise PdfReadError(
                        "Unable to find 'endstream' marker after stream at byte %s."
                        % utils.hexStr(stream.tell()))
        else:
            # No stream keyword: rewind to just after the dictionary.
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        else:
            retval = DictionaryObject()
            retval.update(data)
            return retval
Пример #16
0
def readStringFromStream(stream):
    """Read a parenthesised literal string from ``stream``.

    The first byte (the opening parenthesis) is consumed without being
    checked.  Nested parentheses are tracked and kept in the text.  A
    backslash introduces an escape: a single-character escape from the
    table below, an octal code of one to three digits, or an escaped
    line break (which contributes nothing to the result).

    :param stream: seekable stream positioned at the opening parenthesis.
    :return: ``createStringObject`` applied to the collected bytes.
    :raises PdfStreamError: if the stream ends before the string closes.
    :raises PdfReadError: if a backslash is followed by an unsupported
        character.
    """
    stream.read(1)  # opening parenthesis
    parens = 1
    txt = b_("")
    # Built once per call rather than on every backslash.
    escapes = {
        b_("n"): b_("\n"),
        b_("r"): b_("\r"),
        b_("t"): b_("\t"),
        b_("b"): b_("\b"),
        b_("f"): b_("\f"),
        b_("c"): b_(r"\c"),
        b_("("): b_("("),
        b_(")"): b_(")"),
        b_("/"): b_("/"),
        b_("\\"): b_("\\"),
        b_(" "): b_(" "),
        b_("%"): b_("%"),
        b_("<"): b_("<"),
        b_(">"): b_(">"),
        b_("["): b_("["),
        b_("]"): b_("]"),
        b_("#"): b_("#"),
        b_("_"): b_("_"),
        b_("&"): b_("&"),
        b_('$'): b_('$'),
    }
    line_breaks = b_("\n\r")
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            try:
                tok = escapes[tok]
            except KeyError:
                if tok.isdigit():
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    for _ in range(2):
                        ntok = stream.read(1)
                        if ntok.isdigit():
                            tok += ntok
                        else:
                            # The byte that ended the escape belongs to the
                            # string (it may even be the closing
                            # parenthesis): hand it back unless we hit EOF.
                            if ntok:
                                stream.seek(-1, 1)
                            break
                    # Mask to one byte: high-order overflow is ignored.
                    tok = b_(chr(int(tok, base=8) & 0xFF))
                elif tok in line_breaks:
                    # This case is hit when a backslash followed by a line
                    # break occurs.  If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in line_breaks:
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    tok = b_('')
                else:
                    raise PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)
Пример #17
0
 def readFromStream(stream):
     """Read the four-byte ``null`` keyword from ``stream``.

     :return: a new NullObject.
     :raises PdfReadError: if the four bytes read are not ``null``.
     """
     if stream.read(4) == b_("null"):
         return NullObject()
     raise PdfReadError("Could not read Null object")