Exemplo n.º 1
0
    def create_from_file(cls, f: io.BufferedReader, doc):
        """Parse whichever PDF object starts at the current position of ``f``.

        The first byte is peeked (not consumed) and selects the concrete
        parser.  A token starting like a number needs extra lookahead: a
        non-negative integer followed by a second integer and the keyword
        ``obj`` or ``R`` is an indirect object or an indirect reference.

        Args:
            f: buffered reader positioned on the first byte of the object.
            doc: handed to the parsers that take a document argument.

        Returns:
            The object produced by the selected ``create_from_file``.

        Raises:
            Exception: if the first byte does not start any handled token.
        """
        lead = f.peek(1)[0:1]
        # TODO:
        if lead in (b't', b'f'):
            return PdfBooleanObject.create_from_file(f)
        if lead == b'(':
            return PdfLiteralStringObject.create_from_file(f)
        if lead == b'/':
            return PdfNameObject.create_from_file(f)
        if lead == b'[':
            return PdfArrayObject.create_from_file(f, doc)
        if lead == b'n':
            return PdfNullObject.create_from_file(f)
        if lead == b'<':
            # '<<' opens a dictionary, a single '<' opens a hex string
            if utils.peek_at_least(f, 2)[0:2] == b'<<':
                return PdfDictionaryObject.create_from_file(f, doc)
            return PdfHexStringObject.create_from_file(f)

        numeric_starts = (b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
                          b'8', b'9', b'+', b'-', b'.')
        if lead not in numeric_starts:
            raise Exception(f'Unknown token at {f.tell()}')

        # Remember where the token starts so that an indirect object or a
        # reference can be re-parsed from its very first byte.
        start = f.tell()
        number = PdfNumericObject.create_from_file(f)
        if number.value < 0 or number.value - int(number.value) != 0:
            # a decimal or a negative number, never an indirect obj
            return number
        after_number = f.tell()

        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        second, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
        if re.match(rb'\d+$', second) is not None:
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            keyword, _ = utils.read_until(f,
                                          syntax.DELIMS + syntax.WHITESPACES)
            if keyword == b'obj':  # all 3 tokens match an indirect obj
                f.seek(start, io.SEEK_SET)
                return PdfIndirectObject.create_from_file(f, doc)
            if keyword == b'R':  # all 3 tokens match an indirect reference
                f.seek(start, io.SEEK_SET)
                return PdfReferenceObject.create_from_file(f, doc)

        # The lookahead did not match: it is a plain number, so undo the
        # lookahead and leave the reader right after the number.
        f.seek(after_number, io.SEEK_SET)
        return number
Exemplo n.º 2
0
def decode_objstm(objstmobj, doc):
    """Decode an object stream into the compressed objects it contains.

    The decoded stream starts with N pairs of integers (object number, byte
    offset relative to ``First``), followed by the objects themselves.

    Args:
        objstmobj: object whose ``value`` is a PdfStreamObject and whose
            ``obj_no`` identifies the object stream.
        doc: passed through to ``PdfObject.create_from_file``.

    Returns:
        A dict mapping ``(object stream obj_no, index within the stream)`` to
        a ``PdfIndirectObject`` built with the compressed object's number and
        generation 0.

    Raises:
        ValueError: if ``objstmobj.value`` is not a PdfStreamObject.
        Exception: if ``N``/``First`` or any of the leading integers is
            missing, not an integer, or negative.
    """
    stream_obj = objstmobj.value
    objstm_no = objstmobj.obj_no
    if not isinstance(stream_obj, PdfStreamObject):
        raise ValueError(
            'objstmobj is not a PdfIndirectObject containing a PdfStreamObject'
        )

    objbytestream = io.BufferedReader(io.BytesIO(stream_obj.decode()))
    # N pairs of integers
    # 1st int is obj no of the compressed object
    # 2nd int is byte offset of that object, relative to the first obj
    objbytestream.seek(0, io.SEEK_SET)
    utils.seek_until(objbytestream,
                     syntax.NON_WHITESPACES,
                     ignore_comment=True)

    # The error messages used to interpolate an undefined name (org_pos), so
    # every failure surfaced as a NameError.  The file offset of the object
    # stream is not known here; identify it by its object number instead.
    try:
        # TODO: assuming both N and First have direct obj values
        count = int(str(stream_obj.dict['N'].value))
        first = int(str(stream_obj.dict['First'].value))
    except Exception as ex:
        # Broad on purpose: the lookup goes through project types whose
        # failure modes are not visible from here.
        raise Exception(
            f'Invalid N or First field in ObjStm {objstm_no}.') from ex
    if count < 0 or first < 0:
        raise Exception(f'Invalid N or First field in ObjStm {objstm_no}.')

    numbers = []
    for _ in range(2 * count):
        utils.seek_until(objbytestream,
                         syntax.NON_WHITESPACES,
                         ignore_comment=True)
        token_pos = objbytestream.tell()  # offset inside the decoded stream
        numobj = PdfNumericObject.create_from_file(objbytestream)
        try:
            number = int(str(numobj.value))
        except Exception as ex:
            raise Exception(
                f'Invalid ObjStm {objstm_no}: not an integer at decoded '
                f'offset {token_pos}.') from ex
        if number < 0:
            raise Exception(
                f'Invalid obj no./offset in ObjStm {objstm_no} at decoded '
                f'offset {token_pos}.')
        numbers.append(number)

    result = {}
    for idx, pair in enumerate(utils.chunks(numbers, 2)):
        # gen no, of object stream and of any compressed object is implicitly 0
        objbytestream.seek(first + pair[1], io.SEEK_SET)
        result[objstm_no, idx] = PdfIndirectObject(
            PdfObject.create_from_file(objbytestream, doc), pair[0], 0)
        # TODO: check for orphaned bytes between compressed objects?

    return result
Exemplo n.º 3
0
 def create_from_file(cls, f: io.BufferedReader, doc):
     """Parse an array object (``[ ... ]``) starting at the current position.

     Args:
         f: buffered reader positioned on the opening ``[``.
         doc: passed through to ``PdfObject.create_from_file`` for each
             element.

     Returns:
         A ``PdfArrayObject`` wrapping the list of parsed elements.

     Raises:
         Exception: if the first byte is not ``[`` (the reader is rewound to
             where it started) or if any element fails to parse.
     """
     org_pos = f.tell()
     if f.read(1) != b'[':
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid array object at offset {org_pos}.')

     elements = []
     utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
     # Keep parsing elements until the closing bracket is the next byte.
     while f.peek(1)[0:1] != b']':
         try:
             elements.append(PdfObject.create_from_file(f, doc))
         except Exception as ex:
             raise Exception(
                 f'Parse Error: Not a valid array object at offset {org_pos}.'
             ) from ex
         utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

     f.read(1)  # consume the closing ']'
     return PdfArrayObject(elements)
Exemplo n.º 4
0
 def create_from_file(cls, f: io.BufferedReader, doc):
     """Parse an indirect reference of the form ``<obj_no> <gen_no> R``.

     Args:
         f: buffered reader positioned on the first digit of the object
             number.
         doc: stored in the resulting ``PdfReferenceObject``.

     Returns:
         ``PdfReferenceObject(doc, obj_no, gen_no)``.

     Raises:
         Exception: if either number is not a run of digits or the keyword
             is not ``R``; the reader is rewound to where it started first.
     """
     org_pos = f.tell()

     def invalid():
         # Rewind before reporting, so the caller can retry from org_pos.
         f.seek(org_pos, io.SEEK_SET)
         return Exception(
             f'Parse Error: Not a valid reference at offset {org_pos}.')

     # Object number and generation number share the same syntax.
     parsed = []
     for _ in range(2):
         digits, _sep = utils.read_until(f, syntax.WHITESPACES)
         if re.match(rb'\d+$', digits) is None:
             raise invalid()
         parsed.append(int(digits))
         utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
     obj_no, gen_no = parsed

     keyword, _sep = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
     if keyword != b'R':
         raise invalid()
     return PdfReferenceObject(doc, obj_no, gen_no)
Exemplo n.º 5
0
 def create_from_file(cls, f: io.BufferedReader, doc):
     """Parse a dictionary object (``<< /Key value ... >>``).

     Args:
         f: buffered reader positioned on the opening ``<<``.
         doc: passed through to ``PdfObject.create_from_file`` for values.

     Returns:
         A ``PdfDictionaryObject`` wrapping the parsed key/value mapping.

     Raises:
         Exception: if the opening or closing delimiter is missing (the
             reader is rewound to where it started) or if a key or value
             fails to parse.
     """
     org_pos = f.tell()
     if f.read(2) != b'<<':
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid dictionary object at offset {org_pos}.'
         )

     entries = {}
     while True:
         utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
         if f.peek(1)[0:1] != b'/':
             # Anything that is not a name ends the run of entries.
             break
         try:
             # key, must be a name
             key = PdfNameObject.create_from_file(f)
             utils.seek_until(f,
                              syntax.NON_WHITESPACES,
                              ignore_comment=True)
             # TODO: check if dict use __eq__ for key equality and existence check
             entries[key] = PdfObject.create_from_file(f, doc)
         except Exception as ex:
             raise Exception(
                 f'Parse Error: Not a valid dictionary object at offset {org_pos}.'
             ) from ex

     # The only thing allowed after the entries is the closing delimiter.
     if utils.peek_at_least(f, 2)[0:2] != b'>>':
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid dictionary object at offset {org_pos}.'
         )
     f.read(2)
     return PdfDictionaryObject(entries)
Exemplo n.º 6
0
    def get_xref_trailer_at_offset(self, f, offset):
        """Return the ``(xref section, trailer dict)`` pair found at ``offset``.

        Results are cached in ``self.offset_xref_trailer`` keyed by offset.

        The trailer should directly follow the xref and MUST be read TOGETHER
        with it: in a linearized PDF the last appearing trailer DOES NOT have
        a Prev entry, and startxref points to the first-page xref table near
        the start of the file, which has its own trailer.  Searching for the
        trailer dict from the end of the file would therefore pick the wrong
        one.  With an xref stream, the xref data and the trailer dict are
        lumped together in the stream object.

        Args:
            f: seekable reader over the PDF file.
            offset: absolute offset of the ``xref`` keyword or of the
                indirect object holding an xref stream.

        Returns:
            A tuple ``(xref_section, trailer_dict)``.

        Raises:
            Exception: if no ``trailer`` keyword follows an xref table, or if
                the object at ``offset`` cannot be parsed as an indirect
                object.
        """
        if offset in self.offset_xref_trailer:
            return self.offset_xref_trailer[offset]

        # Peek at the first line to tell a classic table from an xref stream.
        f.seek(offset, io.SEEK_SET)
        keyword, _ = utils.read_until(f, syntax.EOL)
        f.seek(offset, io.SEEK_SET)
        # TODO: catch exception for parsing PdfXRefSection
        if keyword == b'xref':
            # uncompressed xref section
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            xref_section = PdfXRefSection(f)
            # find trailer dict and Prev
            # trailer dict CAN contain references
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            # Record where the keyword is expected.  The error used to report
            # f.tell() - 7, which is only right when the token read is seven
            # bytes long, yet the error fires when the token is NOT 'trailer'.
            trailer_pos = f.tell()
            keyword, _ = utils.read_until(f, syntax.EOL)
            if keyword != b'trailer':
                # TODO: check for objects between xref and trailer dict, and between trailer dict and startxref?
                raise Exception(
                    f'trailer dict not found after xref table at {trailer_pos}'
                )
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            trailer_dict = PdfDictionaryObject.create_from_file(f, self)
        else:
            # may be compressed xref stream
            # trailer dict IS the stream dict, and CANNOT contain references
            try:
                xref_stream = PdfIndirectObject.create_from_file(f, self)
            except Exception as ex:
                raise Exception('Invalid xref stream') from ex
            xref_section = PdfXRefSection.from_xrefstm(xref_stream)
            trailer_dict = xref_stream.value.dict

        self.offset_xref_trailer[offset] = (xref_section, trailer_dict)
        return self.offset_xref_trailer[offset]
Exemplo n.º 7
0
    def parse_linear(self, f, progress_cb=None):
        '''Initialize a PdfDocument from a opened PDF file f from the beginning

        The file is scanned front to back.  Every top-level construct
        (objects, xref tables, trailers, startxref, %%EOF) is recorded in the
        current entry of ``self.increments``; a new entry is started when an
        object follows an %%EOF marker.  Object streams are decoded at the
        end.

        Args:
            f: opened binary file; it must support ``fileno()`` because the
                size is taken from ``os.fstat``.
            progress_cb: optional callable invoked as
                ``progress_cb(message, read=<bytes read>, total=<file size>)``.

        Raises:
            Exception: if the first line is not a ``%PDF-x.y`` header.
        '''
        def print_progress():
            # An empty file has size 0.  Report 0% rather than dividing by
            # zero, so the header check below can raise 'Not a PDF file'.
            percent = f.tell() / filesize * 100 if filesize else 0.0
            message = f'{percent:5.2f}% processed'
            print('', end="\r")
            print(message, end='', flush=True)
            if progress_cb is not None:
                progress_cb(message, read=f.tell(), total=filesize)

        f.seek(0, io.SEEK_SET)
        filesize = os.fstat(f.fileno()).st_size

        print_progress()

        # First line is header
        s, eol_marker = utils.read_until(f, syntax.EOL)
        header = re.match(rb'%PDF-(\d+\.\d+)', s)
        if header is None:
            raise Exception('Not a PDF file')
        self.version = Decimal(header.group(1).decode('iso-8859-1'))
        if eol_marker:
            # Guarded: the terminator may be empty/None when the header line
            # runs to end of file (NOTE(review): confirm against read_until).
            f.seek(len(eol_marker), io.SEEK_CUR)

        # NOTE(review): self.increments is assumed to already hold an initial
        # entry (set up outside this method); confirm.
        while True:
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=False)
            if f.tell() >= filesize:
                break
            org_pos = f.tell()
            s, eol_marker = utils.read_until(f, syntax.EOL)
            if s == b'startxref':  # the last startxref always override the ones before
                utils.seek_until(f,
                                 syntax.NON_WHITESPACES,
                                 ignore_comment=True)
                t, _ = utils.read_until(f, syntax.EOL)
                self.startxref = int(t)
                self.increments[-1]['startxref'] = self.startxref
                continue
            elif s == b'xref':
                # rewind over the keyword; PdfXRefSection reads it itself
                f.seek(-4, io.SEEK_CUR)
                self.increments[-1]['xref_section'] = PdfXRefSection(f)
                self.offset_xref[org_pos] = self.increments[-1]['xref_section']
                continue
            elif s == b'trailer':
                utils.seek_until(f,
                                 syntax.NON_WHITESPACES,
                                 ignore_comment=True)
                self.increments[-1][
                    'trailer'] = PdfDictionaryObject.create_from_file(f, self)
                continue
            elif s == b'%%EOF':
                # TODO: check if trailer dict immediately precedes %%EOF
                # since we are seeking until non-ws, the only case EOF marker
                # does not appear by itself is when it is preceded by some
                # whitespaces, which should be ignored
                self.increments[-1]['eof'] = True
                # read_until() has already consumed the marker (the xref
                # branch above has to rewind by 4 for the same reason).
                # Seeking another 5 + len(eol) bytes skipped the first bytes
                # of the next incremental update.  Only the end-of-line is
                # left, and the whitespace skip at the top of the loop
                # handles that.
                continue
            elif s[0:1] == b'%':
                # otherwise, it is a comment, ignore the whole remaining line
                utils.seek_until(f, syntax.EOL)
                continue

            # Anything else is an object: re-read it from its first byte.
            f.seek(org_pos, io.SEEK_SET)
            if self.increments[-1]['eof']:
                # An object after %%EOF starts a new incremental update.
                self.increments.append({
                    'body': [],
                    'xref_section': None,
                    'trailer': None,
                    'startxref': None,
                    'eof': False
                })

            # TODO: how to handle object parse error?
            new_obj = PdfObject.create_from_file(f, self)
            self.increments[-1]['body'].append(new_obj)
            self.offset_obj[org_pos] = new_obj
            if isinstance(new_obj.value, PdfStreamObject
                          ) and new_obj.value.dict.get('Type') == 'ObjStm':
                self.offset_obj_streams[org_pos] = new_obj
            print_progress()

        print('', end="\r")
        print('100% processed    ')
        if progress_cb is not None:
            progress_cb('100% processed', read=f.tell(), total=filesize)
        self.ready = True

        print('Decoding object streams...')
        if progress_cb is not None:
            progress_cb('Decoding object streams...',
                        read=f.tell(),
                        total=filesize)
        for k in self.offset_obj_streams:
            # Imported lazily, only when there is an object stream to decode.
            from objstm import decode_objstm
            decoded = decode_objstm(self.offset_obj_streams[k], self)
            # Build a new dict instead of updating in place, as before.
            self.compressed_obj = {**self.compressed_obj, **decoded}
        print('Done')
        if progress_cb is not None:
            progress_cb('Done', read=f.tell(), total=filesize)
Exemplo n.º 8
0
    def create_from_file(cls, f: io.BufferedReader, doc):
        """Parse a stream object: a dictionary followed by a stream extent.

        Layout: ``<< ... >> stream EOL <Length bytes> [EOL] endstream`` where
        the EOL after ``stream`` is ``\\n`` or ``\\r\\n``.

        Args:
            f: buffered reader positioned on the stream dictionary.
            doc: passed through to ``PdfObject.create_from_file``.

        Returns:
            ``PdfStreamObject(stream_dict, raw)`` where ``raw`` holds the
            ``/Length`` bytes that follow the ``stream`` line.

        Raises:
            Exception: if the ``stream`` keyword, a usable ``/Length``, a
                valid ``/Filter`` or the ``endstream`` keyword is missing.
                The reader is rewound to where it started only when the
                ``stream`` keyword is missing or the leading object is not a
                dictionary.
        """
        org_pos = f.tell()
        message = f'Parse Error: Not a valid stream object at offset {org_pos}.'

        stream_dict = PdfObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

        if not (utils.peek_at_least(f, 7)[0:7] == b'stream\n'
                or utils.peek_at_least(f, 8)[0:8] == b'stream\r\n'):
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(message)
        # Consume the keyword and exactly ONE end-of-line marker.  The data
        # starts right after it, so skipping "all whitespace and comments"
        # here (as was done before) would eat leading data bytes that happen
        # to be whitespace or start with '%'.
        f.read(len(b'stream'))
        if f.read(1) == b'\r':
            f.read(1)  # the '\n' of a '\r\n' pair, guaranteed by the check

        # check if dict has the required key /Length with valid values
        if not isinstance(stream_dict, PdfDictionaryObject):
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(message)
        length_obj = stream_dict.get('Length')
        if length_obj is None:
            raise Exception(message)

        if isinstance(length_obj, PdfReferenceObject):
            size = length_obj.deref().value
        elif isinstance(length_obj, PdfNumericObject):
            size = length_obj.value
        else:
            raise Exception(message)
        # /Length must be a non-negative integer; 0 is a valid empty stream.
        numerator, denominator = size.as_integer_ratio()
        if numerator < 0 or denominator != 1:
            raise Exception(message)
        size = numerator

        # check for filters (validation only; decoding happens elsewhere)
        filt = stream_dict.get('Filter')
        if filt is not None:
            # TODO: Remove the assumption that Filter value is always a direct obj
            if isinstance(filt, PdfArrayObject):
                # An empty array used to escape as a bare IndexError.
                if not filt.value or any(
                        not isinstance(x, PdfNameObject) for x in filt.value):
                    raise Exception(message)
                filt = filt.value[0]
            # /Filter (or first element of the array) must specify a Name
            if not isinstance(filt, PdfNameObject):
                raise Exception(message)

        # read only /Length bytes
        # filter implementation is responsible for checking if the data length is correct
        # e.g. if any needed end-of-data marker is present at only the end
        raw = f.read(size)

        # check if stream ends with b'endstream', optionally preceded by b'\r', b'\n' or b'\r\n'
        if utils.peek_at_least(f, 2)[0:2] == b'\r\n':
            f.seek(2, io.SEEK_CUR)
        elif utils.peek_at_least(f, 1)[0:1] in (b'\r', b'\n'):
            f.seek(1, io.SEEK_CUR)
        token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
        if token != b'endstream':
            raise Exception(message)

        # actual decoding is done in constructor
        return PdfStreamObject(stream_dict, raw)
Exemplo n.º 9
0
 def inner2():
     """Parse the object at ``inner_content_pos`` and skip what follows it.

     ``f``, ``doc`` and ``inner_content_pos`` are taken from the enclosing
     scope, which is not part of this snippet.
     """
     f.seek(inner_content_pos, io.SEEK_SET)
     parsed = PdfObject.create_from_file(f, doc)
     # leave the reader on the first non-whitespace byte after the object
     utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
     return parsed
Exemplo n.º 10
0
    def create_from_file(cls, f: io.BufferedReader, doc):
        """Parse an indirect object: ``<obj_no> <gen_no> obj ... endobj``.

        The inner object is parsed first.  If it is not directly followed by
        ``endobj`` but by a ``stream`` line, the content is parsed again as a
        stream object, which must then be followed by ``endobj``.

        Args:
            f: buffered reader positioned on the first digit of the object
                number.
            doc: passed through to the inner object parsers.

        Returns:
            ``PdfIndirectObject(inner_obj, obj_no, gen_no)``.

        Raises:
            Exception: if the header or the ``endobj`` keyword is malformed;
                the reader is rewound to where it started first.
        """
        org_pos = f.tell()

        def invalid():
            # Rewind before reporting, so the caller can retry from org_pos.
            f.seek(org_pos, io.SEEK_SET)
            return Exception(
                f'Parse Error: Not a valid indirect object at offset {org_pos}.'
            )

        def at_endobj(keyword, terminator):
            # An empty terminator means the keyword was not properly ended;
            # None (EOF) counts as a valid end since None != b''.
            return keyword == b'endobj' and terminator != b''

        # Object number and generation number share the same syntax.
        header = []
        for _ in range(2):
            digits, _sep = utils.read_until(f, syntax.WHITESPACES)
            if re.match(rb'\d+$', digits) is None:
                raise invalid()
            header.append(int(digits))
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        obj_no, gen_no = header

        keyword, _sep = utils.read_until(f,
                                         syntax.DELIMS + syntax.WHITESPACES)
        if keyword != b'obj':
            raise invalid()

        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        inner_content_pos = f.tell()

        # parse inner object
        f.seek(inner_content_pos, io.SEEK_SET)
        inner_obj = PdfObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

        # Look ahead for 'endobj', then restore the position.
        after_inner = f.tell()
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        token, endtoken = utils.read_until(f,
                                           syntax.DELIMS + syntax.WHITESPACES,
                                           maxsize=7)
        f.seek(after_inner, io.SEEK_SET)
        if at_endobj(token, endtoken):
            f.seek(6, io.SEEK_CUR)  # step over 'endobj'
            return PdfIndirectObject(inner_obj, obj_no, gen_no)

        # No endobj: the only other valid continuation is a stream extent.
        if not (utils.peek_at_least(f, 7)[0:7] == b'stream\n'
                or utils.peek_at_least(f, 8)[0:8] == b'stream\r\n'):
            raise invalid()

        # Re-parse from the start of the content as a stream object.
        f.seek(inner_content_pos, io.SEEK_SET)
        stream_obj = PdfStreamObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        token, endtoken = utils.read_until(f,
                                           syntax.DELIMS + syntax.WHITESPACES,
                                           maxsize=7)
        if not at_endobj(token, endtoken):
            raise invalid()
        return PdfIndirectObject(stream_obj, obj_no, gen_no)