Exemplo n.º 1
0
    def get_obj(self, obj_num, gen_num):
        '''Return the object registered under (obj_num, gen_num).

        The increments are searched from the newest (last) to the oldest
        (first), so the most recent xref entry that knows the object wins.

        Args:
            obj_num: object number to look up.
            gen_num: generation number to look up.

        Returns:
            The object cached in self.compressed_obj / self.offset_obj (parsed
            from the file on first access), or None when the entry found marks
            the object as free (offset 0).

        Raises:
            Exception: if the document is not flagged as ready yet, or if no
                increment has an entry for the object.
            KeyError: if the entry points into an object stream whose content
                is not present in self.compressed_obj.
        '''
        if not self.ready:
            raise Exception(
                'get_obj can only be called after the document is scanned completely.'
            )

        # Walk the increments backwards: later increments override earlier ones.
        for increment in reversed(self.increments):
            offset = increment['xref_section'].get_obj_offset(obj_num, gen_num)
            if offset is None:
                # offset is None <=> obj_num not found in this increment
                continue
            if isinstance(offset, tuple):
                # A tuple is used as the key of a compressed object.
                # TODO: handle the case where obj is not already cached
                return self.compressed_obj[offset]
            if offset > 0:
                if self.offset_obj.get(offset) is None:
                    # Parse through a separate handle (presumably so that the
                    # position of self.__f is left untouched) and close it
                    # once the object has been read.
                    # NOTE(review): assumes create_from_file reads everything
                    # it needs before returning - confirm for stream objects.
                    with open(self.__f.name, 'rb') as temp_f:
                        temp_f.seek(offset, io.SEEK_SET)
                        self.offset_obj[offset] = PdfObject.create_from_file(
                            temp_f, self)
                return self.offset_obj[offset]
            if offset == 0:
                # offset = 0 <=> obj_num is free at gen_num
                return None
            # Any other (negative) offset is skipped, as in the original code.

        raise Exception('Object not found')
Exemplo n.º 2
0
def decode_objstm(objstmobj, doc):
    '''Decode an object stream into the objects it contains.

    The decoded stream starts with N pairs of integers (object number, byte
    offset relative to First), followed by the objects themselves.

    Args:
        objstmobj: indirect object whose value is a PdfStreamObject; its
            obj_no is used in the keys of the result.
        doc: document passed through to PdfObject.create_from_file.

    Returns:
        A dict mapping (obj no. of the object stream, index inside the stream)
        to a PdfIndirectObject wrapping the parsed object, with the object
        number taken from the pair table and generation number 0.

    Raises:
        ValueError: if objstmobj.value is not a PdfStreamObject.
        Exception: if N, First, or any entry of the pair table is missing,
            not an integer, or negative.
    '''
    result = {}
    stream_obj = objstmobj.value
    objstm_no = objstmobj.obj_no
    if not isinstance(stream_obj, PdfStreamObject):
        raise ValueError(
            'objstmobj is not a PdfIndirectObject containing a PdfStreamObject'
        )

    objbytestream = io.BufferedReader(io.BytesIO(stream_obj.decode()))
    # N pairs of integers
    # 1st int is obj no of the compressed object
    # 2nd int is byte offset of that object, relative to the first obj
    objbytestream.seek(0, io.SEEK_SET)
    utils.seek_until(objbytestream,
                     syntax.NON_WHITESPACES,
                     ignore_comment=True)

    # The messages identify the stream by its object number; the file offset
    # of the stream is not known inside this function.
    header_error = (
        f'Invalid N or First field in ObjStm with obj no. {objstm_no}.')
    try:
        # TODO: assuming both N and First have direct obj values
        pair_count = int(str(stream_obj.dict['N'].value))
        first_offset = int(str(stream_obj.dict['First'].value))
        if pair_count < 0 or first_offset < 0:
            raise Exception(header_error)
    except Exception as ex:
        raise Exception(header_error) from ex

    numbers = []
    for _ in range(2 * pair_count):
        utils.seek_until(objbytestream,
                         syntax.NON_WHITESPACES,
                         ignore_comment=True)
        numobj = PdfNumericObject.create_from_file(objbytestream)
        try:
            temp = int(str(numobj.value))
            if temp < 0:
                raise Exception(
                    f'Invalid obj no./offset in ObjStm with obj no. {objstm_no}.'
                )
            numbers.append(temp)
        except Exception as ex:
            raise Exception(
                f'Invalid ObjStm with obj no. {objstm_no}.') from ex

    for idx, p in enumerate(utils.chunks(numbers, 2)):
        # p[0] is the object number, p[1] the offset relative to First.
        # gen no, of object stream and of any compressed object is implicitly 0
        objbytestream.seek(first_offset + p[1], io.SEEK_SET)
        result[objstm_no, idx] = PdfIndirectObject(
            PdfObject.create_from_file(objbytestream, doc), p[0], 0)
        # TODO: check for orphaned bytes between compressed objects?

    return result
Exemplo n.º 3
0
    def parse_linear(self, f, progress_cb=None):
        '''Initialize a PdfDocument from an opened PDF file f, reading it from the beginning.

        The header is read into self.version, then the file is scanned top to
        bottom. Keywords (startxref, xref, trailer, %%EOF) update the current
        (last) entry of self.increments; everything else is parsed as an
        object and recorded in that increment's body, in self.offset_obj and,
        for object streams, in self.offset_obj_streams. Afterwards self.ready
        is set and all object streams are decoded into self.compressed_obj.

        Args:
            f: opened binary file; must support fileno(), seek() and tell().
            progress_cb: optional callable invoked as
                progress_cb(message, read=..., total=...) to report progress.

        Raises:
            Exception: if the first line is not a PDF header.
        '''
        def print_progress():
            position = f.tell()
            # An empty file has size 0: report 0% instead of dividing by zero
            # so that the "Not a PDF file" check below can be reached.
            percent = position / filesize * 100 if filesize else 0.0
            message = f'{percent:5.2f}% processed'
            print('', end="\r")
            print(message, end='', flush=True)
            if progress_cb is not None:
                progress_cb(message, read=position, total=filesize)

        f.seek(0, io.SEEK_SET)
        filesize = os.fstat(f.fileno()).st_size

        print_progress()

        # First line is header
        s, eol_marker = utils.read_until(f, syntax.EOL)
        header = re.match(rb'%PDF-(\d+\.\d+)', s)
        if header:
            self.version = Decimal(header.group(1).decode('iso-8859-1'))
            f.seek(len(eol_marker), io.SEEK_CUR)
        else:
            raise Exception('Not a PDF file')

        while True:
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=False)
            if f.tell() >= filesize:
                break
            # Remember where this token starts; objects are keyed by it.
            org_pos = f.tell()
            s, eol_marker = utils.read_until(f, syntax.EOL)
            if s == b'startxref':  # the last startxref always override the ones before
                utils.seek_until(f,
                                 syntax.NON_WHITESPACES,
                                 ignore_comment=True)
                t, _ = utils.read_until(f, syntax.EOL)
                self.startxref = int(t)
                self.increments[-1]['startxref'] = self.startxref
                continue
            elif s == b'xref':
                # Step back over the keyword so PdfXRefSection sees it.
                f.seek(-4, io.SEEK_CUR)
                self.increments[-1]['xref_section'] = PdfXRefSection(f)
                self.offset_xref[org_pos] = self.increments[-1]['xref_section']
                continue
            elif s == b'trailer':
                utils.seek_until(f,
                                 syntax.NON_WHITESPACES,
                                 ignore_comment=True)
                self.increments[-1][
                    'trailer'] = PdfDictionaryObject.create_from_file(f, self)
                continue
            elif s == b'%%EOF':
                # TODO: check if trailer dict immediately precedes %%EOF
                # since we are seeking until non-ws, the only case EOF marker
                # does not appear by itself is when it is preceded by some
                # whitespaces, which should be ignored
                self.increments[-1]['eof'] = True
                # NOTE(review): the xref branch suggests read_until already
                # consumed the keyword; confirm that skipping 5 more bytes
                # here is intended when another increment follows.
                f.seek(5 + len(eol_marker), io.SEEK_CUR)
                continue
            elif s[0:1] == b'%':
                # otherwise, it is a comment, ignore the whole remaining line
                utils.seek_until(f, syntax.EOL)
                continue

            # Anything else is an object: rewind and parse it from its start.
            f.seek(org_pos, io.SEEK_SET)
            if self.increments[-1]['eof']:
                # The previous increment is closed, so this object starts a
                # new one.
                self.increments.append({
                    'body': [],
                    'xref_section': None,
                    'trailer': None,
                    'startxref': None,
                    'eof': False
                })

            # TODO: how to handle object parse error?
            new_obj = PdfObject.create_from_file(f, self)
            self.increments[-1]['body'].append(new_obj)
            self.offset_obj[org_pos] = new_obj
            if isinstance(new_obj.value, PdfStreamObject
                          ) and new_obj.value.dict.get('Type') == 'ObjStm':
                self.offset_obj_streams[org_pos] = new_obj
            print_progress()

        print('', end="\r")
        print('100% processed    ')
        if progress_cb is not None:
            progress_cb('100% processed', read=f.tell(), total=filesize)
        self.ready = True

        print('Decoding object streams...')
        if progress_cb is not None:
            progress_cb('Decoding object streams...',
                        read=f.tell(),
                        total=filesize)
        for k in self.offset_obj_streams:
            # Imported here rather than at module level, as in the original.
            from objstm import decode_objstm
            self.compressed_obj = {
                **(self.compressed_obj),
                **(decode_objstm(self.offset_obj_streams[k], self))
            }
        print('Done')
        if progress_cb is not None:
            progress_cb('Done', read=f.tell(), total=filesize)
Exemplo n.º 4
0
    def parse_normal(self, f, progress_cb=None):
        '''Initialize a PdfDocument from an opened PDF file f by reading xref and trailers.

        The startxref offset is located by scanning the file backwards from
        the EOF marker; the chain of xref sections is then followed through
        the Prev entries of the trailers, and every in-use, non-compressed
        xref entry is parsed. After this is called, offset_obj,
        offset_obj_streams, compressed_obj, offset_xref_trailer and all xref
        sections are ready.

        Args:
            f: opened binary file positioned anywhere; it is rewound first.
            progress_cb: optional callable invoked as
                progress_cb(message, read=..., total=...) to report progress.

        Raises:
            Exception: if the header is missing, if startxref is not found two
                lines before the EOF marker, if a Prev value is not an
                integer, if the Prev chain revisits an xref offset, or if an
                xref entry does not point at the expected indirect object.
            ValueError: if the line expected to hold the startxref offset is
                not an integer.
        '''
        f.seek(0, io.SEEK_SET)
        # First line is header
        s, eol_marker = utils.read_until(f, syntax.EOL)
        header = re.match(rb'%PDF-(\d+\.\d+)', s)
        if header:
            self.version = Decimal(header.group(1).decode('iso-8859-1'))
            f.seek(len(eol_marker), io.SEEK_CUR)
        else:
            raise Exception('Not a PDF file')

        # read from end of file, find xref
        eof_found = False
        lines_before_keyword = 2  # the EOF marker line and the offset line
        offset_line = b''
        for line in utils.rlines(f):
            if line.rstrip() == b'%%EOF':
                eof_found = True
            if not eof_found:
                continue
            if lines_before_keyword == 0:
                if line.rstrip() == b'startxref':
                    break
                raise Exception(
                    'startxref not found at 2 lines before EOF marker')
            lines_before_keyword -= 1
            offset_line = line
        try:
            xref_offset = int(offset_line.decode('iso-8859-1'))
        except ValueError as ex:
            # Same exception type as before, but with a message that says what
            # is actually missing.
            raise ValueError(
                'startxref offset not found before the EOF marker') from ex
        self.startxref = xref_offset
        # The only required part for a trailer (and marks the end of an increment) is startxref and %%EOF
        self.increments[-1]['startxref'] = xref_offset
        self.increments[-1]['eof'] = True

        inuse_count = 0
        visited_offsets = set()
        while True:
            # A Prev entry pointing back at an xref that was already read
            # would otherwise loop forever and grow self.increments.
            if xref_offset in visited_offsets:
                raise Exception(
                    f'Circular Prev chain: xref at offset {xref_offset} is referenced more than once'
                )
            visited_offsets.add(xref_offset)

            f.seek(xref_offset, io.SEEK_SET)
            xref_section, trailer = self.get_xref_trailer_at_offset(
                f, xref_offset)
            self.offset_xref_trailer[xref_offset] = (xref_section, trailer)
            for subsec in xref_section.subsections:
                inuse_count += len(subsec.inuse_entry)
            self.increments[0]['xref_section'] = xref_section
            self.increments[0]['trailer'] = trailer
            if trailer.get('Prev') is None:
                break
            if trailer['Prev'].value - int(trailer['Prev'].value) != 0:
                raise Exception(
                    f'Prev must be an integer, in trailer dict at offset {xref_offset}'
                )
            xref_offset = int(trailer['Prev'].value)  # must not be indirect
            # Older increments are prepended so that self.increments stays
            # ordered from oldest to newest.
            self.increments = [{
                'body': [],
                'xref_section': None,
                'trailer': None,
                'startxref': None,
                'eof': False
            }] + self.increments
            self.increments[0]['startxref'] = xref_offset
        self.ready = True

        inuse_parsed_count = 0
        # parse each in use obj num
        for inc in self.increments:
            for subsec in inc['xref_section'].subsections:
                for entry in subsec.inuse_entry:
                    if entry.get('compressed'):
                        # Compressed entries are counted here and decoded
                        # from the object streams below.
                        inuse_parsed_count += 1
                        continue
                    offset = entry['offset']
                    f.seek(offset, io.SEEK_SET)
                    new_obj = PdfObject.create_from_file(f, self)
                    if not isinstance(
                            new_obj,
                            PdfIndirectObject) or new_obj.obj_no != entry[
                                'obj_no'] or new_obj.gen_no != entry['gen_no']:
                        raise Exception(
                            f'Invalid obj referenced by xref at offset {offset}'
                        )
                    self.offset_obj[offset] = new_obj
                    if isinstance(new_obj.value,
                                  PdfStreamObject) and new_obj.value.dict.get(
                                      'Type') == 'ObjStm':
                        self.offset_obj_streams[offset] = new_obj
                    inuse_parsed_count += 1
                    message = f'{inuse_parsed_count / inuse_count * 100:5.2f}% processed'
                    print('', end="\r")
                    print(message, end='', flush=True)
                    if progress_cb is not None:
                        progress_cb(message,
                                    read=inuse_parsed_count,
                                    total=inuse_count)

        print('Decoding object streams...')
        if progress_cb is not None:
            progress_cb('Decoding object streams...',
                        read=inuse_parsed_count,
                        total=inuse_count)
        for k in self.offset_obj_streams:
            # Imported here rather than at module level, as in the original.
            from objstm import decode_objstm
            self.compressed_obj = {
                **(self.compressed_obj),
                **(decode_objstm(self.offset_obj_streams[k], self))
            }
        print('', end="\r")
        print('100% processed    ')
        if progress_cb is not None:
            progress_cb('100% processed',
                        read=inuse_parsed_count,
                        total=inuse_count)
        print('Done')
        if progress_cb is not None:
            progress_cb('Done', read=inuse_parsed_count, total=inuse_count)