def get_obj(self, obj_num, gen_num):
    '''Look up an object by object number and generation number.

    The increments are searched from the newest (last) to the oldest (first),
    so an entry in a later increment takes precedence over earlier ones.

    Args:
        obj_num: object number passed to each xref section's get_obj_offset.
        gen_num: generation number passed to each xref section's
            get_obj_offset.

    Returns:
        The cached/parsed object for an in-use entry, the entry of
        self.compressed_obj for a compressed entry, or None when the xref
        entry marks the object as free (offset 0).

    Raises:
        Exception: if the document has not been fully scanned yet
            (self.ready is false), or if no increment knows the object.
        KeyError: if the entry is compressed but has not been decoded into
            self.compressed_obj.
    '''
    if not self.ready:
        raise Exception(
            'get_obj can only be called after the document is scanned completely.'
        )

    # Walk the increments newest-first (index -1 down to -len).
    for increment in reversed(self.increments):
        xref_section = increment['xref_section']
        offset = xref_section.get_obj_offset(obj_num, gen_num)
        if offset is None:
            # offset is None <=> obj_num not found in this increment
            continue
        elif isinstance(offset, tuple):
            # A tuple offset is used as a key into the compressed object table.
            # TODO: handle the case where obj is not already cached
            return self.compressed_obj[offset]
        elif offset > 0:
            if self.offset_obj.get(offset) is None:
                # Parse through a separate handle so the position of the
                # document's own file object is left untouched. The handle is
                # closed as soon as parsing finishes instead of being leaked.
                # NOTE(review): this assumes create_from_file reads everything
                # it needs before returning -- confirm.
                with open(self.__f.name, 'rb') as temp_f:
                    temp_f.seek(offset, io.SEEK_SET)
                    self.offset_obj[offset] = PdfObject.create_from_file(
                        temp_f, self)
            return self.offset_obj[offset]
        elif offset == 0:
            # offset = 0 <=> obj_num is free at gen_num
            return None
    raise Exception('Object not found')
def decode_objstm(objstmobj, doc):
    '''Decode the objects stored inside an object stream.

    Args:
        objstmobj: an indirect object whose value is a PdfStreamObject holding
            the object stream; its obj_no is used in the keys of the result.
        doc: the document, passed through to PdfObject.create_from_file.

    Returns:
        A dict mapping (object number of the object stream, index of the
        object inside the stream) to a PdfIndirectObject wrapping the decoded
        object, with the object number read from the stream and generation 0.

    Raises:
        ValueError: if objstmobj does not contain a PdfStreamObject.
        Exception: if the N/First entries or the integer pairs at the start of
            the stream are missing, malformed or negative.
    '''
    result = {}
    streamObj = objstmobj.value
    objstmobj_no = objstmobj.obj_no
    if not isinstance(streamObj, PdfStreamObject):
        raise ValueError(
            'objstmobj is not a PdfIndirectObject containing a PdfStreamObject'
        )
    objbytestream = io.BufferedReader(io.BytesIO(streamObj.decode()))

    # N pairs of integers
    # 1st int is obj no of the compressed object
    # 2nd int is byte offset of that object, relative to the first obj
    objbytestream.seek(0, io.SEEK_SET)
    utils.seek_until(objbytestream,
                     syntax.NON_WHITESPACES,
                     ignore_comment=True)

    # The error messages identify the stream by its object number; the file
    # offset of the stream is not available inside this function.
    try:
        # TODO: assuming both N and First have direct obj values
        N = int(str(streamObj.dict['N'].value))
        First = int(str(streamObj.dict['First'].value))
    except Exception as ex:
        raise Exception(
            f'Invalid N or First field in ObjStm (obj no. {objstmobj_no}).'
        ) from ex
    if N < 0 or First < 0:
        raise Exception(
            f'Invalid N or First field in ObjStm (obj no. {objstmobj_no}).')

    numbers = []
    for _ in range(2 * N):
        utils.seek_until(objbytestream,
                         syntax.NON_WHITESPACES,
                         ignore_comment=True)
        numobj = PdfNumericObject.create_from_file(objbytestream)
        try:
            temp = int(str(numobj.value))
        except Exception as ex:
            raise Exception(
                f'Invalid ObjStm (obj no. {objstmobj_no}).') from ex
        if temp < 0:
            raise Exception(
                f'Invalid obj no./offset in ObjStm (obj no. {objstmobj_no}).')
        numbers.append(temp)

    for idx, (obj_no, rel_offset) in enumerate(utils.chunks(numbers, 2)):
        # gen no, of object stream and of any compressed object is implicitly 0
        objbytestream.seek(First + rel_offset, io.SEEK_SET)
        result[objstmobj_no, idx] = PdfIndirectObject(
            PdfObject.create_from_file(objbytestream, doc), obj_no, 0)
    # TODO: check for orphaned bytes between compressed objects?
    return result
def parse_linear(self, f, progress_cb=None):
    '''Initialize a PdfDocument from an opened PDF file f by scanning it from the beginning.

    The file is read token by token: startxref, xref, trailer and %%EOF
    lines update the current (last) increment, comment lines are skipped and
    anything else is parsed as an object and appended to the body of the
    current increment. A new increment is started when an object follows an
    %%EOF marker. Once the whole file is scanned, self.ready is set and every
    object stream that was found is decoded into self.compressed_obj.

    Args:
        f: opened binary file; must support fileno(), seek() and tell().
        progress_cb: optional callable invoked as
            progress_cb(message, read=..., total=...) to report progress.

    Raises:
        Exception: if the first line is not a PDF header.
    '''

    def print_progress():
        # An empty file has nothing left to process; avoid dividing by zero so
        # that the caller gets the 'Not a PDF file' error instead.
        percent = f.tell() / filesize * 100 if filesize else 100.0
        message = f'{percent:5.2f}% processed'
        print('', end="\r")
        print(message, end='', flush=True)
        if progress_cb is not None:
            progress_cb(message, read=f.tell(), total=filesize)

    f.seek(0, io.SEEK_SET)
    filesize = os.fstat(f.fileno()).st_size
    print_progress()

    # First line is header
    s, eol_marker = utils.read_until(f, syntax.EOL)
    header = re.match(rb'%PDF-(\d+\.\d+)', s)
    if not header:
        raise Exception('Not a PDF file')
    self.version = Decimal(header.group(1).decode('iso-8859-1'))
    f.seek(len(eol_marker), io.SEEK_CUR)

    while True:
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=False)
        if f.tell() >= filesize:
            break
        org_pos = f.tell()
        s, eol_marker = utils.read_until(f, syntax.EOL)
        if s == b'startxref':
            # the last startxref always override the ones before
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            t, _ = utils.read_until(f, syntax.EOL)
            self.startxref = int(t)
            self.increments[-1]['startxref'] = self.startxref
            continue
        elif s == b'xref':
            # step back over the keyword so the section is parsed from 'xref'
            f.seek(-4, io.SEEK_CUR)
            self.increments[-1]['xref_section'] = PdfXRefSection(f)
            self.offset_xref[org_pos] = self.increments[-1]['xref_section']
            continue
        elif s == b'trailer':
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            self.increments[-1][
                'trailer'] = PdfDictionaryObject.create_from_file(f, self)
            continue
        elif s == b'%%EOF':
            # TODO: check if trailer dict immediately precedes %%EOF
            # since we are seeking until non-ws, the only case EOF marker
            # does not appear by itself is when it is preceded by some
            # whitespaces, which should be ignored
            self.increments[-1]['eof'] = True
            # Position right after the marker and its end-of-line using an
            # absolute offset. A relative seek of 5 + len(eol_marker) from the
            # current position would skip the first bytes of the next
            # increment, because the other branches show that read_until
            # leaves the position just after the token it returned.
            f.seek(org_pos + len(s) + len(eol_marker), io.SEEK_SET)
            continue
        elif s[0:1] == b'%':
            # otherwise, it is a comment, ignore the whole remaining line
            utils.seek_until(f, syntax.EOL)
            continue

        # Anything else is an object: rewind to where the token started.
        f.seek(org_pos, io.SEEK_SET)
        if self.increments[-1]['eof']:
            # An object after %%EOF starts a new incremental update.
            self.increments += [{
                'body': [],
                'xref_section': None,
                'trailer': None,
                'startxref': None,
                'eof': False
            }]
        # TODO: how to handle object parse error?
        new_obj = PdfObject.create_from_file(f, self)
        self.increments[-1]['body'] += [new_obj]
        self.offset_obj[org_pos] = new_obj
        if isinstance(new_obj.value, PdfStreamObject
                      ) and new_obj.value.dict.get('Type') == 'ObjStm':
            self.offset_obj_streams[org_pos] = new_obj
        print_progress()

    print('', end="\r")
    print('100% processed ')
    if progress_cb is not None:
        progress_cb('100% processed', read=f.tell(), total=filesize)
    self.ready = True

    print('Decoding object streams...')
    if progress_cb is not None:
        progress_cb('Decoding object streams...',
                    read=f.tell(),
                    total=filesize)
    for k in self.offset_obj_streams:
        # imported here rather than at module level, as in the original
        from objstm import decode_objstm
        self.compressed_obj = {
            **(self.compressed_obj),
            **(decode_objstm(self.offset_obj_streams[k], self))
        }
    print('Done')
    if progress_cb is not None:
        progress_cb('Done', read=f.tell(), total=filesize)
def parse_normal(self, f, progress_cb=None):
    '''Initialize a PdfDocument from an opened PDF file f by reading xref and trailers.

    The last startxref offset is located by reading the file backwards, then
    the chain of xref sections is followed through the Prev entries of the
    trailers, creating one increment per xref section (oldest first). Finally
    every in-use, non-compressed xref entry is parsed and the object streams
    are decoded.

    After this is called, offset_obj, offset_obj_streams, compressed_obj,
    offset_xref_trailer, all xref sections are ready.

    Args:
        f: opened binary PDF file.
        progress_cb: optional callable invoked as
            progress_cb(message, read=..., total=...) to report progress.

    Raises:
        ValueError: if no '%%EOF' / 'startxref' pair can be located at the end
            of the file, or the line holding the offset is not an integer.
        Exception: if the header is missing, startxref is not found two lines
            before the EOF marker, a Prev entry is not an integer, the Prev
            chain loops back on itself, or an xref entry does not point at the
            object it describes.
    '''
    f.seek(0, io.SEEK_SET)

    # First line is header
    s, eol_marker = utils.read_until(f, syntax.EOL)
    header = re.match(rb'%PDF-(\d+\.\d+)', s)
    if not header:
        raise Exception('Not a PDF file')
    self.version = Decimal(header.group(1).decode('iso-8859-1'))
    f.seek(len(eol_marker), io.SEEK_CUR)

    # read from end of file, find xref
    eof_seen = False
    lines_left = 2  # the %%EOF line itself and the offset line before it
    offset_line = b''
    for line in utils.rlines(f):
        stripped = line.rstrip()
        if stripped == b'%%EOF':
            eof_seen = True
        if not eof_seen:
            continue
        if lines_left == 0:
            if stripped == b'startxref':
                break
            raise Exception(
                'startxref not found at 2 lines before EOF marker')
        lines_left -= 1
        offset_line = line
    else:
        # Ran out of lines without locating the keyword. Without this check
        # the int() conversion below fails with an unhelpful message; the
        # exception type stays ValueError.
        raise ValueError(
            'startxref/%%EOF not found at the end of the file')
    xref_offset = int(offset_line.decode('iso-8859-1'))
    self.startxref = xref_offset

    # The only required part for a trailer (and marks the end of an increment) is startxref and %%EOF
    self.increments[-1]['startxref'] = xref_offset
    self.increments[-1]['eof'] = True

    inuse_count = 0
    visited_offsets = set()
    while True:
        # A Prev chain that revisits an offset would never terminate and would
        # keep prepending increments forever.
        if xref_offset in visited_offsets:
            raise Exception(
                f'Circular Prev chain in trailer dict, xref offset {xref_offset} visited twice'
            )
        visited_offsets.add(xref_offset)

        f.seek(xref_offset, io.SEEK_SET)
        xref_section, trailer = self.get_xref_trailer_at_offset(
            f, xref_offset)
        self.offset_xref_trailer[xref_offset] = (xref_section, trailer)
        for subsec in xref_section.subsections:
            inuse_count += len(subsec.inuse_entry)
        self.increments[0]['xref_section'] = xref_section
        self.increments[0]['trailer'] = trailer
        if trailer.get('Prev') is None:
            break
        if trailer['Prev'].value - int(trailer['Prev'].value) != 0:
            raise Exception(
                f'Prev must be an integer, in trailer dict at offset {xref_offset}'
            )
        xref_offset = int(trailer['Prev'].value)  # must not be indirect
        # The previous xref section belongs to an older increment: put a fresh
        # increment in front so the list stays ordered oldest-first.
        self.increments = [{
            'body': [],
            'xref_section': None,
            'trailer': None,
            'startxref': None,
            'eof': False
        }] + self.increments
        self.increments[0]['startxref'] = xref_offset
    self.ready = True

    inuse_parsed_count = 0
    # parse each in use obj num
    for inc in self.increments:
        for subsec in inc['xref_section'].subsections:
            for entry in subsec.inuse_entry:
                if entry.get('compressed'):
                    # compressed entries are resolved from the object streams
                    inuse_parsed_count += 1
                    continue
                offset = entry['offset']
                f.seek(offset, io.SEEK_SET)
                new_obj = PdfObject.create_from_file(f, self)
                if not isinstance(
                        new_obj, PdfIndirectObject
                ) or new_obj.obj_no != entry[
                        'obj_no'] or new_obj.gen_no != entry['gen_no']:
                    raise Exception(
                        f'Invalid obj referenced by xref at offset {offset}'
                    )
                self.offset_obj[offset] = new_obj
                if isinstance(new_obj.value,
                              PdfStreamObject) and new_obj.value.dict.get(
                                  'Type') == 'ObjStm':
                    self.offset_obj_streams[offset] = new_obj
                inuse_parsed_count += 1
                print('', end="\r")
                print(
                    f'{inuse_parsed_count / inuse_count * 100:5.2f}% processed',
                    end='',
                    flush=True)
                if progress_cb is not None:
                    progress_cb(
                        f'{inuse_parsed_count / inuse_count * 100:5.2f}% processed',
                        read=inuse_parsed_count,
                        total=inuse_count)

    print('Decoding object streams...')
    if progress_cb is not None:
        progress_cb('Decoding object streams...',
                    read=inuse_parsed_count,
                    total=inuse_count)
    for k in self.offset_obj_streams:
        # imported here rather than at module level, as in the original
        from objstm import decode_objstm
        self.compressed_obj = {
            **(self.compressed_obj),
            **(decode_objstm(self.offset_obj_streams[k], self))
        }
    print('', end="\r")
    print('100% processed ')
    if progress_cb is not None:
        progress_cb('100% processed',
                    read=inuse_parsed_count,
                    total=inuse_count)
    print('Done')
    if progress_cb is not None:
        progress_cb('Done', read=inuse_parsed_count, total=inuse_count)