def findxref(fdata):
    """Locate the last 'startxref' keyword and open the table it points to.

    The data is searched backwards for 'startxref'.  The token after the
    keyword must be all digits (the table location) and the token after
    that must read as EOF once trailing whitespace and leading '%'
    characters are stripped; violations are reported through the
    tokenizer's ``exception`` method.

    Returns a ``(startloc, tokens)`` pair: the index of the keyword in
    *fdata*, and a new PdfTokens built on *fdata* at the table location.

    Raises PdfParseError when 'startxref' does not occur in *fdata*.
    """
    keyword = 'startxref'
    startloc = fdata.rfind(keyword)
    if startloc < 0:
        raise PdfParseError('Did not find "startxref" at end of file')

    source = PdfTokens(fdata, startloc, False)

    # The call is kept outside the assert so the tokenizer still advances
    # when assertions are disabled.
    first_token = source.next()
    assert first_token == keyword  # rfind() just located it

    tableloc = source.next_default()
    if not tableloc.isdigit():
        source.exception('Expected table location')

    eof_token = source.next_default()
    if eof_token.rstrip().lstrip('%') != 'EOF':
        source.exception('Expected %%EOF')

    xref_tokens = PdfTokens(fdata, int(tableloc), True)
    return startloc, xref_tokens
def load_stream_objects(self, object_streams):
    """Register every object stored inside the given object streams.

    *object_streams* is iterated for object numbers; each is resolved
    with ``findindirect(num, 0)`` and must have Type '/ObjStm'.  The
    streams are decrypted (only when ``self.crypt_filters`` is set) and
    then decompressed.  Each stream's header of (object number, offset)
    pairs is read up to its /First position, and every listed object is
    parsed and stored in ``self.indirect_objects`` under
    ``(number, 0)``.
    """
    # Resolve the stream numbers to the stream objects themselves.
    containers = []
    for stream_num in object_streams:
        container = self.findindirect(stream_num, 0).real_value()
        assert container.Type == '/ObjStm'
        containers.append(container)

    # Nothing to read objects from.
    if not containers:
        return

    # Decrypt before decompressing.
    if self.crypt_filters is not None:
        crypt.decrypt_objects(containers, self.stream_crypt_filter,
                              self.crypt_filters)
    uncompress(containers)

    for container in containers:
        tokens = PdfTokens(container.stream, 0, False)
        read_token = tokens.next
        first = int(container.First)

        # Header: pairs of integers, read until the tokenizer position
        # reaches /First.  Offsets are stored relative to /First.
        table = []
        while tokens.floc < first:
            objnum = int(read_token())
            relative = int(read_token())
            table.append((objnum, first + relative))

        for objnum, position in table:
            # Read the object, and hand over to the special handler if
            # its first token has one (e.g. it starts an array or
            # dictionary).
            tokens.floc = position
            value = read_token()
            handler = self.special.get(value)
            if handler is not None:
                value = handler(tokens)

            key = (objnum, 0)
            self.indirect_objects[key] = value
            if key in self.deferred_objects:
                self.deferred_objects.remove(key)

            # Mark the object as indirect.
            value.indirect = key
def load_stream_objects(self, object_streams):
    """Register every object stored inside the given object streams.

    The keys of *object_streams* are object numbers; each is resolved
    with ``findindirect(num, 0)`` and must have Type '/ObjStm'.  After
    decompression, each stream's leading run of integer pairs is read
    as (object number, offset relative to /First), and every listed
    object is parsed and stored in ``self.indirect_objects`` under
    ``(number, 0)``.
    """
    # Resolve the stream numbers to the stream objects themselves.
    containers = []
    for stream_num in object_streams.iterkeys():
        container = self.findindirect(stream_num, 0).real_value()
        assert container.Type == '/ObjStm'
        containers.append(container)

    # Nothing to read objects from.
    if not containers:
        return

    uncompress(containers)

    for container in containers:
        tokens = PdfTokens(container.stream, 0, False)
        read_token = tokens.next
        first = int(container.First)

        # Header: keep consuming pairs while the next token is all digits.
        # NOTE(review): the scan stops on the first non-digit token, not
        # at /First -- confirm a stream whose first object is a bare
        # number cannot be misread as another header pair.
        table = {}
        token = read_token()
        while token.isdigit():
            relative = int(read_token())
            table[int(token)] = first + relative
            token = read_token()

        for objnum, position in table.iteritems():
            # Read the object, and hand over to the special handler if
            # its first token has one (e.g. it starts an array or
            # dictionary).
            tokens.floc = position
            value = read_token()
            handler = self.special.get(value)
            if handler is not None:
                value = handler(tokens)

            key = (objnum, 0)
            self.indirect_objects[key] = value
            if key in self.deferred_objects:
                self.deferred_objects.remove(key)

            # Mark the object as indirect.
            value.indirect = key
def __init__(self, fname=None, fdata=None, decompress=False, decrypt=False,
             password='', disable_gc=True, slow_parsing=True):
    """Read and parse a PDF given as a path, a readable object, or raw data.

    Parameters:
        fname: path of the file to open, or any object with a ``read()``
            method.  Must not be combined with *fdata*.
        fdata: the raw PDF data, used when *fname* is None.
        decompress: when true, ``self.uncompress()`` is called after the
            page list has been read.
        decrypt: only consulted on the xref-driven path (``slow_parsing``
            not equal to True).  When true and the trailer contains
            /Encrypt, crypt filters are set up and ``decrypt_all()`` runs.
        password: handed to ``_parse_encrypt_info`` when decrypting.
        disable_gc: switch the garbage collector off while parsing; it is
            re-enabled afterwards only if it was enabled on entry.
        slow_parsing: when equal to True the data is tokenized from
            offset 0, ``slow_parse_xref`` is run, and every dictionary
            following a 'trailer' keyword is merged (later ones update
            earlier ones).  Otherwise the xref chain is followed from
            'startxref' through each /Prev entry.

    Raises:
        PdfParseError: the file cannot be read, the data is empty, or no
            '%PDF-' header is present.  Further parse errors are reported
            through the tokenizer's ``exception`` method.

    After a successful parse of a path *fname*, a sibling file named
    ``fname + '.trace'`` is unpickled into ``private.active_trace`` if it
    exists.
    """
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the handle even when
                    # read() fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' % repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            # Makes endloc equal len(fdata) after the += 6 below.
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # fdata is deliberately not truncated at endloc: some PDFs just
        # use a wrong EOF at the end to confuse parsers.
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        # Only values that compare equal to True select the slow path.
        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just to complete the structure of source by
            # adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Slow parsing for multiple trailers: visit every occurrence
            # of the word 'trailer' in the data.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                # Keep the call outside the assert so the tokenizer still
                # advances when assertions are disabled (python -O).
                tok = source.next()  # trailer
                assert tok == "trailer"
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # A corrupted trailer is ignored on purpose, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                try:
                    tmpdict = self.readdict(source)
                except Exception:
                    pass
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)

            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
            # On the slow path the merged trailer is called newdict.
            self.update(newdict)
        else:
            # (The superseded table-only xref loop that used to be kept
            # here inside an inert string literal has been removed.)
            startloc, source = self.findxref(fdata)
            private.source = source

            # Find all the xref tables/streams, and
            # then deal with them backwards.
            xref_list = []
            while 1:
                source.obj_offsets = {}
                trailer, is_stream = self.parsexref(source)
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref' and not xref_list:
                        source.warning('Expected "startxref" '
                                       'at end of xref table')
                    break
                xref_list.append((source.obj_offsets, trailer, is_stream))
                source.floc = int(prev)

            # Handle document encryption
            private.crypt_filters = None
            if decrypt and PdfName.Encrypt in trailer:
                identity_filter = crypt.IdentityCryptFilter()
                crypt_filters = {PdfName.Identity: identity_filter}
                private.crypt_filters = crypt_filters
                private.stream_crypt_filter = identity_filter
                private.string_crypt_filter = identity_filter

                if not crypt.HAS_CRYPTO:
                    raise PdfParseError(
                        'Install PyCrypto to enable encryption support')

                self._parse_encrypt_info(source, password, trailer)

            if is_stream:
                self.load_stream_objects(trailer.object_streams)

            # Apply the newer sections on top of the oldest one.
            while xref_list:
                later_offsets, later_trailer, is_stream = xref_list.pop()
                source.obj_offsets.update(later_offsets)
                if is_stream:
                    trailer.update(later_trailer)
                    self.load_stream_objects(later_trailer.object_streams)
                else:
                    trailer = later_trailer

            trailer.Prev = None

            # NOTE(review): self.version is not assigned anywhere in this
            # method before this comparison -- confirm it is initialised
            # elsewhere.
            if (trailer.Version and
                    float(trailer.Version) > float(self.version)):
                self.private.version = trailer.Version

            if decrypt:
                self.decrypt_all()
                trailer.Encrypt = None

            if is_stream:
                self.Root = trailer.Root
                self.Info = trailer.Info
                self.ID = trailer.ID
                self.Size = trailer.Size
                self.Encrypt = trailer.Encrypt
            else:
                self.update(trailer)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # Load the trace.  Only a filesystem path can have a sibling
    # "<path>.trace" file; the old unconditional `fname + '.trace'` raised
    # TypeError when the reader was built from fdata or a file-like object.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            # NOTE(security): unpickling can execute arbitrary code; the
            # .trace file must come from a trusted source.
            with open(fname_trace, 'rb') as f:
                private.active_trace = pickle.load(f)
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):
    """Read and parse a PDF given as a path, a readable object, or raw data.

    Parameters:
        fname: path of the file to open, or any object with a ``read()``
            method.  Must not be combined with *fdata*.
        fdata: the raw PDF data, used when *fname* is None.
        decompress: when true, ``self.uncompress()`` is called after the
            page list has been read.
        disable_gc: switch the garbage collector off while parsing; it is
            re-enabled afterwards only if it was enabled on entry.
        slow_parsing: when equal to True the data is tokenized from
            offset 0, ``slow_parse_xref`` is run, and every dictionary
            following a 'trailer' keyword is merged (later ones update
            earlier ones).  Otherwise the cross-reference tables are
            followed from 'startxref' through each /Prev entry.

    Raises:
        PdfParseError: the file cannot be read, the data is empty, or no
            '%PDF-' header is present.  Further parse errors are reported
            through the tokenizer's ``exception`` method.

    After a successful parse of a path *fname*, a sibling file named
    ``fname + '.trace'`` is unpickled into ``private.active_trace`` if it
    exists.
    """
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the handle even when
                    # read() fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' % repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            # Makes endloc equal len(fdata) after the += 6 below.
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # fdata is deliberately not truncated at endloc: some PDFs just
        # use a wrong EOF at the end to confuse parsers.
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        # Only values that compare equal to True select the slow path.
        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just to complete the structure of source by
            # adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Slow parsing for multiple trailers: visit every occurrence
            # of the word 'trailer' in the data.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                # Keep the call outside the assert so the tokenizer still
                # advances when assertions are disabled (python -O).
                tok = source.next()  # trailer
                assert tok == "trailer"
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # A corrupted trailer is ignored on purpose, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                try:
                    tmpdict = self.readdict(source)
                except Exception:
                    pass
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)

            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                newdict = self.readdict(source)

                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning('Expected "startxref" at end of xref table')

                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # First (most recent) table: remember its trailer and
                    # the objects read so far, to restore them below.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()

            if xref_table_list:
                # Apply the saved offset tables so that the most recent
                # table is applied last.
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict

        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # Load the trace.  Only a filesystem path can have a sibling
    # "<path>.trace" file; the old unconditional `fname + '.trace'` raised
    # TypeError when the reader was built from fdata or a file-like object.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            # NOTE(security): unpickling can execute arbitrary code; the
            # .trace file must come from a trusted source.
            with open(fname_trace, 'rb') as f:
                private.active_trace = pickle.load(f)
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):
    """Build the reader from a file path, a readable object, or raw data.

    Parameters:
        fname: path of the file to open, or any object with a ``read()``
            method.  Must not be combined with *fdata*.
        fdata: the raw PDF data, used when *fname* is None.
        decompress: when true, ``self.uncompress()`` is called after the
            page list has been read.
        disable_gc: switch the garbage collector off while parsing; it is
            re-enabled afterwards only if it was enabled on entry.
        slow_parsing: when equal to True the data is tokenized from
            offset 0, ``slow_parse_xref`` is run, and every dictionary
            following a 'trailer' keyword is merged (later ones update
            earlier ones).  Otherwise the cross-reference tables are
            followed from 'startxref' through each /Prev entry.

    Raises:
        PdfParseError: the file cannot be read, the data is empty, or no
            '%PDF-' header is present.  Further parse errors are reported
            through the tokenizer's ``exception`` method.

    After a successful parse of a path *fname*, a sibling file named
    ``fname + '.trace'`` is unpickled into ``private.active_trace`` if it
    exists.
    """
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the handle even when
                    # read() fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' % repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            # Makes endloc equal len(fdata) after the += 6 below.
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # fdata is deliberately not truncated at endloc: some PDFs just
        # use a wrong EOF at the end to confuse parsers.
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        # Only values that compare equal to True select the slow path.
        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just to complete the structure of source by
            # adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Slow parsing for multiple trailers: visit every occurrence
            # of the word 'trailer' in the data.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                # Keep the call outside the assert so the tokenizer still
                # advances when assertions are disabled (python -O).
                tok = source.next()  # trailer
                assert tok == "trailer"
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # A corrupted trailer is ignored on purpose, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                try:
                    tmpdict = self.readdict(source)
                except Exception:
                    pass
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)

            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                newdict = self.readdict(source)

                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning(
                        'Expected "startxref" at end of xref table')

                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # First (most recent) table: remember its trailer and
                    # the objects read so far, to restore them below.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()

            if xref_table_list:
                # Apply the saved offset tables so that the most recent
                # table is applied last.
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict

        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # Load the trace.  Only a filesystem path can have a sibling
    # "<path>.trace" file; the old unconditional `fname + '.trace'` raised
    # TypeError when the reader was built from fdata or a file-like object.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            # NOTE(security): unpickling can execute arbitrary code; the
            # .trace file must come from a trusted source.
            with open(fname_trace, 'rb') as f:
                private.active_trace = pickle.load(f)