def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Decompress all FlateDecode streams found in mylist, in place.

        Streams with other filters are left untouched and a warning is
        logged once per distinct message.  PNG predictors (10-15)
        declared in DecodeParms (or the abbreviated /DP) are undone via
        flate_png().  Returns True if every stream was skipped cleanly
        or decompressed successfully, False on any failure.

        NOTE: warnings is a deliberately shared mutable default -- the
        set persists across calls so each message is logged only once.
        The trailing keyword defaults bind globals as fast locals.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter: nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        # /DP is the inline-image abbreviation for /DecodeParms
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
                if isinstance(parms, PdfArray):
                    # Merge an array of parameter dicts into one dict.
                    oldparms = parms
                    parms = PdfDict()
                    for x in oldparms:
                        parms.update(x)
                if parms:
                    # Missing entries take the PDF-specified defaults.
                    predictor = int(parms.Predictor or 1)
                    columns = int(parms.Columns or 1)
                    colors = int(parms.Colors or 1)
                    bpc = int(parms.BitsPerComponent or 8)
                    if 10 <= predictor <= 15:
                        # PNG-style predictors
                        data, error = flate_png(data, predictor,
                                                columns, colors, bpc)
                    elif predictor != 1:
                        error = ('Unsupported flatedecode predictor %s' %
                                 repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    # Leftover bytes after the zlib stream ended.
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance,
               list=list, len=len):
    ''' Decompress all parameterless FlateDecode streams in mylist,
        in place.

        Streams using any other filter, or carrying DecodeParms, are
        skipped with a once-per-message warning.  Returns True if every
        stream was skipped cleanly or decompressed, False otherwise.

        NOTE: warnings is a deliberately shared mutable default -- the
        set persists across calls so each message is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter: nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = ('Not decompressing: cannot use filter %s '
                   'with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
            except Exception as s:  # was "except Exception, s" (Py2-only)
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    # Leftover bytes after the zlib stream ended.
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                # BUG FIX: failures previously did not clear ok.
                ok = False
    # BUG FIX: ok was computed but never returned.
    return ok
def old_parsexref(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s)

        Records the file offset of every in-use ('n') entry into
        source.obj_offsets (first writer wins, via setdefault) and
        source.all_offsets.  If strict token parsing fails, falls back
        to a line-by-line scan of the raw table text.
    '''
    fdata = source.fdata
    # Bind hot attribute lookups as locals.
    setdefault = source.obj_offsets.setdefault
    add_offset = source.all_offsets.append
    next = source.next
    tok = next()
    if tok != 'xref':
        source.exception('Expected "xref" keyword')
    # Remember where the table body starts for the fallback scan.
    start = source.floc
    try:
        while 1:
            tok = next()
            if tok == 'trailer':
                # Clean end of the section.
                return
            # Each subsection: first-object-number, then entry count.
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(next())):
                offset = int(next())
                generation = int(next())
                inuse = next()
                if inuse == 'n':
                    if offset != 0:
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                elif inuse != 'f':
                    # Not 'n' (in use) or 'f' (free): malformed table.
                    raise ValueError
    except:
        # NOTE(review): bare except deliberately swallows parse errors
        # so the recovery scan below gets a chance.
        pass
    try:
        # Table formatted incorrectly. See if
        # we can figure it out anyway.
        end = source.fdata.rindex('trailer', start)
        table = source.fdata[start:end].splitlines()
        for line in table:
            tokens = line.split()
            if len(tokens) == 2:
                # Subsection header: "<startobj> <count>"
                objnum = int(tokens[0])
            elif len(tokens) == 3:
                # Entry line: "<offset> <generation> <n|f>"
                offset, generation, inuse = (int(tokens[0]),
                                             int(tokens[1]), tokens[2])
                if offset != 0 and inuse == 'n':
                    setdefault((objnum, generation), offset)
                    add_offset(offset)
                objnum += 1
            elif tokens:
                log.error('Invalid line in xref table: %s' % repr(line))
                raise ValueError
        log.warning('Badly formatted xref table')
        # Reposition just past the table, consume 'trailer'.
        source.floc = end
        source.next()
    except:
        source.floc = start
        source.exception('Invalid table format')
def parsexref(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s)

        Records the file offset of every in-use ('n') entry into
        source.obj_offsets (first writer wins, via setdefault) and
        source.all_offsets.  If strict token parsing fails, falls back
        to a line-by-line scan of the raw table text.
    '''
    fdata = source.fdata
    # Bind hot attribute lookups as locals.
    setdefault = source.obj_offsets.setdefault
    add_offset = source.all_offsets.append
    next = source.next
    tok = next()
    if tok != 'xref':
        source.exception('Expected "xref" keyword')
    # Remember where the table body starts for the fallback scan.
    start = source.floc
    try:
        while 1:
            tok = next()
            if tok == 'trailer':
                # Clean end of the section.
                return
            # Each subsection: first-object-number, then entry count.
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(next())):
                offset = int(next())
                generation = int(next())
                inuse = next()
                if inuse == 'n':
                    if offset != 0:
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                elif inuse != 'f':
                    # Not 'n' (in use) or 'f' (free): malformed table.
                    raise ValueError
    except:
        # NOTE(review): bare except deliberately swallows parse errors
        # so the recovery scan below gets a chance.
        pass
    try:
        # Table formatted incorrectly. See if
        # we can figure it out anyway.
        end = source.fdata.rindex('trailer', start)
        table = source.fdata[start:end].splitlines()
        for line in table:
            tokens = line.split()
            if len(tokens) == 2:
                # Subsection header: "<startobj> <count>"
                objnum = int(tokens[0])
            elif len(tokens) == 3:
                # Entry line: "<offset> <generation> <n|f>"
                offset, generation, inuse = (int(tokens[0]),
                                             int(tokens[1]), tokens[2])
                if offset != 0 and inuse == 'n':
                    setdefault((objnum, generation), offset)
                    add_offset(offset)
                objnum += 1
            elif tokens:
                log.error('Invalid line in xref table: %s' % repr(line))
                raise ValueError
        log.warning('Badly formatted xref table')
        # Reposition just past the table, consume 'trailer'.
        source.floc = end
        source.next()
    except:
        source.floc = start
        source.exception('Invalid table format')
def loadindirect(self, key):
    ''' Load the indirect object identified by key (an
        (objnum, gennum) tuple) from the file, cache it in
        self.indirect_objects, and return it.  Returns a previously
        loaded object directly, and None if the object cannot be
        located.
    '''
    result = self.indirect_objects.get(key)
    if not isinstance(result, PdfIndirect):
        # Already resolved -- return the cached object.
        return result
    source = self.source
    offset = int(self.source.obj_offsets.get(key, '0'))
    if not offset:
        log.warning("Did not find PDF object %s" % (key, ))
        return None

    # Read the object header and validate it
    objnum, gennum = key
    source.floc = offset
    objid = source.multiple(3)
    ok = len(objid) == 3
    ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
    ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
    ok = ok and objid[2] == 'obj'
    if not ok:
        # Header not at the recorded offset -- search the raw file
        # data for a uniquely occurring "<num> <gen> obj" header.
        source.floc = offset
        source.next()
        objheader = '%d %d obj' % (objnum, gennum)
        fdata = source.fdata
        offset2 = (fdata.find('\n' + objheader) + 1 or
                   fdata.find('\r' + objheader) + 1)
        if (not offset2 or
                fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
            # Not found, or found more than once: give up.
            source.warning("Expected indirect object '%s'" % objheader)
            return None
        source.warning("Indirect object %s found at incorrect "
                       "offset %d (expected offset %d)" %
                       (objheader, offset2, offset))
        source.floc = offset2 + len(objheader)

    # Read the object, and call special code if it starts
    # an array or dictionary
    obj = source.next()
    func = self.special.get(obj)
    if func is not None:
        obj = func(source)

    self.indirect_objects[key] = obj
    self.deferred_objects.remove(key)

    # Mark the object as indirect, and
    # add it to the list of streams if it starts a stream
    obj.indirect = key
    tok = source.next()
    if tok != 'endobj':
        self.readstream(obj, self.findstream(obj, tok, source), source)
    return obj
def loadindirect(self, key):
    ''' Load the indirect object identified by key (an
        (objnum, gennum) tuple) from the file, cache it in
        self.indirect_objects, and return it.  Returns a previously
        loaded object directly, and None if the object cannot be
        located.
    '''
    result = self.indirect_objects.get(key)
    if not isinstance(result, PdfIndirect):
        # Already resolved -- return the cached object.
        return result
    source = self.source
    offset = int(self.source.obj_offsets.get(key, '0'))
    if not offset:
        log.warning("Did not find PDF object %s" % (key,))
        return None

    # Read the object header and validate it
    objnum, gennum = key
    source.floc = offset
    objid = source.multiple(3)
    ok = len(objid) == 3
    ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
    ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
    ok = ok and objid[2] == 'obj'
    if not ok:
        # Header not at the recorded offset -- search the raw file
        # data for a uniquely occurring "<num> <gen> obj" header.
        source.floc = offset
        source.next()
        objheader = '%d %d obj' % (objnum, gennum)
        fdata = source.fdata
        offset2 = (fdata.find('\n' + objheader) + 1 or
                   fdata.find('\r' + objheader) + 1)
        if (not offset2 or
                fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
            # Not found, or found more than once: give up.
            source.warning("Expected indirect object '%s'" % objheader)
            return None
        source.warning("Indirect object %s found at incorrect "
                       "offset %d (expected offset %d)" %
                       (objheader, offset2, offset))
        source.floc = offset2 + len(objheader)

    # Read the object, and call special code if it starts
    # an array or dictionary
    obj = source.next()
    func = self.special.get(obj)
    if func is not None:
        obj = func(source)

    self.indirect_objects[key] = obj
    self.deferred_objects.remove(key)

    # Mark the object as indirect, and
    # add it to the list of streams if it starts a stream
    obj.indirect = key
    tok = source.next()
    if tok != 'endobj':
        self.readstream(obj, self.findstream(obj, tok, source), source)
    return obj
def add(obj): ''' Add an object to our list, if it's an indirect object. Just format it if not. ''' # Can't hash dicts, so just hash the object ID objid = id(obj) # Automatically set stream objects to indirect if isinstance(obj, PdfDict): indirect = obj.indirect or (obj.stream is not None) else: indirect = getattr(obj, 'indirect', False) if not indirect: if objid in visited: log.warning('Replicating direct %s object, ' 'should be indirect for optimal file size' % type(obj)) obj = type(obj)(obj) objid = id(obj) visiting(objid) result = format_obj(obj) leaving(objid) return result objnum = indirect_dict_get(objid) # If we haven't seen the object yet, we need to # add it to the indirect object list. if objnum is None: swapped = swapobj(objid) if swapped is not None: old_id = objid obj = swapped objid = id(obj) objnum = indirect_dict_get(objid) if objnum is not None: indirect_dict[old_id] = objnum return '%s 0 R' % objnum objnum = len(objlist) + 1 objlist_append(None) indirect_dict[objid] = objnum deferred.append((objnum - 1, obj)) return '%s 0 R' % objnum
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance,
               list=list, len=len):
    ''' Decompress all parameterless FlateDecode streams in mylist,
        in place.

        Streams using any other filter, or carrying DecodeParms, are
        skipped with a once-per-message warning.  Returns True if every
        stream was skipped cleanly or decompressed, False otherwise.

        NOTE: warnings is a deliberately shared mutable default -- the
        set persists across calls so each message is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter: nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = ('Not decompressing: cannot use filter %s '
                   'with parameters %s' % (repr(ftype), repr(parms)))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
            except Exception as s:  # was "except Exception, s" (Py2-only)
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    # Leftover bytes after the zlib stream ended.
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                # BUG FIX: failures previously did not clear ok.
                ok = False
    # BUG FIX: ok was computed but never returned.
    return ok
def parsexref(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s)

        Handles both a classic "xref" table and a PDF 1.5+
        cross-reference stream object.  Records offsets of in-use
        objects into source.obj_offsets / source.all_offsets, loads
        any compressed (type 2) objects from their object streams,
        and returns the trailer dictionary.
    '''
    def _pairs(array):
        # Yield the /Index array as (first-object-number, count) pairs.
        i = 0
        while 1:
            yield int(array[i]), int(array[i + 1])
            i += 2
            if (i + 1) >= len(array):
                break

    def convert_to_int(d, size):
        # Big-endian bytes -> int, zero-padded to 8 bytes for '>q'.
        if size > 8:
            source.exception('Invalid size in convert_to_int')
        d = '\x00\x00\x00\x00\x00\x00\x00\x00' + d
        d = d[-8:]
        return struct.unpack('>q', d)[0]

    def read_trailer():
        tok = next()
        if tok != '<<':
            source.exception('Expected "<<" starting catalog')
        return self.readdict(source)

    # Bind hot attribute lookups as locals.
    setdefault = source.obj_offsets.setdefault
    add_offset = source.all_offsets.append
    next = source.next
    tok = next()
    if tok.isdigit():
        # check for xref stream object ("<num> <gen> obj")
        objid = source.multiple(2)
        ok = len(objid) == 2
        ok = ok and objid[0].isdigit()
        ok = ok and objid[1] == 'obj'
        if ok:
            next()  # start of dict
            obj = self.readdict(source)
            assert obj.Type == '/XRef'
            tok = next()
            end = source.floc + int(obj.Length)
            self.readstream(obj, self.findstream(obj, tok, source), source)
            uncompress([obj])
            # /Index defaults to [0 Size] when absent; /W gives the
            # byte width of each of the (up to 3) fields per entry.
            num_pairs = obj.Index or PdfArray(['0', obj.Size])
            entry_sizes = [int(x) for x in obj.W]
            object_streams = {}
            for num, size in _pairs(num_pairs):
                cnt = 0
                stream_offset = 0
                while cnt < size:
                    # Decode one entry, one fixed-width field at a time.
                    for i in range(len(entry_sizes)):
                        d = obj.stream[stream_offset:
                                       stream_offset + entry_sizes[i]]
                        stream_offset += entry_sizes[i]
                        di = convert_to_int(d, entry_sizes[i])
                        if i == 0:
                            # Field 1: entry type (width 0 implies type 1).
                            xref_type = di
                            if xref_type == 0 and entry_sizes[0] == 0:
                                xref_type = 1
                        elif i == 1:
                            # Field 2: offset (type 1) or containing
                            # object-stream number (type 2).
                            if xref_type == 1:
                                offset = di
                            elif xref_type == 2:
                                objnum = di
                        elif i == 2:
                            # Field 3: generation (type 1) or index
                            # within the object stream (type 2).
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 1 and offset != 0:
                        setdefault((num, generation), offset)
                        add_offset(offset)
                    elif xref_type == 2:
                        # Compressed object: remember which stream
                        # holds it, to be loaded in one pass below.
                        if not objnum in object_streams:
                            object_streams[objnum] = []
                        object_streams[objnum].append(obstr_idx)
                    cnt += 1
                    num += 1
            self.load_stream_objects(object_streams)
            # Skip past the stream body and check its terminators.
            source.floc = end
            endit = source.multiple(2)
            if endit != ['endstream', 'endobj']:
                source.exception('Expected endstream endobj')
            # The xref stream dict doubles as the trailer.
            return obj
        else:
            source.exception('Expected xref stream')
    elif tok == 'xref':
        # plain xref table
        start = source.floc
        try:
            while 1:
                tok = next()
                if tok == 'trailer':
                    return read_trailer()
                # Each subsection: first object number, then count.
                startobj = int(tok)
                for objnum in range(startobj, startobj + int(next())):
                    offset = int(next())
                    generation = int(next())
                    inuse = next()
                    if inuse == 'n':
                        if offset != 0:
                            setdefault((objnum, generation), offset)
                            add_offset(offset)
                    elif inuse != 'f':
                        raise ValueError
        except:
            # NOTE(review): bare except deliberately swallows parse
            # errors so the recovery scan below gets a chance.
            pass
        try:
            # Table formatted incorrectly.
            # See if we can figure it out anyway.
            end = source.fdata.rindex('trailer', start)
            table = source.fdata[start:end].splitlines()
            for line in table:
                tokens = line.split()
                if len(tokens) == 2:
                    # Subsection header: "<startobj> <count>"
                    objnum = int(tokens[0])
                elif len(tokens) == 3:
                    # Entry line: "<offset> <generation> <n|f>"
                    offset, generation, inuse = \
                        int(tokens[0]), int(tokens[1]), tokens[2]
                    if offset != 0 and inuse == 'n':
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                    objnum += 1
                elif tokens:
                    log.error('Invalid line in xref table: %s' %
                              repr(line))
                    raise ValueError
            log.warning('Badly formatted xref table')
            source.floc = end
            next()
        except:
            source.floc = start
            source.exception('Invalid table format')
        return read_trailer()
    else:
        source.exception('Expected "xref" keyword or xref stream object')
def warning(self, *arg):
    ''' Log a non-fatal diagnostic through the module logger,
        after formatting the arguments via self.msg().
    '''
    message = self.msg(*arg)
    log.warning(message)
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance,
               list=list, len=len):
    ''' Decompress FlateDecode streams in mylist, in place, undoing
        PNG predictors (types 10-15, filters None/Sub/Up) declared in
        DecodeParms.

        Returns True if every stream was skipped cleanly or
        decompressed, False otherwise.

        NOTE: warnings is a deliberately shared mutable default -- the
        set persists across calls so each message is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter: nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        # BUG FIX: the guard used to be
        # "ftype != flate or parms is not None", which sent every
        # stream with DecodeParms to the skip branch and made the
        # predictor handling below unreachable.
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s '
                   'with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
                if parms:
                    # try png predictor
                    # BUG FIX: was int(parms['/Predictor']) or 1, which
                    # raised KeyError when /Predictor was absent;
                    # missing entries default to 1 per the PDF spec.
                    predictor = int(parms.Predictor or 1)
                    # predictor 1 == no predictor
                    if predictor != 1:
                        columns = int(parms.Columns or 1)
                        # PNG prediction:
                        if 10 <= predictor <= 15:
                            output = StringIO()
                            # PNG prediction can vary from row to row;
                            # each row starts with a filter-type byte.
                            rowlen = columns + 1
                            assert len(data) % rowlen == 0
                            prev_rowdata = (0,) * rowlen
                            # BUG FIX: floor division -- "/" yields a
                            # float under Py3 / future division.
                            for row in range(len(data) // rowlen):
                                rowdata = [ord(x) for x in data[
                                    (row * rowlen):((row + 1) * rowlen)]]
                                filter_byte = rowdata[0]
                                if filter_byte == 0:
                                    pass  # None filter: raw bytes
                                elif filter_byte == 1:
                                    # Sub filter: add byte to the left
                                    for i in range(2, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      rowdata[i - 1]) % 256
                                elif filter_byte == 2:
                                    # Up filter: add byte from prior row
                                    for i in range(1, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      prev_rowdata[i]) % 256
                                else:
                                    # unsupported PNG filter
                                    raise Exception(
                                        ('Unsupported PNG '
                                         'filter %r') % filter_byte)
                                prev_rowdata = rowdata
                                output.write(''.join(
                                    [chr(x) for x in rowdata[1:]]))
                            data = output.getvalue()
                        else:
                            # unsupported predictor
                            raise Exception(('Unsupported flatedecode'
                                             ' predictor %r') % predictor)
            except Exception as s:  # was "except Exception, s" (Py2-only)
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                # BUG FIX: failures previously did not clear ok.
                ok = False
    # BUG FIX: ok was computed but never returned.
    return ok
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True):
    ''' Parse a PDF document.

        Exactly one of fname (a path, or a file-like object with a
        read() method) or fdata (the raw file contents) must be given.
        Walks the cross-reference chain backwards via trailer /Prev
        links, merging object offsets, then loads the page list.
        Set decompress to uncompress streams immediately.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)
        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # Truncate at the final %%EOF marker; warn about trailing junk.
        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token dispatch: containers get readers, stray tokens error out.
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source
        xref_table_list = []
        source.all_offsets = []
        while 1:
            source.obj_offsets = {}
            # Loop through all the cross-reference tables
            self.parsexref(source)
            tok = source.next()
            if tok != '<<':
                source.exception('Expected "<<" starting catalog')
            newdict = self.readdict(source)
            token = source.next()
            if token != 'startxref' and not xref_table_list:
                source.warning('Expected "startxref" at end of xref table')
            # Loop if any previously-written tables.
            prev = newdict.Prev
            if prev is None:
                break
            if not xref_table_list:
                # Remember the newest trailer/objects; older revisions
                # only contribute offsets not already seen.
                newdict.Prev = None
                original_indirect = self.indirect_objects.copy()
                original_newdict = newdict
            source.floc = int(prev)
            xref_table_list.append(source.obj_offsets)
            self.indirect_objects.clear()
        if xref_table_list:
            # Merge older tables oldest-first so newer offsets win,
            # then restore the newest trailer and object cache.
            for update in reversed(xref_table_list):
                source.obj_offsets.update(update)
            self.indirect_objects.clear()
            self.indirect_objects.update(original_indirect)
            newdict = original_newdict
        self.update(newdict)

        #self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):
    ''' Parse a PDF document.

        Exactly one of fname (a path, or a file-like object with a
        read() method) or fdata (the raw file contents) must be given.
        With slow_parsing, the whole file is scanned for objects and
        every 'trailer' dict is merged; otherwise the xref chain is
        walked backwards via trailer /Prev links.  Set decompress to
        uncompress streams immediately.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)
        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: It is not necessary to truncate the string.
        # Some PDFs just use wrong EOF at the end to confuse parsers.
        #fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token dispatch: containers get readers, stray tokens error out.
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just for complete the structure of source
            # by adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)
            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                assert source.next() == "trailer"  # trailer
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # Ignored the corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except:
                    pass
                else:
                    # Merge every parseable trailer, earliest first.
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)
            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                newdict = self.readdict(source)
                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning('Expected "startxref" '
                                   'at end of xref table')
                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # Remember the newest trailer/objects; older
                    # revisions only add offsets not already seen.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()
            if xref_table_list:
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict
        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    # BUG FIX: fname may be None (when fdata was passed) or a
    # file-like object; "fname + '.trace'" raised TypeError then.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            f = open(fname_trace, 'rb')
            private.active_trace = pickle.load(f)
            f.close()
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True):
    ''' Parse a PDF document.

        Exactly one of fname (a path, or a file-like object with a
        read() method) or fdata (the raw file contents) must be given.
        Walks the xref chain backwards via trailer /Prev links, tracks
        the highest declared PDF version, and rebuilds a minimal
        trailer (Root/Info/ID).  Set decompress to uncompress streams
        immediately.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)
        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # Version digits from the "%PDF-x.y" header.
        self.version = fdata[5:8]

        # Truncate at the final %%EOF marker; warn about trailing junk.
        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token dispatch: containers get readers, stray tokens error out.
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source
        xref_list = []
        source.all_offsets = []
        while 1:
            source.obj_offsets = {}
            # Loop through all the cross-reference tables/streams
            trailer = self.parsexref(source)
            # Loop if any previously-written xrefs.
            prev = trailer.Prev
            if prev is None:
                token = source.next()
                if token != 'startxref':
                    source.warning('Expected "startxref" '
                                   'at end of xref table')
                break
            if not xref_list:
                # Remember the newest trailer; older revisions only
                # contribute offsets not already seen.
                trailer.Prev = None
                original_trailer = trailer
            source.floc = int(prev)
            xref_list.append(source.obj_offsets)
        if xref_list:
            # Merge older tables oldest-first so newer offsets win.
            for update in reversed(xref_list):
                source.obj_offsets.update(update)
            trailer.update(original_trailer)

        # Trailer /Version overrides the header version if higher.
        if trailer.Version and \
                float(trailer.Version) > float(self.version):
            self.version = trailer.Version

        trailer = PdfDict(
            Root=trailer.Root,
            Info=trailer.Info,
            ID=trailer.ID
            # TODO: add Encrypt when implemented
        )
        self.update(trailer)

        #self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()
    finally:
        if disable_gc:
            gc.enable()
def __init__(self, fname=None, fdata=None, decompress=False,
             decrypt=False, password='', disable_gc=True,
             slow_parsing=True):
    ''' Parse a PDF document.

        Exactly one of fname (a path, or a file-like object with a
        read() method) or fdata (the raw file contents) must be given.
        With slow_parsing, the whole file is scanned and every
        'trailer' dict merged; otherwise the xref chain (tables or
        xref streams) is walked backwards via /Prev links.  Set
        decompress to uncompress streams immediately; decrypt (with
        password) to decrypt an encrypted document.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)
        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # BUG FIX: self.version is read below (float(self.version))
        # but was never initialized in this constructor; sibling
        # versions take it from the "%PDF-x.y" header.
        # NOTE(review): stored on private to match the
        # "self.private.version = trailer.Version" write below --
        # confirm the class exposes version from private.
        private = self.private
        private.version = fdata[5:8]

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: It is not necessary to truncate the string.
        # Some PDFs just use wrong EOF at the end to confuse parsers.
        #fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token dispatch: containers get readers, stray tokens error out.
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just for complete the structure of source
            # by adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)
            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                assert source.next() == "trailer"  # trailer
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # Ignored the corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except:
                    pass
                else:
                    # Merge every parseable trailer, earliest first.
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)
            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
            # the name in slowparsing is newdict
            self.update(newdict)
        else:
            # (A large commented-out copy of the pre-xref-stream
            # parsing loop was removed here; see version control.)
            startloc, source = self.findxref(fdata)
            private.source = source
            # Find all the xref tables/streams, and
            # then deal with them backwards.
            # NOTE(review): unlike the slow branch, source.all_offsets
            # is never initialized here -- confirm PdfTokens (or the
            # matching parsexref) provides it.
            xref_list = []
            while 1:
                source.obj_offsets = {}
                trailer, is_stream = self.parsexref(source)
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref' and not xref_list:
                        source.warning('Expected "startxref" '
                                       'at end of xref table')
                    break
                xref_list.append((source.obj_offsets, trailer, is_stream))
                source.floc = int(prev)

            # Handle document encryption
            private.crypt_filters = None
            if decrypt and PdfName.Encrypt in trailer:
                identity_filter = crypt.IdentityCryptFilter()
                crypt_filters = {PdfName.Identity: identity_filter}
                private.crypt_filters = crypt_filters
                private.stream_crypt_filter = identity_filter
                private.string_crypt_filter = identity_filter
                if not crypt.HAS_CRYPTO:
                    raise PdfParseError(
                        'Install PyCrypto to enable encryption support')
                self._parse_encrypt_info(source, password, trailer)

            # Replay the xrefs newest-last so later offsets win.
            if is_stream:
                self.load_stream_objects(trailer.object_streams)
            while xref_list:
                later_offsets, later_trailer, is_stream = xref_list.pop()
                source.obj_offsets.update(later_offsets)
                if is_stream:
                    trailer.update(later_trailer)
                    self.load_stream_objects(later_trailer.object_streams)
                else:
                    trailer = later_trailer
            trailer.Prev = None

            # Trailer /Version overrides the header version if higher.
            if (trailer.Version and
                    float(trailer.Version) > float(self.version)):
                self.private.version = trailer.Version
            if decrypt:
                self.decrypt_all()
                trailer.Encrypt = None
            if is_stream:
                # Xref-stream dicts carry stream keys we must not
                # copy wholesale; pick out the document-level entries.
                self.Root = trailer.Root
                self.Info = trailer.Info
                self.ID = trailer.ID
                self.Size = trailer.Size
                self.Encrypt = trailer.Encrypt
            else:
                self.update(trailer)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    # BUG FIX: fname may be None (when fdata was passed) or a
    # file-like object; "fname + '.trace'" raised TypeError then.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            f = open(fname_trace, 'rb')
            private.active_trace = pickle.load(f)
            f.close()
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):
    ''' Parse a PDF document.

        Exactly one of fname (a path, or a file-like object with a
        read() method) or fdata (the raw file contents) must be given.
        With slow_parsing, the whole file is scanned for objects and
        every 'trailer' dict is merged; otherwise the xref chain is
        walked backwards via trailer /Prev links.  Set decompress to
        uncompress streams immediately.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)
        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: It is not necessary to truncate the string.
        # Some PDFs just use wrong EOF at the end to confuse parsers.
        #fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token dispatch: containers get readers, stray tokens error out.
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just for complete the structure of source
            # by adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)
            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                assert source.next() == "trailer"  # trailer
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # Ignored the corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except:
                    pass
                else:
                    # Merge every parseable trailer, earliest first.
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)
            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                newdict = self.readdict(source)
                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning(
                        'Expected "startxref" at end of xref table')
                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # Remember the newest trailer/objects; older
                    # revisions only add offsets not already seen.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()
            if xref_table_list:
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict
        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    # BUG FIX: fname may be None (when fdata was passed) or a
    # file-like object; "fname + '.trace'" raised TypeError then.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            f = open(fname_trace, 'rb')
            private.active_trace = pickle.load(f)
            f.close()
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance,
               list=list, len=len):
    """Uncompress FlateDecode streams in-place for every stream object
    found in mylist.

    On success the object's Filter is cleared and its stream replaced by
    the decoded data.  PNG predictors 10-15 (filters None/Sub/Up) are
    supported; other filters/predictors are skipped or reported.

    Returns True if every stream was handled cleanly, False if any stream
    was skipped or failed to decode.

    warnings is a deliberately-shared persistent set: each distinct
    warning message is logged only once across all calls.
    """
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        # Fix: the old guard ("or parms is not None") refused any stream
        # carrying DecodeParms, which made the PNG-predictor handling
        # below unreachable dead code.
        if ftype != flate:
            msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
                if parms:
                    # Try the PNG predictor; predictor 1 == no predictor.
                    # Fix: default must be applied inside int() --
                    # "int(x) or 1" crashed on a missing/None Predictor.
                    predictor = int(parms.Predictor or 1)
                    if predictor != 1:
                        columns = int(parms.Columns or 1)
                        if 10 <= predictor <= 15:
                            # PNG prediction can vary from row to row;
                            # each row is prefixed with a filter byte.
                            rowlen = columns + 1
                            assert len(data) % rowlen == 0
                            rows_out = []
                            prev_rowdata = (0,) * rowlen
                            # Fix: use // so the division stays integral
                            # under "from __future__ import division"/py3.
                            for row in xrange(len(data) // rowlen):
                                rowdata = [ord(x) for x in
                                           data[row * rowlen:(row + 1) * rowlen]]
                                filter_byte = rowdata[0]
                                if filter_byte == 0:
                                    pass  # filter None: raw bytes
                                elif filter_byte == 1:
                                    # filter Sub: add byte to the left
                                    for i in xrange(2, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      rowdata[i - 1]) % 256
                                elif filter_byte == 2:
                                    # filter Up: add byte from row above
                                    for i in xrange(1, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      prev_rowdata[i]) % 256
                                else:
                                    # unsupported PNG filter
                                    raise Exception(('Unsupported PNG '
                                                     'filter %r') % filter_byte)
                                prev_rowdata = rowdata
                                rows_out.append(
                                    ''.join([chr(x) for x in rowdata[1:]]))
                            # join once instead of StringIO writes
                            data = ''.join(rows_out)
                        else:
                            # unsupported predictor
                            raise Exception(('Unsupported flatedecode'
                                             ' predictor %r') % predictor)
            except Exception as s:
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = 'Unconsumed compression data: %s' % repr(
                        dco.unused_data[:20])
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                # Fix: record the failure in the return value, matching
                # the sibling implementation of this function.
                ok = False
    return ok
def __init__(self, fname=None, fdata=None, decompress=False, disable_gc=True):
    """Parse a PDF document from a file path, a readable stream, or raw data.

    fname   -- path to a PDF file, or an object with a .read() method.
    fdata   -- raw PDF file data (mutually exclusive with fname).
    decompress -- if true, FlateDecode streams are uncompressed after load.
    disable_gc -- temporarily switch off the garbage collector while
                  parsing; building many small objects runs a lot faster
                  with GC off.

    Raises PdfParseError for unreadable files, bad headers, or a missing
    EOF marker.
    """
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' % fname)

        assert fdata is not None

        # Validate the header; tolerate leading garbage before '%PDF-'.
        if not fdata.startswith('%PDF-'):
            header_at = fdata.find('%PDF-')
            if header_at >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                first_lines = fdata.lstrip().splitlines()
                if not first_lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s'
                                    % repr(first_lines[0]))

        self.version = fdata[5:8]

        # Trim everything past the last '%%EOF' marker.
        eof_at = fdata.rfind('%EOF')
        if eof_at < 0:
            raise PdfParseError('EOF mark not found: %s'
                                % repr(fdata[-20:]))
        eof_at += 6
        tail_junk = fdata[eof_at:]
        fdata = fdata[:eof_at]
        if tail_junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for delim in r'\ ( ) < > { } ] >> %'.split():
            self.special[delim] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source
        xref_list = []
        source.all_offsets = []

        # Walk the chain of cross-reference tables/streams, newest first.
        while True:
            source.obj_offsets = {}
            trailer = self.parsexref(source)
            prev_loc = trailer.Prev
            if prev_loc is None:
                # End of the chain: expect the closing 'startxref'.
                token = source.next()
                if token != 'startxref':
                    source.warning(
                        'Expected "startxref" at end of xref table')
                break
            if not xref_list:
                # Remember the newest trailer; its Prev link is consumed.
                trailer.Prev = None
                original_trailer = trailer
            source.floc = int(prev_loc)
            xref_list.append(source.obj_offsets)

        if xref_list:
            # Older tables first, so newer offsets override older ones.
            for offsets in reversed(xref_list):
                source.obj_offsets.update(offsets)
            trailer.update(original_trailer)

        if trailer.Version and \
                float(trailer.Version) > float(self.version):
            self.version = trailer.Version

        # Keep only the keys we understand.
        trailer = PdfDict(Root=trailer.Root,
                          Info=trailer.Info,
                          ID=trailer.ID
                          # TODO: add Encrypt when implemented
                          )
        self.update(trailer)

        #self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()
    finally:
        if disable_gc:
            gc.enable()