def create_from_file(cls, f: io.BufferedReader, doc):
    """Parse whichever object starts at the current position of ``f``.

    The first byte is peeked (not consumed) and used to pick the concrete
    parser. A leading non-negative integer needs extra lookahead because it
    may open ``<obj> <gen> obj`` (indirect object) or ``<obj> <gen> R``
    (indirect reference); in those cases the file is rewound to the first
    integer and the dedicated parser is invoked.

    Args:
        cls: Unused by this function.
        f: Buffered binary reader positioned at the first byte of the object.
        doc: Document handle forwarded to the parsers that take one.

    Returns:
        The object produced by the selected parser.

    Raises:
        Exception: If the first byte does not start any handled object kind.
    """
    lead = f.peek(1)[0:1]

    # TODO:
    if lead in (b't', b'f'):
        return PdfBooleanObject.create_from_file(f)

    if lead in (b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9',
                b'+', b'-', b'.'):
        start = f.tell()
        number = PdfNumericObject.create_from_file(f)
        if number.value < 0 or number.value - int(number.value) != 0:
            # a decimal or a negative number, never a indirect obj
            return number

        after_number = f.tell()
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        second, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
        if re.match(rb'\d+$', second) is not None:
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            keyword, _ = utils.read_until(f,
                                          syntax.DELIMS + syntax.WHITESPACES)
            if keyword == b'obj':
                # all 3 tokens are correct, an indirect obj
                f.seek(start, io.SEEK_SET)
                return PdfIndirectObject.create_from_file(f, doc)
            if keyword == b'R':
                # all 3 tokens are correct, an indirect reference
                f.seek(start, io.SEEK_SET)
                return PdfReferenceObject.create_from_file(f, doc)

        # Plain number: undo the lookahead so the next token is untouched.
        f.seek(after_number, io.SEEK_SET)
        return number

    if lead == b'(':
        return PdfLiteralStringObject.create_from_file(f)

    if lead == b'<':
        # '<<' opens a dictionary, a single '<' opens a hex string.
        if utils.peek_at_least(f, 2)[0:2] == b'<<':
            return PdfDictionaryObject.create_from_file(f, doc)
        return PdfHexStringObject.create_from_file(f)

    if lead == b'/':
        return PdfNameObject.create_from_file(f)

    if lead == b'[':
        return PdfArrayObject.create_from_file(f, doc)

    if lead == b'n':
        return PdfNullObject.create_from_file(f)

    raise Exception(f'Unknown token at {f.tell()}')
def decode_objstm(objstmobj, doc):
    """Decode an object stream into its compressed objects.

    The decoded stream starts with ``N`` pairs of integers (object number,
    byte offset relative to ``First``), followed by the objects themselves.

    Args:
        objstmobj: Indirect object whose ``value`` is a ``PdfStreamObject``
            and whose ``obj_no`` identifies the object stream.
        doc: Document handle forwarded to ``PdfObject.create_from_file``.

    Returns:
        dict mapping ``(objstmobj.obj_no, index_within_stream)`` to a
        ``PdfIndirectObject`` wrapping the parsed object, carrying the object
        number from the header and generation number 0.

    Raises:
        ValueError: If ``objstmobj.value`` is not a ``PdfStreamObject``.
        Exception: If ``N``/``First`` or any header integer is missing,
            non-integral or negative.
    """
    streamObj = objstmobj.value
    objstmobj_no = objstmobj.obj_no
    if not isinstance(streamObj, PdfStreamObject):
        raise ValueError(
            'objstmobj is not a PdfIndirectObject containing a PdfStreamObject'
        )

    objbytestream = io.BufferedReader(io.BytesIO(streamObj.decode()))
    # N pairs of integers
    # 1st int is obj no of the compressed object
    # 2nd int is byte offset of that object, relative to the first obj
    objbytestream.seek(0, io.SEEK_SET)
    utils.seek_until(objbytestream,
                     syntax.NON_WHITESPACES,
                     ignore_comment=True)

    # The error messages used to interpolate an undefined name, which turned
    # every failure into a NameError; they now identify the object stream by
    # its object number instead.
    try:
        # TODO: assuming both N and First have direct obj values
        count = int(str(streamObj.dict['N'].value))
        first = int(str(streamObj.dict['First'].value))
        if count < 0 or first < 0:
            raise ValueError('N and First must be non-negative')
    except Exception as ex:
        raise Exception(
            f'Invalid N or First field in ObjStm (obj no. {objstmobj_no}).'
        ) from ex

    numbers = []
    for _ in range(2 * count):
        utils.seek_until(objbytestream,
                         syntax.NON_WHITESPACES,
                         ignore_comment=True)
        entry_pos = objbytestream.tell()
        numobj = PdfNumericObject.create_from_file(objbytestream)
        try:
            number = int(str(numobj.value))
            if number < 0:
                raise ValueError('negative obj no./offset')
        except Exception as ex:
            raise Exception(
                f'Invalid obj no./offset in ObjStm (obj no. {objstmobj_no}) '
                f'at decoded offset {entry_pos}.') from ex
        numbers.append(number)

    result = {}
    for idx, pair in enumerate(utils.chunks(numbers, 2)):
        # gen no, of object stream and of any compressed object is implicitly 0
        objbytestream.seek(first + pair[1], io.SEEK_SET)
        result[objstmobj_no, idx] = PdfIndirectObject(
            PdfObject.create_from_file(objbytestream, doc), pair[0], 0)
    # TODO: check for orphaned bytes between compressed objectes?
    return result
def create_from_file(cls, f: io.BufferedReader, doc):
    """Parse an array (``[ ... ]``) starting at the current position of ``f``.

    Args:
        cls: Unused by this function.
        f: Buffered binary reader positioned at the opening ``[``.
        doc: Document handle forwarded to the element parser.

    Returns:
        A ``PdfArrayObject`` built from the list of parsed elements.

    Raises:
        Exception: If the first byte is not ``[`` (the position is restored
            first), or if parsing any element fails (chained to the cause).
    """
    org_pos = f.tell()
    if f.read(1) != b'[':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid array object at offset {org_pos}.')

    elements = []
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    # Keep reading elements until the closing bracket is the next byte.
    while f.peek(1)[0:1] != b']':
        try:
            elements.append(PdfObject.create_from_file(f, doc))
        except Exception as ex:
            raise Exception(
                f'Parse Error: Not a valid array object at offset {org_pos}.'
            ) from ex
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

    # Consume the closing ']'.
    f.read(1)
    return PdfArrayObject(elements)
def create_from_file(cls, f: io.BufferedReader, doc):
    """Parse an indirect reference of the form ``<obj_no> <gen_no> R``.

    Args:
        cls: Unused by this function.
        f: Buffered binary reader positioned at the first integer.
        doc: Document handle stored in the resulting reference.

    Returns:
        ``PdfReferenceObject(doc, obj_no, gen_no)``.

    Raises:
        Exception: If either integer or the ``R`` keyword is malformed; the
            file position is restored to where parsing began before raising.
    """
    org_pos = f.tell()

    def invalid():
        # Rewind so the caller can retry from the original position.
        f.seek(org_pos, io.SEEK_SET)
        return Exception(
            f'Parse Error: Not a valid reference at offset {org_pos}.')

    # Object number, then generation number: both plain runs of digits.
    ids = []
    for _field in range(2):
        digits, _ = utils.read_until(f, syntax.WHITESPACES)
        if not re.match(rb'\d+$', digits):
            raise invalid()
        ids.append(int(digits))
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

    keyword, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if keyword != b'R':
        raise invalid()

    obj_no, gen_no = ids
    return PdfReferenceObject(doc, obj_no, gen_no)
def create_from_file(cls, f: io.BufferedReader, doc):
    """Parse a dictionary (``<< /Key value ... >>``) from ``f``.

    Args:
        cls: Unused by this function.
        f: Buffered binary reader positioned at the opening ``<<``.
        doc: Document handle forwarded to the value parser.

    Returns:
        A ``PdfDictionaryObject`` built from the parsed key/value mapping.

    Raises:
        Exception: If the input does not start with ``<<``, if a key or value
            fails to parse (chained to the cause), or if something other than
            a name or ``>>`` appears where a key is expected. The position is
            restored for the first and last cases.
    """
    org_pos = f.tell()
    if f.read(2) != b'<<':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid dictionary object at offset {org_pos}.'
        )

    entries = {}
    while True:
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        if f.peek(1)[0:1] != b'/':
            # No further key: whatever follows must be the terminator.
            break
        try:
            # key, must be a name
            key = PdfNameObject.create_from_file(f)
            # value
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            # TODO: check if dict use __eq__ for key equality and existence check
            entries[key] = PdfObject.create_from_file(f, doc)
        except Exception as ex:
            raise Exception(
                f'Parse Error: Not a valid dictionary object at offset {org_pos}.'
            ) from ex

    if utils.peek_at_least(f, 2)[0:2] != b'>>':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid dictionary object at offset {org_pos}.'
        )

    # Consume the closing '>>'.
    f.read(2)
    return PdfDictionaryObject(entries)
def get_xref_trailer_at_offset(self, f, offset):
    """Return the ``(xref_section, trailer_dict)`` pair found at ``offset``.

    Results are memoised in ``self.offset_xref_trailer`` keyed by offset.

    Args:
        f: Seekable binary reader for the PDF file.
        offset: Byte offset at which a classic xref table or an xref stream
            is expected to start.

    Returns:
        Tuple of the parsed ``PdfXRefSection`` and the trailer dictionary
        (for an xref stream, the stream's own dictionary).

    Raises:
        Exception: If the ``trailer`` keyword does not follow a classic xref
            table, or if the xref stream cannot be parsed.
    """
    # read xref, trailer should directly follow, and MUST be read TOGETHER with xref
    # linearized PDF specified the last appering trailer DOES NOT have Prev entry, and startxref points to 1st page xref table near start of file
    # which has its own trailer, making the last trailer technically the 'first' trailer
    # therefore, searching for trailer dict from end of file would get the wrong trailer dict
    # moreover, in a xref stream, the xref and trailer dict is lumped together as the stream object
    if offset in self.offset_xref_trailer:
        return self.offset_xref_trailer[offset]

    # Peek at the first line to decide which flavour of xref this is.
    f.seek(offset, io.SEEK_SET)
    first_line, _ = utils.read_until(f, syntax.EOL)
    f.seek(offset, io.SEEK_SET)

    # TODO: catch exception for parsing PdfXRefSection
    if first_line != b'xref':
        # may be compressed xref stream
        # trailer dict IS the stream dict, and CANNOT contain references
        try:
            xref_stream = PdfIndirectObject.create_from_file(f, self)
        except Exception as ex:
            raise Exception('Invalid xref stream') from ex
        section = PdfXRefSection.from_xrefstm(xref_stream)
        self.offset_xref_trailer[offset] = (section, xref_stream.value.dict)
        return self.offset_xref_trailer[offset]

    # uncompressed xref section
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    section = PdfXRefSection(f)

    # find trailer dict and Prev
    # trailer dict CAN contain references
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    keyword, _ = utils.read_until(f, syntax.EOL)
    if keyword != b'trailer':
        # TODO: check for objects between xref and trailer dict, and between trailer dict and startxref?
        raise Exception(
            f'trailer dict not found after xref table at {f.tell() - 7}')

    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    trailer_dict = PdfDictionaryObject.create_from_file(f, self)
    self.offset_xref_trailer[offset] = (section, trailer_dict)
    return self.offset_xref_trailer[offset]
def parse_linear(self, f, progress_cb=None):
    '''Initialize a PdfDocument from a opened PDF file f from the beginning

    The file is scanned front to back. Each top-level construct is recorded
    on the current entry of ``self.increments`` (``body``, ``xref_section``,
    ``trailer``, ``startxref``, ``eof``); a new entry is started when an
    object follows an ``%%EOF`` marker. Offsets are indexed in
    ``self.offset_xref``, ``self.offset_obj`` and ``self.offset_obj_streams``.
    Afterwards ``self.ready`` is set and every collected object stream is
    decoded into ``self.compressed_obj``.

    Args:
        f: Seekable binary file object backed by a real file descriptor
            (``f.fileno()`` is used to obtain the size).
        progress_cb: Optional callable invoked as
            ``progress_cb(message, read=<bytes read>, total=<file size>)``.

    Raises:
        Exception: ``'Not a PDF file'`` if the file is empty or the first
            line is not a ``%PDF-x.y`` header. Parser errors propagate.
    '''

    def print_progress():
        message = f'{f.tell() / filesize * 100:5.2f}% processed'
        print('', end="\r")
        print(message, end='', flush=True)
        if progress_cb is not None:
            progress_cb(message, read=f.tell(), total=filesize)

    f.seek(0, io.SEEK_SET)
    filesize = os.fstat(f.fileno()).st_size
    if filesize == 0:
        # An empty file would otherwise fail with ZeroDivisionError in the
        # progress computation before the header check is reached.
        raise Exception('Not a PDF file')
    print_progress()

    # First line is header
    s, eol_marker = utils.read_until(f, syntax.EOL)
    header = re.match(rb'%PDF-(\d+\.\d+)', s)
    if header:
        self.version = Decimal(header.group(1).decode('iso-8859-1'))
        # eol_marker may be None when the line ends at EOF (see the note on
        # the %%EOF branch below); treat that as a zero-length marker.
        f.seek(len(eol_marker or b''), io.SEEK_CUR)
    else:
        raise Exception('Not a PDF file')

    while True:
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=False)
        if f.tell() >= filesize:
            break
        org_pos = f.tell()
        s, eol_marker = utils.read_until(f, syntax.EOL)
        if s == b'startxref':
            # the last startxref always override the ones before
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            t, _ = utils.read_until(f, syntax.EOL)
            self.startxref = int(t)
            self.increments[-1]['startxref'] = self.startxref
            continue
        elif s == b'xref':
            # Step back over the keyword: PdfXRefSection parses it itself.
            f.seek(-4, io.SEEK_CUR)
            self.increments[-1]['xref_section'] = PdfXRefSection(f)
            self.offset_xref[org_pos] = self.increments[-1]['xref_section']
            continue
        elif s == b'trailer':
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            self.increments[-1][
                'trailer'] = PdfDictionaryObject.create_from_file(f, self)
            continue
        elif s == b'%%EOF':
            # TODO: check if trailer dict immediately precedes %%EOF
            # since we are seeking until non-ws, the only case EOF marker
            # does not appear by itself it when it is preceded by some
            # whitespaces, which should be ignored
            self.increments[-1]['eof'] = True
            # Land exactly after the marker and its EOL. This is computed
            # from the line start because the marker has already been read;
            # a relative skip of 5 more bytes would swallow the beginning of
            # a following incremental update. The EOL marker is treated as
            # empty when the file ends right after '%%EOF'
            # (assumes read_until reports None there - TODO confirm).
            f.seek(org_pos + len(s) + len(eol_marker or b''), io.SEEK_SET)
            continue
        elif s[0:1] == b'%':
            # otherwise, it is a comment, ignore the whole remaining line
            utils.seek_until(f, syntax.EOL)
            continue

        # Anything else is an object: rewind to its start and parse it.
        f.seek(org_pos, io.SEEK_SET)
        if self.increments[-1]['eof']:
            # An object after %%EOF starts a new incremental update.
            self.increments += [{
                'body': [],
                'xref_section': None,
                'trailer': None,
                'startxref': None,
                'eof': False
            }]
        # TODO: how to handle object parse error?
        new_obj = PdfObject.create_from_file(f, self)
        self.increments[-1]['body'] += [new_obj]
        self.offset_obj[org_pos] = new_obj
        if isinstance(new_obj.value, PdfStreamObject
                      ) and new_obj.value.dict.get('Type') == 'ObjStm':
            self.offset_obj_streams[org_pos] = new_obj
        print_progress()

    print('', end="\r")
    print('100% processed    ')
    if progress_cb is not None:
        progress_cb('100% processed', read=f.tell(), total=filesize)
    self.ready = True

    print('Decoding object streams...')
    if progress_cb is not None:
        progress_cb('Decoding object streams...',
                    read=f.tell(),
                    total=filesize)
    for k in self.offset_obj_streams:
        # Imported lazily, and only when there is something to decode.
        from objstm import decode_objstm
        # Rebind after every stream so later decodes see earlier results.
        self.compressed_obj = {
            **(self.compressed_obj),
            **(decode_objstm(self.offset_obj_streams[k], self))
        }
    print('Done')
    if progress_cb is not None:
        progress_cb('Done', read=f.tell(), total=filesize)
def create_from_file(cls, f: io.BufferedReader, doc):
    """Parse a stream object: a dictionary, ``stream``, data, ``endstream``.

    Args:
        cls: Unused by this function.
        f: Buffered binary reader positioned at the stream dictionary.
        doc: Document handle forwarded to the dictionary parser.

    Returns:
        ``PdfStreamObject(stream_dict, raw)`` where ``raw`` is the
        ``/Length`` bytes that follow the ``stream`` keyword's end-of-line.

    Raises:
        Exception: If the ``stream`` keyword (followed by LF or CRLF) is
            missing, the parsed object is not a dictionary, ``/Length`` is
            absent or not a positive integer, ``/Filter`` does not name a
            filter, or ``endstream`` does not follow the data.
    """
    org_pos = f.tell()
    stream_dict = PdfObject.create_from_file(f, doc)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

    if utils.peek_at_least(f, 7)[0:7] == b'stream\n':
        eol_len = 1
    elif utils.peek_at_least(f, 8)[0:8] == b'stream\r\n':
        eol_len = 2
    else:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')

    # Consume exactly the keyword and its end-of-line marker. Skipping "all
    # whitespace and comments" here would eat leading data bytes whenever the
    # payload starts with a whitespace byte or '%', and would shift the
    # /Length window.
    f.seek(len(b'stream') + eol_len, io.SEEK_CUR)
    data_pos = f.tell()

    # check if dict has the required key /Length with valid values
    if not isinstance(stream_dict, PdfDictionaryObject):
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    if stream_dict.get('Length') is None:
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    length_obj = stream_dict['Length']
    if isinstance(length_obj, PdfReferenceObject):
        size = length_obj.deref().value
    elif isinstance(length_obj, PdfNumericObject):
        size = length_obj.value
    else:
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    # NOTE(review): a /Length of 0 is rejected here as well; confirm whether
    # empty streams should be accepted.
    numerator, denominator = size.as_integer_ratio()
    if numerator <= 0 or denominator != 1:
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    size = numerator

    # check for filters
    filt = None
    if stream_dict.get('Filter') is not None:
        # TODO: Remove the assumption that Filter value is always a direct obj
        filt = stream_dict['Filter']
        if isinstance(filt, PdfArrayObject):
            if any(not isinstance(x, PdfNameObject) for x in filt.value):
                raise Exception(
                    f'Parse Error: Not a valid stream object at offset {org_pos}.'
                )
            filt = filt.value[0]
        # /Filter (or first element of the array) must specify a Name
        if not isinstance(filt, PdfNameObject):
            raise Exception(
                f'Parse Error: Not a valid stream object at offset {org_pos}.'
            )

    # read only /Length bytes
    # filter implementation is reponsible for checking if the data length is correct
    # e.g. if any needed end-of-data marker is present at only the end
    # Re-seek first: resolving an indirect /Length may have moved the file
    # position (TODO confirm whether deref() restores it).
    f.seek(data_pos, io.SEEK_SET)
    raw = f.read(size)

    # check if stream ends with b'endstream', optionally preceeded by b'\r', 'b'\n' or b'\r\n'
    if utils.peek_at_least(f, 2)[0:2] == b'\r\n':
        f.seek(2, io.SEEK_CUR)
    elif utils.peek_at_least(f, 1)[0:1] in (b'\r', b'\n'):
        f.seek(1, io.SEEK_CUR)
    token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if token != b'endstream':
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')

    # actual decoding is done in constructor
    return PdfStreamObject(stream_dict, raw)
def inner2():
    """Parse the object at ``inner_content_pos`` and skip what trails it.

    Relies on ``f``, ``doc`` and ``inner_content_pos`` from the enclosing
    scope. Returns the parsed object, leaving ``f`` at the next
    non-whitespace, non-comment byte.
    """
    f.seek(inner_content_pos, io.SEEK_SET)
    parsed = PdfObject.create_from_file(f, doc)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    return parsed
def create_from_file(cls, f: io.BufferedReader, doc):
    """Parse an indirect object: ``<obj_no> <gen_no> obj ... endobj``.

    The inner object is parsed first. If ``endobj`` does not follow it but a
    ``stream`` keyword does, the content is re-parsed from its start as a
    stream object, which must then be followed by ``endobj``.

    Args:
        cls: Unused by this function.
        f: Buffered binary reader positioned at the object number.
        doc: Document handle forwarded to the inner parsers.

    Returns:
        ``PdfIndirectObject(inner_obj, obj_no, gen_no)``.

    Raises:
        Exception: If the header is malformed, or neither ``endobj`` nor a
            valid stream followed by ``endobj`` comes after the inner object.
            The file position is restored to the start before raising.
    """
    org_pos = f.tell()

    def invalid():
        # Rewind so the caller can retry from the original position.
        f.seek(org_pos, io.SEEK_SET)
        return Exception(
            f'Parse Error: Not a valid indirect object at offset {org_pos}.'
        )

    def closes_object(token, endtoken):
        # endtoken None to indicate EOF
        return token == b'endobj' and (endtoken is None or endtoken != b'')

    # Header: object number and generation number, then the 'obj' keyword.
    ids = []
    for _field in range(2):
        digits, _ = utils.read_until(f, syntax.WHITESPACES)
        if not re.match(rb'\d+$', digits):
            raise invalid()
        ids.append(int(digits))
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    obj_no, gen_no = ids

    keyword, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if keyword != b'obj':
        raise invalid()
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    inner_content_pos = f.tell()

    # parse inner object
    f.seek(inner_content_pos, io.SEEK_SET)
    inner_obj = PdfObject.create_from_file(f, doc)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

    # if inner object is a dict, and is followed by a stream extent, then the object should be stream object
    # otherwise, if there is no endobj token, it is an error
    after_inner = f.tell()
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    token, endtoken = utils.read_until(f,
                                       syntax.DELIMS + syntax.WHITESPACES,
                                       maxsize=7)
    f.seek(after_inner, io.SEEK_SET)

    if closes_object(token, endtoken):
        # Step over the 6 bytes of 'endobj'.
        f.seek(6, io.SEEK_CUR)
        return PdfIndirectObject(inner_obj, obj_no, gen_no)

    if not (utils.peek_at_least(f, 7)[0:7] == b'stream\n'
            or utils.peek_at_least(f, 8)[0:8] == b'stream\r\n'):
        raise invalid()

    # Re-read the content from its start, this time as a stream object.
    f.seek(inner_content_pos, io.SEEK_SET)
    stream_obj = PdfStreamObject.create_from_file(f, doc)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    token, endtoken = utils.read_until(f,
                                       syntax.DELIMS + syntax.WHITESPACES,
                                       maxsize=7)
    if not closes_object(token, endtoken):
        raise invalid()
    #if streamObj.dict.get('Type') == 'ObjStm':
    # Object Stream, decode and parse the content
    return PdfIndirectObject(stream_obj, obj_no, gen_no)