def get_reference(self, object: AnyPDFType, context: WriteTransformerContext) -> Reference:
    """
    Return the Reference under which `object` is registered in `context`.

    `context.indirect_objects` maps a hash to the list of objects sharing
    that hash. If an equal object is already registered its reference is
    returned. Otherwise a new object number is chosen, a Reference carrying
    it is attached to `object`, and `object` is added to the context.
    """
    bucket_key: int = hash(object)

    # an equal object that was registered earlier already owns a reference
    for candidate in context.indirect_objects.get(bucket_key, []):
        if candidate == object:
            known = candidate.get_reference()  # type: ignore [union-attr]
            assert known is not None
            assert isinstance(known, Reference)
            return known

    # every object number that is currently taken
    taken_numbers = {
        registered.get_reference().object_number  # type: ignore [union-attr]
        for bucket in context.indirect_objects.values()
        for registered in bucket
    }

    # start one past the amount of taken numbers, then skip occupied ones
    fresh_number = len(taken_numbers) + 1
    while fresh_number in taken_numbers:
        fresh_number += 1

    # attach the new reference to the object
    fresh = Reference(object_number=fresh_number)
    object.set_reference(fresh)  # type: ignore [union-attr]

    # register the object under the hash computed before it got its reference
    context.indirect_objects.setdefault(bucket_key, []).append(object)

    return fresh
def _read_section(self, src: io.IOBase, tok: HighLevelTokenizer) -> List[Reference]: tokens = [tok.next_non_comment_token() for _ in range(0, 2)] assert tokens[0] is not None assert tokens[1] is not None if tokens[0].text in ["trailer", "startxref"]: src.seek(tokens[0].byte_offset) return [] if tokens[0].token_type != TokenType.NUMBER: raise PDFValueError( byte_offset=tokens[0].byte_offset, expected_value_description="number", received_value_description=tokens[0].text, ) if tokens[1].token_type != TokenType.NUMBER: raise PDFValueError( byte_offset=tokens[1].byte_offset, expected_value_description="number", received_value_description=tokens[1].text, ) start_object_number = int(tokens[0].text) number_of_objects = int(tokens[1].text) indirect_references = [] # read subsection for i in range(0, number_of_objects): tokens = [tok.next_non_comment_token() for _ in range(0, 3)] assert tokens[0] is not None assert tokens[1] is not None assert tokens[2] is not None if tokens[0].text in ["trailer", "startxref"]: raise PDFSyntaxError( byte_offset=tokens[0].byte_offset, message="unexpected EOF while processing XREF", ) if (tokens[0].token_type != TokenType.NUMBER or tokens[1].token_type != TokenType.NUMBER or tokens[2].token_type != TokenType.OTHER or tokens[2].text not in ["f", "n"]): raise PDFSyntaxError( byte_offset=tokens[0].byte_offset, message="invalid XREF line", ) indirect_references.append( Reference( object_number=start_object_number + i, byte_offset=int(tokens[0].text), generation_number=int(tokens[1].text), is_in_use=(tokens[2].text == "n"), )) # return return indirect_references
def test_hash_types(self):
    """
    Build a Dictionary and a List out of PDF primitives and print the hash
    of the List. Only the List is hashed; the Dictionary is merely built.
    """
    # dictionary holding a reference and a boolean
    pdf_dictionary = Dictionary()
    pdf_dictionary[Name("Root")] = Reference(object_number=10)
    pdf_dictionary[Name("Marked")] = Boolean(True)

    # list holding a name and a number
    pdf_list = List()
    for element in (Name("Red"), Decimal(0.5)):
        pdf_list.append(element)

    list_hash = hash(pdf_list)
    print(list_hash)
def _section_xref(self, context: Optional[WriteTransformerContext] = None): assert context is not None # get all references indirect_objects: typing.List[AnyPDFType] = [ item for sublist in [v for k, v in context.indirect_objects.items()] for item in sublist ] references: typing.List[Reference] = [] for obj in indirect_objects: ref = obj.get_reference() # type: ignore [union-attr] if ref is not None: references.append(ref) # sort references.sort(key=lambda x: x.object_number) # insert magic entry if needed if len(references) == 0 or references[0].generation_number != 65535: references.insert( 0, Reference( generation_number=65535, object_number=0, byte_offset=0, is_in_use=False, ), ) # divide into sections sections = [[references[0]]] for i in range(1, len(references)): ref = references[i] prev_object_number = sections[-1][-1].object_number assert prev_object_number is not None if ref.object_number == prev_object_number + 1: sections[-1].append(ref) else: sections.append([ref]) # return return sections
def read_indirect_object(self) -> Optional[AnyPDFType]:
    """
    Try to read an indirect object: an object number, a generation number,
    the keyword "obj" and then the object itself.

    If any of the three leading tokens does not fit, this tokenizer seeks
    back to the byte offset of the first token and None is returned.
    Otherwise the object produced by `read_object` is returned; when it is
    not None, a Reference holding the object and generation number is set
    on it first.
    """
    # object number: a NUMBER token made of digits only
    number_token = self.next_non_comment_token()
    assert number_token is not None
    rewind_offset = number_token.byte_offset
    if not (number_token.token_type == TokenType.NUMBER
            and re.match("^[0-9]+$", number_token.text)):
        self.seek(rewind_offset)
        return None
    object_number = int(number_token.text)

    # generation number: same shape as the object number
    generation_token = self.next_non_comment_token()
    assert generation_token is not None
    if not (generation_token.token_type == TokenType.NUMBER
            and re.match("^[0-9]+$", generation_token.text)):
        self.seek(rewind_offset)
        return None
    generation_number = int(generation_token.text)

    # keyword 'obj'
    keyword_token = self.next_non_comment_token()
    assert keyword_token is not None
    if not (keyword_token.token_type == TokenType.OTHER
            and keyword_token.text == "obj"):
        self.seek(rewind_offset)
        return None

    # the object itself
    parsed = self.read_object()
    if parsed is None:
        return None
    parsed.set_reference(  # type: ignore[union-attr]
        Reference(object_number=object_number,
                  generation_number=generation_number))
    return parsed
def read_indirect_reference(self) -> Optional[Reference]:
    """
    Try to read an indirect reference: an object number, a generation
    number and the keyword "R".

    If any of the three tokens does not fit, this tokenizer seeks back to
    the byte offset of the first token and None is returned. Otherwise a
    Reference holding the object and generation number is returned.
    """
    # object number: a NUMBER token made of digits only
    number_token = self.next_non_comment_token()
    assert number_token is not None
    rewind_offset = number_token.byte_offset
    if not (number_token.token_type == TokenType.NUMBER
            and re.match("^[0-9]+$", number_token.text)):
        self.seek(rewind_offset)
        return None
    object_number = int(number_token.text)

    # generation number: same shape as the object number
    generation_token = self.next_non_comment_token()
    assert generation_token is not None
    if not (generation_token.token_type == TokenType.NUMBER
            and re.match("^[0-9]+$", generation_token.text)):
        self.seek(rewind_offset)
        return None
    generation_number = int(generation_token.text)

    # keyword 'R'
    keyword_token = self.next_non_comment_token()
    assert keyword_token is not None
    if not (keyword_token.token_type == TokenType.OTHER
            and keyword_token.text == "R"):
        self.seek(rewind_offset)
        return None

    return Reference(
        object_number=object_number,
        generation_number=generation_number,
    )
def read(
    self,
    io_source: Union[io.BufferedIOBase, io.RawIOBase],
    tokenizer: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Fill this XREF from a cross-reference stream.

    `io_source` is positioned at `initial_offset` when one is given,
    otherwise `_seek_to_xref_token` is used to position it. The next object
    read from `tokenizer` must be a Stream with a "W" array (field widths in
    bytes) and a "Size" entry; an optional "Index" array lists
    (first object number, entry count) pairs and defaults to
    [0, Size]. The stream is decoded with `decode_stream` and its
    "DecodedBytes" are parsed as fixed-width, big-endian entries.

    Every resulting Reference is appended to this XREF, and the (decoded)
    stream dictionary is stored under "Trailer". Returns `self`.

    Validation is done with `assert`, so malformed input surfaces as
    AssertionError (or IndexError when the decoded data is too short).
    """
    if initial_offset is not None:
        io_source.seek(initial_offset)
    else:
        self._seek_to_xref_token(io_source, tokenizer)

    xref_stream = tokenizer.read_object()
    assert isinstance(xref_stream, Stream)

    # check widths
    assert "W" in xref_stream
    assert all(isinstance(width, Decimal) for width in xref_stream["W"])

    # decode widths (bytes per field: type, field 2, field 3)
    widths = [int(width) for width in xref_stream["W"]]

    # parent
    document = self.get_root()  # type: ignore [attr-defined]

    # list of references, seeded with the entry for object 0
    indirect_references = [
        Reference(
            object_number=0,
            generation_number=65535,
            is_in_use=False,
            document=document,
        )
    ]

    # check size
    assert "Size" in xref_stream
    assert isinstance(xref_stream["Size"], Decimal)

    # get size
    number_of_objects = int(xref_stream["Size"])

    # index
    index = []
    if "Index" in xref_stream:
        index = xref_stream["Index"]
        assert isinstance(index, List)
        assert len(index) % 2 == 0
        assert isinstance(index[0], Decimal)
        assert isinstance(index[1], Decimal)
    else:
        index = [Decimal(0), Decimal(number_of_objects)]

    # apply filters
    xref_stream = decode_stream(xref_stream)

    # read every range specified in \Index
    xref_stream_decoded_bytes = xref_stream["DecodedBytes"]

    # The entries of all subsections are stored back to back in the stream
    # data, so the read cursor must carry over from one \Index range to the
    # next. Resetting it per range would make every later subsection re-read
    # the entries of the first one.
    bptr = 0
    for idx in range(0, len(index), 2):
        start = int(index[idx])
        length = int(index[idx + 1])

        for i in range(0, length):

            # object number
            object_number = start + i

            # read type (defaults to 1 when the type field has width 0)
            entry_type = 1
            if widths[0] > 0:
                entry_type = 0
                for _ in range(0, widths[0]):
                    entry_type = (entry_type << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
                    bptr += 1

            # read field 2
            field2 = 0
            for _ in range(0, widths[1]):
                field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
                bptr += 1

            # read field 3
            field3 = 0
            for _ in range(0, widths[2]):
                field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
                bptr += 1

            # check type
            assert entry_type in [0, 1, 2]

            pdf_indirect_reference = None
            if entry_type == 0:
                # type      :The type of this entry, which shall be 0. Type 0 entries define
                #            the linked list of free objects (corresponding to f entries in a
                #            cross-reference table).
                # field2    : The object number of the next free object
                # field3    : The generation number to use if this object number is used again
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                    is_in_use=False,
                )
            elif entry_type == 1:
                # Type      : The type of this entry, which shall be 1. Type 1 entries define
                #             objects that are in use but are not compressed (corresponding
                #             to n entries in a cross-reference table).
                # field2    : The byte offset of the object, starting from the beginning of the
                #             file.
                # field3    : The generation number of the object. Default value: 0.
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                )
            elif entry_type == 2:
                # Type      : The type of this entry, which shall be 2. Type 2 entries define
                #             compressed objects.
                # field2    : The object number of the object stream in which this object is
                #             stored. (The generation number of the object stream shall be
                #             implicitly 0.)
                # field3    : The index of this object within the object stream.
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    generation_number=0,
                    parent_stream_object_number=field2,
                    index_in_parent_stream=field3,
                )

            assert pdf_indirect_reference is not None

            # look for an entry with the same object number that was read before
            existing_indirect_ref = next(
                (
                    x
                    for x in indirect_references
                    if x.object_number is not None
                    and x.object_number == Decimal(object_number)
                ),
                None,
            )
            ref_is_in_reading_state = (
                existing_indirect_ref is not None
                and existing_indirect_ref.is_in_use
                and existing_indirect_ref.generation_number
                == pdf_indirect_reference.generation_number
            )
            ref_is_first_encountered = existing_indirect_ref is None or (
                not ref_is_in_reading_state
                and existing_indirect_ref.document is None
            )

            if ref_is_first_encountered:
                # append
                indirect_references.append(pdf_indirect_reference)
            elif ref_is_in_reading_state:
                # same object and generation seen before: only take over the
                # object-stream location of the new entry
                assert existing_indirect_ref is not None
                existing_indirect_ref.index_in_parent_stream = (
                    pdf_indirect_reference.index_in_parent_stream
                )
                existing_indirect_ref.parent_stream_object_number = (
                    pdf_indirect_reference.parent_stream_object_number
                )

    # add section
    for r in indirect_references:
        self.append(r)

    # initialize trailer
    self["Trailer"] = Dictionary(xref_stream)

    # return
    return self