def _seek_to_xref_token(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
):
    # find "startxref" text
    start_of_xref_token_byte_offset = self._find_backwards(src, tok, "startxref")
    assert start_of_xref_token_byte_offset is not None
    assert start_of_xref_token_byte_offset != -1

    # set tokenizer to "startxref"
    src.seek(start_of_xref_token_byte_offset)
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text == "xref":
        src.seek(start_of_xref_token_byte_offset)
        return

    # if we are at "startxref", we are reading the XREF table backwards
    # and we need to go back to the start of the XREF
    if token.text == "startxref":
        token = tok.next_non_comment_token()
        assert token is not None
        assert token.token_type == TokenType.NUMBER

        start_of_xref_offset = int(token.text)
        src.seek(start_of_xref_offset)
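# For reference, the tail of a well-formed PDF ends with the "startxref" keyword,
# the byte offset of the last XREF section, and the %%EOF marker (illustrative
# offset, not from the source):
#
#     startxref
#     18799
#     %%EOF
#
# _seek_to_xref_token scans backwards for "startxref", reads the NUMBER token that
# follows it (18799 above), and seeks src to that offset, so that the next token
# the caller reads is the "xref" keyword itself.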
def _read_trailer(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> Dictionary:
    # return an empty Dictionary if there is no "trailer" keyword
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text != "trailer":
        return Dictionary()

    # if there is a keyword "trailer", the next token should be TokenType.START_DICT
    token = tok.next_non_comment_token()
    assert token is not None
    assert token.token_type == TokenType.START_DICT

    # go back 2 chars "<<"
    src.seek(-2, io.SEEK_CUR)

    # read the dictionary as the trailer
    trailer_dict = tok.read_dictionary()

    # process startxref
    token = tok.next_non_comment_token()
    assert token is not None
    assert token.token_type == TokenType.OTHER
    assert token.text == "startxref"

    # return
    return trailer_dict
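# The trailer consumed above typically looks like this in the file (illustrative
# values; /Size and /Root are the usual required keys):
#
#     trailer
#     << /Size 22 /Root 1 0 R /Info 2 0 R >>
#     startxref
#     18799
#     %%EOF
#
# read_dictionary() consumes the << ... >> body; the "startxref" keyword is
# asserted here, and the offset after it is left in the stream for the caller.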
def read(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    This method attempts to read a plaintext XREF from the given source.
    It either raises an exception or returns this XREF.
    If initial_offset is given, reading starts there; otherwise the offset
    is located via the "startxref" keyword.
    """
    if initial_offset is not None:
        src.seek(initial_offset)
    else:
        self._seek_to_xref_token(src, tok)

    # now we should be back at the start of the XREF
    token = tok.next_non_comment_token()
    assert token is not None
    assert token.text == "xref"

    # read xref sections until an empty one is returned
    while True:
        xref_section = self._read_section(src, tok)
        if len(xref_section) == 0:
            break
        for r in xref_section:
            self.append(r)

    # process trailer
    self[Name("Trailer")] = self._read_trailer(src, tok)

    # return self
    return self
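# A minimal usage sketch (file name illustrative; assumes XREF() takes no
# constructor arguments and that HighLevelTokenizer accepts any binary stream,
# as it does io.BytesIO elsewhere in this code):
#
#     with open("input.pdf", "rb") as fh:
#         tok = HighLevelTokenizer(fh)
#         xref = XREF().read(fh, tok)          # locates the table via "startxref"
#         trailer = xref[Name("Trailer")]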
def _read_section(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> List[Reference]:
    # read the subsection header: <start_object_number> <number_of_objects>
    tokens = [tok.next_non_comment_token() for _ in range(0, 2)]
    assert tokens[0] is not None
    assert tokens[1] is not None
    if tokens[0].text in ["trailer", "startxref"]:
        # no more subsections; rewind so the caller can process the keyword
        src.seek(tokens[0].byte_offset)
        return []
    assert tokens[0].token_type == TokenType.NUMBER
    assert tokens[1].token_type == TokenType.NUMBER

    start_object_number = int(tokens[0].text)
    number_of_objects = int(tokens[1].text)
    indirect_references = []

    # read the subsection entries: <byte_offset> <generation_number> <f|n>
    for i in range(0, number_of_objects):
        tokens = [tok.next_non_comment_token() for _ in range(0, 3)]
        assert tokens[0] is not None
        assert tokens[0].text not in ["trailer", "startxref"]
        assert tokens[0].token_type == TokenType.NUMBER
        assert tokens[1] is not None
        assert tokens[1].token_type == TokenType.NUMBER
        assert tokens[2] is not None
        assert tokens[2].token_type == TokenType.OTHER
        assert tokens[2].text in ["f", "n"]

        indirect_references.append(
            Reference(
                object_number=start_object_number + i,
                byte_offset=int(tokens[0].text),
                generation_number=int(tokens[1].text),
                is_in_use=(tokens[2].text == "n"),
            )
        )

    # return
    return indirect_references
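# The plaintext XREF sections parsed above look like this in the file
# (illustrative offsets; the "xref" keyword itself is consumed by read()):
#
#     xref
#     0 3
#     0000000000 65535 f
#     0000000017 00000 n
#     0000000081 00000 n
#
# "0 3" is the (start_object_number, number_of_objects) header; each entry is a
# (byte_offset, generation_number, f|n) triple, where "n" marks an in-use object
# and "f" a free one.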
def read(self, cmap_bytes: str) -> "CMap":
    N = len(cmap_bytes)
    tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))
    prev_token: Optional[Token] = None
    while tok.tell() < N:
        token = tok.next_non_comment_token()
        if token is None:
            break

        # beginbfchar: <n> pairs of <src_code> <dst_code>
        if token.text == "beginbfchar":
            assert prev_token is not None
            n = int(prev_token.text)
            for j in range(0, n):
                obj = tok.read_object()
                assert isinstance(obj, HexadecimalString)
                c = self._hex_string_to_int_or_tuple(obj)
                assert isinstance(c, int)

                obj = tok.read_object()
                assert isinstance(obj, HexadecimalString)
                uc = self._hex_string_to_int_or_tuple(obj)

                self._add_symbol(c, uc)
            continue

        # beginbfrange: <n> triples of <src_start> <src_end> (<dst_start> | [<dst> ...])
        if token.text == "beginbfrange":
            assert prev_token is not None
            n = int(prev_token.text)
            for j in range(0, n):
                c_start_token = tok.read_object()
                assert c_start_token is not None
                assert isinstance(c_start_token, HexadecimalString)
                c_start = int(str(c_start_token), 16)

                c_end_token = tok.read_object()
                assert c_end_token is not None
                assert isinstance(c_end_token, HexadecimalString)
                c_end = int(str(c_end_token), 16)

                tmp = tok.read_object()
                # destination is a single code: map the whole range by offset
                if isinstance(tmp, HexadecimalString):
                    uc = self._hex_string_to_int_or_tuple(tmp)
                    for k in range(0, c_end - c_start + 1):
                        if isinstance(uc, int):
                            self._add_symbol(c_start + k, uc + k)
                        elif isinstance(uc, tuple):
                            self._add_symbol(c_start + k, (uc[0], uc[1] + k))
                # destination is an array: map each code individually
                elif isinstance(tmp, list):
                    for k in range(0, c_end - c_start + 1):
                        uc = self._hex_string_to_int_or_tuple(tmp[k])
                        self._add_symbol(c_start + k, uc)
            continue

        # default: remember this token; it may be the count preceding a begin* keyword
        prev_token = token
    return self
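# A minimal sketch of the CMap syntax handled above (input illustrative; assumes
# CMap() takes no constructor arguments):
#
#     example_cmap = (
#         "1 beginbfchar\n"
#         "<0003> <0020>\n"
#         "endbfchar\n"
#         "1 beginbfrange\n"
#         "<0041> <005A> <0041>\n"
#         "endbfrange\n"
#     )
#     cmap = CMap().read(example_cmap)
#
# The bfchar entry maps code 0x0003 to U+0020 (space); the bfrange entry maps
# codes 0x0041..0x005A onto U+0041..U+005A by applying a constant offset.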