def _seek_to_xref_token(self, src: io.IOBase, tok: HighLevelTokenizer):
    """
    Position src at the start of the XREF table by scanning backwards
    for the "startxref" keyword and following its byte offset.
    """
    # find "startxref" text
    start_of_xref_token_byte_offset = self._find_backwards(src, tok, "startxref")
    assert start_of_xref_token_byte_offset is not None
    if start_of_xref_token_byte_offset == -1:
        raise StartXREFTokenNotFoundError()

    # set tokenizer to "startxref"
    src.seek(start_of_xref_token_byte_offset)
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text == "xref":
        src.seek(start_of_xref_token_byte_offset)
        return

    # if we are at startxref, we are reading the XREF table backwards
    # and we need to go back to the start of XREF
    if token.text == "startxref":
        token = tok.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.NUMBER:
            raise PDFSyntaxError(
                byte_offset=token.byte_offset,
                message="invalid XREF",
            )

        start_of_xref_offset = int(token.text)
        src.seek(start_of_xref_offset)
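# The _find_backwards helper used above is not part of this excerpt. As a
# rough illustration only (a sketch, not the implementation above relies on),
# a backwards scan for "startxref" could look like the following. The PDF
# specification requires the keyword to appear near the end of the file, so
# scanning a small tail window is usually enough; the function name and
# window size here are assumptions.
def _find_startxref_offset_sketch(src: io.IOBase, window: int = 2048) -> int:
    # remember where the file ends
    src.seek(0, io.SEEK_END)
    file_size = src.tell()
    # read the last `window` bytes (or the whole file if it is smaller)
    tail_start = max(0, file_size - window)
    src.seek(tail_start)
    tail = src.read()
    # rfind returns the LAST occurrence, i.e. the live "startxref"
    pos = tail.rfind(b"startxref")
    return -1 if pos == -1 else tail_start + pos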
def _read_trailer(self, src: io.IOBase, tok: HighLevelTokenizer) -> Dictionary:
    """
    Read the trailer dictionary that follows the XREF table.
    """
    # return an empty Dictionary if there is no "trailer" keyword
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text != "trailer":
        return Dictionary()

    # after the keyword "trailer", the next token should be TokenType.START_DICT
    token = tok.next_non_comment_token()
    assert token is not None
    if token.token_type != TokenType.START_DICT:
        raise PDFSyntaxError(
            byte_offset=tok.tell(),
            message="invalid XREF trailer",
        )

    # go back 2 chars "<<"
    src.seek(-2, io.SEEK_CUR)

    # read dictionary as trailer
    trailer_dict = tok.read_dictionary()

    # process startxref
    token = tok.next_non_comment_token()
    assert token is not None
    if token.token_type != TokenType.OTHER or token.text != "startxref":
        raise PDFSyntaxError(
            byte_offset=token.byte_offset,
            message="start of XREF not found",
        )

    # return
    return trailer_dict
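# For reference, the byte layout _read_trailer consumes (a classic,
# non-compressed PDF trailer, per the PDF specification; values illustrative):
#
#   trailer
#   << /Size 22 /Root 1 0 R /Info 2 0 R >>
#   startxref
#   18799
#   %%EOF
#
# read_dictionary() parses the << ... >> dictionary; the "startxref" keyword
# and its offset follow, which is why the method checks for it afterwards.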
def _read_section(self, src: io.IOBase, tok: HighLevelTokenizer) -> List[Reference]:
    """
    Read a single XREF subsection, returning one Reference per entry.
    An empty list signals that "trailer" or "startxref" was reached.
    """
    tokens = [tok.next_non_comment_token() for _ in range(0, 2)]
    assert tokens[0] is not None
    assert tokens[1] is not None
    if tokens[0].text in ["trailer", "startxref"]:
        src.seek(tokens[0].byte_offset)
        return []
    if tokens[0].token_type != TokenType.NUMBER:
        raise PDFValueError(
            byte_offset=tokens[0].byte_offset,
            expected_value_description="number",
            received_value_description=tokens[0].text,
        )
    if tokens[1].token_type != TokenType.NUMBER:
        raise PDFValueError(
            byte_offset=tokens[1].byte_offset,
            expected_value_description="number",
            received_value_description=tokens[1].text,
        )

    start_object_number = int(tokens[0].text)
    number_of_objects = int(tokens[1].text)
    indirect_references: List[Reference] = []

    # read subsection
    for i in range(0, number_of_objects):
        tokens = [tok.next_non_comment_token() for _ in range(0, 3)]
        assert tokens[0] is not None
        assert tokens[1] is not None
        assert tokens[2] is not None
        if tokens[0].text in ["trailer", "startxref"]:
            raise PDFSyntaxError(
                byte_offset=tokens[0].byte_offset,
                message="unexpected EOF while processing XREF",
            )
        if (
            tokens[0].token_type != TokenType.NUMBER
            or tokens[1].token_type != TokenType.NUMBER
            or tokens[2].token_type != TokenType.OTHER
            or tokens[2].text not in ["f", "n"]
        ):
            raise PDFSyntaxError(
                byte_offset=tokens[0].byte_offset,
                message="invalid XREF line",
            )
        indirect_references.append(
            Reference(
                object_number=start_object_number + i,
                byte_offset=int(tokens[0].text),
                generation_number=int(tokens[1].text),
                is_in_use=(tokens[2].text == "n"),
            )
        )

    # return
    return indirect_references
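# For reference, the subsection layout _read_section consumes. Each XREF
# subsection starts with the first object number and the entry count; every
# entry is a 10-digit byte offset, a 5-digit generation number, and a flag,
# "n" for in-use or "f" for free:
#
#   xref
#   0 3
#   0000000000 65535 f
#   0000000015 00000 n
#   0000000120 00000 n
#
# (the "xref" keyword itself is consumed by read() below, not by this method)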
def read(
    self,
    src: io.IOBase,
    tok: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Read the XREF table, either from initial_offset or by locating
    "startxref" at the end of the file, and store the trailer under "Trailer".
    """
    if initial_offset is not None:
        src.seek(initial_offset)
    else:
        self._seek_to_xref_token(src, tok)

    # now we should be at the start of XREF
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text != "xref":
        raise XREFTokenNotFoundError()

    # read xref sections
    while True:
        xref_section = self._read_section(src, tok)
        if len(xref_section) == 0:
            break
        for r in xref_section:
            self.append(r)

    # process trailer
    self["Trailer"] = self._read_trailer(src, tok)

    # return self
    return self
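# A minimal usage sketch, assuming XREF can be instantiated without arguments
# (its constructor is not shown in this excerpt) and that HighLevelTokenizer
# accepts any binary stream, as in the CMap reader below:
#
#   with open("document.pdf", "rb") as fh:
#       tok = HighLevelTokenizer(fh)
#       xref = XREF().read(fh, tok)
#       trailer = xref["Trailer"]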
def read(self, cmap_bytes: str) -> "CMap":
    """
    Read a CMap, processing its beginbfchar and beginbfrange sections
    to build the character-code to Unicode mapping.
    """
    N = len(cmap_bytes)
    tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))
    prev_token = None
    while tok.tell() < N:
        token = tok.next_non_comment_token()
        if token is None:
            break

        # beginbfchar: the previous token holds the number of entries
        if token.text == "beginbfchar":
            assert prev_token is not None
            n = int(prev_token.text)
            for j in range(0, n):
                c = self._hex_string_to_int_or_tuple(tok.read_object())
                uc = self._hex_string_to_int_or_tuple(tok.read_object())
                self._add_symbol(c, uc)
            continue

        # beginbfrange: the previous token holds the number of ranges
        if token.text == "beginbfrange":
            assert prev_token is not None
            n = int(prev_token.text)
            for j in range(0, n):
                c_start_token = tok.read_object()
                c_start = int(c_start_token, 16)
                c_end_token = tok.read_object()
                c_end = int(c_end_token, 16)
                tmp = tok.read_object()
                # a single destination: increment it across the range
                if isinstance(tmp, HexadecimalString):
                    uc = self._hex_string_to_int_or_tuple(tmp)
                    for k in range(0, c_end - c_start + 1):
                        if isinstance(uc, int):
                            self._add_symbol(c_start + k, uc + k)
                        elif isinstance(uc, tuple):
                            self._add_symbol(c_start + k, (uc[0], uc[1] + k))
                # an array of destinations: one per code in the range
                elif isinstance(tmp, list):
                    for k in range(0, c_end - c_start + 1):
                        uc = self._hex_string_to_int_or_tuple(tmp[k])
                        self._add_symbol(c_start + k, uc)
            continue

        # default
        prev_token = token
    return self
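# For reference, the ToUnicode CMap fragments this method handles. A bfchar
# block maps single codes; a bfrange block maps a contiguous code range,
# either by incrementing one start value or via an explicit destination
# array (the second range below follows the example in the PDF specification):
#
#   1 beginbfchar
#   <0003> <0020>
#   endbfchar
#
#   2 beginbfrange
#   <0000> <005E> <0020>
#   <005F> <0061> [<00660066> <00660069> <00660066006C>]
#   endbfrange
#
# Note how the count ("1", "2") precedes the keyword, which is why the loop
# above keeps prev_token around.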