def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Read a byte stream of canvas operators, process each of them in order,
    and return this Canvas afterwards.

    Objects read from the stream are pushed onto an operand stack until an
    operator name is read. The operator registered under that name in
    ``self.canvas_operators`` is then invoked with as many operands (taken
    from the top of the stack, in their original order) as it reports through
    ``get_number_of_operands()``. Operator names without a registered
    operator are logged and skipped.

    While ``self.in_compatibility_section`` is true, errors are tolerated:
    an operator with too few operands is skipped, and exceptions raised by
    ``invoke`` are swallowed.

    :param io_source:   byte stream to process; must support seek() and tell()
    :return:            this Canvas (self)
    :raises AssertionError: if, outside a compatibility section, an operator
                            requires more operands than the stack holds
    """
    # local import: only needed for the log-level check in the loop below
    import logging

    # measure the stream, then rewind to its start
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)
    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk: typing.List[typing.Any] = []
    instruction_number: int = 0
    while canvas_tokenizer.tell() != length:

        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break

        # push argument onto stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue

        # process operator
        instruction_number += 1
        operator = self.canvas_operators.get(obj, None)
        if operator is None:
            logger.debug("Missing operator %s", obj)
            continue

        # make sure the stack holds enough operands for this operator
        operand_count = operator.get_number_of_operands()
        if len(operand_stk) < operand_count:
            if not self.in_compatibility_section:
                # explicit raise (rather than `assert`) so the check
                # survives `python -O`; the exception type is unchanged
                raise AssertionError(
                    "operator %s requires %d operands, but only %d are available"
                    % (obj, operand_count, len(operand_stk))
                )
            # errors inside a compatibility section are tolerated: drop the
            # operands that were supplied and skip the operator, instead of
            # failing with an IndexError while popping from an empty stack
            logger.debug(
                "Skipping operator %s: requires %d operands, found %d",
                obj,
                operand_count,
                len(operand_stk),
            )
            operand_stk.clear()
            continue

        # take the operands off the top of the stack, preserving their order
        split_at = len(operand_stk) - operand_count
        operands: typing.List[typing.Any] = operand_stk[split_at:]
        del operand_stk[split_at:]

        # debug (strings are only built when debug logging is enabled)
        # NOTE(review): assumes `logger` is a standard logging.Logger - confirm
        if logger.isEnabledFor(logging.DEBUG):
            shown = operands
            if len(operands) == 1 and isinstance(operands[0], list):
                # a single array operand is shown by its elements
                shown = operands[0]
            logger.debug(
                "%d %s %s",
                instruction_number,
                operator.text,
                str([str(x) for x in shown]),
            )

        # invoke
        try:
            operator.invoke(self, operands)
        except Exception:
            # failures are only ignored inside a compatibility section
            if not self.in_compatibility_section:
                raise

    # return
    return self
def get_object(
    self,
    indirect_reference: Union[Reference, int],
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> Optional[AnyPDFType]:
    """
    Look up an object in this XREF table.
    Objects can be looked up by Reference, or by object number.

    The argument is first resolved to the first entry of ``self.entries``
    with the same object number. The object is then read either at the
    entry's byte offset (through ``tok``), or from the decoded bytes of the
    entry's parent stream. Objects that do not live in a parent stream are
    stored in ``self.cache`` under their object number.

    :param indirect_reference:  Reference or object number (int/Decimal) to resolve
    :param src:                 underlying byte source; only forwarded to recursive lookups
    :param tok:                 tokenizer used to read objects at a byte offset;
                                its position is restored after reading
    :return:                    the object, or None if there is no matching
                                entry, the entry is not in use, or the index
                                in the parent stream is out of range
    """

    # cache (fast path for references that name a top-level object)
    if (
        isinstance(indirect_reference, Reference)
        and indirect_reference.parent_stream_object_number is None
    ):
        assert indirect_reference.object_number is not None
        cached_obj = self.cache.get(indirect_reference.object_number, None)
        if cached_obj is not None:
            return cached_obj

    # determine the object number to look for
    if isinstance(indirect_reference, (int, Decimal)):
        wanted_number = int(indirect_reference)
    else:
        assert isinstance(indirect_reference, Reference)
        wanted_number = indirect_reference.object_number

    # lookup the first matching entry (in self); stops at the first hit
    entry = next(
        (x for x in self.entries if x.object_number == wanted_number), None
    )
    if entry is None:
        return None
    assert isinstance(entry, Reference)

    # entry points to an object that is not in use: there is nothing to
    # read. Per the PDF cross-reference format the offset field of a free
    # entry is not the position of an object, so it must not be followed.
    if not entry.is_in_use:
        return None

    # cache (second chance): lookups by object number, such as the parent
    # stream lookup below, cannot use the fast path above
    if entry.parent_stream_object_number is None:
        assert entry.object_number is not None
        cached_obj = self.cache.get(entry.object_number, None)
        if cached_obj is not None:
            return cached_obj

    obj = None

    # the entry may have a byte offset
    if entry.byte_offset is not None:
        byte_offset = int(entry.byte_offset)
        tell_before = tok.tell()
        tok.seek(byte_offset)
        try:
            obj = tok.read_object(xref=self)
        finally:
            # always hand the tokenizer back at its previous position
            tok.seek(tell_before)

    # entry specifies a parent object
    if (
        entry.parent_stream_object_number is not None
        and entry.index_in_parent_stream is not None
    ):
        stream_object = self.get_object(
            entry.parent_stream_object_number, src, tok
        )
        assert isinstance(stream_object, Stream)
        assert "Length" in stream_object
        assert "First" in stream_object

        # Length may be Reference
        if isinstance(stream_object["Length"], Reference):
            stream_object[Name("Length")] = self.get_object(
                stream_object["Length"], src=src, tok=tok
            )

        # First may be Reference
        if isinstance(stream_object["First"], Reference):
            stream_object[Name("First")] = self.get_object(
                stream_object["First"], src=src, tok=tok
            )

        first_byte = int(stream_object.get("First", 0))
        if "DecodedBytes" not in stream_object:
            try:
                stream_object = decode_stream(stream_object)
            except Exception:
                logger.debug(
                    "unable to inflate stream for object %d",
                    entry.parent_stream_object_number,
                )
                raise
        stream_bytes = stream_object["DecodedBytes"][first_byte:]

        # tokenize parent stream
        index = int(entry.index_in_parent_stream)
        length = int(stream_object["Length"])
        obj = None
        if index < length:
            # a separate tokenizer is used so the caller's `tok` is untouched;
            # objects are read one after another, the last one read is kept
            stream_tok = HighLevelTokenizer(io.BytesIO(stream_bytes))
            for _ in range(index + 1):
                obj = stream_tok.read_object()

    # update cache
    if entry.parent_stream_object_number is None:
        assert entry.object_number is not None
        self.cache[entry.object_number] = obj

    # return
    return obj
def read(self, cmap_bytes: str) -> "CMap":
    """
    Parse the text of a CMap and register the mappings of its
    ``beginbfchar`` and ``beginbfrange`` sections on this CMap.

    The text is encoded as latin-1 and tokenized. The token directly in
    front of a ``beginbfchar`` / ``beginbfrange`` keyword is read as the
    number of entries in that section. Every mapping found is passed to
    ``self._add_symbol``.

    :param cmap_bytes:  the CMap program, as text
    :return:            this CMap (self)
    """
    total_length = len(cmap_bytes)
    tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))
    previous: Optional[Token] = None

    while tok.tell() < total_length:
        current = tok.next_non_comment_token()
        if current is None:
            break
        keyword = current.text

        # beginbfchar: <character code> <unicode> pairs
        if keyword == "beginbfchar":
            assert previous is not None
            entry_count = int(previous.text)
            for _ in range(entry_count):
                code_token = tok.read_object()
                assert isinstance(code_token, HexadecimalString)
                char_code = self._hex_string_to_int_or_tuple(code_token)
                assert isinstance(char_code, int)

                target_token = tok.read_object()
                assert isinstance(target_token, HexadecimalString)
                target = self._hex_string_to_int_or_tuple(target_token)
                self._add_symbol(char_code, target)
            # the keyword itself is not remembered as previous token
            continue

        # beginbfrange: <first code> <last code> followed by a start value
        # or by an array holding one value per code
        if keyword == "beginbfrange":
            assert previous is not None
            entry_count = int(previous.text)
            for _ in range(entry_count):
                low_token = tok.read_object()
                assert low_token is not None
                assert isinstance(low_token, HexadecimalString)
                low = int(str(low_token), 16)

                high_token = tok.read_object()
                assert high_token is not None
                assert isinstance(high_token, HexadecimalString)
                high = int(str(high_token), 16)

                span = high - low + 1
                destination = tok.read_object()

                if isinstance(destination, HexadecimalString):
                    # a single start value, incremented for every code
                    base = self._hex_string_to_int_or_tuple(destination)
                    if isinstance(base, int):
                        for offset in range(span):
                            self._add_symbol(low + offset, base + offset)
                    elif isinstance(base, tuple):
                        # only the second element of the tuple is incremented
                        for offset in range(span):
                            self._add_symbol(
                                low + offset, (base[0], base[1] + offset)
                            )
                elif isinstance(destination, list):
                    # an explicit value for every code in the range
                    for offset in range(span):
                        target = self._hex_string_to_int_or_tuple(
                            destination[offset]
                        )
                        self._add_symbol(low + offset, target)

        # default: remember this token for the next keyword
        previous = current

    return self