def read(self, cmap_bytes: str) -> "CMap":
        """
        Parse the text of a CMap and register every mapping it declares.

        The text is tokenized; whenever a ``beginbfchar`` or ``beginbfrange``
        keyword is met, the token seen just before it is interpreted as the
        number of entries in that section. Each decoded mapping is handed to
        ``self._add_symbol``.

        :param cmap_bytes:  the CMap text (encoded as latin-1 for tokenizing)
        :return:            self
        """
        total_length = len(cmap_bytes)
        tokenizer = HighLevelTokenizer(
            io.BytesIO(cmap_bytes.encode("latin-1")))

        previous = None
        while tokenizer.tell() < total_length:
            current = tokenizer.next_non_comment_token()
            if current is None:
                break

            keyword = current.text

            # section of <code> <unicode> pairs
            if keyword == "beginbfchar":
                for _ in range(int(previous.text)):
                    code = self._hex_string_to_int_or_tuple(
                        tokenizer.read_object())
                    mapped = self._hex_string_to_int_or_tuple(
                        tokenizer.read_object())
                    self._add_symbol(code, mapped)
                # this keyword is deliberately not remembered as previous token
                continue

            # section of <first code> <last code> <target> triples
            if keyword == "beginbfrange":
                for _ in range(int(previous.text)):
                    first_code = int(tokenizer.read_object(), 16)
                    last_code = int(tokenizer.read_object(), 16)
                    span = last_code - first_code + 1

                    target = tokenizer.read_object()
                    if isinstance(target, HexadecimalString):
                        # a single start value: every code in the range maps
                        # to that value plus its distance from the first code
                        base = self._hex_string_to_int_or_tuple(target)
                        if isinstance(base, int):
                            for offset in range(span):
                                self._add_symbol(first_code + offset,
                                                 base + offset)
                        elif isinstance(base, tuple):
                            for offset in range(span):
                                self._add_symbol(
                                    first_code + offset,
                                    (base[0], base[1] + offset))
                    elif isinstance(target, list):
                        # an explicit list: one target per code in the range
                        for offset in range(span):
                            mapped = self._hex_string_to_int_or_tuple(
                                target[offset])
                            self._add_symbol(first_code + offset, mapped)

            # remember this token; the next keyword may need it as its count
            previous = current

        return self
示例#2
0
    def read(
        self,
        io_source: Union[io.BufferedIOBase, io.RawIOBase],
        tokenizer: HighLevelTokenizer,
        initial_offset: Optional[int] = None,
    ) -> "XREF":
        """
        Read a cross-reference stream and add one Reference per entry to self.

        The stream object is read through ``tokenizer`` after ``io_source`` has
        been positioned at ``initial_offset`` or, when no offset is given, by
        ``self._seek_to_xref_token``. The decoded stream is then cut into
        entries of three big-endian fields whose byte widths come from ``W``.
        The object numbers of the entries come from ``Index`` (pairs of
        first object number / entry count); without ``Index`` a single range
        ``[0, Size]`` is used.

        The (decoded) stream dictionary is stored under ``self["Trailer"]``.

        :param io_source:       the byte source holding the PDF
                                (presumably the one ``tokenizer`` reads from)
        :param tokenizer:       tokenizer used to read the stream object
        :param initial_offset:  position of the cross-reference stream, or None
        :return:                self
        :raises AssertionError: if the stream object or its ``W`` / ``Size`` /
                                ``Index`` entries do not have the expected
                                types, or an entry has an unknown type
        """

        if initial_offset is not None:
            io_source.seek(initial_offset)
        else:
            self._seek_to_xref_token(io_source, tokenizer)

        xref_stream = tokenizer.read_object()
        assert isinstance(xref_stream, Stream)

        # W holds the byte width of each of the three fields of an entry
        assert "W" in xref_stream
        assert all(isinstance(w, Decimal) for w in xref_stream["W"])
        widths = [int(w) for w in xref_stream["W"]]

        # parent
        document = self.get_root()  # type: ignore [attr-defined]

        # the list starts with a not-in-use entry for object 0
        indirect_references = [
            Reference(
                object_number=0,
                generation_number=65535,
                is_in_use=False,
                document=document,
            )
        ]
        # first Reference seen per object number; replaces a linear scan of
        # indirect_references for every entry that is read
        first_reference_by_number = {}
        for known_reference in indirect_references:
            if known_reference.object_number is not None:
                first_reference_by_number.setdefault(
                    known_reference.object_number, known_reference)

        # Size is needed for the default Index
        assert "Size" in xref_stream
        assert isinstance(xref_stream["Size"], Decimal)
        number_of_objects = int(xref_stream["Size"])

        # Index: pairs of (first object number, number of entries)
        if "Index" in xref_stream:
            index = xref_stream["Index"]
            assert isinstance(index, List)
            assert len(index) % 2 == 0
            assert isinstance(index[0], Decimal)
            assert isinstance(index[1], Decimal)
        else:
            index = [Decimal(0), Decimal(number_of_objects)]

        # apply filters
        xref_stream = decode_stream(xref_stream)
        xref_stream_decoded_bytes = xref_stream["DecodedBytes"]

        def _read_field(offset, width):
            """Decode ``width`` big-endian bytes at ``offset``; return (value, next offset)."""
            value = 0
            for _ in range(0, width):
                value = (value << 8) + (xref_stream_decoded_bytes[offset]
                                        & 0xFF)
                offset += 1
            return value, offset

        # The entries of all subsections are stored back to back in the stream
        # data, so the read position must carry over from one Index range to
        # the next instead of restarting at 0 for every range.
        bptr = 0
        for idx in range(0, len(index), 2):
            start = int(index[idx])
            length = int(index[idx + 1])

            for i in range(0, length):

                # object number
                object_number = start + i

                # read type (defaults to 1 when the type field has no width)
                entry_type = 1
                if widths[0] > 0:
                    entry_type, bptr = _read_field(bptr, widths[0])

                # read field 2 and field 3
                field2, bptr = _read_field(bptr, widths[1])
                field3, bptr = _read_field(bptr, widths[2])

                # check type
                assert entry_type in [0, 1, 2]

                pdf_indirect_reference = None
                if entry_type == 0:
                    # type      : The type of this entry, which shall be 0. Type 0 entries define
                    # the linked list of free objects (corresponding to f entries in a
                    # cross-reference table).
                    # field2    : The object number of the next free object
                    # field3    : The generation number to use if this object number is used again
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                        is_in_use=False,
                    )
                elif entry_type == 1:
                    # Type      : The type of this entry, which shall be 1. Type 1 entries define
                    # objects that are in use but are not compressed (corresponding
                    # to n entries in a cross-reference table).
                    # field2    : The byte offset of the object, starting from the beginning of the
                    # file.
                    # field3    : The generation number of the object. Default value: 0.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                    )
                elif entry_type == 2:
                    # Type      : The type of this entry, which shall be 2. Type 2 entries define
                    # compressed objects.
                    # field2    : The object number of the object stream in which this object is
                    # stored. (The generation number of the object stream shall be
                    # implicitly 0.)
                    # field3    : The index of this object within the object stream.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        generation_number=0,
                        parent_stream_object_number=field2,
                        index_in_parent_stream=field3,
                    )

                assert pdf_indirect_reference is not None

                # decide whether this entry is new, updates a known entry,
                # or is ignored
                existing_indirect_ref = first_reference_by_number.get(
                    object_number)
                ref_is_in_reading_state = (
                    existing_indirect_ref is not None
                    and existing_indirect_ref.is_in_use
                    and existing_indirect_ref.generation_number
                    == pdf_indirect_reference.generation_number)
                ref_is_first_encountered = existing_indirect_ref is None or (
                    not ref_is_in_reading_state
                    and existing_indirect_ref.document is None)

                if ref_is_first_encountered:
                    indirect_references.append(pdf_indirect_reference)
                    if pdf_indirect_reference.object_number is not None:
                        # setdefault keeps the earliest Reference per number
                        first_reference_by_number.setdefault(
                            pdf_indirect_reference.object_number,
                            pdf_indirect_reference)
                elif ref_is_in_reading_state:
                    assert existing_indirect_ref is not None
                    existing_indirect_ref.index_in_parent_stream = (
                        pdf_indirect_reference.index_in_parent_stream)
                    existing_indirect_ref.parent_stream_object_number = (
                        pdf_indirect_reference.parent_stream_object_number)

        # add section
        for r in indirect_references:
            self.append(r)

        # initialize trailer
        self["Trailer"] = Dictionary(xref_stream)

        # return
        return self
示例#3
0
    def read(self, io_source: io.IOBase) -> "Canvas":
        """
        Tokenize a content stream and execute the operators found in it.

        Objects that are not operator names are collected as operands. When an
        operator name is read and exactly one entry of ``self.canvas_operators``
        has that text, its operands are taken from the end of the collected
        operands, the instruction is recorded under ``self["Instructions"]``
        and the operator is invoked. Operator names matching no entry (or more
        than one) are skipped.

        :param io_source:   seekable source of the content stream
        :return:            self
        :raises IllegalGraphicsStateError: when an operator lacks operands
                            outside of a compatibility section
        """

        # measure the source, then rewind it for tokenizing
        io_source.seek(0, os.SEEK_END)
        end_position = io_source.tell()
        io_source.seek(0)

        tokenizer = HighLevelTokenizer(io_source)

        pending_operands = []
        while tokenizer.tell() != end_position:

            parsed = tokenizer.read_object()
            if parsed is None:
                break

            # anything that is not an operator name is an operand
            if not isinstance(parsed, CanvasOperatorName):
                pending_operands.append(parsed)
                continue

            # look up the operator by its text
            operator_text = str(parsed)
            matches = [
                candidate for candidate in self.canvas_operators
                if candidate.get_text() == operator_text
            ]
            if len(matches) != 1:
                continue
            operator = matches[0]

            # verify enough operands were collected
            required = operator.get_number_of_operands()
            available = len(pending_operands)
            if available < required:
                # mistakes are ignored inside a compatibility section
                if self.in_compatibility_section:
                    continue
                raise IllegalGraphicsStateError(
                    message="Unable to execute operator %s. Expected %d arguments, received %d."
                    % (operator.text, required, available)
                )

            # take the last `required` operands, keeping their order
            split_at = available - required
            operands = pending_operands[split_at:]
            del pending_operands[split_at:]

            # record the instruction
            if "Instructions" not in self:
                self["Instructions"] = List().set_parent(self)
            instruction_number = len(self["Instructions"])

            record = Dictionary()
            record["Name"] = operator.get_text()
            record["Args"] = List().set_parent(record)
            for operand in operands:
                record["Args"].append(operand)
            self["Instructions"].append(record)

            # debug
            operands_as_text = str([str(operand) for operand in operands])
            logger.debug(
                "%d %s %s"
                % (instruction_number, operator.text, operands_as_text)
            )

            # invoke; failures are tolerated only in a compatibility section
            try:
                operator.invoke(self, operands)
            except Exception as error:
                if self.in_compatibility_section:
                    continue
                raise error

        return self
示例#4
0
    def get(
        self,
        indirect_reference: Union[Reference, int],
        src: io.IOBase,
        tok: HighLevelTokenizer,
    ) -> Optional[AnyPDFType]:
        """
        Look up an object through this cross-reference table and read it.

        The argument is first resolved to the first entry of ``self.entries``
        with the same object number. The object is then read either at the
        entry's byte offset (the position of ``tok`` is restored afterwards)
        or, when the entry names a parent stream, from that stream's decoded
        bytes.

        :param indirect_reference:  object number (int or Decimal) or Reference
        :param src:                 byte source; only passed on to recursive calls
        :param tok:                 tokenizer used to read objects at a byte offset
        :return:                    the object, or None when no entry matches,
                                    the entry is not in use, or the object
                                    cannot be located in its parent stream
        :raises PDFTypeError:       when the parent stream lacks ``Length`` or ``First``
        :raises AssertionError:     when the argument is of an unsupported type or
                                    the parent stream is not a dictionary
        """

        # resolve the argument to the matching entry of this table
        entry = indirect_reference
        if isinstance(indirect_reference, (int, Decimal, Reference)):
            if isinstance(indirect_reference, Reference):
                wanted_number = indirect_reference.object_number
            else:
                wanted_number = int(indirect_reference)
            entry = next(
                (x for x in self.entries if x.object_number == wanted_number),
                None,
            )
            if entry is None:
                return None
        assert isinstance(entry, Reference)

        # A free entry does not describe an object. Its byte_offset may hold
        # the number of the next free object rather than a file position, so
        # it must not be used for seeking.
        if not entry.is_in_use:
            return None

        obj = None

        # the entry may have a byte offset
        if entry.byte_offset is not None:
            tell_before = tok.tell()
            tok.seek(int(entry.byte_offset))
            try:
                obj = tok.read_object(xref=self)
            finally:
                # restore the position even when reading fails
                tok.seek(tell_before)

        # the entry may name a parent stream holding the object
        if entry.parent_stream_object_number is not None:

            stream_object = self.get(entry.parent_stream_object_number, src,
                                     tok)
            assert isinstance(stream_object, dict)
            for required_key in ("Length", "First"):
                if required_key not in stream_object:
                    raise PDFTypeError(expected_type=Union[Decimal, Reference],
                                       received_type=None)

            # Length and First may themselves be References; the resolved
            # values are written back into the stream dictionary
            for key in ("Length", "First"):
                if isinstance(stream_object[key], Reference):
                    stream_object[key] = self.get(stream_object[key],
                                                  src=src,
                                                  tok=tok)

            first_byte = int(stream_object["First"])
            if "DecodedBytes" not in stream_object:
                try:
                    stream_object = decode_stream(stream_object)
                except Exception:
                    logger.debug(
                        "unable to inflate stream for object %d" %
                        entry.parent_stream_object_number)
                    raise
            stream_bytes = stream_object["DecodedBytes"][first_byte:]

            # tokenize parent stream
            index = int(entry.index_in_parent_stream)
            # NOTE(review): the index is compared with the byte Length of the
            # stream, not with a count of contained objects - confirm this is
            # the intended bound.
            length = int(stream_object["Length"])
            obj = None
            if index < length:
                # objects are read one after another; the last one read is the
                # object at position `index`
                stream_tokenizer = HighLevelTokenizer(io.BytesIO(stream_bytes))
                for _ in range(0, index + 1):
                    obj = stream_tokenizer.read_object()

        # return
        return obj
    def read(self, io_source: io.IOBase) -> "Canvas":
        """
        Tokenize a content stream and execute the operators found in it.

        Objects that are not operator names are pushed on an operand stack.
        When an operator name is read, it is looked up in
        ``self.canvas_operators``; its operands are taken from the top of the
        stack, the instruction is recorded under ``self["Instructions"]`` and
        the operator is invoked. Unknown operators are logged and skipped.

        Inside a compatibility section (``self.in_compatibility_section``)
        an operator with too few operands is skipped, and exceptions raised
        while invoking an operator are suppressed.

        :param io_source:       seekable source of the content stream
        :return:                self
        :raises AssertionError: when an operator lacks operands outside of a
                                compatibility section
        """

        # measure the source, then rewind it for tokenizing
        io_source.seek(0, os.SEEK_END)
        length = io_source.tell()
        io_source.seek(0)

        canvas_tokenizer = HighLevelTokenizer(io_source)

        # process content
        operand_stk = []
        while canvas_tokenizer.tell() != length:

            # attempt to read object
            obj = canvas_tokenizer.read_object()
            if obj is None:
                break

            # push argument onto stack
            if not isinstance(obj, CanvasOperatorName):
                operand_stk.append(obj)
                continue

            # process operator
            operator = self.canvas_operators.get(obj, None)
            if operator is None:
                logger.debug("Missing operator %s", obj)
                continue

            number_of_operands = operator.get_number_of_operands()
            if len(operand_stk) < number_of_operands:
                # Inside a compatibility section the operator is skipped;
                # popping from the short stack would raise IndexError.
                if self.in_compatibility_section:
                    continue
                # raised explicitly so the check also runs under `python -O`
                raise AssertionError(
                    "Unable to execute operator %s. Expected %d arguments, received %d."
                    % (operator.text, number_of_operands, len(operand_stk))
                )

            # take the topmost operands, keeping their original order
            split_at = len(operand_stk) - number_of_operands
            operands: typing.List[typing.Any] = operand_stk[split_at:]
            del operand_stk[split_at:]

            # append
            if "Instructions" not in self:
                self["Instructions"] = List().set_parent(self)  # type: ignore [attr-defined]

            instruction_number = len(self["Instructions"])
            instruction_dictionary = Dictionary()
            instruction_dictionary["Name"] = operator.get_text()
            instruction_dictionary["Args"] = List().set_parent(  # type: ignore [attr-defined]
                instruction_dictionary
            )
            for operand in operands:
                instruction_dictionary["Args"].append(operand)
            self["Instructions"].append(instruction_dictionary)

            # debug
            logger.debug(
                "%d %s %s",
                instruction_number,
                operator.text,
                [str(x) for x in operands],
            )

            # invoke; failures are tolerated only in a compatibility section
            try:
                operator.invoke(self, operands)
            except Exception:
                if not self.in_compatibility_section:
                    raise

        # return
        return self