Example #1
    def _seek_to_xref_token(
        self,
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
    ):

        # find "startxref" text
        start_of_xref_token_byte_offset = self._find_backwards(
            src, tok, "startxref")
        assert start_of_xref_token_byte_offset is not None
        assert start_of_xref_token_byte_offset != -1

        # set tokenizer to "startxref"
        src.seek(start_of_xref_token_byte_offset)
        token = tok.next_non_comment_token()
        assert token is not None
        if token.text == "xref":
            src.seek(start_of_xref_token_byte_offset)
            return

        # if we are at "startxref", the next token holds the byte offset
        # of the XREF table; seek the stream to that offset
        if token.text == "startxref":
            token = tok.next_non_comment_token()
            assert token is not None
            assert token.token_type == TokenType.NUMBER
            start_of_xref_offset = int(token.text)
            src.seek(start_of_xref_offset)
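
The helper _find_backwards is not shown in these examples. Judging from its call site above, a minimal sketch could look like the following; the fixed 2048-byte tail window and the latin-1 encoding are assumptions, not the library's actual implementation:

    def _find_backwards(
        self,
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
        text_to_find: str,
    ) -> int:
        # read a window at the end of the stream; "startxref" is expected
        # to sit near the end of a PDF file
        src.seek(0, io.SEEK_END)
        file_length = src.tell()
        pos = max(0, file_length - 2048)  # assumed window size
        src.seek(pos)
        buffer = src.read()
        # rfind returns the LAST occurrence, i.e. the one closest to EOF
        index = buffer.rfind(text_to_find.encode("latin-1"))
        return -1 if index == -1 else pos + index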
Example #2
    def _read_trailer(
        self,
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
    ) -> Dictionary:

        # return an empty Dictionary if there is no trailer
        token = tok.next_non_comment_token()
        assert token is not None
        if token.text != "trailer":
            return Dictionary()

        # if there is a keyword "trailer" the next token should be TokenType.START_DICT
        token = tok.next_non_comment_token()
        assert token is not None
        assert token.token_type == TokenType.START_DICT

        # rewind 2 bytes so read_dictionary re-reads the "<<"
        src.seek(-2, io.SEEK_CUR)

        # read dictionary as trailer
        trailer_dict = tok.read_dictionary()

        # process startxref
        token = tok.next_non_comment_token()
        assert token is not None
        assert token.token_type == TokenType.OTHER
        assert token.text == "startxref"

        # return
        return trailer_dict
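
For context, the plaintext construct this method consumes sits near the end of a PDF file and looks like the fragment below (all values illustrative):

    trailer_fragment = (
        b"trailer\n"
        b"<< /Size 22 /Root 1 0 R /Info 2 0 R >>\n"
        b"startxref\n"
        b"18799\n"
        b"%%EOF"
    )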
Example #3
    def read(
        self,
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
        initial_offset: Optional[int] = None,
    ) -> "XREF":
        """
        This method attempts to read a plaintext XREF from the given io_source.
        It will either throw an exception, or return this XREF
        """

        if initial_offset is not None:
            src.seek(initial_offset)
        else:
            self._seek_to_xref_token(src, tok)

        # now we should be back to the start of XREF
        token = tok.next_non_comment_token()
        assert token is not None
        assert token.text == "xref"

        # read xref sections
        while True:
            xref_section = self._read_section(src, tok)
            if len(xref_section) == 0:
                break
            else:
                for r in xref_section:
                    self.append(r)

        # process trailer
        self[Name("Trailer")] = self._read_trailer(src, tok)

        # return self
        return self
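
A minimal (hypothetical) call site, assuming the XREF class can be constructed without arguments and that the tokenizer wraps the same stream:

    with open("input.pdf", "rb") as fh:
        src = io.BytesIO(fh.read())
    tok = HighLevelTokenizer(src)
    # with no initial_offset, read() locates "startxref" by itself
    xref = XREF().read(src, tok)
    trailer = xref[Name("Trailer")]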
Example #4
    def _read_section(
        self,
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
    ) -> List[Reference]:

        tokens = [tok.next_non_comment_token() for _ in range(0, 2)]
        assert tokens[0] is not None
        assert tokens[1] is not None
        # "trailer" or "startxref" marks the end of the XREF sections;
        # rewind so the caller can consume the keyword itself
        if tokens[0].text in ["trailer", "startxref"]:
            src.seek(tokens[0].byte_offset)
            return []
        assert tokens[0].token_type == TokenType.NUMBER
        assert tokens[1].token_type == TokenType.NUMBER

        start_object_number = int(tokens[0].text)
        number_of_objects = int(tokens[1].text)
        indirect_references = []

        # read subsection
        for i in range(0, number_of_objects):
            tokens = [tok.next_non_comment_token() for _ in range(0, 3)]
            assert tokens[0] is not None
            assert tokens[0].text not in ["trailer", "startxref"]
            assert tokens[0].token_type == TokenType.NUMBER

            assert tokens[1] is not None
            assert tokens[1].token_type == TokenType.NUMBER

            assert tokens[2] is not None
            assert tokens[2].token_type == TokenType.OTHER
            assert tokens[2].text in ["f", "n"]

            indirect_references.append(
                Reference(
                    object_number=start_object_number + i,
                    byte_offset=int(tokens[0].text),
                    generation_number=int(tokens[1].text),
                    is_in_use=(tokens[2].text == "n"),
                ))

        # return
        return indirect_references
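
The layout being parsed is the standard plaintext XREF subsection format: a header line "<start_object_number> <number_of_objects>" followed by one entry per object. An illustrative fragment:

    xref_fragment = (
        b"xref\n"
        b"0 3\n"                  # subsection: 3 objects, starting at object 0
        b"0000000000 65535 f \n"  # object 0: head of the free list
        b"0000000017 00000 n \n"  # object 1: in use, at byte offset 17
        b"0000000081 00000 n \n"  # object 2: in use, at byte offset 81
    )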
Example #5
    def read(self, cmap_bytes: str) -> "CMap":

        # latin-1 maps one character to one byte, so the character count
        # N equals the byte count reported by tok.tell()
        N = len(cmap_bytes)
        tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))

        prev_token: Optional[Token] = None
        while tok.tell() < N:

            token = tok.next_non_comment_token()
            if token is None:
                break

            # beginbfchar
            if token.text == "beginbfchar":
                assert prev_token is not None
                n = int(prev_token.text)
                for j in range(0, n):
                    obj = tok.read_object()
                    assert isinstance(obj, HexadecimalString)
                    c = self._hex_string_to_int_or_tuple(obj)
                    assert isinstance(c, int)

                    obj = tok.read_object()
                    assert isinstance(obj, HexadecimalString)
                    uc = self._hex_string_to_int_or_tuple(obj)

                    self._add_symbol(c, uc)
                continue

            # beginbfrange
            if token.text == "beginbfrange":
                assert prev_token is not None
                n = int(prev_token.text)
                for j in range(0, n):

                    c_start_token = tok.read_object()
                    assert c_start_token is not None
                    assert isinstance(c_start_token, HexadecimalString)
                    c_start = int(str(c_start_token), 16)

                    c_end_token = tok.read_object()
                    assert c_end_token is not None
                    assert isinstance(c_end_token, HexadecimalString)
                    c_end = int(str(c_end_token), 16)

                    tmp = tok.read_object()
                    # a single hex string maps the entire range by offset
                    if isinstance(tmp, HexadecimalString):
                        uc = self._hex_string_to_int_or_tuple(tmp)
                        for k in range(0, c_end - c_start + 1):
                            if isinstance(uc, int):
                                self._add_symbol(c_start + k, uc + k)
                            elif isinstance(uc, tuple):
                                self._add_symbol(c_start + k,
                                                 (uc[0], uc[1] + k))

                    # a list spells out the mapping for each code in the range
                    elif isinstance(tmp, list):
                        for k in range(0, c_end - c_start + 1):
                            uc = self._hex_string_to_int_or_tuple(tmp[k])
                            self._add_symbol(c_start + k, uc)

            # remember this token; it may be the object count that precedes
            # a beginbfchar/beginbfrange keyword
            prev_token = token

        return self
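
For reference, the beginbfchar and beginbfrange constructs handled above follow standard CMap syntax. An illustrative fragment that this read method could consume (read takes a str, hence no bytes literal):

    cmap_fragment = (
        "2 beginbfchar\n"
        "<0003> <0020>\n"         # code 0x0003 -> U+0020 (space)
        "<0024> <0041>\n"         # code 0x0024 -> U+0041 ('A')
        "endbfchar\n"
        "1 beginbfrange\n"
        "<0041> <005A> <0061>\n"  # codes 0x41..0x5A -> U+0061 onwards
        "endbfrange"
    )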