Example #1
0
    def _seek_to_xref_token(self, src: io.IOBase, tok: HighLevelTokenizer):
        """
        Position *src* at the start of the XREF table.

        Scans backwards for the "startxref" keyword and follows the byte
        offset it announces; if the located token is already a plain
        "xref" keyword, the stream is simply restored to that position.

        :raises StartXREFTokenNotFoundError: when "startxref" is absent
        :raises PDFSyntaxError: when "startxref" is not followed by a number
        """
        # locate the "startxref" keyword by scanning backwards
        startxref_pos = self._find_backwards(src, tok, "startxref")
        assert startxref_pos is not None
        if startxref_pos == -1:
            raise StartXREFTokenNotFoundError()

        # put the tokenizer on the keyword we just found
        src.seek(startxref_pos)
        keyword = tok.next_non_comment_token()
        assert keyword is not None

        # already looking at a plain "xref" table: restore position and stop
        if keyword.text == "xref":
            src.seek(startxref_pos)
            return

        # "startxref" is followed by the byte offset of the XREF table;
        # follow that offset so the caller can read the table forwards
        if keyword.text == "startxref":
            offset_token = tok.next_non_comment_token()
            assert offset_token is not None
            if offset_token.token_type != TokenType.NUMBER:
                raise PDFSyntaxError(byte_offset=offset_token.byte_offset,
                                     message="invalid XREF")
            src.seek(int(offset_token.text))
Example #2
0
    def _read_trailer(self, src: io.IOBase,
                      tok: HighLevelTokenizer) -> Dictionary:
        """
        Read the trailer dictionary that follows the XREF table.

        Returns an empty Dictionary when no "trailer" keyword is present.

        :raises PDFSyntaxError: when "trailer" is not followed by a
                                dictionary, or the dictionary is not
                                followed by "startxref"
        """
        # no "trailer" keyword means there is nothing to read
        keyword = tok.next_non_comment_token()
        assert keyword is not None
        if keyword.text != "trailer":
            return Dictionary()

        # "trailer" must be followed by "<<" (start of a dictionary)
        dict_open = tok.next_non_comment_token()
        assert dict_open is not None
        if dict_open.token_type != TokenType.START_DICT:
            raise PDFSyntaxError(
                byte_offset=tok.tell(),
                message="invalid XREF trailer",
            )

        # rewind over the 2-char "<<" so the dictionary reader sees it
        src.seek(-2, io.SEEK_CUR)
        trailer = tok.read_dictionary()

        # the trailer dictionary must be followed by "startxref"
        end_token = tok.next_non_comment_token()
        assert end_token is not None
        if end_token.token_type != TokenType.OTHER or end_token.text != "startxref":
            raise PDFSyntaxError(
                byte_offset=end_token.byte_offset,
                message="start of XREF not found",
            )

        return trailer
Example #3
0
    def _read_section(self, src: io.IOBase,
                      tok: HighLevelTokenizer) -> List[Reference]:
        """
        Read one XREF subsection and return its Reference objects.

        Returns [] when the next token is "trailer" or "startxref",
        leaving the stream positioned on that keyword so the caller can
        process the trailer.

        :raises PDFValueError:  when a subsection header token is not a number
        :raises PDFSyntaxError: when a subsection entry is malformed or
                                the table ends prematurely
        """
        # subsection header: "<first-object-number> <object-count>"
        header = [tok.next_non_comment_token() for _ in range(0, 2)]
        assert header[0] is not None
        assert header[1] is not None

        # end of the XREF sections: rewind to the keyword, report "no section"
        if header[0].text in ["trailer", "startxref"]:
            src.seek(header[0].byte_offset)
            return []

        # both header tokens must be numeric (checked in order)
        for header_token in header:
            if header_token.token_type != TokenType.NUMBER:
                raise PDFValueError(
                    byte_offset=header_token.byte_offset,
                    expected_value_description="number",
                    received_value_description=header_token.text,
                )

        first_object_number = int(header[0].text)
        object_count = int(header[1].text)
        references: List[Reference] = []

        # each entry is "<byte-offset> <generation-number> <f|n>"
        for entry_index in range(0, object_count):
            entry = [tok.next_non_comment_token() for _ in range(0, 3)]
            assert entry[0] is not None
            assert entry[1] is not None
            assert entry[2] is not None

            # hitting the trailer mid-subsection means the table is truncated
            if entry[0].text in ["trailer", "startxref"]:
                raise PDFSyntaxError(
                    byte_offset=entry[0].byte_offset,
                    message="unexpected EOF while processing XREF",
                )
            if (entry[0].token_type != TokenType.NUMBER
                    or entry[1].token_type != TokenType.NUMBER
                    or entry[2].token_type != TokenType.OTHER
                    or entry[2].text not in ["f", "n"]):
                raise PDFSyntaxError(
                    byte_offset=entry[0].byte_offset,
                    message="invalid XREF line",
                )

            references.append(
                Reference(
                    object_number=first_object_number + entry_index,
                    byte_offset=int(entry[0].text),
                    generation_number=int(entry[1].text),
                    is_in_use=(entry[2].text == "n"),
                ))

        return references
Example #4
0
    def read(
        self,
        src: io.IOBase,
        tok: HighLevelTokenizer,
        initial_offset: Optional[int] = None,
    ) -> "XREF":
        """
        Read an entire XREF table (sections + trailer) from *src*.

        When *initial_offset* is given, reading starts at that byte;
        otherwise the table is located via the "startxref" keyword.
        Returns self so calls can be chained.

        :raises XREFTokenNotFoundError: when the "xref" keyword is missing
        """
        # position the stream at the start of the XREF table
        if initial_offset is None:
            self._seek_to_xref_token(src, tok)
        else:
            src.seek(initial_offset)

        # the table must open with the "xref" keyword
        keyword = tok.next_non_comment_token()
        assert keyword is not None
        if keyword.text != "xref":
            raise XREFTokenNotFoundError()

        # consume subsections until an empty one signals the end
        while True:
            section = self._read_section(src, tok)
            if not section:
                break
            for reference in section:
                self.append(reference)

        # the trailer dictionary closes the table
        self["Trailer"] = self._read_trailer(src, tok)
        return self
    def read(self, cmap_bytes: str) -> "CMap":
        """
        Parse a CMap program and populate this CMap's symbol table.

        Two constructs are handled:
        * beginbfchar  : pairs of <src-code> <unicode> mappings
        * beginbfrange : a code range mapped either to consecutive
                         unicode values (hex-string destination) or to
                         an explicit list of destinations

        :param cmap_bytes: the CMap program (latin-1 encodable text)
        :return:           self, for fluent chaining
        :raises ValueError: when a begin* keyword is not preceded by a
                            numeric entry count
        """
        N = len(cmap_bytes)
        tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))

        prev_token = None
        while tok.tell() < N:

            token = tok.next_non_comment_token()
            if token is None:
                break

            # beginbfchar: the previous token holds the number of entries
            if token.text == "beginbfchar":
                # explicit error instead of the former AttributeError crash
                # when the keyword is not preceded by a count token
                if prev_token is None:
                    raise ValueError(
                        "beginbfchar is not preceded by an entry count")
                number_of_entries = int(prev_token.text)
                for _ in range(0, number_of_entries):
                    c = self._hex_string_to_int_or_tuple(tok.read_object())
                    uc = self._hex_string_to_int_or_tuple(tok.read_object())
                    self._add_symbol(c, uc)
                continue

            # beginbfrange: the previous token holds the number of ranges
            if token.text == "beginbfrange":
                if prev_token is None:
                    raise ValueError(
                        "beginbfrange is not preceded by an entry count")
                number_of_ranges = int(prev_token.text)
                for _ in range(0, number_of_ranges):

                    c_start = int(tok.read_object(), 16)
                    c_end = int(tok.read_object(), 16)

                    destination = tok.read_object()
                    if isinstance(destination, HexadecimalString):
                        # single destination: codes in the range map to
                        # consecutive unicode values
                        uc = self._hex_string_to_int_or_tuple(destination)
                        for k in range(0, c_end - c_start + 1):
                            if isinstance(uc, int):
                                self._add_symbol(c_start + k, uc + k)
                            elif isinstance(uc, tuple):
                                self._add_symbol(c_start + k,
                                                 (uc[0], uc[1] + k))

                    elif isinstance(destination, list):
                        # one explicit destination per code in the range
                        # NOTE(review): assumes the list covers the whole
                        # range — confirm an IndexError is acceptable here
                        for k in range(0, c_end - c_start + 1):
                            uc = self._hex_string_to_int_or_tuple(
                                destination[k])
                            self._add_symbol(c_start + k, uc)
                continue

            # remember the token: it may be the count for the next begin*
            prev_token = token

        return self