示例#1
0
    def __parse_attributes(self, attributes: bytes) -> Dict[str, str]:
        """
        Given a string representing zero or more possible attributes, parse them into
        a dictionary.

        Parameters:
            attributes - The raw attribute bytes of a node (everything after the tag name).
                         Values may be wrapped in either double or single quotes. Attribute
                         names are decoded as ASCII, values using self.encoding.

        Returns:
            A dictionary keyed by the attribute name and whose values are unescaped strings.
            If no attributes exist, this returns an empty dictionary. An attribute whose
            closing quote is never seen is not included.
        """
        attr_stream = InputStream(attributes)
        parsed_attrs: Dict[str, str] = {}
        state = 'space'
        attr = b''
        val = b''

        def unescape(value: bytes) -> str:
            text = value.decode(self.encoding)
            # '&amp;' has to be handled last. Decoding it first would let the
            # resulting '&' join up with the following text and be decoded a
            # second time ('&amp;lt;' must become '&lt;', not '<').
            for entity, char in (
                ('&lt;', '<'),
                ('&gt;', '>'),
                ('&apos;', '\''),
                ('&quot;', '\"'),
                ('&#13;', '\r'),
                ('&#10;', '\n'),
                ('&amp;', '&'),
            ):
                text = text.replace(entity, char)
            return text

        while True:
            c = attr_stream.read_byte()

            if c is None:
                # End of input, whatever was fully parsed so far is the result.
                return parsed_attrs
            if state == 'space':
                # Skipping whitespace between attributes.
                if not c.isspace():
                    state = 'attr'
                    attr = c
            elif state == 'attr':
                # Accumulating the attribute name up to the '='.
                if c == b'=':
                    attr = attr.strip()
                    state = 'valstart'
                else:
                    attr = attr + c
            elif state == 'valstart':
                # Waiting for the opening quote, anything else is ignored.
                if c == b'"':
                    state = 'valdouble'
                    val = b''
                elif c == b'\'':
                    state = 'valsingle'
                    val = b''
            elif state == 'valdouble':
                if c == b'"':
                    state = 'space'
                    parsed_attrs[attr.decode('ascii')] = unescape(val)
                else:
                    val = val + c
            elif state == 'valsingle':
                if c == b'\'':
                    state = 'space'
                    parsed_attrs[attr.decode('ascii')] = unescape(val)
                else:
                    val = val + c
示例#2
0
    def __init__(self, data: bytes, encoding: str) -> None:
        """
        Set up a decoder over a blob of XML data.

        Parameters:
            data - The XML document, as bytes, which should be decoded into Nodes.
            encoding - The encoding the XML text is expected to be in.
        """
        # Plain bookkeeping first: the expected encoding, an empty list of
        # nodes and no root yet.
        self.encoding = encoding
        self.current: List[Node] = []
        self.root: Optional[Node] = None

        # Wrap the raw bytes in a stream so they can be consumed incrementally.
        self.stream = InputStream(data)
示例#3
0
    def __init__(self, data: bytes, encoding: str) -> None:
        """
        Set up a decoder over a binary blob.

        Parameters:
            - data - The binary blob of data that is to be decoded
            - encoding - A string naming the text encoding used for string elements. Should be
                         either 'shift-jis', 'euc-jp' or 'utf-8'
        """
        # Nothing has been decoded yet.
        self.executed = False
        self.encoding = encoding

        # Wrap the raw bytes in a stream so they can be consumed incrementally.
        self.stream = InputStream(data)
示例#4
0
    def __split_node(self, content: bytes) -> Tuple[bytes, bytes]:
        """
        Split the inside of a node into its tag name and its raw attribute bytes.

        Parameters:
            content - The node contents to split.

        Returns:
            A tuple of (tag, attributes). The tag is everything before the first
            whitespace byte. The attributes are everything from the first
            non-whitespace byte after that to the end of the content, or empty
            bytes if there is nothing there.
        """
        reader = InputStream(content)
        tag_parts = []
        attr_parts = []
        mode = "tag"

        # read_byte hands back None once the stream is exhausted, which ends the loop.
        for byte in iter(reader.read_byte, None):
            if mode == "attributes":
                # Once attributes have started, every remaining byte belongs to them.
                attr_parts.append(byte)
            elif mode == "tag":
                if byte.isspace():
                    mode = "space"
                else:
                    tag_parts.append(byte)
            elif not byte.isspace():
                # First non-whitespace byte after the tag starts the attributes.
                attr_parts.append(byte)
                mode = "attributes"

        return (b''.join(tag_parts), b''.join(attr_parts))
示例#5
0
class XmlDecoder:
    """
    A hand-rolled XML parser, suitable for parsing old-style XML documents in
    game data or from legacy game traffic. I did consider using lxml and other
    data stores, but they insist on mangling data inside binary/string blobs
    making them unsuitable for a protocol with exact specifications.
    """
    def __init__(self, data: bytes, encoding: str) -> None:
        """
        Initialize the XML decoder.

        Parameters:
            data - String XML data which should be decoded into Nodes.
            encoding - The expected encoding of the XML.
        """
        self.stream = InputStream(data)
        # The finished root node, set once the outermost element is closed.
        self.root: Optional[Node] = None
        # Stack of elements that have been opened but not yet closed.
        self.current: List[Node] = []
        self.encoding = encoding

    def __start_element(self, tag: bytes, attributes: Dict[str, str]) -> None:
        """
        Called when we encounter an element open tag. Also called when we encounter
        an empty element. Creates a new node with the specified name and attributes.

        Parameters:
            tag - The string tag name.
            attributes - A dictionary keyed by attribute name and whose values are the string
                         attribute values. This attribute values should already be decoded from
                         the XML's encoding.

        Raises:
            XmlEncodingException - The '__type' attribute names an unknown node type.
        """
        name = tag.decode('ascii')
        data_type = attributes.get('__type')

        # The mere presence of '__count' marks the node as an array.
        array = attributes.get('__count') is not None

        if data_type is None:
            # Special case for nodes that don't have a type
            node = Node(name=name, type=Node.NODE_TYPE_VOID)
        else:
            # Get the data value
            type_int = Node.typename_to_type(data_type)
            if type_int is None:
                raise XmlEncodingException(
                    f'Invalid node type {data_type} for node {name}'
                )

            node = Node(name=name, type=type_int, array=array)

        # Now, do the attributes
        for attr, attr_value in attributes.items():
            if attr == '__type' or attr == '__count':
                # Skip these, handled
                continue
            node.set_attribute(attr, attr_value)

        self.current.append(node)

    def __end_element(self, tag: bytes) -> None:
        """
        Called when we encounter an element close tag. Also called when we encounter an empty element,
        after __start_element is called. Does bookkeeping related to element order.

        Parameters:
            tag - The string tag name.

        Raises:
            Exception - The close tag does not match the most recently opened element.
        """
        node = self.current.pop()

        if node.name != tag.decode('ascii'):
            raise Exception(
                f'Logic error, expected {tag.decode("ascii")} but got {node.name}'
            )

        if len(self.current) == 0:
            # Nothing left open, so this was the outermost element.
            self.root = node
        else:
            parent = self.current[-1]
            parent.add_child(node)

    def __yield_values(self, text: str) -> Iterator[str]:
        """
        Split text on runs of whitespace, yielding each non-empty piece in order.

        Parameters:
            text - The decoded text of an array or composite node.
        """
        value = ''

        for c in text:
            if c.isspace():
                if len(value) > 0:
                    yield value
                    value = ''
            else:
                value = value + c

        if len(value) > 0:
            yield value

    def __text(self, text: bytes) -> None:
        """
        Called when we finish parsing arbitrary non-element text. Note that the text passed in is in
        the XML document's encoding and it is this function's responsibility to decode it.

        The text is converted according to the data type of the innermost open element and
        stored as that element's value. Text outside of any element, or inside a 'void'
        element, is dropped.

        Parameters:
            text - String text value of the node, as encoded by the XML document's encoding.

        Raises:
            XmlEncodingException - The text cannot be decoded with the document's encoding.
        """
        try:
            value = text.decode(self.encoding)
        except UnicodeDecodeError:
            raise XmlEncodingException(
                'Failed to decode text node with given encoding')

        if len(self.current) == 0:
            return

        node = self.current[-1]
        data_type = node.data_type
        composite = node.is_composite
        array = node.is_array

        if data_type == 'void':
            # We can't handle this
            return

        if data_type == 'str':
            # Unescape the XML entities. '&amp;' has to be handled last, since
            # decoding it first would let the resulting '&' join up with the
            # following text and be decoded a second time ('&amp;lt;' must
            # become '&lt;', not '<').
            value = value.replace('&lt;', '<')
            value = value.replace('&gt;', '>')
            value = value.replace('&apos;', '\'')
            value = value.replace('&quot;', '\"')
            value = value.replace('&amp;', '&')
            if node.value is None:
                node.set_value(value)
            else:
                # Text for this node arrived in several pieces, stitch them together.
                node.set_value(node.value + value)
        elif data_type == 'bin':
            # Convert from a hex string
            def hex_to_bin(hexval: str) -> bytes:
                intval = int(hexval, 16)
                return struct.pack('>B', intval)

            # Remove any spaces first
            value = ''.join([c for c in value if not c.isspace()])
            decoded = b''.join([
                hex_to_bin(value[i:(i + 2)])
                for i in range(0, len(value), 2)
            ])
            if node.value is None:
                node.set_value(decoded)
            else:
                node.set_value(node.value + decoded)
        elif data_type == 'ip4':
            # Do nothing, already fine
            node.set_value(value)
        elif data_type == 'bool':

            def conv_bool(val: str) -> bool:
                # Only an explicit '0' or 'false' counts as False.
                if val and val.lower() in ['0', 'false']:
                    return False
                else:
                    return True

            if array or composite:
                node.set_value(
                    [conv_bool(v) for v in self.__yield_values(value)])
            else:
                node.set_value(conv_bool(value))
        elif data_type == 'float':
            if array or composite:
                node.set_value(
                    [float(v) for v in self.__yield_values(value)])
            else:
                node.set_value(float(value))
        else:
            # Everything else is treated as an integer type.
            if array or composite:
                node.set_value(
                    [int(v) for v in self.__yield_values(value)])
            else:
                node.set_value(int(value))

    def __parse_attributes(self, attributes: bytes) -> Dict[str, str]:
        """
        Given a string representing zero or more possible attributes, parse them into
        a dictionary.

        Parameters:
            attributes - The raw attribute bytes of a node (everything after the tag name).
                         Values may be wrapped in either double or single quotes. Attribute
                         names are decoded as ASCII, values using self.encoding.

        Returns:
            A dictionary keyed by the attribute name and whose values are unescaped strings.
            If no attributes exist, this returns an empty dictionary. An attribute whose
            closing quote is never seen is not included.
        """
        attr_stream = InputStream(attributes)
        parsed_attrs: Dict[str, str] = {}
        state = 'space'
        attr = b''
        val = b''

        def unescape(value: bytes) -> str:
            text = value.decode(self.encoding)
            # '&amp;' has to be handled last, see the reasoning in the loop
            # below: decoding it first would turn '&amp;lt;' into '<'.
            for entity, char in (
                ('&lt;', '<'),
                ('&gt;', '>'),
                ('&apos;', '\''),
                ('&quot;', '\"'),
                ('&#13;', '\r'),
                ('&#10;', '\n'),
                ('&amp;', '&'),
            ):
                # Each pass only ever sees '&' characters from the original
                # text, because the ampersand itself is restored at the end.
                text = text.replace(entity, char)
            return text

        while True:
            c = attr_stream.read_byte()

            if c is None:
                # End of input, whatever was fully parsed so far is the result.
                return parsed_attrs
            if state == 'space':
                # Skipping whitespace between attributes.
                if not c.isspace():
                    state = 'attr'
                    attr = c
            elif state == 'attr':
                # Accumulating the attribute name up to the '='.
                if c == b'=':
                    attr = attr.strip()
                    state = 'valstart'
                else:
                    attr = attr + c
            elif state == 'valstart':
                # Waiting for the opening quote, anything else is ignored.
                if c == b'"':
                    state = 'valdouble'
                    val = b''
                elif c == b'\'':
                    state = 'valsingle'
                    val = b''
            elif state == 'valdouble':
                if c == b'"':
                    state = 'space'
                    parsed_attrs[attr.decode('ascii')] = unescape(val)
                else:
                    val = val + c
            elif state == 'valsingle':
                if c == b'\'':
                    state = 'space'
                    parsed_attrs[attr.decode('ascii')] = unescape(val)
                else:
                    val = val + c

    def __split_node(self, content: bytes) -> Tuple[bytes, bytes]:
        """
        Split the inside of a node into its tag name and its raw attribute bytes.

        Parameters:
            content - The node contents to split.

        Returns:
            A tuple of (tag, attributes). The tag is everything before the first
            whitespace byte. The attributes are everything from the first
            non-whitespace byte after that to the end of the content, or empty
            bytes if there is nothing there.
        """
        node_stream = InputStream(content)
        tag = b''
        attributes = b''
        state = "tag"

        while True:
            c = node_stream.read_byte()

            if c is None:
                break
            if state == "tag":
                if c.isspace():
                    state = "space"
                else:
                    tag = tag + c
            elif state == "space":
                if not c.isspace():
                    attributes = c
                    state = "attributes"
            elif state == "attributes":
                attributes = attributes + c

        return (tag, attributes)

    def __handle_node(self, content: bytes) -> None:
        """
        Called whenever we encounter any node type. Filters out special nodes,
        determines whether this is a start, end or empty node, and fires off
        calls to the respective __start_element and __end_element functions.

        Parameters:
            The node contents, minus the < and > characters. This will be encoded
            in the XML document's encoding.
        """
        if content[:1] == b'?' and content[-1:] == b'?':
            # Special node, parse to get the encoding.
            tag, attributes = self.__split_node(content[1:-1])
            if tag == b'xml':
                attributes_dict = self.__parse_attributes(attributes)
                if 'encoding' in attributes_dict:
                    # The document's own declaration overrides the encoding we were given.
                    self.encoding = attributes_dict['encoding']
            return

        if content[:1] == b'/':
            # We got an element end
            self.__end_element(content[1:])
        else:
            # We got a start element
            if content[-1:] == b'/':
                # This is an empty element
                empty = True
                content = content[:-1]
            else:
                # This node has subnodes or text
                empty = False

            tag, attributes = self.__split_node(content)
            self.__start_element(tag, self.__parse_attributes(attributes))
            if empty:
                self.__end_element(tag)

    def get_tree(self) -> Node:
        """
        Walk the XML document and parse into nodes.

        Returns:
            A Node object representing the root of the XML document. This is None
            if the outermost element was never closed. Text after the final '>'
            is never processed.
        """
        state = 'text'
        text = b''
        node = b''

        while True:
            c = self.stream.read_byte()

            if c is None:
                return self.root
            elif state == 'text':
                if c == b'<':
                    # A node is starting, so the text collected so far is complete.
                    self.__text(text)
                    state = 'node'
                    node = b''
                else:
                    text = text + c
            elif state == 'node':
                if c == b'>':
                    self.__handle_node(node)
                    state = 'text'
                    text = b''
                else:
                    node = node + c
示例#6
0
class BinaryDecoder:
    """
    A class capable of taking a binary blob and decoding it to a Node tree.
    """
    def __init__(self, data: bytes, encoding: str) -> None:
        """
        Initialize the object.

        Parameters:
            - data - A binary blob of data to be decoded
            - encoding - A string representing the text encoding for string elements. Should be either
                         'shift-jis', 'euc-jp' or 'utf-8'
        """
        self.stream = InputStream(data)
        self.encoding = encoding
        # get_tree consumes the stream, so it may only run once per instance.
        self.executed = False

    def __read_node_name(self) -> str:
        """
        Given the current position in the stream, read the 6-bit-byte packed string name of the
        node.

        Returns:
            A string representing the name in ascii

        Raises:
            BinaryEncodingException - The stream ran out before the name was fully read.
        """
        length = self.stream.read_int()
        if length is None:
            raise BinaryEncodingException(
                "Ran out of data when attempting to read node name length!")
        # Each character takes 6 bits, round up to the number of whole bytes.
        binary_length = ((length * 6) + 7) // 8

        def int_to_bin(integer: int) -> str:
            # Binary digits of the value, left-padded with zeros to 8 characters.
            return bin(integer)[2:].rjust(8, '0')

        data = ''
        for _ in range(binary_length):
            next_byte = self.stream.read_int()
            if next_byte is None:
                raise BinaryEncodingException(
                    "Ran out of data when attempting to read node name!")
            data = data + int_to_bin(next_byte)

        # Re-slice the bit string into 6-bit groups, each indexing one character.
        data_str = [data[i:(i + 6)] for i in range(0, len(data), 6)]
        data_int = [int(val, 2) for val in data_str]
        ret = ''.join([Node.NODE_NAME_CHARS[val] for val in data_int])
        # Drop anything decoded from the padding bits at the end.
        return ret[:length]

    def __read_node(self, node_type: int) -> Node:
        """
        Given an integer node type, read the node's name, possible attributes
        and children. Will return a Node representing this node. Note
        that calling this on the first node should return a tree of all nodes.

        Returns:
            Node object

        Raises:
            BinaryEncodingException - The stream ran out before the node was closed.
        """
        name = self.__read_node_name()
        node = Node(name=name, type=node_type)

        while True:
            child_type = self.stream.read_int()
            if child_type is None:
                raise BinaryEncodingException(
                    "Ran out of data when attempting to read node type!")

            if child_type == Node.END_OF_NODE:
                return node
            elif child_type == Node.ATTR_TYPE:
                # Only the attribute name lives here, get_tree fills in the
                # value from the body afterwards.
                key = self.__read_node_name()
                node.set_attribute(key)
            else:
                child = self.__read_node(child_type)
                node.add_child(child)

    def get_tree(self) -> Node:
        """
        Parse the header and body such that we can return a Node tree
        representing the data passed to us.

        Returns:
            Node object

        Raises:
            BinaryEncodingException - Called more than once, the data is truncated, the
                                      document is not terminated properly or a value has
                                      an alignment other than 1, 2 or 4.
        """
        if self.executed:
            raise BinaryEncodingException(
                "Logic error, should only call this once per instance")
        self.executed = True

        # Read the header first
        header_length = self.stream.read_int(4)
        if header_length is None:
            raise BinaryEncodingException(
                "Ran out of data when attempting to read header length!")

        node_type = self.stream.read_int()
        if node_type is None:
            raise BinaryEncodingException(
                "Ran out of data when attempting to read root node type!")
        root = self.__read_node(node_type)

        eod = self.stream.read_int()
        if eod != Node.END_OF_DOCUMENT:
            raise BinaryEncodingException(
                f'Unknown node type {eod} at end of document')

        # Skip by any padding
        while self.stream.pos < header_length + 4:
            if self.stream.read_byte() is None:
                # The data ended before the declared header length. Stop here
                # rather than risk looping forever on a stream that can no
                # longer advance.
                break

        # Read the body next
        body_length = self.stream.read_int(4)

        if body_length is not None and body_length > 0:
            # We have a body
            body = self.stream.read_blob(body_length)
            if body is None:
                raise BinaryEncodingException('Body has insufficient data')

            ordering = PackedOrdering(body_length)

            values = PackedOrdering.node_to_body_ordering(root)

            for value in values:
                node = value['node']

                if value['type'] == 'attribute':
                    # Attributes are always length-prefixed strings.
                    size = None
                    enc = 's'
                    dtype = 'str'
                    array = False
                    composite = False
                else:
                    size = node.data_length
                    enc = node.data_encoding
                    dtype = node.data_type
                    array = node.is_array
                    composite = node.is_composite

                if composite and array:
                    raise Exception(
                        'Logic error, no support for composite arrays!')

                if not array:
                    # Scalar value
                    alignment = value['alignment']

                    if alignment == 1:
                        loc = ordering.get_next_byte()
                    elif alignment == 2:
                        loc = ordering.get_next_short()
                    elif alignment == 4:
                        loc = ordering.get_next_int()
                    else:
                        # Without this, loc would be unbound or, worse, still
                        # hold the location of the previous value.
                        raise BinaryEncodingException(
                            f'Unsupported alignment {alignment} for node data')
                    if loc is None:
                        raise BinaryEncodingException(
                            "Ran out of data when attempting to read node data location!"
                        )

                    if size is None:
                        # The size should be read from the first 4 bytes
                        size = struct.unpack('>I', body[loc:(loc + 4)])[0]
                        ordering.mark_used(size + 4, loc, round_to=4)
                        loc = loc + 4

                        decode_data = body[loc:(loc + size)]
                        decode_value = f'>{size}{enc}'
                    else:
                        # The size is built-in
                        ordering.mark_used(size, loc)

                        decode_data = body[loc:(loc + size)]
                        decode_value = f'>{enc}'

                    if composite:
                        val_list = list(
                            struct.unpack(decode_value, decode_data))
                        if value['type'] == 'attribute':
                            raise Exception(
                                'Logic error, shouldn\'t have composite attribute type!'
                            )
                        node.set_value(val_list)
                        continue

                    val = struct.unpack(decode_value, decode_data)[0]

                    if dtype == 'str':
                        # Need to convert this from encoding to standard string.
                        # Also, need to lob off the trailing null.
                        try:
                            val = val[:-1].decode(self.encoding)
                        except UnicodeDecodeError:
                            # Nothing we can do here, the raw bytes (trailing
                            # null included) are stored as-is.
                            pass

                    if value['type'] == 'attribute':
                        node.set_attribute(value['name'], val)
                    else:
                        node.set_value(val)
                else:
                    # Array value
                    loc = ordering.get_next_int()
                    if loc is None:
                        raise BinaryEncodingException(
                            "Ran out of data when attempting to read array length location!"
                        )

                    # The raw size in bytes
                    length = struct.unpack('>I', body[loc:(loc + 4)])[0]
                    elems = length // size

                    ordering.mark_used(length + 4, loc, round_to=4)
                    loc = loc + 4
                    decode_data = body[loc:(loc + length)]
                    decode_value = f'>{enc * elems}'

                    val = struct.unpack(decode_value, decode_data)
                    node.set_value(list(val))

        return root