示例#1
0
    def process_packet_info(self, hi, inst):
        """Attach the instruction to its packet and flag the packet boundaries.

        Instructions inside a packet depend on one another (e.g., a constant
        extender applies to the instruction that follows it), so the packet being
        built is kept in ``self.curr_packet``. The disassembler is meant to be fed
        all the instructions of a packet one after the other, in address order;
        disassembling an instruction from a different packet in the middle
        discards the current packet information.

        A new packet is started when any of these holds:
            1. No instruction has been disassembled yet (``curr_packet`` is None).
            2. The previous instruction is contiguous (exactly ``INST_SIZE`` bytes
               back) and was flagged as the end of its packet.
            3. The previous instruction is not contiguous. The parse bits only
               mark the last instruction of a packet, never the first, so for
               lack of other information a new packet is assumed.

        In any other case the instruction is appended to the current packet.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled. Its
                ``start_packet``, ``packet``, ``parse_bits`` and ``end_packet``
                attributes are set here.
            inst (int): Actual instruction value.

        Returns:
            None

        TODOs:
            * Review and move part of this docstring to the project documentation.

            * Remove the `inst` argument once it is added to the HexagonInstruction class.

        """
        packet = self.curr_packet

        if packet is None:
            # Case 1: first instruction ever disassembled.
            begins_packet = True
        else:
            previous = packet.get_last_inst()
            if hi.addr - INST_SIZE == previous.addr:
                # Contiguous use of the disassembler: a new packet begins only
                # if the previous instruction closed its own (case 2),
                # otherwise the current packet simply continues.
                begins_packet = bool(previous.end_packet)
            else:
                # Case 3: not contiguous, assume a new packet.
                begins_packet = True

        hi.start_packet = begins_packet

        if begins_packet:
            # The flag above is set before the packet is built from `hi`.
            self.curr_packet = HexagonPacket(hi)
        else:
            packet.add_next_inst(hi)

        hi.packet = self.curr_packet
        # TODO: Maybe there's some overlapping here and I don't need `self.curr_packet`.

        # The PP (parse) bits, 15:14 of the instruction word, mark the end of
        # the packet when their value is:
        #   * '11': a normal instruction that closes the packet.
        #   * '00': a duplex instruction which, quoting the manual, "must
        #     always appear as the last word in a packet."
        hi.parse_bits = extract_bits(inst, 15, 14)
        hi.end_packet = hi.parse_bits in (0b00, 0b11)
        # TODO: Perform two different checks. The normal PP == 11, and `hi.is_duplex` in
        # another if (`is_duplex` has to be set first, which is not happening now).
示例#2
0
    def process_packet_info(self, hi, inst):
        """Attach the instruction to its packet and flag the packet boundaries.

        Instructions inside a packet depend on one another (e.g., a constant
        extender applies to the instruction that follows it), so the packet being
        built is kept in ``self.curr_packet``. The disassembler is meant to be fed
        all the instructions of a packet one after the other, in address order;
        disassembling an instruction from a different packet in the middle
        discards the current packet information.

        A new packet is started when any of these holds:
            1. No instruction has been disassembled yet (``curr_packet`` is None).
            2. The previous instruction is contiguous (exactly ``INST_SIZE`` bytes
               back) and was flagged as the end of its packet.
            3. The previous instruction is not contiguous. The parse bits only
               mark the last instruction of a packet, never the first, so for
               lack of other information a new packet is assumed.

        In any other case the instruction is appended to the current packet.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled. Its
                ``start_packet``, ``packet``, ``parse_bits`` and ``end_packet``
                attributes are set here.
            inst (int): Actual instruction value.

        Returns:
            None

        TODOs:
            * Review and move part of this docstring to the project documentation.

            * Remove the `inst` argument once it is added to the HexagonInstruction class.

        """
        packet = self.curr_packet

        if packet is None:
            # Case 1: first instruction ever disassembled.
            begins_packet = True
        else:
            previous = packet.get_last_inst()
            if hi.addr - INST_SIZE == previous.addr:
                # Contiguous use of the disassembler: a new packet begins only
                # if the previous instruction closed its own (case 2),
                # otherwise the current packet simply continues.
                begins_packet = bool(previous.end_packet)
            else:
                # Case 3: not contiguous, assume a new packet.
                begins_packet = True

        hi.start_packet = begins_packet

        if begins_packet:
            # The flag above is set before the packet is built from `hi`.
            self.curr_packet = HexagonPacket(hi)
        else:
            packet.add_next_inst(hi)

        hi.packet = self.curr_packet
        # TODO: Maybe there's some overlapping here and I don't need `self.curr_packet`.

        # The PP (parse) bits, 15:14 of the instruction word, mark the end of
        # the packet when their value is:
        #   * '11': a normal instruction that closes the packet.
        #   * '00': a duplex instruction which, quoting the manual, "must
        #     always appear as the last word in a packet."
        hi.parse_bits = extract_bits(inst, 15, 14)
        hi.end_packet = hi.parse_bits in (0b00, 0b11)
        # TODO: Perform two different checks. The normal PP == 11, and `hi.is_duplex` in
        # another if (`is_duplex` has to be set first, which is not happening now).
示例#3
0
class HexagonDisassembler(object):
    """Hexagon disassembler.

    Attributes:
        inst_templates (List[InstructionTemplate]): List of instruction templates generated by the decoder.
        curr_packet (HexagonPacket): Packet that contains the current instruction.
        objdump_compatible (bool):  Used to produce objdump compatible syntax, to test
            the effectiveness of the disassembler against Qualcomm's objdump. Many
            times the objdump syntax is not the preferred one (e.g., when using the
            disassembler for the IDA processor module), so it can be disabled.
        segmented_inst_templates (Dict[int, List[InstructionTemplate]]): Dictionary of
            lists of instruction templates, classified by their 4 ICLASS bits,
            not including the duplex instructions. Each entry in the dict. is indexed by the
            ICLASS bits, and contains the segment of instructions belonging to that ICLASS.
        duplex_templates (List[InstructionTemplate]): List of duplex instructions templates, that
            are separated from the rest of the templates in `segmented_inst_templates`.

    """
    __slots__ = [
        'inst_templates',
        'curr_packet',
        'segmented_inst_templates',
        'duplex_templates',
        'objdump_compatible',
    ]

    def __init__(self, objdump_compatible=False):
        """Load the instruction templates and index them for template matching.

        Args:
            objdump_compatible (bool): Stored in ``self.objdump_compatible``;
                selects the objdump-compatible output syntax in the other
                methods of this class.

        """
        self.inst_templates = common.pickle_load(common.INST_TEMPL_PATH)

        self.curr_packet = None
        self.objdump_compatible = objdump_compatible

        # The non-duplex templates are grouped by their ICLASS bits (31:28),
        # which are always fixed to 0/1 and are read here from the first four
        # characters of the encoding text. Looking for a template match can
        # then be limited to the (much smaller) group indexed by those 4 bits.
        #
        # The duplex templates are kept apart in their own list. Their ICLASS
        # bits sit at different positions (bits 31:29 and 13), and their
        # matching also involves a condition on the PP (parse) bits that does
        # not fit the plain "these bits have these values" pattern used for
        # the rest of the templates.
        # TODO: rewrite this explanation.

        by_iclass = self.segmented_inst_templates = {}
        duplexes = self.duplex_templates = []

        for template in self.inst_templates:
            if template.is_duplex:
                duplexes.append(template)
                # TODO: The duplex instructions can be segmented too, but I don't know if their quantity merits that split.
                continue

            iclass = int(template.encoding.text[0:4], 2)
            by_iclass.setdefault(iclass, []).append(template)
        # TODO: Move the segmentation to the decoding phase.

    def process_constant_extender(self, hi):
        """Apply a pending constant extender (``immext``) to this instruction.

        When the instruction right before this one in the current packet is a
        constant extender, its value is merged into one of the immediate
        operands of this instruction. Which operand receives it is decided as
        follows, instead of coding Table 10-10 of the manual in here:

        * With a single immediate operand, that is the one extended.
        * With two immediate operands, the operand named by the template's
          ``imm_ext_op`` (inferred by the decoder from the ``apply_extension``
          call in the instruction's behavior) is extended.
        * With two immediate operands and no ``imm_ext_op``, the first one is
          arbitrarily chosen.

        Note (from the manual): "When constant extenders are used, scaled immediates are
        not scaled by the processor. Instead, the assembler must encode the full 32-bit
        unscaled value."

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the extension is applied to the HexagonInstruction itself.

        Raises:
            UnknownInstructionException: If an extender is pending but the
                instruction has no immediate operands, or has more than two.

        """
        packet = self.curr_packet

        if packet.n_inst() < 2:
            # An extension needs at least the ``immext`` and the instruction
            # it applies to in the packet.
            return

        extender_value = packet.get_before_last_inst().immext
        if extender_value is None:
            # Previous instruction was not a constant extender.
            return

        imm_count = len(hi.imm_ops)

        if imm_count == 0:
            raise UnknownInstructionException(
                "Previous instruction was an 'immext', but current instruction doesn't have "
                "any immediate operands to apply the extension to.")

        if imm_count > 2:
            # TODO: Move this check to a more adequate function, maybe in the decoding stage.
            raise UnknownInstructionException(
                "Instruction has more than 2 immediate operands ({:d}). No instruction "
                "studied so far has been observed to have more than that, this is probably "
                "an error from the parsing/decoding stages.".format(imm_count))

        if imm_count == 1 or not hi.template.imm_ext_op:
            # Either the only immediate operand or, with two operands and no
            # hint from the decoder, an arbitrary pick of the first one. The
            # latter shouldn't happen: an extendable instruction is expected
            # to call ``apply_extension`` in its behavior.
            # TODO: Log this case if it happens.
            target = hi.imm_ops[0]
        else:
            # Two imm. operands, rely on the `imm_ext_op` indicator generated by the decoder.
            target = hi.get_real_operand(hi.template.imm_ext_op)

        # Only the lower 6 bits of the original operand value survive; the
        # rest come from the extender, which is already left shifted 6
        # positions.
        target.value = extender_value | extract_bits(target.field_value, 5, 0)
        target.is_extended = True

    def fill_in_reg_info(self, reg, hi):
        """Resolve the printable name of a register operand.

        Three kinds of operand are handled, in this order:

        * Register pairs (e.g., ``R5:4``): both register numbers are derived
          from the field value and replace the doubled field char in the
          template syntax name.
        * New-value registers (template syntax name starting with ``N``): the
          producer instruction is located inside the current packet and the
          register it assigns to replaces the ``N<field char>`` placeholder.
        * Any other single register: the field char is replaced with the
          register number.

        Args:
            reg (InstructionRegister): Target register operand.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the data is applied to the InstructionRegister itself.

        Raises:
            UnknownInstructionException: For a new-value operand whose producer
                distance is the reserved value 0, whose producer instruction
                is not in the current packet, or whose producer text doesn't
                contain a register assignment.

        TODOs:
            * Split in two functions for register pair and single register.

            * And maybe also split in more functions regarding register type, particularly for New-value.

        """
        if reg.template.is_register_pair:

            # Register pair case.
            # -------------------

            if hi.template.mult_inst is not False:
                # Duplex instruction: field value to register numbers, copied
                # from Table 10-3 of the manual (could be reduced to a formula).
                # TODO: Differentiate between duplex and mult_inst (that includes compound).
                # This case probably applies only to duplex, so that attribute
                # (and not `mult_inst`) should be the one tested.
                high, low = {
                    0b000: (1, 0),
                    0b001: (3, 2),
                    0b010: (5, 4),
                    0b011: (7, 6),
                    0b100: (17, 16),
                    0b101: (19, 18),
                    0b110: (21, 20),
                    0b111: (23, 22),
                }[reg.field_value]

            # TODO: It's not clear how the odd/even numbers of a register pair are specified.
            # The assumption is that the field value names one member of the
            # pair and the other is its neighbour, always keeping the order
            # ``R_odd:even`` (odd > even).
            elif reg.field_value % 2 == 0:
                high, low = reg.field_value + 1, reg.field_value
            else:
                high, low = reg.field_value, reg.field_value - 1

            if self.objdump_compatible:
                pair_text = "{:d}:{:d}".format(high, low)
            else:
                # Full register names, "r7:r6" instead of "r7:6", so the IDA
                # text highlighting can spot both register references.
                pair_text = "{:d}:{:s}{:d}".format(
                    high, reg.template.syntax_name[0], low)

            reg.name = reg.template.syntax_name.replace(
                reg.field_char * 2, pair_text)
            return

        if reg.template.syntax_name[0] == 'N':

            # New-value register case.
            # ------------------------
            #
            # From the manual, 10.11 New-value operands: "Instructions that include a new-value
            # register operand specify in their encodings which instruction in the
            # packet has its destination register accessed as the new-value register."
            #
            # The "producer" is understood here as the destination register of
            # an instruction with an assignment (the register left of '=').
            #
            # From the manual:
            #     Nt[2:1] encodes the distance (in instructions) from the producer to
            #     the consumer, as follows:
            #         Nt[2:1] = 00     // reserved
            #         Nt[2:1] = 01     // producer is +1 instruction ahead of consumer
            #         Nt[2:1] = 10     // producer is +2 instructions ahead of consumer
            #         Nt[2:1] = 11     // producer is +3 instructions ahead of consumer
            producer_distance = extract_bits(reg.field_value, 2, 1)

            if producer_distance == 0:
                raise UnknownInstructionException(
                    "New-value operands with a (invalid) consumer distance of 0 (reserved value)"
                )

            # Walk the packet backwards, from the instruction right before the
            # current one to the first, counting every instruction that is not
            # a constant extender. The manual says the distance doesn't count
            # "empty slots or constant extenders"; real cases show that nops
            # are counted, so only the extenders are ignored.
            # TODO: avoid direct access to 'self.curr_packet.instructions'.
            steps = 0
            for producer_inst in reversed(self.curr_packet.instructions[0:-1]):
                if producer_inst.immext is None:
                    steps += 1
                if steps == producer_distance:
                    break
            else:
                # The packet was exhausted without reaching the producer. This
                # may happen when the disassembler is called on a random
                # instruction (not in sequential address order) and the
                # previous instructions of the packet are not available.
                # TODO: Is there a better way to handle it than to raise an exception?
                raise UnknownInstructionException(
                    "New-value register operand with a producer distance of {:d} "
                    "doesn't correspond to a producer instruction.".format(
                        producer_distance))

            # Capture the name of the register assigned in the producer text.
            producer_assignment = re.search(
                r"""
                # Looking for something like: "R14 = ..."

                (             # Open a capture group for the reg. name.
                    r         # The producer register is supposed to be a general
                              # purpose one (Rx). The reg. name is in lowercase (hence
                              # the use of a lower 'r'), converted by populate_syntax.
                    \d{1,2}   # Register number (0-31).
                )             # End of the capture group, only care for the reg. name.
                \s  *
                .?            # Used to cover for cases of compound assignment (e.g.,
                              # '+=', '&=', etc.)
                =             # The producer register has to be the target of an assignment
                              # (i.e., to the left of the '=')
            """, producer_inst.text, re.X)

            # TODO: There may be more than one assignment ('=' in the syntax), if there are multiple instructions.

            if producer_assignment is None:
                raise UnknownInstructionException(
                    "New-value operand with a producer instruction that is not producing "
                    "a new register operand. The pattern 'Rx = ...' was not found."
                )

            # The consumer placeholder (e.g., 'Nt' in 'Nt.new') gives way to
            # the name of the actual producer register.
            reg.name = reg.template.syntax_name.replace(
                'N' + reg.field_char, producer_assignment.group(1))
            return

        # Single register (not a new-value register operand).
        # ---------------------------------------------------
        # TODO: The most common case ends up at the end of a very long function.

        number = reg.field_value
        if hi.template.mult_inst and number > 7:
            # Instruction duplex, Table 10-3, single register case: field
            # values 0-7 are the register numbers 0-7, while field values 8-15
            # correspond to registers 16-23 (field value plus 8).
            # TODO: Check and replace `mult_inst` with `is_duplex`. Those are two different checks
            # (even though it is working like this for unknown reasons).
            number += 8

        reg.name = reg.template.syntax_name.replace(reg.field_char, str(number))

    def fill_in_imm_info(self, imm, hi):
        """Compute the final value and the print format of an immediate operand.

        A constant extension, if any, must already have been applied by
        ``process_constant_extender``: an extended operand keeps the value set
        there, while a non-extended one gets its value from the field value,
        sign-adjusted for the signed types (``r``, ``m``, ``s``) and shifted
        left by the template scaling.

        Args:
            imm (InstructionImmediate): Target immediate operand.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the data is applied to the InstructionImmediate itself.

        Raises:
            UnknownInstructionException: If a branch target (operand of type
                ``r``) results in a negative address.

        TODOs:
            * Handle the case of resulting negative values from constant-extended imm. ops.

        """
        operand_type = imm.template.type

        if imm.is_extended is False:
            # Not extended: apply value (with sign) and scaling.
            imm.value = imm.field_value

            if operand_type in ('r', 'm', 's'):
                # Signed operand types.
                # TODO: Find a shorter way to get the op. mask len, and avoid using HexagonInstruction
                # directly, it's the only reason the `hi` argument was added to this function.
                field_bits = hi.template.encoding.fields[imm.field_char].mask_len
                imm.value = common.get_signed_value(imm.value, field_bits)

            imm.value <<= imm.template.scaled

        if not self.objdump_compatible:
            # Hexadecimal values are preferred, the double hash is not used.
            imm.print_format = '#{:X}'
        elif imm.is_extended:
            imm.print_format = '##{:d}'
        else:
            imm.print_format = '#{:d}'

        if operand_type == 'm':
            # Special case: Modifier registers.
            # TODO: The min-max range for this type in Table 1-3 doesn't seem to add up.
            imm.value += 1

        if operand_type != 'r':
            return

        # Special case: imm. operand used as a target by jump/call. It is
        # relative to PC, so the packet address is added to it (after clearing
        # its two lowest bits), and it is printed in hex without the '#'.
        imm.value &= ~0x3
        imm.value += hi.packet.address

        if imm.value < 0:
            raise UnknownInstructionException(
                "Branch target (taken from an imm. op. of type 'r') "
                "resulted in a negative value: {:x}".format(imm.value))

        # With objdump syntax the address carries the 0x prefix (a format only
        # observed for branch targets).
        imm.print_format = "0x{:x}" if self.objdump_compatible else "{:x}"

    def generate_inst_text(self, hi):
        """Build the text of the instruction from its template syntax.

        A constant extender is rendered directly as ``immext``. For any other
        instruction the template syntax is taken as the starting point and the
        operand placeholders are replaced, immediates first and registers
        after, with their resolved text. The result is always lowercased.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            str: instruction text.

        Raises:
            UnknownInstructionException: If the instruction is unknown and a text can't be produced.

        TODOs:
            * Change function name to something like ``get/extract inst_text (_output)``. It's not exactly
                ``get_inst_text``, because the text is not completely defined here, the start/end packet
                ``{}`` and the ``endloop`` tags have to be added. So what would be the correct terminology
                 for the string inside the packet ``{}``. Split between full instruction text, and the
                 "inner" instruction text.

        """
        if hi.immext is not None:
            # Constant extender. Outside the objdump syntax the extension
            # value is omitted, the final value shows up in the next
            # instruction anyway.
            if self.objdump_compatible:
                return 'immext (#{:d})'.format(hi.immext)
            return 'immext'

        if hi.template is None:
            # Neither a recognized instruction (from a known template) nor a
            # constant extender.
            # TODO: Elaborate on this, why this two attributes have to be none?
            # TODO: Move this raise to the caller? although the UnknownInstruction pattern works correctly.
            raise UnknownInstructionException('Instruction not recognized.')

        text = hi.template.syntax

        # The extension has to be resolved before the immediate values are
        # filled in.
        self.process_constant_extender(hi)

        for imm_op in hi.imm_ops:
            self.fill_in_imm_info(imm_op, hi)
            # E.g., 'Rd = add(Rs, #s16)' -> 'Rd = add(Rs, 2BF4)'
            text = text.replace(imm_op.template.syntax_name, repr(imm_op))

        for reg_op in hi.reg_ops:
            self.fill_in_reg_info(reg_op, hi)
            # E.g., 'Rd = add(Rs, 2BF4)' -> 'R8 = add(Rs, 2BF4)'  (first iteration)
            #       'R8 = add(Rs, 2BF4)' -> 'R8 = add(R17, 2BF4)' (second iteration)
            text = text.replace(reg_op.template.syntax_name, reg_op.name)

        # Lowercase, like objdump.
        # TODO: Check objdump_compatible and only lower it on that case? Not lowering reg. names
        # will need adjusting in some regex and str. manipulation. For simplicity it can be left
        # like this (no check, always lower text).
        return text.lower()

    def generate_instruction_operands(self, inst, hi):
        """Create the instruction operands described by the instruction template.

        For every field char of the template encoding (except ``N``, which is
        skipped) an operand is created according to the type of the matching
        template operand, and filled in with the field char, the field value
        extracted from the instruction and a reference to the template
        operand it comes from.

        Args:
            inst (int): Actual instruction value.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the generated operands are stored in `HexagonInstruction.imm_ops` and `reg_ops`.

        Raises:
            UnexpectedException: If for some error there is a different type of template operand
                than the register or the immediate.

        TODOs:
            * Remove `inst` argument, maybe move it as an attribute of the HexagonInstruction.

        """
        template = hi.template

        for field_char in template.encoding.fields:

            if field_char == 'N':
                # TODO: Handle 'N' field char.
                continue

            operand_template = template.operands[field_char]

            # The new operand is stored in its list right away, and completed
            # afterwards.
            if isinstance(operand_template, ImmediateTemplate):
                operand = InstructionImmediate()
                hi.imm_ops.append(operand)
            elif isinstance(operand_template, RegisterTemplate):
                operand = InstructionRegister()
                hi.reg_ops.append(operand)
            else:
                raise UnexpectedException("Unknown operand type.")

            operand.field_char = field_char
            operand.field_value = self.extract_and_join_mask_bits(
                inst, template.encoding.fields[field_char])

            # Redundant "pointer" back to the template operand, which allows
            # a cleaner access than going through HexagonInstruction.
            # TODO: Move this comment to the docstring of the InstructionOperand class.
            operand.template = operand_template

    def process_endloops(self, hi):
        """Flag the instruction if it closes a hardware loop.

        Only ``endloop0`` is detected for now. When found, the number of the
        loop being ended (0) is appended to the `endloop` attribute of the
        instruction.

        ``process_packet_info`` has to be called before this function, as it
        is the one that determines if this instruction is the last one of the
        packet.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the processing is done inside the `endloop` attribute.

        TODOs:
            * endloop1 analysis.

        """
        if not hi.end_packet:
            # Only the last instruction of the packet can signal the end of the loop.
            return

        packet = self.curr_packet

        if packet.n_inst() < 2:
            # "The last packet in a hardware loop 0 must contain two or
            # more instructions." (From the manual.)
            return

        if packet.get_inst(0).parse_bits != 0b10:
            # Parse Field in First Instruction has to be 10 (Table 10-7).
            return

        # Parse Field in Second Instruction: 01 or 11 (Table 10-7).
        #
        # The value 10 is also accepted, as an exception. The table and the
        # examples don't seem to add up: the examples where both loops end,
        # i.e., ``:endloop0:endloop1``, have parse bits of 10 in the second
        # instruction, which Table 10-7 doesn't allow for ``endloop0``.
        # TODO: Check real examples.
        if packet.get_inst(1).parse_bits in (0b01, 0b11, 0b10):
            hi.endloop.append(0)

    def process_packet_info(self, hi, inst):
        """Attach the instruction to a packet and set its packet-related attributes.

        Instructions depend on other instructions of the same packet (e.g.,
        constant extenders), so the disassembler keeps track of the packet being
        processed in ``curr_packet``. It is meant to be used sequentially: all the
        instructions of a packet have to be disassembled one after the other, in
        address order. If an instruction that is not contiguous to the previous
        one is disassembled, the information of the packet in progress is dropped
        and a new packet is started.

        This function sets `start_packet`, `packet`, `parse_bits` and `end_packet`
        on the instruction, and updates ``curr_packet``.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.
            inst (int): Actual instruction value.

        Returns:
            None

        TODOs:
            * Review and move part of this docstring to the project documentation.

            * Remove the `inst` argument once it is added to the HexagonInstruction class.

        """
        previous_packet = self.curr_packet

        # A new packet is started when:
        #   1. No instruction has been disassembled yet (no current packet).
        #   2. The previous instruction is contiguous (exactly one instruction
        #      size behind) and it was the end of its packet.
        #   3. The previous instruction is not contiguous. The parse bits only
        #      mark the last instruction of a packet, never the first, so without
        #      more information the safest assumption is that a packet starts here.
        if previous_packet is None:
            starts_packet = True  # Case 1.
        else:
            last_inst = previous_packet.get_last_inst()
            if last_inst.addr != hi.addr - INST_SIZE:
                starts_packet = True  # Case 3.
            else:
                # Case 2 if the previous instruction closed its packet, otherwise
                # the current packet continues with this instruction.
                starts_packet = bool(last_inst.end_packet)

        hi.start_packet = starts_packet

        if starts_packet:
            # First instruction of a packet: create the packet around it.
            self.curr_packet = HexagonPacket(hi)
        else:
            # The instruction continues the packet in progress.
            previous_packet.add_next_inst(hi)

        hi.packet = self.curr_packet
        # TODO: Maybe there's some overlapping here and I don't need `self.curr_packet`.

        # The parse bits (15:14) tell if this instruction ends the packet:
        #   * '11' signals the packet end for a normal instruction.
        #   * '00' signals a duplex instruction, and from the manual: "The duplex
        #     must always appear as the last word in a packet."
        parse_bits = extract_bits(inst, 15, 14)
        hi.parse_bits = parse_bits
        hi.end_packet = parse_bits in (0b00, 0b11)
        # TODO: Perform two different checks. The normal PP == 11, and `hi.is_duplex` in
        # another if (`is_duplex` has to be set first, which is not happening now).

    def extract_and_join_mask_bits(self, inst, encoding_field):
        """Return the value of an encoding field taken from an instruction word.

        Args:
            inst (int): Actual instruction value from which the field value is extracted.
            encoding_field (EncodingField): field whose value will be extracted. The
                attributes read are `no_mask_split`, `mask_lower_pos` and `mask_len`
                (contiguous field), or `mask` (scattered field).

        Returns:
            int: extracted field value.

        TODOs:
            * Change function name, what is being is extracted is a field value, not a mask,
                and the join should be implicit.

        """
        if encoding_field.no_mask_split:
            # Most common case, handled first for performance: the field bits are
            # contiguous, so a shift and a mask are enough.
            # TODO: Use `extract_bits`.
            low_bits_mask = (2 ** encoding_field.mask_len) - 1
            return (inst >> encoding_field.mask_lower_pos) & low_bits_mask

        # The field bits are scattered through the instruction word: collect them
        # from the most significant to the least significant position and pack
        # them together.
        field_mask = encoding_field.mask
        joined = 0
        for bit in reversed(range(32)):
            if (field_mask >> bit) & 1:
                joined = (joined << 1) | ((inst >> bit) & 1)

        return joined

    def disasm_one_inst(self, inst, addr=0):
        """Disassemble one instruction value interpreted as an unsigned int.

        The steps are: register the instruction in its packet
        (``process_packet_info``), decode it either as a constant extender or
        through a matching template, build its text (wrapped with the packet
        braces when it starts/ends a packet), and finally append the ``endloop0``
        tag when it applies.

        Args:
            inst (int): Actual instruction value.
            addr (Optional[int]): Address of the instruction being disassembled (used for
                packet processing purposes).

        Returns:
            HexagonInstruction: disassembled instruction. If no text could be
                generated, its text contains ``<unknown>`` and `is_unknown` is set.

        Raises:
            UnexpectedException: If `inst` is not an int, or if it doesn't fit in
                an unsigned 32-bit value.

        TODOs:
            * Define the input type, for now it's an unsigned int with the
                endianness (little endian) resolved.

        """
        if not isinstance(inst, int):
            raise UnexpectedException(
                "Instruction value must be an int, got: {:s}.".format(type(inst).__name__)
            )

        if inst < 0 or inst > 0xFFFFFFFF:
            raise UnexpectedException(
                "Instruction value out of the unsigned 32-bit range: {:#x}.".format(inst)
            )

        hi = HexagonInstruction()
        hi.addr = addr

        self.process_packet_info(hi, inst)

        # Parse bits set to 00 signal a duplex instruction.
        hi.is_duplex = (hi.parse_bits == 0b00)

        if extract_bits(inst, 31, 28) == 0 and not hi.is_duplex:

            # Constant extender instruction, extract the extension value:
            # bits 27:16 | 13:0, joined and moved to the upper 26 bits.

            hi.immext = (extract_bits(inst, 27, 16) << 14) | extract_bits(inst, 13, 0)
            hi.immext <<= 6
            # TODO: Move to a separate function.

        elif self.find_template(inst, hi):
            # Not a constant extender. A template matched, so the operands can
            # be generated from it.
            self.generate_instruction_operands(inst, hi)

        hi.text += '{ ' if hi.start_packet else '  '

        try:
            hi.text += self.generate_inst_text(hi)
            # TODO: Move all str manipulation to `generate_inst_text` function? The nice thing of the
            # current arrangement is the exception catch, where I can have an unknown with {}
            # (i.e., ``{ <unknown> }``) even if the disassembly failed.

        except UnknownInstructionException:
            hi.text += "<unknown>"
            hi.is_unknown = True

        if hi.end_packet:
            hi.text += ' }'
            # Even if the instruction is unknown, the parse bits analysis is
            # still valid, so the start/end packet settings stand, e.g.,
            # ``{ <unknown> }`` is a valid text output.

        self.process_endloops(hi)
        if 0 in hi.endloop:
            hi.text += ':endloop0'

        return hi

    def find_template(self, inst, hi):
        """Find the template for an instruction value.

        Duplex instructions are searched in `duplex_templates`; the rest are
        searched in the segment of `segmented_inst_templates` indexed by the
        ICLASS bits (31:28) of the instruction. The first template whose encoding
        matches (``inst & mask == value``) is stored in `hi.template`.

        As a cheap way of keeping frequently matched templates near the front, a
        matched template is moved one position towards the start of its list.

        Args:
            inst (int): Actual instruction value.
            hi (HexagonInstruction): Instruction object where the `template` attribute is set
                to the value of the found (if any) template.

        Returns:
            bool: True if a template was found; False otherwise (including the
                case where there is no template segment for the ICLASS bits).

        TODOs:
            * Improve performance.

        """
        if hi.is_duplex:
            candidates = self.duplex_templates
        else:
            # An ICLASS without any template is a "not found", not an error.
            candidates = self.segmented_inst_templates.get(extract_bits(inst, 31, 28), [])

        for index, template in enumerate(candidates):
            if (inst & template.encoding.mask) != template.encoding.value:
                continue

            # Found a template match.
            hi.template = template

            # TODO: Small hack to partially reorder the list by most found, one swap
            # at time, should be improved.
            # Modifying the list here is safe: the iteration stops right after.
            if index != 0:
                candidates[index - 1], candidates[index] = (
                    candidates[index], candidates[index - 1])

            return True

        return False
示例#4
0
class HexagonDisassembler(object):
    """Hexagon disassembler.

    Attributes:
        inst_templates (List[InstructionTemplate]): List of instruction templates generated by the decoder.
        curr_packet (HexagonPacket): Packet that contains the current instruction.
        objdump_compatible (bool):  Used to produce objdump compatible syntax, to test
            the effectiveness of the disassembler against Qualcomm's objdump. Many
            times the objdump syntax is not the preferred one (e.g., when using the
            disassembler for the IDA processor module), so it can be disabled.
        segmented_inst_templates (Dict[int, List[InstructionTemplate]]): Dictionary of
            lists of instruction templates, classified by their 4 ICLASS bits,
            not including the duplex instructions. Each entry in the dict. is indexed by the
            ICLASS bits, and contains the segment of instructions belonging to that ICLASS.
        duplex_templates (List[InstructionTemplate]): List of duplex instructions templates, that
            are separated from the rest of the templates in `segmented_inst_templates`.

    """
    __slots__ = ['inst_templates', 'curr_packet', 'segmented_inst_templates',
                 'duplex_templates', 'objdump_compatible',]

    def __init__(self, objdump_compatible=False):
        """Load the instruction templates and classify them for faster matching.

        Args:
            objdump_compatible (bool): Value stored in the `objdump_compatible`
                attribute (objdump-like syntax when True).

        """
        self.inst_templates = common.pickle_load(common.INST_TEMPL_PATH)

        self.curr_packet = None
        self.objdump_compatible = objdump_compatible

        # The non-duplex templates are grouped by their ICLASS bits (31:28, taken
        # from the first 4 characters of the encoding text), so that a template
        # search can be limited to the small group sharing those 4 bits.
        #
        # The duplex templates are kept in a separate list. According to the
        # original author's notes, their ICLASS bits are located in different
        # positions (bits 31:29 and 13) and their matching involves an extra
        # condition on the PP (parse) bits, which doesn't fit the plain
        # "mask/value" match used for the rest of the templates.
        # TODO: rewrite this explanation.

        self.segmented_inst_templates = {}
        self.duplex_templates = []

        for template in self.inst_templates:
            if template.is_duplex:
                self.duplex_templates.append(template)
                # TODO: The duplex instructions can be segmented too, but I don't know
                # if their quantity merits that split.
                continue

            iclass = int(template.encoding.text[:4], 2)
            self.segmented_inst_templates.setdefault(iclass, []).append(template)
        # TODO: Move the segmentation to the decoding phase.


    def process_constant_extender(self, hi):
        """Apply the constant extender of the previous instruction (if any) to this one.

        If the previous instruction of the packet was a constant extender
        (``immext``), its value is applied to one of the immediate operands of this
        instruction. Which operand is extended depends on the instruction type
        (Table 10-10 of the manual). Instead of coding that table here, two
        simplifications are used: with a single immediate operand, that is the one
        extended; with two, the `imm_ext_op` indicator of the template (generated
        by the decoder) selects the operand.

        Note (from the manual): "When constant extenders are used, scaled immediates are
        not scaled by the processor. Instead, the assembler must encode the full 32-bit
        unscaled value."

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the extension is applied to the HexagonInstruction itself (the
                `value` and `is_extended` attributes of the extended operand).

        Raises:
            UnknownInstructionException: If there is a pending extension and the
                instruction has no immediate operands, or more than two.

        """
        packet = self.curr_packet

        # At least 2 instructions are needed in the packet so far: the ``immext``
        # and the following instruction to apply it to.
        if packet.n_inst() < 2:
            return

        extension = packet.get_before_last_inst().immext
        if extension is None:
            # Previous instruction was not a constant extender.
            return

        imm_count = len(hi.imm_ops)

        if imm_count == 0:
            raise UnknownInstructionException(
                "Previous instruction was an 'immext', but current instruction doesn't have "
                "any immediate operands to apply the extension to."
            )

        # Having more than 2 imm. ops. breaks the logic of this function, although
        # the check should be done prior to the disassembling stage.
        # TODO: Move this check to a more adequate function, maybe in the decoding stage.
        if imm_count > 2:
            raise UnknownInstructionException(
                "Instruction has more than 2 immediate operands ({:d}). No instruction "
                "studied so far has been observed to have more than that, this is probably "
                "an error from the parsing/decoding stages.".format(imm_count)
            )

        if imm_count == 1 or not hi.template.imm_ext_op:
            # Either there is only one candidate, or the decoder couldn't figure
            # out which of the two imm. ops. is extended and the first one is
            # (arbitrarily) chosen. The latter shouldn't happen: the behavior of
            # an extendable instruction should always call ``apply_extension``.
            # TODO: Log this case if it happens.
            target = hi.imm_ops[0]
        else:
            # Two imm. operands, rely on the `imm_ext_op` indicator generated by the decoder.
            target = hi.get_real_operand(hi.template.imm_ext_op)

        # Only the lower 6 bits of the original operand value remain, the rest are
        # taken from the constant extender, whose value has already been left
        # shifted 6 positions.
        target.value = extension | extract_bits(target.field_value, 5, 0)
        target.is_extended = True

    def fill_in_reg_info(self, reg, hi):
        """Compute the name (text) of a register operand and store it in `reg.name`.

        Three kinds of operands are handled, in this order:
            1. Register pairs (e.g., ``R5:4``).
            2. New-value registers (template syntax name starting with 'N'), which
               take the name of the register produced by a previous instruction
               of the packet.
            3. Plain single registers.

        Args:
            reg (InstructionRegister): Target register operand.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the data is applied to the InstructionRegister itself.

        Raises:
            UnknownInstructionException: For a new-value operand, if the producer
                distance is the reserved value 0, if no producer instruction is
                found at that distance, or if the producer text doesn't contain
                a register assignment.

        TODOs:
            * Split in two functions for register pair and single register.

            * And maybe also split in more functions regarding register type, particularly for New-value.

        """
        # Register pair case.
        # -------------------

        if reg.template.is_register_pair:

            if hi.template.mult_inst is False:
                # TODO: It's not clear how the odd/even numbers of a register pair are specified.
                # The assumption is that the field value may be either number of
                # the pair. The order is always ``R_odd:even`` (odd > even), so the
                # pair is rebuilt around the field value respecting this order.
                even = reg.field_value & ~1
                odd = even | 1

            else:  # Duplex instruction.

                # TODO: Differentiate between duplex and mult_inst (that includes compound).
                # I think this case applies only to the duplex case, so that attribute (and
                # not `mult_inst`) should be tested in the if.

                # Field value to (odd, even) register numbers, copied from
                # Table 10-3 of the manual (could be reduced to a formula).
                duplex_pairs = {
                    0b000: (1, 0), 0b001: (3, 2), 0b010: (5, 4), 0b011: (7, 6),
                    0b100: (17, 16), 0b101: (19, 18), 0b110: (21, 20), 0b111: (23, 22),
                }
                odd, even = duplex_pairs[reg.field_value]

            if self.objdump_compatible:
                pair_text = "{:d}:{:d}".format(odd, even)
            else:
                # Prefer full register names: "r7:r6" (instead of "r7:6"), to take
                # advantage of the IDA text highlighting feature, to easily spot
                # register references.
                pair_text = "{:d}:{:s}{:d}".format(odd, reg.template.syntax_name[0], even)

            reg.name = reg.template.syntax_name.replace(reg.field_char * 2, pair_text)
            return

        # New-value register case.
        # ------------------------

        if reg.template.syntax_name[0] == 'N':
            # From the manual, 10.11 New-value operands: "Instructions that include a new-value
            # register operand specify in their encodings which instruction in the
            # packet has its destination register accessed as the new-value register."
            #
            # The "producer" is understood here as the destination register of an
            # instruction with an assignment (a register to the left of '=').
            #
            # Nt[2:1] encodes the distance (in instructions) from the producer to
            # the consumer: 00 is reserved, 01/10/11 mean that the producer is
            # +1/+2/+3 instructions ahead of the consumer.
            producer_distance = extract_bits(reg.field_value, 2, 1)  # type: int

            if producer_distance == 0:
                raise UnknownInstructionException(
                    "New-value operands with a (invalid) consumer distance of 0 (reserved value)"
                )

            # Walk the packet backwards, from the instruction before the current
            # one (which holds the consumer register) to the first one. According
            # to the manual the distance doesn't count "empty slots or constant
            # extenders"; real cases show that nops do count, so only the
            # constant extenders are ignored.
            # TODO: avoid direct access to 'self.curr_packet.instructions'.
            steps = 0
            for candidate in reversed(self.curr_packet.instructions[0:-1]):
                if candidate.immext is not None:
                    continue

                steps += 1
                if steps == producer_distance:
                    producer_inst = candidate  # type: HexagonInstruction
                    break
            else:
                # It may happen that the disassembler is called for a random
                # instruction (i.e., not in sequential address order), and the
                # previous instructions of the packet are not available.
                # TODO: Is there a better way to handle it than to raise an exception?
                raise UnknownInstructionException(
                    "New-value register operand with a producer distance of {:d} "
                    "doesn't correspond to a producer instruction.".format(producer_distance)
                )

            # The producer instruction has been found, now capture the name of the
            # register it assigns to.
            producer_pattern = r"""
                # Looking for something like: "R14 = ..."

                (             # Open a capture group for the reg. name.
                    r         # The producer register is supposed to be a general
                              # purpose one (Rx). The reg. name is in lowercase (hence
                              # the use of a lower 'r'), converted by populate_syntax.
                    \d{1,2}   # Register number (0-31).
                )             # End of the capture group, only care for the reg. name.
                \s  *
                .?            # Used to cover for cases of compound assignment (e.g.,
                              # '+=', '&=', etc.)
                =             # The producer register has to be the target of an assignment
                              # (i.e., to the left of the '=')
            """
            match = re.search(producer_pattern, producer_inst.text, re.X)

            # TODO: There may be more than one assignment ('=' in the syntax), if there are multiple instructions.

            if match is None:
                raise UnknownInstructionException(
                    "New-value operand with a producer instruction that is not producing "
                    "a new register operand. The pattern 'Rx = ...' was not found.")

            # Replace the consumer placeholder (e.g., 'Nt' in 'Nt.new') with the
            # name of the producer register captured above.
            reg.name = reg.template.syntax_name.replace('N' + reg.field_char, match.group(1))
            return

        # Single register (not a new-value register operand).
        # ----------------------------------------------------
        # TODO: The most common case ends up at the end of a very long function.

        reg_number = reg.field_value

        # Instruction duplex, Table 10-3, single register case: field values 0-7
        # map to reg. numbers 0-7, field values 8-15 map to reg. numbers 16-23
        # (the field value plus 8).
        # TODO: Check and replace `mult_inst` with `is_duplex`. Those are two different checks
        # (even though it is working like this for unknown reasons).
        if hi.template.mult_inst and reg_number > 7:
            reg_number += 8

        reg.name = reg.template.syntax_name.replace(reg.field_char, str(reg_number))

    def fill_in_imm_info(self, imm, hi):
        """Set the value and the print format of an immediate operand.

        For operands that were not constant-extended the value is computed here
        from the field value (sign and scaling applied). Extended operands already
        have their value, set by ``process_constant_extender``, which has to be
        called before this function.

        Args:
            imm (InstructionImmediate): Target immediate operand.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the data is applied to the InstructionImmediate itself.

        Raises:
            UnknownInstructionException: If the branch target computed for an
                operand of type 'r' is negative.

        TODOs:
            * Handle the case of resulting negative values from constant-extended imm. ops.

        """
        operand_type = imm.template.type

        if imm.is_extended is False:
            # Immediate operand was not extended, apply value (with sign) and scaling.
            imm.value = imm.field_value

            if operand_type in ('r', 'm', 's'):
                # The immediate operand type is signed, the field length is
                # needed to interpret the sign bit.
                # TODO: Find a shorter way to get the op. mask len, and avoid using HexagonInstruction
                # directly, it's the only reason the `hi` argument was added to this function.
                field = hi.template.encoding.fields[imm.field_char]
                imm.value = common.get_signed_value(imm.value, field.mask_len)

            imm.value <<= imm.template.scaled

        if not self.objdump_compatible:
            # Hexadecimal values are preferred, the double hash is not used.
            imm.print_format = '#{:X}'
        elif imm.is_extended:
            imm.print_format = '##{:d}'
        else:
            imm.print_format = '#{:d}'

        if operand_type == 'm':
            # Special case: Modifier registers.
            # TODO: The min-max range for this type in Table 1-3 doesn't seem to add up.
            imm.value += 1

        if operand_type == 'r':
            # Special case: imm. operand used as a target by jump/call, it's
            # relative to PC (added to the packet address) and usually printed
            # in hex without the '#'.
            imm.value &= ~0x3
            imm.value += hi.packet.address

            if imm.value < 0:
                raise UnknownInstructionException(
                    "Branch target (taken from an imm. op. of type 'r') "
                    "resulted in a negative value: {:x}".format(imm.value)
                )

            # In objdump, addresses are printed in hexadecimal with the 0x prefix
            # and without the '#' (this format was only observed for branch targets).
            imm.print_format = "0x{:x}" if self.objdump_compatible else "{:x}"

    def generate_inst_text(self, hi):
        """Build the text of the instruction (without packet braces or endloop tags).

        A constant extender produces the fixed ``immext`` text. For any other
        instruction, the syntax of its template is taken and the placeholders of
        the immediate and register operands are replaced by their actual values
        and names. The resulting text is lowercased.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            str: instruction text.

        Raises:
            UnknownInstructionException: If the instruction is unknown and a text can't be produced.

        TODOs:
            * Change function name to something like ``get/extract inst_text (_output)``. It's not exactly
                ``get_inst_text``, because the text is not completely defined here, the start/end packet
                ``{}`` and the ``endloop`` tags have to be added. So what would be the correct terminology
                 for the string inside the packet ``{}``. Split between full instruction text, and the
                 "inner" instruction text.

        """
        if hi.immext is not None:
            # Constant extender. Outside of the objdump mode the extension value
            # is not printed, the final value is shown in the next instruction.
            if self.objdump_compatible:
                return 'immext (#{:d})'.format(hi.immext)
            return 'immext'

        if hi.template is None:
            # It's neither a recognized instruction (from a known template) nor a
            # constant extender.
            # TODO: Move this raise to the caller? although the UnknownInstruction pattern works correctly.
            raise UnknownInstructionException('Instruction not recognized.')

        text = hi.template.syntax

        # A pending constant extension has to be applied before the immediate
        # values are computed.
        self.process_constant_extender(hi)

        # Replace each operand placeholder of the syntax with its actual text.
        # E.g., 'Rd = add(Rs, #s16)' -> 'Rd = add(Rs, 2BF4)'
        for imm in hi.imm_ops:
            self.fill_in_imm_info(imm, hi)
            text = text.replace(imm.template.syntax_name, repr(imm))

        # E.g., 'Rd = add(Rs, 2BF4)' -> 'R8 = add(Rs, 2BF4)'  (first iteration)
        #       'R8 = add(Rs, 2BF4)' -> 'R8 = add(R17, 2BF4)' (second iteration)
        for reg in hi.reg_ops:
            self.fill_in_reg_info(reg, hi)
            text = text.replace(reg.template.syntax_name, reg.name)

        # Lowercase, like objdump.
        # TODO: Check objdump_compatible and only lower it on that case? Not lowering reg. names
        # will need adjusting in some regex and str. manipulation. For simplicity it can be left
        # like this (no check, always lower text).
        return text.lower()
    
    def generate_instruction_operands(self, inst, hi):
        """Create the instruction operands out of the template operands.

        For every field of the template encoding (except ``'N'``) an operand
        object is created, appended to the matching list of the instruction and
        filled with the field char, the field value extracted from `inst` and a
        reference to the template operand it was created from.

        Args:
            inst (int): Actual instruction value.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the generated operands are stored in `HexagonInstruction.imm_ops` and `reg_ops`.

        Raises:
            UnexpectedException: If a template operand is neither a register
                nor an immediate.

        TODOs:
            * Remove `inst` argument, maybe move it as an attribute of the HexagonInstruction.

            * Handle the 'N' field char (currently skipped).

            * Move the note about the operand's template reference to the docstring
                of the InstructionOperand class.

        """
        encoding_fields = hi.template.encoding.fields
        template_operands = hi.template.operands

        for field_char in encoding_fields:
            if field_char == 'N':
                continue

            template_op = template_operands[field_char]

            # Instruction operand being generated (from the template operand).
            if isinstance(template_op, ImmediateTemplate):
                new_op = InstructionImmediate()  # type: InstructionOperand
                hi.imm_ops.append(new_op)
            elif isinstance(template_op, RegisterTemplate):
                new_op = InstructionRegister()
                hi.reg_ops.append(new_op)
            else:
                raise UnexpectedException("Unknown operand type.")

            new_op.field_char = field_char
            new_op.field_value = self.extract_and_join_mask_bits(
                inst, encoding_fields[field_char])

            # The operand keeps a reference to the template operand it was
            # created from. It is redundant, but gives cleaner access than
            # going through HexagonInstruction.
            new_op.template = template_op

    def process_endloops(self, hi):
        """Detect (if present) the end of a hardware loop.

        Checks if this instruction signals the end of a hardware loop (e.g.,
        ``endloop0``). When it does, the number of the loop being ended is
        appended to the `endloop` attribute of the instruction. Only loop 0
        is detected for now.

        ``process_packet_info`` has to be called before this function: it sets
        the `end_packet` attribute and the current packet used here.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the result is stored in the `endloop` attribute of `hi`.

        TODOs:
            * endloop1 analysis.

            * Check real examples for the parse bits of the second instruction
                (see the comment in the body).

        """
        if not hi.end_packet:
            # Only the last instruction of the packet can signal the end of the loop.
            return

        packet = self.curr_packet

        # "The last packet in a hardware loop 0 must contain two or
        # more instructions." (From the manual.)
        if packet.n_inst() < 2:
            return

        # Parse Field in First Instruction: 10 (Table 10-7).
        if packet.get_inst(0).parse_bits != 0b10:
            return

        # Parse Field in Second Instruction: 01 or 11 (Table 10-7).
        #
        # The value 10 is also accepted as an exception: the table and the
        # examples don't seem to add up. The examples where both loops end,
        # i.e., ``:endloop0:endloop1``, have parse bits of 10 in the second
        # instruction, which would violate the rule in Table 10-7 (only 01
        # or 11 allowed for ``endloop0``).
        if packet.get_inst(1).parse_bits in (0b01, 0b11, 0b10):
            hi.endloop.append(0)

    def process_packet_info(self, hi, inst):
        """Update the packet bookkeeping for the instruction being disassembled.

        Keeping track of all the instructions in the packet is necessary as many
        instructions depend on previous ones (e.g., constant extenders), and this
        dependency is limited to the packet: all the information needed to
        correctly disassemble the instructions is in the packet itself.

        The disassembler is designed to be used in sequential mode, disassembling
        all the instructions of a packet one after the other. A single instruction
        can't be correctly analyzed outside that scope (although IDA analysis
        sometimes does that).

        During a packet disassembly, if an instruction from a different packet is
        disassembled (calling `disasm_one_inst`) all the current packet information
        is lost. All the instructions of a single packet have to be disassembled in
        continuous order.

        This function sets the `start_packet`, `packet`, `parse_bits` and
        `end_packet` attributes of `hi`, and updates `self.curr_packet`.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.
            inst (int): Actual instruction value.

        Returns:
            None

        TODOs:
            * Review and move part of this docstring to the project documentation.

            * Remove the `inst` argument once it is added to the HexagonInstruction class.

            * Maybe `self.curr_packet` overlaps with `hi.packet` and is not needed.

            * Perform two different end-of-packet checks: the normal parse bits == 11,
                and `hi.is_duplex` in a separate one (`is_duplex` has to be set
                first, which is not happening now).

        """

        # This instruction continues the current packet only when there is a
        # current packet, its last instruction is exactly one instruction
        # (INST_SIZE bytes) behind this one, and that last instruction did not
        # end the packet. In every other case a new packet is started:
        #   1. This is the first ever instruction being disassembled (i.e.,
        #       ``curr_packet`` is None).
        #   2. The previous (contiguous) instruction was the end of its packet.
        #   3. The previously disassembled instruction is not contiguous, so it
        #       has to be assumed (for lack of any other information) that a new
        #       packet is being disassembled. There is no way to know for sure
        #       that this instruction is indeed the first one in the packet (the
        #       parse bits only indicate the last, not the first instruction), so
        #       it's the safest bet (assuming the disassembler is being correctly
        #       used, a jump to the middle of the packet is not allowed).
        previous_packet = self.curr_packet

        if previous_packet is None:
            continues_packet = False
        else:
            last_inst = previous_packet.get_last_inst()
            is_contiguous = (hi.addr - INST_SIZE == last_inst.addr)
            continues_packet = is_contiguous and not last_inst.end_packet

        hi.start_packet = not continues_packet

        if hi.start_packet:
            # First instruction of a packet: the packet has to be a new one.
            self.curr_packet = HexagonPacket(hi)
        else:
            # The instruction is added to the list of the current packet.
            previous_packet.add_next_inst(hi)

        hi.packet = self.curr_packet

        # The instruction is the end of the packet if its parse bits
        # (bits 15:14) are:
        #   1. '11' for a normal instruction, signals packet end.
        #   2. '00' signals a duplex instruction, and from the manual: "The duplex
        #       must always appear as the last word in a packet."
        hi.parse_bits = extract_bits(inst, 15, 14)
        hi.end_packet = hi.parse_bits in (0b00, 0b11)

    def extract_and_join_mask_bits(self, inst, encoding_field):
        """Extract the value of an encoding field from an instruction.

        Args:
            inst (int): Actual instruction value from which the field value is extracted.
            encoding_field (EncodingField): field whose value will be extracted.

        Returns:
            int: extracted field value. When the field bits are scattered in the
                instruction, they are joined keeping their relative order (the
                highest field bit ends up as the most significant bit).

        TODOs:
            * Change function name: what is being extracted is a field value, not
                a mask, and the join should be implicit.

            * Use `extract_bits` in the contiguous case, and the set_bit/clear_bit
                functions in the scattered case.

        """

        # The (most common) case of a contiguous field is handled separately
        # for performance reasons: a single shift and mask is enough.
        if encoding_field.no_mask_split:
            field_mask = (1 << encoding_field.mask_len) - 1
            return (inst >> encoding_field.mask_lower_pos) & field_mask

        # The field bits are scattered: walk the 32 bit positions from the
        # highest to the lowest, and append every instruction bit selected by
        # the field mask to the joined value.
        mask = encoding_field.mask
        joined_value = 0
        for bit_pos in reversed(range(32)):
            if (mask >> bit_pos) & 1:
                inst_bit = (inst >> bit_pos) & 1
                joined_value = (joined_value << 1) | inst_bit

        return joined_value
                
    def disasm_one_inst(self, inst, addr=0):
        """Disassemble one instruction value interpreted as an unsigned int.

        The packet information is processed first, then the instruction is
        decoded either as a constant extender or through a matching template,
        and finally its text is built (including the packet delimiters and
        the ``endloop0`` tag when they apply).

        Args:
            inst (int): Actual instruction value.
            addr (Optional[int]): Address of the instruction being disassembled (used for
                packet processing purposes).

        Returns:
            HexagonInstruction: disassembled instruction. If the instruction is
                not recognized its text is ``<unknown>`` and `is_unknown` is set.

        Raises:
            UnexpectedException: If `inst` is not an int or is outside the
                32-bit unsigned range.

        TODOs:
            * Define the input type, for now it's an unsigned int with the
                endianness (little endian) already resolved.

            * Move the constant extender value extraction to a separate function.

            * Move all the string manipulation to `generate_inst_text`? The nice
                thing of the current arrangement is the exception catch, which
                allows an unknown with {} (i.e., ``{ <unknown> }``) even if the
                disassembly failed.

        """
        if not isinstance(inst, int) or not 0 <= inst <= 0xFFFFFFFF:
            raise UnexpectedException()

        hi = HexagonInstruction()
        hi.addr = addr

        self.process_packet_info(hi, inst)

        hi.is_duplex = (hi.parse_bits == 0b00)

        if extract_bits(inst, 31, 28) == 0 and not hi.is_duplex:
            # Constant extender instruction, extract the extension value:
            # bits 27:16 | 13:0, joined and moved to the upper 26 bits.
            upper_bits = extract_bits(inst, 27, 16)
            lower_bits = extract_bits(inst, 13, 0)
            hi.immext = ((upper_bits << 14) | lower_bits) << 6

        elif self.find_template(inst, hi):
            # Not a constant extender: a matching template was found among
            # the available ones.
            self.generate_instruction_operands(inst, hi)

        hi.text += '{ ' if hi.start_packet else '  '

        try:
            hi.text += self.generate_inst_text(hi)
        except UnknownInstructionException:
            hi.text += "<unknown>"
            hi.is_unknown = True

        # Even if the instruction is unknown, the parse bits analysis is
        # still valid, so the start/end packet settings stand, e.g.,
        # ``{ <unknown> }`` is a valid text output.
        if hi.end_packet:
            hi.text += ' }'

        self.process_endloops(hi)
        if 0 in hi.endloop:
            hi.text += ':endloop0'

        return hi

    def find_template(self, inst, hi):
        """Search the template that matches an instruction value.

        Duplex instructions are searched in the duplex templates; the rest in
        the templates segmented by the upper 4 bits (31:28) of the instruction.
        A template matches when the instruction, masked with the template
        encoding mask, equals the template encoding value.

        Args:
            inst (int): Actual instruction value.
            hi (HexagonInstruction): Instruction object where the `template` attribute is set
                to the value of the found (if any) template.

        Returns:
            bool: True if a template was found; False otherwise.

        TODOs:
            * Improve performance.

            * The one-swap-at-a-time reordering of the template list is a small
                hack and should be improved.

        """
        if hi.is_duplex:
            candidates = self.duplex_templates  # type: List[InstructionTemplate]
        else:
            candidates = self.segmented_inst_templates[extract_bits(inst, 31, 28)]

        for idx, candidate in enumerate(candidates):
            encoding = candidate.encoding
            if (inst & encoding.mask) != encoding.value:
                continue

            # Found a template match.
            hi.template = candidate

            # Move the matched template one position towards the front, so the
            # most frequently found templates are partially reordered to be
            # tried first. Modifying the list here is safe because the
            # iteration ends right after.
            if idx != 0:
                candidates[idx - 1], candidates[idx] = candidate, candidates[idx - 1]

            return True

        return False