Exemplo n.º 1
0
    def test_retransform_message_indices(self):
        """A range shared by all messages is split into one range per distinct sync end."""
        sync_ends = np.array([12, 12, 12, 14, 14])

        def make_length_range(start, indices):
            # All ranges in this test only differ in start and message indices
            return CommonRange(start,
                               8,
                               "1" * 8,
                               score=1,
                               field_type="length",
                               message_indices=indices)

        relative_range = make_length_range(0, {0, 1, 2, 3, 4})
        retransformed_ranges = FormatFinder.retransform_message_indices(
            [relative_range], [0, 1, 2, 3, 4], sync_ends)

        # Two different sync ends (12 and 14) -> two ranges expected
        self.assertEqual(len(retransformed_ranges), 2)

        expected_ranges = (
            make_length_range(12, {0, 1, 2}),
            make_length_range(14, {3, 4}),
        )
        for expected in expected_ranges:
            self.assertIn(expected, retransformed_ranges)
Exemplo n.º 2
0
    def test_create_message_types_2(self):
        """Ranges with partially overlapping message indices yield five containers."""
        length_rng = CommonRange(0, 8, "1" * 8, score=1, field_type="Length")
        length_rng.message_indices = {0, 2, 4, 6, 8, 12}
        address_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Address")
        address_rng.message_indices = {1, 2, 3, 4, 5, 12}
        seq_rng = CommonRange(16, 8, "1" * 8, score=1, field_type="Seq")
        seq_rng.message_indices = {1, 3, 5, 7, 12}

        message_types = FormatFinder.create_common_range_containers(
            {length_rng, address_rng, seq_rng})

        # One container for every combination of ranges a message appears in
        expected_containers = [
            CommonRangeContainer([length_rng], message_indices={0, 6, 8}),
            CommonRangeContainer([length_rng, address_rng],
                                 message_indices={2, 4}),
            CommonRangeContainer([length_rng, address_rng, seq_rng],
                                 message_indices={12}),
            CommonRangeContainer([address_rng, seq_rng],
                                 message_indices={1, 3, 5}),
            CommonRangeContainer([seq_rng], message_indices={7}),
        ]

        self.assertEqual(len(message_types), 5)

        for expected in expected_containers:
            self.assertIn(expected, message_types)
Exemplo n.º 3
0
    def test_handle_medium_overlapping_conflict(self):
        """Of five ranges in one container only rng1, rng3 and rng5 are expected to remain."""
        length_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Length")
        address_rng = CommonRange(4, 10, "1" * 8, score=0.8,
                                  field_type="Address")
        seq_rng = CommonRange(15, 20, "1" * 8, score=1, field_type="Seq")
        type_rng = CommonRange(60, 80, "1" * 8, score=0.8, field_type="Type")
        data_rng = CommonRange(70, 90, "1" * 8, score=0.9, field_type="Data")

        container = CommonRangeContainer(
            [length_rng, address_rng, seq_rng, type_rng, data_rng])
        result = FormatFinder.handle_overlapping_conflict([container])

        self.assertEqual(len(result), 1)
        resolved = result[0]
        self.assertEqual(len(resolved), 3)
        for survivor in (length_rng, seq_rng, data_rng):
            self.assertIn(survivor, resolved)
Exemplo n.º 4
0
    def test_ensure_not_overlaps(self):
        """Check ensure_not_overlaps for disjoint, partial, full and inner overlaps."""
        test_range = CommonRange(start=4, length=8, value="12345678")
        self.assertEqual(test_range.end, 11)

        # Intervals left and right of the range leave it untouched
        for other_start, other_end in ((0, 3), (20, 24)):
            self.assertEqual(
                test_range,
                test_range.ensure_not_overlaps(other_start, other_end)[0])

        # Overlap on the left side cuts off the beginning
        remainder = test_range.ensure_not_overlaps(2, 6)[0]
        self.assertEqual(remainder.start, 6)
        self.assertEqual(remainder.end, 11)

        # Overlap on the right side cuts off the end
        remainder = test_range.ensure_not_overlaps(6, 14)[0]
        self.assertEqual(remainder.start, 4)
        self.assertEqual(remainder.end, 5)

        # A fully covered range leaves nothing behind
        self.assertEqual(len(test_range.ensure_not_overlaps(3, 14)), 0)

        # An overlap in the middle splits the range into two parts
        parts = test_range.ensure_not_overlaps(6, 9)
        self.assertEqual(len(parts), 2)
        left, right = parts[0], parts[1]
        self.assertEqual(left.start, 4)
        self.assertEqual(left.end, 5)
        self.assertEqual(right.start, 10)
        self.assertEqual(right.end, 11)
Exemplo n.º 5
0
    def score_ranges(common_ranges_by_length: dict, n_gram_length: int):
        """
        Score the common ranges as candidates for a length field.

        For every window length a window is moved over the value of each
        common range in steps of ``n_gram_length``. The best scoring position
        (and byte order) is stored as a new ``CommonRange`` of field type
        "length".

        :param common_ranges_by_length: dict mapping a length to its common
                                        ranges; the length is handed to
                                        ``LengthEngine.score_bits`` as target
        :param n_gram_length: step width of the window; for 8 the little
                              endian byte order is scored as well
        :return: dict mapping length -> window length -> list of scored ranges
        """

        # The window must fit into the common range. For 8 bit n-grams the
        # common integer widths are used, otherwise the first four multiples
        # of the n-gram length so that e.g. 4 bit integers work as well.
        if n_gram_length == 8:
            window_lengths = [8, 16, 32, 64]
            byteorders = ["big", "little"]
        else:
            window_lengths = [n_gram_length * factor for factor in range(1, 5)]
            byteorders = ["big"]

        scored_ranges = {
            length: {window_length: [] for window_length in window_lengths}
            for length in common_ranges_by_length
        }

        for window_length in window_lengths:
            for length, common_ranges in common_ranges_by_length.items():
                bucket = scored_ranges[length][window_length]
                long_enough = (cr for cr in common_ranges
                               if cr.length >= window_length)

                for common_range in long_enough:
                    bits = common_range.value
                    # -1 marks "nothing scored yet"
                    best_score, best_start, best_byteorder = -1, -1, "big"

                    stop = len(bits) + 1 - window_length
                    for start in range(0, stop, n_gram_length):
                        for byteorder in byteorders:
                            score = LengthEngine.score_bits(
                                bits[start:start + window_length],
                                length,
                                position=start,
                                byteorder=byteorder)

                            # Strict comparison: the first best hit wins on a tie
                            if score > best_score:
                                best_score = score
                                best_start = start
                                best_byteorder = byteorder

                    best_end = best_start + window_length
                    bucket.append(
                        CommonRange(
                            common_range.start + best_start,
                            window_length,
                            common_range.value[best_start:best_end],
                            score=best_score,
                            field_type="length",
                            message_indices=common_range.message_indices,
                            range_type=common_range.range_type,
                            byte_order=best_byteorder))

        return scored_ranges
Exemplo n.º 6
0
    def find_common_ranges(self, alpha=0.95, range_type="bit"):
        """
        Find runs of adjacent positions whose value in ``self.data`` reaches alpha.

        Only runs of at least two consecutive positions become a common range.

        :param alpha: minimum value a position needs to be part of a range
        :param range_type: one of bit/hex/byte, passed on to the created ranges
        :return: list of CommonRange, one per run
        """
        hits = np.argwhere(self.data >= alpha).flatten()

        if len(hits) < 2:
            return []

        # Cut the hit positions wherever two neighbours are not adjacent,
        # so every piece is one run of consecutive positions
        cut_points = np.flatnonzero(np.diff(hits) != 1) + 1

        result = []
        for run in np.split(hits, cut_points):
            length = len(run)
            if length < 2:
                continue

            start = run[0]
            value = self.__get_value_for_common_range(start, length)
            result.append(
                CommonRange(start,
                            length,
                            value,
                            message_indices=set(self.__active_indices),
                            range_type=range_type))

        return result
Exemplo n.º 7
0
    def get_preamble_and_sync(preamble_starts, preamble_lengths, sync_ends,
                              message_type_indices):
        """
        Create the preamble and synchronization ranges for the given messages.

        Messages with an equal preamble (or sync) range share one CommonRange
        whose message indices are extended. Ranges of length zero or less are
        not added.

        :type preamble_starts: np.ndarray
        :type preamble_lengths: np.ndarray
        :type sync_ends: np.ndarray
        :type message_type_indices: list
        :rtype: set of CommonRange
        """
        assert len(preamble_starts) == len(preamble_lengths) == len(sync_ends)

        result = set()  # type: set[CommonRange]

        def merge_or_add(start, length, field_type, msg_index):
            # Extend an equal range that is already known, otherwise add a new one
            candidate = CommonRange(start,
                                    length,
                                    field_type=field_type,
                                    message_indices={msg_index})
            known = next((rng for rng in result if candidate == rng), None)
            if known is not None:
                known.message_indices.add(msg_index)
            elif length > 0:
                result.add(candidate)

        for i in message_type_indices:
            merge_or_add(preamble_starts[i], preamble_lengths[i], "preamble", i)

            # The synchronization lies between preamble end and sync end
            preamble_end = preamble_starts[i] + preamble_lengths[i]
            merge_or_add(preamble_end, sync_ends[i] - preamble_end,
                         "synchronization", i)

        return result
Exemplo n.º 8
0
    def test_create_message_types_1(self):
        """Two ranges with identical message indices end up in one container."""
        length_rng = CommonRange(0, 8, "1" * 8, score=1, field_type="Length")
        length_rng.message_indices = {0, 1, 2}
        address_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Address")
        address_rng.message_indices = {0, 1, 2}

        message_types = FormatFinder.create_common_range_containers(
            {length_rng, address_rng})
        self.assertEqual(len(message_types), 1)

        only_container = message_types[0]
        self.assertEqual(
            only_container,
            CommonRangeContainer([length_rng, address_rng],
                                 message_indices={0, 1, 2}))
Exemplo n.º 9
0
    def test_handle_easy_overlapping_conflict(self):
        """Two ranges at the same position: the higher scored one is kept."""
        # Easy conflict: the first label has the higher score
        winner = CommonRange(8, 8, "1" * 8, score=1, field_type="Length")
        winner.message_indices = {0, 1, 2}
        loser = CommonRange(8, 8, "1" * 8, score=0.8, field_type="Address")
        loser.message_indices = {0, 1, 2}

        container = CommonRangeContainer([winner, loser],
                                         message_indices={0, 1, 2})
        result = FormatFinder.handle_overlapping_conflict([container])

        self.assertEqual(len(result), 1)
        resolved = result[0]
        self.assertEqual(len(resolved), 1)
        self.assertIn(winner, resolved)
        self.assertEqual(resolved.message_indices, {0, 1, 2})
Exemplo n.º 10
0
    def test_handle_no_overlapping_conflict(self):
        """Ranges that do not overlap are both kept by the conflict handling."""
        length_rng = CommonRange(0, 8, "1" * 8, score=1, field_type="Length")
        length_rng.message_indices = {0, 1, 2}
        address_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Address")
        address_rng.message_indices = {0, 1, 2}

        container = CommonRangeContainer([length_rng, address_rng],
                                         message_indices={0, 1, 2})

        # No conflict, so the container is expected to stay as it is
        result = FormatFinder.handle_overlapping_conflict([container])
        self.assertEqual(len(result), 1)

        resolved = result[0]
        self.assertEqual(len(resolved), 2)
        self.assertIn(length_rng, resolved)
        self.assertEqual(resolved.message_indices, {0, 1, 2})
        self.assertIn(address_rng, resolved)
Exemplo n.º 11
0
    def choose_high_scored_ranges(self, scored_ranges: dict,
                                  bitvectors_by_n_gram_length: dict,
                                  minimum_score: float):
        """
        Choose the best scored range per length cluster for one window length.

        Note that ``scored_ranges`` is modified in place: every list of ranges
        is replaced by its best range reaching ``minimum_score``, or by None
        if there is no such range.

        :param scored_ranges: dict mapping length -> window length -> ranges
        :param bitvectors_by_n_gram_length: dict mapping length -> message indices
        :param minimum_score: score a range must reach to be considered
        :return: dict mapping length -> chosen range; empty if nothing qualifies
        """

        # Keep only the best qualifying range per window length and count
        # for each window length how often such a range exists
        hits_per_window_length = defaultdict(int)
        for ranges_by_window_length in scored_ranges.values():
            for window_length, ranges in ranges_by_window_length.items():
                best = max(
                    (rng for rng in ranges if rng.score >= minimum_score),
                    key=lambda rng: rng.score,
                    default=None)
                ranges_by_window_length[window_length] = best
                if best is not None:
                    hits_per_window_length[window_length] += 1

        if not hits_per_window_length:
            return dict()

        # The window length with a result most often wins, the greater one on a tie
        chosen_window_length = max(
            hits_per_window_length,
            key=lambda window_length:
            (hits_per_window_length[window_length], window_length))

        # Take the best range of the chosen window length for every cluster that has one
        high_scores_by_length = dict()
        for length, ranges_by_window_length in scored_ranges.items():
            try:
                chosen = ranges_by_window_length[chosen_window_length]
            except KeyError:
                continue

            if chosen:
                high_scores_by_length[length] = chosen

        # Clusters with a single message: try the positions found for the other
        # clusters and keep the one that scores best on this message
        for length, msg_indices in bitvectors_by_n_gram_length.items():
            if len(msg_indices) != 1:
                continue

            msg_index = msg_indices[0]
            bitvector = self.bitvectors[msg_index]

            best_match, max_score = None, 0
            for rng in high_scores_by_length.values():
                bits = bitvector[rng.start:rng.end + 1]
                if len(bits) == 0:
                    continue

                score = self.score_bits(bits, length, rng.start)
                if score > max_score:
                    best_match, max_score = rng, score

            if best_match is None:
                continue

            high_scores_by_length[length] = CommonRange(
                best_match.start,
                best_match.length,
                value=bitvector[best_match.start:best_match.end + 1],
                score=max_score,
                field_type="length",
                message_indices={msg_index},
                range_type="bit")

        return high_scores_by_length
Exemplo n.º 12
0
    def _py_find_field(self, messages, verbose=False):
        """
        Search for address fields and add matching labels to the message types.

        Step 1 collects, per participant, bit ranges that are equal between two
        messages according to the XOR matrix. Step 2 scores these ranges with
        ``find_candidates`` and keeps the parts containing the high scored
        candidate pair. Crossmatching ranges decide which messages belong to
        the "default" and "ack" clusters, afterwards an auto created label is
        added for the candidate ranges.

        :type messages: list of urh.signalprocessing.Message.Message
        :param verbose: print intermediate results to stdout
        :return: None; returns early without labeling if no candidate pair
                 is found or its two values differ in length
        """
        msg_indices_per_participant = defaultdict(list)
        """:type : dict[urh.signalprocessing.Participant.Participant, list[int]] """

        for i, msg in enumerate(messages):
            msg_indices_per_participant[msg.participant].append(i)


        # Cluster participants
        equal_ranges_per_participant = defaultdict(list)
        """:type : dict[urh.signalprocessing.Participant.Participant, list[CommonRange]] """

        # Found ranges are rounded down to multiples of this many bits
        alignment = 8

        # Step 1: Find equal ranges for participants by evaluating the XOR matrix participant wise
        for participant, participant_msg_indices in msg_indices_per_participant.items():
            for i, msg_index in enumerate(participant_msg_indices):
                msg = messages[msg_index]
                bitvector_str = msg.decoded_bits_str

                # Compare each message only with the later ones, so every pair is visited once
                for other_index in participant_msg_indices[i+1:]:
                    other_msg = messages[other_index]
                    xor_vec = self.xor_matrix[msg_index, other_index][self.xor_matrix[msg_index, other_index] != -1] # -1 = End of Vector

                    # addresses are searched across message types, as we assume them to be in almost every message
                    # therefore we need to consider message types of both messages we compare and ignore already labeled areas
                    unlabeled_ranges = msg.message_type.unlabeled_ranges_with_other_mt(other_msg.message_type)
                    for rng_start, rng_end in unlabeled_ranges:
                        start = 0
                        # The last 1 marks end of sequence, and prevents swallowing long zero sequences at the end
                        cmp_vector = np.append(xor_vec[rng_start:rng_end], 1)
                        # Every 1 in the XOR vector is a differing bit and ends a run of equal bits
                        for end in np.where(cmp_vector == 1)[0]:
                            if end - start >= self.MIN_ADDRESS_LENGTH:
                                equal_range_start = alignment * ((rng_start + start) // alignment)
                                equal_range_end = alignment * ((rng_start + end) // alignment)
                                bits = bitvector_str[equal_range_start:equal_range_end]

                                # Did we already find this range?
                                cr = next((cr for cr in equal_ranges_per_participant[participant] if
                                          cr.start == equal_range_start and cr.end == equal_range_end
                                          and cr.bits == bits), None)

                                # If not: Create it
                                if cr is None:
                                    cr = CommonRange(equal_range_start, equal_range_end, bits)
                                    equal_ranges_per_participant[participant].append(cr)

                                cr.messages.add(msg_index)
                                cr.messages.add(other_index)

                            start = end + alignment

        if verbose:
            print(constants.color.BOLD + "Result after Step 1" +constants.color.END)
            self.__print_ranges(equal_ranges_per_participant)

        # Step 2: Now we want to find our address candidates.
        # We do this by weighting them in order of LCS they share with each other
        scored_candidates = self.find_candidates([cr for crl in equal_ranges_per_participant.values() for cr in crl])
        """:type : dict[str, int] """

        # Without a candidate pair of equal length there is nothing to label
        try:
            highscored = next(self.choose_candidate_pair(scored_candidates))
            assert len(highscored[0]) == len(highscored[1])
        except (StopIteration, AssertionError):
            return

        if verbose:
            print(scored_candidates)
            print(sorted(scored_candidates, key=scored_candidates.get, reverse=True))

        # Now get the common_ranges we need
        scored_candidates_per_participant = defaultdict(list)
        """:type : dict[urh.signalprocessing.Participant.Participant, list[CommonRange]] """

        for participant, ranges in equal_ranges_per_participant.items():
            for equal_range in ranges:
                for h in highscored:
                    rng = equal_range.pos_of_hex(h)
                    if rng is not None:
                        # Cut the equal range down to the part that holds the candidate value
                        start, end = rng
                        bits = equal_range.bits[start:end]
                        rel_start = equal_range.start + start
                        rel_end = rel_start + (end - start)
                        cr = next((cr for cr in scored_candidates_per_participant[participant] if cr.start == rel_start
                                                                                               and cr.end == rel_end and
                                                                                               cr.bits == bits), None)
                        if cr is None:
                            cr = CommonRange(rel_start, rel_end, bits)
                            scored_candidates_per_participant[participant].append(cr)

                        cr.messages.update(equal_range.messages)

        # Now we have the highscored ranges per participant
        # If there is a crossmatch of the ranges we are good and found the addresses!
        # We have something like:
        #
        # Participant: Alice (A):                               Participant: Bob (B):
        # =======================                               =====================
        #
        # Range    Value     Messages                           Range      Value     Messages
        # -----    -----     --------                           -----      -----     --------
        # 72-96    1b6033    {1, 5, 9, 13, 17, 20}              72-96      78e289    {11, 3, 15, 7}
        # 88-112   1b6033    {2, 6, 10, 14, 18}                 88-112     78e289    {4, 8, 12, 16, 19}
        # 112-136  78e289    {2, 6, 10, 14, 18}                 112-136    1b6033    {0, 4, 8, 12, 16, 19}
        #

        # If the value doubles for the same participant in other range, then we need to create a new message type
        # We consider the default case (=default message type) to have addresses followed by each other
        # Furthermore, we assume if there is only one address per message type, it is the destination address
        clusters = {"default": defaultdict(set), "ack": defaultdict(set)}
        """:type: dict[str, dict[tuple[int.int],set[int]]]"""

        all_candidates = [cr for crl in scored_candidates_per_participant.values() for cr in crl]
        # Check for crossmatch and cluster into adjacent and split addresses
        # Perform a merge by only saving the ranges and applying messages
        for candidate in sorted(all_candidates):
            if any(c.start == candidate.start and c.end == candidate.end and c.bits != candidate.bits for c in all_candidates):
                # Crossmatch! This is an address
                # A directly neighbouring candidate range puts it into the default cluster, otherwise into ack
                if any(c.start == candidate.end or c.end == candidate.start for c in all_candidates):
                     clusters["default"][(candidate.start, candidate.end)].update(candidate.messages)
                else:
                    clusters["ack"][(candidate.start, candidate.end)].update(candidate.messages)

        # Flatten the clusters to the set of message indices per cluster name
        msg_clusters =  {cname: set(i for s in ranges.values() for i in s) for cname, ranges in clusters.items()}

        # If there are no addresses in default message type prevent evaluating everything as ACK
        if not msg_clusters["default"]:
            msg_clusters["ack"] = set()
            scored_candidates_per_participant.clear()

        self.assign_messagetypes(messages, msg_clusters)

        # Now try to find the addresses of the participants to separate SRC and DST address later
        self.assign_participant_addresses(messages, list(scored_candidates_per_participant.keys()), highscored)

        for participant, ranges in scored_candidates_per_participant.items():
            for rng in ranges:
                for msg_index in rng.messages:
                    msg = messages[msg_index]

                    # Messages of type "ack" always get the destination label
                    if msg.message_type.name == "ack":
                       field_type = self.dst_field_type
                       name = self.dst_field_name
                    elif msg.participant:
                        # The sender's own address is the source, any other value the destination
                        if rng.hex_value == msg.participant.address_hex:
                            name = self.src_field_name
                            field_type = self.src_field_type
                        else:
                            name = self.dst_field_name
                            field_type = self.dst_field_type
                    else:
                        name = "Address"
                        field_type = None

                    # Add at most one auto created label per name to a message type
                    if not any(lbl.name == name and lbl.auto_created for lbl in msg.message_type):
                        msg.message_type.add_protocol_label(rng.start, rng.end - 1, name=name,
                                                            auto_created=True, type=field_type)
Exemplo n.º 13
0
    def find(self):
        """
        Find address fields and separate them into source and destination.

        Known and newly found addresses are searched in all message vectors.
        The resulting common ranges are scored by cross checking the ranges of
        different participants, filtered by the estimated address length and
        finally marked as "source address" or "destination address".

        :return: flat list with the high scored ranges of all participants
        """
        # ndarray.tostring() is deprecated since NumPy 1.19 and was removed in
        # NumPy 2.0; tobytes() returns the same bytes
        addresses_by_participant = {
            p: [addr.tobytes()]
            for p, addr in self.known_addresses_by_participant.items()
        }
        addresses_by_participant.update(self.find_addresses())
        self._debug("Addresses by participant", addresses_by_participant)

        # Find the address candidates by participant in messages
        ranges_by_participant = defaultdict(
            list)  # type: dict[int, list[CommonRange]]

        addresses = [
            np.array(np.frombuffer(a, dtype=np.uint8))
            for address_list in addresses_by_participant.values()
            for a in address_list
        ]

        already_labeled_cols = array(
            "L", [e for rng in self.already_labeled for e in range(*rng)])

        # Find occurrences of address candidates in messages and create common ranges over matching positions
        for i, msg_vector in enumerate(self.msg_vectors):
            participant = self.participant_indices[i]
            for address in addresses:
                for index in awre_util.find_occurrences(
                        msg_vector, address, already_labeled_cols):
                    common_ranges = ranges_by_participant[participant]
                    rng = next((cr for cr in common_ranges
                                if cr.matches(index, address)),
                               None)  # type: CommonRange
                    if rng is not None:
                        rng.message_indices.add(i)
                    else:
                        common_ranges.append(
                            CommonRange(index,
                                        len(address),
                                        address,
                                        message_indices={i},
                                        range_type="hex"))

        # Number of messages per participant, used to normalize the scores below
        num_messages_by_participant = defaultdict(int)
        for participant in self.participant_indices:
            num_messages_by_participant[participant] += 1

        # Look for cross swapped values between participant clusters
        for p1, p2 in itertools.combinations(ranges_by_participant, 2):
            ranges1_set, ranges2_set = set(ranges_by_participant[p1]), set(
                ranges_by_participant[p2])

            for rng1, rng2 in itertools.product(ranges_by_participant[p1],
                                                ranges_by_participant[p2]):
                if rng1 in ranges2_set and rng2 in ranges1_set:
                    if self.cross_swap_check(rng1, rng2):
                        rng1.score += len(rng2.message_indices
                                          ) / num_messages_by_participant[p2]
                        rng2.score += len(rng1.message_indices
                                          ) / num_messages_by_participant[p1]
                    elif self.ack_check(rng1, rng2):
                        # Add previous score in divisor to add bonus to ranges that apply to all messages
                        rng1.score += len(rng2.message_indices) / (
                            num_messages_by_participant[p2] + rng1.score)
                        rng2.score += len(rng1.message_indices) / (
                            num_messages_by_participant[p1] + rng2.score)

        if len(ranges_by_participant) == 1 and not self.src_field_present:
            for p, ranges in ranges_by_participant.items():
                for rng in sorted(ranges):
                    try:
                        if np.array_equal(
                                rng.value,
                                self.known_addresses_by_participant[p]):
                            # Only one participant in this iteration and address already known -> Highscore
                            rng.score = 1
                            break  # Take only the first (leftmost) range
                    except KeyError:
                        # No known address for this participant
                        pass

        high_scored_ranges_by_participant = defaultdict(list)

        address_length = self.__estimate_address_length(ranges_by_participant)

        # Get highscored ranges by participant
        for participant, common_ranges in ranges_by_participant.items():
            # Sort by negative score so ranges with highest score appear first
            # Secondary sort by tuple to ensure order when ranges have same score
            sorted_ranges = sorted(filter(
                lambda cr: cr.score > self.minimum_score, common_ranges),
                                   key=lambda cr: (-cr.score, cr))
            if len(sorted_ranges) == 0:
                addresses_by_participant[participant] = dict()
                continue

            # Keep only the addresses that have the estimated address length
            addresses_by_participant[participant] = {
                a
                for a in addresses_by_participant.get(participant, [])
                if len(a) == address_length
            }

            for rng in filter(lambda r: r.length == address_length,
                              sorted_ranges):
                # The bonuses above may push the score over 1
                rng.score = min(rng.score, 1.0)
                high_scored_ranges_by_participant[participant].append(rng)

        # Now we find the most probable address for all participants
        self.__assign_participant_addresses(addresses_by_participant,
                                            high_scored_ranges_by_participant)

        # Eliminate participants for which we could not assign an address
        for participant, address in addresses_by_participant.copy().items():
            if address is None:
                del addresses_by_participant[participant]

        # Now we can separate SRC and DST
        for participant, ranges in high_scored_ranges_by_participant.items():
            try:
                address = addresses_by_participant[participant]
            except KeyError:
                high_scored_ranges_by_participant[participant] = []
                continue

            result = []

            for rng in sorted(ranges, key=lambda r: r.score, reverse=True):
                # A range holding the participant's own address is the source
                rng.field_type = "source address" if rng.value.tobytes(
                ) == address else "destination address"
                if len(result) == 0:
                    result.append(rng)
                else:
                    subset = next(
                        (r for r in result
                         if rng.message_indices.issubset(r.message_indices)),
                        None)
                    if subset is not None:
                        if rng.field_type == subset.field_type:
                            # Avoid adding same address type twice
                            continue

                        if rng.length != subset.length or (
                                rng.start != subset.end + 1
                                and rng.end + 1 != subset.start):
                            # Ensure addresses are next to each other
                            continue

                    result.append(rng)

            high_scored_ranges_by_participant[participant] = result

        self.__find_broadcast_fields(high_scored_ranges_by_participant,
                                     addresses_by_participant)

        result = [
            rng for ranges in high_scored_ranges_by_participant.values()
            for rng in ranges
        ]
        # If we did not find a SRC address, lower the score a bit,
        # so DST fields do not win later e.g. again length fields in case of tie
        if not any(rng.field_type == "source address" for rng in result):
            for rng in result:
                rng.score *= 0.95

        return result
Exemplo n.º 14
0
    def find(self):
        """
        Search the bitvectors for sequence number fields.

        Every column of the difference matrix gets a score. Columns reaching
        the minimum score become a range of field type "sequence number".
        A column directly next to an existing range with the same message
        indices extends that range instead, which also settles its byte order.

        :return: list of CommonRange that hold more than two different values
        """
        n = self.n_gram_length

        if len(self.bitvectors) < 3:
            # We need at least 3 bitvectors to properly find a sequence number
            return []

        diff_matrix = self.create_difference_matrix(self.bitvectors,
                                                    self.n_gram_length)

        def count_differences(column):
            # Map every difference in this column to its number of occurrences
            unique, counts = np.unique(diff_matrix[:, column],
                                       return_counts=True)
            return dict(zip(unique, counts))

        diff_frequencies_by_column = {
            column: count_differences(column)
            for column in range(diff_matrix.shape[1])
        }
        self._debug("Diff_frequencies_by_column", diff_frequencies_by_column)

        # Columns that are already labeled get the score 0
        scores_by_column = {
            column: 0 if column in self.already_labeled_cols
            else self.calc_score(frequencies)
            for column, frequencies in diff_frequencies_by_column.items()
        }
        self._debug("Scores by column", scores_by_column)

        result = []
        ranked_columns = sorted(scores_by_column,
                                key=scores_by_column.get,
                                reverse=True)
        for candidate_column in ranked_columns:
            score = scores_by_column[candidate_column]
            if score < self.minimum_score:
                continue

            most_common_diff = self.get_most_frequent(
                diff_frequencies_by_column[candidate_column])
            column_diffs = diff_matrix[:, candidate_column]
            # All rows that have the most common difference or zero
            rows = np.flatnonzero((column_diffs == most_common_diff)
                                  | (column_diffs == 0))

            # For example, index 1 in diff matrix corresponds to index 1 and 2 of messages
            message_indices = set(rows) | set(rows + 1)

            first_bit = candidate_column * n
            last_bit = (candidate_column + 1) * n
            values = {
                self.bitvectors[i][first_bit:last_bit].tobytes()
                for i in message_indices
            }

            matching_ranges = [
                r for r in result if r.message_indices == message_indices
            ]

            # A range starting one column before this one grows to the right (big endian)
            left_neighbour = next(
                (r for r in matching_ranges
                 if r.start == (candidate_column - 1) * n and (
                     r.byte_order_is_unknown or r.byte_order == "big")),
                None)
            if left_neighbour is not None:
                left_neighbour.length += n
                left_neighbour.byte_order = "big"
                left_neighbour.values.extend(values)
                continue

            # A range starting one column after this one grows to the left (little endian)
            right_neighbour = next(
                (r for r in matching_ranges
                 if r.start == (candidate_column + 1) * n and (
                     r.byte_order_is_unknown or r.byte_order == "little")),
                None)
            if right_neighbour is not None:
                right_neighbour.start -= n
                right_neighbour.length += n
                right_neighbour.byte_order = "little"
                right_neighbour.values.extend(values)
                continue

            new_range = CommonRange(start=candidate_column * n,
                                    length=n,
                                    score=score,
                                    field_type="sequence number",
                                    message_indices=message_indices,
                                    byte_order=None)
            new_range.values.extend(values)
            result.append(new_range)

        # At least three different values needed to reliably identify a sequence number
        return [rng for rng in result if len(set(rng.values)) > 2]