Пример #1
0
    def _prunePairs(self, possible_pairs):
        """Keep only the pairs that are the unambiguous best on both sides.

        Every pair is read by index: ``pair[0]`` and ``pair[1]`` are the two
        paired items and ``pair[2]`` is the value used to rank pairs.

        Two passes are made.  The first groups ``possible_pairs`` by
        ``pair[0]``; the second groups the survivors of the first pass by
        ``pair[1]``.  Within a group only the pair with the strictly highest
        ``pair[2]`` survives.  When the highest value is shared by two
        different pairs, the group contributes nothing.

        ``possible_pairs`` is iterated repeatedly, so it must be a
        re-iterable collection such as a list.

        Returns the survivors of both passes, de-duplicated by ``_uniq``.
        """

        def best_per_group(pairs, key_index):
            # One pruning pass: pairs sharing pair[key_index] compete on
            # pair[2].  A winner is recorded once per member of its group;
            # equal pairs never compete with each other, and the final
            # _uniq() call collapses the repeats.
            winners = []
            for start in pairs:
                best = start
                tied = False
                for other in pairs:
                    if best != other and best[key_index] == other[key_index]:
                        if best[2] < other[2]:
                            # A strictly better pair makes earlier ties
                            # irrelevant.
                            best = other
                            tied = False
                        elif best[2] == other[2]:
                            tied = True
                if not tied:
                    winners.append(best)
            return winners

        by_first = best_per_group(possible_pairs, 0)
        return _uniq(best_per_group(by_first, 1))
Пример #2
0
    def digestSCP(self, removed_set, added_set):
        """Pair each removed block with its most similar added block.

        Coverage noted by the original author: renames yes, merges no,
        splits not handled, clones yes.

        Every block of ``removed_set`` is scored against every block of
        ``added_set`` with ``SequenceMatcher(...).ratio()``.  The sub_blocks
        are compared when both blocks have more than two of them, otherwise
        the block texts are compared.  Combinations with a ratio of 0.0 are
        skipped.  Equal ratios are passed to ``self._tiebreaker``, first
        with the block names and then with the ``str()`` forms.

        The best pair found for a removed block becomes a candidate unless
        it is itself listed among the unresolved ties.  Recorded ties are
        printed to stdout.

        Returns the result of ``self._prunePairs`` applied to the
        ``_uniq``-ed candidate list of ``(removed, added, ratio)`` tuples.
        """
        candidates = []
        for removed in removed_set:
            # Start from scratch for every removed block.
            best = None
            ties = []
            for added in added_set:
                # Pairing of blocks with only a few sub_blocks (1-3) will
                # be fairly inaccurate.
                use_sub_blocks = (
                    removed.has_sub_blocks
                    and added.has_sub_blocks
                    and len(removed.sub_blocks) > 2
                    and len(added.sub_blocks) > 2
                )
                if use_sub_blocks:
                    removed_seq = removed.sub_blocks
                    added_seq = added.sub_blocks
                else:
                    removed_seq = removed.text
                    added_seq = added.text

                matcher = SequenceMatcher(None, removed_seq, added_seq)
                score = matcher.ratio()
                if score == 0.0:
                    continue

                candidate = (removed, added, score)
                if best is None or score > best[2]:
                    best = candidate
                    ties = []
                elif score == best[2]:
                    # Tie breaker needed: compare the names first, then
                    # fall back to the string forms of the blocks.
                    verdict = self._tiebreaker(removed.name, added.name,
                                               best[1].name)
                    if verdict == 0:
                        verdict = self._tiebreaker(str(removed), str(added),
                                                   str(best[1]))

                    if verdict == 0:
                        ties.extend((candidate, best))
                    elif verdict == 1:
                        best = candidate

            # NOTE: open question from the original author -- once a pair
            # has been found, should the added block be removed from the
            # list of possibilities?
            if best is not None and best not in ties:
                candidates.append(best)
            if ties:
                print('------------')
                for tied in ties:
                    print('tiebreaker needed: %s, %s, %s' % tied)
                print('------------')

        return self._prunePairs(_uniq(candidates))