def _similarity_score(obj1, obj2, block_cache=None):
    """Compute a similarity score for two objects.

    Args:
      obj1: The first object to score.
      obj2: The second object to score.
      block_cache: An optional dict of SHA to block counts to cache
        results between calls.
    Returns:
      The similarity score between the two objects, defined as the
        number of bytes in common between the two objects divided by the
        maximum size, scaled to the range 0-100.
    """
    if block_cache is None:
        block_cache = {}
    if obj1.id not in block_cache:
        block_cache[obj1.id] = _count_blocks(obj1)
    if obj2.id not in block_cache:
        block_cache[obj2.id] = _count_blocks(obj2)

    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
    max_size = max(obj1.raw_length(), obj2.raw_length())
    if not max_size:
        return _MAX_SCORE
    return int(float(common_bytes) * _MAX_SCORE / max_size)
    def _find_content_rename_candidates(self):
        candidates = self._candidates = []
        # TODO: Optimizations:
        #  - Compare object sizes before counting blocks.
        #  - Skip if delete's S_IFMT differs from all adds.
        #  - Skip if adds or deletes is empty.
        # Match C git's behavior of not attempting to find content renames if
        # the matrix size exceeds the threshold.
        if not self._should_find_content_renames():
            return

        block_cache = {}
        check_paths = self._rename_threshold is not None
        for delete in self._deletes:
            if S_ISGITLINK(delete.old.mode):
                continue  # Git links don't exist in this repo.
            old_sha = delete.old.sha
            old_obj = self._store[old_sha]
            block_cache[old_sha] = _count_blocks(old_obj)
            for add in self._adds:
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                    continue
                new_obj = self._store[add.new.sha]
                score = _similarity_score(old_obj, new_obj,
                                          block_cache=block_cache)
                if score > self._rename_threshold:
                    new_type = self._rename_type(check_paths, delete, add)
                    rename = TreeChange(new_type, delete.old, add.new)
                    candidates.append((-score, rename))
    def _find_content_rename_candidates(self):
        candidates = self._candidates = []
        # TODO: Optimizations:
        #  - Compare object sizes before counting blocks.
        #  - Skip if delete's S_IFMT differs from all adds.
        #  - Skip if adds or deletes is empty.
        # Match C git's behavior of not attempting to find content renames if
        # the matrix size exceeds the threshold.
        if not self._should_find_content_renames():
            return

        block_cache = {}
        check_paths = self._rename_threshold is not None
        for delete in self._deletes:
            if S_ISGITLINK(delete.old.mode):
                continue  # Git links don't exist in this repo.
            old_sha = delete.old.sha
            old_obj = self._store[old_sha]
            block_cache[old_sha] = _count_blocks(old_obj)
            for add in self._adds:
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                    continue
                new_obj = self._store[add.new.sha]
                score = _similarity_score(old_obj,
                                          new_obj,
                                          block_cache=block_cache)
                if score > self._rename_threshold:
                    new_type = self._rename_type(check_paths, delete, add)
                    rename = TreeChange(new_type, delete.old, add.new)
                    candidates.append((-score, rename))
Exemplo n.º 4
0
    def _find_content_renames(self):
        # TODO: Optimizations:
        #  - Compare object sizes before counting blocks.
        #  - Skip if delete's S_IFMT differs from all adds.
        #  - Skip if adds or deletes is empty.
        # Match C git's behavior of not attempting to find content renames if
        # the matrix size exceeds the threshold.
        if len(self._adds) * len(self._deletes) > self._max_files ** 2:
            return

        check_paths = self._rename_threshold is not None
        candidates = []
        for delete in self._deletes:
            if S_ISGITLINK(delete.old.mode):
                continue  # Git links don't exist in this repo.
            old_sha = delete.old.sha
            old_obj = self._store[old_sha]
            old_blocks = _count_blocks(old_obj)
            for add in self._adds:
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                    continue
                new_obj = self._store[add.new.sha]
                score = _similarity_score(old_obj, new_obj,
                                          block_cache={old_sha: old_blocks})
                if score > self._rename_threshold:
                    if check_paths and delete.old.path == add.new.path:
                        # If the paths match, this must be a split modify, so
                        # make sure it comes out as a modify.
                        new_type = CHANGE_MODIFY
                    elif delete.type != CHANGE_DELETE:
                        # If it's in deletes but not marked as a delete, it must
                        # have been added due to find_copies_harder, and needs
                        # to be marked as a copy.
                        new_type = CHANGE_COPY
                    else:
                        new_type = CHANGE_RENAME
                    rename = TreeChange(new_type, delete.old, add.new)
                    candidates.append((-score, rename))

        # Sort scores from highest to lowest, but keep names in ascending order.
        candidates.sort()

        delete_paths = set()
        add_paths = set()
        for _, change in candidates:
            new_path = change.new.path
            if new_path in add_paths:
                continue
            old_path = change.old.path
            orig_type = change.type
            if old_path in delete_paths:
                change = TreeChange(CHANGE_COPY, change.old, change.new)

            # If the candidate was originally a copy, that means it came from a
            # modified or unchanged path, so we don't want to prune it.
            if orig_type != CHANGE_COPY:
                delete_paths.add(old_path)
            add_paths.add(new_path)
            self._changes.append(change)
        self._prune(add_paths, delete_paths)
def _similarity_score(obj1, obj2, block_cache=None):
    """Compute a similarity score for two objects.

    :param obj1: The first object to score.
    :param obj2: The second object to score.
    :param block_cache: An optional dict of SHA to block counts to cache
        results between calls.
    :return: The similarity score between the two objects, defined as the
        number of bytes in common between the two objects divided by the
        maximum size, scaled to the range 0-100.
    """
    if block_cache is None:
        block_cache = {}
    if obj1.id not in block_cache:
        block_cache[obj1.id] = _count_blocks(obj1)
    if obj2.id not in block_cache:
        block_cache[obj2.id] = _count_blocks(obj2)

    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
    max_size = max(obj1.raw_length(), obj2.raw_length())
    if not max_size:
        return _MAX_SCORE
    return int(float(common_bytes) * _MAX_SCORE / max_size)
Exemplo n.º 6
0
    def _find_content_renames(self):
        # TODO: Optimizations:
        #  - Compare object sizes before counting blocks.
        #  - Skip if delete's S_IFMT differs from all adds.
        #  - Skip if adds or deletes is empty.
        # Match C git's behavior of not attempting to find content renames if
        # the matrix size exceeds the threshold.
        if len(self._adds) * len(self._deletes) > self._max_files**2:
            return

        check_paths = self._rename_threshold is not None
        candidates = []
        for delete in self._deletes:
            if S_ISGITLINK(delete.old.mode):
                continue  # Git links don't exist in this repo.
            old_sha = delete.old.sha
            old_obj = self._store[old_sha]
            old_blocks = _count_blocks(old_obj)
            for add in self._adds:
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
                    continue
                new_obj = self._store[add.new.sha]
                score = _similarity_score(old_obj,
                                          new_obj,
                                          block_cache={old_sha: old_blocks})
                if score > self._rename_threshold:
                    if check_paths and delete.old.path == add.new.path:
                        # If the paths match, this must be a split modify, so
                        # make sure it comes out as a modify.
                        new_type = CHANGE_MODIFY
                    elif delete.type != CHANGE_DELETE:
                        # If it's in deletes but not marked as a delete, it must
                        # have been added due to find_copies_harder, and needs
                        # to be marked as a copy.
                        new_type = CHANGE_COPY
                    else:
                        new_type = CHANGE_RENAME
                    rename = TreeChange(new_type, delete.old, add.new)
                    candidates.append((-score, rename))

        # Sort scores from highest to lowest, but keep names in ascending order.
        candidates.sort()

        delete_paths = set()
        add_paths = set()
        for _, change in candidates:
            new_path = change.new.path
            if new_path in add_paths:
                continue
            old_path = change.old.path
            orig_type = change.type
            if old_path in delete_paths:
                change = TreeChange(CHANGE_COPY, change.old, change.new)

            # If the candidate was originally a copy, that means it came from a
            # modified or unchanged path, so we don't want to prune it.
            if orig_type != CHANGE_COPY:
                delete_paths.add(old_path)
            add_paths.add(new_path)
            self._changes.append(change)
        self._prune(add_paths, delete_paths)