Пример #1
0
    def chunks(self) -> Generator[types.Chunk, None, None]:
        """Yield individual diffs from the repository's history.

        When ``git_options.branch`` is set, only the branch with that name is
        scanned; otherwise every branch in ``self._repo.branches`` is. If
        ``git_options.fetch`` is set, the ``origin`` remote is fetched first.
        A given (previous, current) commit pair is diffed at most once, even
        when it is reachable from several branches.

        :rtype: Generator[Chunk, None, None]
        :raises types.GitRemoteException: If there was an error fetching branches
        """
        already_searched: Set[bytes] = set()

        try:
            if self.git_options.branch:
                # Single branch only
                if self.git_options.fetch:
                    self._repo.remotes.origin.fetch(self.git_options.branch)
                # Branch entries are git reference objects, which never
                # compare equal to a plain string, so match on the reference
                # name (falling back to the entry itself if it has no name).
                branches = [
                    x for x in self._repo.branches
                    if getattr(x, "name", x) == self.git_options.branch
                ]
            else:
                # Everything
                if self.git_options.fetch:
                    self._repo.remotes.origin.fetch()
                branches = list(self._repo.branches)
        except git.GitCommandError as exc:
            raise types.GitRemoteException(exc.stderr.strip()) from exc

        for branch in branches:
            diff_index: git.DiffIndex
            diff_hash: bytes
            curr_commit: git.Commit = None
            prev_commit: git.Commit = None
            for curr_commit, prev_commit in self._iter_branch_commits(
                    self._repo, branch):
                # Check for duplicates before diffing: building the patch is
                # the expensive step and is pointless for a pair already seen.
                diff_hash = hashlib.md5(
                    (str(prev_commit) +
                     str(curr_commit)).encode("utf-8")).digest()
                if diff_hash in already_searched:
                    continue
                already_searched.add(diff_hash)
                diff_index = curr_commit.diff(prev_commit, create_patch=True)
                for blob, file_path in self._iter_diff_index(diff_index):
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(prev_commit, branch),
                    )

            # Finally, yield the first commit to the branch
            if curr_commit:
                diff = curr_commit.diff(git.NULL_TREE, create_patch=True)
                for blob, file_path in self._iter_diff_index(diff):
                    # This diff is built from curr_commit's tree alone, so the
                    # chunk is attributed to curr_commit.
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(curr_commit, branch),
                    )
Пример #2
0
 def test_scan_exits_gracefully_when_remote_fetch_fails(
         self, mock_scanner: mock.MagicMock):
     """A remote fetch failure yields a non-zero exit and a clear message.

     The mocked scanner's ``scan`` raises ``GitRemoteException``; the CLI is
     expected to report it on its output instead of crashing.
     """
     fetch_error = types.GitRemoteException("Fetch failed!")
     mock_scanner.return_value.scan.side_effect = fetch_error
     expected_output = (
         "There was an error fetching from the remote repository: Fetch failed!\n"
     )

     cli_runner = CliRunner()
     with cli_runner.isolated_filesystem():
         outcome = cli_runner.invoke(cli.main, ["scan-local-repo", "."])

     self.assertGreater(outcome.exit_code, 0)
     self.assertEqual(outcome.output, expected_output)
Пример #3
0
def clone_git_repo(git_url: str,
                   target_dir: Optional[pathlib.Path] = None) -> pathlib.Path:
    """Clone a remote git repository and return its filesystem path.

    If no target directory is given, a new temporary directory is created to
    hold the clone. When the clone does not succeed, that temporary directory
    is removed again so a failed call leaves nothing behind; a caller-supplied
    ``target_dir`` is never deleted.

    :param git_url: The URL of the git repository to be cloned
    :param target_dir: Where to clone the repository to
    :returns: Filesystem path of the local clone
    :raises types.GitRemoteException: If there was an error cloning the repository
    """
    import shutil  # local import: only needed to clean up after a failure

    created_temp_dir = not target_dir
    if created_temp_dir:
        project_path = tempfile.mkdtemp()
    else:
        project_path = str(target_dir)

    cloned = False
    try:
        git.Repo.clone_from(git_url, project_path)
        cloned = True
    except git.GitCommandError as exc:
        raise types.GitRemoteException(exc.stderr.strip()) from exc
    finally:
        # Only remove directories this function created itself.
        if created_temp_dir and not cloned:
            shutil.rmtree(project_path, ignore_errors=True)
    return pathlib.Path(project_path)
Пример #4
0
def clone_git_repo(
        git_url: str,
        target_dir: Optional[pathlib.Path] = None) -> Tuple[pathlib.Path, str]:
    """Clone a remote git repository and return its filesystem path.

    If no target directory is given, a new temporary directory is created to
    hold the clone. When the clone does not succeed, that temporary directory
    is removed again so a failed call leaves nothing behind; a caller-supplied
    ``target_dir`` is never deleted.

    :param git_url: The URL of the git repository to be cloned
    :param target_dir: Where to clone the repository to
    :returns: Filesystem path of local clone and name of remote source
    :raises types.GitRemoteException: If there was an error cloning the repository
    """
    import shutil  # local import: only needed to clean up after a failure

    created_temp_dir = not target_dir
    if created_temp_dir:
        project_path = tempfile.mkdtemp()
    else:
        project_path = str(target_dir)

    cloned = False
    try:
        repo = git.Repo.clone_from(git_url, project_path)
        # The name of the first remote of the new clone is reported back
        # as the remote source.
        origin = repo.remotes[0].name
        cloned = True
    except git.GitCommandError as exc:
        raise types.GitRemoteException(exc.stderr.strip()) from exc
    finally:
        # Only remove directories this function created itself.
        if created_temp_dir and not cloned:
            shutil.rmtree(project_path, ignore_errors=True)
    return pathlib.Path(project_path), origin
Пример #5
0
    def chunks(self) -> Generator[types.Chunk, None, None]:
        """Yield individual diffs from the repository's history.

        When ``git_options.branch`` is set, only that branch is scanned. For a
        shallow clone only ``HEAD`` is examined, as a single commit. Otherwise
        every entry of ``self._repo.branches`` is walked in topological order.
        A given (parent, commit) pair is diffed at most once, even when it is
        reachable from several branches.

        :raises BranchNotFoundException: If ``git_options.branch`` is set but
            the repository has no branch of that name
        :raises types.GitRemoteException: If pygit2 reports an error while the
            branches to scan are being determined
        """
        already_searched: Set[bytes] = set()

        try:
            if self.git_options.branch:
                # Single branch only
                branch = self._repo.branches.get(self.git_options.branch)
                if not branch:
                    raise BranchNotFoundException(
                        f"Branch {self.git_options.branch} was not found.")
                branches = [self.git_options.branch]
            else:
                # Everything
                if util.is_shallow_clone(self._repo):
                    # If this is a shallow clone, examine the repo head as a single
                    # commit to scan all files at once
                    branches = ["HEAD"]
                else:
                    # We use `self._repo.branches` here so that we make sure to
                    # scan not only the locally checked out branches (as provided
                    # by self._repo.listall_branches()), but to also scan all
                    # available remote refs
                    branches = list(self._repo.branches)
        except pygit2.GitError as exc:
            raise types.GitRemoteException(str(exc)) from exc

        self.logger.debug(
            "Branches to be scanned: %s",
            ", ".join(str(name) for name in branches),
        )

        for branch_name in branches:
            self.logger.info("Scanning branch: %s", branch_name)
            if branch_name == "HEAD":
                commits = [self._repo.get(self._repo.head.target)]
            else:
                branch = self._repo.branches.get(branch_name)
                try:
                    commits = self._repo.walk(branch.resolve().target,
                                              pygit2.GIT_SORT_TOPOLOGICAL)
                except AttributeError:
                    # Raised e.g. when the lookup above returned None.
                    self.logger.debug(
                        "Skipping branch %s because it cannot be resolved.",
                        branch_name)
                    continue
            diff_hash: bytes
            curr_commit: pygit2.Commit = None
            prev_commit: pygit2.Commit = None
            for curr_commit in commits:
                try:
                    prev_commit = curr_commit.parents[0]
                except (IndexError, KeyError, TypeError):
                    # IndexError: current commit has no parents
                    # KeyError: current commit has parents which are not local
                    # TypeError: caught as well; its trigger is not evident
                    # from this method.
                    # Without a usable parent there is nothing to diff against
                    # here; the last commit walked is handled after the loop.
                    self.logger.debug(
                        "Skipping commit %s because it has no parents",
                        curr_commit.hex)
                    continue
                # Check for duplicates before diffing, so that no diff is
                # computed for a pair that has already been scanned.
                diff_hash = hashlib.md5(
                    (str(prev_commit) +
                     str(curr_commit)).encode("utf-8")).digest()
                if diff_hash in already_searched:
                    continue
                already_searched.add(diff_hash)
                diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit)
                diff.find_similar()
                for blob, file_path in self._iter_diff_index(diff):
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(curr_commit, branch_name),
                    )

            # Finally, yield the first commit to the branch
            if curr_commit:
                tree: pygit2.Tree = self._repo.revparse_single(
                    curr_commit.hex).tree
                # Diff the commit's tree against an empty tree.
                tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True)
                iter_diff = self._iter_diff_index(tree_diff)
                for blob, file_path in iter_diff:
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(curr_commit, branch_name),
                    )