def chunks(self) -> Generator[types.Chunk, None, None]:
    """Yield individual diffs from the repository's history.

    When ``git_options.branch`` is set, only that branch is examined;
    otherwise every branch known to the repository is. If
    ``git_options.fetch`` is set, the ``origin`` remote is fetched first.
    Each pair of adjacent commits is diffed once, even when the pair is
    reachable from several branches.

    :rtype: Generator[Chunk, None, None]
    :raises types.GitRemoteException: If there was an error fetching branches
    """
    # Digests of the commit pairs that have already been diffed; shared by
    # all branches so a diff is not yielded again for another branch.
    already_searched: Set[bytes] = set()

    try:
        if self.git_options.branch:
            # Single branch only
            wanted = self.git_options.branch
            if self.git_options.fetch:
                self._repo.remotes.origin.fetch(wanted)
            # Match on the head's name as well as on the head itself:
            # a GitPython head object does not compare equal to a plain
            # string, so a string-valued option would select nothing.
            branches = [
                head
                for head in self._repo.branches
                if head == wanted or getattr(head, "name", None) == wanted
            ]
        else:
            # Everything
            if self.git_options.fetch:
                self._repo.remotes.origin.fetch()
            branches = list(self._repo.branches)
    except git.GitCommandError as exc:
        raise types.GitRemoteException(exc.stderr.strip()) from exc

    for branch in branches:
        # Both stay None when the branch yields no commit pairs, which
        # makes the "first commit" step below a no-op.
        curr_commit: git.Commit = None
        prev_commit: git.Commit = None
        for curr_commit, prev_commit in self._iter_branch_commits(
            self._repo, branch
        ):
            diff_index = curr_commit.diff(prev_commit, create_patch=True)
            # md5 is only used as a cheap de-duplication key here.
            diff_hash = hashlib.md5(
                (str(prev_commit) + str(curr_commit)).encode("utf-8")
            ).digest()
            if diff_hash in already_searched:
                continue
            already_searched.add(diff_hash)
            for blob, file_path in self._iter_diff_index(diff_index):
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(prev_commit, branch),
                )

        # Finally, yield the first commit to the branch
        if curr_commit:
            diff = curr_commit.diff(git.NULL_TREE, create_patch=True)
            for blob, file_path in self._iter_diff_index(diff):
                # The content comes from ``curr_commit`` itself, so the
                # metadata must describe that commit, not its successor.
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(curr_commit, branch),
                )
def test_scan_exits_gracefully_when_remote_fetch_fails(
    self, mock_scanner: mock.MagicMock
):
    """A failing remote fetch ends the CLI with an error code and message.

    ``mock_scanner`` stands in for the scanner class (presumably injected
    by a ``mock.patch`` decorator outside this view); its ``scan`` method
    is made to raise ``types.GitRemoteException``.
    """
    fetch_failure = types.GitRemoteException("Fetch failed!")
    mock_scanner.return_value.scan.side_effect = fetch_failure
    expected_output = (
        "There was an error fetching from the remote repository: Fetch failed!\n"
    )

    cli_runner = CliRunner()
    with cli_runner.isolated_filesystem():
        outcome = cli_runner.invoke(cli.main, ["scan-local-repo", "."])

    # Any non-zero exit code counts as a reported failure.
    self.assertGreater(outcome.exit_code, 0)
    self.assertEqual(outcome.output, expected_output)
def clone_git_repo(git_url: str,
                   target_dir: Optional[pathlib.Path] = None) -> pathlib.Path:
    """Clone a remote git repository and return its filesystem path.

    When no ``target_dir`` is given, a fresh temporary directory is created
    to hold the clone. If the clone then fails, that temporary directory is
    removed again; a caller-supplied ``target_dir`` is never deleted.

    :param git_url: The URL of the git repository to be cloned
    :param target_dir: Where to clone the repository to
    :returns: Filesystem path of the local clone
    :raises types.GitRemoteException: If there was an error cloning the repository
    """
    # Imported locally so the block does not depend on a module-level import.
    import shutil

    created_temp_dir = not target_dir
    if created_temp_dir:
        project_path = tempfile.mkdtemp()
    else:
        project_path = str(target_dir)

    try:
        git.Repo.clone_from(git_url, project_path)
    except git.GitCommandError as exc:
        if created_temp_dir:
            # Nobody else knows about this directory, so clean it up rather
            # than leaking it on every failed clone.
            shutil.rmtree(project_path, ignore_errors=True)
        raise types.GitRemoteException(exc.stderr.strip()) from exc
    return pathlib.Path(project_path)
def clone_git_repo(
    git_url: str, target_dir: Optional[pathlib.Path] = None
) -> Tuple[pathlib.Path, str]:
    """Clone a remote git repository and return its filesystem path.

    When no ``target_dir`` is given, a fresh temporary directory is created
    to hold the clone. If the clone then fails, that temporary directory is
    removed again; a caller-supplied ``target_dir`` is never deleted.

    :param git_url: The URL of the git repository to be cloned
    :param target_dir: Where to clone the repository to
    :returns: Filesystem path of local clone and name of remote source
    :raises types.GitRemoteException: If there was an error cloning the repository
    """
    # Imported locally so the block does not depend on a module-level import.
    import shutil

    created_temp_dir = not target_dir
    if created_temp_dir:
        project_path = tempfile.mkdtemp()
    else:
        project_path = str(target_dir)

    try:
        repo = git.Repo.clone_from(git_url, project_path)
        # The name of the first remote configured on the new clone.
        origin = repo.remotes[0].name
    except git.GitCommandError as exc:
        if created_temp_dir:
            # Nobody else knows about this directory, so clean it up rather
            # than leaking it on every failed clone.
            shutil.rmtree(project_path, ignore_errors=True)
        raise types.GitRemoteException(exc.stderr.strip()) from exc
    return pathlib.Path(project_path), origin
def chunks(self) -> Generator[types.Chunk, None, None]:
    """Yield individual diffs from the repository's history.

    When ``git_options.branch`` is set, only that branch is scanned. Otherwise
    every entry of ``self._repo.branches`` is scanned, except for a shallow
    clone, where the repository head is examined as a single commit.

    For each commit with a resolvable first parent, the diff against that
    parent is yielded. After a branch has been walked, the full tree of the
    last commit visited is yielded as well.

    :raises BranchNotFoundException: If the requested branch does not exist
    :raises types.GitRemoteException: If there was an error fetching branches
    """
    # Digests of the commit pairs that have already been diffed; shared by
    # all branches so a diff is not yielded again for another branch.
    already_searched: Set[bytes] = set()

    try:
        if self.git_options.branch:
            # Single branch only
            branch = self._repo.branches.get(self.git_options.branch)
            if not branch:
                raise BranchNotFoundException(
                    f"Branch {self.git_options.branch} was not found."
                )
            branches = [self.git_options.branch]
        else:
            # Everything
            if util.is_shallow_clone(self._repo):
                # If this is a shallow clone, examine the repo head as a single
                # commit to scan all files at once
                branches = ["HEAD"]
            else:
                # We use `self._repo.branches` here so that we make sure to
                # scan not only the locally checked out branches (as provided
                # by self._repo.listall_branches()), but to also scan all
                # available remote refs
                branches = list(self._repo.branches)
    except pygit2.GitError as exc:
        raise types.GitRemoteException(str(exc)) from exc

    self.logger.debug(
        "Branches to be scanned: %s",
        ", ".join(str(branch) for branch in branches),
    )

    for branch_name in branches:
        self.logger.info("Scanning branch: %s", branch_name)
        if branch_name == "HEAD":
            commits = [self._repo.get(self._repo.head.target)]
        else:
            branch = self._repo.branches.get(branch_name)
            try:
                commits = self._repo.walk(
                    branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL
                )
            except AttributeError:
                # Raised e.g. when the lookup above returned None.
                self.logger.debug(
                    "Skipping branch %s because it cannot be resolved.",
                    branch_name,
                )
                continue

        # Stays None when the walk is empty, which makes the final
        # "first commit" step below a no-op.
        curr_commit: pygit2.Commit = None
        for curr_commit in commits:
            try:
                prev_commit = curr_commit.parents[0]
            except (IndexError, KeyError, TypeError):
                # IndexError: current commit has no parents
                # KeyError: current commit has parents which are not local
                # (TypeError is caught by the original code as well; the
                # triggering condition is not visible here.)
                # If a commit doesn't have a parent skip diff generation
                # since it is the first commit
                self.logger.debug(
                    "Skipping commit %s because it has no parents",
                    # `.id` instead of the deprecated `.hex` attribute; it
                    # is rendered as the hexadecimal object id.
                    curr_commit.id,
                )
                continue

            diff = self._repo.diff(prev_commit, curr_commit)
            # md5 is only used as a cheap de-duplication key here.
            diff_hash = hashlib.md5(
                (str(prev_commit) + str(curr_commit)).encode("utf-8")
            ).digest()
            if diff_hash in already_searched:
                continue
            already_searched.add(diff_hash)
            diff.find_similar()
            for blob, file_path in self._iter_diff_index(diff):
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(curr_commit, branch_name),
                )

        # Finally, yield the first commit to the branch
        if curr_commit:
            # The commit is already in hand, so take its tree directly
            # instead of looking the commit up again by its hex id.
            tree_diff = curr_commit.tree.diff_to_tree(swap=True)
            for blob, file_path in self._iter_diff_index(tree_diff):
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(curr_commit, branch_name),
                )