def baseline_paths(self) -> Iterator[List[Path]]:
    """
    Prepare the file system for a baseline scan and yield the paths to analyze.

    The yielded paths are relative and cover every file that:
    - already existed in the baseline commit (i.e. was not created later),
    - is not ignored by .semgrepignore rules,
    - matches any configured path include filters.

    Yields an empty list when no baseline commit is accessible.

    :return: A list of paths
    :raises ActionFailure: If git cannot detect a HEAD commit or unmerged files exist
    """
    repo = get_git_repo()
    if not repo or self._base_commit is None:
        # No repo / no baseline: nothing can be compared against.
        yield []
        return

    with self._baseline_context():
        added_in_head = self._status.added
        yield [
            candidate
            for candidate in self._target_paths
            if self._fname_to_path(repo, str(candidate)) not in added_in_head
        ]
def _abort_on_conflicting_untracked_paths(self) -> None:
    """
    Fail if any path changed since the baseline now shows up as untracked.

    :raises ActionFailure: If the git repo is not in a clean state
    """
    repo = get_git_repo()
    if not repo or self._base_commit is None:
        return

    status = self._status
    changed_paths = set(
        status.added + status.modified + status.removed + status.unmerged
    )
    dirty_untracked = self._dirty_paths_by_status.get(StatusCode.Untracked, [])
    untracked_paths = {
        self._fname_to_path(repo, str(path)) for path in dirty_untracked
    }

    overlapping_paths = untracked_paths & changed_paths
    if overlapping_paths:
        raise ActionFailure(
            "Some paths that changed since the baseline commit now show up as untracked files. "
            f"Please commit or stash your untracked changes in these paths: {overlapping_paths}."
        )
def _fix_head_for_github(
    base_commit_ref: Optional[str] = None,
    head_ref: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """
    Point HEAD at the real PR branch head before scanning.

    GitHub Actions can check out a synthetic merge commit for a PR instead of
    the PR branch head; this context manager checks out the actual head and
    restores the original revision afterwards. On CI systems that have not
    artificially altered HEAD it is effectively a no-op.

    :return: The baseline ref as a commit hash
    """
    stashed_rev: Optional[str] = None
    base_ref: Optional[str] = base_commit_ref

    if get_git_repo() is None:
        yield base_ref
        return

    if base_ref:
        # Pin the baseline to a concrete hash now, since checking out
        # head_ref below may change what the symbolic ref resolves to.
        try:
            base_ref = git(["rev-parse", base_ref]).stdout.decode("utf-8").rstrip()
        except sh.ErrorReturnCode as ex:
            raise ActionFailure(f"There is a problem with your git project:{ex}")

    if head_ref:
        stashed_rev = git(["branch", "--show-current"]).stdout.decode("utf-8").rstrip()
        if not stashed_rev:
            # Detached HEAD: remember the bare commit hash instead of a branch.
            stashed_rev = git(["rev-parse", "HEAD"]).stdout.decode("utf-8").rstrip()
        click.echo(f"| not on head ref {head_ref}; checking that out now...", err=True)
        git.checkout([head_ref])

    try:
        if base_ref is not None:
            merge_base = git("merge-base", base_ref, "HEAD").rstrip()
            # fmt:off
            click.echo("| reporting findings introduced by these commits:", err=True)
            print_git_log(f"{merge_base}..HEAD")
            if merge_base != git("rev-parse", base_ref).rstrip():
                click.echo("| also reporting findings fixed by these commits from the baseline branch:", err=True)
                print_git_log(f"{merge_base}..{base_ref}")
                click.echo("| to exclude these latter commits, run with", err=True)
                click.echo(f"| --baseline-ref $(git merge-base {base_commit_ref} HEAD)", err=True)
            # fmt: on
        yield base_ref
    finally:
        if stashed_rev is not None:
            click.echo(f"| returning to original head revision {stashed_rev}", err=True)
            git.checkout([stashed_rev])
def _baseline_context(self) -> Iterator[None]:
    """
    Runs a block of code on files from the current branch HEAD.

    While the context is active, the working tree contains the file contents
    of the baseline commit; the original index tree is restored on exit.

    :raises ActionFailure: If git cannot detect a HEAD commit
    :raises ActionFailure: If unmerged files are detected
    """
    repo = get_git_repo()
    if not repo:
        # No git repo: nothing to swap in/out, run the block as-is.
        yield
        return

    self._abort_on_pending_changes()
    self._abort_on_conflicting_untracked_paths()

    # Snapshot the current index as a tree object so it can be restored later.
    current_tree = git("write-tree").stdout.decode().strip()
    try:
        # Files added since the baseline do not exist in the baseline commit;
        # delete them so the scan only sees baseline content.
        for a in self._status.added:
            a.unlink()
        git.checkout(self._base_commit, "--", ".")
        yield
    finally:
        # git checkout will fail if the checked-out index deletes all files in the repo
        # In this case, we still want to continue without error.
        # Note that we have no good way of detecting this issue without inspecting the checkout output
        # message, which means we are fragile with respect to git version here.
        try:
            git.checkout(current_tree.strip(), "--", ".")
        except sh.ErrorReturnCode as error:
            output = error.stderr.decode()
            if (
                output
                and len(output) >= 2
                and "pathspec '.' did not match any file(s) known to git"
                in output.strip()
            ):
                debug_echo(
                    "Restoring git index failed due to total repository deletion; skipping checkout"
                )
            else:
                raise ActionFailure(
                    f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                )
        if self._status.removed:
            # Need to check if file exists since it is possible file was deleted
            # in both the base and head. Only call if there are files to delete
            to_remove = [r for r in self._status.removed if r.exists()]
            if to_remove:
                git.rm("-f", *(str(r) for r in to_remove))
def _get_path_lists(self) -> List[Path]:
    """
    Return list of all absolute paths to analyze.
    """
    debug_echo("Getting path list")

    # Resolve the configured paths against the current working directory.
    debug_echo(f"resolving all_paths: {self._all_paths}")
    paths = [p.resolve() for p in self._all_paths]

    if self._base_commit is not None:
        debug_echo(f"- base_commit is {self._base_commit}")
        # Keep only files the baseline diff marks as added or modified,
        # restricted to those under one of the configured input paths.
        candidates = self._status.added + self._status.modified
        paths = [
            changed
            for changed in candidates
            if any(changed == p or p in changed.parents for p in paths)
        ]
        changed_count = len(paths)
        click.echo(f"| looking at {unit_len(paths, 'changed path')}", err=True)

        repo = get_git_repo()
        debug_echo("Got git repo")
        submodules = repo.submodules  # type: ignore
        debug_echo(f"Resolving submodules {submodules}")
        submodule_paths = [
            self._fname_to_path(repo, sub.path) for sub in submodules
        ]
        # Exclude anything living inside a submodule checkout.
        paths = [
            p
            for p in paths
            if not any(sub_path in p.parents for sub_path in submodule_paths)
        ]
        if len(paths) != changed_count:
            click.echo(
                f"| skipping files in {unit_len(submodule_paths, 'submodule')}: "
                + ", ".join(str(path) for path in submodule_paths),
                err=True,
            )

    debug_echo("Finished initializing path list")
    return [path.relative_to(self._base_path) for path in paths]
def get_git_status(self) -> GitStatus:
    """
    Return repo-root-relative paths of all files staged in the diff against
    the baseline commit, classified as added / modified / removed / unmerged.

    Files that are symlinks to directories are skipped.

    :return: A GitStatus of (added, modified, removed, unmerged) path lists;
        empty when there is no repo or no baseline commit
    :raises ActionFailure: If the baseline ref is unknown to git
    """
    import gitdb.exc  # type: ignore

    repo = get_git_repo()
    if not repo or self._base_commit is None:
        return GitStatus([], [], [], [])

    try:
        repo.rev_parse(self._base_commit)
    except gitdb.exc.BadName:
        raise ActionFailure(f"Unknown git ref '{self._base_commit}'")

    # Output of git command will be relative to git project root
    status_output = zsplit(
        git.diff(
            "--cached",
            "--name-status",
            "--no-ext-diff",
            "-z",
            "--diff-filter=ACDMRTUXB",
            "--ignore-submodules",
            self._base_commit,
        ).stdout.decode()
    )

    added = []
    modified = []
    removed = []
    unmerged = []
    while status_output:
        code = status_output[0]
        fname = status_output[1]
        trim_size = 2

        if not code.strip() or code == StatusCode.Untracked or code == StatusCode.Ignored:
            # BUGFIX: the original `continue`d here *before* consuming the
            # entry, so any blank/untracked/ignored code looped forever.
            status_output = status_output[trim_size:]
            continue

        resolved_name = self._fname_to_path(repo, fname)

        # If file is symlink to directory, skip
        absolute_name = Path(repo.working_tree_dir) / fname
        if absolute_name.is_symlink() and resolved_name.is_dir():
            click.echo(
                f"| Skipping {absolute_name} since it is a symlink to a directory: {resolved_name}",
                err=True,
            )
        else:
            # The following detection for unmerged codes comes from `man git-status`
            if code == StatusCode.Unmerged:
                unmerged.append(resolved_name)
            if code[0] == StatusCode.Renamed:  # code is RXXX, where XXX is percent similarity
                # A rename entry carries two paths: the old one (removed)
                # followed by the new one (added).
                removed.append(resolved_name)
                fname = status_output[2]
                trim_size += 1
                # BUGFIX: record the *new* path as added; the original
                # appended the old (no-longer-existing) path a second time.
                added.append(self._fname_to_path(repo, fname))
            if code == StatusCode.Added:
                added.append(resolved_name)
            if code == StatusCode.Modified:
                modified.append(resolved_name)
            if code == StatusCode.Deleted:
                removed.append(resolved_name)

        status_output = status_output[trim_size:]

    debug_echo(
        f"Git status:\nadded: {added}\nmodified: {modified}\nremoved: {removed}\nunmerged: {unmerged}"
    )
    return GitStatus(added, modified, removed, unmerged)
def _get_target_files(self) -> List[Path]:
    """
    Return list of all absolute paths to analyze.
    """
    repo = get_git_repo()
    submodules = repo.submodules  # type: ignore
    submodule_paths = [
        self._fname_to_path(repo, sub.path) for sub in submodules
    ]

    # Resolve the configured paths against the current working directory.
    paths = [p.resolve() for p in self._paths]
    if self._base_commit is not None:
        # Keep only files the baseline diff marks as added or modified,
        # restricted to those under one of the configured input paths.
        paths = [
            changed
            for changed in (self._status.added + self._status.modified)
            if any(changed == p or p in changed.parents for p in paths)
        ]
        changed_count = len(paths)
        click.echo(f"| looking at {unit_len(paths, 'changed path')}", err=True)
        # Exclude anything living inside a submodule checkout.
        paths = [
            p
            for p in paths
            if not any(sub_path in p.parents for sub_path in submodule_paths)
        ]
        if len(paths) != changed_count:
            click.echo(
                f"| skipping files in {unit_len(submodule_paths, 'submodule')}: "
                + ", ".join(str(path) for path in submodule_paths),
                err=True,
            )

    # Filter out ignore rules, expand directories
    self._ignore_rules_file.seek(0)
    patterns = Parser(self._base_path).parse(self._ignore_rules_file)
    file_ignore = FileIgnore(
        base_path=self._base_path, patterns=patterns, target_paths=paths
    )
    walked_entries = list(file_ignore.entries())
    click.echo(
        f"| found {unit_len(walked_entries, 'file')} in the paths to be scanned",
        err=True,
    )

    surviving = [entry.path for entry in walked_entries if entry.survives]
    skipped_count = len(walked_entries) - len(surviving)
    if skipped_count:
        click.echo(
            f"| skipping {unit_len(range(skipped_count), 'file')} based on path ignore rules",
            err=True,
        )

    return [path.relative_to(self._base_path) for path in surviving]