Example #1
    def baseline_paths(self) -> Iterator[List[Path]]:
        """
        Prepare file system for baseline scan, and return the paths to be analyzed.

        The returned paths are all relative and include every file that is
            - already present in the baseline commit, i.e. not created later
            - not ignored based on .semgrepignore rules
            - matched by any path include filters that were specified.

        The returned list is empty if the baseline commit is inaccessible.

        :return: A list of paths
        :raises ActionFailure: If git cannot detect a HEAD commit or unmerged files exist
        """
        repo = get_git_repo()

        if not repo or self._base_commit is None:
            yield []
        else:
            with self._baseline_context():
                yield [
                    relative_path
                    for relative_path in self._target_paths
                    if self._fname_to_path(repo, str(relative_path))
                    not in self._status.added
                ]
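
Because baseline_paths() yields exactly once and is typed Iterator[List[Path]], it is presumably wrapped with contextlib.contextmanager in the surrounding class (the decorator is not shown above). A minimal, self-contained sketch of that pattern, using made-up paths and a made-up function name:

from contextlib import contextmanager
from pathlib import Path
from typing import Iterator, List

@contextmanager
def fake_baseline_paths() -> Iterator[List[Path]]:
    # Hypothetical stand-in: set up temporary state (the real method checks out
    # the baseline tree), hand the caller the paths, then clean up afterwards.
    try:
        yield [Path("src/app.py"), Path("src/util.py")]
    finally:
        pass  # restore the original working tree here

with fake_baseline_paths() as paths:
    print(paths)
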
Example #2
    def _abort_on_conflicting_untracked_paths(self) -> None:
        """
        Raises ActionFailure if any currently untracked paths were also changed since the baseline commit.

        :raises ActionFailure: If the git repo is not in a clean state
        """
        repo = get_git_repo()

        if not repo or self._base_commit is None:
            return

        changed_paths = set(self._status.added + self._status.modified +
                            self._status.removed + self._status.unmerged)
        untracked_paths = {
            self._fname_to_path(repo, str(path))
            for path in (
                self._dirty_paths_by_status.get(StatusCode.Untracked, []))
        }
        overlapping_paths = untracked_paths & changed_paths

        if overlapping_paths:
            raise ActionFailure(
                "Some paths that changed since the baseline commit now show up as untracked files. "
                f"Please commit or stash your untracked changes in these paths: {overlapping_paths}."
            )
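
The conflict check boils down to a set intersection over normalized paths; a toy illustration with hypothetical file names:

from pathlib import Path

changed_paths = {Path("src/app.py"), Path("README.md")}    # changed since the baseline
untracked_paths = {Path("src/app.py"), Path("notes.txt")}  # currently untracked
overlapping_paths = untracked_paths & changed_paths
if overlapping_paths:
    print(f"conflicting untracked paths: {overlapping_paths}")
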
Example #3
def _fix_head_for_github(
    base_commit_ref: Optional[str] = None,
    head_ref: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """
    GHA can checkout the incorrect commit for a PR (it will create a fake merge commit),
    so we need to reset the head to the actual PR branch head before continuing.

    Note that this code is written in a generic manner, so that it becomes a no-op when
    the CI system has not artificially altered the HEAD ref.

    :return: The baseline ref as a commit hash
    """

    stashed_rev: Optional[str] = None
    base_ref: Optional[str] = base_commit_ref

    if get_git_repo() is None:
        yield base_ref
        return

    if base_ref:
        # Preserve location of head^ after we possibly change location below
        try:
            process = git(["rev-parse", base_ref])
            base_ref = process.stdout.decode("utf-8").rstrip()
        except sh.ErrorReturnCode as ex:
            raise ActionFailure(f"There is a problem with your git project:{ex}")

    if head_ref:
        stashed_rev = git(["branch", "--show-current"]).stdout.decode("utf-8").rstrip()
        if not stashed_rev:
            stashed_rev = git(["rev-parse", "HEAD"]).stdout.decode("utf-8").rstrip()
        click.echo(f"| not on head ref {head_ref}; checking that out now...", err=True)
        git.checkout([head_ref])

    try:
        if base_ref is not None:
            merge_base = git("merge-base", base_ref, "HEAD").rstrip()
            # fmt:off
            click.echo("| reporting findings introduced by these commits:", err=True)
            print_git_log(f"{merge_base}..HEAD")
            if merge_base != git("rev-parse", base_ref).rstrip():
                click.echo("| also reporting findings fixed by these commits from the baseline branch:", err=True)
                print_git_log(f"{merge_base}..{base_ref}")
                click.echo("| to exclude these latter commits, run with", err=True)
                click.echo(f"|   --baseline-ref $(git merge-base {base_commit_ref} HEAD)", err=True)
            # fmt: on

        yield base_ref
    finally:
        if stashed_rev is not None:
            click.echo(f"| returning to original head revision {stashed_rev}", err=True)
            git.checkout([stashed_rev])
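
The reporting above hinges on git merge-base, which prints the best common ancestor of two refs; comparing it to the resolved base ref tells the code whether the baseline branch contains commits that HEAD does not. A small sketch with plain subprocess calls and stock git commands (the helper name is made up):

import subprocess

def resolve_merge_base(base_ref: str, head_ref: str = "HEAD") -> str:
    # Hypothetical helper: return the commit hash of the common ancestor of the two refs.
    result = subprocess.run(
        ["git", "merge-base", base_ref, head_ref],
        check=True, capture_output=True, text=True,
    )
    return result.stdout.strip()
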
Example #4
    def _baseline_context(self) -> Iterator[None]:
        """
        Runs a block of code against the file contents of the baseline commit, then restores the working tree.

        :raises ActionFailure: If git cannot detect a HEAD commit
        :raises ActionFailure: If unmerged files are detected
        """
        repo = get_git_repo()

        if not repo:
            yield
            return

        self._abort_on_pending_changes()
        self._abort_on_conflicting_untracked_paths()

        # Snapshot the current index as a tree object so the working tree can be restored afterwards.
        current_tree = git("write-tree").stdout.decode().strip()
        try:
            # Drop files added since the baseline so they do not leak into the
            # baseline scan, then materialize the baseline commit's files.
            for added_path in self._status.added:
                added_path.unlink()
            git.checkout(self._base_commit, "--", ".")
            yield
        finally:
            # git checkout will fail if the checked-out index deletes all files in the repo
            # In this case, we still want to continue without error.
            # Note that we have no good way of detecting this issue without inspecting the checkout output
            # message, which means we are fragile with respect to git version here.
            try:
                git.checkout(current_tree, "--", ".")
            except sh.ErrorReturnCode as error:
                output = error.stderr.decode()
                if "pathspec '.' did not match any file(s) known to git" in output:
                    debug_echo(
                        "Restoring git index failed due to total repository deletion; skipping checkout"
                    )
                else:
                    raise ActionFailure(
                        f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                    )

            if self._status.removed:
                # Need to check if file exists since it is possible file was deleted
                # in both the base and head. Only call if there are files to delete
                to_remove = [r for r in self._status.removed if r.exists()]
                if to_remove:
                    git.rm("-f", *(str(r) for r in to_remove))
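
The save-and-restore dance above rests on two pieces of git plumbing: git write-tree stores the current index as a tree object, and git checkout <tree> -- . later restores the files recorded in that tree. A sketch of the same sequence with plain subprocess calls (the function names and baseline ref are placeholders):

import subprocess

def run_git(*args: str) -> str:
    return subprocess.run(
        ["git", *args], check=True, capture_output=True, text=True
    ).stdout.strip()

def scan_against_baseline(base_commit: str) -> None:
    # Hypothetical sketch: snapshot the index, materialize the baseline's files,
    # do some work, then restore the snapshot no matter what happens.
    saved_tree = run_git("write-tree")
    try:
        run_git("checkout", base_commit, "--", ".")
        # ... run the analysis here ...
    finally:
        run_git("checkout", saved_tree, "--", ".")
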
Example #5
    def _get_path_lists(self) -> List[Path]:
        """
        Return the list of paths to analyze, relative to the base path
        """
        debug_echo("Getting path list")

        # resolve given paths relative to current working directory
        debug_echo(f"resolving all_paths: {self._all_paths}")
        paths = [p.resolve() for p in self._all_paths]

        if self._base_commit is not None:
            debug_echo(f"- base_commit is {self._base_commit}")
            paths = [
                changed for changed in (self._status.added + self._status.modified)
                # keep changed paths that equal, or live underneath, one of the requested paths
                if any((changed == path or path in changed.parents) for path in paths)
            ]
            changed_count = len(paths)
            click.echo(f"| looking at {unit_len(paths, 'changed path')}",
                       err=True)
            repo = get_git_repo()
            debug_echo("Got git repo")
            submodules = repo.submodules  # type: ignore
            debug_echo(f"Resolving submodules {submodules}")
            submodule_paths = [
                self._fname_to_path(repo, submodule.path)
                for submodule in submodules
            ]
            paths = [
                path for path in paths
                if all(submodule_path not in path.parents
                       for submodule_path in submodule_paths)
            ]
            if len(paths) != changed_count:
                click.echo(
                    f"| skipping files in {unit_len(submodule_paths, 'submodule')}: "
                    + ", ".join(str(path) for path in submodule_paths),
                    err=True,
                )

        debug_echo("Finished initializing path list")

        return [path.relative_to(self._base_path) for path in paths]
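
The `changed == path or path in changed.parents` test is how a changed file is matched against the requested scan roots; a runnable illustration with made-up paths:

from pathlib import Path

scan_roots = [Path("/project/src")]
changed = [Path("/project/src/app.py"), Path("/project/docs/readme.md")]

# Keep a changed path when it equals a scan root or sits somewhere beneath one.
kept = [c for c in changed if any(c == root or root in c.parents for root in scan_roots)]
print(kept)  # only /project/src/app.py survives
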
Example #6
    def get_git_status(self) -> GitStatus:
        """
        Return a GitStatus of absolute paths to all staged files.

        Ignores files that are symlinks to directories.
        """
        import gitdb.exc  # type: ignore

        repo = get_git_repo()

        if not repo or self._base_commit is None:
            return GitStatus([], [], [], [])

        try:
            repo.rev_parse(self._base_commit)
        except gitdb.exc.BadName:
            raise ActionFailure(f"Unknown git ref '{self._base_commit}'")

        # Output of git command will be relative to git project root
        status_output = zsplit(
            git.diff(
                "--cached",
                "--name-status",
                "--no-ext-diff",
                "-z",
                "--diff-filter=ACDMRTUXB",
                "--ignore-submodules",
                self._base_commit,
            ).stdout.decode())

        added = []
        modified = []
        removed = []
        unmerged = []
        while status_output:
            code = status_output[0]
            fname = status_output[1]
            trim_size = 2

            if (not code.strip()
                    or code == StatusCode.Untracked
                    or code == StatusCode.Ignored):
                # Advance past the skipped entry, otherwise the loop would never terminate.
                status_output = status_output[trim_size:]
                continue

            resolved_name = self._fname_to_path(repo, fname)

            # If file is symlink to directory, skip
            absolute_name = Path(repo.working_tree_dir) / fname
            if absolute_name.is_symlink() and resolved_name.is_dir():
                click.echo(
                    f"| Skipping {absolute_name} since it is a symlink to a directory: {resolved_name}",
                    err=True,
                )
            else:
                # The following detection for unmerged codes comes from `man git-status`
                if code == StatusCode.Unmerged:
                    unmerged.append(resolved_name)
                if code[0] == StatusCode.Renamed:
                    # code is RXXX, where XXX is percent similarity; the record lists
                    # the old path first, then the new path.
                    removed.append(resolved_name)
                    fname = status_output[2]
                    trim_size += 1
                    added.append(self._fname_to_path(repo, fname))
                if code == StatusCode.Added:
                    added.append(resolved_name)
                if code == StatusCode.Modified:
                    modified.append(resolved_name)
                if code == StatusCode.Deleted:
                    removed.append(resolved_name)

            status_output = status_output[trim_size:]
        debug_echo(
            f"Git status:\nadded: {added}\nmodified: {modified}\nremoved: {removed}\nunmerged: {unmerged}"
        )

        return GitStatus(added, modified, removed, unmerged)
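
The -z flag makes git separate records with NUL bytes, which the zsplit() helper (not shown here) presumably undoes. An assumed minimal version and a sample record stream, including a rename entry:

from typing import List

def zsplit_sketch(s: str) -> List[str]:
    # Assumed behaviour of the zsplit() helper used above: split on NUL bytes
    # and drop the empty trailing piece git leaves behind.
    return [piece for piece in s.split("\0") if piece]

sample = "M\0src/app.py\0R100\0old_name.py\0new_name.py\0D\0removed.py\0"
print(zsplit_sketch(sample))
# ['M', 'src/app.py', 'R100', 'old_name.py', 'new_name.py', 'D', 'removed.py']
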
Example #7
    def _get_target_files(self) -> List[Path]:
        """
        Return the list of paths to analyze, relative to the base path, with ignore rules applied
        """
        repo = get_git_repo()
        submodules = repo.submodules  # type: ignore
        submodule_paths = [
            self._fname_to_path(repo, submodule.path)
            for submodule in submodules
        ]

        # resolve given paths relative to current working directory
        paths = [p.resolve() for p in self._paths]
        if self._base_commit is not None:
            paths = [
                changed for changed in (self._status.added + self._status.modified)
                # keep changed paths that equal, or live underneath, one of the requested paths
                if any((changed == path or path in changed.parents) for path in paths)
            ]
            changed_count = len(paths)
            click.echo(f"| looking at {unit_len(paths, 'changed path')}",
                       err=True)
            paths = [
                path for path in paths
                if all(submodule_path not in path.parents
                       for submodule_path in submodule_paths)
            ]
            if len(paths) != changed_count:
                click.echo(
                    f"| skipping files in {unit_len(submodule_paths, 'submodule')}: "
                    + ", ".join(str(path) for path in submodule_paths),
                    err=True,
                )

        # Filter out ignored files and expand directories
        self._ignore_rules_file.seek(0)
        patterns = Parser(self._base_path).parse(self._ignore_rules_file)

        file_ignore = FileIgnore(base_path=self._base_path,
                                 patterns=patterns,
                                 target_paths=paths)

        walked_entries = list(file_ignore.entries())
        click.echo(
            f"| found {unit_len(walked_entries, 'file')} in the paths to be scanned",
            err=True,
        )
        filtered: List[Path] = [
            elem.path for elem in walked_entries if elem.survives
        ]

        skipped_count = len(walked_entries) - len(filtered)
        if skipped_count:
            click.echo(
                f"| skipping {unit_len(range(skipped_count), 'file')} based on path ignore rules",
                err=True,
            )

        relative_paths = [
            path.relative_to(self._base_path) for path in filtered
        ]

        return relative_paths
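
Parser and FileIgnore are project-specific classes; as a rough, hypothetical analogue of the filtering step, here is a simplified version built on fnmatch rather than the project's .semgrepignore semantics:

import fnmatch
from pathlib import Path
from typing import Iterable, List

def filter_ignored(paths: Iterable[Path], patterns: List[str], base: Path) -> List[Path]:
    # Hypothetical stand-in for FileIgnore: drop any path whose base-relative
    # form matches one of the ignore patterns.
    kept: List[Path] = []
    for path in paths:
        rel = path.relative_to(base).as_posix()
        if not any(fnmatch.fnmatch(rel, pattern) for pattern in patterns):
            kept.append(path)
    return kept

print(filter_ignored(
    [Path("/repo/src/app.py"), Path("/repo/tests/test_app.py")],
    ["tests/*"],
    Path("/repo"),
))  # keeps only /repo/src/app.py
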