Example #1
    def report_results(self, results: Results) -> None:
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        response: Optional["requests.Response"] = None

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/findings",
            json={
                "token": os.getenv("GITHUB_TOKEN"),
                "findings": [
                    finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                    for finding in results.findings.new
                ],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()

            errors = response.json()["errors"]
            for error in errors:
                message = error["message"]
                click.echo(f"Server returned following warning: {message}", err=True)

        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
            json={
                "findings": [
                    finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                    for finding in results.findings.ignored
                ],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        # mark as complete
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/complete",
            json={"exit_code": -1, "stats": results.stats},
            timeout=30,
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
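
A minimal refactor sketch, not from the source: the three POST blocks above repeat the same raise_for_status / ActionFailure handling, so a hypothetical helper could centralize it. The helper name and the ActionFailure stand-in are invented here.

from typing import Any

import requests


class ActionFailure(Exception):
    """Stand-in for the ActionFailure exception used above."""


def post_or_fail(session: requests.Session, url: str, payload: Any) -> requests.Response:
    """POST a JSON payload and convert any HTTP error into an ActionFailure."""
    response = session.post(url, json=payload, timeout=30)
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")
    return response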
Example #2
    def _baseline_context(self) -> Iterator[None]:
        """
        Runs a block of code on files from the current branch HEAD.

        :raises ActionFailure: If git cannot detect a HEAD commit
        :raises ActionFailure: If unmerged files are detected
        """
        repo = get_git_repo()

        if not repo:
            yield
            return

        self._abort_if_dirty()

        current_tree = git("write-tree").stdout.decode().strip()
        try:
            for a in self._status.added:
                a.unlink()
            git.checkout(self._base_commit, "--", ".")
            yield
        finally:
            # git checkout will fail if the checked-out index deletes all files in the repo
            # In this case, we still want to continue without error.
            # Note that we have no good way of detecting this issue without inspecting the checkout output
            # message, which means we are fragile with respect to git version here.
            try:
                git.checkout(current_tree.strip(), "--", ".")
            except sh.ErrorReturnCode as error:
                output = error.stderr.decode()
                if (output and len(output) >= 2 and
                        "pathspec '.' did not match any file(s) known to git"
                        in output.strip()):
                    debug_echo(
                        "Restoring git index failed due to total repository deletion; skipping checkout"
                    )
                else:
                    raise ActionFailure(
                        f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                    )

            if self._status.removed:
                # Need to check if file exists since it is possible file was deleted
                # in both the base and head
                git.rm("-f",
                       *(str(r) for r in self._status.removed if r.exists()))
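
In the source, _baseline_context is presumably wrapped with contextlib.contextmanager so callers can use it in a with block. A self-contained analogue of its save / modify / yield / restore shape (all names invented):

from contextlib import contextmanager
from typing import Dict, Iterator


@contextmanager
def swapped_state(store: Dict[str, str], key: str, value: str) -> Iterator[None]:
    original = store.get(key)  # remember the current state (like `git write-tree`)
    store[key] = value         # switch to the baseline (like `git checkout <base> -- .`)
    try:
        yield
    finally:                   # always restore, even if the body raised
        if original is None:
            store.pop(key, None)
        else:
            store[key] = original


store = {"tree": "HEAD"}
with swapped_state(store, "tree", "BASE"):
    assert store["tree"] == "BASE"
assert store["tree"] == "HEAD"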
Example #3
def _get_findings(context: RunContext) -> Tuple[FindingSets, RunStats]:
    """
    Gets head and baseline findings for this run

    :param context: This scan's run context object
    :return: This project's findings
    """
    debug_echo("=== adding semgrep configuration")

    rewrite_args: Sequence[str] = ([] if context.rewrite_rule_ids else
                                   ["--no-rewrite-rule-ids"])
    metrics_args: Sequence[str] = ["--enable-metrics"
                                   ] if context.enable_metrics else []

    with _fix_head_for_github(context.base_ref, context.head_ref) as base_ref:
        workdir = Path.cwd()
        debug_echo(f"Workdir: {str(workdir)}")
        targets = TargetFileManager(
            base_path=workdir,
            base_commit=base_ref,
            all_paths=[workdir],
        )
        debug_echo("Initialized TargetFileManager")

        config_args = []
        # Keep track of which config specifiers are local files/dirs
        local_configs: Set[str] = set()
        for conf in context.config_specifier:
            if Path(conf).exists():
                local_configs.add(conf)
            config_args.extend(["--config", conf])
        debug_echo("=== seeing if there are any findings")

        findings, stats = _get_head_findings(
            context, [*config_args, *metrics_args, *rewrite_args], targets)

    _update_baseline_findings(context, findings, local_configs, rewrite_args,
                              targets)

    if os.getenv("INPUT_GENERATESARIF"):
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths:
            args = [*rewrite_args, *config_args]
            _, sarif_output = invoke_semgrep_sarif(
                args,
                [str(p) for p in paths],
                timeout=context.timeout,
                explicit_semgrepignore_path=context.action_ignores_path,
            )
        rewrite_sarif_file(sarif_output, sarif_path)

    return findings, stats
Example #4
 def expand_directives(self, line: str) -> Iterable[str]:
     """Load :include files"""
     if line.startswith(":include "):
         include_path = self.base_path / line[9:]
         if include_path.is_file():
             with include_path.open() as include_lines:
                 sub_base = include_path.parent.resolve()
                 sub_parser = Parser(sub_base)
                 return sub_parser.parse(include_lines)
         else:
             debug_echo(
                 f"Skipping `:include {include_path}` directive, file not found"
             )
             return []
     elif CONTROL_REGEX.match(line):
         raise ActionFailure(
             f"Unknown ignore directive in Semgrep ignore file at {self.base_path}: '{line}'"
         )
     else:
         return [line]
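
A hedged usage sketch (the parse loop below is hypothetical, not shown in the source): each line either expands to the parsed lines of an included file or passes through unchanged, so a caller can flat-map over its input.

from typing import Callable, Iterable, Iterator


def parse_lines(
    expand: Callable[[str], Iterable[str]],  # assumed to behave like expand_directives
    lines: Iterable[str],
) -> Iterator[str]:
    for line in lines:
        yield from expand(line.rstrip("\n"))


assert list(parse_lines(lambda line: [line], ["a\n", "b\n"])) == ["a", "b"]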
Example #5
    def fetch_rules_text(self) -> str:
        """Get a YAML string with the configured semgrep rules in it."""
        if not self.scan.is_loaded:
            raise ActionFailure(
                f"The API server at {self.url} is not working properly. "
                f"Please contact {constants.SUPPORT_EMAIL} for assistance.")

        response = self.session.get(
            f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
            timeout=30,
        )
        debug_echo(f"=== POST .../rules.yaml responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}\n"
                "Failed to get configured rules")
        else:
            return response.text
Example #6
    def fetch_rules_text(self) -> str:
        """Get a YAML string with the configured semgrep rules in it."""
        response = self.session.get(
            f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
            timeout=30,
        )
        debug_echo(f"=== POST .../rules.yaml responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}\n"
                "Failed to get configured rules")

        # Can remove once the server guarantees there is always at least one rule
        parsed = yaml.load(response.text)
        if not parsed["rules"]:
            raise ActionFailure("No rules returned by server for this scan.")
        else:
            return response.text
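
If yaml here is PyYAML, calling yaml.load without an explicit Loader is deprecated and unsafe on untrusted input; the source may instead use ruamel.yaml, whose API differs. A hedged sketch of the same emptiness check with safe_load (helper name invented):

import yaml  # assuming PyYAML


def has_rules(rules_text: str) -> bool:
    parsed = yaml.safe_load(rules_text)
    return bool(parsed and parsed.get("rules"))


assert has_rules("rules:\n- id: example") is True
assert has_rules("rules: []") is False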
Example #7
    def report_results(self, results: Results) -> None:
        if not self.is_configured or not self.scan.is_loaded:
            debug_echo("=== no semgrep app config, skipping report_results")
            return
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        response: Optional["requests.Response"] = None

        # report findings
        for chunk in chunked_iter(results.findings.new, 10_000):
            response = self.session.post(
                f"{self.url}/api/agent/scan/{self.scan.id}/findings",
                json={
                    "token":
                    os.getenv("GITHUB_TOKEN"),
                    "findings": [
                        finding.to_dict(
                            omit=constants.PRIVACY_SENSITIVE_FIELDS)
                        for finding in chunk
                    ],
                },
                timeout=30,
            )
            debug_echo(f"=== POST .../findings responded: {response!r}")
            try:
                response.raise_for_status()
            except requests.RequestException:
                raise ActionFailure(
                    f"API server returned this error: {response.text}")
Example #8
    def _find_branchoff_point(self, attempt_count: int = 0) -> str:
        fetch_depth = 4 ** attempt_count  # fetch 4, 16, 64, 256, 1024, ...
        if attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT:  # get all commits on last try
            fetch_depth = 2 ** 31 - 1  # git expects a signed 32-bit integer

        if attempt_count:  # skip fetching on first try
            debug_echo(
                f"fetching {fetch_depth} commits to find branch-off point of pull request"
            )
            git.fetch(
                "origin",
                "--depth",
                fetch_depth,
                self.base_branch_tip,
                _timeout=GIT_SH_TIMEOUT,
            )
            git.fetch(
                "origin", "--depth", fetch_depth, self.head_ref, _timeout=GIT_SH_TIMEOUT
            )

        try:  # check if both branches connect to the yet-unknown branch-off point now
            process = git("merge-base", self.base_branch_tip, self.head_ref)
        except sh.ErrorReturnCode as error:
            output = error.stderr.decode().strip()
            if (
                output  # output is empty when unable to find branch-off point
                and "Not a valid " not in output  # the error when a ref is missing
            ):
                exit_with_sh_error(error)

            if attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT:
                raise ActionFailure(
                    "Could not find branch-off point between "
                    f"the baseline tip {self.base_branch_tip} and current head '{self.head_ref}' "
                )

            return self._find_branchoff_point(attempt_count + 1)
        else:
            return process.stdout.decode().strip()
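
A worked example of the deepening schedule above (MAX_FETCH_ATTEMPT_COUNT = 6 is an assumption for illustration; the real constant lives on the class):

MAX_FETCH_ATTEMPT_COUNT = 6
for attempt_count in range(MAX_FETCH_ATTEMPT_COUNT + 1):
    depth = 4 ** attempt_count
    if attempt_count >= MAX_FETCH_ATTEMPT_COUNT:
        depth = 2 ** 31 - 1  # git expects a signed 32-bit integer
    print(attempt_count, depth)
# prints depth 1 at attempt 0 (unused: fetching is skipped on the first try),
# then 4, 16, 64, 256, 1024, and finally 2147483647 on the last attempt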
Example #9
    def report_failure(self, error: SemgrepError) -> int:
        """
        Send semgrep cli non-zero exit code information to server
        and return what exit code semgrep should exit with.
        """
        debug_echo(f"=== sending failure information to semgrep app")

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/error",
            json={
                "exit_code": error.exit_code,
                "stderr": error.stderr,
            },
            timeout=30,
        )

        debug_echo(f"=== POST .../error responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

        exit_code = int(response.json()["exit_code"])
        return exit_code
Example #10
    def report_start(self, meta: GitMeta) -> None:
        """
        Get scan id and file ignores

        returns name of policy used to scan
        """
        debug_echo(f"=== reporting start to semgrep app at {self.url}")

        response = self.session.post(
            f"{self.url}/api/agent/deployment/{self.deployment_id}/scan",
            json={"meta": meta.to_dict()},
            timeout=30,
        )

        debug_echo(f"=== POST .../scan responded: {response!r}")

        if response.status_code == 404:
            raise ActionFailure(
                "Failed to create a scan with given token and deployment_id."
                "Please make sure they have been set correctly."
                f"API server at {self.url} returned this response: {response.text}"
            )

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
        else:
            body = response.json()
            self.scan = Scan(
                id=glom(body, T["scan"]["id"]),
                ignore_patterns=glom(
                    body, T["scan"]["meta"].get("ignored_files", [])),
                policy_list=glom(body, T["policy"]),
                autofix=glom(body, T.get("autofix", False)),
            )
            debug_echo(f"=== Our scan object is: {self.scan!r}")
Example #11
    def _get_path_lists(self) -> List[Path]:
        """
        Return list of all absolute paths to analyze
        """
        debug_echo("Getting path list")

        # resolve given paths relative to current working directory
        debug_echo(f"resolving all_paths: {self._all_paths}")
        paths = [p.resolve() for p in self._all_paths]

        if self._base_commit is not None:
            debug_echo(f"- base_commit is {self._base_commit}")
            paths = [
                a for a in (self._status.added + self._status.modified)
                # diff_path is a subpath of some element of input_paths
                if any((a == path or path in a.parents) for path in paths)
            ]
            changed_count = len(paths)
            click.echo(f"| looking at {unit_len(paths, 'changed path')}",
                       err=True)
            repo = get_git_repo()
            debug_echo("Got git repo")
            submodules = repo.submodules  # type: ignore
            debug_echo(f"Resolving submodules {submodules}")
            submodule_paths = [
                self._fname_to_path(repo, submodule.path)
                for submodule in submodules
            ]
            paths = [
                path for path in paths
                if all(submodule_path not in path.parents
                       for submodule_path in submodule_paths)
            ]
            if len(paths) != changed_count:
                click.echo(
                    f"| skipping files in {unit_len(submodule_paths, 'submodule')}: "
                    + ", ".join(str(path) for path in submodule_paths),
                    err=True,
                )

        debug_echo("Finished initializing path list")

        return [path.relative_to(self._base_path) for path in paths]
Example #12
@dataclass
class Sapp:
    url: str
    token: str
    deployment_id: int
    scan: Scan = Scan()
    is_configured: bool = False
    session: requests.Session = field(init=False)

    def __post_init__(self) -> None:
        # Get deployment from token
        if self.token and self.deployment_id:
            self.is_configured = True
        self.session = requests.Session()
        self.session.headers["Authorization"] = f"Bearer {self.token}"

    def report_start(self, meta: GitMeta) -> None:
        if not self.is_configured:
            debug_echo("=== no semgrep app config, skipping report_start")
            return
        debug_echo(f"=== reporting start to semgrep app at {self.url}")

        response = self.session.post(
            f"{self.url}/api/agent/deployment/{self.deployment_id}/scan",
            json={"meta": meta.to_dict()},
            timeout=30,
        )
        debug_echo(f"=== POST .../scan responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
        else:
            body = response.json()
            self.scan = Scan(
                id=glom(body, T["scan"]["id"]),
                config=glom(body, T["scan"]["meta"].get("config")),
                ignore_patterns=glom(
                    body, T["scan"]["meta"].get("ignored_files", [])),
            )
            debug_echo(f"=== Our scan object is: {self.scan!r}")

    def fetch_rules_text(self) -> str:
        """Get a YAML string with the configured semgrep rules in it."""
        if not self.scan.is_loaded:
            raise ActionFailure(
                f"The API server at {self.url} is not working properly. "
                f"Please contact {constants.SUPPORT_EMAIL} for assistance.")

        response = self.session.get(
            f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
            timeout=30,
        )
        debug_echo(f"=== POST .../rules.yaml responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}\n"
                "Failed to get configured rules")
        else:
            return response.text

    def download_rules(self) -> Path:
        """Save the rules configured on semgrep app to a temporary file"""
        # hey, it's just a tiny YAML file in CI, we'll survive without cleanup
        rules_file = tempfile.NamedTemporaryFile(suffix=".yml",
                                                 delete=False)  # nosem
        rules_path = Path(rules_file.name)
        rules_path.write_text(self.fetch_rules_text())
        return rules_path

    def report_results(self, results: Results) -> None:
        if not self.is_configured or not self.scan.is_loaded:
            debug_echo("=== no semgrep app config, skipping report_results")
            return
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        response: Optional["requests.Response"] = None

        # report findings
        for chunk in chunked_iter(results.new, 10_000):
            response = self.session.post(
                f"{self.url}/api/agent/scan/{self.scan.id}/findings",
                json=[
                    finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                    for finding in chunk
                ],
                timeout=30,
            )
            debug_echo(f"=== POST .../findings responded: {response!r}")
            try:
                response.raise_for_status()
            except requests.RequestException:
                raise ActionFailure(
                    f"API server returned this error: {response.text}")

        # mark as complete
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/complete",
            json={
                "exit_code": -1,
                "stats": results.stats
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
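
A minimal analogue of the construction pattern above (all names invented): with the @dataclass decorator, __post_init__ runs right after field assignment, so derived state such as the session headers is ready as soon as the object exists.

from dataclasses import dataclass, field
from typing import Dict


@dataclass
class Client:
    token: str
    configured: bool = False
    headers: Dict[str, str] = field(init=False)

    def __post_init__(self) -> None:
        self.configured = bool(self.token)
        self.headers = {"Authorization": f"Bearer {self.token}"}


assert Client("abc").configured is True
assert Client("").headers == {"Authorization": "Bearer "}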
Example #13
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]

    debug_echo("=== seeing if there are any findings")
    finding_set = FindingSets()

    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"),
                   err=True)
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--skip-unknown-extensions", "--json", *config_args]
            for path in chunk:
                args.append(path)
            count = 0
            for result in json.loads(str(semgrep(*args)))["results"]:
                finding_set.update_current(result, committed_datetime)
                count += 1
            click.echo(
                f"| {count} {cardinalize('current issue', count)} found",
                err=True)

    if not finding_set.has_current_issues():
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            if paths:
                paths_with_findings = finding_set.paths_with_current_findings()
                paths_to_check = set(str(path)
                                     for path in paths) & paths_with_findings
                click.echo(
                    "=== looking for pre-existing issues in " +
                    unit_len(paths_to_check, "file"),
                    err=True,
                )
                count = 0
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = [
                        "--skip-unknown-extensions", "--json", *config_args
                    ]
                    for path in chunk:
                        args.append(path)
                    for result in json.loads(str(semgrep(*args)))["results"]:
                        finding_set.update_baseline(result, committed_datetime)
                        count += 1
                click.echo(
                    f"| {count} {cardinalize('pre-existing issue', count)} found",
                    err=True,
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return finding_set
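
The chunking above keeps each semgrep invocation's argument list under the OS command-length limit (see the FIXME about the SARIF pass, which does not chunk). chunked_iter presumably comes from boltons.iterutils; a minimal stand-in with the same shape:

from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")


def chunked(seq: Sequence[T], size: int) -> Iterator[Sequence[T]]:
    for i in range(0, len(seq), size):
        yield seq[i:i + size]


assert list(chunked(["a", "b", "c"], 2)) == [["a", "b"], ["c"]]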
Example #14
def get_findings(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    head_ref: Optional[str],
    semgrep_ignore: TextIO,
    uses_managed_policy: bool,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    with fix_head_for_github(base_commit_ref, head_ref) as base_ref:
        workdir = Path.cwd()
        targets = TargetFileManager(
            base_path=workdir,
            base_commit=base_ref,
            paths=[workdir],
            ignore_rules_file=semgrep_ignore,
        )

        config_args = ["--config", config_specifier]
        rewrite_args = ["--no-rewrite-rule-ids"] if uses_managed_policy else []

        debug_echo("=== seeing if there are any findings")
        findings = FindingSets()

        with targets.current_paths() as paths:
            click.echo("=== looking for current issues in " +
                       unit_len(paths, "file"),
                       err=True)

            args = [
                "--skip-unknown-extensions",
                "--disable-nosem",
                "--json",
                *rewrite_args,
                *config_args,
            ]
            semgrep_results = invoke_semgrep(args, [str(p)
                                                    for p in paths])["results"]

            findings.current.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if not result["extra"].get("is_ignored"))
            findings.ignored.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if result["extra"].get("is_ignored"))
            click.echo(
                f"| {unit_len(findings.current, 'current issue')} found",
                err=True)
            click.echo(
                f"| {unit_len(findings.ignored, 'ignored issue')} found",
                err=True,
            )

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            paths_with_findings = {
                finding.path
                for finding in findings.current
            }
            paths_to_check = list(
                set(str(path) for path in paths) & paths_with_findings)
            if not paths_to_check:
                click.echo(
                    "=== not looking at pre-existing issues since all files with current issues are newly created",
                    err=True,
                )
            else:
                click.echo(
                    "=== looking for pre-existing issues in " +
                    unit_len(paths_to_check, "file"),
                    err=True,
                )

                args = [
                    "--skip-unknown-extensions",
                    "--json",
                    *rewrite_args,
                    *config_args,
                ]
                semgrep_results = invoke_semgrep(args,
                                                 paths_to_check)["results"]
                findings.baseline.update_findings(
                    Finding.from_semgrep_result(result, committed_datetime)
                    for result in semgrep_results)
                click.echo(
                    f"| {unit_len(findings.baseline, 'pre-existing issue')} found",
                    err=True,
                )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open(
                "w") as sarif_file:
            args = ["--sarif", *rewrite_args, *config_args]
            for path in paths:
                args.extend(["--include", str(path)])
            semgrep_exec(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
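
The set intersection above narrows the baseline scan to files that both exist at the base commit and currently have findings; files created after the base commit drop out automatically. Illustrative values:

baseline_paths = {"a.py", "b.py"}
paths_with_findings = {"b.py", "new.py"}  # new.py did not exist at the base
assert baseline_paths & paths_with_findings == {"b.py"}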
Example #15
 def repo(self) -> gitpython.Repo:  # type: ignore
     repo = gitpython.Repo()
     debug_echo(f"found repo: {repo!r}")
     return repo
Example #16
    def report_results(self, results: Results) -> None:
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()

        if "pr-comment-autofix" in os.getenv("SEMGREP_AGENT_OPT_IN_FEATURES",
                                             ""):
            fields_to_omit.remove("fixed_lines")

        response: Optional["requests.Response"] = None

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/findings",
            json={
                # send a backup token in case the app is not available
                "token":
                os.getenv("GITHUB_TOKEN"),
                "findings": [
                    finding.to_dict(omit=fields_to_omit)
                    for finding in results.findings.new
                ],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()

            errors = response.json()["errors"]
            for error in errors:
                message = error["message"]
                click.echo(f"Server returned following warning: {message}",
                           err=True)

        except requests.RequestException:
            raise ActionFailure(
                f"API server returned this error: {response.text}")

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
            json={
                "findings":
                [finding.to_dict() for finding in results.findings.ignored],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server returned this error: {response.text}")

        # mark as complete
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/complete",
            json={
                "exit_code": -1,
                "stats": results.stats
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
Example #17
                            omit=constants.PRIVACY_SENSITIVE_FIELDS)
                        for finding in chunk
                    ],
                },
                timeout=30,
            )
            debug_echo(f"=== POST .../ignores responded: {response!r}")
            try:
                response.raise_for_status()
            except requests.RequestException:
                raise ActionFailure(
                    f"API server returned this error: {response.text}")

        # mark as complete
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/complete",
            json={
                "exit_code": -1,
                "stats": results.stats
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
Example #18
    def report_results(self, results: Results, rule_ids: Sequence[str],
                       cai_ids: Sequence[str]) -> None:
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()

        if self.scan.autofix:
            fields_to_omit.remove("fixed_lines")

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/findings",
            json={
                # send a backup token in case the app is not available
                "token":
                os.getenv("GITHUB_TOKEN"),
                "gitlab_token":
                os.getenv("GITLAB_TOKEN"),
                "findings": [
                    finding.to_dict(omit=fields_to_omit)
                    for finding in results.findings.new
                ],
                "searched_paths":
                [str(p) for p in results.findings.searched_paths],
                "rule_ids":
                rule_ids,
                "cai_ids":
                cai_ids,
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()

            errors = response.json()["errors"]
            for error in errors:
                message = error["message"]
                click.echo(f"Server returned following warning: {message}",
                           err=True)

        except requests.RequestException:
            raise ActionFailure(
                f"API server returned this error: {response.text}")

        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
            json={
                "findings": [
                    finding.to_dict()
                    for finding in results.findings.new_ignored
                ],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server returned this error: {response.text}")

        # mark as complete
        # In order to not overload our app database, we truncate target stats to the 20 heaviest hitters. This adds
        # approximately 80 kB of database load per scan when using p/ci.
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/complete",
            json={
                "exit_code": results.findings.max_exit_code,
                "stats": results.stats(n_heavy_targets=20),
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")

        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
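
A hedged sketch of the truncation described in the comment above (the stats shape is hypothetical): keep only the n_heavy_targets slowest targets so the complete payload stays small.

targets = [{"path": f"file{i}.py", "run_time": i * 0.1} for i in range(100)]
heaviest = sorted(targets, key=lambda t: t["run_time"], reverse=True)[:20]
assert len(heaviest) == 20
assert heaviest[0]["run_time"] >= heaviest[-1]["run_time"]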
Example #19
    def _baseline_context(self) -> Iterator[None]:
        """
        Runs a block of code on files from the current branch HEAD.

        :raises ActionFailure: If git cannot detect a HEAD commit
        :raises ActionFailure: If unmerged files are detected
        """
        repo = get_git_repo()

        if not repo:
            yield
            return

        self._abort_on_pending_changes()
        self._abort_on_conflicting_untracked_paths()

        debug_echo("Running git write-tree")
        current_tree = git("write-tree").stdout.decode().strip()
        try:
            for a in self._status.added:
                try:
                    a.unlink()
                except FileNotFoundError:
                    click.echo(f"| {a} was not found when trying to delete",
                               err=True)

            debug_echo("Running git checkout for baseline context")
            git.checkout(self._base_commit, "--", ".", _timeout=GIT_SH_TIMEOUT)
            debug_echo("Finished git checkout for baseline context")
            yield
        finally:
            # git checkout will fail if the checked-out index deletes all files in the repo
            # In this case, we still want to continue without error.
            # Note that we have no good way of detecting this issue without inspecting the checkout output
            # message, which means we are fragile with respect to git version here.
            try:
                debug_echo("Running git checkout to return original context")
                git.checkout(current_tree.strip(),
                             "--",
                             ".",
                             _timeout=GIT_SH_TIMEOUT)
                debug_echo("Finished git checkout to return original context")
            except sh.ErrorReturnCode as error:
                output = error.stderr.decode()
                if (output and len(output) >= 2 and
                        "pathspec '.' did not match any file(s) known to git"
                        in output.strip()):
                    debug_echo(
                        "Restoring git index failed due to total repository deletion; skipping checkout"
                    )
                else:
                    raise ActionFailure(
                        f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                    )

            if self._status.removed:
                # Need to check if file exists since it is possible file was deleted
                # in both the base and head. Only call if there are files to delete
                to_remove = [r for r in self._status.removed if r.exists()]
                if to_remove:
                    debug_echo("Running git rm")
                    git.rm("-f",
                           *(str(r) for r in to_remove),
                           _timeout=GIT_SH_TIMEOUT)
                    debug_echo("finished git rm")
Example #20
 def _fname_to_path(self, repo: "gitpython.Repo",
                    fname: str) -> Path:  # type: ignore
     debug_echo(
         f"_fname_to_path: root: {repo.working_tree_dir} fname: {fname}")
     return (Path(repo.working_tree_dir) / fname).resolve()
Example #21
 def repo(self) -> gitpython.Repo:  # type: ignore
     repo = gitpython.Repo(".", search_parent_directories=True)
     debug_echo(f"found repo: {repo!r}")
     return repo
Example #22
    def get_git_status(self) -> GitStatus:
        """
        Returns Absolute Paths to all files that are staged

        Ignores files that are symlinks to directories
        """
        import gitdb.exc  # type: ignore

        repo = get_git_repo()

        if not repo or self._base_commit is None:
            return GitStatus([], [], [], [])

        try:
            repo.rev_parse(self._base_commit)
        except gitdb.exc.BadName:
            raise ActionFailure(f"Unknown git ref '{self._base_commit}'")

        # Output of git command will be relative to git project root
        status_output = zsplit(
            git.diff(
                "--cached",
                "--name-status",
                "--no-ext-diff",
                "-z",
                "--diff-filter=ACDMRTUXB",
                "--ignore-submodules",
                self._base_commit,
            ).stdout.decode())

        added = []
        modified = []
        removed = []
        unmerged = []
        while status_output:
            code = status_output[0]
            fname = status_output[1]
            trim_size = 2

            if not code.strip():
                # advance past this entry before skipping, or the loop never terminates
                status_output = status_output[trim_size:]
                continue
            if code == StatusCode.Untracked or code == StatusCode.Ignored:
                status_output = status_output[trim_size:]
                continue

            resolved_name = self._fname_to_path(repo, fname)

            # If file is symlink to directory, skip
            absolute_name = Path(repo.working_tree_dir) / fname
            if absolute_name.is_symlink() and resolved_name.is_dir():
                click.echo(
                    f"| Skipping {absolute_name} since it is a symlink to a directory: {resolved_name}",
                    err=True,
                )
            else:
                # The following detection for unmerged codes comes from `man git-status`
                if code == StatusCode.Unmerged:
                    unmerged.append(resolved_name)
                if (code[0] == StatusCode.Renamed
                    ):  # code is RXXX, where XXX is percent similarity
                    removed.append(resolved_name)
                    fname = status_output[2]
                    trim_size += 1
                    # resolve the rename's new name rather than re-adding the old path
                    added.append(self._fname_to_path(repo, fname))
                if code == StatusCode.Added:
                    added.append(resolved_name)
                if code == StatusCode.Modified:
                    modified.append(resolved_name)
                if code == StatusCode.Deleted:
                    removed.append(resolved_name)

            status_output = status_output[trim_size:]
        debug_echo(
            f"Git status:\nadded: {added}\nmodified: {modified}\nremoved: {removed}\nunmerged: {unmerged}"
        )

        return GitStatus(added, modified, removed, unmerged)
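
git's -z flag delimits diff entries with NUL bytes so that file names containing spaces or newlines survive parsing; a minimal zsplit matching the helper used above might look like:

from typing import List


def zsplit(s: str) -> List[str]:
    return [part for part in s.split("\0") if part]


assert zsplit("M\0a.py\0R100\0old.py\0new.py\0") == [
    "M", "a.py", "R100", "old.py", "new.py"
]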
Example #23
def _get_head_findings(
        context: RunContext, extra_args: Sequence[str],
        targets: TargetFileManager) -> Tuple[FindingSets, RunStats]:
    """
    Gets findings for the project's HEAD git commit

    :param context: The Semgrep run context object
    :param extra_args: Extra arguments to pass to Semgrep
    :param targets: This run's target manager
    :return: A findings object with existing head findings and empty baseline findings
    """
    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " +
                   unit_len(paths, "file"),
                   err=True)

        for path in paths:
            debug_echo(f"searching {str(path)}")

        args = [
            "--skip-unknown-extensions",
            "--disable-nosem",
            "--json",
            "--autofix",
            "--dryrun",
            "--time",
            "--timeout-threshold",
            "3",
            *extra_args,
        ]
        exit_code, semgrep_output = invoke_semgrep(
            args,
            [str(p) for p in paths],
            timeout=context.timeout,
            explicit_semgrepignore_path=context.action_ignores_path,
        )
        findings = FindingSets(
            exit_code,
            searched_paths=set(targets.searched_paths),
            errors=semgrep_output.errors,
        )

        stats = RunStats(
            rule_list=semgrep_output.timing.rules,
            target_data=semgrep_output.timing.targets,
        )

        findings.current.update_findings(
            Finding.from_semgrep_result(result, context.committed_datetime)
            for result in semgrep_output.results
            if not result["extra"].get("is_ignored"))
        findings.ignored.update_findings(
            Finding.from_semgrep_result(result, context.committed_datetime)
            for result in semgrep_output.results
            if result["extra"].get("is_ignored"))
        if findings.errors:
            click.echo(
                f"| Semgrep exited with {unit_len(findings.errors, 'error')}:",
                err=True,
            )
            for e in findings.errors:
                for s in render_error(e):
                    click.echo(f"|    {s}", err=True)
        inventory_findings_len = sum(
            1 for finding in findings.current if finding.is_cai_finding())
        click.echo(
            f"| {unit_len(range(len(findings.current) - inventory_findings_len), 'current issue')} found",
            err=True,
        )
        if len(findings.ignored) > 0:
            click.echo(
                f"| {unit_len(findings.ignored, 'issue')} muted with nosemgrep comment (not counted as current)",
                err=True,
            )
    return findings, stats
Example #24
 def event(self) -> Dict[str, Any]:
     value = os.getenv("GITHUB_EVENT_PATH")
     if value:
         debug_echo(f"found github event data at {value}")
         return json.loads(Path(value).read_text())  # type: ignore
     return {}
Example #25
def invoke_semgrep(
    semgrep_args: List[str],
    targets: List[str],
    *,
    timeout: Optional[int],
    baseline: bool = False,
    explicit_semgrepignore_path: Optional[str] = None,
) -> Tuple[int, SemgrepOutput]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments
    Also, save semgrep output as a list of json blobs in SEMGREP_SAVE_FILE
    to help debugging. Baseline scan output will be saved separately with
    the "_baseline" suffix.

    Returns the maximum exit code across chunks and the combined SemgrepOutput
    """
    max_exit_code = 0
    output = SemgrepOutput([], [], SemgrepTiming([], []))
    _env = ({
        "SEMGREP_R2C_INTERNAL_EXPLICIT_SEMGREPIGNORE":
        explicit_semgrepignore_path,
        **os.environ,
    } if explicit_semgrepignore_path else os.environ)

    semgrep_save_file_baseline = Path(SEMGREP_SAVE_FILE_BASELINE)
    if not baseline and semgrep_save_file_baseline.exists():
        semgrep_save_file_baseline.unlink()

    semgrep_save_file_path = (SEMGREP_SAVE_FILE_BASELINE
                              if baseline else SEMGREP_SAVE_FILE)
    semgrep_save_file = open(semgrep_save_file_path, "w+")
    semgrep_save_file.write("[")

    first_chunk = True

    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend(["--debug"])
            args.extend([
                "-o",
                output_json_file.
                name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            debug_echo(f"== Invoking semgrep with { len(args) } args")

            exit_code = semgrep_exec(*args,
                                     _timeout=timeout,
                                     _err=debug_echo,
                                     _env=_env).exit_code
            max_exit_code = max(max_exit_code, exit_code)

            debug_echo(f"== Semgrep finished with exit code { exit_code }")

            with open(
                    output_json_file.
                    name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                semgrep_output = f.read()
            parsed_output = json.loads(semgrep_output)
            if first_chunk:
                first_chunk = False
            else:
                semgrep_save_file.write(",")
            semgrep_save_file.write(semgrep_output)

            output.results = [*output.results, *parsed_output["results"]]
            output.errors = [*output.errors, *parsed_output["errors"]]
            parsed_timing = parsed_output.get("time", {})
            output.timing = SemgrepTiming(
                parsed_timing.get("rules", output.timing.rules),
                [*output.timing.targets, *parsed_timing.get("targets", [])],
            )

    semgrep_save_file.write("]")
    semgrep_save_file.close()

    return max_exit_code, output
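
The first_chunk flag above implements manual JSON-array concatenation: chunk outputs are joined with commas between "[" and "]" so that SEMGREP_SAVE_FILE stays valid JSON. A compact demonstration:

import json

chunks = ['{"results": []}', '{"results": [1]}']
combined = "[" + ",".join(chunks) + "]"
assert json.loads(combined) == [{"results": []}, {"results": [1]}]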
Example #26
 def commit(self) -> gitpython.Commit:  # type: ignore
     commit = self.repo.commit(self.commit_sha)
     debug_echo(f"found commit: {commit!r}")
     return commit
Example #27
def _fix_head_for_github(
    base_ref_name: Optional[str] = None,
    head_ref: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """
    GHA can checkout the incorrect commit for a PR (it will create a fake merge commit),
    so we need to reset the head to the actual PR branch head before continuing.

    Note that this code is written in a generic manner, so that it becomes a no-op when
    the CI system has not artificially altered the HEAD ref.

    :return: The baseline ref as a commit hash
    """
    debug_echo(
        f"Called _fix_head_for_github with base_ref_name: {base_ref_name} head_ref: {head_ref}"
    )

    stashed_rev: Optional[str] = None
    base_ref: Optional[str] = base_ref_name

    if get_git_repo() is None:
        debug_echo("Yielding base_ref since get_git_repo was None")
        yield base_ref
        return

    if base_ref:
        # Preserve location of head^ after we possibly change location below
        try:
            debug_echo(f"Calling git rev-parse {base_ref}")
            process = git(["rev-parse", base_ref])
            base_ref = process.stdout.decode("utf-8").rstrip()
        except sh.ErrorReturnCode as ex:
            raise ActionFailure(
                f"There is a problem with your git project:{ex}")

    if head_ref:
        debug_echo("Calling git branch --show-current")
        stashed_rev = git(["branch",
                           "--show-current"]).stdout.decode("utf-8").rstrip()
        debug_echo(f"stashed_rev: {stashed_rev}")
        if not stashed_rev:
            debug_echo("Calling git rev-parse HEAD")
            rev_parse = git(["rev-parse", "HEAD"])
            debug_echo(rev_parse.stderr.decode("utf-8").rstrip())
            stashed_rev = rev_parse.stdout.decode("utf-8").rstrip()
            debug_echo(f"stashed_rev: {stashed_rev}")

        click.echo(f"| not on head ref {head_ref}; checking that out now...",
                   err=True)
        git.checkout([head_ref],
                     _timeout=GIT_SH_TIMEOUT,
                     _out=debug_echo,
                     _err=debug_echo)
        debug_echo(f"checked out {head_ref}")

    try:
        if base_ref is not None:
            merge_base = git("merge-base", base_ref, "HEAD").rstrip()
            # fmt:off
            click.echo("| reporting findings introduced by these commits:",
                       err=True)
            print_git_log(f"{merge_base}..HEAD")
            if merge_base != git("rev-parse", base_ref).rstrip():
                click.echo(
                    "| also reporting findings fixed by these commits from the baseline branch:",
                    err=True)
                print_git_log(f"{merge_base}..{base_ref}")
                click.echo("| to exclude these latter commits, run with",
                           err=True)
                click.echo(
                    f"|   --baseline-ref $(git merge-base {base_ref_name} HEAD)",
                    err=True)
            # fmt: on
        debug_echo(f"yielding {base_ref}")
        yield base_ref
    finally:
        if stashed_rev is not None:
            click.echo(f"| returning to original head revision {stashed_rev}",
                       err=True)
            git.checkout([stashed_rev], _timeout=GIT_SH_TIMEOUT)