def report_results(self, results: Results) -> None:
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    response: Optional["requests.Response"] = None

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/findings",
        json={
            "token": os.getenv("GITHUB_TOKEN"),
            "findings": [
                finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                for finding in results.findings.new
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../findings responded: {response!r}")
    try:
        response.raise_for_status()

        errors = response.json()["errors"]
        for error in errors:
            message = error["message"]
            click.echo(f"Server returned following warning: {message}", err=True)
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
        json={
            "findings": [
                finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                for finding in results.findings.ignored
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../ignores responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    # mark as complete
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={"exit_code": -1, "stats": results.stats},
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        )
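# For reference, a minimal sketch of the `debug_echo` helper these functions
# rely on. Its real definition lives elsewhere in this codebase; the body and
# the environment variable name below are assumptions, shown only to make the
# call sites readable:
import os

import click


def debug_echo(text: str) -> None:
    """Print debug output to stderr when debugging is enabled (sketch)."""
    if os.getenv("SEMGREP_AGENT_DEBUG"):  # env var name is an assumption
        click.echo(text, err=True)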
def _baseline_context(self) -> Iterator[None]:
    """
    Runs a block of code on files from the current branch HEAD.

    :raises ActionFailure: If git cannot detect a HEAD commit
    :raises ActionFailure: If unmerged files are detected
    """
    repo = get_git_repo()

    if not repo:
        yield
        return

    self._abort_if_dirty()

    current_tree = git("write-tree").stdout.decode().strip()
    try:
        for a in self._status.added:
            a.unlink()
        git.checkout(self._base_commit, "--", ".")
        yield
    finally:
        # git checkout will fail if the checked-out index deletes all files in the repo
        # In this case, we still want to continue without error.
        # Note that we have no good way of detecting this issue without inspecting the checkout output
        # message, which means we are fragile with respect to git version here.
        try:
            git.checkout(current_tree.strip(), "--", ".")
        except sh.ErrorReturnCode as error:
            output = error.stderr.decode()
            if (
                output
                and len(output) >= 2
                and "pathspec '.' did not match any file(s) known to git"
                in output.strip()
            ):
                debug_echo(
                    "Restoring git index failed due to total repository deletion; skipping checkout"
                )
            else:
                raise ActionFailure(
                    f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                )

        if self._status.removed:
            # Need to check if file exists since it is possible file was deleted
            # in both the base and head
            git.rm("-f", *(str(r) for r in self._status.removed if r.exists()))
def _get_findings(context: RunContext) -> Tuple[FindingSets, RunStats]:
    """
    Gets head and baseline findings for this run

    :param context: This scan's run context object
    :return: This project's findings
    """
    debug_echo("=== adding semgrep configuration")

    rewrite_args: Sequence[str] = (
        [] if context.rewrite_rule_ids else ["--no-rewrite-rule-ids"]
    )
    metrics_args: Sequence[str] = ["--enable-metrics"] if context.enable_metrics else []

    with _fix_head_for_github(context.base_ref, context.head_ref) as base_ref:
        workdir = Path.cwd()
        debug_echo(f"Workdir: {str(workdir)}")
        targets = TargetFileManager(
            base_path=workdir,
            base_commit=base_ref,
            all_paths=[workdir],
        )
        debug_echo("Initialized TargetFileManager")

        config_args = []
        # Keep track of which config specifiers are local files/dirs
        local_configs: Set[str] = set()
        for conf in context.config_specifier:
            if Path(conf).exists():
                local_configs.add(conf)
            config_args.extend(["--config", conf])
        debug_echo("=== seeing if there are any findings")

        findings, stats = _get_head_findings(
            context, [*config_args, *metrics_args, *rewrite_args], targets
        )

        _update_baseline_findings(context, findings, local_configs, rewrite_args, targets)

        if os.getenv("INPUT_GENERATESARIF"):
            click.echo("=== re-running scan to generate a SARIF report", err=True)
            sarif_path = Path("semgrep.sarif")
            with targets.current_paths() as paths:
                args = [*rewrite_args, *config_args]
                _, sarif_output = invoke_semgrep_sarif(
                    args,
                    [str(p) for p in paths],
                    timeout=context.timeout,
                    explicit_semgrepignore_path=context.action_ignores_path,
                )
            rewrite_sarif_file(sarif_output, sarif_path)

    return findings, stats
def expand_directives(self, line: str) -> Iterable[str]:
    """Load :include files"""
    if line.startswith(":include "):
        include_path = self.base_path / line[9:]
        if include_path.is_file():
            with include_path.open() as include_lines:
                sub_base = include_path.parent.resolve()
                sub_parser = Parser(sub_base)
                return sub_parser.parse(include_lines)
        else:
            debug_echo(
                f"Skipping `:include {include_path}` directive, file not found"
            )
            return []
    elif CONTROL_REGEX.match(line):
        raise ActionFailure(
            f"Unknown ignore directive in Semgrep ignore file at {self.base_path}: '{line}'"
        )
    else:
        return (line for _ in range(1))  # yield the line itself, unchanged
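# A hedged usage sketch for the :include expansion above. `Parser` is the
# class this method belongs to; the file layout and the shape of the `parse`
# call are assumptions inferred from how `expand_directives` recurses:
#
#   .semgrepignore   contains the line ":include .gitignore"
#   .gitignore       contains "node_modules/"
#
# parser = Parser(Path("."))
# with open(".semgrepignore") as lines:
#     patterns = parser.parse(lines)  # would yield "node_modules/" via :include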
def fetch_rules_text(self) -> str:
    """Get a YAML string with the configured semgrep rules in it."""
    if not self.scan.is_loaded:
        raise ActionFailure(
            f"The API server at {self.url} is not working properly. "
            f"Please contact {constants.SUPPORT_EMAIL} for assistance."
        )

    response = self.session.get(
        f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
        timeout=30,
    )
    debug_echo(f"=== GET .../rules.yaml responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}\n"
            "Failed to get configured rules"
        )
    else:
        return response.text
def fetch_rules_text(self) -> str:
    """Get a YAML string with the configured semgrep rules in it."""
    response = self.session.get(
        f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
        timeout=30,
    )
    debug_echo(f"=== GET .../rules.yaml responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}\n"
            "Failed to get configured rules"
        )

    # Can remove once server guarantees will always have at least one rule
    parsed = yaml.load(response.text)
    if not parsed["rules"]:
        raise ActionFailure("No rules returned by server for this scan.")
    else:
        return response.text
def report_results(self, results: Results) -> None:
    if not self.is_configured or not self.scan.is_loaded:
        debug_echo("=== no semgrep app config, skipping report_results")
        return
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    response: Optional["requests.Response"] = None

    # report findings
    for chunk in chunked_iter(results.findings.new, 10_000):
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/findings",
            json={
                "token": os.getenv("GITHUB_TOKEN"),
                "findings": [
                    finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                    for finding in chunk
                ],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../findings responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")
def _find_branchoff_point(self, attempt_count: int = 0) -> str:
    fetch_depth = 4 ** attempt_count  # fetch 4, 16, 64, 256, 1024, ...
    if attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT:  # get all commits on last try
        fetch_depth = 2 ** 31 - 1  # git expects a signed 32-bit integer

    if attempt_count:  # skip fetching on first try
        debug_echo(
            f"fetching {fetch_depth} commits to find branch-off point of pull request"
        )
        git.fetch(
            "origin",
            "--depth",
            fetch_depth,
            self.base_branch_tip,
            _timeout=GIT_SH_TIMEOUT,
        )
        git.fetch(
            "origin", "--depth", fetch_depth, self.head_ref, _timeout=GIT_SH_TIMEOUT
        )

    try:  # check if both branches connect to the yet-unknown branch-off point now
        process = git("merge-base", self.base_branch_tip, self.head_ref)
    except sh.ErrorReturnCode as error:
        output = error.stderr.decode().strip()
        if (
            output  # output is empty when unable to find branch-off point
            and "Not a valid " not in output  # the error when a ref is missing
        ):
            exit_with_sh_error(error)

        if attempt_count >= self.MAX_FETCH_ATTEMPT_COUNT:
            raise ActionFailure(
                "Could not find branch-off point between "
                f"the baseline tip {self.base_branch_tip} and current head '{self.head_ref}' "
            )

        return self._find_branchoff_point(attempt_count + 1)
    else:
        return process.stdout.decode().strip()
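# The deepening schedule above is plain arithmetic; a quick illustration.
# The value of MAX_FETCH_ATTEMPT_COUNT below is assumed for illustration only;
# the real constant is defined elsewhere in the class:
MAX_FETCH_ATTEMPT_COUNT = 5  # hypothetical value

depths = [4 ** n for n in range(1, MAX_FETCH_ATTEMPT_COUNT)]
print(depths)  # [4, 16, 64, 256] -- attempt 0 skips fetching entirely
print(2 ** 31 - 1)  # 2147483647, the "fetch everything" depth on the final try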
def report_failure(self, error: SemgrepError) -> int:
    """
    Send semgrep cli non-zero exit code information to server
    and return what exit code semgrep should exit with.
    """
    debug_echo("=== sending failure information to semgrep app")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/error",
        json={
            "exit_code": error.exit_code,
            "stderr": error.stderr,
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../error responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    exit_code = int(response.json()["exit_code"])
    return exit_code
def report_start(self, meta: GitMeta) -> None:
    """
    Get scan id and file ignores

    returns name of policy used to scan
    """
    debug_echo(f"=== reporting start to semgrep app at {self.url}")

    response = self.session.post(
        f"{self.url}/api/agent/deployment/{self.deployment_id}/scan",
        json={"meta": meta.to_dict()},
        timeout=30,
    )
    debug_echo(f"=== POST .../scan responded: {response!r}")
    if response.status_code == 404:
        raise ActionFailure(
            "Failed to create a scan with given token and deployment_id. "
            "Please make sure they have been set correctly. "
            f"API server at {self.url} returned this response: {response.text}"
        )

    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        )
    else:
        body = response.json()
        self.scan = Scan(
            id=glom(body, T["scan"]["id"]),
            ignore_patterns=glom(body, T["scan"]["meta"].get("ignored_files", [])),
            policy_list=glom(body, T["policy"]),
            autofix=glom(body, T.get("autofix", False)),
        )
        debug_echo(f"=== Our scan object is: {self.scan!r}")
def _get_path_lists(self) -> List[Path]:
    """
    Return list of paths to analyze, relative to the base path
    """
    debug_echo("Getting path list")

    # resolve given paths relative to current working directory
    debug_echo(f"resolving all_paths: {self._all_paths}")
    paths = [p.resolve() for p in self._all_paths]

    if self._base_commit is not None:
        debug_echo(f"- base_commit is {self._base_commit}")
        paths = [
            a
            for a in (self._status.added + self._status.modified)
            # diff_path is a subpath of some element of input_paths
            if any((a == path or path in a.parents) for path in paths)
        ]
        changed_count = len(paths)
        click.echo(f"| looking at {unit_len(paths, 'changed path')}", err=True)
        repo = get_git_repo()
        debug_echo("Got git repo")
        submodules = repo.submodules  # type: ignore
        debug_echo(f"Resolving submodules {submodules}")
        submodule_paths = [
            self._fname_to_path(repo, submodule.path) for submodule in submodules
        ]
        paths = [
            path
            for path in paths
            if all(
                submodule_path not in path.parents
                for submodule_path in submodule_paths
            )
        ]
        if len(paths) != changed_count:
            click.echo(
                f"| skipping files in {unit_len(submodule_paths, 'submodule')}: "
                + ", ".join(str(path) for path in submodule_paths),
                err=True,
            )

    debug_echo("Finished initializing path list")

    return [path.relative_to(self._base_path) for path in paths]
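# For context, minimal sketches of the `unit_len` and `cardinalize` helpers
# used in the progress messages throughout this code. These bodies are
# assumptions that match how the call sites read (e.g. "3 changed paths",
# "1 file"); the real helpers may handle irregular plurals:
from typing import Sized


def cardinalize(word: str, count: int) -> str:
    """Naively pluralize `word` when `count` != 1 (sketch)."""
    return word if count == 1 else f"{word}s"


def unit_len(items: Sized, unit: str) -> str:
    """Render a sized collection as '<N> <unit(s)>' (sketch)."""
    count = len(items)
    return f"{count} {cardinalize(unit, count)}"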
@dataclass
class Sapp:
    url: str
    token: str
    deployment_id: int
    scan: Scan = Scan()
    is_configured: bool = False
    session: requests.Session = field(init=False)

    def __post_init__(self) -> None:
        # Get deployment from token
        if self.token and self.deployment_id:
            self.is_configured = True
        self.session = requests.Session()
        self.session.headers["Authorization"] = f"Bearer {self.token}"

    def report_start(self, meta: GitMeta) -> None:
        if not self.is_configured:
            debug_echo("=== no semgrep app config, skipping report_start")
            return
        debug_echo(f"=== reporting start to semgrep app at {self.url}")

        response = self.session.post(
            f"{self.url}/api/agent/deployment/{self.deployment_id}/scan",
            json={"meta": meta.to_dict()},
            timeout=30,
        )
        debug_echo(f"=== POST .../scan responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
        else:
            body = response.json()
            self.scan = Scan(
                id=glom(body, T["scan"]["id"]),
                config=glom(body, T["scan"]["meta"].get("config")),
                ignore_patterns=glom(body, T["scan"]["meta"].get("ignored_files", [])),
            )
            debug_echo(f"=== Our scan object is: {self.scan!r}")

    def fetch_rules_text(self) -> str:
        """Get a YAML string with the configured semgrep rules in it."""
        if not self.scan.is_loaded:
            raise ActionFailure(
                f"The API server at {self.url} is not working properly. "
                f"Please contact {constants.SUPPORT_EMAIL} for assistance."
            )

        response = self.session.get(
            f"{self.url}/api/agent/scan/{self.scan.id}/rules.yaml",
            timeout=30,
        )
        debug_echo(f"=== GET .../rules.yaml responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}\n"
                "Failed to get configured rules"
            )
        else:
            return response.text

    def download_rules(self) -> Path:
        """Save the rules configured on semgrep app to a temporary file"""
        # hey, it's just a tiny YAML file in CI, we'll survive without cleanup
        rules_file = tempfile.NamedTemporaryFile(suffix=".yml", delete=False)  # nosem
        rules_path = Path(rules_file.name)
        rules_path.write_text(self.fetch_rules_text())
        return rules_path

    def report_results(self, results: Results) -> None:
        if not self.is_configured or not self.scan.is_loaded:
            debug_echo("=== no semgrep app config, skipping report_results")
            return
        debug_echo(f"=== reporting results to semgrep app at {self.url}")

        response: Optional["requests.Response"] = None

        # report findings
        for chunk in chunked_iter(results.new, 10_000):
            response = self.session.post(
                f"{self.url}/api/agent/scan/{self.scan.id}/findings",
                json=[
                    finding.to_dict(omit=constants.PRIVACY_SENSITIVE_FIELDS)
                    for finding in chunk
                ],
                timeout=30,
            )
            debug_echo(f"=== POST .../findings responded: {response!r}")
            try:
                response.raise_for_status()
            except requests.RequestException:
                raise ActionFailure(f"API server returned this error: {response.text}")

        # mark as complete
        response = self.session.post(
            f"{self.url}/api/agent/scan/{self.scan.id}/complete",
            json={"exit_code": -1, "stats": results.stats},
            timeout=30,
        )
        debug_echo(f"=== POST .../complete responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(
                f"API server at {self.url} returned this error: {response.text}"
            )
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]

    debug_echo("=== seeing if there are any findings")
    finding_set = FindingSets()

    with targets.current_paths() as paths:
        click.echo(
            "=== looking for current issues in " + unit_len(paths, "file"), err=True
        )
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--skip-unknown-extensions", "--json", *config_args]
            for path in chunk:
                args.append(path)
            count = 0
            for result in json.loads(str(semgrep(*args)))["results"]:
                finding_set.update_current(result, committed_datetime)
                count += 1
            click.echo(
                f"| {count} {cardinalize('current issue', count)} found", err=True
            )

    if not finding_set.has_current_issues():
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            if paths:
                paths_with_findings = finding_set.paths_with_current_findings()
                paths_to_check = set(str(path) for path in paths) & paths_with_findings
                click.echo(
                    "=== looking for pre-existing issues in "
                    + unit_len(paths_to_check, "file"),
                    err=True,
                )
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = ["--skip-unknown-extensions", "--json", *config_args]
                    for path in chunk:
                        args.append(path)
                    count = 0
                    for result in json.loads(str(semgrep(*args)))["results"]:
                        finding_set.update_baseline(result, committed_datetime)
                        count += 1
                    click.echo(
                        f"| {count} {cardinalize('pre-existing issue', count)} found",
                        err=True,
                    )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open("w") as sarif_file:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return finding_set
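# `chunked_iter` appears to come from boltons (boltons.iterutils) and simply
# yields fixed-size chunks of its input. A quick demonstration of how it keeps
# each semgrep invocation's argument list bounded below OS command-length
# limits (file names are hypothetical):
from boltons.iterutils import chunked_iter

for chunk in chunked_iter(["a.py", "b.py", "c.py", "d.py", "e.py"], 2):
    print(chunk)
# ['a.py', 'b.py']
# ['c.py', 'd.py']
# ['e.py']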
def get_findings(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    head_ref: Optional[str],
    semgrep_ignore: TextIO,
    uses_managed_policy: bool,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    with fix_head_for_github(base_commit_ref, head_ref) as base_ref:
        workdir = Path.cwd()
        targets = TargetFileManager(
            base_path=workdir,
            base_commit=base_ref,
            paths=[workdir],
            ignore_rules_file=semgrep_ignore,
        )

        config_args = ["--config", config_specifier]
        rewrite_args = ["--no-rewrite-rule-ids"] if uses_managed_policy else []

        debug_echo("=== seeing if there are any findings")
        findings = FindingSets()

        with targets.current_paths() as paths:
            click.echo(
                "=== looking for current issues in " + unit_len(paths, "file"),
                err=True,
            )
            args = [
                "--skip-unknown-extensions",
                "--disable-nosem",
                "--json",
                *rewrite_args,
                *config_args,
            ]
            semgrep_results = invoke_semgrep(args, [str(p) for p in paths])["results"]
            findings.current.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if not result["extra"].get("is_ignored")
            )
            findings.ignored.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if result["extra"].get("is_ignored")
            )
            click.echo(
                f"| {unit_len(findings.current, 'current issue')} found", err=True
            )
            click.echo(
                f"| {unit_len(findings.ignored, 'ignored issue')} found",
                err=True,
            )

        if not findings.current:
            click.echo(
                "=== not looking at pre-existing issues since there are no current issues",
                err=True,
            )
        else:
            with targets.baseline_paths() as paths:
                paths_with_findings = {finding.path for finding in findings.current}
                paths_to_check = list(
                    set(str(path) for path in paths) & paths_with_findings
                )
                if not paths_to_check:
                    click.echo(
                        "=== not looking at pre-existing issues since all files with current issues are newly created",
                        err=True,
                    )
                else:
                    click.echo(
                        "=== looking for pre-existing issues in "
                        + unit_len(paths_to_check, "file"),
                        err=True,
                    )
                    args = [
                        "--skip-unknown-extensions",
                        "--json",
                        *rewrite_args,
                        *config_args,
                    ]
                    semgrep_results = invoke_semgrep(args, paths_to_check)["results"]
                    findings.baseline.update_findings(
                        Finding.from_semgrep_result(result, committed_datetime)
                        for result in semgrep_results
                    )
                    click.echo(
                        f"| {unit_len(findings.baseline, 'pre-existing issue')} found",
                        err=True,
                    )

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open("w") as sarif_file:
            args = ["--sarif", *rewrite_args, *config_args]
            for path in paths:
                args.extend(["--include", str(path)])
            semgrep_exec(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
def repo(self) -> gitpython.Repo:  # type: ignore
    repo = gitpython.Repo()
    debug_echo(f"found repo: {repo!r}")
    return repo
def report_results(self, results: Results) -> None:
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()
    if "pr-comment-autofix" in os.getenv("SEMGREP_AGENT_OPT_IN_FEATURES", ""):
        fields_to_omit.remove("fixed_lines")

    response: Optional["requests.Response"] = None

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/findings",
        json={
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "findings": [
                finding.to_dict(omit=fields_to_omit)
                for finding in results.findings.new
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../findings responded: {response!r}")
    try:
        response.raise_for_status()

        errors = response.json()["errors"]
        for error in errors:
            message = error["message"]
            click.echo(f"Server returned following warning: {message}", err=True)
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
        json={
            "findings": [finding.to_dict() for finding in results.findings.ignored],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../ignores responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    # mark as complete
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={"exit_code": -1, "stats": results.stats},
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        )
                        omit=constants.PRIVACY_SENSITIVE_FIELDS)
                    for finding in chunk
                ],
            },
            timeout=30,
        )
        debug_echo(f"=== POST .../ignores responded: {response!r}")
        try:
            response.raise_for_status()
        except requests.RequestException:
            raise ActionFailure(f"API server returned this error: {response.text}")

    # mark as complete
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={"exit_code": -1, "stats": results.stats},
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        )
def report_results(
    self, results: Results, rule_ids: Sequence[str], cai_ids: Sequence[str]
) -> None:
    debug_echo(f"=== reporting results to semgrep app at {self.url}")

    fields_to_omit = constants.PRIVACY_SENSITIVE_FIELDS.copy()
    if self.scan.autofix:
        fields_to_omit.remove("fixed_lines")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/findings",
        json={
            # send a backup token in case the app is not available
            "token": os.getenv("GITHUB_TOKEN"),
            "gitlab_token": os.getenv("GITLAB_TOKEN"),
            "findings": [
                finding.to_dict(omit=fields_to_omit)
                for finding in results.findings.new
            ],
            "searched_paths": [str(p) for p in results.findings.searched_paths],
            "rule_ids": rule_ids,
            "cai_ids": cai_ids,
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../findings responded: {response!r}")
    try:
        response.raise_for_status()

        errors = response.json()["errors"]
        for error in errors:
            message = error["message"]
            click.echo(f"Server returned following warning: {message}", err=True)
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/ignores",
        json={
            "findings": [
                finding.to_dict() for finding in results.findings.new_ignored
            ],
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../ignores responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(f"API server returned this error: {response.text}")

    # mark as complete
    # In order to not overload our app database, we truncate target stats to the
    # 20 heaviest hitters. This adds approximately 80 kB of database load per
    # scan when using p/ci.
    response = self.session.post(
        f"{self.url}/api/agent/scan/{self.scan.id}/complete",
        json={
            "exit_code": results.findings.max_exit_code,
            "stats": results.stats(n_heavy_targets=20),
        },
        timeout=30,
    )
    debug_echo(f"=== POST .../complete responded: {response!r}")
    try:
        response.raise_for_status()
    except requests.RequestException:
        raise ActionFailure(
            f"API server at {self.url} returned this error: {response.text}"
        )
def _baseline_context(self) -> Iterator[None]:
    """
    Runs a block of code on files from the current branch HEAD.

    :raises ActionFailure: If git cannot detect a HEAD commit
    :raises ActionFailure: If unmerged files are detected
    """
    repo = get_git_repo()

    if not repo:
        yield
        return

    self._abort_on_pending_changes()
    self._abort_on_conflicting_untracked_paths()

    debug_echo("Running git write-tree")
    current_tree = git("write-tree").stdout.decode().strip()
    try:
        for a in self._status.added:
            try:
                a.unlink()
            except FileNotFoundError:
                click.echo(f"| {a} was not found when trying to delete", err=True)

        debug_echo("Running git checkout for baseline context")
        git.checkout(self._base_commit, "--", ".", _timeout=GIT_SH_TIMEOUT)
        debug_echo("Finished git checkout for baseline context")
        yield
    finally:
        # git checkout will fail if the checked-out index deletes all files in the repo
        # In this case, we still want to continue without error.
        # Note that we have no good way of detecting this issue without inspecting the checkout output
        # message, which means we are fragile with respect to git version here.
        try:
            debug_echo("Running git checkout to return original context")
            git.checkout(current_tree.strip(), "--", ".", _timeout=GIT_SH_TIMEOUT)
            debug_echo("Finished git checkout to return original context")
        except sh.ErrorReturnCode as error:
            output = error.stderr.decode()
            if (
                output
                and len(output) >= 2
                and "pathspec '.' did not match any file(s) known to git"
                in output.strip()
            ):
                debug_echo(
                    "Restoring git index failed due to total repository deletion; skipping checkout"
                )
            else:
                raise ActionFailure(
                    f"Fatal error restoring Git state; please restore your repository state manually:\n{output}"
                )

        if self._status.removed:
            # Need to check if file exists since it is possible file was deleted
            # in both the base and head. Only call if there are files to delete
            to_remove = [r for r in self._status.removed if r.exists()]
            if to_remove:
                debug_echo("Running git rm")
                git.rm("-f", *(str(r) for r in to_remove), _timeout=GIT_SH_TIMEOUT)
                debug_echo("finished git rm")
def _fname_to_path(self, repo: "gitpython.Repo", fname: str) -> Path:  # type: ignore
    debug_echo(f"_fname_to_path: root: {repo.working_tree_dir} fname: {fname}")
    return (Path(repo.working_tree_dir) / fname).resolve()
def repo(self) -> gitpython.Repo:  # type: ignore
    repo = gitpython.Repo(".", search_parent_directories=True)
    debug_echo(f"found repo: {repo!r}")
    return repo
def get_git_status(self) -> GitStatus:
    """
    Returns Absolute Paths to all files that are staged

    Ignores files that are symlinks to directories
    """
    import gitdb.exc  # type: ignore

    repo = get_git_repo()

    if not repo or self._base_commit is None:
        return GitStatus([], [], [], [])

    try:
        repo.rev_parse(self._base_commit)
    except gitdb.exc.BadName:
        raise ActionFailure(f"Unknown git ref '{self._base_commit}'")

    # Output of git command will be relative to git project root
    status_output = zsplit(
        git.diff(
            "--cached",
            "--name-status",
            "--no-ext-diff",
            "-z",
            "--diff-filter=ACDMRTUXB",
            "--ignore-submodules",
            self._base_commit,
        ).stdout.decode()
    )

    added = []
    modified = []
    removed = []
    unmerged = []
    while status_output:
        code = status_output[0]
        fname = status_output[1]
        trim_size = 2

        if not code.strip() or code == StatusCode.Untracked or code == StatusCode.Ignored:
            # trim before continuing so the loop always advances
            # (a bare `continue` here would loop forever)
            status_output = status_output[trim_size:]
            continue

        resolved_name = self._fname_to_path(repo, fname)

        # If file is symlink to directory, skip
        absolute_name = Path(repo.working_tree_dir) / fname
        if absolute_name.is_symlink() and resolved_name.is_dir():
            click.echo(
                f"| Skipping {absolute_name} since it is a symlink to a directory: {resolved_name}",
                err=True,
            )
        else:
            # The following detection for unmerged codes comes from `man git-status`
            if code == StatusCode.Unmerged:
                unmerged.append(resolved_name)
            if code[0] == StatusCode.Renamed:  # code is RXXX, where XXX is percent similarity
                removed.append(resolved_name)
                fname = status_output[2]
                trim_size += 1
                # resolve the rename's *new* name before recording it as added
                added.append(self._fname_to_path(repo, fname))
            if code == StatusCode.Added:
                added.append(resolved_name)
            if code == StatusCode.Modified:
                modified.append(resolved_name)
            if code == StatusCode.Deleted:
                removed.append(resolved_name)

        status_output = status_output[trim_size:]

    debug_echo(
        f"Git status:\nadded: {added}\nmodified: {modified}\nremoved: {removed}\nunmerged: {unmerged}"
    )

    return GitStatus(added, modified, removed, unmerged)
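# `zsplit` is referenced above but defined elsewhere. A minimal sketch, under
# the assumption that it splits NUL-delimited output of the kind `git diff -z`
# produces, dropping any trailing empty entry:
from typing import List


def zsplit(s: str) -> List[str]:
    """Split a string on NUL bytes (sketch of the helper used above)."""
    s = s.strip("\0")
    return s.split("\0") if s else []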
def _get_head_findings(
    context: RunContext, extra_args: Sequence[str], targets: TargetFileManager
) -> Tuple[FindingSets, RunStats]:
    """
    Gets findings for the project's HEAD git commit

    :param context: The Semgrep run context object
    :param extra_args: Extra arguments to pass to Semgrep
    :param targets: This run's target manager
    :return: A findings object with existing head findings and empty baseline findings
    """
    with targets.current_paths() as paths:
        click.echo(
            "=== looking for current issues in " + unit_len(paths, "file"), err=True
        )
        for path in paths:
            debug_echo(f"searching {str(path)}")

        args = [
            "--skip-unknown-extensions",
            "--disable-nosem",
            "--json",
            "--autofix",
            "--dryrun",
            "--time",
            "--timeout-threshold",
            "3",
            *extra_args,
        ]
        exit_code, semgrep_output = invoke_semgrep(
            args,
            [str(p) for p in paths],
            timeout=context.timeout,
            explicit_semgrepignore_path=context.action_ignores_path,
        )
        findings = FindingSets(
            exit_code,
            searched_paths=set(targets.searched_paths),
            errors=semgrep_output.errors,
        )
        stats = RunStats(
            rule_list=semgrep_output.timing.rules,
            target_data=semgrep_output.timing.targets,
        )

        findings.current.update_findings(
            Finding.from_semgrep_result(result, context.committed_datetime)
            for result in semgrep_output.results
            if not result["extra"].get("is_ignored")
        )
        findings.ignored.update_findings(
            Finding.from_semgrep_result(result, context.committed_datetime)
            for result in semgrep_output.results
            if result["extra"].get("is_ignored")
        )

        if findings.errors:
            click.echo(
                f"| Semgrep exited with {unit_len(findings.errors, 'error')}:",
                err=True,
            )
            for e in findings.errors:
                for s in render_error(e):
                    click.echo(f"| {s}", err=True)

        inventory_findings_len = 0
        for finding in findings.current:
            if finding.is_cai_finding():
                inventory_findings_len += 1
        click.echo(
            f"| {unit_len(range(len(findings.current) - inventory_findings_len), 'current issue')} found",
            err=True,
        )
        if len(findings.ignored) > 0:
            click.echo(
                f"| {unit_len(findings.ignored, 'issue')} muted with nosemgrep comment (not counted as current)",
                err=True,
            )

    return findings, stats
def event(self) -> Dict[str, Any]:
    value = os.getenv("GITHUB_EVENT_PATH")
    if value:
        debug_echo(f"found github event data at {value}")
        return json.loads(Path(value).read_text())  # type: ignore
    return {}
def invoke_semgrep(
    semgrep_args: List[str],
    targets: List[str],
    *,
    timeout: Optional[int],
    baseline: bool = False,
    explicit_semgrepignore_path: Optional[str] = None,
) -> Tuple[int, SemgrepOutput]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments

    Also, save semgrep output as a list of json blobs in SEMGREP_SAVE_FILE to help debugging.
    Baseline scan output will be saved separately with the "_baseline" suffix.

    Returns json output of semgrep as dict object
    """
    max_exit_code = 0
    output = SemgrepOutput([], [], SemgrepTiming([], []))

    _env = (
        {
            "SEMGREP_R2C_INTERNAL_EXPLICIT_SEMGREPIGNORE": explicit_semgrepignore_path,
            **os.environ,
        }
        if explicit_semgrepignore_path
        else os.environ
    )

    semgrep_save_file_baseline = Path(SEMGREP_SAVE_FILE_BASELINE)
    if not baseline and semgrep_save_file_baseline.exists():
        semgrep_save_file_baseline.unlink()

    semgrep_save_file_path = (
        SEMGREP_SAVE_FILE_BASELINE if baseline else SEMGREP_SAVE_FILE
    )
    semgrep_save_file = open(semgrep_save_file_path, "w+")
    semgrep_save_file.write("[")

    first_chunk = True

    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend(["--debug"])
            args.extend([
                "-o",
                output_json_file.name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            debug_echo(f"== Invoking semgrep with {len(args)} args")
            exit_code = semgrep_exec(
                *args, _timeout=timeout, _err=debug_echo, _env=_env
            ).exit_code
            max_exit_code = max(max_exit_code, exit_code)
            debug_echo(f"== Semgrep finished with exit code {exit_code}")

            with open(
                output_json_file.name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                semgrep_output = f.read()

            parsed_output = json.loads(semgrep_output)
            if first_chunk:
                first_chunk = False
            else:
                semgrep_save_file.write(",")
            semgrep_save_file.write(semgrep_output)

            output.results = [*output.results, *parsed_output["results"]]
            output.errors = [*output.errors, *parsed_output["errors"]]
            parsed_timing = parsed_output.get("time", {})
            output.timing = SemgrepTiming(
                parsed_timing.get("rules", output.timing.rules),
                [*output.timing.targets, *parsed_timing.get("targets", [])],
            )

    semgrep_save_file.write("]")
    semgrep_save_file.close()

    return max_exit_code, output
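# The debug save file written above is a JSON array assembled by hand: "[" plus
# the raw JSON blob of each chunk's run, comma-separated, plus "]". For two
# chunks it would look like this (schematic, field contents elided):
#
#   [{"results": [...], "errors": [...], "time": {...}},
#    {"results": [...], "errors": [...], "time": {...}}]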
def commit(self) -> gitpython.Commit:  # type: ignore
    commit = self.repo.commit(self.commit_sha)
    debug_echo(f"found commit: {commit!r}")
    return commit
def _fix_head_for_github(
    base_ref_name: Optional[str] = None,
    head_ref: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """
    GHA can checkout the incorrect commit for a PR (it will create a fake merge commit),
    so we need to reset the head to the actual PR branch head before continuing.

    Note that this code is written in a generic manner, so that it becomes a no-op when
    the CI system has not artificially altered the HEAD ref.

    :return: The baseline ref as a commit hash
    """
    debug_echo(
        f"Called _fix_head_for_github with base_ref_name: {base_ref_name} head_ref: {head_ref}"
    )

    stashed_rev: Optional[str] = None
    base_ref: Optional[str] = base_ref_name

    if get_git_repo() is None:
        debug_echo("Yielding base_ref since get_git_repo was None")
        yield base_ref
        return

    if base_ref:
        # Preserve location of head^ after we possibly change location below
        try:
            debug_echo(f"Calling git rev-parse {base_ref}")
            process = git(["rev-parse", base_ref])
            base_ref = process.stdout.decode("utf-8").rstrip()
        except sh.ErrorReturnCode as ex:
            raise ActionFailure(f"There is a problem with your git project:{ex}")

    if head_ref:
        debug_echo("Calling git branch --show-current")
        stashed_rev = git(["branch", "--show-current"]).stdout.decode("utf-8").rstrip()
        debug_echo(f"stashed_rev: {stashed_rev}")
        if not stashed_rev:
            debug_echo("Calling git rev-parse HEAD")
            rev_parse = git(["rev-parse", "HEAD"])
            debug_echo(rev_parse.stderr.decode("utf-8").rstrip())
            stashed_rev = rev_parse.stdout.decode("utf-8").rstrip()
            debug_echo(f"stashed_rev: {stashed_rev}")

        click.echo(f"| not on head ref {head_ref}; checking that out now...", err=True)
        git.checkout([head_ref], _timeout=GIT_SH_TIMEOUT, _out=debug_echo, _err=debug_echo)
        debug_echo(f"checked out {head_ref}")

    try:
        if base_ref is not None:
            merge_base = git("merge-base", base_ref, "HEAD").rstrip()
            # fmt:off
            click.echo("| reporting findings introduced by these commits:", err=True)
            print_git_log(f"{merge_base}..HEAD")
            if merge_base != git("rev-parse", base_ref).rstrip():
                click.echo(
                    "| also reporting findings fixed by these commits from the baseline branch:",
                    err=True,
                )
                print_git_log(f"{merge_base}..{base_ref}")
                click.echo("| to exclude these latter commits, run with", err=True)
                click.echo(
                    f"|   --baseline-ref $(git merge-base {base_ref_name} HEAD)",
                    err=True,
                )
            # fmt: on

        debug_echo(f"yielding {base_ref}")
        yield base_ref
    finally:
        if stashed_rev is not None:
            click.echo(f"| returning to original head revision {stashed_rev}", err=True)
            git.checkout([stashed_rev], _timeout=GIT_SH_TIMEOUT)
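# `_fix_head_for_github` is consumed as a context manager (see the
# `with _fix_head_for_github(...) as base_ref:` call sites above), which
# implies an @contextlib.contextmanager decorator on the real definition.
# A minimal usage sketch; the ref names and the `run_scan` caller are
# hypothetical:
#
# with _fix_head_for_github("main", "refs/pull/123/head") as base_ref:
#     # HEAD now points at the true PR head; base_ref is the resolved
#     # baseline commit hash (or None when no base was given)
#     run_scan(base_ref)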