def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    Dict[Any, Any],
]:
    from itertools import chain
    from collections import defaultdict

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                output_json = self._parse_core_output(
                    out_bytes, err_bytes, returncode
                )

                if returncode != 0:
                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(output_json, [], rule)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

            # end with tempfile.NamedTemporaryFile(...) ...

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
        # end for rule, language ...

    return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
def run_spacegrep(
    rule_id: str, patterns: List[Pattern], targets: List[Path], timeout: int
) -> dict:
    matches: List[dict] = []
    errors: List[dict] = []
    for pattern in patterns:
        if not isinstance(pattern._pattern, str):
            raise NotImplementedError(
                f"Support for {type(pattern._pattern)} has not been implemented yet."
            )
        pattern_str = pattern._pattern  # TODO: Handle pattern Dict
        for target in targets:
            cmd = [
                SPACEGREP_PATH,
                "--output-format",
                "semgrep",
                "-d",
                str(target),
                pattern_str,
                "--timeout",
                str(timeout),
            ]
            try:
                p = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                # exit code 3 indicates a timeout. See 'spacegrep --help'.
                if p.returncode == 3:
                    err = CoreException(
                        check_id="Timeout",
                        path=target,
                        start=Position(0, 0),
                        end=Position(0, 0),
                        extra={
                            "message": "spacegrep timeout",
                            "line": "",
                        },
                        language="generic",
                        rule_id=rule_id,
                    ).to_dict()
                    errors.append(err)
                else:
                    p.check_returncode()
                    raw_output = p.stdout
                    output_json = _parse_spacegrep_output(raw_output)
                    output_json["matches"] = _patch_id(
                        pattern, output_json.get("matches", [])
                    )
                    matches.extend(output_json["matches"])
                    errors.extend(output_json["errors"])
            except subprocess.CalledProcessError as e:
                raw_error = p.stderr
                spacegrep_error_text = raw_error.decode("utf-8", errors="replace")
                raise SemgrepError(
                    f"Error running spacegrep on file {target}: Process error: {e}\n\nspacegrep error: {spacegrep_error_text}"
                )
            except json.JSONDecodeError as e:
                raise SemgrepError(
                    f"Could not parse spacegrep output as JSON: JSON error: {e}"
                )
            except KeyError as e:
                raise SemgrepError(
                    f"Invalid JSON output was received from spacegrep: {e}"
                )
    return {
        "matches": matches,
        "errors": errors,
    }
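# Hedged sketch only: _parse_spacegrep_output is called above but is not part of
# this excerpt. The helper name, signature, and behavior below are assumptions
# made for illustration; the real helper may differ. The idea is simply to turn
# spacegrep's stdout bytes into a dict that always carries "matches"/"errors".
import json
from typing import Any, Dict, List


def _parse_spacegrep_output_sketch(raw_output: bytes) -> Dict[str, List[Any]]:
    # Decode the subprocess stdout and parse it as JSON; callers in
    # run_spacegrep expect both keys to be present even when empty.
    output = json.loads(raw_output.decode("utf-8", errors="replace"))
    return {
        "matches": output.get("matches", []),
        "errors": output.get("errors", []),
    }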
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
    profiler: ProfileManager,
    match_time_matrix: Dict[Tuple[str, str], float],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
    """
    Run all rules on targets and return list of all places that match patterns, ... todo errors
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():

        targets = self.get_files_for_language(language, rule, target_manager)
        targets = [target for target in targets if target not in max_timeout_files]
        all_targets = all_targets.union(targets)
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )

            output_json = profiler.track(
                rule.id,
                self._run_core_command,
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
                report_time=self._report_time,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX
                or p.expression.operator == OPERATORS.NOT_REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # regex-only rules only support OPERATORS.REGEX.
            # Skip passing this rule to semgrep-core.
            if language in REGEX_ONLY_LANGUAGE_KEYS:
                continue

            # semgrep-core doesn't know about the following operators -
            # they are strictly semgrep Python features:
            # - OPERATORS.METAVARIABLE_REGEX
            # - OPERATORS.METAVARIABLE_COMPARISON
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator
                not in [
                    OPERATORS.METAVARIABLE_REGEX,
                    OPERATORS.METAVARIABLE_COMPARISON,
                ]
            ]

            patterns_json = [p.to_json() for p in patterns]

            if language == GENERIC_LANGUAGE:
                output_json = profiler.track(
                    rule.id,
                    run_spacegrep,
                    rule.id,
                    patterns,
                    targets,
                    timeout=self._timeout,
                    report_time=self._report_time,
                )
            else:  # Run semgrep-core
                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])
        if "time" in output_json:
            self._add_match_times(rule, match_time_matrix, output_json["time"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(
                f"--> rule ({rule.id}) has findings on filepath: {filepath}"
            )

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)
    logger.debug(f"...ran on {len(all_targets)} files")

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors, all_targets
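# Hedged sketch only: dedup_output is used above but not defined in this
# excerpt. A plausible implementation removes duplicate findings while keeping
# the original order; the uniqueness key used by the real function may differ
# from the attributes assumed here.
from typing import List


def dedup_output_sketch(outputs: List["RuleMatch"]) -> List["RuleMatch"]:
    seen = set()
    unique: List["RuleMatch"] = []
    for finding in outputs:
        # Assumed uniqueness key: rule id, file path, and match span.
        key = (finding.id, finding.path, finding.start, finding.end)
        if key not in seen:
            seen.add(key)
            unique.append(finding)
    return unique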
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    ProfilingData,
]:
    logger.debug(f"Passing whole rules directly to semgrep_core")

    outputs: Dict[Rule, List[RuleMatch]] = collections.defaultdict(list)
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
    max_timeout_files: Set[Path] = set()

    profiling_data: ProfilingData = ProfilingData()
    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule in progress_bar(
            rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"
        ):
            for language in rule.languages:
                debug_tqdm_write(f"Running rule {rule.id}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile(
                    "w"
                ) as target_file, tempfile.NamedTemporaryFile(
                    "w"
                ) as equiv_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    targets = [
                        target
                        for target in targets
                        if target not in max_timeout_files
                    ]
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue

                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(map(lambda p: str(p), targets)))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language.value,
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                        "-json_time",
                    ]

                    if self._optimizations != "none":
                        cmd.append("-fast")

                    stderr: Optional[int] = subprocess.PIPE
                    if is_debug():
                        cmd += ["-debug"]
                        stderr = None

                    core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=stderr)
                    output_json = self._extract_core_output(rule, core_run)

                    if "time" in output_json:
                        self._add_match_times(
                            rule, profiling_data, output_json["time"]
                        )

                # end with tempfile.NamedTemporaryFile(...) ...

                pattern_matches = [
                    PatternMatch(match) for match in output_json["matches"]
                ]
                findings = create_output(rule, pattern_matches)

                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                parsed_errors = [
                    CoreException.from_json(
                        e, language.value, rule.id
                    ).into_semgrep_error()
                    for e in output_json["errors"]
                ]
                for err in parsed_errors:
                    if isinstance(err, MatchTimeoutError):
                        file_timeouts[err.path] += 1
                        if (
                            self._timeout_threshold != 0
                            and file_timeouts[err.path] >= self._timeout_threshold
                        ):
                            max_timeout_files.add(err.path)
                errors.extend(parsed_errors)
            # end for language ...
        # end for rule ...

    return outputs, {}, errors, all_targets, profiling_data
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    ProfilingData,
]:
    from itertools import chain
    from collections import defaultdict

    logger.debug(f"Passing whole rules directly to semgrep_core")

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    profiling_data: ProfilingData = ProfilingData()

    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue

                all_targets = all_targets.union(targets)

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                if self._report_time:
                    cmd += ["-json_time"]

                if self._output_settings.debug:
                    cmd += ["-debug"]

                core_run = sub_run(
                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
                output_json = self._extract_core_output(rule, [], core_run)

                if "time" in output_json:
                    self._add_match_times(rule, profiling_data, output_json["time"])

            # end with tempfile.NamedTemporaryFile(...) ...

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
        # end for rule, language ...

    return outputs, {}, errors, all_targets, profiling_data
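# For illustration only: with hypothetical temp-file names and settings, the cmd
# list assembled above would expand to a semgrep-core invocation roughly like
# the following (exact flags depend on self._report_time and the debug setting).
# The paths and values shown are made up, not taken from a real run.
#
#   semgrep-core -lang python -fast -json \
#       -config /tmp/tmpXXXX.yaml -j 8 \
#       -target_file /tmp/tmpYYYY \
#       -use_parsing_cache /tmp/tmpZZZZ \
#       -timeout 30 -max_memory 0 -json_time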
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
    """
    Run all rules on targets and return list of all places that match patterns, ... todo errors
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():

        targets = self.get_files_for_language(language, rule, target_manager)
        targets = [target for target in targets if target not in max_timeout_files]
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )

            output_json = self._run_core_command(
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
            # this is strictly a semgrep Python feature. Metavariable regex
            # filtering is performed purely in Python code then compared
            # against semgrep-core's results for other patterns.
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
            ]

            patterns_json = [p.to_json() for p in patterns]

            output_json = self._run_core_command(
                patterns_json,
                patterns,
                targets,
                language,
                rule,
                "-rules_file",
                cache_dir,
            )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors
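# Hedged sketch of the partition helper used in _run_rule above: it splits an
# iterable into (items matching the predicate, items not matching), which is
# how patterns_regex and patterns are separated. The real semgrep utility may
# differ in signature or return types; this only illustrates the idiom.
from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar("T")


def partition_sketch(
    pred: Callable[[T], bool], iterable: Iterable[T]
) -> Tuple[List[T], List[T]]:
    trues: List[T] = []
    falses: List[T] = []
    for item in iterable:
        # Route each item to the matching or non-matching bucket.
        (trues if pred(item) else falses).append(item)
    return trues, falses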
def run_spacegrep(
    rule_id: str,
    patterns: List[Pattern],
    targets: List[Path],
    timeout: int,
) -> dict:
    matches: List[dict] = []
    errors: List[dict] = []
    targets_time: Dict[str, Tuple[float, float, float]] = {}
    for pattern in patterns:
        if not isinstance(pattern._pattern, str):
            raise NotImplementedError(
                f"Support for {type(pattern._pattern)} has not been implemented yet."
            )
        pattern_str = pattern._pattern  # TODO: Handle pattern Dict
        for target in targets:
            cmd = [
                SPACEGREP_PATH,
                "--output-format",
                "semgrep",
                "-d",
                str(target),
                pattern_str,
                "--timeout",
                str(timeout),
                "--time",
            ]
            try:
                p = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                # exit code 3 indicates a timeout. See 'spacegrep --help'.
                if p.returncode == 3:
                    err = CoreException(
                        check_id="Timeout",
                        path=target,
                        start=Position(0, 0),
                        end=Position(0, 0),
                        extra={
                            "message": "spacegrep timeout",
                            "line": "",
                        },
                        language="generic",
                        rule_id=rule_id,
                    ).to_dict()
                    errors.append(err)
                else:
                    p.check_returncode()
                    raw_output = p.stdout
                    output_json = _parse_spacegrep_output(raw_output)
                    output_json["matches"] = _patch_id(
                        pattern, output_json.get("matches", [])
                    )
                    matches.extend(output_json["matches"])
                    errors.extend(output_json["errors"])
                    # aggregate the match times obtained for the different patterns of the rule
                    path_s = str(target)
                    targets_time[path_s] = tuple(  # type: ignore
                        [
                            i + j
                            for i, j in zip(
                                targets_time.get(path_s, (0.0, 0.0, 0.0)),
                                _extract_times(output_json),
                            )
                        ]
                    )
            except subprocess.CalledProcessError as e:
                raw_error = p.stderr
                spacegrep_error_text = raw_error.decode("utf-8", errors="replace")
                raise SemgrepError(
                    f"Error running spacegrep on file {target}: Process error: {e}\n\nspacegrep error: {spacegrep_error_text}"
                )
            except json.JSONDecodeError as e:
                raise SemgrepError(
                    f"Could not parse spacegrep output as JSON: JSON error: {e}"
                )
            except KeyError as e:
                raise SemgrepError(
                    f"Invalid JSON output was received from spacegrep: {e}"
                )

    target_list = []
    for path in targets:
        times = targets_time.get(str(path), (0.0, 0.0, 0.0))
        target_list.append(
            {
                "path": str(path),
                "parse_time": times[0],
                "match_time": times[1],
                "run_time": times[2],
            }
        )
    time = {"targets": target_list}
    return {
        "matches": matches,
        "errors": errors,
        "time": time,
    }
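# Hedged sketch only: _extract_times is used above to reduce a single
# spacegrep "--time" result to a (parse_time, match_time, run_time) triple for
# one target. The field layout assumed below is an illustration, not the actual
# spacegrep output schema.
from typing import Any, Dict, Tuple


def _extract_times_sketch(output_json: Dict[str, Any]) -> Tuple[float, float, float]:
    # Assume the timing block mirrors the per-target shape built in
    # run_spacegrep's return value: {"time": {"targets": [{...}]}}.
    time_info = output_json.get("time", {})
    targets = time_info.get("targets", [])
    first = targets[0] if targets else {}
    return (
        float(first.get("parse_time", 0.0)),
        float(first.get("match_time", 0.0)),
        float(first.get("run_time", 0.0)),
    )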
def _run_rule(
    self, rule: Rule, target_manager: TargetManager, cache_dir: str
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
    """
    Run all rules on targets and return list of all places that match patterns, ... todo errors
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[CoreException] = []
    equivalences = rule.equivalences

    for language, all_patterns_for_language in self._group_patterns_by_language(
        [rule]
    ).items():
        try:
            targets = target_manager.get_files(language, rule.includes, rule.excludes)
        except _UnknownLanguageError as ex:
            raise UnknownLanguageError(
                short_msg="invalid language",
                long_msg=f"unsupported language {language}",
                spans=[rule.languages_span.with_context(before=1, after=1)],
            ) from ex

        if targets == []:
            continue

        # semgrep-core doesn't know about OPERATORS.REGEX - this is
        # strictly a semgrep Python feature. Regex filtering is
        # performed purely in Python code then compared against
        # semgrep-core's results for other patterns.
        patterns_regex, patterns = partition(
            lambda p: p.expression.operator == OPERATORS.REGEX,
            all_patterns_for_language,
        )
        if patterns_regex:
            patterns_json = [pattern.to_json() for pattern in patterns_regex]

            try:
                patterns_re = [
                    (pattern["id"], re.compile(pattern["pattern"]))
                    for pattern in patterns_json
                ]
            except re.error as err:
                raise SemgrepError(f"invalid regular expression specified: {err}")

            re_fn = functools.partial(get_re_matches, patterns_re)
            with multiprocessing.Pool(self._jobs) as pool:
                matches = pool.map(re_fn, targets)

            outputs.extend(
                single_match
                for file_matches in matches
                for single_match in file_matches
            )

        patterns_json = [p.to_json() for p in patterns]

        with tempfile.NamedTemporaryFile(
            "w"
        ) as pattern_file, tempfile.NamedTemporaryFile(
            "w"
        ) as target_file, tempfile.NamedTemporaryFile(
            "w"
        ) as equiv_file:
            yaml = YAML()
            yaml.dump({"rules": patterns_json}, pattern_file)
            pattern_file.flush()
            target_file.write("\n".join(str(t) for t in targets))
            target_file.flush()

            cmd = [SEMGREP_PATH] + [
                "-lang",
                language,
                "-rules_file",
                pattern_file.name,
                "-j",
                str(self._jobs),
                "-target_file",
                target_file.name,
                "-use_parsing_cache",
                cache_dir,
            ]

            if equivalences:
                self._write_equivalences_file(equiv_file, equivalences)
                cmd += ["-equivalences", equiv_file.name]

            core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            debug_print(core_run.stderr.decode("utf-8", "replace"))

            if core_run.returncode != 0:
                # see if semgrep output a JSON error that we can decode
                semgrep_output = core_run.stdout.decode("utf-8", "replace")
                try:
                    output_json = json.loads(semgrep_output)
                except ValueError:
                    raise SemgrepError(
                        f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

                if "error" in output_json:
                    self._raise_semgrep_error_from_json(output_json, patterns)
                else:
                    raise SemgrepError(
                        f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

            output_json = json.loads((core_run.stdout.decode("utf-8", "replace")))

            errors.extend(
                CoreException.from_json(e, language) for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))

    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            debug_print(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors