def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
    """
    Run a single rule against all eligible targets.

    :param rule: the rule to evaluate
    :param target_manager: resolves candidate files per language
    :param cache_dir: semgrep-core parsing-cache directory
    :param max_timeout_files: files that previously timed out; excluded from runs
    :returns: (findings, debugging_steps, errors) where findings are deduplicated
        RuleMatches, debugging_steps is the evaluation trace of the last file
        processed (only one file's steps are kept), and errors are SemgrepErrors
        reported by semgrep-core for this rule.
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []
    # Hoisted out of the per-language loop: set membership makes the
    # per-target exclusion test O(1) instead of O(len(max_timeout_files)).
    max_timeout_set = set(max_timeout_files)

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():
        targets = self.get_files_for_language(language, rule, target_manager)
        # Drop files that already hit the timeout on an earlier rule.
        targets = [target for target in targets if target not in max_timeout_set]
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            # Taint rules are handed to semgrep-core whole; the "mode" key is
            # a semgrep-Python-only field and must be stripped first.
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )
            output_json = self._run_core_command(
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
            # this is strictly a semgrep Python feature. Metavariable regex
            # filtering is performed purely in Python code then compared
            # against semgrep-core's results for other patterns.
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
            ]

            patterns_json = [p.to_json() for p in patterns]
            output_json = self._run_core_command(
                patterns_json,
                patterns,
                targets,
                language,
                rule,
                "-rules_file",
                cache_dir,
            )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))
    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(f"----- rule ({rule.id}) ----- filepath: {filepath}")
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors
def _run_rule(
    self,
    rule: Rule,
    target_manager: TargetManager,
    cache_dir: str,
    max_timeout_files: List[Path],
    profiler: ProfileManager,
    match_time_matrix: Dict[Tuple[str, str], float],
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
    """
    Run a single rule against all eligible targets, recording per-rule timing.

    :param rule: the rule to evaluate
    :param target_manager: resolves candidate files per language
    :param cache_dir: semgrep-core parsing-cache directory
    :param max_timeout_files: files that previously timed out; excluded from runs
    :param profiler: wraps each core/spacegrep invocation to record wall time
    :param match_time_matrix: populated with per-(rule, target) match times when
        the core output contains a "time" entry
    :returns: (findings, debugging_steps, errors, all_targets) where findings are
        deduplicated RuleMatches, debugging_steps is the evaluation trace of the
        last file processed (only one file's steps are kept), errors are
        SemgrepErrors for this rule, and all_targets is every file considered.
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    # Hoisted out of the per-language loop: set membership makes the
    # per-target exclusion test O(1) instead of O(len(max_timeout_files)).
    max_timeout_set = set(max_timeout_files)

    for language, all_patterns_for_language in self._group_patterns_by_language(
        rule
    ).items():
        targets = self.get_files_for_language(language, rule, target_manager)
        # Drop files that already hit the timeout on an earlier rule.
        targets = [target for target in targets if target not in max_timeout_set]
        # In-place update avoids allocating a fresh set per language.
        all_targets.update(targets)
        if not targets:
            continue

        if rule.mode == TAINT_MODE:
            # Taint rules are handed to semgrep-core whole; the "mode" key is
            # a semgrep-Python-only field and must be stripped first.
            pattern_json = rule._raw.copy()
            del pattern_json["mode"]
            pattern = Pattern(
                0, rule.expression, rule.severity, language, rule._yaml.span
            )
            output_json = profiler.track(
                rule.id,
                self._run_core_command,
                [pattern_json],
                [pattern],
                targets,
                language,
                rule,
                "-tainting_rules_file",
                cache_dir,
                report_time=self._report_time,
            )
        else:
            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX
                or p.expression.operator == OPERATORS.NOT_REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                self.handle_regex_patterns(outputs, patterns_regex, targets)

            # regex-only rules only support OPERATORS.REGEX.
            # Skip passing this rule to semgrep-core.
            if language in REGEX_ONLY_LANGUAGE_KEYS:
                continue

            # semgrep-core doesn't know about the following operators -
            # they are strictly semgrep Python features:
            # - OPERATORS.METAVARIABLE_REGEX
            # - OPERATORS.METAVARIABLE_COMPARISON
            patterns = [
                pattern
                for pattern in patterns
                if pattern.expression.operator
                not in [
                    OPERATORS.METAVARIABLE_REGEX,
                    OPERATORS.METAVARIABLE_COMPARISON,
                ]
            ]

            patterns_json = [p.to_json() for p in patterns]

            if language == GENERIC_LANGUAGE:
                output_json = profiler.track(
                    rule.id,
                    run_spacegrep,
                    rule.id,
                    patterns,
                    targets,
                    timeout=self._timeout,
                    report_time=self._report_time,
                )
            else:  # Run semgrep-core
                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )

        errors.extend(
            CoreException.from_json(e, language, rule.id).into_semgrep_error()
            for e in output_json["errors"]
        )
        outputs.extend(PatternMatch(m) for m in output_json["matches"])
        if "time" in output_json:
            self._add_match_times(rule, match_time_matrix, output_json["time"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))
    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            logger.debug(
                f"--> rule ({rule.id}) has findings on filepath: {filepath}"
            )
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)
    logger.debug(f"...ran on {len(all_targets)} files")

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors, all_targets
def _run_rule(
    self, rule: Rule, target_manager: TargetManager, cache_dir: str
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
    """
    Run a single rule against all eligible targets by invoking semgrep-core.

    :param rule: the rule to evaluate
    :param target_manager: resolves candidate files per language/include/exclude
    :param cache_dir: semgrep-core parsing-cache directory
    :returns: (findings, debugging_steps, errors) where findings are deduplicated
        RuleMatches, debugging_steps is the evaluation trace of the last file
        processed (only one file's steps are kept), and errors are the
        CoreExceptions reported by semgrep-core.
    :raises UnknownLanguageError: when the rule declares an unsupported language
    :raises SemgrepError: on invalid user regexes or unexpected core output
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[CoreException] = []
    equivalences = rule.equivalences

    for language, all_patterns_for_language in self._group_patterns_by_language(
        [rule]
    ).items():
        try:
            targets = target_manager.get_files(language, rule.includes, rule.excludes)
        except _UnknownLanguageError as ex:
            # Surface a user-facing error pointing at the rule's languages span.
            raise UnknownLanguageError(
                short_msg="invalid language",
                long_msg=f"unsupported language {language}",
                spans=[rule.languages_span.with_context(before=1, after=1)],
            ) from ex

        if not targets:
            continue

        # semgrep-core doesn't know about OPERATORS.REGEX - this is
        # strictly a semgrep Python feature. Regex filtering is
        # performed purely in Python code then compared against
        # semgrep-core's results for other patterns.
        patterns_regex, patterns = partition(
            lambda p: p.expression.operator == OPERATORS.REGEX,
            all_patterns_for_language,
        )
        if patterns_regex:
            patterns_json = [pattern.to_json() for pattern in patterns_regex]

            try:
                patterns_re = [
                    (pattern["id"], re.compile(pattern["pattern"]))
                    for pattern in patterns_json
                ]
            except re.error as err:
                # Chain the cause so the offending pattern's traceback survives.
                raise SemgrepError(
                    f"invalid regular expression specified: {err}"
                ) from err

            re_fn = functools.partial(get_re_matches, patterns_re)
            with multiprocessing.Pool(self._jobs) as pool:
                matches = pool.map(re_fn, targets)
            outputs.extend(
                single_match
                for file_matches in matches
                for single_match in file_matches
            )

        patterns_json = [p.to_json() for p in patterns]
        # Rules, target list, and (optionally) equivalences are passed to
        # semgrep-core via temp files that must outlive the subprocess call.
        with tempfile.NamedTemporaryFile(
            "w"
        ) as pattern_file, tempfile.NamedTemporaryFile(
            "w"
        ) as target_file, tempfile.NamedTemporaryFile(
            "w"
        ) as equiv_file:
            yaml = YAML()
            yaml.dump({"rules": patterns_json}, pattern_file)
            pattern_file.flush()
            target_file.write("\n".join(str(t) for t in targets))
            target_file.flush()

            cmd = [SEMGREP_PATH] + [
                "-lang",
                language,
                "-rules_file",
                pattern_file.name,
                "-j",
                str(self._jobs),
                "-target_file",
                target_file.name,
                "-use_parsing_cache",
                cache_dir,
            ]

            if equivalences:
                self._write_equivalences_file(equiv_file, equivalences)
                cmd += ["-equivalences", equiv_file.name]

            core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            debug_print(core_run.stderr.decode("utf-8", "replace"))

            if core_run.returncode != 0:
                # see if semgrep output a JSON error that we can decode
                semgrep_output = core_run.stdout.decode("utf-8", "replace")
                try:
                    output_json = json.loads(semgrep_output)
                except ValueError as ex:
                    raise SemgrepError(
                        f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    ) from ex

                if "error" in output_json:
                    self._raise_semgrep_error_from_json(output_json, patterns)
                else:
                    raise SemgrepError(
                        f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )

            output_json = json.loads((core_run.stdout.decode("utf-8", "replace")))
            errors.extend(
                CoreException.from_json(e, language) for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[
        Rule, Dict[Path, List[PatternMatch]]
    ] = collections.defaultdict(lambda: collections.defaultdict(list))
    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)

    findings = []
    debugging_steps: List[Any] = []
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            debug_print(f"----- rule ({rule.id}) ----- filepath: {filepath}")
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

    findings = dedup_output(findings)

    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors