def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and a list of pattern matches from a single file and handles the
    boolean expression evaluation of the Rule's patterns.

    Returns a list of RuleMatches.
    """
    output = []

    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [
        {
            "filter": "initial",
            "pattern_id": None,
            "ranges": {
                k: list(set(vv.range for vv in v))
                for k, v in pattern_ids_to_pattern_matches.items()
            },
        }
    ]
    logger.debug(str(pattern_ids_to_pattern_matches))

    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # only output matches which are inside these offsets!
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(rule, pattern_match)
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
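# A minimal sketch of the group_by_pattern_id helper called above, assuming each
# PatternMatch carries the id of the pattern that produced it; the actual helper
# in the codebase may differ in details.
from collections import OrderedDict
from typing import Dict, List


def group_by_pattern_id(
    pattern_matches: List[PatternMatch],
) -> Dict[PatternId, List[PatternMatch]]:
    # Preserve first-seen order of pattern ids so debugging output stays stable.
    by_id: Dict[PatternId, List[PatternMatch]] = OrderedDict()
    for pattern_match in pattern_matches:
        by_id.setdefault(pattern_match.id, []).append(pattern_match)
    return by_id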
def create_output(
    rule: Rule,
    pattern_matches: List[PatternMatch],
    valid_ranges_to_output: Optional[Set[Range]] = None,
) -> List[RuleMatch]:
    output = []

    if valid_ranges_to_output is None:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }

    # For each range that will be output, precompute the metavariable bindings
    # that propagate into it from the pattern matches.
    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range
            ]
            message = interpolate_string_with_metavariables(
                rule.message, pattern_match, propagated_metavariables
            )
            fix = (
                interpolate_string_with_metavariables(
                    rule.fix, pattern_match, propagated_metavariables
                )
                if rule.fix
                else None
            )
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return sorted(output, key=lambda rule_match: rule_match._pattern_match.range.start)
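# One plausible reading of compare_propagated_metavariable as it is used in the
# lookup above (an assumption, not the actual implementation): a metavariable
# value propagates from a PatternMatch into an output Range when the match binds
# that metavariable and the match's range encloses the output range.
def compare_propagated_metavariable(
    _range: Range, pattern_match: PatternMatch, metavariable: str
) -> bool:
    # Assumes Range.start and Range.end are comparable offsets, as implied by
    # the sort key used in create_output above.
    return (
        metavariable in pattern_match.metavars
        and pattern_match.range.start <= _range.start
        and _range.end <= pattern_match.range.end
    )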
def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and a list of pattern matches from a single file and handles the
    boolean expression evaluation of the Rule's patterns.

    Returns a list of RuleMatches.
    """
    output = []

    pattern_ids_to_pattern_matches: Dict[PatternId, List[PatternMatch]] = OrderedDict()
    for pm in stabilize_evaluation_ordering(pattern_matches, key=lambda pm: pm.id):
        pattern_ids_to_pattern_matches.setdefault(pm.id, []).append(pm)

    initial_ranges: DebugRanges = {
        pattern_id: set(pm.range for pm in pattern_matches)
        for pattern_id, pattern_matches in pattern_ids_to_pattern_matches.items()
    }
    steps_for_debugging = [DebuggingStep("initial", None, initial_ranges, {})]

    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            allow_exec=allow_exec,
            steps_for_debugging=steps_for_debugging,
        )

        # only output matches which are inside these offsets!
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug(BREAK_LINE)

    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range
            ]
            message = interpolate_message_metavariables(
                rule, pattern_match, propagated_metavariables
            )
            fix = interpolate_fix_metavariables(
                rule, pattern_match, propagated_metavariables
            )
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, [attr.asdict(step) for step in steps_for_debugging]
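# DebuggingStep is constructed positionally above and serialized with
# attr.asdict at the end of evaluate; a compatible attrs definition could look
# roughly like the sketch below (the field names are assumptions inferred from
# the positional arguments, not the actual definition).
from typing import Any, Dict, Optional

import attr


@attr.s(auto_attribs=True, frozen=True)
class DebuggingStep:
    filter: str                      # which evaluation step produced this snapshot
    pattern_id: Optional[PatternId]  # pattern being filtered on, if any
    ranges: DebugRanges              # surviving ranges per pattern id
    metavar_ranges: Dict[str, Any]   # metavariable-related ranges, if recorded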
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    Dict[Any, Any],
]:
    from itertools import chain
    from collections import defaultdict

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                output_json = self._parse_core_output(
                    out_bytes, err_bytes, returncode
                )

                if returncode != 0:
                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(output_json, [], rule)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                        )
            # end with tempfile.NamedTemporaryFile(...) ...

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
        # end for rule, language ...

    return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
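# dedup_output is applied above to drop duplicate findings reported by
# semgrep-core; a minimal sketch, assuming two findings are duplicates when they
# share a rule id and a range (the real deduplication key may include more
# fields, such as the message or file path).
from typing import List


def dedup_output(outputs: List[RuleMatch]) -> List[RuleMatch]:
    seen = set()
    deduped = []
    for rule_match in outputs:
        # Assumed key fields: the rule id and the matched range.
        key = (rule_match.id, rule_match._pattern_match.range)
        if key not in seen:
            seen.add(key)
            deduped.append(rule_match)
    return deduped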
def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and a list of pattern matches from a single file and handles the
    boolean expression evaluation of the Rule's patterns.

    Returns a list of RuleMatches.
    """
    output = []

    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [
        {
            "filter": "initial",
            "pattern_id": None,
            "ranges": {
                k: list(set(vv.range for vv in v))
                for k, v in pattern_ids_to_pattern_matches.items()
            },
        }
    ]
    logger.debug(str(pattern_ids_to_pattern_matches))

    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # only output matches which are inside these offsets!
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    # Addresses https://github.com/returntocorp/semgrep/issues/1699,
    # where metavariables from pattern-inside are not bound to messages.
    # This should handle cases with pattern + pattern-inside. This doesn't handle
    # pattern-not-inside because it is difficult to determine metavariables for
    # exclusion ranges. For example: imagine a pattern-not-inside for 'def $CLASS(): ...'
    # and a file that has multiple classes inside. How do we infer which metavariable was
    # intended for interpolation? As such, this will fix the immediate issue and should
    # handle the most common case.
    # Another corner case is: what should we do with nested metavars? Imagine 'def $FUNC(): ...'
    # and code with nested functions. Did we want the top-level function? The lowest-level? What
    # about other nesting cases? ¯\_(ツ)_/¯ Right now it will prefer the largest PatternMatch range.
    all_pattern_match_metavariables: Dict[str, List[PatternMatch]] = defaultdict(list)
    for pattern_match in pattern_matches:
        for metavar_text in pattern_match.metavars.keys():
            all_pattern_match_metavariables[metavar_text].append(pattern_match)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(
                rule, pattern_match, all_pattern_match_metavariables
            )
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
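# A hedged sketch of the "prefer the largest PatternMatch range" behaviour that
# the comment above describes for nested metavariables; the helper name is
# hypothetical, and Range.start/.end are assumed to be integer offsets.
from typing import Dict, List, Optional


def _resolve_metavariable_from_widest_match(
    metavariable: str,
    all_pattern_match_metavariables: Dict[str, List[PatternMatch]],
) -> Optional[str]:
    candidates = all_pattern_match_metavariables.get(metavariable, [])
    if not candidates:
        return None
    # When several matches bind the same metavariable (e.g. nested functions),
    # take the binding from the match that covers the widest range.
    widest = max(candidates, key=lambda pm: pm.range.end - pm.range.start)
    return widest.get_metavariable_value(metavariable)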
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    ProfilingData,
]:
    from itertools import chain
    from collections import defaultdict

    logger.debug(f"Passing whole rules directly to semgrep_core")

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    profiling_data: ProfilingData = ProfilingData()

    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue

                all_targets = all_targets.union(targets)

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]
                if self._report_time:
                    cmd += ["-json_time"]

                if self._output_settings.debug:
                    cmd += ["-debug"]

                core_run = sub_run(
                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )

                output_json = self._extract_core_output(rule, [], core_run)

                if "time" in output_json:
                    self._add_match_times(rule, profiling_data, output_json["time"])
            # end with tempfile.NamedTemporaryFile(...) ...

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
        # end for rule, language ...

    return outputs, {}, errors, all_targets, profiling_data
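# The chain(*) expression above simply flattens the per-rule language lists into
# (rule, language) pairs; an equivalent, arguably clearer spelling is a nested
# comprehension (the helper name here is only illustrative).
from typing import Any, List, Tuple


def rule_language_pairs(rules: List[Rule]) -> List[Tuple[Rule, Any]]:
    # One (rule, language) pair per language declared by each rule.
    return [(rule, language) for rule in rules for language in rule.languages]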