def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns.

    Returns a list of RuleMatches.
    """
    output = []
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [
        {
            "filter": "initial",
            "pattern_id": None,
            "ranges": {
                k: list(set(vv.range for vv in v))
                for k, v in pattern_ids_to_pattern_matches.items()
            },
        }
    ]
    logger.debug(str(pattern_ids_to_pattern_matches))
    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # only output matches which are inside these offsets!
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(rule, pattern_match)
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
def create_output(
    rule: Rule,
    pattern_matches: List[PatternMatch],
    valid_ranges_to_output: Optional[Set[Range]] = None,
) -> List[RuleMatch]:
    output = []

    if valid_ranges_to_output is None:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }

    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range
            ]
            message = interpolate_string_with_metavariables(
                rule.message, pattern_match, propagated_metavariables
            )
            fix = (
                interpolate_string_with_metavariables(
                    rule.fix, pattern_match, propagated_metavariables
                )
                if rule.fix
                else None
            )
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return sorted(output, key=lambda rule_match: rule_match._pattern_match.range.start)
def convert_to_rule_match(match: CoreMatch, rule: Rule) -> RuleMatch:
    metavariables = read_metavariables(match)
    message = interpolate(rule.message, metavariables)
    fix = interpolate(rule.fix, metavariables) if rule.fix else None

    rule_match = RuleMatch(
        rule.id,
        message=message,
        metadata=rule.metadata,
        severity=rule.severity,
        fix=fix,
        fix_regex=rule.fix_regex,
        path=match.path,
        start=match.start,
        end=match.end,
        extra=match.extra,
        lines_cache={},
    )
    return rule_match
def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns.

    Returns a list of RuleMatches.
    """
    output = []

    pattern_ids_to_pattern_matches: Dict[PatternId, List[PatternMatch]] = OrderedDict()
    for pm in stabilize_evaluation_ordering(pattern_matches, key=lambda pm: pm.id):
        pattern_ids_to_pattern_matches.setdefault(pm.id, []).append(pm)

    initial_ranges: DebugRanges = {
        pattern_id: set(pm.range for pm in pattern_matches)
        for pattern_id, pattern_matches in pattern_ids_to_pattern_matches.items()
    }
    steps_for_debugging = [DebuggingStep("initial", None, initial_ranges, {})]

    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            allow_exec=allow_exec,
            steps_for_debugging=steps_for_debugging,
        )

        # only output matches which are inside these offsets!
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug(BREAK_LINE)

    propagated_metavariable_lookup = {
        _range: {
            metavariable: pm.get_metavariable_value(metavariable)
            for pm in pattern_matches
            for metavariable in _range.propagated_metavariables
            if compare_propagated_metavariable(_range, pm, metavariable)
        }
        for _range in valid_ranges_to_output
    }

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            propagated_metavariables = propagated_metavariable_lookup[
                pattern_match.range
            ]
            message = interpolate_message_metavariables(
                rule, pattern_match, propagated_metavariables
            )
            fix = interpolate_fix_metavariables(
                rule, pattern_match, propagated_metavariables
            )
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, [attr.asdict(step) for step in steps_for_debugging]
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    Dict[Any, Any],
]:
    from itertools import chain
    from collections import defaultdict

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                output_json = self._parse_core_output(
                    out_bytes, err_bytes, returncode
                )

                if returncode != 0:
                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(output_json, [], rule)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                        )
            # end with tempfile.NamedTemporaryFile(...) ...

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
        # end for rule, language ...

    return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
def evaluate(
    rule: Rule, pattern_matches: List[PatternMatch], allow_exec: bool
) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Takes a Rule and list of pattern matches from a single file and
    handles the boolean expression evaluation of the Rule's patterns.

    Returns a list of RuleMatches.
    """
    output = []
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)
    steps_for_debugging = [
        {
            "filter": "initial",
            "pattern_id": None,
            "ranges": {
                k: list(set(vv.range for vv in v))
                for k, v in pattern_ids_to_pattern_matches.items()
            },
        }
    ]
    logger.debug(str(pattern_ids_to_pattern_matches))
    if rule.mode == TAINT_MODE:
        valid_ranges_to_output = {
            pattern_match.range for pattern_match in pattern_matches
        }
    else:
        valid_ranges_to_output = evaluate_expression(
            rule.expression,
            pattern_ids_to_pattern_matches,
            flags={RCE_RULE_FLAG: allow_exec},
            steps_for_debugging=steps_for_debugging,
        )

        # only output matches which are inside these offsets!
        logger.debug(f"compiled result {valid_ranges_to_output}")
        logger.debug("-" * 80)

    # Addresses https://github.com/returntocorp/semgrep/issues/1699,
    # where metavariables from pattern-inside are not bound to messages.
    # This should handle cases with pattern + pattern-inside. This doesn't handle
    # pattern-not-inside because it is difficult to determine metavariables for
    # exclusion ranges. For example: imagine a pattern-not-inside for 'def $CLASS(): ...'
    # and a file has multiple classes inside. How do we infer which metavariable was
    # intended for interpolation? As such, this will fix the immediate issue and should
    # handle the most common case.
    # Another corner case is: what should we do with nested metavars? Imagine 'def $FUNC(): ...'
    # and code with nested functions. Did we want the top-level function? The lowest-level? What
    # about other nesting cases? ¯\_(ツ)_/¯ Right now it will prefer the largest PatternMatch range.
    all_pattern_match_metavariables: Dict[str, List[PatternMatch]] = defaultdict(list)
    for pattern_match in pattern_matches:
        for metavar_text in pattern_match.metavars.keys():
            all_pattern_match_metavariables[metavar_text].append(pattern_match)

    for pattern_match in pattern_matches:
        if pattern_match.range in valid_ranges_to_output:
            message = interpolate_message_metavariables(
                rule, pattern_match, all_pattern_match_metavariables
            )
            fix = interpolate_fix_metavariables(rule, pattern_match)
            rule_match = RuleMatch.from_pattern_match(
                rule.id,
                pattern_match,
                message=message,
                metadata=rule.metadata,
                severity=rule.severity,
                fix=fix,
                fix_regex=rule.fix_regex,
            )
            output.append(rule_match)

    return output, steps_for_debugging
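# A standalone sketch of the "prefer the largest PatternMatch range" tie-break
# mentioned in the nested-metavariable comment above. Nothing below is semgrep
# code: `Candidate` and `pick_binding` are hypothetical stand-ins that only
# illustrate how an outermost (largest-span) binding could win when several
# matches bind the same metavariable.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Candidate:
    """Hypothetical stand-in for a PatternMatch binding one metavariable."""

    start: int  # character offset where the match begins
    end: int  # character offset where the match ends
    value: str  # text bound to the metavariable, e.g. a function name


def pick_binding(candidates: List[Candidate]) -> Optional[str]:
    """Prefer the binding that comes from the largest (outermost) range."""
    if not candidates:
        return None
    largest = max(candidates, key=lambda c: c.end - c.start)
    return largest.value


# Nested functions: the outer match (0..120) encloses the inner one (40..80),
# so interpolation would use the outer function's name.
outer = Candidate(start=0, end=120, value="outer_func")
inner = Candidate(start=40, end=80, value="inner_func")
assert pick_binding([inner, outer]) == "outer_func"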
def run_join_rule(
    join_rule: Dict[str, Any],
    targets: List[Path],
) -> Tuple[List[RuleMatch], List[SemgrepError]]:
    """
    Run a 'join' mode rule.

    Join rules are comprised of multiple Semgrep rules and a set
    of conditions which must be satisfied in order to return a result.
    These conditions are typically some comparison of metavariable contents
    from different rules.

    'join_rule' is a join rule definition in dictionary form. The required keys are
    {'id', 'mode', 'severity', 'message', 'join'}.

    'join' is a dictionary with the required keys {'refs', 'on'}.

    'refs' is a list of dictionaries, each with the required key {'rule'}. 'rule' is
    identical to a Semgrep config string -- the same thing used on the command line,
    e.g., `semgrep -f p/javascript.lang.security.rule` or `semgrep -f path/to/rule.yaml`.

    Each ref has the optional keys {'renames', 'as'}. 'renames' is a list of objects
    with properties {'from', 'to'}. 'renames' are used to rename metavariables
    of the associated 'rule'. 'as' lets you alias the collection of rule results
    for use in the conditions, similar to a SQL alias. By default, collection names
    will be the rule ID.

    'on' is a list of strings of the form
    <collection>.<property> <operator> <collection>.<property>.
    These are the conditions which must be satisfied for this rule to report results.
    All conditions must be satisfied.

    See semgrep/tests/e2e/rules/join_rules/user-input-with-unescaped-extension.yaml
    for an example. A minimal sketch of the dictionary shape, with invented
    identifiers, also follows this function.
    """
    join_contents = join_rule.get("join", {})
    semgrep_config_strings = [ref.get("rule") for ref in join_contents.get("refs", [])]
    config_map = create_config_map(semgrep_config_strings)

    join_rule_refs: List[Ref] = [
        Ref(
            id=config_map[ref.get("rule")].id,
            renames={
                rename.get("from"): rename.get("to")
                for rename in ref.get("renames", [])
            },
            alias=ref.get("as"),
        )
        for ref in join_contents.get("refs", [])
    ]
    refs_lookup = {ref.id: ref for ref in join_rule_refs}
    alias_lookup = {ref.alias: ref.id for ref in join_rule_refs}

    try:
        conditions = [
            Condition.parse(condition_string)
            for condition_string in join_contents.get("on", [])
        ]
    except InvalidConditionError as e:
        return [], [e]

    # Run Semgrep
    with tempfile.NamedTemporaryFile() as rule_path:
        yaml.dump({"rules": [rule.raw for rule in config_map.values()]}, rule_path)
        rule_path.flush()
        output = semgrep.semgrep_main.invoke_semgrep(
            config=Path(rule_path.name),
            targets=targets,
            no_rewrite_rule_ids=True,
            optimizations="all",
        )

    assert isinstance(output, dict)  # placate mypy
    results = output.get("results", [])
    errors = output.get("errors", [])

    parsed_errors = []
    for error_dict in errors:
        try:
            """
            This is a hack to reconstitute errors after they've been
            JSONified as output. Subclasses of SemgrepError define the 'level'
            and 'code' as class properties, which means they aren't accepted
            as arguments when instantiated. 'type' is also added when errors are
            JSONified, and is just a string of the error class name. It's not used
            as an argument.
            All of these properties will be properly populated because it's using the
            class properties of the SemgrepError inferred by 'type'.
            """
            del error_dict["code"]
            del error_dict["level"]
            errortype = error_dict.get("type")
            del error_dict["type"]
            parsed_errors.append(
                ERROR_MAP[error_dict.get(errortype)].from_dict(error_dict)
            )
        except KeyError:
            logger.warning(
                f"Could not reconstitute Semgrep error: {error_dict}.\nSkipping processing of error"
            )
            continue

    # Small optimization: if there are no results for rules that
    # are used in a condition, there's no sense in continuing.
    collection_set_unaliased = {
        alias_lookup[collection]
        for collection in create_collection_set_from_conditions(conditions)
    }
    rule_ids = set(result.get("check_id") for result in results)
    if collection_set_unaliased - rule_ids:
        logger.debug(
            f"No results for {collection_set_unaliased - rule_ids} in join rule '{join_rule.get('id')}'."
        )
        return [], parsed_errors

    # Rename metavariables with user-defined renames.
    rename_metavars_in_place(results, refs_lookup)

    # Create a model map. This allows dynamically creating DB tables based
    # on Semgrep's results. There is one table for each rule ID.
    model_map = create_model_map(results)
    db.connect()
    db.create_tables(model_map.values())

    # Populate the model tables with real data from the Semgrep results.
    load_results_into_db(results, model_map)

    # Apply the conditions and only keep combinations
    # of findings that satisfy the conditions.
    matches = []
    matched_on_conditions = match_on_conditions(
        model_map,
        alias_lookup,
        [
            Condition.parse(condition_string)
            for condition_string in join_contents.get("on", [])
        ],
    )
    if matched_on_conditions:  # This is ugly, but makes mypy happy
        for match in matched_on_conditions:
            matches.append(json.loads(match.raw.decode("utf-8", errors="replace")))

    rule_matches = [
        RuleMatch(
            id=join_rule.get("id", match.get("check_id", "[empty]")),
            pattern_match=PatternMatch({}),
            message=join_rule.get(
                "message", match.get("extra", {}).get("message", "[empty]")
            ),
            metadata=join_rule.get(
                "metadata", match.get("extra", {}).get("metadata", {})
            ),
            severity=join_rule.get("severity", match.get("severity", "INFO")),
            path=Path(match.get("path", "[empty]")),
            start=match.get("start", {}),
            end=match.get("end", {}),
            extra=match.get("extra", {}),
            fix=None,
            fix_regex=None,
            lines_cache={},
        )
        for match in matches
    ]

    db.close()
    return rule_matches, parsed_errors
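# A minimal join rule dictionary of the shape described in the run_join_rule()
# docstring above. Every identifier here (rule id, rule paths, aliases,
# metavariable names) is invented for illustration; only the key structure --
# 'join', 'refs', 'renames', 'as', 'on' -- follows the documented contract.
example_join_rule = {
    "id": "user-input-reaches-sink",  # hypothetical rule id
    "mode": "join",
    "severity": "ERROR",
    "message": "User input flows into a sensitive sink.",
    "join": {
        "refs": [
            {
                "rule": "path/to/find-user-input.yaml",  # same form as `semgrep -f ...`
                "renames": [{"from": "$VAR", "to": "$INPUT"}],
                "as": "sources",  # alias used in the 'on' conditions
            },
            {
                "rule": "path/to/find-sink-calls.yaml",
                "as": "sinks",
            },
        ],
        # Every condition must hold for a combination of findings to be reported.
        "on": ["sources.$INPUT == sinks.$ARG"],
    },
}
# With such a dictionary, run_join_rule(example_join_rule, targets) would run both
# referenced rules over `targets` and keep only the finding combinations that
# satisfy the 'on' conditions.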
def _run_rules_direct_to_semgrep_core(
    self,
    rules: List[Rule],
    target_manager: TargetManager,
    profiler: ProfileManager,
) -> Tuple[
    Dict[Rule, List[RuleMatch]],
    Dict[Rule, List[Any]],
    List[SemgrepError],
    Set[Path],
    ProfilingData,
]:
    from itertools import chain
    from collections import defaultdict

    logger.debug(f"Passing whole rules directly to semgrep_core")

    outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
    errors: List[SemgrepError] = []
    all_targets: Set[Path] = set()
    profiling_data: ProfilingData = ProfilingData()
    # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
    with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
        for rule, language in tuple(
            chain(
                *(
                    [(rule, language) for language in rule.languages]
                    for rule in rules
                )
            )
        ):
            debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
            with tempfile.NamedTemporaryFile(
                "w", suffix=".yaml"
            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                targets = self.get_files_for_language(
                    language, rule, target_manager
                )
                # opti: no need to call semgrep-core if no target files
                if not targets:
                    continue

                all_targets = all_targets.union(targets)

                target_file.write("\n".join(map(lambda p: str(p), targets)))
                target_file.flush()
                yaml = YAML()
                yaml.dump({"rules": [rule._raw]}, rule_file)
                rule_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-fast",
                    "-json",
                    "-config",
                    rule_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    semgrep_core_ast_cache_dir,
                    "-timeout",
                    str(self._timeout),
                    "-max_memory",
                    str(self._max_memory),
                ]

                if self._report_time:
                    cmd += ["-json_time"]

                if self._output_settings.debug:
                    cmd += ["-debug"]

                core_run = sub_run(
                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
                output_json = self._extract_core_output(rule, [], core_run)

                if "time" in output_json:
                    self._add_match_times(rule, profiling_data, output_json["time"])
            # end with tempfile.NamedTemporaryFile(...) ...

            findings = [
                RuleMatch.from_pattern_match(
                    rule.id,
                    PatternMatch(pattern_match),
                    message=rule.message,
                    metadata=rule.metadata,
                    severity=rule.severity,
                    fix=rule.fix,
                    fix_regex=rule.fix_regex,
                )
                for pattern_match in output_json["matches"]
            ]
            # TODO: we should do that in Semgrep_generic.ml instead
            findings = dedup_output(findings)
            outputs[rule].extend(findings)
            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
        # end for rule, language ...

    return outputs, {}, errors, all_targets, profiling_data
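# To make the subprocess boundary concrete, this is roughly what `cmd` evaluates
# to for one (rule, language) pair in the loop above. The flag names come straight
# from the code; the binary path, temp-file names, and numeric settings are
# placeholders invented for illustration.
example_cmd = [
    "/path/to/semgrep-core",  # SEMGREP_PATH (placeholder location)
    "-lang", "python",  # the language for this (rule, language) pair
    "-fast",
    "-json",
    "-config", "/tmp/tmpabc123.yaml",  # rule_file.name: the single rule dumped as YAML
    "-j", "8",  # str(self._jobs)
    "-target_file", "/tmp/tmpdef456",  # target_file.name: newline-separated target paths
    "-use_parsing_cache", "/tmp/ast-cache",  # the shared AST cache directory for this run
    "-timeout", "30",  # str(self._timeout)
    "-max_memory", "0",  # str(self._max_memory)
    "-json_time",  # appended only when self._report_time is set
    # "-debug" is appended as well when self._output_settings.debug is set
]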