def test_explicit_path(tmp_path, monkeypatch):
    """Explicitly-passed targets are honored: included despite excludes, and
    files of the wrong language are dropped while unknown extensions are kept.

    TargetManager is constructed positionally as (includes, excludes, targets,
    respect_git_ignore, output_handler) — matches the keyword construction in
    main() elsewhere in this file.
    """
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    foo_noext = foo / "noext"
    foo_noext.touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()
    monkeypatch.chdir(tmp_path)

    # Should include explicitly passed python file
    # (get_files returns paths relative to cwd, hence the relative_to here)
    foo_a = foo_a.relative_to(tmp_path)
    output_settings = OutputSettings(
        output_format=OutputFormat.TEXT,
        output_destination=None,
        error_on_findings=False,
        strict=False,
    )
    defaulthandler = OutputHandler(output_settings)
    python_language = Language("python")
    assert foo_a in TargetManager(
        [], [], ["foo/a.py"], False, defaulthandler
    ).get_files(python_language, [], [])

    # Excluding the file drops it when it is only reachable via the "." target…
    assert foo_a not in TargetManager(
        [], ["foo/a.py"], ["."], False, defaulthandler
    ).get_files(python_language, [], [])
    # …but naming it explicitly as a target overrides the exclude.
    assert foo_a in TargetManager(
        [], ["foo/a.py"], [".", "foo/a.py"], False, defaulthandler
    ).get_files(python_language, [], [])

    # Should ignore explicitly passed .go file when requesting python
    assert (
        TargetManager([], [], ["foo/a.go"], False, defaulthandler).get_files(
            python_language, [], []
        )
        == []
    )

    # Should include explicitly passed file with unknown extension
    assert cmp_path_sets(
        set(
            TargetManager([], [], ["foo/noext"], False, defaulthandler).get_files(
                python_language, [], []
            )
        ),
        {foo_noext},
    )
def test_filter_by_size():
    """filter_by_size keeps files at or under the byte limit; 0 disables it."""
    with NamedTemporaryFile() as tmp:
        tmp.write(b"0123456789")  # 10-byte fixture file
        tmp.flush()
        candidates = frozenset({Path(tmp.name)})
        # A limit of 0 means "no limit": the file survives filtering.
        assert len(TargetManager.filter_by_size(candidates, 0)) == 1
        # 10 bytes fits within a 20-byte ceiling.
        assert len(TargetManager.filter_by_size(candidates, 20)) == 1
        # 10 bytes exceeds a 5-byte ceiling, so the file is dropped.
        assert len(TargetManager.filter_by_size(candidates, 5)) == 0
def test_ignore_git_dir(tmp_path, monkeypatch):
    """
    Ignores all files in .git directory when scanning generic
    """
    foo = tmp_path / ".git"
    foo.mkdir()
    (foo / "bar").touch()
    monkeypatch.chdir(tmp_path)
    language = Language("generic")
    output_settings = OutputSettings(
        output_format=OutputFormat.TEXT,
        output_destination=None,
        error_on_findings=False,
        verbose_errors=False,
        strict=False,
        json_stats=False,
        output_time=False,
        output_per_finding_max_lines_limit=None,
        output_per_line_max_chars_limit=None,
    )
    defaulthandler = OutputHandler(output_settings)
    # Positional args mirror the keyword construction in main(): includes,
    # excludes, max_target_bytes, targets, respect_git_ignore, output_handler,
    # skip_unknown_extensions.
    assert [] == TargetManager([], [], 0, [foo], True, defaulthandler, False).get_files(
        language, [], []
    )
def test_skip_symlink(tmp_path, monkeypatch):
    """Symlinks are skipped by expand_targets, even when named directly."""
    base = tmp_path / "foo"
    base.mkdir()
    real_file = base / "a.py"
    real_file.touch()
    (base / "link.py").symlink_to(real_file)
    monkeypatch.chdir(tmp_path)

    lang = Language("python")
    # Expanding the directory yields only the real file, never the symlink.
    assert cmp_path_sets(
        TargetManager.expand_targets([base], lang, False),
        {real_file},
    )
    # Even an explicitly-named symlink is dropped.
    assert cmp_path_sets(
        TargetManager.expand_targets([base / "link.py"], lang, False),
        set(),
    )
def get_files_for_language(
    language: Language, rule: Rule, target_manager: TargetManager
) -> List[Path]:
    """Return the target files for *language*, filtered by the rule's
    include/exclude patterns.

    Raises:
        UnknownLanguageError: when the target manager does not recognize
            *language*; the rule's languages span is attached for reporting.
    """
    try:
        # Delegate filtering entirely to the target manager.
        return target_manager.get_files(language, rule.includes, rule.excludes)
    except _UnknownLanguageError as ex:
        raise UnknownLanguageError(
            short_msg=f"invalid language: {language}",
            long_msg=f"unsupported language: {language}. supported languages are: {', '.join(all_supported_languages())}",
            spans=[rule.languages_span.with_context(before=1, after=1)],
        ) from ex
def test_explicit_path(tmp_path, monkeypatch):
    """Older variant of the explicit-target test: TargetManager is built
    positionally as (includes, excludes, targets, respect_git_ignore) and
    get_files takes the language as a plain string.
    """
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    foo_noext = foo / "noext"
    foo_noext.touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()
    monkeypatch.chdir(tmp_path)

    # Should include explicitly passed python file
    foo_a = foo_a.relative_to(tmp_path)
    assert foo_a in TargetManager([], [], ["foo/a.py"], False).get_files("python", [], [])
    # Excluded via "." target, so it is dropped…
    assert foo_a not in TargetManager([], ["foo/a.py"], ["."], False).get_files("python", [], [])
    # …but an explicit target wins over its own exclude.
    assert foo_a in TargetManager([], ["foo/a.py"], [".", "foo/a.py"], False).get_files("python", [], [])
    # Should ignore explicitly passed .go file when requesting python
    assert TargetManager([], [], ["foo/a.go"], False).get_files("python", [], []) == []
    # Should include explicitly passed file with unknown extension
    assert cmp_path_sets(
        set(
            TargetManager([], [], ["foo/noext"], False).get_files("python", [], [])),
        {foo_noext},
    )
def test_delete_git(tmp_path, monkeypatch):
    """
    Check that deleted files are not included in expanded targets
    """
    foo = tmp_path / "foo.py"
    bar = tmp_path / "bar.py"
    foo.touch()
    bar.touch()
    monkeypatch.chdir(tmp_path)

    # check=True so a failing git invocation (e.g. no git identity configured
    # in the environment) aborts the test immediately instead of being
    # silently ignored and surfacing later as a confusing assertion failure.
    subprocess.run(["git", "init"], check=True)
    subprocess.run(["git", "add", foo], check=True)
    subprocess.run(["git", "commit", "-m", "first commit"], check=True)
    foo.unlink()
    subprocess.run(["git", "status"], check=True)

    # Only the still-present (untracked) file should remain after expansion.
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], "python", True), {bar})
def test_filter_exclude():
    """filter_excludes drops every path matched by an exclude pattern; the
    assertions check how many paths survive each pattern set."""
    file_names = [
        "foo.py",
        "foo.go",
        "foo.java",
        "foo/bar.py",
        "foo/bar.go",
        "bar/foo/baz/bar.go",
        "foo/bar.java",
        "bar/baz",
        "baz.py",
        "baz.go",
        "baz.java",
        "bar/foo/foo.py",
        "foo",
        "bar/baz/foo/a.py",
        "bar/baz/foo/b.py",
        "bar/baz/foo/c.py",
        "bar/baz/qux/foo/a.py",
        "/foo/bar/baz/a.py",
    ]
    paths = {Path(name) for name in file_names}

    def survivors(patterns):
        # How many of the 18 paths are left after applying the excludes.
        return len(TargetManager.filter_excludes(paths, patterns))

    # Filter out .py files
    assert survivors(["*.py"]) == 9
    # Filter out files in a foo directory ancestor
    assert survivors(["foo"]) == 7
    # Filter out files with an ancestor named bar/baz
    assert survivors(["bar/baz"]) == 12
    # Filter out go files
    assert survivors(["*.go"]) == 14
    # Filter out go and java files
    assert survivors(["*.go", "*.java"]) == 11
    # Filter out go files with a direct ancestor named foo
    assert survivors(["foo/*.go"]) == 17
def test_expand_targets_not_git(tmp_path, monkeypatch):
    """
    Check that directory expansion works with relative paths, absolute paths, paths with ..
    """
    # Layout: foo/{a,b}.go, foo/py (no extension), foo/{a,b}.py,
    #         bar/{a,b}.py, foo/bar/{a,b}.py
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    (foo / "py").touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()
    bar = tmp_path / "bar"
    bar.mkdir()
    bar_a = bar / "a.py"
    bar_a.touch()
    bar_b = bar / "b.py"
    bar_b.touch()
    foo_bar = foo / "bar"
    foo_bar.mkdir()
    foo_bar_a = foo_bar / "a.py"
    foo_bar_a.touch()
    foo_bar_b = foo_bar / "b.py"
    foo_bar_b.touch()

    # Expected python files per directory subtree.
    in_foo_bar = {foo_bar_a, foo_bar_b}
    in_foo = {foo_a, foo_b}.union(in_foo_bar)
    in_bar = {bar_a, bar_b}
    in_all = in_foo.union(in_bar)
    python_language = Language("python")

    # From the repo root: relative, absolute, and nested paths all expand.
    monkeypatch.chdir(tmp_path)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], python_language, False), in_all)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, False), in_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo")], python_language, False), in_foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo").resolve()], python_language, False),
        in_foo,
    )
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo/bar")], python_language, False),
        in_foo_bar,
    )
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo/bar").resolve()], python_language, False),
        in_foo_bar,
    )

    # From inside foo/: "." now means foo, "./foo" does not exist, and ".."
    # walks back up to the whole tree.
    monkeypatch.chdir(foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], python_language, False), in_foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("./foo")], python_language, False), set())
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, False), in_foo_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, False), in_foo_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("..")], python_language, False), in_all)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("../bar")], python_language, False), in_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("../foo/bar")], python_language, False),
        in_foo_bar,
    )
def test_expand_targets_git(tmp_path, monkeypatch):
    """
    Test TargetManager with visible_to_git_only flag on in a git repository
    with nested .gitignores
    """
    foo = tmp_path / "foo"
    foo.mkdir()
    foo_a_go = foo / "a.go"
    foo_a_go.touch()
    (foo / "b.go").touch()
    (foo / "py").touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()
    bar = tmp_path / "bar"
    bar.mkdir()
    bar_a = bar / "a.py"
    bar_a.touch()
    bar_b = bar / "b.py"
    bar_b.touch()
    foo_bar = foo / "bar"
    foo_bar.mkdir()
    foo_bar_a = foo_bar / "a.py"
    foo_bar_a.touch()
    foo_bar_b = foo_bar / "b.py"
    foo_bar_b.touch()

    # Track a subset of files; the rest stay untracked but visible (no
    # .gitignore exists yet at this point).
    monkeypatch.chdir(tmp_path)
    subprocess.run(["git", "init"])
    subprocess.run(["git", "add", foo_a])
    subprocess.run(["git", "add", foo_bar_a])
    subprocess.run(["git", "add", foo_bar_b])
    subprocess.run(["git", "add", foo_a_go])
    subprocess.run(["git", "commit", "-m", "first"])

    # Check that all files are visible without a .gitignore
    in_foo_bar = {foo_bar_a, foo_bar_b}
    in_foo = {foo_a, foo_b}.union(in_foo_bar)
    in_bar = {bar_a, bar_b}
    in_all = in_foo.union(in_bar)
    python_language = Language("python")
    monkeypatch.chdir(tmp_path)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], python_language, True), in_all)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, True), in_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo")], python_language, True), in_foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo").resolve()], python_language, True),
        in_foo,
    )
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo/bar")], python_language, True),
        in_foo_bar,
    )
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo/bar").resolve()], python_language, True),
        in_foo_bar,
    )
    monkeypatch.chdir(foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], python_language, True), in_foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("./foo")], python_language, True), set())
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, True), in_foo_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, True), in_foo_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("..")], python_language, True), in_all)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("../bar")], python_language, True), in_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("../foo/bar")], python_language, True),
        in_foo_bar,
    )

    # Add bar/, foo/bar/a.py, foo/b.py to gitignores
    monkeypatch.chdir(tmp_path)
    (tmp_path / ".gitignore").write_text("bar/\nfoo/bar/a.py")
    (tmp_path / "foo" / ".gitignore").write_text("b.py")

    # Reflect what should now be visible given gitignores
    in_foo_bar = {
        foo_bar_a,
        foo_bar_b,
    }  # foo/bar/a.py is gitignored but is already tracked
    in_foo = {foo_a}.union(
        in_foo_bar)  # foo/b.py is gitignored with a nested gitignore
    in_bar = set()  # bar/ is gitignored
    in_all = in_foo.union(in_bar)

    # Re-run the same battery of path expansions against the reduced sets.
    monkeypatch.chdir(tmp_path)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], python_language, True), in_all)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, True), in_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo")], python_language, True), in_foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo").resolve()], python_language, True),
        in_foo,
    )
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo/bar")], python_language, True),
        in_foo_bar,
    )
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("foo/bar").resolve()], python_language, True),
        in_foo_bar,
    )
    monkeypatch.chdir(foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path(".")], python_language, True), in_foo)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("./foo")], python_language, True), set())
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, True), in_foo_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("bar")], python_language, True), in_foo_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("..")], python_language, True), in_all)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("../bar")], python_language, True), in_bar)
    assert cmp_path_sets(
        TargetManager.expand_targets([Path("../foo/bar")], python_language, True),
        in_foo_bar,
    )
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    config: str,
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = 0,
) -> None:
    """Drive a semgrep run: load configs, build the target set, invoke the
    core runner, filter nosem'd matches, and hand results to the output
    handler (optionally applying autofixes).
    """
    if include is None:
        include = []
    if exclude is None:
        exclude = []

    valid_configs, config_errors = get_config(pattern, lang, config)
    output_handler.handle_semgrep_errors(config_errors)
    # With --strict, any config-loading error is fatal.
    if config_errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(config_errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )
    if not no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)
    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)

    # When running from configs (no inline --pattern), report what will run
    # and fail if nothing valid was loaded.
    if not pattern:
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (list(valid_configs.keys())[0]
                               if len(valid_configs) == 1 else "")
        invalid_msg = (f"({len(config_errors)} config files were invalid)"
                       if len(config_errors) else "")
        logger.debug(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )
        notify_user_of_work(all_rules, include, exclude)
        if len(valid_configs) == 0:
            raise SemgrepError(
                f"no valid configuration file found ({len(config_errors)} configs were invalid)",
                code=MISSING_CONFIG_EXIT_CODE,
            )

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
    )

    # actually invoke semgrep
    rule_matches_by_rule, debug_steps_by_rule, semgrep_errors = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
    ).invoke_semgrep(target_manager, all_rules)
    output_handler.handle_semgrep_errors(semgrep_errors)

    # Drop matches suppressed with a nosem comment unless explicitly disabled.
    if not disable_nosem:
        rule_matches_by_rule = {
            rule: [
                rule_match for rule_match in rule_matches
                if not rule_match_nosem(rule_match, strict)
            ]
            for rule, rule_matches in rule_matches_by_rule.items()
        }
    output_handler.handle_semgrep_core_output(rule_matches_by_rule, debug_steps_by_rule)
    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def test_filter_exclude():
    """filter_excludes removes every path matched by an exclude pattern; each
    assertion spells out the exact surviving set for one pattern family."""
    all_file_names = [
        "/foo/bar/baz/a.py",
        "bar/baz",
        "bar/baz/foo/a.py",
        "bar/baz/foo/b.py",
        "bar/baz/foo/c.py",
        "bar/baz/qux/foo/a.py",
        "bar/foo/baz/bar.go",
        "bar/foo/foo.py",
        "baz.go",
        "baz.java",
        "baz.py",
        "baz/foo",
        "foo",
        "foo.go",
        "foo.java",
        "foo.py",
        "foo/bar.go",
        "foo/bar.java",
        "foo/bar.py",
    ]
    all_files = frozenset({Path(elem) for elem in all_file_names})

    # Filter out .py files
    assert TargetManager.filter_excludes(all_files, ["*.py"]) == {
        Path(p)
        for p in [
            "bar/baz",
            "bar/foo/baz/bar.go",
            "baz.go",
            "baz.java",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo/bar.go",
            "foo/bar.java",
        ]
    }
    # Filter out go files
    assert TargetManager.filter_excludes(all_files, ["*.go"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.java",
            "foo.py",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }
    # Filter out go and java files
    assert TargetManager.filter_excludes(all_files, ["*.go", "*.java"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.py",
            "foo/bar.py",
        ]
    }
    # Filter out files named foo or in a foo directory ancestor
    assert TargetManager.filter_excludes(all_files, ["foo"]) == {
        Path(p)
        for p in [
            "bar/baz",
            "baz.go",
            "baz.java",
            "baz.py",
            "foo.go",
            "foo.java",
            "foo.py",
        ]
    }
    # Filter out files with an ancestor named bar/baz
    assert TargetManager.filter_excludes(all_files, ["bar/baz"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "bar/foo/foo.py",
            "baz.go",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo.py",
            "foo/bar.go",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }
    # Filter out go files with a direct ancestor named foo
    assert TargetManager.filter_excludes(all_files, ["foo/*.go"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/baz/bar.go",
            "bar/foo/foo.py",
            "baz.go",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo.py",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }
    # Filter out go files with an ancestor named foo
    assert TargetManager.filter_excludes(all_files, ["foo/**/*.go"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.go",
            "baz.java",
            "baz.py",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo.py",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }
    # Filter out py files with three-characters name
    assert TargetManager.filter_excludes(all_files, ["???.py"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/baz/bar.go",
            "baz.go",
            "baz.java",
            "baz/foo",
            "foo",
            "foo.go",
            "foo.java",
            "foo/bar.go",
            "foo/bar.java",
        ]
    }
def test_filter_include():
    """filter_includes keeps only paths matched by an include pattern; each
    assertion spells out the exact kept set, and the trailing checks confirm
    that leading/trailing-slash and ** variants of a pattern are equivalent."""
    all_file_names = [
        "/foo/bar/baz/a.py",
        "bar/baz",
        "bar/baz/foo/a.py",
        "bar/baz/foo/b.py",
        "bar/baz/foo/c.py",
        "bar/baz/qux/foo/a.py",
        "bar/foo/baz/bar.go",
        "bar/foo/foo.py",
        "baz.go",
        "baz.java",
        "baz.py",
        "baz/foo",
        "foo",
        "foo.go",
        "foo.java",
        "foo.py",
        "foo/bar.go",
        "foo/bar.java",
        "foo/bar.py",
    ]
    all_files = frozenset({Path(elem) for elem in all_file_names})

    # All .py files
    assert TargetManager.filter_includes(all_files, ["*.py"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/foo.py",
            "baz.py",
            "foo.py",
            "foo/bar.py",
        ]
    }
    # All go files
    assert TargetManager.filter_includes(all_files, ["*.go"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "baz.go",
            "foo.go",
            "foo/bar.go",
        ]
    }
    # All go and java files
    assert TargetManager.filter_includes(all_files, ["*.go", "*.java"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "baz.go",
            "baz.java",
            "foo.go",
            "foo.java",
            "foo/bar.go",
            "foo/bar.java",
        ]
    }
    # All files named foo or in a foo directory ancestor
    assert TargetManager.filter_includes(all_files, ["foo"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
            "bar/foo/baz/bar.go",
            "bar/foo/foo.py",
            "baz/foo",
            "foo",
            "foo/bar.go",
            "foo/bar.java",
            "foo/bar.py",
        ]
    }
    # All files with an ancestor named bar/baz
    assert TargetManager.filter_includes(all_files, ["bar/baz"]) == {
        Path(p)
        for p in [
            "/foo/bar/baz/a.py",
            "bar/baz",
            "bar/baz/foo/a.py",
            "bar/baz/foo/b.py",
            "bar/baz/foo/c.py",
            "bar/baz/qux/foo/a.py",
        ]
    }
    # All go files with a direct ancestor named foo
    assert TargetManager.filter_includes(
        all_files, ["foo/*.go"]) == {Path(p) for p in [
            "foo/bar.go",
        ]}
    # All go files with an ancestor named foo
    assert TargetManager.filter_includes(all_files, ["foo/**/*.go"]) == {
        Path(p)
        for p in [
            "bar/foo/baz/bar.go",
            "foo/bar.go",
        ]
    }
    # All py files with three-characters name
    assert TargetManager.filter_includes(all_files, ["???.py"]) == {
        Path(p)
        for p in [
            "bar/foo/foo.py",
            "baz.py",
            "foo.py",
            "foo/bar.py",
        ]
    }

    # Test some different variations of the pattern yield the same result.
    assert TargetManager.filter_includes(
        all_files, ["baz/qux"]) == TargetManager.filter_includes(all_files, ["/baz/qux"])
    assert TargetManager.filter_includes(
        all_files, ["baz/qux"]) == TargetManager.filter_includes(all_files, ["baz/qux/"])
    assert TargetManager.filter_includes(
        all_files, ["baz/qux"]) == TargetManager.filter_includes(all_files, ["/baz/qux/"])
    assert TargetManager.filter_includes(
        all_files, ["baz/qux"]) == TargetManager.filter_includes(all_files, ["**/baz/qux"])
    assert TargetManager.filter_includes(
        all_files, ["baz/qux"]) == TargetManager.filter_includes(all_files, ["baz/qux/**"])
    assert TargetManager.filter_includes(
        all_files, ["baz/qux"]) == TargetManager.filter_includes(all_files, ["**/baz/qux/**"])
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
        Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        # One semgrep-core invocation per language the rule declares.
        for language, all_patterns_for_language in self._group_patterns_by_language(
                [rule]).items():
            try:
                targets = target_manager.get_files(language, rule.includes, rule.excludes)
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[
                        rule.languages_span.with_context(before=1, after=1)
                    ],
                ) from ex
            # No matching files for this language: skip the core invocation.
            if targets == []:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                patterns_json = [
                    pattern.to_json() for pattern in patterns_regex
                ]
                try:
                    patterns_re = [(pattern["id"], re.compile(pattern["pattern"]))
                                   for pattern in patterns_json]
                except re.error as err:
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}")
                # Fan the regex scan out across targets with a process pool.
                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)
                outputs.extend(single_match for file_matches in matches
                               for single_match in file_matches)

            # Hand the remaining (non-regex) patterns to semgrep-core via
            # temp files for rules, targets, and (optionally) equivalences.
            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile(
                    "w") as pattern_file, tempfile.NamedTemporaryFile(
                        "w") as target_file, tempfile.NamedTemporaryFile(
                            "w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()
                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]
                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]
                core_run = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                debug_print(core_run.stderr.decode("utf-8", "replace"))
                if core_run.returncode != 0:
                    # see if semgrep output a JSON error that we can decode
                    semgrep_output = core_run.stdout.decode("utf-8", "replace")
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )
                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(
                            output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )
                output_json = json.loads(
                    (core_run.stdout.decode("utf-8", "replace")))
                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"])
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))
        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)
        findings = []
        debugging_steps: List[Any] = []
        # NOTE(review): the loop variable shadows the `rule` parameter; since
        # by_rule_index is only ever keyed by that same rule this looks
        # harmless, but confirm before relying on it.
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                debug_print(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")
                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)
        findings = dedup_output(findings)
        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
def test_explicit_path(tmp_path, monkeypatch):
    """Newest variant of the explicit-target test, covering the
    skip_unknown_extensions flag (last positional argument).

    TargetManager args here are positional: includes, excludes,
    max_target_bytes, targets, respect_git_ignore, output_handler,
    skip_unknown_extensions — matching the keyword construction in main().
    """
    foo = tmp_path / "foo"
    foo.mkdir()
    (foo / "a.go").touch()
    (foo / "b.go").touch()
    foo_noext = foo / "noext"
    foo_noext.touch()
    foo_a = foo / "a.py"
    foo_a.touch()
    foo_b = foo / "b.py"
    foo_b.touch()
    monkeypatch.chdir(tmp_path)

    # Should include explicitly passed python file
    foo_a = foo_a.relative_to(tmp_path)
    output_settings = OutputSettings(
        output_format=OutputFormat.TEXT,
        output_destination=None,
        error_on_findings=False,
        verbose_errors=False,
        strict=False,
        json_stats=False,
        output_time=False,
        output_per_finding_max_lines_limit=None,
        output_per_line_max_chars_limit=None,
    )
    defaulthandler = OutputHandler(output_settings)
    python_language = Language("python")
    assert foo_a in TargetManager([], [], 0, ["foo/a.py"], False, defaulthandler,
                                  False).get_files(python_language, [], [])
    assert foo_a in TargetManager([], [], 0, ["foo/a.py"], False, defaulthandler,
                                  True).get_files(python_language, [], [])

    # Should include explicitly passed python file even if is in excludes
    assert foo_a not in TargetManager([], ["foo/a.py"], 0, ["."], False,
                                      defaulthandler,
                                      False).get_files(python_language, [], [])
    assert foo_a in TargetManager([], ["foo/a.py"], 0, [".", "foo/a.py"], False,
                                  defaulthandler,
                                  False).get_files(python_language, [], [])

    # Should ignore explicitly passed .go file when requesting python
    assert (TargetManager([], [], 0, ["foo/a.go"], False, defaulthandler,
                          False).get_files(python_language, [], []) == frozenset())

    # Should include explicitly passed file with unknown extension if skip_unknown_extensions=False
    assert cmp_path_sets(
        TargetManager([], [], 0, ["foo/noext"], False, defaulthandler,
                      False).get_files(python_language, [], []),
        {foo_noext},
    )

    # Should not include explicitly passed file with unknown extension if skip_unknown_extensions=True
    assert cmp_path_sets(
        TargetManager([], [], 0, ["foo/noext"], False, defaulthandler,
                      True).get_files(python_language, [], []),
        set(),
    )

    # Should include explicitly passed file with correct extension even if skip_unknown_extensions=True
    assert cmp_path_sets(
        TargetManager([], [], 0, ["foo/noext", "foo/a.py"], False, defaulthandler,
                      True).get_files(python_language, [], []),
        {foo_a},
    )
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
    optimizations: str = "none",
) -> None:
    """Drive a full semgrep run: load and filter rules, build the target set,
    invoke the core runner (with optional optimizations), apply nosem
    filtering, record metrics, and emit results via the output handler.
    """
    if include is None:
        include = []
    if exclude is None:
        exclude = []

    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # Optionally restrict the run to rules at the requested severities.
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]

    output_handler.handle_semgrep_errors(errors)
    # With --strict, any config-loading error is fatal.
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev

The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    # Turn off optimizations if using features not supported yet
    if optimizations == "all":
        # taint mode rules not yet supported
        if any(rule.mode == TAINT_MODE for rule in filtered_rules):
            logger.info("Running without optimizations since taint rule found")
            optimizations = "none"
        # step by step evaluation output not yet supported
        elif output_handler.settings.debug:
            logger.info(
                "Running without optimizations since step-by-step evaluation output desired"
            )
            optimizations = "none"
        elif any(rule.has_pattern_where_python() for rule in filtered_rules):
            logger.info(
                "Running without optimizations since running pattern-where-python rules"
            )
            optimizations = "none"

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        output_settings=output_handler.settings,
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)
    profiler.save("total_time", start_time)
    output_handler.handle_semgrep_errors(semgrep_errors)

    # Mark each match with its nosem status (and collect nosem parse errors).
    nosem_errors = []
    for rule, rule_matches in rule_matches_by_rule.items():
        evolved_rule_matches = []
        for rule_match in rule_matches:
            ignored, returned_errors = rule_match_nosem(rule_match, strict)
            evolved_rule_matches.append(
                attr.evolve(rule_match, is_ignored=ignored))
            nosem_errors.extend(returned_errors)
        rule_matches_by_rule[rule] = evolved_rule_matches
    output_handler.handle_semgrep_errors(nosem_errors)

    # Drop ignored matches (unless nosem is disabled), counting how many
    # were suppressed for the metrics below.
    num_findings_nosem = 0
    if not disable_nosem:
        filtered_rule_matches_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            filtered_rule_matches = []
            for rule_match in rule_matches:
                if rule_match._is_ignored:
                    num_findings_nosem += 1
                else:
                    filtered_rule_matches.append(rule_match)
            filtered_rule_matches_by_rule[rule] = filtered_rule_matches
        rule_matches_by_rule = filtered_rule_matches_by_rule

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    # Best-effort metrics collection; failures here only log at debug level.
    if metric_manager.is_enabled:
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(
                f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(
                    f"Failed to get project url from .git/config: {e}")
        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(num_findings_nosem)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(
            list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, all_targets, filtered_rules)

    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )
    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    testing: bool = False,
    severity: Optional[List[str]] = None,
) -> None:
    """Run semgrep end to end: load rules, scan targets, report findings.

    Loads configs from `pattern`/`configs`, keeps only rules matching
    `severity` (empty/None keeps all), scans the files selected by
    `target`/`include`/`exclude` via semgrep-core, tags and (unless
    `disable_nosem`) drops nosem-suppressed matches, hands everything to
    `output_handler`, and finally applies autofixes if requested.

    Raises:
        SemgrepError: on config-loading errors under `strict`, or when no
            valid configuration file was found.
    """
    include = [] if include is None else include
    exclude = [] if exclude is None else exclude

    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    # A None/empty severity list means "keep every rule".
    if severity:
        filtered_rules = [rule for rule in all_rules if rule.severity in severity]
    else:
        filtered_rules = all_rules

    output_handler.handle_semgrep_errors(errors)
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not pattern:
        # Config-driven run: summarize what was loaded, bail if nothing valid.
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (
            list(configs_obj.valid.keys())[0] if len(configs_obj.valid) == 1 else ""
        )
        invalid_msg = (
            f"({len(errors)} config files were invalid)" if len(errors) else ""
        )
        logger.debug(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )
        if len(configs_obj.valid) == 0:
            raise SemgrepError(
                f"no valid configuration file found ({len(errors)} configs were invalid)",
                code=MISSING_CONFIG_EXIT_CODE,
            )
        notify_user_of_work(filtered_rules, include, exclude)

    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=not no_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    # Hand the selected targets and rules to semgrep-core.
    core_runner = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        testing=testing,
    )
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        num_targets,
    ) = core_runner.invoke_semgrep(target_manager, filtered_rules)
    output_handler.handle_semgrep_errors(semgrep_errors)

    # Tag every match with whether a nosem comment suppresses it.
    tagged_matches_by_rule = {}
    for rule, rule_matches in rule_matches_by_rule.items():
        tagged_matches_by_rule[rule] = [
            attr.evolve(rule_match, is_ignored=rule_match_nosem(rule_match, strict))
            for rule_match in rule_matches
        ]
    rule_matches_by_rule = tagged_matches_by_rule

    if not disable_nosem:
        # Drop the matches that nosem comments suppressed.
        rule_matches_by_rule = {
            rule: [match for match in matches if not match._is_ignored]
            for rule, matches in rule_matches_by_rule.items()
        }

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {num_targets} files: {num_findings} findings"
    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule, debug_steps_by_rule, stats_line
    )

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
) -> None:
    """Run semgrep end to end: load rules, scan targets, report findings.

    Loads configs from `pattern`/`configs`, keeps only rules matching
    `severity` (None/empty keeps all), scans the files selected by
    `target`/`include`/`exclude` via semgrep-core, tags and (unless
    `disable_nosem`) drops nosem-suppressed matches, hands results plus
    target/profiling info to `output_handler`, and finally applies
    autofixes if requested.

    Raises:
        SemgrepError: on config-loading errors under `strict`, or when no
            valid configuration file was found.
    """
    if include is None:
        include = []
    if exclude is None:
        exclude = []
    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)
    # None or [] for severity means "keep every rule".
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]
    output_handler.handle_semgrep_errors(errors)
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )
    if not pattern:
        # Config-driven run: log a summary of what was loaded.
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.debug(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )
        if len(configs_obj.valid) == 0:
            # Distinguish "configs were given but all invalid" from
            # "no config was given at all" for a more helpful message.
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
        notify_user_of_work(filtered_rules, include, exclude)
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiler,
    ) = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
    ).invoke_semgrep(target_manager, filtered_rules)
    output_handler.handle_semgrep_errors(semgrep_errors)
    # Tag every match with whether a nosem comment suppresses it.
    rule_matches_by_rule = {
        rule: [
            attr.evolve(rule_match,
                        is_ignored=rule_match_nosem(rule_match, strict))
            for rule_match in rule_matches
        ]
        for rule, rule_matches in rule_matches_by_rule.items()
    }
    if not disable_nosem:
        # Drop the matches that nosem comments suppressed.
        rule_matches_by_rule = {
            rule: [
                rule_match
                for rule_match in rule_matches
                if not rule_match._is_ignored
            ]
            for rule, rule_matches in rule_matches_by_rule.items()
        }
    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"
    output_handler.handle_semgrep_core_output(rule_matches_by_rule,
                                              debug_steps_by_rule,
                                              stats_line, all_targets,
                                              profiler)
    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def main(
    *,
    output_handler: OutputHandler,
    target: Sequence[str],
    pattern: Optional[str],
    lang: Optional[str],
    configs: Sequence[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    replacement: Optional[str] = None,
    dryrun: bool = False,
    disable_nosem: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[Sequence[str]] = None,
    optimizations: str = "none",
) -> None:
    """Run semgrep end to end: load rules, scan targets, report findings.

    Loads configs from `pattern`/`configs` (optionally with a `replacement`
    autofix pattern), filters rules by `severity`, splits off join-mode
    rules, runs the rest through semgrep-core, runs join rules separately,
    filters nosem-suppressed matches via `process_ignores`, records
    anonymous metrics when enabled, and hands everything to
    `output_handler` before optionally applying autofixes.

    Raises:
        SemgrepError: on config-loading errors under `strict`, or when no
            valid configuration file was found.
    """
    if include is None:
        include = []
    if exclude is None:
        exclude = []
    configs_obj, errors = get_config(pattern, lang, configs, replacement)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)
    # A falsy severity (None or empty) keeps every rule.
    if not severity:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity.value in severity
        ]
    output_handler.handle_semgrep_errors(errors)
    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )
    if not pattern:
        # Config-driven run: log a summary of what was loaded.
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
            .strip())
        if len(configs_obj.valid) == 0:
            # Distinguish "configs were given but all invalid" from
            # "no config was given at all" for a more helpful message.
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
        notify_user_of_work(filtered_rules, include, exclude)
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )
    profiler = ProfileManager()
    # Join-mode rules run through a separate engine below, not semgrep-core.
    join_rules, rest_of_the_rules = partition(
        lambda rule: rule.mode == JOIN_MODE,
        filtered_rules,
    )
    filtered_rules = rest_of_the_rules
    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)
    if join_rules:
        # Imported lazily so ordinary runs never pay for the join engine.
        import semgrep.join_rule as join_rule

        for rule in join_rules:
            join_rule_matches, join_rule_errors = join_rule.run_join_rule(
                rule.raw, [Path(t) for t in target_manager.targets])
            join_rule_matches_by_rule = {
                Rule.from_json(rule.raw): join_rule_matches
            }
            rule_matches_by_rule.update(join_rule_matches_by_rule)
            output_handler.handle_semgrep_errors(join_rule_errors)
    profiler.save("total_time", start_time)
    # Tag/drop nosem-suppressed matches (honoring disable_nosem).
    filtered_matches = process_ignores(rule_matches_by_rule,
                                       output_handler,
                                       strict=strict,
                                       disable_nosem=disable_nosem)
    output_handler.handle_semgrep_errors(semgrep_errors)
    output_handler.handle_semgrep_errors(filtered_matches.errors)
    num_findings = sum(len(v) for v in filtered_matches.matches.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"
    if metric_manager.is_enabled:
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(
                f"Failed to get project url from 'git ls-remote': {e}")
            # Fall back to scraping .git/config when ls-remote fails.
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(
                    f"Failed to get project url from .git/config: {e}")
        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(filtered_matches.num_matches)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(
            list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, list(all_targets),
                                       filtered_rules)
    output_handler.handle_semgrep_core_output(
        filtered_matches.matches,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )
    if autofix:
        apply_fixes(filtered_matches.matches, dryrun)