def dump_parsed_ast(
    to_json: bool, language: str, pattern: Optional[str], targets_str: List[str]
) -> None:
    """Print the AST that semgrep-core produces for a pattern or a target file.

    If `pattern` is given, it is written to a temp file and dumped with
    -dump_pattern; otherwise exactly one target file is required and dumped
    with -dump_ast. With `to_json`, -json is prepended so semgrep-core emits
    JSON instead of its default textual dump.

    :raises SemgrepError: if more than one target is given without a pattern,
        or if invoking semgrep-core fails.
    """
    targets = semgrep.config_resolver.resolve_targets(targets_str)
    with tempfile.NamedTemporaryFile("w") as fout:
        if pattern:
            # Write the pattern to a temp file; flush so semgrep-core sees it.
            fout.write(pattern)
            fout.flush()
            args = ["-lang", language, "-dump_pattern", fout.name]
        else:
            if len(targets) != 1:
                raise SemgrepError("--dump-ast requires exactly one target file")
            target = targets[0]
            args = ["-lang", language, "-dump_ast", str(target)]

        if to_json:
            args = ["-json"] + args

        cmd = [SEMGREP_PATH] + args
        try:
            output = sub_check_output(cmd)
        except subprocess.CalledProcessError as ex:
            # Chain the cause so the original CalledProcessError is preserved
            # in the traceback for debugging.
            raise SemgrepError(
                f"error invoking semgrep with:\n\t{' '.join(cmd)}\n\t{ex}\n{PLEASE_FILE_ISSUE_TEXT}"
            ) from ex
        print(output.decode(errors="replace"))
def synthesize(language: str, code_to_synthesize: str, targets_str: Sequence[str]) -> None:
    """Ask semgrep-core to synthesize patterns from a code snippet.

    Requires exactly one target file; the synthesized patterns are printed
    to stdout as produced by semgrep-core's -synthesize_patterns.

    :raises SemgrepError: if the target count is not exactly one, or if
        invoking semgrep-core fails.
    """
    resolved = semgrep.config_resolver.resolve_targets(targets_str)
    if len(resolved) != 1:
        raise SemgrepError("--synthesize-patterns requires exactly one target file")

    cmd = [
        SemgrepCore.path(),
        "-synthesize_patterns",
        code_to_synthesize,
        str(resolved[0]),
    ]
    try:
        raw_output = sub_check_output(cmd)
    except subprocess.CalledProcessError as ex:
        raise SemgrepError(
            f"error invoking semgrep with:\n\t{' '.join(cmd)}\n\t{ex}\n{PLEASE_FILE_ISSUE_TEXT}"
        )
    print(raw_output.decode(errors="replace"))
def metavariable_comparison(
    metavariable: str, comparison: str, content: Union[int, float, str]
) -> bool:
    """Evaluate a metavariable-comparison expression via semgrep-core's -eval.

    Serializes the metavariable binding and the comparison expression to a
    temp file as JSON, runs semgrep-core on it, and returns True iff the
    evaluator printed exactly "true".

    :raises SemgrepError: if invoking semgrep-core fails.
    """
    eval_request = {
        "metavars": {metavariable: content},
        "language": "python",  # Hardcode for now
        "code": comparison,
    }
    with tempfile.NamedTemporaryFile("w") as request_file:
        json.dump(eval_request, request_file)
        request_file.flush()
        eval_cmd = [SEMGREP_PATH, "-eval", request_file.name]
        try:
            raw_result: bytes = sub_check_output(eval_cmd)
        except subprocess.CalledProcessError as ex:
            raise SemgrepError(
                f"error invoking semgrep with:\n\t{' '.join(eval_cmd)}\n\t{ex}\n{PLEASE_FILE_ISSUE_TEXT}"
            )
    return raw_result.strip() == b"true"
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
    optimizations: str = "none",
) -> None:
    """Run a full semgrep scan: resolve configs, filter rules, invoke the core
    runner on the targets, post-process nosem-ignored matches, record metrics,
    and hand all results/errors to `output_handler`.

    :param output_handler: sink for matches, errors, and the final report
    :param pattern: ad-hoc pattern; when falsy, rules come from `configs`
    :param severity: when non-empty, only rules whose severity is listed run
    :raises SemgrepError: on strict-mode config errors or when no valid
        configuration can be found
    """
    if include is None:
        include = []
    if exclude is None:
        exclude = []

    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)
    # Keep every rule unless the caller asked for specific severities.
    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [rule for rule in all_rules if rule.severity in severity]

    output_handler.handle_semgrep_errors(errors)

    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not pattern:
        # Rule-based run: report what is about to execute, and fail early if
        # no usable config was loaded.
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (
            list(configs_obj.valid.keys())[0] if len(configs_obj.valid) == 1 else ""
        )
        invalid_msg = (
            f"({len(errors)} config files were invalid)" if len(errors) else ""
        )
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )
        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.

If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

    notify_user_of_work(filtered_rules, include, exclude)

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    # Turn off optimizations if using features not supported yet
    if optimizations == "all":
        # taint mode rules not yet supported
        if any(rule.mode == TAINT_MODE for rule in filtered_rules):
            logger.info("Running without optimizations since taint rule found")
            optimizations = "none"
        # step by step evaluation output not yet supported
        elif output_handler.settings.debug:
            logger.info(
                "Running without optimizations since step-by-step evaluation output desired"
            )
            optimizations = "none"
        # pattern-where-python rules not yet supported
        elif any(rule.has_pattern_where_python() for rule in filtered_rules):
            logger.info(
                "Running without optimizations since running pattern-where-python rules"
            )
            optimizations = "none"

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        output_settings=output_handler.settings,
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)
    profiler.save("total_time", start_time)

    output_handler.handle_semgrep_errors(semgrep_errors)

    # Annotate every match with its nosem-ignore status; errors raised by
    # nosem processing are collected and reported after the loop.
    nosem_errors = []
    for rule, rule_matches in rule_matches_by_rule.items():
        evolved_rule_matches = []
        for rule_match in rule_matches:
            ignored, returned_errors = rule_match_nosem(rule_match, strict)
            evolved_rule_matches.append(attr.evolve(rule_match, is_ignored=ignored))
            nosem_errors.extend(returned_errors)
        rule_matches_by_rule[rule] = evolved_rule_matches

    output_handler.handle_semgrep_errors(nosem_errors)

    # Unless --disable-nosem, drop ignored matches (counting them for metrics).
    num_findings_nosem = 0
    if not disable_nosem:
        filtered_rule_matches_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            filtered_rule_matches = []
            for rule_match in rule_matches:
                if rule_match._is_ignored:
                    num_findings_nosem += 1
                else:
                    filtered_rule_matches.append(rule_match)
            filtered_rule_matches_by_rule[rule] = filtered_rule_matches
        rule_matches_by_rule = filtered_rule_matches_by_rule

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    if metric_manager.is_enabled:
        # Best-effort project identification: try git ls-remote first, then
        # fall back to scanning .git/config. Failures only log at debug level.
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(num_findings_nosem)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, all_targets, filtered_rules)

    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def _expand_dir(
    curr_dir: Path, language: Language, respect_git_ignore: bool
) -> FrozenSet[Path]:
    """
    Recursively go through a directory and return list of all files
    with default file extension of language

    When `respect_git_ignore` is set and `curr_dir` is a git repo, uses
    `git ls-files` so that gitignored files are skipped and deleted-but-
    still-listed files are removed; otherwise (or if git is unavailable)
    falls back to a plain recursive glob.
    """

    def _parse_output(output: str, curr_dir: Path) -> FrozenSet[Path]:
        """
        Convert a newline delimited list of files to a set of path objects
        prepends curr_dir to all paths in said list

        If list is empty then returns an empty set
        """
        files: FrozenSet[Path] = frozenset()
        if output:
            files = frozenset(
                p
                for p in (Path(curr_dir) / elem for elem in output.strip().split("\n"))
                if TargetManager._is_valid(p)
            )
        return files

    def _find_files_with_extension(
        curr_dir: Path, extension: FileExtension
    ) -> FrozenSet[Path]:
        """
        Return set of all files in curr_dir with given extension
        """
        return frozenset(
            p
            for p in curr_dir.rglob(f"*{extension}")
            if TargetManager._is_valid(p) and p.is_file()
        )

    extensions = lang_to_exts(language)
    expanded: FrozenSet[Path] = frozenset()

    for ext in extensions:
        if respect_git_ignore:
            try:
                # Tracked files
                tracked_output = sub_check_output(
                    ["git", "ls-files", f"*{ext}"],
                    cwd=curr_dir.resolve(),
                    encoding="utf-8",
                    stderr=subprocess.DEVNULL,
                )
                # Untracked but not ignored files
                untracked_output = sub_check_output(
                    [
                        "git",
                        "ls-files",
                        "--other",
                        "--exclude-standard",
                        f"*{ext}",
                    ],
                    cwd=curr_dir.resolve(),
                    encoding="utf-8",
                    stderr=subprocess.DEVNULL,
                )
                # Files deleted from the worktree but still in the index;
                # these must be excluded from the result below.
                deleted_output = sub_check_output(
                    ["git", "ls-files", "--deleted", f"*{ext}"],
                    cwd=curr_dir.resolve(),
                    encoding="utf-8",
                    stderr=subprocess.DEVNULL,
                )
            except (subprocess.CalledProcessError, FileNotFoundError):
                # Not a git directory or git not installed. Fallback to using rglob
                ext_files = _find_files_with_extension(curr_dir, ext)
                expanded = expanded.union(ext_files)
            else:
                tracked = _parse_output(tracked_output, curr_dir)
                untracked_unignored = _parse_output(untracked_output, curr_dir)
                deleted = _parse_output(deleted_output, curr_dir)
                # Order matters: subtract deleted files only after both
                # tracked and untracked sets have been merged in.
                expanded = expanded.union(tracked)
                expanded = expanded.union(untracked_unignored)
                expanded = expanded.difference(deleted)
        else:
            ext_files = _find_files_with_extension(curr_dir, ext)
            expanded = expanded.union(ext_files)

    return expanded
def main(
    *,
    output_handler: OutputHandler,
    target: Sequence[str],
    pattern: Optional[str],
    lang: Optional[str],
    configs: Sequence[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    replacement: Optional[str] = None,
    dryrun: bool = False,
    disable_nosem: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[Sequence[str]] = None,
    optimizations: str = "none",
) -> None:
    """Run a full semgrep scan (keyword-only variant): resolve configs,
    filter rules, run ordinary rules through the core runner and join-mode
    rules through the join-rule engine, process nosem ignores, record
    metrics, and hand all results/errors to `output_handler`.

    :param pattern: ad-hoc pattern; when falsy, rules come from `configs`
    :param replacement: optional autofix replacement paired with `pattern`
    :param severity: when non-empty, only rules whose severity value is
        listed run
    :raises SemgrepError: on strict-mode config errors or when no valid
        configuration can be found
    """
    if include is None:
        include = []
    if exclude is None:
        exclude = []

    configs_obj, errors = get_config(pattern, lang, configs, replacement)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)
    # Keep every rule unless the caller asked for specific severities.
    if not severity:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity.value in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not pattern:
        # Rule-based run: report what is about to execute, and fail early if
        # no usable config was loaded.
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (
            list(configs_obj.valid.keys())[0] if len(configs_obj.valid) == 1 else ""
        )
        invalid_msg = (
            f"({len(errors)} config files were invalid)" if len(errors) else ""
        )
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}".strip()
        )
        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.

If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

    notify_user_of_work(filtered_rules, include, exclude)

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

    # Join-mode rules run through a separate engine; split them out so the
    # core runner only sees ordinary rules.
    join_rules, rest_of_the_rules = partition(
        lambda rule: rule.mode == JOIN_MODE,
        filtered_rules,
    )
    filtered_rules = rest_of_the_rules

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)

    if join_rules:
        # Imported lazily so ordinary runs don't pay for the join-rule
        # machinery.
        import semgrep.join_rule as join_rule

        for rule in join_rules:
            join_rule_matches, join_rule_errors = join_rule.run_join_rule(
                rule.raw, [Path(t) for t in target_manager.targets]
            )
            join_rule_matches_by_rule = {
                Rule.from_json(rule.raw): join_rule_matches
            }
            rule_matches_by_rule.update(join_rule_matches_by_rule)
            output_handler.handle_semgrep_errors(join_rule_errors)

    profiler.save("total_time", start_time)

    # Drop (or annotate, with --disable-nosem) matches suppressed by nosem
    # comments; the filtered result carries its own errors and counts.
    filtered_matches = process_ignores(
        rule_matches_by_rule, output_handler, strict=strict, disable_nosem=disable_nosem
    )

    output_handler.handle_semgrep_errors(semgrep_errors)
    output_handler.handle_semgrep_errors(filtered_matches.errors)

    num_findings = sum(len(v) for v in filtered_matches.matches.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

    if metric_manager.is_enabled:
        # Best-effort project identification: try git ls-remote first, then
        # fall back to scanning .git/config. Failures only log at debug level.
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatability)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(filtered_matches.num_matches)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(
            profiling_data, list(all_targets), filtered_rules
        )

    output_handler.handle_semgrep_core_output(
        filtered_matches.matches,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(filtered_matches.matches, dryrun)