Example #1
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        Dict[Any, Any],
    ]:
        from itertools import chain
        from collections import defaultdict

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    target_file.write("\n".join(str(p) for p in targets))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out_bytes, err_bytes, returncode = r.stdout, r.stderr, r.returncode
                    output_json = self._parse_core_output(
                        out_bytes, err_bytes, returncode
                    )

                    if returncode != 0:
                        if "error" in output_json:
                            self._raise_semgrep_error_from_json(output_json, [], rule)
                        else:
                            raise SemgrepError(
                                f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
                            )

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
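
`dedup_output` is called above but not shown. A minimal sketch of what it plausibly does, assuming findings expose an id, path, and span (the key fields are illustrative, not necessarily the project's actual ones):

from typing import List

def dedup_output(findings: List["RuleMatch"]) -> List["RuleMatch"]:
    # Hypothetical sketch: drop exact duplicates while preserving order.
    # The real key likely combines rule id, path, and match span.
    seen = set()
    unique = []
    for finding in findings:
        key = (finding.id, finding.path, finding.start, finding.end)
        if key not in seen:
            seen.add(key)
            unique.append(finding)
    return unique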
Example #2
def run_spacegrep(rule_id: str, patterns: List[Pattern], targets: List[Path],
                  timeout: int) -> dict:
    matches: List[dict] = []
    errors: List[dict] = []
    for pattern in patterns:
        if not isinstance(pattern._pattern, str):
            raise NotImplementedError(
                f"Support for {type(pattern._pattern)} has not been implemented yet."
            )
        pattern_str = pattern._pattern  # TODO: Handle pattern Dict
        for target in targets:
            cmd = [
                SPACEGREP_PATH,
                "--output-format",
                "semgrep",
                "-d",
                str(target),
                pattern_str,
                "--timeout",
                str(timeout),
            ]
            try:
                p = sub_run(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
                # exit code 3 indicates a timeout. See 'spacegrep --help'.
                if p.returncode == 3:
                    err = CoreException(
                        check_id="Timeout",
                        path=target,
                        start=Position(0, 0),
                        end=Position(0, 0),
                        extra={
                            "message": "spacegrep timeout",
                            "line": "",
                        },
                        language="generic",
                        rule_id=rule_id,
                    ).to_dict()
                    errors.append(err)
                else:
                    p.check_returncode()
                    raw_output = p.stdout

                    output_json = _parse_spacegrep_output(raw_output)
                    output_json["matches"] = _patch_id(
                        pattern, output_json.get("matches", []))

                    matches.extend(output_json["matches"])
                    errors.extend(output_json["errors"])
            except subprocess.CalledProcessError as e:
                raw_error = p.stderr
                spacegrep_error_text = raw_error.decode("utf-8",
                                                        errors="replace")
                raise SemgrepError(
                    f"Error running spacegrep on file {target}: Process error: {e}\n\nspacegrep error: {spacegrep_error_text}"
                )
            except json.JSONDecodeError as e:
                raise SemgrepError(
                    f"Could not parse spacegrep output as JSON: JSON error: {e}"
                )
            except KeyError as e:
                raise SemgrepError(
                    f"Invalid JSON output was received from spacegrep: {e}")

    return {
        "matches": matches,
        "errors": errors,
    }
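
`_parse_spacegrep_output` and `_patch_id` are referenced but not shown. A rough sketch of `_patch_id` under the assumption that spacegrep matches need to be stamped with the originating pattern's id (the `_id` attribute is a guess):

from typing import List

def _patch_id(pattern: "Pattern", matches: List[dict]) -> List[dict]:
    # Hypothetical sketch: spacegrep does not know which semgrep pattern
    # produced a match, so stamp each match with the pattern's id.
    for match in matches:
        match["check_id"] = pattern._id  # field name is an assumption
    return matches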
Example #3
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
        profiler: ProfileManager,
        match_time_matrix: Dict[Tuple[str, str], float],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
        """
        Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()

        for language, all_patterns_for_language in self._group_patterns_by_language(
            rule
        ).items():

            targets = self.get_files_for_language(language, rule, target_manager)
            targets = [target for target in targets if target not in max_timeout_files]
            all_targets = all_targets.union(targets)
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator
                    in (OPERATORS.REGEX, OPERATORS.NOT_REGEX),
                    all_patterns_for_language,
                )
                if patterns_regex:
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # regex-only rules only support OPERATORS.REGEX.
                # Skip passing this rule to semgrep-core.
                if language in REGEX_ONLY_LANGUAGE_KEYS:
                    continue

                # semgrep-core doesn't know about the following operators -
                # they are strictly semgrep Python features:
                #   - OPERATORS.METAVARIABLE_REGEX
                #   - OPERATORS.METAVARIABLE_COMPARISON
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator
                    not in [
                        OPERATORS.METAVARIABLE_REGEX,
                        OPERATORS.METAVARIABLE_COMPARISON,
                    ]
                ]

                patterns_json = [p.to_json() for p in patterns]

                if language == GENERIC_LANGUAGE:
                    output_json = profiler.track(
                        rule.id,
                        run_spacegrep,
                        rule.id,
                        patterns,
                        targets,
                        timeout=self._timeout,
                        report_time=self._report_time,
                    )
                else:  # Run semgrep-core
                    output_json = profiler.track(
                        rule.id,
                        self._run_core_command,
                        patterns_json,
                        patterns,
                        targets,
                        language,
                        rule,
                        "-rules_file",
                        cache_dir,
                        report_time=self._report_time,
                    )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])
            if "time" in output_json:
                self._add_match_times(rule, match_time_matrix, output_json["time"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[
            Rule, Dict[Path, List[PatternMatch]]
        ] = collections.defaultdict(lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                logger.debug(
                    f"--> rule ({rule.id}) has findings on filepath: {filepath}"
                )

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec
                )
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)
        logger.debug(f"...ran on {len(all_targets)} files")

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors, all_targets
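
`partition` splits an iterable on a predicate; judging from the call site above, the first returned list holds the elements that satisfy it. A sketch consistent with that usage:

from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar("T")

def partition(
    pred: Callable[[T], bool], iterable: Iterable[T]
) -> Tuple[List[T], List[T]]:
    # Split one pass over the input into (matching, non-matching) lists.
    trues: List[T] = []
    falses: List[T] = []
    for item in iterable:
        (trues if pred(item) else falses).append(item)
    return trues, falses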
Example #4
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Any]],
               List[SemgrepError], Set[Path], ProfilingData, ]:
        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = collections.defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        file_timeouts: Dict[Path, int] = collections.defaultdict(lambda: 0)
        max_timeout_files: Set[Path] = set()

        profiling_data: ProfilingData = ProfilingData()
        # cf. for bar_format: https://tqdm.github.io/docs/tqdm/
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule in progress_bar(
                    rules, bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"):
                for language in rule.languages:
                    debug_tqdm_write(f"Running rule {rule.id}...")
                    with tempfile.NamedTemporaryFile(
                            "w", suffix=".yaml"
                    ) as rule_file, tempfile.NamedTemporaryFile(
                            "w") as target_file:
                        targets = self.get_files_for_language(
                            language, rule, target_manager)

                        targets = [
                            target for target in targets
                            if target not in max_timeout_files
                        ]

                        # opti: no need to call semgrep-core if no target files
                        if not targets:
                            continue
                        all_targets = all_targets.union(targets)

                        target_file.write("\n".join(
                            map(lambda p: str(p), targets)))
                        target_file.flush()
                        yaml = YAML()
                        yaml.dump({"rules": [rule._raw]}, rule_file)
                        rule_file.flush()

                        cmd = [SEMGREP_PATH] + [
                            "-lang",
                            language.value,
                            "-json",
                            "-config",
                            rule_file.name,
                            "-j",
                            str(self._jobs),
                            "-target_file",
                            target_file.name,
                            "-use_parsing_cache",
                            semgrep_core_ast_cache_dir,
                            "-timeout",
                            str(self._timeout),
                            "-max_memory",
                            str(self._max_memory),
                            "-json_time",
                        ]

                        if self._optimizations != "none":
                            cmd.append("-fast")

                        stderr: Optional[int] = subprocess.PIPE
                        if is_debug():
                            cmd += ["-debug"]
                            stderr = None

                        core_run = sub_run(cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=stderr)
                        output_json = self._extract_core_output(rule, core_run)

                        if "time" in output_json:
                            self._add_match_times(rule, profiling_data,
                                                  output_json["time"])

                    # end with tempfile.NamedTemporaryFile(...) ...
                    pattern_matches = [
                        PatternMatch(match) for match in output_json["matches"]
                    ]
                    findings = create_output(rule, pattern_matches)

                    findings = dedup_output(findings)
                    outputs[rule].extend(findings)
                    parsed_errors = [
                        CoreException.from_json(e, language.value,
                                                rule.id).into_semgrep_error()
                        for e in output_json["errors"]
                    ]
                    for err in parsed_errors:
                        if isinstance(err, MatchTimeoutError):
                            file_timeouts[err.path] += 1
                            if (self._timeout_threshold != 0
                                    and file_timeouts[err.path] >=
                                    self._timeout_threshold):
                                max_timeout_files.add(err.path)
                    errors.extend(parsed_errors)
            # end for language ...
        # end for rule ...

        return outputs, {}, errors, all_targets, profiling_data
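
The timeout bookkeeping near the end of this example can be read in isolation: each MatchTimeoutError increments a per-file counter, and once a file crosses `timeout_threshold` it is excluded from all later rules (a threshold of 0 disables the cutoff). A standalone sketch of that logic:

from pathlib import Path
from typing import Dict, Iterable, Set

def track_timeouts(
    timed_out: Iterable[Path],
    file_timeouts: Dict[Path, int],
    max_timeout_files: Set[Path],
    timeout_threshold: int,
) -> None:
    # Once a file has timed out `timeout_threshold` times across rules,
    # exclude it from every subsequent rule (threshold 0 disables this).
    for path in timed_out:
        file_timeouts[path] = file_timeouts.get(path, 0) + 1
        if timeout_threshold != 0 and file_timeouts[path] >= timeout_threshold:
            max_timeout_files.add(path)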
Example #5
    def _run_rules_direct_to_semgrep_core(
        self,
        rules: List[Rule],
        target_manager: TargetManager,
        profiler: ProfileManager,
    ) -> Tuple[
        Dict[Rule, List[RuleMatch]],
        Dict[Rule, List[Any]],
        List[SemgrepError],
        Set[Path],
        ProfilingData,
    ]:
        from itertools import chain
        from collections import defaultdict

        logger.debug(f"Passing whole rules directly to semgrep_core")

        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()
        profiling_data: ProfilingData = ProfilingData()
        with tempfile.TemporaryDirectory() as semgrep_core_ast_cache_dir:
            for rule, language in tuple(
                chain(
                    *(
                        [(rule, language) for language in rule.languages]
                        for rule in rules
                    )
                )
            ):
                debug_tqdm_write(f"Running rule {rule._raw.get('id')}...")
                with tempfile.NamedTemporaryFile(
                    "w", suffix=".yaml"
                ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
                    targets = self.get_files_for_language(
                        language, rule, target_manager
                    )
                    # opti: no need to call semgrep-core if no target files
                    if not targets:
                        continue
                    all_targets = all_targets.union(targets)

                    target_file.write("\n".join(str(p) for p in targets))
                    target_file.flush()
                    yaml = YAML()
                    yaml.dump({"rules": [rule._raw]}, rule_file)
                    rule_file.flush()

                    cmd = [SEMGREP_PATH] + [
                        "-lang",
                        language,
                        "-fast",
                        "-json",
                        "-config",
                        rule_file.name,
                        "-j",
                        str(self._jobs),
                        "-target_file",
                        target_file.name,
                        "-use_parsing_cache",
                        semgrep_core_ast_cache_dir,
                        "-timeout",
                        str(self._timeout),
                        "-max_memory",
                        str(self._max_memory),
                    ]

                    if self._report_time:
                        cmd += ["-json_time"]

                    if self._output_settings.debug:
                        cmd += ["-debug"]

                    core_run = sub_run(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    output_json = self._extract_core_output(rule, [], core_run)

                    if "time" in output_json:
                        self._add_match_times(rule, profiling_data, output_json["time"])

                # end with tempfile.NamedTemporaryFile(...) ...
                findings = [
                    RuleMatch.from_pattern_match(
                        rule.id,
                        PatternMatch(pattern_match),
                        message=rule.message,
                        metadata=rule.metadata,
                        severity=rule.severity,
                        fix=rule.fix,
                        fix_regex=rule.fix_regex,
                    )
                    for pattern_match in output_json["matches"]
                ]
                # TODO: we should do that in Semgrep_generic.ml instead
                findings = dedup_output(findings)
                outputs[rule].extend(findings)
                errors.extend(
                    CoreException.from_json(e, language, rule.id).into_semgrep_error()
                    for e in output_json["errors"]
                )
        # end for rule, language ...

        return outputs, {}, errors, all_targets, profiling_data
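
The `chain(*(...))` expression at the top of Examples #1 and #5 flattens the rules into (rule, language) pairs. An equivalent, arguably clearer spelling as a generator:

from typing import Iterable, List, Tuple

def rule_language_pairs(rules: List["Rule"]) -> Iterable[Tuple["Rule", str]]:
    # One (rule, language) pair per language a rule declares; equivalent to
    # tuple(chain(*([(r, lang) for lang in r.languages] for r in rules))).
    for rule in rules:
        for language in rule.languages:
            yield rule, language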
Example #6
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
        """
            Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []

        for language, all_patterns_for_language in self._group_patterns_by_language(
                rule).items():

            targets = self.get_files_for_language(language, rule,
                                                  target_manager)
            targets = [
                target for target in targets if target not in max_timeout_files
            ]
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(0, rule.expression, rule.severity, language,
                                  rule._yaml.span)

                output_json = self._run_core_command(
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    self.handle_regex_patterns(outputs, patterns_regex,
                                               targets)

                # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
                # this is strictly a semgrep Python feature. Metavariable regex
                # filtering is performed purely in Python code then compared
                # against semgrep-core's results for other patterns.
                patterns = [
                    pattern for pattern in patterns if
                    pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
                ]

                patterns_json = [p.to_json() for p in patterns]

                output_json = self._run_core_command(
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                )

            errors.extend(
                CoreException.from_json(e, language,
                                        rule.id).into_semgrep_error()
                for e in output_json["errors"])
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                logger.debug(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
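
`handle_regex_patterns` is not shown here; judging from the inline regex path in Example #8, it plausibly compiles each regex pattern, scans the targets in a process pool, and appends the hits to `outputs`. A rough sketch under those assumptions, written free-standing (the original is a method using `self._jobs`) and reusing `SemgrepError` and `get_re_matches` as they appear in Example #8:

import functools
import multiprocessing
import re
from pathlib import Path
from typing import List

def handle_regex_patterns(
    outputs: List, patterns_regex: List, targets: List[Path], jobs: int = 4
) -> None:
    # Sketch modeled on the inline version in Example #8: compile each
    # regex pattern, scan all targets in a process pool, collect hits.
    patterns_json = [pattern.to_json() for pattern in patterns_regex]
    try:
        patterns_re = [(p["id"], re.compile(p["pattern"])) for p in patterns_json]
    except re.error as err:
        raise SemgrepError(f"invalid regular expression specified: {err}")

    re_fn = functools.partial(get_re_matches, patterns_re)
    with multiprocessing.Pool(jobs) as pool:
        matches = pool.map(re_fn, targets)

    outputs.extend(m for file_matches in matches for m in file_matches)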
Example #7
def run_spacegrep(
    rule_id: str,
    patterns: List[Pattern],
    targets: List[Path],
    timeout: int,
) -> dict:
    matches: List[dict] = []
    errors: List[dict] = []
    targets_time: Dict[str, Tuple[float, float, float]] = {}
    for pattern in patterns:
        if not isinstance(pattern._pattern, str):
            raise NotImplementedError(
                f"Support for {type(pattern._pattern)} has not been implemented yet."
            )
        pattern_str = pattern._pattern  # TODO: Handle pattern Dict
        for target in targets:
            cmd = [
                SPACEGREP_PATH,
                "--output-format",
                "semgrep",
                "-d",
                str(target),
                pattern_str,
                "--timeout",
                str(timeout),
                "--time",
            ]

            try:
                p = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                # exit code 3 indicates a timeout. See 'spacegrep --help'.
                if p.returncode == 3:
                    err = CoreException(
                        check_id="Timeout",
                        path=target,
                        start=Position(0, 0),
                        end=Position(0, 0),
                        extra={
                            "message": "spacegrep timeout",
                            "line": "",
                        },
                        language="generic",
                        rule_id=rule_id,
                    ).to_dict()
                    errors.append(err)
                else:
                    p.check_returncode()
                    raw_output = p.stdout

                    output_json = _parse_spacegrep_output(raw_output)
                    output_json["matches"] = _patch_id(
                        pattern, output_json.get("matches", [])
                    )

                    matches.extend(output_json["matches"])
                    errors.extend(output_json["errors"])
                    # aggregate the match times obtained for the different patterns of the rule
                    path_s = str(target)

                    targets_time[path_s] = tuple(  # type: ignore
                        [
                            i + j
                            for i, j in zip(
                                targets_time.get(path_s, (0.0, 0.0, 0.0)),
                                _extract_times(output_json),
                            )
                        ]
                    )

            except subprocess.CalledProcessError as e:
                raw_error = p.stderr
                spacegrep_error_text = raw_error.decode("utf-8", errors="replace")
                raise SemgrepError(
                    f"Error running spacegrep on file {target}: Process error: {e}\n\nspacegrep error: {spacegrep_error_text}"
                )
            except json.JSONDecodeError as e:
                raise SemgrepError(
                    f"Could not parse spacegrep output as JSON: JSON error: {e}"
                )
            except KeyError as e:
                raise SemgrepError(
                    f"Invalid JSON output was received from spacegrep: {e}"
                )

    target_list = []
    for path in targets:
        times = targets_time.get(str(path), (0.0, 0.0, 0.0))
        target_list.append(
            {
                "path": str(path),
                "parse_time": times[0],
                "match_time": times[1],
                "run_time": times[2],
            }
        )
    time = {"targets": target_list}
    return {
        "matches": matches,
        "errors": errors,
        "time": time,
    }
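
`_extract_times` is zipped against a `(parse_time, match_time, run_time)` triple above, so it presumably reduces spacegrep's `--time` output to three floats. A hypothetical sketch; the JSON keys are assumptions:

from typing import Tuple

def _extract_times(output_json: dict) -> Tuple[float, float, float]:
    # Hypothetical sketch: pull the per-run timings spacegrep reports
    # under --time; the exact key names are guesses.
    time = output_json.get("time", {})
    return (
        float(time.get("parse_time", 0.0)),
        float(time.get("match_time", 0.0)),
        float(time.get("run_time", 0.0)),
    )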
Example #8
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
            Run all rules on targets and return list of all places that match patterns, ... todo errors
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        for language, all_patterns_for_language in self._group_patterns_by_language(
            [rule]).items():
            try:
                targets = target_manager.get_files(language, rule.includes,
                                                   rule.excludes)
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[
                        rule.languages_span.with_context(before=1, after=1)
                    ],
                ) from ex

            if not targets:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                patterns_json = [
                    pattern.to_json() for pattern in patterns_regex
                ]

                try:
                    patterns_re = [(pattern["id"],
                                    re.compile(pattern["pattern"]))
                                   for pattern in patterns_json]
                except re.error as err:
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}")

                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)

                outputs.extend(single_match for file_matches in matches
                               for single_match in file_matches)

            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile(
                    "w") as pattern_file, tempfile.NamedTemporaryFile(
                        "w") as target_file, tempfile.NamedTemporaryFile(
                            "w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()

                cmd = [SEMGREP_PATH] + [
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]

                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]

                core_run = sub_run(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

                debug_print(core_run.stderr.decode("utf-8", "replace"))

                if core_run.returncode != 0:
                    # see if semgrep output a JSON error that we can decode
                    semgrep_output = core_run.stdout.decode("utf-8", "replace")
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                    if "error" in output_json:
                        self._raise_semgrep_error_from_json(
                            output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )

                output_json = json.loads(
                    core_run.stdout.decode("utf-8", "replace"))
                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"])
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # group output; we want to see all of the same rule ids on the same file path
        by_rule_index: Dict[Rule, Dict[
            Path, List[PatternMatch]]] = collections.defaultdict(
                lambda: collections.defaultdict(list))

        for pattern_match in outputs:
            by_rule_index[rule][pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for rule, paths in by_rule_index.items():
            for filepath, pattern_matches in paths.items():
                debug_print(
                    f"----- rule ({rule.id}) ----- filepath: {filepath}")

                findings_for_rule, debugging_steps = evaluate(
                    rule, pattern_matches, self._allow_exec)
                findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        # debugging steps are only tracked for a single file, just overwrite
        return findings, debugging_steps, errors
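
`get_re_matches` is mapped over the targets with a list of `(id, compiled_regex)` pairs and must yield `PatternMatch` objects, since `outputs` holds those. A sketch under that contract; the match payload shape is an assumption:

import re
from pathlib import Path
from typing import List, Tuple

def get_re_matches(
    patterns_re: List[Tuple[str, "re.Pattern"]], path: Path
) -> List["PatternMatch"]:
    # Sketch of the regex scanner handed to multiprocessing.Pool above;
    # the PatternMatch payload shape is an assumption.
    try:
        contents = path.read_text(errors="replace")
    except OSError:
        return []
    return [
        PatternMatch(
            {
                "check_id": check_id,
                "path": str(path),
                "start": {"offset": m.start()},
                "end": {"offset": m.end()},
            }
        )
        for check_id, compiled in patterns_re
        for m in compiled.finditer(contents)
    ]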