Exemplo n.º 1
0
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError]]:
        """
        Run one rule on its targets and evaluate the resulting pattern matches.

        For every language returned by ``_group_patterns_by_language`` the
        rule's patterns are run through ``_run_core_command`` (taint rules
        with ``-tainting_rules_file``, everything else with ``-rules_file``).
        The collected matches are grouped by file path, handed to
        ``evaluate`` one file at a time, and finally de-duplicated with
        ``dedup_output``.

        Args:
            rule: the rule to run.
            target_manager: forwarded to ``get_files_for_language`` to select
                the files for each language.
            cache_dir: forwarded unchanged to ``_run_core_command``.
            max_timeout_files: files that are removed from the targets before
                running. NOTE(review): presumably files that already timed
                out too often -- confirm with the caller.

        Returns:
            A ``(findings, debugging_steps, errors)`` tuple. ``findings`` are
            the de-duplicated results of ``evaluate``; ``debugging_steps`` is
            whatever the last ``evaluate`` call returned (an empty list when
            there were no matches); ``errors`` are the errors reported in the
            ``"errors"`` entry of each core invocation's output.
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []

        # Build the exclusion set once so filtering the targets is a hash
        # lookup per file instead of a scan of the whole list.
        excluded_files = set(max_timeout_files)

        patterns_by_language = self._group_patterns_by_language(rule)
        for language, all_patterns_for_language in patterns_by_language.items():
            targets = [
                target
                for target in self.get_files_for_language(
                    language, rule, target_manager
                )
                if target not in excluded_files
            ]
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                # Work on a copy so the rule's own raw definition keeps its
                # "mode" key.
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = self._run_core_command(
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    # `outputs` is passed in; presumably the regex matches are
                    # appended to it -- verify in handle_regex_patterns.
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # semgrep-core doesn't know about OPERATORS.METAVARIABLE_REGEX -
                # this is strictly a semgrep Python feature. Metavariable regex
                # filtering is performed purely in Python code then compared
                # against semgrep-core's results for other patterns.
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator != OPERATORS.METAVARIABLE_REGEX
                ]

                patterns_json = [p.to_json() for p in patterns]

                output_json = self._run_core_command(
                    patterns_json,
                    patterns,
                    targets,
                    language,
                    rule,
                    "-rules_file",
                    cache_dir,
                )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # Group output by file path: all matches here belong to the single
        # rule being run, and evaluation happens one file at a time.
        matches_by_path: Dict[Path, List[PatternMatch]] = collections.defaultdict(
            list
        )
        for pattern_match in outputs:
            matches_by_path[pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for filepath, pattern_matches in matches_by_path.items():
            logger.debug(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            # debugging steps are only tracked for a single file, just overwrite
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        return findings, debugging_steps, errors
Exemplo n.º 2
0
    def _run_rule(
        self,
        rule: Rule,
        target_manager: TargetManager,
        cache_dir: str,
        max_timeout_files: List[Path],
        profiler: ProfileManager,
        match_time_matrix: Dict[Tuple[str, str], float],
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[SemgrepError], Set[Path]]:
        """
        Run one rule on its targets and evaluate the resulting pattern matches.

        For every language returned by ``_group_patterns_by_language`` the
        rule's patterns are executed through ``profiler.track``: taint rules
        and regular rules go to ``_run_core_command``, rules for
        ``GENERIC_LANGUAGE`` go to ``run_spacegrep``. Languages listed in
        ``REGEX_ONLY_LANGUAGE_KEYS`` are only handled by
        ``handle_regex_patterns`` and never reach either runner. The
        collected matches are grouped by file path, handed to ``evaluate``
        one file at a time, and finally de-duplicated with ``dedup_output``.

        Args:
            rule: the rule to run.
            target_manager: forwarded to ``get_files_for_language`` to select
                the files for each language.
            cache_dir: forwarded unchanged to ``_run_core_command``.
            max_timeout_files: files that are removed from the targets before
                running. NOTE(review): presumably files that already timed
                out too often -- confirm with the caller.
            profiler: every runner invocation goes through
                ``profiler.track`` with ``rule.id`` as its first argument.
            match_time_matrix: passed to ``_add_match_times`` together with
                the ``"time"`` entry of a runner's output, when present.

        Returns:
            A ``(findings, debugging_steps, errors, all_targets)`` tuple.
            ``findings`` are the de-duplicated results of ``evaluate``;
            ``debugging_steps`` is whatever the last ``evaluate`` call
            returned (an empty list when there were no matches); ``errors``
            are the errors reported in the ``"errors"`` entry of each runner
            output; ``all_targets`` is the union of the filtered target files
            over all languages.
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[SemgrepError] = []
        all_targets: Set[Path] = set()

        # Build the exclusion set once so filtering the targets is a hash
        # lookup per file instead of a scan of the whole list.
        excluded_files = set(max_timeout_files)

        patterns_by_language = self._group_patterns_by_language(rule)
        for language, all_patterns_for_language in patterns_by_language.items():
            targets = [
                target
                for target in self.get_files_for_language(
                    language, rule, target_manager
                )
                if target not in excluded_files
            ]
            # Grow the set in place rather than rebuilding it per language.
            all_targets.update(targets)
            if not targets:
                continue

            if rule.mode == TAINT_MODE:
                # Work on a copy so the rule's own raw definition keeps its
                # "mode" key.
                pattern_json = rule._raw.copy()
                del pattern_json["mode"]
                pattern = Pattern(
                    0, rule.expression, rule.severity, language, rule._yaml.span
                )

                output_json = profiler.track(
                    rule.id,
                    self._run_core_command,
                    [pattern_json],
                    [pattern],
                    targets,
                    language,
                    rule,
                    "-tainting_rules_file",
                    cache_dir,
                    report_time=self._report_time,
                )
            else:
                # semgrep-core doesn't know about OPERATORS.REGEX - this is
                # strictly a semgrep Python feature. Regex filtering is
                # performed purely in Python code then compared against
                # semgrep-core's results for other patterns.
                patterns_regex, patterns = partition(
                    lambda p: p.expression.operator == OPERATORS.REGEX
                    or p.expression.operator == OPERATORS.NOT_REGEX,
                    all_patterns_for_language,
                )
                if patterns_regex:
                    # `outputs` is passed in; presumably the regex matches are
                    # appended to it -- verify in handle_regex_patterns.
                    self.handle_regex_patterns(outputs, patterns_regex, targets)

                # regex-only rules only support OPERATORS.REGEX.
                # Skip passing this rule to semgrep-core.
                if language in REGEX_ONLY_LANGUAGE_KEYS:
                    continue

                # semgrep-core doesn't know about the following operators -
                # they are strictly semgrep Python features:
                #   - OPERATORS.METAVARIABLE_REGEX
                #   - OPERATORS.METAVARIABLE_COMPARISON
                patterns = [
                    pattern
                    for pattern in patterns
                    if pattern.expression.operator
                    not in [
                        OPERATORS.METAVARIABLE_REGEX,
                        OPERATORS.METAVARIABLE_COMPARISON,
                    ]
                ]

                patterns_json = [p.to_json() for p in patterns]

                if language == GENERIC_LANGUAGE:
                    output_json = profiler.track(
                        rule.id,
                        run_spacegrep,
                        rule.id,
                        patterns,
                        targets,
                        timeout=self._timeout,
                        report_time=self._report_time,
                    )
                else:  # Run semgrep-core
                    output_json = profiler.track(
                        rule.id,
                        self._run_core_command,
                        patterns_json,
                        patterns,
                        targets,
                        language,
                        rule,
                        "-rules_file",
                        cache_dir,
                        report_time=self._report_time,
                    )

            errors.extend(
                CoreException.from_json(e, language, rule.id).into_semgrep_error()
                for e in output_json["errors"]
            )
            outputs.extend(PatternMatch(m) for m in output_json["matches"])
            if "time" in output_json:
                self._add_match_times(rule, match_time_matrix, output_json["time"])

        # Group output by file path: all matches here belong to the single
        # rule being run, and evaluation happens one file at a time.
        matches_by_path: Dict[Path, List[PatternMatch]] = collections.defaultdict(
            list
        )
        for pattern_match in outputs:
            matches_by_path[pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for filepath, pattern_matches in matches_by_path.items():
            logger.debug(
                f"--> rule ({rule.id}) has findings on filepath: {filepath}"
            )

            # debugging steps are only tracked for a single file, just overwrite
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

        findings = dedup_output(findings)
        logger.debug(f"...ran on {len(all_targets)} files")

        return findings, debugging_steps, errors, all_targets
Exemplo n.º 3
0
    def _run_rule(
        self, rule: Rule, target_manager: TargetManager, cache_dir: str
    ) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
        """
        Run one rule on its targets and evaluate the resulting pattern matches.

        For every language returned by ``_group_patterns_by_language`` the
        regex patterns are matched in Python with a ``multiprocessing.Pool``,
        and the remaining patterns are written to a temporary rules file and
        run through the ``SEMGREP_PATH`` executable. The collected matches
        are grouped by file path, handed to ``evaluate`` one file at a time,
        and finally de-duplicated with ``dedup_output``.

        Args:
            rule: the rule to run.
            target_manager: queried with ``get_files`` for each language,
                using the rule's ``includes`` and ``excludes``.
            cache_dir: passed to the core executable as the value of
                ``-use_parsing_cache``.

        Returns:
            A ``(findings, debugging_steps, errors)`` tuple. ``findings`` are
            the de-duplicated results of ``evaluate``; ``debugging_steps`` is
            whatever the last ``evaluate`` call returned (an empty list when
            there were no matches); ``errors`` are built from the
            ``"errors"`` entry of each core invocation's output.

        Raises:
            UnknownLanguageError: when ``get_files`` raises
                ``_UnknownLanguageError`` for a language of the rule.
            SemgrepError: when a regex pattern does not compile, or when the
                core executable exits non-zero and its stdout is either not
                JSON or JSON without an ``"error"`` key.
        """
        outputs: List[PatternMatch] = []  # multiple invocations per language
        errors: List[CoreException] = []
        equivalences = rule.equivalences

        patterns_by_language = self._group_patterns_by_language([rule])
        for language, all_patterns_for_language in patterns_by_language.items():
            try:
                targets = target_manager.get_files(
                    language, rule.includes, rule.excludes
                )
            except _UnknownLanguageError as ex:
                raise UnknownLanguageError(
                    short_msg="invalid language",
                    long_msg=f"unsupported language {language}",
                    spans=[rule.languages_span.with_context(before=1, after=1)],
                ) from ex

            # Truthiness check: skips any empty collection, not just `[]`.
            if not targets:
                continue

            # semgrep-core doesn't know about OPERATORS.REGEX - this is
            # strictly a semgrep Python feature. Regex filtering is
            # performed purely in Python code then compared against
            # semgrep-core's results for other patterns.
            patterns_regex, patterns = partition(
                lambda p: p.expression.operator == OPERATORS.REGEX,
                all_patterns_for_language,
            )
            if patterns_regex:
                regex_json = [pattern.to_json() for pattern in patterns_regex]

                try:
                    patterns_re = [
                        (entry["id"], re.compile(entry["pattern"]))
                        for entry in regex_json
                    ]
                except re.error as err:
                    # Chain the cause so the original regex error stays
                    # visible in the traceback.
                    raise SemgrepError(
                        f"invalid regular expression specified: {err}"
                    ) from err

                re_fn = functools.partial(get_re_matches, patterns_re)
                with multiprocessing.Pool(self._jobs) as pool:
                    matches = pool.map(re_fn, targets)

                outputs.extend(
                    single_match
                    for file_matches in matches
                    for single_match in file_matches
                )

            patterns_json = [p.to_json() for p in patterns]
            with tempfile.NamedTemporaryFile("w") as pattern_file, \
                    tempfile.NamedTemporaryFile("w") as target_file, \
                    tempfile.NamedTemporaryFile("w") as equiv_file:
                yaml = YAML()
                yaml.dump({"rules": patterns_json}, pattern_file)
                # Flush so the subprocess sees the content when it opens the
                # file by name.
                pattern_file.flush()
                target_file.write("\n".join(str(t) for t in targets))
                target_file.flush()

                cmd = [
                    SEMGREP_PATH,
                    "-lang",
                    language,
                    "-rules_file",
                    pattern_file.name,
                    "-j",
                    str(self._jobs),
                    "-target_file",
                    target_file.name,
                    "-use_parsing_cache",
                    cache_dir,
                ]

                if equivalences:
                    self._write_equivalences_file(equiv_file, equivalences)
                    cmd += ["-equivalences", equiv_file.name]

                core_run = sub_run(
                    cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )

                debug_print(core_run.stderr.decode("utf-8", "replace"))

                # Decode stdout once; both the failure and the success path
                # parse the same text.
                semgrep_output = core_run.stdout.decode("utf-8", "replace")

                if core_run.returncode != 0:
                    # see if semgrep output a JSON error that we can decode
                    try:
                        output_json = json.loads(semgrep_output)
                    except ValueError as err:
                        raise SemgrepError(
                            f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        ) from err

                    if "error" in output_json:
                        # Presumably always raises (per its name); if it ever
                        # returns, the parsed output is processed below just
                        # as before.
                        self._raise_semgrep_error_from_json(output_json, patterns)
                    else:
                        raise SemgrepError(
                            f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                        )
                else:
                    output_json = json.loads(semgrep_output)

                errors.extend(
                    CoreException.from_json(e, language)
                    for e in output_json["errors"]
                )
                outputs.extend(PatternMatch(m) for m in output_json["matches"])

        # Group output by file path: all matches here belong to the single
        # rule being run, and evaluation happens one file at a time.
        matches_by_path: Dict[Path, List[PatternMatch]] = collections.defaultdict(
            list
        )
        for pattern_match in outputs:
            matches_by_path[pattern_match.path].append(pattern_match)

        findings = []
        debugging_steps: List[Any] = []
        for filepath, pattern_matches in matches_by_path.items():
            debug_print(f"----- rule ({rule.id}) ----- filepath: {filepath}")

            # debugging steps are only tracked for a single file, just overwrite
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec
            )
            findings.extend(findings_for_rule)

        findings = dedup_output(findings)

        return findings, debugging_steps, errors