Example #1
def test_raise_semgrep_error_from_json_unknown_error():
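    # Build a one-rule config in memory, then check that an unrecognized
    # error payload is surfaced as a SemgrepError that names the rule.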
    test_rule_id = "test_rule_id"
    rule_yaml_text = io.StringIO(f"""
    rules:
    - id: {test_rule_id}
      pattern: $X == $X
      severity: INFO
      languages: [python]
      message: blah
    """)
    rule_dict = yaml.load(rule_yaml_text).get("rules")[0]
    rule: Rule = Rule.from_json(rule_dict)

    core_runner = CoreRunner(
        allow_exec=False,
        output_settings=OutputSettings(OutputFormat.TEXT),
        jobs=1,
        timeout=0,
        max_memory=0,
        timeout_threshold=0,
        report_time=False,
    )

    patterns: List[Pattern] = list(core_runner._flatten_rule_patterns([rule]))

    output_json: Dict[str, Any] = {
        "error": "unknown exception",
        "message": "End_of_file",
    }
    with pytest.raises(SemgrepError) as excinfo:
        core_runner._raise_semgrep_error_from_json(output_json, patterns, rule)
    # The assertion must sit outside the with-block: once the call raises,
    # the block body stops, so an assert placed inside it would never run.
    assert test_rule_id in str(excinfo.value)
Example #2
def test_raise_semgrep_error_from_json_unknown_error():
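    # Same check as Example #1, but against a CoreRunner variant whose
    # constructor takes an optimizations flag instead of OutputSettings.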
    test_rule_id = "test_rule_id"
    rule_yaml_text = io.StringIO(f"""
    rules:
    - id: {test_rule_id}
      pattern: $X == $X
      severity: INFO
      languages: [python]
      message: blah
    """)
    rule_dict = yaml.load(rule_yaml_text).get("rules")[0]
    rule: Rule = Rule.from_json(rule_dict)

    core_runner = CoreRunner(
        jobs=1,
        timeout=0,
        max_memory=0,
        timeout_threshold=0,
        optimizations="all",
    )

    output_json: Dict[str, Any] = {
        "error": "unknown exception",
        "message": "End_of_file",
    }
    with pytest.raises(SemgrepError) as excinfo:
        core_runner._raise_semgrep_error_from_json(output_json, rule)
    # As in Example #1, the assertion runs only after the with-block exits.
    assert test_rule_id in str(excinfo.value)
Example #3
def validate_single_rule(
    config_id: str,
    rule_yaml: YamlTree[YamlMap],
) -> Optional[Rule]:
    """
        Validate that a rule dictionary contains all necessary keys
        and can be correctly parsed.

        Returns Rule object if valid otherwise raises InvalidRuleSchemaError
    """
    rule: YamlMap = rule_yaml.value

    rule_keys = {k.value for k in rule.keys()}
    if not rule_keys.issuperset(YAML_MUST_HAVE_KEYS):
        missing_keys = YAML_MUST_HAVE_KEYS - rule_keys

        extra_keys: Set[str] = rule_keys - YAML_ALL_VALID_RULE_KEYS
        extra_key_spans = sorted([rule.key_tree(k) for k in extra_keys])
        help_msg = None
        if extra_keys:
            help_msg = f"Unexpected keys {extra_keys} found. Is one of these a typo of {missing_keys}?"
        raise InvalidRuleSchemaError(
            short_msg="missing keys",
            long_msg=f"{config_id} is missing required keys {missing_keys}",
            spans=[rule_yaml.span.truncate(lines=5)] +
            [e.span for e in extra_key_spans],
            help=help_msg,
        )

    # Raises InvalidRuleSchemaError if fails to parse
    return Rule.from_yamltree(rule_yaml)
Example #4
def validate_single_rule(config_id: str,
                         rule_yaml: YamlTree[YamlMap]) -> Optional[Rule]:
    """
    Validate that a rule dictionary contains all necessary keys
    and can be correctly parsed.
    """
    rule: YamlMap = rule_yaml.value

    # Defaults to search mode if mode is not specified
    return Rule.from_yamltree(rule_yaml)
Example #5
def test_build_exprs() -> None:
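    # Verify that `pattern`, `patterns`, and `pattern-either` rules each
    # compile to the expected BooleanRuleExpression tree.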
    base_rule: Dict[str, Any] = {
        "id": "test-id",
        "message": "test message",
        "languages": ["python"],
        "severity": "ERROR",
    }
    rules: List[Dict[str, Any]] = [
        {**base_rule, "pattern": "test(...)"},
        {**base_rule, "patterns": [{"pattern": "test(...)"}]},
        {**base_rule, "pattern-either": [{"pattern": "test(...)"}]},
    ]

    results = [Rule.from_json(rule).expression for rule in rules]
    base_expected = [
        BooleanRuleExpression(OPERATORS.AND, PatternId(".0"), None,
                              "test(...)")
    ]
    expected = [
        BooleanRuleExpression(OPERATORS.AND, PatternId("test-id"), None,
                              "test(...)"),
        BooleanRuleExpression(OPERATORS.AND_ALL, None, base_expected, None),
        BooleanRuleExpression(OPERATORS.AND_EITHER, None, base_expected, None),
    ]

    assert results == expected
Example #6
def validate_single_rule(config_id: str,
                         rule_yaml: YamlTree[YamlMap]) -> Optional[Rule]:
    """
        Validate that a rule dictionary contains all necessary keys
        and can be correctly parsed.

        Returns Rule object if valid otherwise raises InvalidRuleSchemaError
    """
    rule: YamlMap = rule_yaml.value

    rule_keys = {k.value for k in rule.keys()}
    extra_keys = rule_keys - YAML_ALL_VALID_RULE_KEYS
    extra_key_spans = sorted([rule.key_tree(k) for k in extra_keys])
    missing_keys = YAML_MUST_HAVE_KEYS - rule_keys

    if missing_keys and extra_keys:
        help_msg = f"Unexpected keys {extra_keys} found. Is one of these a typo of {missing_keys}?"
        raise InvalidRuleSchemaError(
            short_msg="incorrect keys",
            long_msg=f"{config_id} is missing required keys {missing_keys}",
            spans=[rule_yaml.span.truncate(lines=5)] +
            [e.span for e in extra_key_spans],
            help=help_msg,
        )
    elif missing_keys:
        help_msg = f"Add {missing_keys} to your config file."
        raise InvalidRuleSchemaError(
            short_msg="missing keys",
            long_msg=f"{config_id} is missing required keys {missing_keys}",
            spans=[rule_yaml.span.truncate(lines=5)] +
            [e.span for e in extra_key_spans],
            help=help_msg,
        )
    elif extra_keys:
        help_msg = f"Unexpected keys {extra_keys} found. Were you looking for any of these unused, valid keys?\n {sorted(YAML_ALL_VALID_RULE_KEYS - rule_keys)}"
        raise InvalidRuleSchemaError(
            short_msg="invalid keys",
            long_msg=f"{config_id} has extra, un-interpretable keys: {extra_keys}",
            spans=[e.span for e in extra_key_spans],
            help=help_msg,
        )
    # Defaults to search mode if mode is not specified
    return Rule.from_yamltree(rule_yaml)
Example #7
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    configs: List[str],
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
    timeout: int = DEFAULT_TIMEOUT,
    max_memory: int = 0,
    max_target_bytes: int = 0,
    timeout_threshold: int = 0,
    skip_unknown_extensions: bool = False,
    severity: Optional[List[str]] = None,
    optimizations: str = "none",
) -> None:
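    # Resolve configs, collect targets, run the rules through CoreRunner,
    # post-process nosem suppressions, and hand the results to the output handler.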
    if include is None:
        include = []

    if exclude is None:
        exclude = []

    configs_obj, errors = get_config(pattern, lang, configs)
    all_rules = configs_obj.get_rules(no_rewrite_rule_ids)

    if severity is None or severity == []:
        filtered_rules = all_rules
    else:
        filtered_rules = [
            rule for rule in all_rules if rule.severity in severity
        ]

    output_handler.handle_semgrep_errors(errors)

    is_sarif = output_handler.settings.output_format == OutputFormat.SARIF

    if errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )

    if not pattern:
        plural = "s" if len(configs_obj.valid) > 1 else ""
        config_id_if_single = (list(configs_obj.valid.keys())[0]
                               if len(configs_obj.valid) == 1 else "")
        invalid_msg = (f"({len(errors)} config files were invalid)"
                       if len(errors) else "")
        logger.verbose(
            f"running {len(filtered_rules)} rules from {len(configs_obj.valid)} config{plural} {config_id_if_single} {invalid_msg}"
        )

        if len(configs_obj.valid) == 0:
            if len(errors) > 0:
                raise SemgrepError(
                    f"no valid configuration file found ({len(errors)} configs were invalid)",
                    code=MISSING_CONFIG_EXIT_CODE,
                )
            else:
                raise SemgrepError(
                    """You need to specify a config with --config=<semgrep.dev config name|localfile|localdirectory|url>.
If you're looking for a config to start with, there are thousands at: https://semgrep.dev
The two most popular are:
    --config=p/ci # find logic bugs, and high-confidence security vulnerabilities; recommended for CI
    --config=p/security-audit # find security audit points; noisy, not recommended for CI
""",
                    code=MISSING_CONFIG_EXIT_CODE,
                )

        notify_user_of_work(filtered_rules, include, exclude)

    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        max_target_bytes=max_target_bytes,
        targets=target,
        respect_git_ignore=respect_git_ignore,
        output_handler=output_handler,
        skip_unknown_extensions=skip_unknown_extensions,
    )

    profiler = ProfileManager()

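    # Split off join-mode rules; they are run separately after the main scan.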
    join_rules, rest_of_the_rules = partition(
        lambda rule: rule.mode == JOIN_MODE,
        filtered_rules,
    )
    filtered_rules = rest_of_the_rules

    start_time = time.time()
    # actually invoke semgrep
    (
        rule_matches_by_rule,
        debug_steps_by_rule,
        semgrep_errors,
        all_targets,
        profiling_data,
    ) = CoreRunner(
        jobs=jobs,
        timeout=timeout,
        max_memory=max_memory,
        timeout_threshold=timeout_threshold,
        optimizations=optimizations,
    ).invoke_semgrep(target_manager, profiler, filtered_rules)

    if join_rules:
        import semgrep.join_rule as join_rule

        for rule in join_rules:
            join_rule_matches, join_rule_errors = join_rule.run_join_rule(
                rule.raw, [Path(t) for t in target_manager.targets])
            join_rule_matches_by_rule = {
                Rule.from_json(rule.raw): join_rule_matches
            }
            rule_matches_by_rule.update(join_rule_matches_by_rule)
            output_handler.handle_semgrep_errors(join_rule_errors)

    profiler.save("total_time", start_time)

    output_handler.handle_semgrep_errors(semgrep_errors)

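    # Flag matches suppressed by nosem comments and collect any errors raised
    # while evaluating those comments.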
    nosem_errors = []
    for rule, rule_matches in rule_matches_by_rule.items():
        evolved_rule_matches = []
        for rule_match in rule_matches:
            ignored, returned_errors = rule_match_nosem(rule_match, strict)
            evolved_rule_matches.append(
                attr.evolve(rule_match, is_ignored=ignored))
            nosem_errors.extend(returned_errors)
        rule_matches_by_rule[rule] = evolved_rule_matches

    output_handler.handle_semgrep_errors(nosem_errors)

    num_findings_nosem = 0
    if not disable_nosem:
        filtered_rule_matches_by_rule = {}
        for rule, rule_matches in rule_matches_by_rule.items():
            filtered_rule_matches = []
            for rule_match in rule_matches:
                if rule_match._is_ignored:
                    num_findings_nosem += 1
                else:
                    filtered_rule_matches.append(rule_match)
            filtered_rule_matches_by_rule[rule] = filtered_rule_matches
        # SARIF output includes ignored findings, but labels them as suppressed.
        # https://docs.oasis-open.org/sarif/sarif/v2.1.0/csprd01/sarif-v2.1.0-csprd01.html#_Toc10541099
        if not is_sarif:
            rule_matches_by_rule = filtered_rule_matches_by_rule

    num_findings = sum(len(v) for v in rule_matches_by_rule.values())
    stats_line = f"ran {len(filtered_rules)} rules on {len(all_targets)} files: {num_findings} findings"

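    # When metrics are enabled, record project/config/rule hashes plus
    # aggregate counts and timings for this run.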
    if metric_manager.is_enabled:
        project_url = None
        try:
            project_url = sub_check_output(
                ["git", "ls-remote", "--get-url"],
                encoding="utf-8",
                stderr=subprocess.DEVNULL,
            )
        except Exception as e:
            logger.debug(
                f"Failed to get project url from 'git ls-remote': {e}")
            try:
                # add \n to match urls from git ls-remote (backwards compatibility)
                project_url = manually_search_file(".git/config", ".com", "\n")
            except Exception as e:
                logger.debug(
                    f"Failed to get project url from .git/config: {e}")

        metric_manager.set_project_hash(project_url)
        metric_manager.set_configs_hash(configs)
        metric_manager.set_rules_hash(filtered_rules)
        metric_manager.set_num_rules(len(filtered_rules))
        metric_manager.set_num_targets(len(all_targets))
        metric_manager.set_num_findings(num_findings)
        metric_manager.set_num_ignored(num_findings_nosem)
        metric_manager.set_run_time(profiler.calls["total_time"][0])
        total_bytes_scanned = sum(t.stat().st_size for t in all_targets)
        metric_manager.set_total_bytes_scanned(total_bytes_scanned)
        metric_manager.set_errors(
            list(type(e).__name__ for e in semgrep_errors))
        metric_manager.set_run_timings(profiling_data, list(all_targets),
                                       filtered_rules)

    output_handler.handle_semgrep_core_output(
        rule_matches_by_rule,
        debug_steps_by_rule,
        stats_line,
        all_targets,
        profiler,
        filtered_rules,
        profiling_data,
    )

    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)