def test_adv_record_products():
    record = AdvisoryRecord(vulnerability_id="CVE-XXXX-YYYY",
                            description=ADVISORY_TEXT)
    record.analyze()

    assert "Chrysler" in record.affected_products


def test_adv_record_versions():
    record = AdvisoryRecord(vulnerability_id="CVE-2014-0050",
                            description=ADVISORY_TEXT)
    record.analyze()

    assert "15.26.1" in record.versions
    assert "15.26" not in record.versions


def test_adv_record_code_tokens():
    record = AdvisoryRecord(vulnerability_id="CVE-XXXX-YYYY",
                            description=ADVISORY_TEXT_2)
    record.analyze()

    assert record.code_tokens == (
        "IO",
        "2.7,",
        "FileNameUtils.normalize",
        '"//../foo",',
        '"\\..\\foo",',
        '"limited"',
    )


def prospector(  # noqa: C901
    vulnerability_id: str,
    repository_url: str,
    publication_date: str = "",
    vuln_descr: str = "",
    tag_interval: str = "",
    version_interval: str = "",
    modified_files: "list[str]" = None,
    code_tokens: "list[str]" = None,
    time_limit_before: int = TIME_LIMIT_BEFORE,
    time_limit_after: int = TIME_LIMIT_AFTER,
    use_nvd: bool = False,
    nvd_rest_endpoint: str = "",
    backend_address: str = "",
    git_cache: str = GIT_CACHE,
    limit_candidates: int = MAX_CANDIDATES,
    active_rules: "list[str]" = None,
    model_name: str = "",
) -> "list[Commit]":

    _logger.info("begin main commit and CVE processing")

    # -------------------------------------------------------------------------
    # advisory record extraction
    # -------------------------------------------------------------------------
    advisory_record = AdvisoryRecord(
        vulnerability_id=vulnerability_id,
        repository_url=repository_url,
        description=vuln_descr,
        from_nvd=use_nvd,
        nvd_rest_endpoint=nvd_rest_endpoint,
    )

    _logger.pretty_log(advisory_record)

    advisory_record.analyze(use_nvd=use_nvd)
    _logger.info(f"{advisory_record.code_tokens=}")

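    # If a publication date was supplied by the caller, parse it (ISO-8601 with
    # a timezone offset, e.g. "2021-04-01T10:00+0000") and store it as a Unix
    # timestamp on the advisory record.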
    if publication_date != "":
        advisory_record.published_timestamp = int(
            datetime.strptime(publication_date, r"%Y-%m-%dT%H:%M%z").timestamp()
        )

    if len(code_tokens) > 0:
        advisory_record.code_tokens += tuple(code_tokens)
        # drop duplicates
        advisory_record.code_tokens = list(set(advisory_record.code_tokens))

    # FIXME this should be handled better (or '' should not end up in the modified_files in
    # the first place)
    if modified_files != [""]:
        advisory_record.paths += modified_files

    _logger.info(f"{advisory_record.code_tokens=}")

    # -------------------------------------------------------------------------
    # retrieval of commit candidates
    # -------------------------------------------------------------------------
    with ExecutionTimer(
        core_statistics.sub_collection(name="retrieval of commit candidates")
    ):
        _logger.info(
            "Downloading repository {} into {}...".format(repository_url, git_cache)
        )
        repository = Git(repository_url, git_cache)
        repository.clone()
        tags = repository.get_tags()

        _logger.debug(f"Found tags: {tags}")

        _logger.info("Done retrieving %s" % repository_url)

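        # Determine the commit interval to examine: either from an explicit tag
        # interval ("prev_tag:following_tag") or by mapping a version interval
        # ("vulnerable_version:fixed_version") onto the repository tags.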
        prev_tag = None
        following_tag = None
        if tag_interval != "":
            prev_tag, following_tag = tag_interval.split(":")
        elif version_interval != "":
            vuln_version, fixed_version = version_interval.split(":")
            prev_tag = get_tag_for_version(tags, vuln_version)[0]
            following_tag = get_tag_for_version(tags, fixed_version)[0]

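        # Restrict the search to a time window around the advisory publication
        # date, if one is known.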
        since = None
        until = None
        if advisory_record.published_timestamp:
            since = advisory_record.published_timestamp - time_limit_before
            until = advisory_record.published_timestamp + time_limit_after

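        # Retrieve the candidate commits; note that retrieval is currently
        # restricted to Java sources (filter_files="*.java").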
        candidates = repository.get_commits(
            since=since,
            until=until,
            ancestors_of=following_tag,
            exclude_ancestors_of=prev_tag,
            filter_files="*.java",
        )

        _logger.info("Found %d candidates" % len(candidates))

    # -------------------------------------------------------------------------
    # commit filtering
    #
    # Here we apply additional criteria to discard commits from the initial
    # set extracted from the repository.
    # -------------------------------------------------------------------------

    # If some code_tokens were found in the advisory text, we could require that
    # candidate commits touch some file whose path contains those tokens.
    # NOTE: this works quite well for Java, not sure how general this criterion is.
    # This filter is currently disabled:
    #
    # if advisory_record.code_tokens != []:
    #     _logger.info(
    #         "Detected tokens in advisory text, searching for files whose path contains those tokens"
    #     )
    #     _logger.info(advisory_record.code_tokens)
    #
    # if modified_files == [""]:
    #     modified_files = advisory_record.code_tokens
    # else:
    #     modified_files.extend(advisory_record.code_tokens)
    #
    # candidates = filter_by_changed_files(candidates, modified_files, repository)

    with ExecutionTimer(core_statistics.sub_collection(name="commit filtering")):
        candidates = filter_commits(candidates)

        _logger.debug(f"Collected {len(candidates)} candidates")

        if len(candidates) > limit_candidates:
            _logger.error(
                "Number of candidates exceeds %d, aborting." % limit_candidates
            )
            _logger.error(
                "Possible cause: the backend might be unreachable or otherwise unable to provide details about the advisory."
            )
            sys.exit(-1)

    # -------------------------------------------------------------------------
    # commit preprocessing
    # -------------------------------------------------------------------------

    with ExecutionTimer(
        core_statistics.sub_collection(name="commit preprocessing")
    ) as timer:
        raw_commit_data = []
        missing = []
        try:
            # Exploit the preprocessed commits already stored in the backend
            # and only process those that are missing. Note: the endpoint
            # does not exist (yet).
            r = requests.get(
                f"{backend_address}/commits/{repository_url}"
                f"?commit_id={','.join(candidates)}"
            )
            _logger.info("The backend returned status '%d'" % r.status_code)
            if r.status_code != 200:
                _logger.error("This is weird...Continuing anyway.")
                missing = candidates
            else:
                raw_commit_data = r.json()
                _logger.info(
                    "Found {} preprocessed commits".format(len(raw_commit_data))
                )
        except requests.exceptions.ConnectionError:
            _logger.error(
                "Could not reach backend, is it running? The result of commit pre-processing will not be saved.",
                exc_info=log.config.level < logging.WARNING,
            )
            missing = candidates

        preprocessed_commits: "list[Commit]" = []
        for idx, commit in enumerate(raw_commit_data):
            # Commits that the backend does not know come back as None: collect
            # them in the missing list, they need to be preprocessed locally.
            if commit:
                preprocessed_commits.append(Commit.parse_obj(commit))
            else:
                missing.append(candidates[idx])

        _logger.info("Preprocessing commits...")
        # Commits from this index onward were not found in the backend and are
        # preprocessed locally (and sent back to the backend afterwards).
        first_missing = len(preprocessed_commits)
        pbar = tqdm(missing)
        with Counter(
            timer.collection.sub_collection(name="commit preprocessing")
        ) as counter:
            counter.initialize("preprocessed commits", unit="commit")
            for commit_id in pbar:
                counter.increment("preprocessed commits")
                preprocessed_commits.append(
                    preprocess_commit(repository.get_commit(commit_id))
                )

        _logger.pretty_log(advisory_record)
        _logger.debug(f"preprocessed {len(preprocessed_commits)} commits")

        # Send only the locally preprocessed commits; the backend already has the rest.
        payload = [c.__dict__ for c in preprocessed_commits[first_missing:]]

    # -------------------------------------------------------------------------
    # save preprocessed commits to backend
    # -------------------------------------------------------------------------
    with ExecutionTimer(
        core_statistics.sub_collection(name="save preprocessed commits to backend")
    ):
        _logger.info("Sending preprocessing commits to backend...")
        try:
            r = requests.post(backend_address + "/commits/", json=payload)
            _logger.info(
                "Saving to backend completed (status code: %d)" % r.status_code
            )
        except requests.exceptions.ConnectionError:
            _logger.error(
                "Could not reach backend, is it running?"
                "The result of commit pre-processing will not be saved."
                "Continuing anyway.....",
                exc_info=log.config.level < logging.WARNING,
            )

    # TODO compute actual rank
    # This can be done by a POST request that creates a "search" job
    # whose inputs are the AdvisoryRecord, and the repository URL
    # The API returns immediately indicating a job id. From this
    # id, a URL can be constructed to poll the results asynchronously.
    # ranked_results = [repository.get_commit(c) for c in preprocessed_commits]

    # -------------------------------------------------------------------------
    # analyze candidates by applying rules and ML predictor
    # -------------------------------------------------------------------------

    with ExecutionTimer(
        core_statistics.sub_collection(name="analyze candidates")
    ) as timer:
        _logger.info("Extracting features from commits...")
        # annotated_candidates = []
        # with Counter(timer.collection.sub_collection("commit analysing")) as counter:
        #     counter.initialize("analyzed commits", unit="commit")
        #     # TODO remove "proactive" invocation of feature extraction
        #     for commit in tqdm(preprocessed_commits):
        #         counter.increment("analyzed commits")
        #         annotated_candidates.append(extract_features(commit, advisory_record))

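        # Two phases: first annotate each commit by applying the active rules,
        # then rank the annotated candidates with the ML model.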
        annotated_candidates = apply_rules(
            preprocessed_commits, advisory_record, active_rules=active_rules
        )
        annotated_candidates = rank(annotated_candidates, model_name=model_name)

    return annotated_candidates, advisory_record
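

# Example (hypothetical) invocation: the repository URL and version interval
# below are illustrative and assume a reachable backend plus the constants
# configured above (GIT_CACHE, MAX_CANDIDATES, ...):
#
#     annotated_candidates, advisory_record = prospector(
#         vulnerability_id="CVE-2014-0050",
#         repository_url="https://github.com/apache/commons-fileupload",
#         version_interval="1.3:1.3.1",
#         use_nvd=True,
#     )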