def test_extract_referred_to_by_pages_linked_from_advisories(repository, requests_mock):
    """A commit is matched when its id appears in the text of a page linked from the advisory."""
    page_url = "https://for.testing.purposes/containing_commit_id_in_text_2"
    requests_mock.get(
        page_url,
        text="some text r97993e3d78e1f5389b7b172ba9f308440830ce5 blah",
    )
    advisory_record = AdvisoryRecord(
        vulnerability_id="CVE-2020-26258",
        references=[page_url],
    )

    # The page above mentions this commit id, so the page URL must be reported.
    mentioned_commit = Commit(
        commit_id="r97993e3d78e1f5389b7b172ba9f308440830ce5",
        repository="test_repository",
    )
    assert extract_referred_to_by_pages_linked_from_advisories(
        mentioned_commit, advisory_record
    ) == {page_url}

    # This commit id is nowhere in the page text: no match expected.
    unmentioned_commit = Commit(
        commit_id="f4d2eabd921cbd8808b9d923ee63d44538b4154f",
        repository="test_repository",
    )
    assert (
        extract_referred_to_by_pages_linked_from_advisories(
            unmentioned_commit, advisory_record
        )
        == set()
    )
def candidates():
    """Return a fixed list of candidate commits carrying different relevance signals."""
    candidate_commits = [
        # Strongest candidate: message, issue ref, changed files and CVE ref all present.
        Commit(
            repository="repo1",
            commit_id="1",
            message="Blah blah blah fixes CVE-2020-26258 and a few other issues",
            ghissue_refs=["example"],
            changed_files={"foo/bar/otherthing.xml", "pom.xml"},
            cve_refs=["CVE-2020-26258"],
        ),
        # Only the CVE reference.
        Commit(repository="repo2", commit_id="2", cve_refs=["CVE-2020-26258"]),
        # CVE mentioned in the message plus an issue reference.
        Commit(
            repository="repo3",
            commit_id="3",
            message="Another commit that fixes CVE-2020-26258",
            ghissue_refs=["example"],
        ),
        # Security-flavoured message, no explicit CVE.
        Commit(
            repository="repo4",
            commit_id="4",
            message="Endless loop causes DoS vulnerability",
            changed_files={"foo/bar/otherthing.xml", "pom.xml"},
        ),
        # Security-flavoured message touching a realistic source path.
        Commit(
            repository="repo5",
            commit_id="5",
            message="Insecure deserialization",
            changed_files={
                "core/src/main/java/org/apache/cxf/workqueue/AutomaticWorkQueueImpl.java"
            },
        ),
    ]
    return candidate_commits
def test_simple_write(setupdb):
    """Two distinct commit rows can be written to the database without error."""
    db = setupdb
    db.connect(DB_CONNECT_STRING)

    def make_commit(commit_id, repository, timestamp, ghissue_refs):
        # Fresh list/tuple objects on every call so the two saved commits
        # never share mutable state.
        return Commit(
            commit_id=commit_id,
            repository=repository,
            timestamp=timestamp,
            hunks=[(3, 5)],
            hunk_count=1,
            message="Some random garbage",
            diff=["fasdfasfa", "asf90hfasdfads", "fasd0fasdfas"],
            changed_files=["fadsfasd/fsdafasd/fdsafafdsa.ifd"],
            message_reference_content=[],
            jira_refs=[],
            ghissue_refs=ghissue_refs,
            cve_refs=["fasdfads", "fsfasf"],
            tags=["tag1"],
        )

    db.save(
        make_commit("1234", "https://blabla.com/zxyufd/fdafa", 123456789, [])
    )
    db.save(
        make_commit(
            "42423b2423",
            "https://fasfasdfasfasd.com/rewrwe/rwer",
            121422430,
            ["hggdhd"],
        )
    )
def candidates():
    """Return five minimal candidate commits; only repo1 and repo3 carry an issue reference."""
    specs = [
        ("repo1", "1", ["example"]),
        ("repo2", "2", None),
        ("repo3", "3", ["example"]),
        ("repo4", "4", None),
        ("repo5", "5", None),
    ]
    result = []
    for repo, cid, issue_refs in specs:
        if issue_refs is None:
            result.append(Commit(repository=repo, commit_id=cid))
        else:
            result.append(
                Commit(repository=repo, commit_id=cid, ghissue_refs=issue_refs)
            )
    return result
def lookup(self, repository: str, commit_id: str = None):
    """Fetch commits for *repository* from the DB as datamodel Commit objects.

    If *commit_id* is given it may be a comma-separated list of ids; one
    entry is appended to the result per id, with None for ids not found.
    If *commit_id* is omitted, all commits of the repository are returned
    (no None placeholders in that case).

    Raises a generic Exception if the connection is missing or the query
    fails (the underlying error is logged with its traceback).
    """
    # Returns the results of the query as list of Commit objects
    if not self.connection:
        raise Exception("Invalid connection")
    data = []
    try:
        # DictCursor rows support both key and positional access; both
        # styles are used below.
        cur = self.connection.cursor(cursor_factory=DictCursor)
        if commit_id:
            for cid in commit_id.split(","):
                cur.execute(
                    "SELECT * FROM commits WHERE repository = %s AND commit_id =%s",
                    (
                        repository,
                        cid,
                    ),
                )
                result = cur.fetchall()
                if len(result):
                    # Workaround for unmarshaling hunks: the DB stores each
                    # hunk as a "(a,b)" string; convert back to int pairs.
                    lis = []
                    for r in result[0]["hunks"]:
                        a, b = r.strip("()").split(",")
                        lis.append((int(a), int(b)))
                    result[0]["hunks"] = lis
                    parsed_commit = Commit.parse_obj(result[0])
                    data.append(parsed_commit)
                else:
                    # Keep positional correspondence with the requested ids.
                    data.append(None)
        else:
            cur.execute(
                "SELECT * FROM commits WHERE repository = %s",
                (repository,),
            )
            result = cur.fetchall()
            if len(result):
                for res in result:
                    # Workaround for unmarshaling hunks
                    # NOTE(review): this branch addresses the hunks column
                    # positionally (res[3]) while the branch above uses the
                    # key "hunks" — presumably column 3 is "hunks"; confirm
                    # against the table schema.
                    lis = []
                    for r in res[3]:
                        a, b = r.strip("()").split(",")
                        lis.append((int(a), int(b)))
                    res[3] = lis
                    parsed_commit = Commit.parse_obj(res)
                    data.append(parsed_commit)
        cur.close()
    except Exception:
        _logger.error("Could not lookup commit vector in database", exc_info=True)
        raise Exception("Could not lookup commit vector in database")
    return data
def preprocess_commit(git_commit: GitCommit) -> DatamodelCommit:
    # TODO need to recheck these docstring, it may contains some outdated info
    """Translate a (git)Commit into a preprocessed Commit for the DB and ranking/ML module.

    The structure of this module is straightforward and should remain simple
    and clear for the sake of maintainability and extensibility. Add as many
    "extractor" functions as needed and call them from here, assigning their
    result to the appropriate attribute of the preprocessed Commit. Remember
    to propagate changes to the DB schema and to the save() and lookup()
    functions of the database module.

    NOTE: there are two classes named Commit: the git-module one represents a
    commit as extracted directly from Git with only minimal post-processing;
    datamodel.Commit instead maps one-to-one onto the rows of the backend
    database, and its instances are the input to the ranking module (together
    with the AdvisoryRecord against which they must be matched).
    """
    commit = DatamodelCommit(
        commit_id=git_commit.get_id(),
        repository=git_commit._repository._url,
    )

    # All attributes that do not depend on a particular advisory are computed
    # here so that they can be stored in the db. Space-efficiency is important.
    message = git_commit.get_msg()
    commit.message = message
    commit.diff = git_commit.get_diff()
    commit.hunks = git_commit.get_hunks()
    commit.timestamp = int(git_commit.get_timestamp())
    commit.changed_files = git_commit.get_changed_files()
    commit.tags = git_commit.get_tags()
    commit.jira_refs = list(set(extract_jira_references(message)))
    commit.ghissue_refs = extract_ghissue_references(message)
    commit.cve_refs = extract_cve_references(message)
    return commit
def test_post_preprocessed_commits():
    """POSTing preprocessed commit dicts to /commits/ succeeds with an ok status."""
    specs = [
        ("https://github.com/apache/dubbo", "yyy"),
        ("https://github.com/apache/dubbo", "zzz"),
        ("https://github.com/apache/struts", "bbb"),
    ]
    commits = [
        Commit(repository=repo, commit_id=cid).__dict__ for repo, cid in specs
    ]
    response = client.post("/commits/", json=commits)
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
def test_same_path_only(paths):
    """Exactly the paths shared by the commit and the advisory are reported."""
    relevant = paths[:2]
    commit = Commit(
        commit_id="test_commit", repository="test_repository", changed_files=paths
    )
    advisory_record = AdvisoryRecord(
        vulnerability_id="test_advisory_record", paths=relevant
    )
    assert extract_changed_relevant_paths(commit, advisory_record) == set(relevant)
def test_time_between_commit_and_advisory_record():
    """The extractor returns commit timestamp minus advisory publication timestamp."""
    advisory_record = AdvisoryRecord(
        vulnerability_id="test_advisory_record", published_timestamp=100
    )
    commit = Commit(
        commit_id="test_commit", repository="test_repository", timestamp=142
    )
    delta = extract_time_between_commit_and_advisory_record(commit, advisory_record)
    assert delta == 42
def test_no_match(paths):
    """Disjoint commit/advisory path sets produce an empty result."""
    commit = Commit(
        commit_id="test_commit",
        repository="test_repository",
        changed_files=paths[:1],
    )
    advisory_record = AdvisoryRecord(
        vulnerability_id="test_advisory_record", paths=paths[2:]
    )
    result = extract_changed_relevant_paths(commit, advisory_record)
    assert result == set()
def test_extract_referred_to_by_pages_linked_from_advisories_wrong_url(repository):
    """An unreachable reference URL yields no matching pages."""
    commit = Commit(
        commit_id="r97993e3d78e1f5389b7b172ba9f308440830ce5",
        repository="test_repository",
    )
    advisory_record = AdvisoryRecord(
        vulnerability_id="CVE-2020-26258",
        references=["https://non-existing-url.com"],
    )
    pages = extract_referred_to_by_pages_linked_from_advisories(
        commit, advisory_record
    )
    assert not pages
def test_extract_referred_to_by_nvd(repository):
    """References whose URL contains the commit id are returned; others are not."""
    matching_reference = "https://lists.apache.org/thread.html/r97993e3d78e1f5389b7b172ba9f308440830ce5f051ee62714a0aa34@%3Ccommits.struts.apache.org%3E"
    advisory_record = AdvisoryRecord(
        vulnerability_id="CVE-2020-26258",
        references=[matching_reference, "https://other.com"],
    )

    # This commit id is embedded in the first reference URL.
    referred_commit = Commit(
        commit_id="r97993e3d78e1f5389b7b172ba9f308440830ce5",
        repository="test_repository",
    )
    assert extract_referred_to_by_nvd(referred_commit, advisory_record) == {
        matching_reference,
    }

    # This commit id does not appear in any reference.
    other_commit = Commit(
        commit_id="f4d2eabd921cbd8808b9d923ee63d44538b4154f",
        repository="test_repository",
    )
    assert extract_referred_to_by_nvd(other_commit, advisory_record) == set()
def test_extract_references_vuln_id():
    """True when the advisory's vulnerability id is among the commit's CVE refs."""
    cve_ids = [
        "test_advisory_record",
        "another_advisory_record",
        "yet_another_advisory_record",
    ]
    commit = Commit(
        commit_id="test_commit",
        repository="test_repository",
        cve_refs=cve_ids,
    )
    advisory_record = AdvisoryRecord(vulnerability_id="test_advisory_record")
    assert extract_references_vuln_id(commit, advisory_record) is True
def test_sub_path_matching(paths, sub_paths):
    """A changed file is relevant when an advisory path is a sub-path of it."""
    expected = {
        "fire-nation/zuko/lightning.png",
        "water-bending/katara/necklace.gif",
    }
    commit = Commit(
        commit_id="test_commit", repository="test_repository", changed_files=paths
    )
    advisory_record = AdvisoryRecord(
        vulnerability_id="test_advisory_record", paths=sub_paths
    )
    assert extract_changed_relevant_paths(commit, advisory_record) == expected
def test_report_generation():
    """An HTML report is produced from 100 random candidates and a random advisory."""
    candidates = [
        Commit(
            commit_id=random_commit_hash(),
            repository=random_url(4),
            message=" ".join(random_list_of_strs(100)),
            timestamp=randint(0, 100000),
            hunks=random_list_of_hunks(1000, 42),
            diff=random_list_of_strs(200),
            changed_files=random_list_of_path(4, 42),
            message_reference_content=random_list_of_strs(42),
            jira_refs=random_list_of_jira_refs(42),
            ghissue_refs=random_list_of_github_issue_ids(100000, 42),
            cve_refs=random_list_of_cve(42),
            tags=random_list_of_strs(42),
            annotations=random_dict_of_strs(16, 10),
        )
        for _ in range(100)
    ]

    advisory = AdvisoryRecord(
        vulnerability_id=random_list_of_cve(max_count=1, min_count=1)[0],
        repository_url=random_url(4),
        published_timestamp=randint(0, 100000),
        last_modified_timestamp=randint(0, 100000),
        references=random_list_of_strs(42),
        references_content=random_list_of_strs(42),
        advisory_references=random_list_of_cve(42),
        affected_products=random_list_of_strs(42),
        description=" ".join(random_list_of_strs(42)),
        preprocessed_vulnerability_description=" ".join(random_list_of_strs(42)),
        relevant_tags=random_list_of_strs(42),
        versions=random_list_of_version(42, 4, 42),
        from_nvd=random_bool(),
        paths=random_list_of_path(4, 42),
        code_tokens=tuple(random_list_of_strs(42)),
    )

    filename = "test_report.html"
    # Start from a clean slate so the final assertion proves the file was created.
    if os.path.isfile(filename):
        os.remove(filename)
    generated_report = report_as_html(
        candidates, advisory, filename, statistics=sample_statistics()
    )
    assert os.path.isfile(generated_report)
def test_extract_other_CVE_in_message():
    """CVE ids different from the advisory's own id are extracted from the commit."""
    commit = Commit(
        commit_id="test_commit",
        repository="test_repository",
        cve_refs=["CVE-2021-29425", "CVE-2021-21251"],
    )

    # Neither referenced CVE matches the advisory: both are "other" CVEs.
    unrelated_advisory = AdvisoryRecord(vulnerability_id="CVE-2020-31284")
    assert extract_other_CVE_in_message(commit, unrelated_advisory) == {
        "CVE-2021-29425",
        "CVE-2021-21251",
    }

    # One referenced CVE is the advisory's own id and must be excluded.
    overlapping_advisory = AdvisoryRecord(vulnerability_id="CVE-2021-29425")
    assert extract_other_CVE_in_message(commit, overlapping_advisory) == {
        "CVE-2021-21251",
    }
def test_upsert(setupdb):
    """Saving an already-stored commit id updates the row and remains retrievable."""
    db = setupdb
    db.connect(DB_CONNECT_STRING)
    commit_obj = Commit(
        commit_id="42423b2423",
        repository="https://fasfasdfasfasd.com/rewrwe/rwer",
        timestamp=1214212430,
        hunks=[(3, 3)],
        hunk_count=3,
        message="Some random garbage upserted",
        diff=["fasdfasfa", "asf90hfasdfads", "fasd0fasdfas"],
        changed_files=["fadsfasd/fsdafasd/fdsafafdsa.ifd"],
        message_reference_content=[],
        jira_refs=[],
        ghissue_refs=["hggdhd"],
        cve_refs=["fasdfads", "fsfasf"],
        tags=["tag1"],
    )
    db.save(commit_obj)
    looked_up = db.lookup(commit_obj.repository, commit_obj.commit_id)
    assert looked_up is not None
    # remove garbage added by tests from DB
    db.reset()
def test_simple():
    """A Commit retains the identifiers it was constructed with.

    The original test constructed a second identical Commit and never
    asserted anything about it (dead code); it also never checked
    commit_id. Both issues are fixed here.
    """
    commit = Commit(commit_id="abcd", repository="https://github.com/abc/xyz")
    assert commit.commit_id == "abcd"
    assert commit.repository == "https://github.com/abc/xyz"
def test_extract_path_similarities():
    """Golden-output test for extract_path_similarities.

    Builds a commit with six changed paths and an advisory carrying sixteen
    code tokens, then compares the full CSV serialization of the resulting
    similarity DataFrame (one row per changed-file/code-token pair, with
    jaccard, sorensen-dice, otsuka-ochiai, (damerau-)levenshtein and derived
    columns) against a frozen expected string. Any change to the similarity
    computation or to the column set will fail this test.
    """
    code_tokens = [
        "TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato",
        "Bolin+Bumi+Ozai+Katara",
        "Jinora.Appa.Unalaq.Zaheer",
        "Naga.LinBeifong",
        "Sokka.Kya",
        "Bumi=Momo=Naga=Iroh",
        "Sokka_Unalaq",
        "Sokka.Iroh.Pabu",
        "LinBeifong=Zuko",
        "TenzinBolinSokka",
        "TophBeifongIroh",
        "Korra-AsamiSato-Pabu-Iroh",
        "Mako.Naga",
        "Jinora=Bumi",
        "BolinAppaKuvira",
        "TophBeifongIroh",
        "Amon+Zuko+Unalaq",
    ]
    paths = [
        "Unalaq/Aang/Suyin Beifong",
        "Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer",
        "Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko",
        "Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi",
        "Momo",
        "Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq",
    ]
    commit = Commit(changed_files=paths)
    advisory = AdvisoryRecord(
        vulnerability_id=random_list_of_cve(max_count=1, min_count=1)[0],
        code_tokens=code_tokens,
    )
    similarities: pandas.DataFrame = extract_path_similarities(commit, advisory)
    # Frozen oracle: the exact CSV dump of the similarity matrix
    # (96 rows = 6 paths x 16 tokens, plus header).
    expected = (
        ",changed file,code token,jaccard,sorensen-dice,otsuka-ochiai,levenshtein,damerau-levenshtein,length diff,inverted normalized levenshtein,inverted normalized damerau-levenshtein\n"
        "0,Unalaq/Aang/Suyin Beifong,TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato,0.09090909090909091,0.16666666666666666,0.17677669529663687,8,8,4,0.19999999999999996,0.19999999999999996\n"
        "1,Unalaq/Aang/Suyin Beifong,Bolin+Bumi+Ozai+Katara,0.0,0.0,0.0,4,4,0,0.6,0.6\n"
        "2,Unalaq/Aang/Suyin Beifong,Jinora.Appa.Unalaq.Zaheer,0.14285714285714285,0.25,0.25,4,4,0,0.6,0.6\n"
        "3,Unalaq/Aang/Suyin Beifong,Naga.LinBeifong,0.16666666666666666,0.2857142857142857,0.2886751345948129,3,3,1,0.7,0.7\n"
        "4,Unalaq/Aang/Suyin Beifong,Sokka.Kya,0.0,0.0,0.0,4,4,2,0.6,0.6\n"
        "5,Unalaq/Aang/Suyin Beifong,Bumi=Momo=Naga=Iroh,0.0,0.0,0.0,4,4,0,0.6,0.6\n"
        "6,Unalaq/Aang/Suyin Beifong,Sokka_Unalaq,0.2,0.3333333333333333,0.35355339059327373,4,4,2,0.6,0.6\n"
        "7,Unalaq/Aang/Suyin Beifong,Sokka.Iroh.Pabu,0.0,0.0,0.0,4,4,1,0.6,0.6\n"
        "8,Unalaq/Aang/Suyin Beifong,LinBeifong=Zuko,0.16666666666666666,0.2857142857142857,0.2886751345948129,4,4,1,0.6,0.6\n"
        "9,Unalaq/Aang/Suyin Beifong,TenzinBolinSokka,0.0,0.0,0.0,4,4,1,0.6,0.6\n"
        "10,Unalaq/Aang/Suyin Beifong,Korra-AsamiSato-Pabu-Iroh,0.0,0.0,0.0,5,5,1,0.5,0.5\n"
        "11,Unalaq/Aang/Suyin Beifong,Mako.Naga,0.0,0.0,0.0,4,4,2,0.6,0.6\n"
        "12,Unalaq/Aang/Suyin Beifong,Jinora=Bumi,0.0,0.0,0.0,4,4,2,0.6,0.6\n"
        "13,Unalaq/Aang/Suyin Beifong,BolinAppaKuvira,0.0,0.0,0.0,4,4,1,0.6,0.6\n"
        "14,Unalaq/Aang/Suyin Beifong,TophBeifongIroh,0.16666666666666666,0.2857142857142857,0.2886751345948129,4,4,1,0.6,0.6\n"
        "15,Unalaq/Aang/Suyin Beifong,Amon+Zuko+Unalaq,0.16666666666666666,0.2857142857142857,0.2886751345948129,4,4,1,0.6,0.6\n"
        "16,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato,0.25,0.4,0.4008918628686366,8,8,0,0.19999999999999996,0.19999999999999996\n"
        "17,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Bolin+Bumi+Ozai+Katara,0.1,0.18181818181818182,0.1889822365046136,8,8,4,0.19999999999999996,0.19999999999999996\n"
        "18,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Jinora.Appa.Unalaq.Zaheer,0.1,0.18181818181818182,0.1889822365046136,7,7,4,0.30000000000000004,0.30000000000000004\n"
        "19,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Naga.LinBeifong,0.1111111111111111,0.2,0.2182178902359924,7,7,5,0.30000000000000004,0.30000000000000004\n"
        "20,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Sokka.Kya,0.0,0.0,0.0,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "21,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Bumi=Momo=Naga=Iroh,0.1,0.18181818181818182,0.1889822365046136,8,8,4,0.19999999999999996,0.19999999999999996\n"
        "22,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Sokka_Unalaq,0.0,0.0,0.0,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "23,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Sokka.Iroh.Pabu,0.0,0.0,0.0,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "24,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,LinBeifong=Zuko,0.1111111111111111,0.2,0.2182178902359924,7,7,5,0.30000000000000004,0.30000000000000004\n"
        "25,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,TenzinBolinSokka,0.1111111111111111,0.2,0.2182178902359924,7,7,5,0.30000000000000004,0.30000000000000004\n"
        "26,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Korra-AsamiSato-Pabu-Iroh,0.2,0.3333333333333333,0.3380617018914066,6,6,3,0.4,0.4\n"
        "27,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Mako.Naga,0.0,0.0,0.0,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "28,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Jinora=Bumi,0.125,0.2222222222222222,0.2672612419124244,7,7,6,0.30000000000000004,0.30000000000000004\n"
        "29,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,BolinAppaKuvira,0.0,0.0,0.0,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "30,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,TophBeifongIroh,0.1111111111111111,0.2,0.2182178902359924,7,7,5,0.30000000000000004,0.30000000000000004\n"
        "31,Tenzin/Asami Sato/Suyin Beifong/Tenzin/Bumi/Zaheer,Amon+Zuko+Unalaq,0.0,0.0,0.0,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "32,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato,0.23076923076923078,0.375,0.375,8,8,0,0.19999999999999996,0.19999999999999996\n"
        "33,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Bolin+Bumi+Ozai+Katara,0.09090909090909091,0.16666666666666666,0.17677669529663687,7,7,4,0.30000000000000004,0.30000000000000004\n"
        "34,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Jinora.Appa.Unalaq.Zaheer,0.0,0.0,0.0,8,8,4,0.19999999999999996,0.19999999999999996\n"
        "35,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Naga.LinBeifong,0.1,0.18181818181818182,0.20412414523193154,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "36,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Sokka.Kya,0.0,0.0,0.0,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "37,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Bumi=Momo=Naga=Iroh,0.09090909090909091,0.16666666666666666,0.17677669529663687,7,7,4,0.30000000000000004,0.30000000000000004\n"
        "38,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Sokka_Unalaq,0.0,0.0,0.0,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "39,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Sokka.Iroh.Pabu,0.0,0.0,0.0,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "40,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,LinBeifong=Zuko,0.1,0.18181818181818182,0.20412414523193154,7,7,5,0.30000000000000004,0.30000000000000004\n"
        "41,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,TenzinBolinSokka,0.1,0.18181818181818182,0.20412414523193154,7,7,5,0.30000000000000004,0.30000000000000004\n"
        "42,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Korra-AsamiSato-Pabu-Iroh,0.18181818181818182,0.3076923076923077,0.31622776601683794,7,7,3,0.30000000000000004,0.30000000000000004\n"
        "43,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Mako.Naga,0.1111111111111111,0.2,0.25,7,7,6,0.30000000000000004,0.30000000000000004\n"
        "44,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Jinora=Bumi,0.0,0.0,0.0,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "45,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,BolinAppaKuvira,0.0,0.0,0.0,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "46,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,TophBeifongIroh,0.0,0.0,0.0,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "47,Asami Sato/Tenzin/Tonraq/Katara/Tarrlok/Naga/Zuko,Amon+Zuko+Unalaq,0.1,0.18181818181818182,0.20412414523193154,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "48,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato,0.3333333333333333,0.5,0.5,9,9,1,0.09999999999999998,0.09999999999999998\n"
        "49,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Bolin+Bumi+Ozai+Katara,0.2,0.3333333333333333,0.35355339059327373,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "50,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Jinora.Appa.Unalaq.Zaheer,0.0,0.0,0.0,9,9,5,0.09999999999999998,0.09999999999999998\n"
        "51,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Naga.LinBeifong,0.1,0.18181818181818182,0.20412414523193154,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "52,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Sokka.Kya,0.0,0.0,0.0,9,9,7,0.09999999999999998,0.09999999999999998\n"
        "53,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Bumi=Momo=Naga=Iroh,0.09090909090909091,0.16666666666666666,0.17677669529663687,8,8,5,0.19999999999999996,0.19999999999999996\n"
        "54,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Sokka_Unalaq,0.0,0.0,0.0,9,9,7,0.09999999999999998,0.09999999999999998\n"
        "55,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Sokka.Iroh.Pabu,0.0,0.0,0.0,9,9,6,0.09999999999999998,0.09999999999999998\n"
        "56,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,LinBeifong=Zuko,0.1,0.18181818181818182,0.20412414523193154,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "57,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,TenzinBolinSokka,0.1,0.18181818181818182,0.20412414523193154,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "58,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Korra-AsamiSato-Pabu-Iroh,0.18181818181818182,0.3076923076923077,0.31622776601683794,7,7,4,0.30000000000000004,0.30000000000000004\n"
        "59,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Mako.Naga,0.0,0.0,0.0,9,9,7,0.09999999999999998,0.09999999999999998\n"
        "60,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Jinora=Bumi,0.1111111111111111,0.2,0.25,8,8,7,0.19999999999999996,0.19999999999999996\n"
        "61,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,BolinAppaKuvira,0.2222222222222222,0.36363636363636365,0.4082482904638631,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "62,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,TophBeifongIroh,0.2222222222222222,0.36363636363636365,0.4082482904638631,7,7,6,0.30000000000000004,0.30000000000000004\n"
        "63,Amon/Asami Sato/Bumi/Kuvira/Toph Beifong/Bolin/Bumi,Amon+Zuko+Unalaq,0.1,0.18181818181818182,0.20412414523193154,8,8,6,0.19999999999999996,0.19999999999999996\n"
        "64,Momo,TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato,0.0,0.0,0.0,8,8,7,0.19999999999999996,0.19999999999999996\n"
        "65,Momo,Bolin+Bumi+Ozai+Katara,0.0,0.0,0.0,4,4,3,0.6,0.6\n"
        "66,Momo,Jinora.Appa.Unalaq.Zaheer,0.0,0.0,0.0,4,4,3,0.6,0.6\n"
        "67,Momo,Naga.LinBeifong,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "68,Momo,Sokka.Kya,0.0,0.0,0.0,2,2,1,0.8,0.8\n"
        "69,Momo,Bumi=Momo=Naga=Iroh,0.25,0.4,0.5,3,3,3,0.7,0.7\n"
        "70,Momo,Sokka_Unalaq,0.0,0.0,0.0,2,2,1,0.8,0.8\n"
        "71,Momo,Sokka.Iroh.Pabu,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "72,Momo,LinBeifong=Zuko,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "73,Momo,TenzinBolinSokka,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "74,Momo,Korra-AsamiSato-Pabu-Iroh,0.0,0.0,0.0,5,5,4,0.5,0.5\n"
        "75,Momo,Mako.Naga,0.0,0.0,0.0,2,2,1,0.8,0.8\n"
        "76,Momo,Jinora=Bumi,0.0,0.0,0.0,2,2,1,0.8,0.8\n"
        "77,Momo,BolinAppaKuvira,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "78,Momo,TophBeifongIroh,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "79,Momo,Amon+Zuko+Unalaq,0.0,0.0,0.0,3,3,2,0.7,0.7\n"
        "80,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,TophBeifong_Zuko_IknikBlackstoneVarrick_AsamiSato,0.13333333333333333,0.23529411764705882,0.23570226039551587,9,9,2,0.09999999999999998,0.09999999999999998\n"
        "81,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Bolin+Bumi+Ozai+Katara,0.08333333333333333,0.15384615384615385,0.16666666666666666,9,9,6,0.09999999999999998,0.09999999999999998\n"
        "82,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Jinora.Appa.Unalaq.Zaheer,0.08333333333333333,0.15384615384615385,0.16666666666666666,10,10,6,0.0,0.0\n"
        "83,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Naga.LinBeifong,0.2,0.3333333333333333,0.3849001794597505,8,8,7,0.19999999999999996,0.19999999999999996\n"
        "84,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Sokka.Kya,0.1,0.18181818181818182,0.23570226039551587,9,9,8,0.09999999999999998,0.09999999999999998\n"
        "85,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Bumi=Momo=Naga=Iroh,0.0,0.0,0.0,10,10,6,0.0,0.0\n"
        "86,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Sokka_Unalaq,0.2222222222222222,0.36363636363636365,0.47140452079103173,8,8,8,0.19999999999999996,0.19999999999999996\n"
        "87,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Sokka.Iroh.Pabu,0.09090909090909091,0.16666666666666666,0.19245008972987526,9,9,7,0.09999999999999998,0.09999999999999998\n"
        "88,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,LinBeifong=Zuko,0.2,0.3333333333333333,0.3849001794597505,8,8,7,0.19999999999999996,0.19999999999999996\n"
        "89,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,TenzinBolinSokka,0.2,0.3333333333333333,0.3849001794597505,8,8,7,0.19999999999999996,0.19999999999999996\n"
        "90,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Korra-AsamiSato-Pabu-Iroh,0.07692307692307693,0.14285714285714285,0.14907119849998599,10,10,5,0.0,0.0\n"
        "91,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Mako.Naga,0.1,0.18181818181818182,0.23570226039551587,9,9,8,0.09999999999999998,0.09999999999999998\n"
        "92,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Jinora=Bumi,0.0,0.0,0.0,10,10,8,0.0,0.0\n"
        "93,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,BolinAppaKuvira,0.2,0.3333333333333333,0.3849001794597505,9,9,7,0.09999999999999998,0.09999999999999998\n"
        "94,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,TophBeifongIroh,0.2,0.3333333333333333,0.3849001794597505,8,8,7,0.19999999999999996,0.19999999999999996\n"
        "95,Kuvira/Bolin/Lin Beifong/Sokka/Mako/Korra/Toph Beifong/Unalaq,Amon+Zuko+Unalaq,0.09090909090909091,0.16666666666666666,0.19245008972987526,9,9,7,0.09999999999999998,0.09999999999999998\n"
    )
    assert similarities.to_csv() == expected
def prospector(  # noqa: C901
    vulnerability_id: str,
    repository_url: str,
    publication_date: str = "",
    vuln_descr: str = "",
    tag_interval: str = "",
    version_interval: str = "",
    modified_files: "list[str]" = None,
    code_tokens: "list[str]" = None,
    time_limit_before: int = TIME_LIMIT_BEFORE,
    time_limit_after: int = TIME_LIMIT_AFTER,
    use_nvd: bool = False,
    nvd_rest_endpoint: str = "",
    backend_address: str = "",
    git_cache: str = GIT_CACHE,
    limit_candidates: int = MAX_CANDIDATES,
    active_rules: "list[str]" = None,
    model_name: str = "",
) -> "tuple[list[Commit], AdvisoryRecord]":
    """Run the full advisory-to-commit matching pipeline.

    Steps: build and analyze the AdvisoryRecord, clone the repository and
    collect candidate commits, filter them, preprocess the ones not already
    known to the backend (saving the new ones back), then apply rules and
    the ranking model.

    Returns a tuple (annotated_candidates, advisory_record).

    Fixes vs. previous revision: mutable default arguments ([] / ["ALL"])
    replaced with None sentinels (behavior unchanged for callers), and the
    return annotation corrected — the function returns a 2-tuple, not a
    bare list.
    """
    # Normalize the None sentinels to the historical default values.
    modified_files = [] if modified_files is None else modified_files
    code_tokens = [] if code_tokens is None else code_tokens
    active_rules = ["ALL"] if active_rules is None else active_rules

    _logger.info("begin main commit and CVE processing")

    # -------------------------------------------------------------------------
    # advisory record extraction
    # -------------------------------------------------------------------------
    advisory_record = AdvisoryRecord(
        vulnerability_id=vulnerability_id,
        repository_url=repository_url,
        description=vuln_descr,
        from_nvd=use_nvd,
        nvd_rest_endpoint=nvd_rest_endpoint,
    )

    _logger.pretty_log(advisory_record)
    advisory_record.analyze(use_nvd=use_nvd)
    _logger.info(f"{advisory_record.code_tokens=}")

    if publication_date != "":
        advisory_record.published_timestamp = int(
            datetime.strptime(publication_date, r"%Y-%m-%dT%H:%M%z").timestamp()
        )

    if len(code_tokens) > 0:
        advisory_record.code_tokens += tuple(code_tokens)
        # drop duplicates
        advisory_record.code_tokens = list(set(advisory_record.code_tokens))

    # FIXME this should be handled better (or '' should not end up in the
    # modified_files in the first place)
    if modified_files != [""]:
        advisory_record.paths += modified_files

    _logger.info(f"{advisory_record.code_tokens=}")

    # -------------------------------------------------------------------------
    # retrieval of commit candidates
    # -------------------------------------------------------------------------
    with ExecutionTimer(
        core_statistics.sub_collection(name="retrieval of commit candidates")
    ):
        _logger.info(
            "Downloading repository {} in {}..".format(repository_url, git_cache)
        )
        repository = Git(repository_url, git_cache)
        repository.clone()
        tags = repository.get_tags()
        _logger.debug(f"Found tags: {tags}")
        _logger.info("Done retrieving %s" % repository_url)

        # Resolve the candidate commit range either from an explicit tag
        # interval or from a version interval mapped onto tags.
        prev_tag = None
        following_tag = None
        if tag_interval != "":
            prev_tag, following_tag = tag_interval.split(":")
        elif version_interval != "":
            vuln_version, fixed_version = version_interval.split(":")
            prev_tag = get_tag_for_version(tags, vuln_version)[0]
            following_tag = get_tag_for_version(tags, fixed_version)[0]

        # Restrict by time window around the advisory publication, if known.
        since = None
        until = None
        if advisory_record.published_timestamp:
            since = advisory_record.published_timestamp - time_limit_before
            until = advisory_record.published_timestamp + time_limit_after

        candidates = repository.get_commits(
            since=since,
            until=until,
            ancestors_of=following_tag,
            exclude_ancestors_of=prev_tag,
            filter_files="*.java",
        )

        _logger.info("Found %d candidates" % len(candidates))

    # -------------------------------------------------------------------------
    # commit filtering: apply additional criteria to discard commits from the
    # initial set extracted from the repository.
    # TODO: candidates touching files whose path contains the advisory's
    # code_tokens used to be pre-filtered here (filter_by_changed_files);
    # re-evaluate whether to restore that.
    # -------------------------------------------------------------------------
    with ExecutionTimer(core_statistics.sub_collection(name="commit filtering")):
        candidates = filter_commits(candidates)
        _logger.debug(f"Collected {len(candidates)} candidates")

    if len(candidates) > limit_candidates:
        _logger.error("Number of candidates exceeds %d, aborting." % limit_candidates)
        _logger.error(
            "Possible cause: the backend might be unreachable or otherwise unable to provide details about the advisory."
        )
        sys.exit(-1)

    # -------------------------------------------------------------------------
    # commit preprocessing
    # -------------------------------------------------------------------------
    with ExecutionTimer(
        core_statistics.sub_collection(name="commit preprocessing")
    ) as timer:
        raw_commit_data = dict()
        missing = []
        try:
            # Exploit the preprocessed commits already stored in the backend
            # and only process those that are missing. Note: the endpoint
            # does not exist (yet)
            r = requests.get(
                backend_address
                + "/commits/"
                + repository_url
                + "?commit_id="
                + ",".join(candidates)
            )
            _logger.info("The backend returned status '%d'" % r.status_code)
            if r.status_code != 200:
                _logger.error("This is weird...Continuing anyway.")
                missing = candidates
            else:
                raw_commit_data = r.json()
                _logger.info(
                    "Found {} preprocessed commits".format(len(raw_commit_data))
                )
        except requests.exceptions.ConnectionError:
            _logger.error(
                "Could not reach backend, is it running? The result of commit pre-processing will not be saved.",
                exc_info=log.config.level < logging.WARNING,
            )
            missing = candidates

        preprocessed_commits: "list[Commit]" = []
        for idx, commit in enumerate(raw_commit_data):
            if (
                commit
            ):  # None results are not in the DB, collect them to missing list, they need local preprocessing
                preprocessed_commits.append(Commit.parse_obj(commit))
            else:
                missing.append(candidates[idx])

        _logger.info("Preprocessing commits...")
        # Everything appended from here on is newly preprocessed and must be
        # sent to the backend afterwards.
        first_missing = len(preprocessed_commits)
        pbar = tqdm(missing)
        with Counter(
            timer.collection.sub_collection(name="commit preprocessing")
        ) as counter:
            counter.initialize("preprocessed commits", unit="commit")
            for commit_id in pbar:
                counter.increment("preprocessed commits")
                preprocessed_commits.append(
                    preprocess_commit(repository.get_commit(commit_id))
                )

    _logger.pretty_log(advisory_record)
    _logger.debug(f"preprocessed {len(preprocessed_commits)} commits")

    payload = [c.__dict__ for c in preprocessed_commits[first_missing:]]

    # -------------------------------------------------------------------------
    # save preprocessed commits to backend
    # -------------------------------------------------------------------------
    with ExecutionTimer(
        core_statistics.sub_collection(name="save preprocessed commits to backend")
    ):
        _logger.info("Sending preprocessing commits to backend...")
        try:
            r = requests.post(backend_address + "/commits/", json=payload)
            _logger.info(
                "Saving to backend completed (status code: %d)" % r.status_code
            )
        except requests.exceptions.ConnectionError:
            _logger.error(
                "Could not reach backend, is it running?"
                "The result of commit pre-processing will not be saved."
                "Continuing anyway.....",
                exc_info=log.config.level < logging.WARNING,
            )

    # TODO compute actual rank
    # This can be done by a POST request that creates a "search" job
    # whose inputs are the AdvisoryRecord, and the repository URL
    # The API returns immediately indicating a job id. From this
    # id, a URL can be constructed to poll the results asynchronously.

    # -------------------------------------------------------------------------
    # analyze candidates by applying rules and ML predictor
    # -------------------------------------------------------------------------
    with ExecutionTimer(
        core_statistics.sub_collection(name="analyze candidates")
    ) as timer:
        _logger.info("Extracting features from commits...")
        annotated_candidates = apply_rules(
            preprocessed_commits, advisory_record, active_rules=active_rules
        )
        annotated_candidates = rank(annotated_candidates, model_name=model_name)

    return annotated_candidates, advisory_record