def test_add_commits_to_database(example_vulnerability,
                                 example_vulnerability_git_repo):
    """Check that commits are inserted into the commits table with the expected content.

    Ten commit ids are added to a fresh in-memory database, the first stored
    row is compared field by field against known values, and then ten more
    ids are added to confirm the row count grows to twenty.

    Args:
        example_vulnerability: fixture mapping; only its ``'repo_url'`` entry
            is read here.
        example_vulnerability_git_repo: fixture passed through as ``git_repo``
            to ``database.add_commits_to_database``.
    """
    connection, cursor = database.connect_with_database(':memory:')
    # Close the connection even when an assertion fails.
    try:
        commit_ids_to_add = database.get_commit_ids_between_timestamp(
            since=1457823600,
            until=1529532000,
            git_repo=None,
            repository_url=example_vulnerability['repo_url'])

        database.add_commits_to_database(
            connection,
            commit_ids_to_add[:10],
            git_repo=example_vulnerability_git_repo,
            repository_url=example_vulnerability['repo_url'],
            with_message_references=False)
        cursor.execute('SELECT COUNT(id) FROM commits')
        assert cursor.fetchone()['COUNT(id)'] == 10

        # verify entries are correct
        # NOTE(review): the expected strings below are copied verbatim from
        # the reviewed text; confirm the whitespace inside the diff lines
        # against the fixture repository.
        cursor.execute('SELECT * FROM commits')
        row = cursor.fetchone()
        assert row['repository_url'] == 'https://github.com/jenkinsci/promoted-builds-plugin'
        assert row['id'] == 'e4c9304553f2868f67556644f5831eba60cf2c34'
        assert row['timestamp'] == '1528139978'
        assert row['message'] == "['[maven-release-plugin] prepare for next development iteration']"
        assert row['changed_files'] == "['pom.xml']"
        assert row['diff'] == "['diff --git a/pom.xml b/pom.xml', 'index 3afe9c3..51b568a 100644', '--- a/pom.xml', '+++ b/pom.xml', '@@ -10,3 +10,3 @@', ' <artifactId>promoted-builds</artifactId>', '- <version>3.2</version>', '+ <version>3.3-SNAPSHOT</version>', ' <packaging>hpi</packaging>', '@@ -41,3 +41,3 @@', ' <url>https://github.com/jenkinsci/${project.artifactId}-plugin</url>', '- <tag>promoted-builds-3.2</tag>', '+ <tag>HEAD</tag>', ' </scm>']"
        assert row['hunks'] == "[(6, 8), (11, 13)]"
        # No message references were requested, so the column stays NULL.
        assert row['commit_message_reference_content'] is None
        assert row['preprocessed_message'] == "maven release plugin prepare development iteration"
        # Disabled check, kept for reference:
        # assert row['preprocessed_diff'] == "artifactid artifact id promote build artifactid artifact id version version version snapshot version packaging hpi packaging url https github com jenkinsci project artifactid artifact id plugin url tag promote build tag tag head tag scm"
        assert row['preprocessed_changed_files'] == "pom.xml pom xml"
        assert row['preprocessed_commit_message_reference_content'] is None

        # test adding without reference content: to speed up the time
        database.add_commits_to_database(
            connection,
            commit_ids_to_add[10:20],
            git_repo=example_vulnerability_git_repo,
            repository_url=example_vulnerability['repo_url'],
            with_message_references=False)
        cursor.execute('SELECT COUNT(id) FROM commits')
        assert cursor.fetchone()['COUNT(id)'] == 20
    finally:
        connection.close()
def load_vulnerabilities():
    """Load the vulnerability-related tables into pandas DataFrames.

    Reads from the databases at the module-level ``vulnerabilities_db_path``
    and ``commits_db_path``. Both connections are closed before returning;
    the returned frames are already materialised.

    Returns:
        tuple: ``(vulnerabilities_df, db_references_df,
        advisory_references_df, tags_df, repository_url_df, fixes_df)``.
        ``vulnerabilities_df`` is indexed by ``vulnerability_id``.
        ``repository_url_df`` has one row per distinct ``repo_url`` with a
        derived ``project_name`` column.
    """
    # The cursors returned alongside the connections are not used here.
    prospector_connection, _ = database.connect_with_database(commits_db_path)
    vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
        vulnerabilities_db_path)

    try:
        print("Reading vulnerabilities")
        vulnerabilities_df = pd.read_sql(
            "SELECT * FROM vulnerabilities",
            vulnerabilities_connection).set_index("vulnerability_id")
        db_references_df = pd.read_sql(
            "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references",
            vulnerabilities_connection)
        advisory_references_df = pd.read_sql(
            "SELECT vulnerability_id, url FROM advisory_references",
            vulnerabilities_connection)
        fixes_df = pd.read_sql("SELECT * FROM fix_commits",
                               vulnerabilities_connection)
        tags_df = pd.read_sql("SELECT * FROM tags", prospector_connection)
    finally:
        # Previously these connections were never closed.
        vulnerabilities_connection.close()
        prospector_connection.close()

    # Create repository_url_df: one row per distinct repository URL, with a
    # project name derived from the URL text.
    ignored_tokens = ('github', 'com', 'git', 'org')
    repo_urls = list(vulnerabilities_df.repo_url.unique())
    project_names = []
    for repo_url in repo_urls:
        # Blank out the scheme and every non-word character, then normalise.
        filtered = rank.simpler_filter_text(
            re.sub(r'^https?://|[^\w]', ' ', repo_url)).lower()
        project_names.append(' '.join(
            token for token in filtered.split(' ')
            if token not in ignored_tokens))

    # Building the frame in one go also yields both columns when there are
    # no vulnerabilities at all.
    repository_url_df = pd.DataFrame({
        'repo_url': repo_urls,
        'project_name': project_names,
    })

    return vulnerabilities_df, db_references_df, advisory_references_df, tags_df, repository_url_df, fixes_df
],
        'fix_commits': ['2bb79861dbaf7e8a9646fcd70359523fdb464d9c'],
        'project_name': 'github JPCERTCC LogonTracer',
        'nvd_reference_content': 'git GitHub hub cve-2018 logon tracer vulnerability jpcert base score cookie cvss av ac sign code use learn product release',
        'preprocessed_description': 'LogonTracer logon tracer early allow remote attacker conduct xml external entity xxe attack unspecified vector'
    }
    return example_vulnerability
# NOTE(review): the lines above are the tail of a definition that begins
# outside the reviewed range; its code is left untouched.


# databases are created in the notebook database_creation.ipynb
# These connections are opened at import time and shared by the tests below.
vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
    'test-vulnerabilities.db')
prospector_connection, prospector_cursor = database.connect_with_database(
    'test-commits.db')


@pytest.mark.database
def test_database_coverage(example_vulnerability):
    """Add the example vulnerability and check it is stored exactly once.

    Uses the module-level connection to ``test-vulnerabilities.db``. After
    the insert, exactly one row in ``vulnerabilities`` must carry the
    example's ``vulnerability_id``.
    """
    database.add_vulnerabiliy_to_database(
        vulnerabilities_connection, example_vulnerability['vulnerability_id'],
        example_vulnerability['repo_url'],
        example_vulnerability['description'],
        str(example_vulnerability['nvd_published_timestamp']))
    # The timestamp is passed as a string (see the str() call above).
    assert vulnerabilities_cursor.execute(
        "SELECT COUNT(vulnerability_id) FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id;",
        {
            'vulnerability_id': example_vulnerability['vulnerability_id']
        }).fetchone()['COUNT(vulnerability_id)'] == 1
def connect_with_commits_database(path):
    """Open the commits database at ``path``.

    Thin alias for ``database.connect_with_database``; whatever that call
    returns is handed back unchanged.
    """
    connection_result = database.connect_with_database(path)
    return connection_result
def _confirm_or_replace(suggestion, question, replacement_prompt):
    """Show ``suggestion`` to the user; return it, or what the user types instead.

    ``question`` is formatted with ``suggestion`` and printed. An empty
    answer, ``y`` or ``yes`` (case-insensitive) accepts the suggestion. Any
    other answer prints ``replacement_prompt`` and returns the next input line.
    """
    print(question.format(suggestion))
    choice = input()
    #@TODO: can be a while, where it is either yes or no, not enter
    if choice.lower() in ('', 'y', 'yes'):
        print('Confirmed')
        return suggestion
    print(replacement_prompt)
    return input()


def _load_known_vulnerability(connection, cursor, vulnerability_id, verbose,
                              description, published_timestamp, repo_url,
                              references):
    """Fill in missing fields of a vulnerability that is already in the database.

    Manually provided values win; anything left as ``None`` is taken from the
    stored row. Provided references are added to the database, otherwise the
    stored reference URLs are used.

    Returns:
        tuple: ``(description, preprocessed_description, published_timestamp,
        repo_url, references)``.
    """
    vulnerability = cursor.execute(
        "SELECT * FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id",
        {
            'vulnerability_id': vulnerability_id
        }).fetchone()

    # keep the manually provided value if it has been provided, otherwise select the one in the DB
    if repo_url is None:
        repo_url = vulnerability['repo_url']
    if published_timestamp is None:
        published_timestamp = vulnerability['published_date']
    if description is None:
        description = vulnerability['description']
        preprocessed_description = vulnerability['preprocessed_description']
    else:
        preprocessed_description = rank.simpler_filter_text(description)

    if references is not None:
        database.add_vulnerability_references_to_database(
            connection,
            vulnerability_id,
            references,
            driver=None,
            verbose=verbose)
    else:
        references = [
            nvd_reference['url'] for nvd_reference in cursor.execute(
                "SELECT url FROM vulnerability_references WHERE vulnerability_id = :vulnerability_id",
                {'vulnerability_id': vulnerability_id})
        ]

    return description, preprocessed_description, published_timestamp, repo_url, references


def _register_new_vulnerability(connection, cursor, vulnerability_id, verbose,
                                description, published_timestamp, repo_url,
                                references):
    """Collect the data for a new vulnerability and store it in the database.

    Missing fields are taken from ``database.extract_nvd_content``. When that
    lookup fails or yields ``None``, the user is prompted on stdin instead.

    Returns:
        tuple or None: ``(description, preprocessed_description,
        published_timestamp, repo_url, references)``, or ``None`` when the
        user answers the description prompt with ``SKIP!``.
    """
    if verbose:
        print("Vulnerability {} is a new vulnerability".format(
            vulnerability_id))

    # gather information for the new vulnerability if needed
    if description is None or published_timestamp is None or references is None:
        try:
            nvd_description, nvd_published_timestamp, nvd_references = database.extract_nvd_content(
                vulnerability_id)
        except Exception:  # if the vulnerability is not in the NVD
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            nvd_description, nvd_published_timestamp, nvd_references = None, None, None

        # The three prompts below replace earlier ``raise ValueError`` calls.
        if description is None:
            if nvd_description is None:
                print(
                    "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually."
                    .format(vulnerability_id))
                description = input()
                if description == "SKIP!":
                    print('skipping this one')
                    return None
            else:
                description = nvd_description

        if published_timestamp is None:
            if nvd_published_timestamp is None:
                print(
                    "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually."
                    .format(vulnerability_id))
                published_timestamp = input()
            else:
                published_timestamp = nvd_published_timestamp

        if references is None:
            if nvd_references is None:
                print(
                    "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability references manually (comma seperated)."
                    .format(vulnerability_id))
                references = input()
                references = references.split(',')
            else:
                references = nvd_references

    # determine the repo_url
    if repo_url is None:
        if verbose:
            print('Suggesting a repository URL')
        repo_url = _confirm_or_replace(
            rank.map_description_to_repository_url(connection,
                                                   vulnerability_id,
                                                   description),
            'Does the vulnerability affect the following repository: {} [Y/n]',
            'Provide the (GitHub) URL of the affected repository:')
    # Drop a trailing ".git" or "/" from the URL.
    repo_url = re.sub(r'\.git$|/$', '', repo_url)
    print('repo_url:', repo_url)

    # add to the database
    preprocessed_description = rank.simpler_filter_text(description)
    with connection:
        cursor.execute(
            "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
            {
                'vulnerability_id': vulnerability_id,
                'repo_url': repo_url,
                'description': description,
                'published_timestamp': str(published_timestamp),
                'preprocessed_description': preprocessed_description
            })

    # add the references to the database
    database.add_vulnerability_references_to_database(connection,
                                                      vulnerability_id,
                                                      references,
                                                      driver=None,
                                                      verbose=verbose)

    return description, preprocessed_description, published_timestamp, repo_url, references


def _fetch_references_content(connection, vulnerability_id, references):
    """Return the stored ``preprocessed_content`` of the given reference URLs.

    The query is parameterized (one ``?`` per URL) rather than assembled with
    ``str.format``, so URLs containing quotes cannot break or alter the SQL.
    """
    placeholders = ', '.join('?' for _ in references)
    query = (
        "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references WHERE url IN ({}) and vulnerability_id = ?"
        .format(placeholders))
    frame = pd.read_sql(query,
                        connection,
                        params=list(references) + [vulnerability_id])
    return tuple(frame.preprocessed_content)


def main(vulnerability_id,
         verbose,
         description=None,
         published_timestamp=None,
         repo_url=None,
         project_name=None,
         references=None,
         k=10,
         vulnerability_specific_scaling=False):
    """Build an advisory record for a vulnerability, rank its candidate commits and print the result.

    Both databases are closed when this function exits, including on the
    ``SKIP!`` early return and when an exception propagates.

    Args:
        vulnerability_id: identifier of the vulnerability to process.
        verbose: when truthy, progress messages are printed; also forwarded
            to the database and ``rank`` helpers.
        description: vulnerability description; when ``None`` it is taken
            from the database, the NVD, or the user.
        published_timestamp: publication timestamp; resolved like
            ``description`` when ``None``.
        repo_url: affected repository URL; when ``None`` the stored value is
            used for a known vulnerability, or a suggestion is confirmed
            interactively for a new one.
        project_name: affected project name; when ``None`` one is suggested
            and confirmed interactively.
        references: list of reference URLs; resolved like ``description``
            when ``None``.
        k: forwarded to ``advisory_record_to_output``.
        vulnerability_specific_scaling: forwarded to
            ``compute_ranking_vectors``. When it equals ``False`` the ranking
            vectors are scaled here: ``vulnerability_specific_columns`` with a
            freshly fitted ``MinMaxScaler``, ``universal_columns`` with the
            scaler loaded from ``min_max_scaler_path``.

    Returns:
        The populated ``rank.Advisory_record``, or ``None`` when the user
        skipped the vulnerability.
    """
    model = load(model_path)
    universal_columns_scaler = load(min_max_scaler_path)

    # databases are created in the notebook database_creation.ipynb

    # the vulnerabilities database
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        'data/prospector-vulnerabilities.db', verbose=verbose)

    # the commits database
    prospector_connection, prospector_cursor = database.connect_with_database(
        'data/prospector-commits.db', verbose=verbose)

    try:
        # if the vulnerability is already in the database
        # (``== False`` is kept on purpose: the helper's return type is not
        # visible here, and ``not x`` would treat ``None`` differently)
        if database.if_new_vulnerability(vulnerabilities_cursor,
                                         vulnerability_id) == False:
            details = _load_known_vulnerability(
                vulnerabilities_connection, vulnerabilities_cursor,
                vulnerability_id, verbose, description, published_timestamp,
                repo_url, references)
        else:
            details = _register_new_vulnerability(
                vulnerabilities_connection, vulnerabilities_cursor,
                vulnerability_id, verbose, description, published_timestamp,
                repo_url, references)
            if details is None:
                # The user answered "SKIP!".
                return None
        description, preprocessed_description, published_timestamp, repo_url, references = details

        # determine the project_name
        if project_name is None:
            if verbose:
                print('Suggesting a project name')
            project_name = _confirm_or_replace(
                rank.extract_project_name_from_repository_url(repo_url),
                'Does the vulnerability affect the following project: {} [Y/n]',
                'Provide the name of the affected project:')

        references_content = _fetch_references_content(
            vulnerabilities_connection, vulnerability_id, references)
        references_content = rank.extract_n_most_occurring_words(
            rank.remove_forbidden_words_from_string(
                string=' '.join(references_content),
                forbidden_words=rank.reference_stopwords +
                project_name.split(' ')),
            n=20)

        # @TODO: now adding all advisory references --> change to only using the provided references
        advisory_references = [
            advisory_reference['url']
            for advisory_reference in vulnerabilities_cursor.execute(
                "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id",
                {'vulnerability_id': vulnerability_id})
        ]

        # creating advisory record
        advisory_record = rank.Advisory_record(
            vulnerability_id,
            published_timestamp,
            repo_url,
            references,
            references_content,
            advisory_references,
            description,
            prospector_connection,
            preprocessed_vulnerability_description=preprocessed_description,
            relevant_tags=None,
            verbose=verbose,
            since=None,
            until=None)

        if verbose:
            print("\nThe following advisory record has been created:")
            print(" - Vulnerability ID: {}".format(advisory_record.id))
            print(" - Vulnerability description: {}".format(
                advisory_record.description))
            print(" - Vulnerability published timestamp: {}".format(
                advisory_record.published_timestamp))
            print(" - Affected project: {}".format(
                advisory_record.project_name))
            print(" - Affected repository: {}".format(
                advisory_record.repo_url))
            print(" - References content extracted: {}".format(
                advisory_record.references_content))

        if verbose:
            print("\nGathering candidate commits:")
        advisory_record.gather_candidate_commits()

        if verbose:
            print("\nComputing ranking vectors:")
        advisory_record.compute_ranking_vectors(vulnerability_specific_scaling)

        # ``== False`` kept on purpose, see the note on if_new_vulnerability.
        if vulnerability_specific_scaling == False:
            if verbose:
                print(
                    "\nscaling some columns using the pretrained scaler, and some vulnerability specific"
                )
            advisory_record.ranking_vectors[
                vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                    advisory_record.
                    ranking_vectors[vulnerability_specific_columns])
            advisory_record.ranking_vectors[
                universal_columns] = universal_columns_scaler.transform(
                    advisory_record.ranking_vectors[universal_columns])
        # NOTE(review): placed outside the scaling branch; confirm against
        # the original layout.
        advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                             inplace=True)

        if verbose:
            print("\nRanking the candidate commits:")
        advisory_record.ranked_candidate_commits = rank.rank_candidates(
            model, advisory_record.ranking_vectors)

        if verbose:
            print('\nResults:')
        advisory_record.ranking_vectors.set_index('commit_id', inplace=True)
        output = advisory_record_to_output(advisory_record,
                                           model,
                                           prospector_cursor,
                                           k=k)
        print(output)

        return advisory_record
    finally:
        # Close both databases on every exit path.
        vulnerabilities_connection.close()
        prospector_connection.close()
def test_database_creation(example_vulnerability,
                           example_vulnerability_git_repo):
    """Check that connecting to an in-memory database yields sqlite3 objects.

    The two fixture arguments are requested but not used in the body.
    """
    connection, cursor = database.connect_with_database(':memory:')
    # Close the connection even when an assertion fails.
    try:
        assert isinstance(connection, sqlite3.Connection)
        assert isinstance(cursor, sqlite3.Cursor)
    finally:
        connection.close()