def a(db_conn, vuln_list): query_base = \ "select sum(vulns) / count(distinct repo_id) from \ (select r.repo_id, count(distinct rc.username), count(distinct v.vuln_id) vulns \ from gh_repo r inner join \ gh_repo_contributors rc on r.repo_id = rc.repo_id \ inner join gh_file f on r.repo_id = f.repo_id \ left join gh_vuln v on f.file_id = v.file_id \ where (v.vuln_desc in (" + '\'' + '\', \''.join(vuln_list) + '\'' + ") or v.vuln_desc is null)\ and r.repo_size >= %s and r.repo_size < %s \ group by r.repo_id " t1 = 5 t2 = 10 avg_vulns_under = [] ra_under = [] avg_vulns_mid = [] ra_mid = [] avg_vulns_over = [] ra_over = [] for i in range(0,1000,100): rows = sd.select_many_query(db_conn, query_base +\ " having count(distinct rc.username) <= %s) a", [i, i+100, t1]) if rows[0]: ra_under.append(i); print "appending ", i avg_vulns_under.append(rows[0][0]) rows = sd.select_many_query(db_conn, query_base +\ " having count(distinct rc.username) > %s \ and count(distinct rc.username) <= %s) a", [i, i+100, t1, t2]) if rows[0]: ra_mid.append(i); print "appending ", i avg_vulns_mid.append(rows[0][0]) rows = sd.select_many_query(db_conn, query_base +\ " having count(distinct rc.username) > %s) a", [i, i+100, t2]) if rows[0]: ra_over.append(i); print "appending ", i avg_vulns_over.append(rows[0][0]) l_under, = plt.plot(ra_under, avg_vulns_under, label="under " + str(t1) + ", ".join(vuln_list)) l_mid, = plt.plot(ra_mid, avg_vulns_mid, label="between " + str(t1) + " and " + str(t2) + ", ".join(vuln_list)) l_over, = plt.plot(ra_over, avg_vulns_over, label="over " + str(t2) + ", ".join(vuln_list)) plt.legend(handles=[l_under, l_mid, l_over])#, ["over", "under"]) return [ra_under, avg_vulns_under, ra_mid, avg_vulns_mid, ra_over, avg_vulns_over , [l_under, l_mid, l_over]]
def get_analysis(output_filename, vuln_list, alt_list): # get database connection (local for now) outfile = open(output_filename, "wb"); id_types = ["r.repo_id", "f.file_id", "v.vuln_id"] id_names = ["number of repos", "number of files", "total occurrences"] fieldnames = ["vulnerability"] + id_names writer = csv.DictWriter(outfile, fieldnames=fieldnames); writer.writeheader(); total_list = vuln_list + alt_list db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); for vuln in total_list:# vuln_list + alt_list: tmp_result = {"vulnerability": vuln}; for i in range(0, len(id_types)): #get number of distinct {repo, file, vuln} containing a particular vulnerability #don't escape the id type rows = sd.select_many_query(db_conn, \ " select count(distinct " + id_types[i] + ") \ from \ gh_repo r \ inner join gh_file f on f.repo_id = r.repo_id \ inner join gh_vuln v on f.file_id = v.file_id \ where vuln_desc = %s ", \ [vuln]); for row in rows: tmp_result[id_names[i]] = row[0]; writer.writerow(tmp_result) #Next, we'll want to look into co-occurrence of different types of vulnerabilities for i in range(0, len(total_list)): for j in range(i+1, len(total_list)): rows = sd.select_many_query(db_conn, \ " select count(distinct r.repo_id) \ from gh_repo r \ inner join gh_file f1 on r.repo_id = f1.repo_id \ inner join gh_vuln v1 on f1.file_id = v1.file_id \ inner join gh_file f2 on r.repo_id = f2.repo_id \ inner join gh_vuln v2 on f2.file_id = v2.file_id \ and v1.vuln_desc = %s \ and v2.vuln_desc = %s ", [total_list[i], total_list[j]]); for row in rows: print total_list[i], ", ", total_list[j], ", ", row[0]
def get_repo_cuser_histogram1():
    """Display a 50-bin histogram of distinct contributors per repository."""
    conn = sd.get_connection("mysqlcreds-local-analysis.csv")
    result = sd.select_many_query(
        conn,
        "select repo_id, count(distinct username) from gh_repo_contributors group by repo_id order by count(distinct username);")
    # second column of each row is the per-repo contributor count
    per_repo_counts = [record[1] for record in result]
    counts, bin_edges, rectangles = plt.hist(per_repo_counts, 50)
    line = plt.plot(bin_edges)
    plt.show()
    plt.clf()
def get_graphs(vuln_list, alt_list):
    """Produce comparison graphs for strcpy vs. its safer variants, query
    contributor-vs-vulnerability counts per repo, and print two exploratory
    queries for manual use.

    vuln_list -- vulnerability descriptions (gh_vuln.vuln_desc) to analyze
    alt_list  -- safer alternatives (currently unused here beyond c1 calls)
    """
    db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
    c1(db_conn, ["strcpy"], ["strncpy"])
    c1(db_conn, ["strcpy"], ["strlcpy"])
    # Other analyses worth doing: correlation between repo size and vuln
    # types/frequency; trends over time (maybe via last-push date);
    # correlation between contributor count and vulnerability content.

    # Number of contributors vs. vuln count per repo.
    # fixed: bind each vulnerability as its own parameter -- the original
    # passed the whole quoted, comma-joined list as ONE %s value, which the
    # driver escapes into a single string literal that matches nothing.
    placeholders = ", ".join(["%s"] * len(vuln_list))
    rows = sd.select_many_query(db_conn,
        "select r.repo_id, count(distinct rc.username), count(distinct v.vuln_id) "
        "from gh_repo r inner join "
        "gh_repo_contributors rc on r.repo_id = rc.repo_id "
        "inner join gh_file f on r.repo_id = f.repo_id "
        "inner join gh_vuln v on f.file_id = v.file_id "
        "where v.vuln_desc in (" + placeholders + ") "
        "group by r.repo_id",
        list(vuln_list))

    quoted = "'" + "', '".join(vuln_list) + "'"
    # fixed: '%' binds tighter than '+', so the original formatted the query
    # with a lone quote character and appended the list afterwards;
    # parenthesizing the argument yields the intended SQL text.
    print (
        "select r.repo_id, count(distinct rc.username), count(distinct v.vuln_desc) "
        "from gh_repo r inner join "
        "gh_repo_contributors rc on r.repo_id = rc.repo_id "
        "inner join gh_file f on r.repo_id = f.repo_id "
        "inner join gh_vuln v on f.file_id = v.file_id "
        "where v.vuln_desc in (%s) "
        "group by r.repo_id" % quoted)

    # Proportion of repos a user contributes to that have vulns vs. none.
    # fixed: a two-%s format string needs a tuple argument -- the original
    # passed a single quote char and raised TypeError at runtime. Also fixed
    # the join key: every other query joins gh_vuln on v.file_id, not
    # v.vuln_id.
    print (
        "select rcv.username, count(distinct rcv.repo_id), count(distinct rcnv.repo_id) "
        "from gh_repo_contributors rcv "
        "inner join "
        "gh_repo_contributors rcnv on rcv.username = rcnv.username "
        "where rcnv.repo_id not in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.file_id "
        "where v.vuln_desc in (%s)) "
        "and rcv.repo_id in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.file_id where v.vuln_desc in (%s)) "
        % (quoted, quoted))
def execute_query_over_range(db_conn, query, args):
    """Run *query* once per repo-size bucket ([0,100), [100,200), ..., [900,1000)),
    prepending the bucket bounds to *args*, and collect per-bucket statistics
    of the second result column.

    Returns [means, standard deviations, bucket lower bounds].
    """
    means = []
    deviations = []
    bucket_starts = []
    for lower in range(0, 1000, 100):
        result = sd.select_many_query(db_conn, query, [lower, lower + 100] + args)
        # second column of each row is the value being aggregated
        values = [record[1] for record in result]
        means.append(np.average(values))
        deviations.append(np.std(values))
        bucket_starts.append(lower)
    return [means, deviations, bucket_starts]
def get_repo_cuser_histogram2(): db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); # get number of repos with each number of different contributors rows = sd.select_many_query(db_conn, \ "select count(distinct repo_id), num_contribs from ( select repo_id, count(distinct username) num_contribs from gh_repo_contributors group by repo_id order by count(distinct username) ) a group by num_contribs") repos = [] contributing_users = [] for i in range(0, len(rows)): repos.append(rows[i][0]) contributing_users.append(rows[i][1]) print rows[i][0], rows[i][1] print "3-10", np.sum(repos[2:10]) print "11-20", np.sum(repos[10:20]) print "21-30", np.sum(repos[20:31]) print ">1", np.sum(repos[1:-1]) # plt.plot(repos_contributed_to, users) # plt.show() return[repos, contributing_users]
def get_user_crepo_histogram(): # get number of users who have each number of different repos they contribute to db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); rows = sd.select_many_query(db_conn, \ "select count(distinct username), repo_contribs from ( select username, count(distinct repo_id) repo_contribs from gh_repo_contributors group by username order by count(distinct repo_id) ) a group by repo_contribs"); users = [] repos_contributed_to = [] for i in range(0, len(rows)): users.append(rows[i][0]) repos_contributed_to.append(rows[i][1]) print rows[i][0], rows[i][1] # plt.plot(repos_contributed_to, users) print "3-10", np.sum(users[2:10]) print "11-20", np.sum(users[10:20]) print "21-30", np.sum(users[20:-1]) print ">1", np.sum(users[1:-1]) # plt.show() return[users, repos_contributed_to]
def get_analysis_threshold(vuln_list, alt_list, threshconds): db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); # for vuln in vuln_list + alt_list: # tmp_result = {"vulnerability": vuln}; # for thresh in threshconds: # rows = sd.select_many_query(db_conn, \ #" select count(distinct " + "repo_id" + ") \ # from \ # (select r.repo_id \ # from gh_repo r \ # inner join gh_file f on f.repo_id = r.repo_id \ # inner join gh_vuln v on f.file_id = v.file_id \ # inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \ # where vuln_desc = %s \ # group by r.repo_id \ # having count(distinct rc.username) " + thresh + \ # ") a", \ # [vuln]); # for row in rows: # print vuln, row[0], thresh #Next, we'll want to look into co-occurrence of different types of vulnerabilities total_list = vuln_list + alt_list; for i in range(0, len(total_list)): for j in range(i+1, len(total_list)): for thresh in threshconds: rows = sd.select_many_query(db_conn, \ " select count(distinct repo_id) \ from \ (select distinct rc.repo_id from \ gh_repo r \ inner join gh_file f1 on r.repo_id = f1.repo_id \ inner join gh_vuln v1 on f1.file_id = v1.file_id \ inner join gh_file f2 on r.repo_id = f2.repo_id \ inner join gh_vuln v2 on f2.file_id = v2.file_id \ inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \ and v1.vuln_desc = %s \ and v2.vuln_desc = %s \ group by rc.repo_id \ having count(distinct rc.username) " + thresh +\ ") a", \ [total_list[i], total_list[j]]); for row in rows: print total_list[i], ", ", total_list[j], ", ", row[0], ", ", thresh
# Script: walk gh_repo rows and fetch each repo's contributors from the
# GitHub API (contributors_url), to be stored in gh_repo_contributors.
import save_data as sd
import requests
import json
import util
import sys

# Whether to process repos in repo_id order from the last stored contributor
# onwards (True) or to pick up only repos with no contributors yet (False).
in_order = False;
# NOTE(review): 'os' is used here but never imported in this view -- add
# 'import os' (or confirm it is imported elsewhere in the file).
# NOTE(review): if P_TOKEN is unset, token is None and the
# 'token ' + token concatenation below raises TypeError.
token = os.environ.get('P_TOKEN', None)
#db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv')
db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv')
rows = None;
if (in_order):
    # resume from the highest repo_id already recorded in gh_repo_contributors
    rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id")
else:
    #get the stragglers
    rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id")
header = {'Authorization': 'token ' + token}
for row in rows:
    repo_id = row[0];
    # progress indicator every 10th repo_id
    if (repo_id % 10 == 0):
        print "repo_id ", repo_id
    query_url = row[1];
    owner_name = row[2];
    try:
        r = requests.get(query_url, headers=header)
        # fall back to r.content when r.text is empty
        item = json.loads(r.text or r.content)
        for thing in item:
            contributions = thing['contributions']
            username = thing['login']
            # NOTE(review): the script appears truncated here -- this 'try'
            # block has no matching except/finally in this view, and the
            # fetched contributions/username are not yet stored.
# Script: select gh_repo rows whose contributors still need fetching from
# the GitHub API (appears to be a cleaned-up duplicate of the script above).
import save_data as sd
import requests
import json
import util
import sys

# Whether to process repos in repo_id order from the last stored contributor
# onwards (True) or to pick up only repos with no contributors yet (False).
in_order = False
# NOTE(review): 'os' is used here but never imported in this view -- add
# 'import os' (or confirm it is imported elsewhere in the file).
# NOTE(review): if P_TOKEN is unset, token is None and the
# 'token ' + token concatenation below raises TypeError.
token = os.environ.get('P_TOKEN', None)
#db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv')
db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv')
rows = None
if (in_order):
    # resume from the highest repo_id already recorded in gh_repo_contributors
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id"
    )
else:
    #get the stragglers
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id"
    )
header = {'Authorization': 'token ' + token}
for row in rows:
    repo_id = row[0]
    # progress indicator every 10th repo_id
    if (repo_id % 10 == 0):
        print "repo_id ", repo_id
    query_url = row[1]
    owner_name = row[2]
    # NOTE(review): the loop body appears truncated here -- query_url and
    # owner_name are bound but never used in this view.