def import_to_database (json_dict, credentials_file): # number_of_files = 0 db_conn = sd.get_connection(credentials_file); # start = json_dict["start"] # end = json_dict["end"] # json_dict = json_dict["result"] for username, repo in json_dict.iteritems(): sd.save_user_data(db_conn, username); #print username for reponame, repo_dict in repo.iteritems(): date_created = repo_dict["created_at"] repo_size = repo_dict["size"] last_pushed = repo_dict["pushed_at"] contributors_url = repo_dict["contributors_url"] description = repo_dict["description"] repo_id = sd.save_repo_data(db_conn, reponame, date_created, username, repo_size, last_pushed, repo_dict["url"], repo_dict["forks_url"], contributors_url, description, repo_dict["stargazers"], repo_dict["forks"]); file_list = repo_dict["files"]; for file_entry in file_list: #print file_info for filename, datapoints in file_entry.iteritems(): file_id = sd.save_file_data(db_conn, filename, repo_id, "") for thing in datapoints: line = thing["line"] code_sample = thing["code_sample"] sd.save_vulnerability_data(db_conn, file_id, line, code_sample);
def get_repo_cuser_histogram1(): #contributors per repo db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); rows = sd.select_many_query(db_conn, \ "select repo_id, count(distinct username) from gh_repo_contributors group by repo_id order by count(distinct username);") contributors = [] for row in rows: contributors.append(row[1]) n, bins, patches = plt.hist(contributors, 50) l = plt.plot(bins) plt.show() plt.clf()
def get_graphs(vuln_list, alt_list): db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); #b(db_conn, vuln_list) #for vuln in vuln_list: #b(db_conn, [vuln]) c1(db_conn, ["strcpy"], ["strncpy"]) c1(db_conn, ["strcpy"], ["strlcpy"]) #also correlation between repo size and vuln types/frequency #trends over time? not sure how to measure. maybe time last pushed #also correlation between number of contributors and vulnerability content #okay, other analysis we'd like to do: #ummmm....shit this is not going well # we could look at the number of contributors vs. vuln count/vuln types - that would be a nice thing to graph #rows = sd.select_many_query(db_conn, \ rows = sd.select_many_query(db_conn, \ "select r.repo_id, count(distinct rc.username), count(distinct v.vuln_id) \ from gh_repo r inner join \ gh_repo_contributors rc on r.repo_id = rc.repo_id \ inner join gh_file f on r.repo_id = f.repo_id \ inner join gh_vuln v on f.file_id = v.file_id \ where v.vuln_desc in (%s) \ group by r.repo_id \ ", ['\'' + '\', \''.join(vuln_list) + '\'']) #rows = sd.select_many_query(db_conn, \ print \ "select r.repo_id, count(distinct rc.username), count(distinct v.vuln_desc) \ from gh_repo r inner join \ gh_repo_contributors rc on r.repo_id = rc.repo_id \ inner join gh_file f on r.repo_id = f.repo_id \ inner join gh_vuln v on f.file_id = v.file_id \ where v.vuln_desc in (%s) \ group by r.repo_id" % '\'' + '\', \''.join(vuln_list) + '\'' #group by r.repo_id", '\'' + '\', \''.join(vuln_list) + '\'') # could look at proportion of repos a user contributes to that have vulns vs no-vulns # create a view that has gh_vuln with the vulnerabilities, and another one that has the safe versions print \ "select rcv.username, count(distinct rcv.repo_id), count(distinct rcnv.repo_id) \ from gh_repo_contributors rcv \ inner join \ gh_repo_contributors rcnv on rcv.username = rcnv.username \ where rcnv.repo_id not in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.vuln_id where v.vuln_desc in (%s)) \ and rcv.repo_id in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.vuln_id where v.vuln_desc in (%s)) \ " % '\'' + '\', \''.join(vuln_list) + '\'', '\'' + '\', \''.join(vuln_list) + '\''
def get_repo_cuser_histogram2(): db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); # get number of repos with each number of different contributors rows = sd.select_many_query(db_conn, \ "select count(distinct repo_id), num_contribs from ( select repo_id, count(distinct username) num_contribs from gh_repo_contributors group by repo_id order by count(distinct username) ) a group by num_contribs") repos = [] contributing_users = [] for i in range(0, len(rows)): repos.append(rows[i][0]) contributing_users.append(rows[i][1]) print rows[i][0], rows[i][1] print "3-10", np.sum(repos[2:10]) print "11-20", np.sum(repos[10:20]) print "21-30", np.sum(repos[20:31]) print ">1", np.sum(repos[1:-1]) # plt.plot(repos_contributed_to, users) # plt.show() return[repos, contributing_users]
def get_user_crepo_histogram(): # get number of users who have each number of different repos they contribute to db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); rows = sd.select_many_query(db_conn, \ "select count(distinct username), repo_contribs from ( select username, count(distinct repo_id) repo_contribs from gh_repo_contributors group by username order by count(distinct repo_id) ) a group by repo_contribs"); users = [] repos_contributed_to = [] for i in range(0, len(rows)): users.append(rows[i][0]) repos_contributed_to.append(rows[i][1]) print rows[i][0], rows[i][1] # plt.plot(repos_contributed_to, users) print "3-10", np.sum(users[2:10]) print "11-20", np.sum(users[10:20]) print "21-30", np.sum(users[20:-1]) print ">1", np.sum(users[1:-1]) # plt.show() return[users, repos_contributed_to]
def get_analysis_threshold(vuln_list, alt_list, threshconds): db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); # for vuln in vuln_list + alt_list: # tmp_result = {"vulnerability": vuln}; # for thresh in threshconds: # rows = sd.select_many_query(db_conn, \ #" select count(distinct " + "repo_id" + ") \ # from \ # (select r.repo_id \ # from gh_repo r \ # inner join gh_file f on f.repo_id = r.repo_id \ # inner join gh_vuln v on f.file_id = v.file_id \ # inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \ # where vuln_desc = %s \ # group by r.repo_id \ # having count(distinct rc.username) " + thresh + \ # ") a", \ # [vuln]); # for row in rows: # print vuln, row[0], thresh #Next, we'll want to look into co-occurrence of different types of vulnerabilities total_list = vuln_list + alt_list; for i in range(0, len(total_list)): for j in range(i+1, len(total_list)): for thresh in threshconds: rows = sd.select_many_query(db_conn, \ " select count(distinct repo_id) \ from \ (select distinct rc.repo_id from \ gh_repo r \ inner join gh_file f1 on r.repo_id = f1.repo_id \ inner join gh_vuln v1 on f1.file_id = v1.file_id \ inner join gh_file f2 on r.repo_id = f2.repo_id \ inner join gh_vuln v2 on f2.file_id = v2.file_id \ inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \ and v1.vuln_desc = %s \ and v2.vuln_desc = %s \ group by rc.repo_id \ having count(distinct rc.username) " + thresh +\ ") a", \ [total_list[i], total_list[j]]); for row in rows: print total_list[i], ", ", total_list[j], ", ", row[0], ", ", thresh
def get_analysis(output_filename, vuln_list, alt_list): # get database connection (local for now) outfile = open(output_filename, "wb"); id_types = ["r.repo_id", "f.file_id", "v.vuln_id"] id_names = ["number of repos", "number of files", "total occurrences"] fieldnames = ["vulnerability"] + id_names writer = csv.DictWriter(outfile, fieldnames=fieldnames); writer.writeheader(); total_list = vuln_list + alt_list db_conn = sd.get_connection("mysqlcreds-local-analysis.csv"); for vuln in total_list:# vuln_list + alt_list: tmp_result = {"vulnerability": vuln}; for i in range(0, len(id_types)): #get number of distinct {repo, file, vuln} containing a particular vulnerability #don't escape the id type rows = sd.select_many_query(db_conn, \ " select count(distinct " + id_types[i] + ") \ from \ gh_repo r \ inner join gh_file f on f.repo_id = r.repo_id \ inner join gh_vuln v on f.file_id = v.file_id \ where vuln_desc = %s ", \ [vuln]); for row in rows: tmp_result[id_names[i]] = row[0]; writer.writerow(tmp_result) #Next, we'll want to look into co-occurrence of different types of vulnerabilities for i in range(0, len(total_list)): for j in range(i+1, len(total_list)): rows = sd.select_many_query(db_conn, \ " select count(distinct r.repo_id) \ from gh_repo r \ inner join gh_file f1 on r.repo_id = f1.repo_id \ inner join gh_vuln v1 on f1.file_id = v1.file_id \ inner join gh_file f2 on r.repo_id = f2.repo_id \ inner join gh_vuln v2 on f2.file_id = v2.file_id \ and v1.vuln_desc = %s \ and v2.vuln_desc = %s ", [total_list[i], total_list[j]]); for row in rows: print total_list[i], ", ", total_list[j], ", ", row[0]
import save_data as sd random_9726 = [359, 1522 , 417 , 1339 , 6808 , 3094 , 3025 , 4695 , 59 , 6650 , 5056 , 9073 , 6499 , 6125 , 3434 , 6462 , 9724 , 468 , 2358 , 2448 , 5427 , 2362 , 5860 , 2794 , 4084 , 5276 , 441 , 2123 , 6032 , 2257 , 8756 , 5335 , 5468 , 6741 , 6832 , 9476 , 7305 , 8189 , 1396 , 5035 , 9271 , 8343 , 502 , 5434 , 2283 , 193 , 1843 , 7542 , 4092 , 2165 , 5719 , 7142 , 3347 , 3261 , 6750 , 7813 , 648 , 6529 , 2951 , 7358 , 3550 , 9059 , 38 , 9681 , 8159 , 7781 , 8466 , 3381 , 6787 , 2861 , 4675 , 5371 , 1703 , 2037 , 7414 , 8244 , 4965 , 8756 , 4256 , 6754 , 8444 , 5588 , 7644 , 8792 , 4994 , 3905 , 5000 , 3857 , 7993 , 5832 , 7250 , 595 , 3971 , 8176 , 667 , 9076 , 5708 , 6313 , 5552 , 8291 , 5403 , 6252 , 6837 , 7937 , 5238 , 943 , 2647 , 5996 , 2073 , 6752 , 7306 , 5188 , 7395 , 2757 , 4859 , 46 , 2958 , 1899 , 172 , 6092 , 5576 , 7534 , 8569 , 1952 , 8729 , 3766 , 7580 , 6666 , 865 , 877 , 2080 , 1636 , 6501 , 6962 , 5666 , 2206 , 3230 , 4908 , 3589 , 2891 , 3826 , 4613 , 8369 , 5826 , 660 , 6396 , 1232 , 2190 , 3327 , 2092 , 7842 , 9223 , 401 , 6442 , 8430 , 1395 , 7630 , 5149 , 2377 , 2542 , 5542 , 7299 , 8386 , 5637 , 8920 , 8544 , 3539 , 4293 , 862 , 4095 , 8600 , 3880 , 2564 , 3651 , 8991 , 989 , 4494 , 2335 , 4806 , 1229 , 6423 , 1655 , 4391 , 1548 , 7227 , 5677 , 2572 , 7706 , 5624 , 3302 , 4426 , 4555 , 2372 , 3588 , 6377 , 3898 , 9130 , 8617 , 8342 , 1225 , 7815 , 5441 , 283 , 4030 , 7496 , 1148 , 6154 , 9024 , 6156 , 6714 , 3974 , 2250 , 2711 , 2303 , 4768 , 7683 , 8500 , 8766 , 3106 , 5065 , 7450 , 9115 , 618 , 5590 , 8 , 7395 , 7428 , 652 , 5559 , 2379 , 5142 , 2599 , 6081 , 2561 , 7093 , 6044 , 8122 , 6179 , 3274 , 1069 , 7195 , 9056 , 2477 , 2305 , 6371 , 6870 , 8374 , 6007 , 8065 , 5604 , 8323 , 5392 , 4965 , 7111 , 8558 , 4291 , 7945 , 7565 , 7334 , 5855 , 1836 , 1012 , 9143 , 5610 , 1359 , 5229 , 1160 , 9302 , 7737 , 8752 , 2479 , 4893 , 407 , 6172 , 5732 , 8332 , 4361 , 9058 , 2117 , 6375 , 9046 , 9141 , 1258 , 4558 , 4420 , 6664 , 6926 , 3636 , 3071 , 2938 , 197 , 131 , 8684 , 6523 , 2900 , 3137 , 6287 , 1498 , 2642 , 5874 , 3499 , 9509 , 8037 , 4703 , 497 , 1018 , 505 , 7005 , 6902 , 5486 , 4359 , 8851 , 1978 , 8497 , 692 , 1035 , 8420 , 8636 , 3232 , 9052 , 4108 , 9410 , 544 , 9466 , 9053 , 1238 , 8327 , 9277 , 4238 , 8445 , 5966 , 2153 , 6909 , 5880 , 3288 , 4216 , 4964 , 2580 , 5521 , 9572 , 1460 , 3149 , 5199 , 3562 , 8010 , 2989 , 2445 , 3549 , 5071 , 7558 , 6256 , 3608 , 270 , 5482 , 3456 , 2099 , 786 , 2606 , 1733 , 265 , 1895 , 7791 , 7346 , 7061 , 5558 , 9555 , 8725 , 3046 , 3338 , 203 , 214 , 4688 , 4375 , 9638 , 2446 , 4562 , 2025 , 1121 , 5823 , 2490 , 8202 , 6657 , 2596 , 2846 , 1931 , 5486 , 4234 , 7788 , 3678 , 5243 , 2352 , 4913 , 9390 , 6013 , 1290 , 8361 , 5358 , 8955 , 7248 , 2416 , 8241 , 7943 , 8939 , 1518 , 5481 , 6056 , 1888 , 3988 , 2214 , 8479 , 2776 , 7251 , 3457 , 310 , 1550 , 8755 , 8541 , 3951 , 2006 , 4785 , 6801 , 8388 , 923 , 9134 , 3351 , 7454 , 5940 , 1021 , 5584 , 7098 , 3160 , 7289 , 9217 , 5954 , 3456 , 5712 , 6573 , 8991 , 5902 , 5250 , 7571 , 4825 , 1968 , 5579 , 4294 , 1506 , 643 , 3026 , 2697 , 9292 , 6412 , 4465 , 4119 , 3852 , 1156 , 530 , 5249 , 7651 , 8895 , 8188 , 5496 , 8031 , 7114 , 9330 , 2095 , 410 , 2335 , 7028 , 2439 , 5128 , 6628 , 2509 , 8094 , 7840 , 3264 , 7118 , 5925 , 3224 , 3062 , 4850 , 6031 , 8493 , 6581 , 6813 , 8165 , 3310 , 6446 , 3421 , 8540 , 4606 , 2 , 9506 , 6197 , 7631 , 2025 , 2209 , 3298 , 4446 , 3840 , 3415] #500 print len(random_9726) db_conn = sd.get_connection("mysqlcreds-sample-c.csv") cursor = db_conn.cursor() for rand_id in random_9726: id_query = "select n.repo_id from (select distinct r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and v.vuln_desc = 'strcpy' order by r.repo_id limit " +str(rand_id) + ") n order by n.repo_id desc limit 1" cursor.execute(id_query); repo_id = int(cursor.fetchall()[0][0]) #print str(rand_id) + ": " + str(repo_id) data_query = "select owner_name, repo_name, filename, code_sample, line_number, date_created, r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and r.repo_id =" + str(repo_id) + " limit 100" cursor.execute(data_query) data = cursor.fetchall() owner_name = data[0][0] repo_name = data[0][1] filename = data[0][2] code_sample = data[0][3] line_number = data[0][4] date_created = data[0][5] URL1 = "https://github.com/" + str(owner_name) + "/" + str(repo_name) #print URL1 URL2 = URL1 + "/blob/master/" + str(filename.split("/",1)[1]) #print URL2
import save_data as sd import requests import json import util import sys in_order = False; token = os.environ.get('P_TOKEN', None) #db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv') db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv') rows = None; if (in_order): rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id") else: #get the stragglers rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id") header = {'Authorization': 'token ' + token} for row in rows: repo_id = row[0]; if (repo_id % 10 == 0): print "repo_id ", repo_id query_url = row[1]; owner_name = row[2]; try: r = requests.get(query_url, headers=header) item = json.loads(r.text or r.content) for thing in item: contributions = thing['contributions'] username = thing['login']
import save_data as sd import requests import json import util import sys in_order = False token = os.environ.get('P_TOKEN', None) #db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv') db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv') rows = None if (in_order): rows = sd.select_many_query( db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id" ) else: #get the stragglers rows = sd.select_many_query( db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id" ) header = {'Authorization': 'token ' + token} for row in rows: repo_id = row[0] if (repo_id % 10 == 0): print "repo_id ", repo_id query_url = row[1] owner_name = row[2]
4688, 4375, 9638, 2446, 4562, 2025, 1121, 5823, 2490, 8202, 6657, 2596, 2846, 1931, 5486, 4234, 7788, 3678, 5243, 2352, 4913, 9390, 6013, 1290, 8361, 5358, 8955, 7248, 2416, 8241, 7943, 8939, 1518, 5481, 6056, 1888, 3988, 2214, 8479, 2776, 7251, 3457, 310, 1550, 8755, 8541, 3951, 2006, 4785, 6801, 8388, 923, 9134, 3351, 7454, 5940, 1021, 5584, 7098, 3160, 7289, 9217, 5954, 3456, 5712, 6573, 8991, 5902, 5250, 7571, 4825, 1968, 5579, 4294, 1506, 643, 3026, 2697, 9292, 6412, 4465, 4119, 3852, 1156, 530, 5249, 7651, 8895, 8188, 5496, 8031, 7114, 9330, 2095, 410, 2335, 7028, 2439, 5128, 6628, 2509, 8094, 7840, 3264, 7118, 5925, 3224, 3062, 4850, 6031, 8493, 6581, 6813, 8165, 3310, 6446, 3421, 8540, 4606, 2, 9506, 6197, 7631, 2025, 2209, 3298, 4446, 3840, 3415 ] #500 print len(random_9726) db_conn = sd.get_connection("mysqlcreds-sample-c.csv") cursor = db_conn.cursor() for rand_id in random_9726: id_query = "select n.repo_id from (select distinct r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and v.vuln_desc = 'strcpy' order by r.repo_id limit " + str( rand_id) + ") n order by n.repo_id desc limit 1" cursor.execute(id_query) repo_id = int(cursor.fetchall()[0][0]) #print str(rand_id) + ": " + str(repo_id) data_query = "select owner_name, repo_name, filename, code_sample, line_number, date_created, r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and r.repo_id =" + str( repo_id) + " limit 100" cursor.execute(data_query) data = cursor.fetchall()