Example #1
def import_to_database(json_dict, credentials_file):
#	number_of_files = 0
	db_conn = sd.get_connection(credentials_file)
#	start = json_dict["start"]
#	end = json_dict["end"]
#	json_dict = json_dict["result"]
	for username, repo in json_dict.iteritems():
		sd.save_user_data(db_conn, username)
		#print username
		for reponame, repo_dict in repo.iteritems():
			date_created = repo_dict["created_at"]
			repo_size = repo_dict["size"]
			last_pushed = repo_dict["pushed_at"]
			contributors_url = repo_dict["contributors_url"]
			description = repo_dict["description"]
			repo_id = sd.save_repo_data(db_conn, reponame, date_created, username, repo_size, last_pushed, repo_dict["url"], repo_dict["forks_url"], contributors_url, description, repo_dict["stargazers"], repo_dict["forks"])

			file_list = repo_dict["files"]
			for file_entry in file_list:
				#print file_entry
				for filename, datapoints in file_entry.iteritems():
					file_id = sd.save_file_data(db_conn, filename, repo_id, "")
					for thing in datapoints:
						line = thing["line"]
						code_sample = thing["code_sample"]
						sd.save_vulnerability_data(db_conn, file_id, line, code_sample)
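
# Usage sketch (added; not part of the original example). The keys read above imply the
# scraper output is a dict of username -> repo name -> repo metadata plus a "files" list.
# Assuming that output was dumped to JSON (the filename here is hypothetical), the
# importer could be driven like this:
import json

with open("scrape_results.json") as fp:
	import_to_database(json.load(fp), "mysqlcreds-local-analysis.csv")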
Example #2
def get_repo_cuser_histogram1():
  # contributors per repo
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
  rows = sd.select_many_query(db_conn, \
  "select repo_id, count(distinct username) from gh_repo_contributors group by repo_id order by count(distinct username);")
  contributors = []
  for row in rows:
    contributors.append(row[1])

  n, bins, patches = plt.hist(contributors, 50)
  l = plt.plot(bins)
  plt.show()
  plt.clf()
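
# Module-level imports assumed by this and the following analysis examples (added for
# completeness; the listing shows only function bodies, which use these names directly):
import csv

import matplotlib.pyplot as plt
import numpy as np

import save_data as sd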
Example #3
def get_graphs(vuln_list, alt_list):
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
  #b(db_conn, vuln_list)
  #for vuln in vuln_list:
    #b(db_conn, [vuln])
  c1(db_conn, ["strcpy"], ["strncpy"])
  c1(db_conn, ["strcpy"], ["strlcpy"])
  

  #also correlation between repo size and vuln types/frequency
  #trends over time? not sure how to measure. maybe time last pushed
  #also correlation between number of contributors and vulnerability content

# other analysis we'd like to do:
# - number of contributors vs. vuln count / vuln types (worth graphing)
  # contributors vs. vulnerability count per repo; build one %s placeholder per
  # vulnerability so the in (...) list is parameterized correctly
  in_list = ", ".join(["%s"] * len(vuln_list))
  rows = sd.select_many_query(db_conn, \
"select r.repo_id, count(distinct rc.username), count(distinct v.vuln_id) \
from gh_repo r inner join \
gh_repo_contributors rc on r.repo_id = rc.repo_id \
inner join gh_file f on r.repo_id = f.repo_id \
inner join gh_vuln v on f.file_id = v.file_id \
where v.vuln_desc in (" + in_list + ") \
group by r.repo_id \
", vuln_list)



  #rows = sd.select_many_query(db_conn, \
  print \
"select r.repo_id, count(distinct rc.username), count(distinct v.vuln_desc) \
from gh_repo r inner join \
gh_repo_contributors rc on r.repo_id = rc.repo_id \
inner join gh_file f on r.repo_id = f.repo_id \
inner join gh_vuln v on f.file_id = v.file_id \
where v.vuln_desc in (%s) \
group by r.repo_id" % '\'' + '\', \''.join(vuln_list) + '\''
#group by r.repo_id", '\'' + '\', \''.join(vuln_list) + '\'')

# could look at proportion of repos a user contributes to that have vulns vs no-vulns

# create a view that has gh_vuln with the vulnerabilities, and another one that has the safe versions (see the sketch after this example)
  print \
"select rcv.username, count(distinct rcv.repo_id), count(distinct rcnv.repo_id) \
from gh_repo_contributors rcv \
inner join \
gh_repo_contributors rcnv on rcv.username = rcnv.username \
where rcnv.repo_id not in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.file_id where v.vuln_desc in (%s)) \
and rcv.repo_id in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.file_id where v.vuln_desc in (%s)) \
" % ("'" + "', '".join(vuln_list) + "'", "'" + "', '".join(vuln_list) + "'")
Example #4
def get_repo_cuser_histogram2():
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
  # get number of repos with each number of different contributors
  rows = sd.select_many_query(db_conn, \
"select count(distinct repo_id), num_contribs from ( select repo_id, count(distinct username) num_contribs from gh_repo_contributors group by repo_id order by count(distinct username) ) a group by num_contribs")
  repos = []
  contributing_users = []
  for i in range(0, len(rows)):
    repos.append(rows[i][0])
    contributing_users.append(rows[i][1])
    print rows[i][0], rows[i][1]
  print "3-10", np.sum(repos[2:10])
  print "11-20", np.sum(repos[10:20])
  print "21-30", np.sum(repos[20:31])
  print ">1", np.sum(repos[1:-1])
#  plt.plot(contributing_users, repos)
#  plt.show()
  return [repos, contributing_users]
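
# Plot sketch (added; not from the original): the commented-out plt calls above suggest the
# distribution returned here was meant to be graphed, for example:
import matplotlib.pyplot as plt

repos, contribs = get_repo_cuser_histogram2()
plt.bar(contribs, repos)  # x: distinct contributors per repo, y: number of repos
plt.xlabel("contributors per repo")
plt.ylabel("number of repos")
plt.show()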
Example #5
def get_user_crepo_histogram():
  # get number of users who have each number of different repos they contribute to
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
  rows = sd.select_many_query(db_conn, \
"select count(distinct username), repo_contribs from ( select username, count(distinct repo_id) repo_contribs from gh_repo_contributors group by username order by count(distinct repo_id) ) a group by repo_contribs")
  users = []
  repos_contributed_to = []
  for i in range(0, len(rows)):
    users.append(rows[i][0])
    repos_contributed_to.append(rows[i][1])
    print rows[i][0], rows[i][1]
#  plt.plot(repos_contributed_to, users)
  # bucket sums below assume every repo count from 1..N occurs, so index i maps to i+1 repos
  print "3-10", np.sum(users[2:10])
  print "11-20", np.sum(users[10:20])
  print "21-30", np.sum(users[20:30])
  print ">1", np.sum(users[1:])
#  plt.show()
  return [users, repos_contributed_to]
Example #6
def get_analysis_threshold(vuln_list, alt_list, threshconds):
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
#  for vuln in vuln_list + alt_list:
#    tmp_result = {"vulnerability": vuln};
#    for thresh in threshconds:
#      rows = sd.select_many_query(db_conn, \
#" select count(distinct " + "repo_id" + ")  \
# from \
#   (select r.repo_id \
#   from gh_repo r  \
#   inner join gh_file f on f.repo_id = r.repo_id  \
#   inner join gh_vuln v on f.file_id = v.file_id  \
#   inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \
#   where vuln_desc = %s \
#   group by r.repo_id \
#   having count(distinct rc.username) " + thresh + \
#   ") a", \
#   [vuln]);
#      for row in rows:
#        print vuln, row[0], thresh

  #Next, we'll want to look into co-occurrence of different types of vulnerabilities
  total_list = vuln_list + alt_list
  for i in range(0, len(total_list)):
    for j in range(i+1, len(total_list)):
      for thresh in threshconds:
        rows = sd.select_many_query(db_conn, \
  "  select count(distinct repo_id) \
    from \
    (select distinct rc.repo_id from \
    gh_repo r \
    inner join gh_file f1 on r.repo_id = f1.repo_id \
    inner join gh_vuln v1 on f1.file_id = v1.file_id \
    inner join gh_file f2 on r.repo_id = f2.repo_id \
    inner join gh_vuln v2 on f2.file_id = v2.file_id \
    inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \
    and v1.vuln_desc = %s \
    and v2.vuln_desc = %s \
    group by rc.repo_id \
    having count(distinct rc.username) " + thresh +\
    ") a", \
    [total_list[i], total_list[j]])
        for row in rows:
          print total_list[i], ", ", total_list[j], ", ", row[0], ", ", thresh
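
# Usage sketch (added; not from the original): entries of threshconds are spliced verbatim
# after "having count(distinct rc.username)", so they are comparison fragments. The lists
# mirror the strcpy vs. strncpy/strlcpy pairs used above; the thresholds are illustrative.
get_analysis_threshold(["strcpy"], ["strncpy", "strlcpy"], ["> 1", "> 5", "> 10"])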
Example #7
def get_analysis(output_filename, vuln_list, alt_list):
  # get database connection (local for now)
  outfile = open(output_filename, "wb")
  id_types = ["r.repo_id", "f.file_id", "v.vuln_id"]
  id_names = ["number of repos", "number of files", "total occurrences"]
  fieldnames = ["vulnerability"] + id_names
  writer = csv.DictWriter(outfile, fieldnames=fieldnames)
  writer.writeheader()

  total_list = vuln_list + alt_list
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv")
  for vuln in total_list:
    tmp_result = {"vulnerability": vuln}
    for i in range(0, len(id_types)):
#get number of distinct {repo, file, vuln} containing a particular vulnerability
#don't escape the id type
      rows = sd.select_many_query(db_conn, \
" select count(distinct " + id_types[i] + ")  \
 from \
   gh_repo r  \
   inner join gh_file f on f.repo_id = r.repo_id  \
   inner join gh_vuln v on f.file_id = v.file_id  \
   where vuln_desc = %s ", \
   [vuln])
      for row in rows:
        tmp_result[id_names[i]] = row[0]
    writer.writerow(tmp_result)
  outfile.close()

  #Next, we'll want to look into co-occurrence of different types of vulnerabilities
  for i in range(0, len(total_list)):
    for j in range(i+1, len(total_list)):
      rows = sd.select_many_query(db_conn, \
"  select count(distinct r.repo_id) \
  from gh_repo r \
  inner join gh_file f1 on r.repo_id = f1.repo_id \
  inner join gh_vuln v1 on f1.file_id = v1.file_id \
  inner join gh_file f2 on r.repo_id = f2.repo_id \
  inner join gh_vuln v2 on f2.file_id = v2.file_id \
  and v1.vuln_desc = %s \
  and v2.vuln_desc = %s ", [total_list[i], total_list[j]]);
      for row in rows:
        print total_list[i], ", ", total_list[j], ", ", row[0]
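
# Usage sketch (added; not from the original): writes the per-vulnerability counts to a CSV
# and prints co-occurrence counts; the output filename and the vuln/alt split are illustrative.
get_analysis("vuln_summary.csv", ["strcpy"], ["strncpy", "strlcpy"])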
Example #8
import save_data as sd

random_9726 = [359, 1522 , 417 , 1339 , 6808 , 3094 , 3025 , 4695 , 59 , 6650 , 5056 , 9073 , 6499 , 6125 , 3434 , 6462 , 9724 , 468 , 2358 , 2448 , 5427 , 2362 , 5860 , 2794 , 4084 , 5276 , 441 , 2123 , 6032 , 2257 , 8756 , 5335 , 5468 , 6741 , 6832 , 9476 , 7305 , 8189 , 1396 , 5035 , 9271 , 8343 , 502 , 5434 , 2283 , 193 , 1843 , 7542 , 4092 , 2165 , 5719 , 7142 , 3347 , 3261 , 6750 , 7813 , 648 , 6529 , 2951 , 7358 , 3550 , 9059 , 38 , 9681 , 8159 , 7781 , 8466 , 3381 , 6787 , 2861 , 4675 , 5371 , 1703 , 2037 , 7414 , 8244 , 4965 , 8756 , 4256 , 6754 , 8444 , 5588 , 7644 , 8792 , 4994 , 3905 , 5000 , 3857 , 7993 , 5832 , 7250 , 595 , 3971 , 8176 , 667 , 9076 , 5708 , 6313 , 5552 , 8291 , 5403 , 6252 , 6837 , 7937 , 5238 , 943 , 2647 , 5996 , 2073 , 6752 , 7306 , 5188 , 7395 , 2757 , 4859 , 46 , 2958 , 1899 , 172 , 6092 , 5576 , 7534 , 8569 , 1952 , 8729 , 3766 , 7580 , 6666 , 865 , 877 , 2080 , 1636 , 6501 , 6962 , 5666 , 2206 , 3230 , 4908 , 3589 , 2891 , 3826 , 4613 , 8369 , 5826 , 660 , 6396 , 1232 , 2190 , 3327 , 2092 , 7842 , 9223 , 401 , 6442 , 8430 , 1395 , 7630 , 5149 , 2377 , 2542 , 5542 , 7299 , 8386 , 5637 , 8920 , 8544 , 3539 , 4293 , 862 , 4095 , 8600 , 3880 , 2564 , 3651 , 8991 , 989 , 4494 , 2335 , 4806 , 1229 , 6423 , 1655 , 4391 , 1548 , 7227 , 5677 , 2572 , 7706 , 5624 , 3302 , 4426 , 4555 , 2372 , 3588 , 6377 , 3898 , 9130 , 8617 , 8342 , 1225 , 7815 , 5441 , 283 , 4030 , 7496 , 1148 , 6154 , 9024 , 6156 , 6714 , 3974 , 2250 , 2711 , 2303 , 4768 , 7683 , 8500 , 8766 , 3106 , 5065 , 7450 , 9115 , 618 , 5590 , 8 , 7395 , 7428 , 652 , 5559 , 2379 , 5142 , 2599 , 6081 , 2561 , 7093 , 6044 , 8122 , 6179 , 3274 , 1069 , 7195 , 9056 , 2477 , 2305 , 6371 , 6870 , 8374 , 6007 , 8065 , 5604 , 8323 , 5392 , 4965 , 7111 , 8558 , 4291 , 7945 , 7565 , 7334 , 5855 , 1836 , 1012 , 9143 , 5610 , 1359 , 5229 , 1160 , 9302 , 7737 , 8752 , 2479 , 4893 , 407 , 6172 , 5732 , 8332 , 4361 , 9058 , 2117 , 6375 , 9046 , 9141 , 1258 , 4558 , 4420 , 6664 , 6926 , 3636 , 3071 , 2938 , 197 , 131 , 8684 , 6523 , 2900 , 3137 , 6287 , 1498 , 2642 , 5874 , 3499 , 9509 , 8037 , 4703 , 497 , 1018 , 505 , 7005 , 6902 , 5486 , 4359 , 8851 , 1978 , 8497 , 692 , 1035 , 8420 , 8636 , 3232 , 9052 , 4108 , 9410 , 544 , 9466 , 9053 , 1238 , 8327 , 9277 , 4238 , 8445 , 5966 , 2153 , 6909 , 5880 , 3288 , 4216 , 4964 , 2580 , 5521 , 9572 , 1460 , 3149 , 5199 , 3562 , 8010 , 2989 , 2445 , 3549 , 5071 , 7558 , 6256 , 3608 , 270 , 5482 , 3456 , 2099 , 786 , 2606 , 1733 , 265 , 1895 , 7791 , 7346 , 7061 , 5558 , 9555 , 8725 , 3046 , 3338 , 203 , 214 , 4688 , 4375 , 9638 , 2446 , 4562 , 2025 , 1121 , 5823 , 2490 , 8202 , 6657 , 2596 , 2846 , 1931 , 5486 , 4234 , 7788 , 3678 , 5243 , 2352 , 4913 , 9390 , 6013 , 1290 , 8361 , 5358 , 8955 , 7248 , 2416 , 8241 , 7943 , 8939 , 1518 , 5481 , 6056 , 1888 , 3988 , 2214 , 8479 , 2776 , 7251 , 3457 , 310 , 1550 , 8755 , 8541 , 3951 , 2006 , 4785 , 6801 , 8388 , 923 , 9134 , 3351 , 7454 , 5940 , 1021 , 5584 , 7098 , 3160 , 7289 , 9217 , 5954 , 3456 , 5712 , 6573 , 8991 , 5902 , 5250 , 7571 , 4825 , 1968 , 5579 , 4294 , 1506 , 643 , 3026 , 2697 , 9292 , 6412 , 4465 , 4119 , 3852 , 1156 , 530 , 5249 , 7651 , 8895 , 8188 , 5496 , 8031 , 7114 , 9330 , 2095 , 410 , 2335 , 7028 , 2439 , 5128 , 6628 , 2509 , 8094 , 7840 , 3264 , 7118 , 5925 , 3224 , 3062 , 4850 , 6031 , 8493 , 6581 , 6813 , 8165 , 3310 , 6446 , 3421 , 8540 , 4606 , 2 , 9506 , 6197 , 7631 , 2025 , 2209 , 3298 , 4446 , 3840 , 3415]
#500
print len(random_9726)

db_conn = sd.get_connection("mysqlcreds-sample-c.csv")
cursor = db_conn.cursor()

for rand_id in random_9726:
	
	id_query = "select n.repo_id from (select distinct r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and v.vuln_desc = 'strcpy' order by r.repo_id limit " + str(rand_id) + ") n order by n.repo_id desc limit 1"
	cursor.execute(id_query)
	repo_id = int(cursor.fetchall()[0][0])
	#print str(rand_id) + ": " + str(repo_id)
	
	data_query = "select owner_name, repo_name, filename, code_sample, line_number, date_created, r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and r.repo_id =" + str(repo_id) + " limit 100"
	cursor.execute(data_query)
	data = cursor.fetchall()
	
	owner_name = data[0][0]
	repo_name = data[0][1]
	filename = data[0][2]
	code_sample = data[0][3]
	line_number = data[0][4]
	date_created = data[0][5]
	
	URL1 = "https://github.com/" + str(owner_name) + "/" + str(repo_name)
	#print URL1
	URL2 = URL1 + "/blob/master/" + str(filename.split("/",1)[1])
	#print URL2
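
	# Output sketch (added; not from the original): the prints above are commented out, so one
	# option is to append each sampled link to a file for manual review (filename illustrative).
	with open("strcpy_sample_urls.csv", "a") as out:
		out.write("%d,%s,%s\n" % (repo_id, URL1, URL2))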
Example #9
import os
import save_data as sd
import requests
import json
import util
import sys

in_order = False
token = os.environ.get('P_TOKEN', None)

#db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv')
db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv')
rows = None
if (in_order):
	rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id")
else: #get the stragglers
	rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id")

header = {'Authorization': 'token ' + token}

for row in rows:
	repo_id = row[0]
	if (repo_id % 10 == 0):
		print "repo_id ", repo_id
	query_url = row[1]
	owner_name = row[2]
	try:
		r = requests.get(query_url, headers=header)
		item = json.loads(r.text or r.content)
		for thing in item:
			contributions = thing['contributions']
			username = thing['login']
Example #10
import os
import save_data as sd
import requests
import json
import util
import sys

in_order = False
token = os.environ.get('P_TOKEN', None)

#db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv')
db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv')
rows = None
if (in_order):
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id"
    )
else:  #get the stragglers
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id"
    )

header = {'Authorization': 'token ' + token}

for row in rows:
    repo_id = row[0]
    if (repo_id % 10 == 0):
        print "repo_id ", repo_id
    query_url = row[1]
    owner_name = row[2]
Example #11
import save_data as sd

# the same sampled repo ids as Example #8; the leading entries were truncated in this copy
random_9726 = [
    4688, 4375, 9638, 2446, 4562, 2025, 1121, 5823, 2490, 8202, 6657, 2596,
    2846, 1931, 5486, 4234, 7788, 3678, 5243, 2352, 4913, 9390, 6013, 1290,
    8361, 5358, 8955, 7248, 2416, 8241, 7943, 8939, 1518, 5481, 6056, 1888,
    3988, 2214, 8479, 2776, 7251, 3457, 310, 1550, 8755, 8541, 3951, 2006,
    4785, 6801, 8388, 923, 9134, 3351, 7454, 5940, 1021, 5584, 7098, 3160,
    7289, 9217, 5954, 3456, 5712, 6573, 8991, 5902, 5250, 7571, 4825, 1968,
    5579, 4294, 1506, 643, 3026, 2697, 9292, 6412, 4465, 4119, 3852, 1156, 530,
    5249, 7651, 8895, 8188, 5496, 8031, 7114, 9330, 2095, 410, 2335, 7028,
    2439, 5128, 6628, 2509, 8094, 7840, 3264, 7118, 5925, 3224, 3062, 4850,
    6031, 8493, 6581, 6813, 8165, 3310, 6446, 3421, 8540, 4606, 2, 9506, 6197,
    7631, 2025, 2209, 3298, 4446, 3840, 3415
]
#500
print len(random_9726)

db_conn = sd.get_connection("mysqlcreds-sample-c.csv")
cursor = db_conn.cursor()

for rand_id in random_9726:

    id_query = "select n.repo_id from (select distinct r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and v.vuln_desc = 'strcpy' order by r.repo_id limit " + str(
        rand_id) + ") n order by  n.repo_id desc limit 1"
    cursor.execute(id_query)
    repo_id = int(cursor.fetchall()[0][0])
    #print str(rand_id) + ": " + str(repo_id)

    data_query = "select owner_name, repo_name, filename, code_sample, line_number, date_created, r.repo_id from gh_vuln v, gh_repo r, gh_file f where r.repo_id = f.repo_id and f.file_id = v.file_id and r.repo_id =" + str(
        repo_id) + " limit 100"
    cursor.execute(data_query)
    data = cursor.fetchall()