import json
import os
import sys

import pycurl
import requests
from datetime import date, timedelta

import sd    # project-local storage helpers (save_user_data, etc.)
import util  # project-local rate-limit helper (util.nap)

# Module-level settings (language, size_limit, per_page, compress_format,
# db_conn) are expected to be defined elsewhere in this script.


def collect_urls_by_page_num(token, page_num, starting_date=None,
                             ending_date=None, counter_flag=False):
    if not starting_date:
        print 'Not enough args'
        return None
    if not ending_date:
        # Open-ended window: everything created on or before starting_date.
        query_url = ("https://api.github.com/search/repositories?"
                     "q=language:" + language + "+size:<=" + size_limit +
                     "+created:<=" + str(starting_date) +
                     "&per_page=" + str(per_page) + "&page=" + str(page_num))
    else:
        # Offset by one day so consecutive date windows do not overlap.
        starting_date += timedelta(days=1)
        query_url = ("https://api.github.com/search/repositories?"
                     "q=language:" + language + "+size:<=" + size_limit +
                     "+created:" + str(starting_date) + ".." + str(ending_date) +
                     "&per_page=" + str(per_page) + "&page=" + str(page_num))
    header = {'Authorization': 'token ' + str(token)}
    meta_list = []
    total_count = 0
    if counter_flag:
        print query_url
    if not token:
        r = requests.get(query_url)
    else:
        r = requests.get(query_url, headers=header)
    if r.ok:
        # If this request exhausted the rate limit, sleep until it resets
        # before the caller fires the next one.
        rate_remaining = int(r.headers['x-ratelimit-remaining'])
        reset_time = int(r.headers['x-ratelimit-reset'])
        if rate_remaining == 0:
            util.nap(reset_time)
        repoItem = json.loads(r.text or r.content)
        total_count = repoItem['total_count']
        repoList = repoItem['items']
        if counter_flag:
            # Caller only wants the counts, not the per-repo metadata.
            return {'total_count': total_count,
                    'rate_limit': r.headers['x-ratelimit-remaining']}
        # Print repoItem to see everything you might need from a repo query.
        for item in repoList:
            meta_list.append({
                'url': item['url'] + '/' + compress_format,
                'created_at': item['created_at'],
                'pushed_at': item['pushed_at'],
                'size': item['size'],
                'contributors_url': item['contributors_url'],
                'description': item['description'],
                'forks_url': item['forks_url'],
                'stargazers': item['stargazers_count'],
                'forks': item['forks_count'],
                'actual_url': item['html_url']
            })
    else:
        print 'Request for Page Num: ' + str(page_num) + ' ERROR'
        print r.headers
    return {'total_count': total_count, 'meta_list': meta_list}
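# util.nap() (called above and again below) is not defined in this file. A
# minimal sketch, assuming it takes the x-ratelimit-reset value (a Unix
# timestamp) and sleeps until shortly after that moment; the project's real
# helper may differ.
def _nap_sketch(reset_time):
    # Hypothetical stand-in for util.nap(): wait out the rate-limit window,
    # plus a small cushion for clock skew between us and the API.
    import time
    delay = max(reset_time - int(time.time()), 0) + 5
    print 'Rate limit exhausted, napping for %d seconds' % delay
    time.sleep(delay)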
else:  # get the stragglers: repos with no contributor rows recorded yet
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo "
        "where repo_id not in (select repo_id from gh_repo_contributors) "
        "order by repo_id")
    header = {'Authorization': 'token ' + token}
    for row in rows:
        repo_id = row[0]
        if repo_id % 10 == 0:
            print "repo_id ", repo_id
        query_url = row[1]
        owner_name = row[2]
        try:
            r = requests.get(query_url, headers=header)
            item = json.loads(r.text or r.content)
            for thing in item:
                contributions = thing['contributions']
                username = thing['login']
                sd.save_user_data(db_conn, username)
                sd.save_repo_contributor_data(db_conn, username, repo_id,
                                              contributions)
            headers = r.headers
            ratelimit_remaining = int(headers['x-ratelimit-remaining'])
            reset_time = int(headers['x-ratelimit-reset'])
            if ratelimit_remaining % 10 == 0:
                print "ratelimit_remaining ", ratelimit_remaining
            if ratelimit_remaining == 0:
                print "napping for ", reset_time
                util.nap(reset_time)
        except (ValueError, requests.exceptions.ConnectionError):
            # Bad JSON or a dropped connection: log it and move on.
            print "error: ", sys.exc_info()[0]
            print "skipping repo: ", repo_id
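# sd.select_many_query() and the save_* helpers above come from a
# project-local storage module that is not shown. Judging from the call
# sites, the query helper runs a SQL statement and returns all rows; a
# minimal sketch under that assumption, for any DB-API connection:
def _select_many_query_sketch(db_conn, query):
    # Hypothetical stand-in for sd.select_many_query(): execute the given
    # SQL on the connection and return every row.
    cur = db_conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    return rows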
def download_url(url, file_num='', token=None):
    file_name = ('../data/' + url.split('/')[-2] + '_' + str(file_num) +
                 '.tar.gz')
    print 'Downloading from ' + url + ' save as ' + file_name.split('/')[-1]
    # (An earlier version shelled out to `curl -L` via subprocess; pycurl is
    # used instead so the response headers can be inspected.)

    headers = {}

    def header_function(header_line):
        # The HTTP standard specifies that headers are encoded in iso-8859-1.
        # On Python 2 the decoding step can be skipped; on Python 3 it is
        # required.
        header_line = header_line.decode('iso-8859-1')
        # Header lines include the first status line (HTTP/1.x ...), so
        # ignore any line without a colon. This will botch headers that are
        # split across multiple lines...
        if ':' not in header_line:
            return
        # Split into name and value, strip surrounding whitespace (header
        # lines keep their trailing newline), and lowercase the name since
        # header names are case insensitive.
        name, value = header_line.split(':', 1)
        headers[name.strip().lower()] = value.strip()

    f = open(file_name, 'wb')  # the tarball is binary data
    c = pycurl.Curl()
    c.setopt(pycurl.URL, str(url))
    c.setopt(pycurl.WRITEDATA, f)
    c.setopt(c.FOLLOWLOCATION, True)
    c.setopt(c.HEADERFUNCTION, header_function)
    # Authorization token
    if token:
        c.setopt(pycurl.HTTPHEADER, ['Authorization: token ' + str(token)])
    c.setopt(pycurl.SSL_VERIFYPEER, True)
    c.setopt(pycurl.SSL_VERIFYHOST, 2)
    c.perform()
    f.close()
    status = int(c.getinfo(c.RESPONSE_CODE))
    if status != 200:
        print 'Status: %d' % status
        os.remove(file_name)
    c.close()
    # The redirect target may not send rate-limit headers, so only nap when
    # they are present.
    if 'x-ratelimit-remaining' in headers:
        ratelimit_remaining = int(headers['x-ratelimit-remaining'])
        reset_time = int(headers['x-ratelimit-reset'])
        if ratelimit_remaining == 0:
            util.nap(reset_time)
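# A hypothetical driver tying the pieces together: fetch one page of search
# results, then download each repo's tarball. The token, dates, and page
# number below are placeholders, not values from the original script.
if __name__ == '__main__':
    token = None  # or a personal access token string
    page = collect_urls_by_page_num(token, 1,
                                    starting_date=date(2015, 1, 1),
                                    ending_date=date(2015, 6, 30))
    if page:
        print 'total_count: ', page['total_count']
        for i, meta in enumerate(page['meta_list']):
            download_url(meta['url'], file_num=i, token=token)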