Example #1
def collect_urls_by_page_num(token,
                             page_num,
                             starting_date=None,
                             ending_date=None,
                             counter_flag=False):
    if not starting_date:
        print 'Not enough args'
        return None  # bail out rather than query with created:<=None
    if not ending_date:
        query_url = ("https://api.github.com/search/repositories?"
                     "q=language:" + language + "+size:<=" + size_limit +
                     "+created:<=" + str(starting_date) + "&per_page=" +
                     str(per_page) + "&page=" + str(page_num))
    else:
        #TODO
        #Offset purpose
        starting_date += timedelta(days=1)
        query_url = ("https://api.github.com/search/repositories?"
                     "q=language:" + language + "+size:<=" + size_limit +
                     "+created:" + str(starting_date) + ".." +
                     str(ending_date) + "&per_page=" + str(per_page) +
                     "&page=" + str(page_num))
    header = {'Authorization': 'token ' + str(token)}

    #url_list = []
    meta_list = []
    total_count = 0  # defined up front so the final return works even on error
    if counter_flag:
        print query_url
    if not token:
        r = requests.get(query_url)
    else:
        r = requests.get(query_url, headers=header)
    #print r.headers
    if (r.ok):
        #TODO:time to sleep
        #print 'Request for Page Num: ' + str(page_num) + ' returns OK'
        #print 'Remaining request: ' + r.headers['x-ratelimit-remaining']
        #print ' date: ' + r.headers['date'] + ' reset: ' + datetime.utcfromtimestamp(int(r.headers['x-ratelimit-reset'])).isoformat()
        #print str(r.headers['x-ratelimit-reset']) + ' ' + str(time())
        rate_remaining = int(r.headers['x-ratelimit-remaining'])
        reset_time = int(r.headers['x-ratelimit-reset'])
        if rate_remaining == 0:
            util.nap(reset_time)
        repoItem = json.loads(r.text or r.content)
        total_count = repoItem['total_count']
        repoList = repoItem['items']
        if counter_flag:
            return {
                'total_count': total_count,
                'rate_limit': r.headers['x-ratelimit-remaining']
            }
        #print len(repoItem)
        #Print it to see everything you might need from repo query, ask Nina for pretty print
        #print repoItem
        #print len(repoItem['items'])
        for item in repoList:
            url = item['url'] + '/' + compress_format
            created_at = item['created_at']
            pushed_at = item['pushed_at']
            size = item['size']
            contributors_url = item['contributors_url']
            description = item['description']
            meta_list.append({
                'url': url,
                'created_at': created_at,
                'pushed_at': pushed_at,
                'size': size,
                'contributors_url': contributors_url,
                'description': description,
                'forks_url': item['forks_url'],
                'stargazers': item['stargazers_count'],
                'forks': item['forks_count'],
                'actual_url': item['html_url']
            })
    else:
        print 'Request for Page Num: ' + str(page_num) + ' ERROR'
        print r.headers

    return {'total_count': total_count, 'meta_list': meta_list}
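
A minimal usage sketch for this function (not part of the original module), assuming the snippet's module-level globals (token, language, size_limit, and an integer per_page) are already defined; start is a hypothetical cutoff date:

from datetime import date

start = date(2015, 1, 1)  # hypothetical cutoff: the query becomes created:<=2015-01-01

# Probe once with counter_flag=True to learn the total hit count, then walk
# the pages. GitHub's Search API only serves the first 1000 results, which is
# presumably what the date-window parameters are there to work around.
probe = collect_urls_by_page_num(token, 1, starting_date=start, counter_flag=True)
num_pages = (probe['total_count'] + per_page - 1) // per_page  # ceiling division

all_meta = []
for page in range(1, num_pages + 1):
    result = collect_urls_by_page_num(token, page, starting_date=start)
    all_meta.extend(result['meta_list'])
print 'collected %d repo entries' % len(all_meta)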
Example #2
def collect_urls_by_page_num(token, page_num, starting_date=None, ending_date=None, counter_flag=False):
    if not starting_date:
        print "Not enough args"
        return None  # bail out rather than query with created:<=None
    if not ending_date:
        query_url = "https://api.github.com/search/repositories?" "q=language:" + language + "+size:<=" + size_limit + "+created:<=" + str(
            starting_date
        ) + "&per_page=" + str(
            per_page
        ) + "&page=" + str(
            page_num
        )
    else:
        # TODO
        # Offset purpose
        starting_date += timedelta(days=1)
        query_url = "https://api.github.com/search/repositories?" "q=language:" + language + "+size:<=" + size_limit + "+created:" + str(
            starting_date
        ) + ".." + str(
            ending_date
        ) + "&per_page=" + str(
            per_page
        ) + "&page=" + str(
            page_num
        )
    header = {"Authorization": "token " + str(token)}

    # url_list = []
    meta_list = []
    total_count = 0  # defined up front so the final return works even on error
    if counter_flag:
        print query_url
    if not token:
        r = requests.get(query_url)
    else:
        r = requests.get(query_url, headers=header)
        # print r.headers
    if r.ok:
        # TODO:time to sleep
        # print 'Request for Page Num: ' + str(page_num) + ' returns OK'
        # print 'Remaining request: ' + r.headers['x-ratelimit-remaining']
        # print ' date: ' + r.headers['date'] + ' reset: ' + datetime.utcfromtimestamp(int(r.headers['x-ratelimit-reset'])).isoformat()
        # print str(r.headers['x-ratelimit-reset']) + ' ' + str(time())
        rate_remaining = int(r.headers["x-ratelimit-remaining"])
        reset_time = int(r.headers["x-ratelimit-reset"])
        if rate_remaining == 0:
            util.nap(reset_time)
        repoItem = json.loads(r.text or r.content)
        total_count = repoItem["total_count"]
        repoList = repoItem["items"]
        if counter_flag:
            return {"total_count": total_count, "rate_limit": r.headers["x-ratelimit-remaining"]}
            # print len(repoItem)
            # Print it to see everything you might need from repo query, ask Nina for pretty print
            # print repoItem
            # print len(repoItem['items'])
        for item in repoList:
            url = item["url"] + "/" + compress_format
            created_at = item["created_at"]
            pushed_at = item["pushed_at"]
            size = item["size"]
            contributors_url = item["contributors_url"]
            description = item["description"]
            meta_list.append(
                {
                    "url": url,
                    "created_at": created_at,
                    "pushed_at": pushed_at,
                    "size": size,
                    "contributors_url": contributors_url,
                    "description": description,
                    "forks_url": item["forks_url"],
                    "stargazers": item["stargazers_count"],
                    "forks": item["forks_count"],
                    "actual_url": item["html_url"],
                }
            )
    else:
        print "Request for Page Num: " + str(page_num) + " ERROR"
        print r.headers

    return {"total_count": total_count, "meta_list": meta_list}
Example #3
else: #get the stragglers
	rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id")

header = {'Authorization': 'token ' + token}

for row in rows:
	repo_id = row[0];
	if (repo_id % 10 == 0):
		print "repo_id ", repo_id
	query_url = row[1];
	owner_name = row[2];
	try:
		r = requests.get(query_url, headers=header)
		item = json.loads(r.text or r.content)
		for thing in item:
			contributions = thing['contributions']
			username = thing['login']
			sd.save_user_data(db_conn, username);
			sd.save_repo_contributor_data(db_conn, username, repo_id, contributions);
		headers = r.headers;
		ratelimit_remaining = int(headers['x-ratelimit-remaining'])
		reset_time = int(headers['x-ratelimit-reset'])
		if (ratelimit_remaining % 10 == 0):
			print "ratelimit_remaining ", ratelimit_remaining
		if ratelimit_remaining == 0: 
			print "napping for ", reset_time
			util.nap(reset_time)
	except:# ValueError, requests.exceptions.ConnectionError:
		print "error: ", sys.exc_info()[0]
		print "skipping repo: ", repo_id
Example #4
else:  # get the stragglers
    rows = sd.select_many_query(
        db_conn, "select repo_id, contributors_url, owner_name from gh_repo "
        "where repo_id not in (select repo_id from gh_repo_contributors) "
        "order by repo_id")

header = {'Authorization': 'token ' + token}

for row in rows:
    repo_id = row[0]
    if (repo_id % 10 == 0):
        print "repo_id ", repo_id
    query_url = row[1]
    owner_name = row[2]
    try:
        r = requests.get(query_url, headers=header)
        item = json.loads(r.text or r.content)
        for thing in item:
            contributions = thing['contributions']
            username = thing['login']
            sd.save_user_data(db_conn, username)
            sd.save_repo_contributor_data(db_conn, username, repo_id,
                                          contributions)
        headers = r.headers
        ratelimit_remaining = int(headers['x-ratelimit-remaining'])
        reset_time = int(headers['x-ratelimit-reset'])
        if (ratelimit_remaining % 10 == 0):
            print "ratelimit_remaining ", ratelimit_remaining
        if ratelimit_remaining == 0:
            print "napping for ", reset_time
            util.nap(reset_time)
    except:  # ValueError, requests.exceptions.ConnectionError:
        print "error: ", sys.exc_info()[0]
        print "skipping repo: ", repo_id
Example #5
def download_url(url, file_num='', token=None): 
	file_name = '../data/' + url.split('/')[-2] + '_' + str(file_num)  + '.tar.gz'
	print 'Downloading from ' + url + ', saving as ' + file_name.split('/')[-1]
	'''	
	cmd = 'curl -L ' + url 
	#print cmd
	args = cmd.split()
	f = open(file_name,'w')
	subp = Popen(args, stdout=f, stderr=PIPE)

	curlstdout, curlstderr = subp.communicate()
	f.close()
	#print curlstderr
	
	'''
	headers = {}
	def header_function(header_line):
	    # HTTP standard specifies that headers are encoded in iso-8859-1.
	    # On Python 2, decoding step can be skipped.
	    # On Python 3, decoding step is required.
	    header_line = header_line.decode('iso-8859-1')

	    # Header lines include the first status line (HTTP/1.x ...).
	    # We are going to ignore all lines that don't have a colon in them.
	    # This will botch headers that are split on multiple lines...
	    if ':' not in header_line:
	        return

	    # Break the header line into header name and value.
	    name, value = header_line.split(':', 1)

	    # Remove whitespace that may be present.
	    # Header lines include the trailing newline, and there may be whitespace
	    # around the colon.
	    name = name.strip()
	    value = value.strip()

	    # Header names are case insensitive.
	    # Lowercase name here.
	    name = name.lower()

	    # Now we can actually record the header name and value.
	    headers[name] = value

	#header = StringIO.StringIO()
	f = open(file_name,'w')
	c = pycurl.Curl()
	c.setopt(pycurl.URL, str(url))
	c.setopt(pycurl.WRITEDATA, f)
	c.setopt(c.FOLLOWLOCATION, True)
	c.setopt(c.HEADERFUNCTION, header_function)
	#Authorization token
	if token:
		c.setopt(pycurl.HTTPHEADER, ['Authorization: token ' + str(token)])
	c.setopt(pycurl.SSL_VERIFYPEER, True)
	c.setopt(pycurl.SSL_VERIFYHOST, 2)
	c.perform()
	f.close()
	status = int(c.getinfo(c.RESPONSE_CODE))	
	if status != 200:
		print ('Status: %d' % status)
		os.remove(file_name)
	#print header.getvalue()
	#print headers
	c.close()
	ratelimit_remaining = int(headers['x-ratelimit-remaining'])
	reset_time = int(headers['x-ratelimit-reset'])
	#print ratelimit_remaining
	#print reset_time
	if ratelimit_remaining == 0: 
		util.nap(reset_time)
Example #6
def download_url(url, file_num='', token=None):
    file_name = '../data/' + url.split('/')[-2] + '_' + str(
        file_num) + '.tar.gz'
    print 'Downloading from ' + url + ', saving as ' + file_name.split('/')[-1]
    '''	
	cmd = 'curl -L ' + url 
	#print cmd
	args = cmd.split()
	f = open(file_name,'w')
	subp = Popen(args, stdout=f, stderr=PIPE)

	curlstdout, curlstderr = subp.communicate()
	f.close()
	#print curlstderr
	
	'''
    headers = {}

    def header_function(header_line):
        # HTTP standard specifies that headers are encoded in iso-8859-1.
        # On Python 2, decoding step can be skipped.
        # On Python 3, decoding step is required.
        header_line = header_line.decode('iso-8859-1')

        # Header lines include the first status line (HTTP/1.x ...).
        # We are going to ignore all lines that don't have a colon in them.
        # This will botch headers that are split on multiple lines...
        if ':' not in header_line:
            return

        # Break the header line into header name and value.
        name, value = header_line.split(':', 1)

        # Remove whitespace that may be present.
        # Header lines include the trailing newline, and there may be whitespace
        # around the colon.
        name = name.strip()
        value = value.strip()

        # Header names are case insensitive.
        # Lowercase name here.
        name = name.lower()

        # Now we can actually record the header name and value.
        headers[name] = value

    #header = StringIO.StringIO()
    f = open(file_name, 'w')
    c = pycurl.Curl()
    c.setopt(pycurl.URL, str(url))
    c.setopt(pycurl.WRITEDATA, f)
    c.setopt(c.FOLLOWLOCATION, True)
    c.setopt(c.HEADERFUNCTION, header_function)
    #Authorization token
    if token:
        c.setopt(pycurl.HTTPHEADER, ['Authorization: token ' + str(token)])
    c.setopt(pycurl.SSL_VERIFYPEER, True)
    c.setopt(pycurl.SSL_VERIFYHOST, 2)
    c.perform()
    f.close()
    status = int(c.getinfo(c.RESPONSE_CODE))
    if status != 200:
        print('Status: %d' % status)
        os.remove(file_name)
    #print header.getvalue()
    #print headers
    c.close()
    ratelimit_remaining = int(headers['x-ratelimit-remaining'])
    reset_time = int(headers['x-ratelimit-reset'])
    #print ratelimit_remaining
    #print reset_time
    if ratelimit_remaining == 0:
        util.nap(reset_time)
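
Examples #5 and #6 keep an abandoned curl-subprocess attempt in a docstring before settling on pycurl. For comparison only, a sketch of the same download using requests (already used by the earlier examples); download_url_requests and its file_name parameter are hypothetical, not part of the original code:

import os
import requests

def download_url_requests(url, file_name, token=None):
    # Stream the tarball to disk; requests follows redirects by default,
    # matching the FOLLOWLOCATION behavior of the pycurl version.
    headers = {'Authorization': 'token ' + str(token)} if token else {}
    r = requests.get(url, headers=headers, stream=True)
    with open(file_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    if r.status_code != 200:
        print 'Status: %d' % r.status_code
        os.remove(file_name)
    # The caller can inspect r.headers for x-ratelimit-remaining and
    # x-ratelimit-reset, as the pycurl version does.
    return r.headers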