def update_from_response(self, url, response, content):
    # Method on a GitHubRemoteObject subclass; parse_link_value() is from
    # Mark Nottingham's link_header module (see the archival snippet below).
    r = super(GitHubRemoteObject, self).update_from_response(url, response, content)
    self._rate_limit = (response.get('x-ratelimit-remaining'),
                        response.get('x-ratelimit-limit'))

    # GitHub sends paging information as a response header like this:
    # <https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open>; rel="next", <https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open>; rel="last"
    #
    # <https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=closed>; rel="next", <https://api.github.com/repos/cappuccino/cappuccino/issues?page=51&state=closed>; rel="last"
    links = response.get('link')
    self._next_page_url = None
    self._last_page_url = None
    if links:
        links = [parse_link_value(link) for link in links.split(',')]
        for link in links:
            # Don't shadow the url argument while walking the parsed links.
            for link_url, attrs in link.items():
                if attrs.get('rel') == 'next':
                    self._next_page_url = link_url
                elif attrs.get('rel') == 'last':
                    self._last_page_url = link_url
    return r
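# A quick check of the shape parse_link_value() returns, using the sample
# header documented above. Note that the other snippets in this file pass the
# full comma-separated header value straight to parse_link_value(), so the
# manual split in update_from_response() looks defensive rather than required.
# This demo block is illustrative, not part of the original module:
if __name__ == '__main__':
    sample = ('<https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open>; rel="next", '
              '<https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open>; rel="last"')
    for link_url, attrs in parse_link_value(sample).items():
        print('%s -> %s' % (attrs.get('rel'), link_url))
    # Expected output (dict order is not guaranteed):
    #   next -> https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open
    #   last -> https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open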
def fetch_and_process(url):
    # json and logging are stdlib; raw_request(), app, and taskqueue (Google
    # App Engine) come from the surrounding application.
    logging.debug("Following url %s" % url)
    response = raw_request(url, oauth_token=app.config['GITHUB_OAUTH_KEY'])
    links = parse_link_value(response.headers.get('Link', ''))
    prs = json.loads(response.content)
    # Enqueue one task per pull request on this page.
    for pr in prs:
        taskqueue.add(url="/tasks/update-issue/%i" % pr['number'])
    # Recurse into the next page, if the Link header advertises one.
    for (link_url, info) in links.items():
        if info.get('rel') == 'next':
            fetch_and_process(link_url)
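# fetch_and_process() above recurses once per page; for very long listings an
# iterative loop avoids Python's recursion limit. A minimal sketch, assuming
# the same raw_request()/parse_link_value() helpers, app config, and taskqueue
# setup as above (all supplied by the surrounding application, not this file):
def fetch_and_process_iterative(url):
    while url:
        logging.debug("Following url %s" % url)
        response = raw_request(url, oauth_token=app.config['GITHUB_OAUTH_KEY'])
        for pr in json.loads(response.content):
            taskqueue.add(url="/tasks/update-issue/%i" % pr['number'])
        # Advance to the rel="next" page, or stop when there isn't one.
        url = None
        for link_url, info in parse_link_value(response.headers.get('Link', '')).items():
            if info.get('rel') == 'next':
                url = link_url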
# Python 2 code: urllib2/urllib/urlparse/rfc822 are the Python 2 stdlib names.
import errno
import os
import rfc822
import sys
import time
import urllib
import urllib2
import urlparse


def fetch(url, dry_run=False):
    # Issue a HEAD request first so we can read the headers cheaply.
    request = urllib2.Request(url)
    request.get_method = lambda: 'HEAD'
    conn = urllib2.urlopen(request)

    # Parse out a Last-Modified-style date in a reliable way
    def get_seconds(headers, modified='Last-Modified'):
        date_tuple = headers.getdate_tz(modified)
        epoch_seconds = rfc822.mktime_tz(date_tuple)
        return epoch_seconds

    def progress_ticker(blocks, blocksize, total):
        BACKSPACE = '\x08'
        percentage = "{0:6.2f}%".format(
            min(100.0, blocks * blocksize * 100.0 / total))
        sys.stdout.write(BACKSPACE * len(percentage) + percentage)

    # single-level mkdir -p functionality
    def mkdir_parents(directory):
        try:
            os.mkdir(directory)
        except OSError as e:
            if os.path.isdir(directory) and e.errno == errno.EEXIST:
                pass
            else:
                raise

    # Name the download directory after the capture's original timestamp.
    seconds = get_seconds(conn.headers, 'X-Archive-Orig-last-modified')
    directory = time.strftime('%Y%m%d-%H%M%S', time.gmtime(seconds))
    if not dry_run:
        mkdir_parents(directory)

    path = urlparse.urlparse(url).path
    filename = os.path.basename(path)
    base, ext = os.path.splitext(filename)
    destination = "%s/%s%s" % (directory, base, ext)
    if not dry_run:
        local_filename, headers = urllib.urlretrieve(url, destination,
                                                     progress_ticker)
        # Stamp the file and directory with the capture's own mtime.
        os.utime(local_filename, (seconds,) * 2)
        os.utime(directory, (seconds,) * 2)

    # Mark Nottingham's Link-header parser:
    # https://gist.githubusercontent.com/mnot/210535/raw/1755bb24a4f8796d55c280f1c50d0910f5522fb2/link_header.py
    import link_header
    links = link_header.parse_link_value(conn.headers['Link'])
    for k, v in links.items():
        if 'rel' in v and 'prev memento' in v['rel']:
            sys.stdout.write('\n%10s %s\r' % ('', k))
            fetch(url=k, dry_run=dry_run)
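# Example invocation of fetch() above: it walks backwards through an Internet
# Archive capture chain via rel="prev memento" Link headers, creating one
# timestamped directory per capture. The URL here is a made-up placeholder,
# not a real capture:
if __name__ == '__main__':
    fetch('https://web.archive.org/web/20130101000000/http://example.com/file.zip',
          dry_run=True)  # dry-run: walk the chain without downloading anything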
def github_api_all(url):
    """
    Fetch all results, not simply the first page of results, for the given
    URL.
    """
    # Get our initial response
    response = requests.get(url, auth=AUTH)
    response_json = response.json()

    # Parse the Link header into a rel -> URL map. Using .get() keeps the
    # single-page case (no Link header at all) from raising a KeyError.
    parsed_links = link_header.parse_link_value(response.headers.get('link', ''))
    links = dict((parsed_links[l]['rel'], l) for l in parsed_links)

    while 'next' in links:
        # While we have a 'next' link, fetch it and append its results to the
        # accumulated list.
        page_response = requests.get(links['next'], auth=AUTH)
        response_json += page_response.json()
        parsed_links = link_header.parse_link_value(
            page_response.headers.get('link', ''))
        links = dict((parsed_links[l]['rel'], l) for l in parsed_links)

    return response_json
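# A minimal usage sketch for github_api_all(). AUTH is assumed to be a
# (user, token) tuple defined elsewhere in the module; the repository named
# here is just an example:
all_issues = github_api_all(
    'https://api.github.com/repos/octocat/Hello-World/issues')
print('%d issues across all pages' % len(all_issues))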
def _visit_page_link_headers(self, url, visited, ids):
    # Test helper: walk every page advertised via Link headers exactly once,
    # collecting resource IDs as we go. assert_equal/assert_true are
    # presumably the nose.tools helpers.
    url = self.url(url)  # make URL absolute
    r = self.get(url)
    assert_equal(r.status_code, 200)
    data = r.json()
    # A list comprehension avoids the original lambda shadowing the response r.
    id_list = [item['resourceId'] for item in data]
    ids += id_list
    self.log.info('Request %s returned %s' % (url, str(id_list)))
    visited.append(url)
    links = link_header.parse_link_value(r.headers['link'])
    self.log.info('Parsed link headers: %s' % str(links))
    for link in links.keys():
        assert_true('rel' in links[link])
        assert_true(links[link]['rel'] in ['current', 'next', 'prev', 'last'])
        if link not in visited:
            self._visit_page_link_headers(link, visited, ids)
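# A sketch of how a test might drive _visit_page_link_headers() above: start
# from the first page with empty accumulators, then check the walk for
# duplicates. The endpoint path is illustrative, not from the original suite:
def test_pagination_links(self):
    visited, ids = [], []
    self._visit_page_link_headers('/resources?page=1', visited, ids)
    # Every page visited exactly once, and no resource ID repeated.
    assert_equal(len(visited), len(set(visited)))
    assert_equal(len(ids), len(set(ids)))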
# Retrieve pages of repos till the rate limit is reached
r = api_request(next_repos_url)
if requests_left < 10 or r.status_code != 200:
    wait_for_rate_limit_reset()
logger.info("Request status: %s" % r.headers['status'])

while r.ok or r.status_code == 401:
    repos_json = json.loads(r.text or r.content)
    logger.info("Requests left: %s" % requests_left)

    # Get link for next page of repos
    links = link_header.parse_link_value(r.headers['link'])
    for link_url in links:
        if links[link_url]['rel'] == 'next':
            next_repos_url = link_url

    # Process this page of repos
    for repo in repos_json:
        if requests_left < 10 or r.status_code != 200:
            wait_for_rate_limit_reset()
        # Log the effort to store this repo's information:
        logger.info("Storing repository #%s: %s (Fork? %s)"
                    % (repo['id'], repo['full_name'], repo['fork']))
    # NOTE: this excerpt is truncated here; the request for the next page
    # (using next_repos_url) that would update r is not shown.
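# The snippet above calls a wait_for_rate_limit_reset() helper it never
# defines. A plausible sketch, here taking the most recent response explicitly
# and using GitHub's standard X-RateLimit-Reset header (epoch seconds); the
# original zero-argument helper presumably reads this from module state:
import time

def wait_for_rate_limit_reset(response):
    reset_at = int(response.headers.get('X-RateLimit-Reset', 0))
    delay = max(0, reset_at - int(time.time())) + 1  # one-second safety margin
    logger.info("Rate limited; sleeping %d seconds" % delay)
    time.sleep(delay)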