Example #1
    def update_from_response(self, url, response, content):
        r = super(GitHubRemoteObject,
                  self).update_from_response(url, response, content)

        self._rate_limit = (response.get('x-ratelimit-remaining'),
                            response.get('x-ratelimit-limit'))

        # GitHub sends paging information as a response header like this:
        # <https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open>; rel="next", <https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open>; rel="last"
        #
        # <https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=closed>; rel="next", <https://api.github.com/repos/cappuccino/cappuccino/issues?page=51&state=closed>; rel="last"

        links = response.get('link')

        self._next_page_url = None
        self._last_page_url = None
        if links:
            links = [parse_link_value(link) for link in links.split(',')]
            for link in links:
                for link_url, attrs in link.items():
                    if attrs['rel'] == 'next':
                        self._next_page_url = link_url
                    elif attrs['rel'] == 'last':
                        self._last_page_url = link_url

        return r
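For orientation: Mark Nottingham's parse_link_value (the link_header module linked from Example #3 below) maps each URL in a Link header to a dict of that link's attributes, which is the shape every example on this page relies on. A minimal sketch of what the header quoted in the comment above parses to (an illustration, not output captured from the library):

links = parse_link_value(
    '<https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open>; rel="next", '
    '<https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open>; rel="last"')
# links is a dict keyed by URL, roughly:
# {'https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open': {'rel': 'next'},
#  'https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open': {'rel': 'last'}}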
Example #2
 def fetch_and_process(url):
     logging.debug("Following url %s" % url)
     response = raw_request(url, oauth_token=app.config['GITHUB_OAUTH_KEY'])
     links = parse_link_value(response.headers.get('Link', ''))
     prs = json.loads(response.content)
     for pr in prs:
         taskqueue.add(url="/tasks/update-issue/%i" % pr['number'])
     for (link_url, info) in links.items():
         if info.get('rel') == 'next':
             fetch_and_process(link_url)
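The function enqueues one task per returned item and then recurses into every rel="next" page, so a single seed request walks the whole paginated collection. A hypothetical kickoff call (the repository URL is illustrative, not taken from the original project):

fetch_and_process('https://api.github.com/repos/cappuccino/cappuccino/pulls?state=open')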
Example #3
def fetch(url, dry_run=False):

    request = urllib2.Request(url)
    request.get_method = lambda: 'HEAD'
    conn = urllib2.urlopen(request)

    # Parse out Last-Modified date in a reliable way
    def get_seconds(headers, modified='Last-Modified'):
        date_tuple = headers.getdate_tz(modified)
        epoch_seconds = rfc822.mktime_tz(date_tuple)
        return epoch_seconds

    def progress_ticker(blocks, blocksize, total):
        BACKSPACE = '\x08'
        percentage = "{0:6.2f}%".format(
            min(100.0, blocks * blocksize * 100.0 / total))
        sys.stdout.write(BACKSPACE * len(percentage) + percentage)

    # single-level mkdir -p functionality
    def mkdir_parents(directory):
        try:
            os.mkdir(directory)
        except OSError as e:
            if os.path.isdir(directory) and e.errno == errno.EEXIST: pass
            else: raise

    seconds = get_seconds(conn.headers, 'X-Archive-Orig-last-modified')
    #print conn.headers['X-Archive-Orig-last-modified']
    directory = time.strftime('%Y%m%d-%H%M%S', time.gmtime(seconds))
    if not dry_run:
        mkdir_parents(directory)

    path = urlparse.urlparse(url).path
    filename = os.path.basename(path)
    base, ext = os.path.splitext(filename)
    destination = "%s/%s%s" % (directory, base, ext)

    if not dry_run:
        local_filename, headers = urllib.urlretrieve(url, destination,
                                                     progress_ticker)
        os.utime(local_filename, (seconds, ) * 2)
        os.utime(directory, (seconds, ) * 2)

    try:
        # Mark Nottingham
        # https://gist.githubusercontent.com/mnot/210535/raw/1755bb24a4f8796d55c280f1c50d0910f5522fb2/link_header.py
        import link_header

        links = link_header.parse_link_value(conn.headers['Link'])
        for k, v in links.items():
            if 'rel' in v and 'prev memento' in v['rel']:
                sys.stdout.write('\n%10s %s\r' % ('', k))
                fetch(url=k, dry_run=dry_run)
    except:
        raise
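In Memento-style responses the rel attribute can carry several space-separated relation types (for example rel="prev memento"), and parse_link_value keeps rel as a single string, which is why the loop above uses a substring test rather than an equality check. A minimal sketch of that shape (the archived URL is illustrative):

hdr = '<http://web.archive.org/web/20110101000000/http://example.com/>; rel="prev memento"'
links = link_header.parse_link_value(hdr)
# roughly: {'http://web.archive.org/web/20110101000000/http://example.com/': {'rel': 'prev memento'}}
# so the `'prev memento' in v['rel']` test above is a plain substring check on that string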
Example #4
def fetch(url, dry_run=False):

    request = urllib2.Request(url)
    request.get_method = lambda: 'HEAD'
    conn = urllib2.urlopen(request)

    # Parse out Last-Modified date in a reliable way
    def get_seconds(headers, modified='Last-Modified'):
        date_tuple = headers.getdate_tz(modified)
        epoch_seconds = rfc822.mktime_tz(date_tuple)
        return epoch_seconds

    def progress_ticker(blocks, blocksize, total):
        BACKSPACE = '\x08'
        percentage = "{0:6.2f}%".format(min(100.0,blocks*blocksize*100.0/total))
        sys.stdout.write(BACKSPACE * len(percentage) + percentage)

    # single-level mkdir -p functionality
    def mkdir_parents(directory):
        try: os.mkdir(directory)
        except OSError as e:
            if os.path.isdir(directory) and e.errno == errno.EEXIST: pass
            else: raise

    seconds = get_seconds(conn.headers, 'X-Archive-Orig-last-modified')
    #print conn.headers['X-Archive-Orig-last-modified']
    directory = time.strftime('%Y%m%d-%H%M%S', time.gmtime(seconds))
    if not dry_run:
        mkdir_parents(directory)

    path = urlparse.urlparse(url).path
    filename = os.path.basename(path)
    base, ext = os.path.splitext(filename)
    destination = "%s/%s%s" % (directory, base, ext)

    if not dry_run:
        local_filename, headers = urllib.urlretrieve(url, destination, progress_ticker)
        os.utime(local_filename, (seconds,) * 2)
        os.utime(directory, (seconds,) * 2)

    try:
        # Mark Nottingham
        # https://gist.githubusercontent.com/mnot/210535/raw/1755bb24a4f8796d55c280f1c50d0910f5522fb2/link_header.py
        import link_header

        links = link_header.parse_link_value(conn.headers['Link'])
        for k,v in links.items():
            if 'rel' in v and 'prev memento' in v['rel']:
                sys.stdout.write('\n%10s %s\r' % ('', k))
                fetch(url=k, dry_run=dry_run)
    except:
        raise
Example #5
def github_api_all(url):
    """
    Fetch all results, not simply the first page of results, for the given URL.
    """

    # Get our initial response
    response = requests.get(url, auth=AUTH)
    response_json = response.json()

    # Parse the links header
    # A single page of results carries no Link header, so fall back to ''.
    parsed_links = link_header.parse_link_value(response.headers.get('link', ''))
    links = dict([(parsed_links[l]['rel'], l) for l in parsed_links])
    while 'next' in links:
        # While we have a 'next' link, fetch it and add its response to the json
        # object.
        page_response = requests.get(links['next'], auth=AUTH)
        response_json += page_response.json()

        parsed_links = link_header.parse_link_value(
            page_response.headers.get('link', ''))
        links = dict([(parsed_links[l]['rel'], l) for l in parsed_links])

    return response_json
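A hypothetical invocation, assuming AUTH is a (username, token) pair defined elsewhere in the module and that the endpoint returns a JSON list (which the += above requires); the URL is illustrative:

AUTH = ('someuser', 'personal-access-token')  # assumption; the original module supplies its own AUTH
all_issues = github_api_all(
    'https://api.github.com/repos/cappuccino/cappuccino/issues?state=open&per_page=100')
print(len(all_issues))  # every page concatenated into a single list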
Example #6
 def _visit_page_link_headers(self,url,visited,ids):
     url = self.url(url) # make URL absolute
     r = self.get(url)
     assert_equal(r.status_code,200)
     data = r.json()
     id_list = [item['resourceId'] for item in data]
     ids += id_list
     self.log.info('Request %s returned %s' % (url,str(id_list)))
     visited.append(url)
     links = link_header.parse_link_value(r.headers['link'])
     self.log.info('Parsed link headers : %s' % str(links))
     for link in links.keys():
         assert_true('rel' in links[link])
         assert_true(links[link]['rel'] in ['current','next','prev','last'])
         if link not in visited:
             self._visit_page_link_headers(link,visited,ids)
Example #7
 def _visit_page_link_headers(self, url, visited, ids):
     url = self.url(url)  # make URL absolute
     r = self.get(url)
     assert_equal(r.status_code, 200)
     data = r.json()
     id_list = [item['resourceId'] for item in data]
     ids += id_list
     self.log.info('Request %s returned %s' % (url, str(id_list)))
     visited.append(url)
     links = link_header.parse_link_value(r.headers['link'])
     self.log.info('Parsed link headers : %s' % str(links))
     for link in links.keys():
         assert_true('rel' in links[link])
         assert_true(
             links[link]['rel'] in ['current', 'next', 'prev', 'last'])
         if link not in visited:
             self._visit_page_link_headers(link, visited, ids)
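A hypothetical call from a test method of the same class, showing how the visited/ids accumulators are seeded; the '/resources' endpoint is illustrative, not taken from the original suite:

visited, ids = [], []
self._visit_page_link_headers('/resources', visited, ids)
# afterwards `visited` holds every page URL that was fetched and
# `ids` holds the resourceId values collected across all pages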
Example #8
    def update_from_response(self, url, response, content):
        r = super(GitHubRemoteObject, self).update_from_response(url, response, content)

        self._rate_limit = (response.get('x-ratelimit-remaining'), response.get('x-ratelimit-limit'))

        # GitHub sends paging information as a response header like this:
        # <https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=open>; rel="next", <https://api.github.com/repos/cappuccino/cappuccino/issues?page=11&state=open>; rel="last"
        #
        # <https://api.github.com/repos/cappuccino/cappuccino/issues?page=2&state=closed>; rel="next", <https://api.github.com/repos/cappuccino/cappuccino/issues?page=51&state=closed>; rel="last"

        links = response.get('link')

        self._next_page_url = None
        self._last_page_url = None
        if links:
            links = [parse_link_value(link) for link in links.split(',')]
            for link in links:
                for link_url, attrs in link.items():
                    if attrs['rel'] == 'next':
                        self._next_page_url = link_url
                    elif attrs['rel'] == 'last':
                        self._last_page_url = link_url

        return r
Example #9
    # Retrieve pages of repos till rate limit is reached
    r = api_request(next_repos_url)

    if requests_left < 10 or r.status_code != 200:
        wait_for_rate_limit_reset()

    logger.info("Request status: %s" % r.headers['status'])

    while r.ok or r.status_code == 401:
        repos_json = json.loads(r.text or r.content)

        logger.info("Requests left: %s" % requests_left)

        # Get link for next page of repos
        links = link_header.parse_link_value(r.headers['link'])

        for link_url in links:
            if links[link_url]['rel'] == 'next':
                next_repos_url = link_url

        # Process this page of repos
        for repo in repos_json:

            if requests_left < 10 or r.status_code != 200:
                wait_for_rate_limit_reset()

            # Log the effort to store this repo's information:
            logger.info("Storing repository #%s: %s (Fork? %s)" % \
                            (repo['id'], repo['full_name'], repo['fork']))
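The snippet calls a wait_for_rate_limit_reset() helper that is not shown. A minimal sketch of what such a helper might do, assuming the standard GitHub X-RateLimit-Reset header (UTC epoch seconds) is available on the most recent response; the original is invoked with no arguments, so this signature is an assumption:

import time

def wait_for_rate_limit_reset(last_response):
    # Sleep until just past the moment GitHub says the quota resets.
    reset_at = int(last_response.headers.get('X-RateLimit-Reset', 0))
    time.sleep(max(0, reset_at - int(time.time())) + 1)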
Example #10
    # Retrieve pages of repos till rate limit is reached
    r = api_request(next_repos_url)

    if requests_left < 10 or r.status_code != 200:
        wait_for_rate_limit_reset()

    logger.info("Request status: %s" % r.headers['status'])

    while (r.ok or r.status_code == 401):
        repos_json = json.loads(r.text or r.content)

        logger.info("Requests left: %s" % requests_left)

        # Get link for next page of repos
        links = link_header.parse_link_value(r.headers['link'])

        for link_url in links:
            if (links[link_url]['rel'] == 'next'):
                next_repos_url = link_url

        # Process this page of repos
        for repo in repos_json:

            if requests_left < 10 or r.status_code != 200:
                wait_for_rate_limit_reset()

            # Log the effort to store this repo's information:
            logger.info("Storing repository #%s: %s (Fork? %s)" % \
                            (repo['id'], repo['full_name'], repo['fork']))