Пример #1
0
 def __init__(self, max_redirects=3, max_retries=3, retry_delay=0,
              cookiejar=None, headers=None, **kwargs):
     """ Store the retry/redirect limits, build the default header set
         and create the client pool.

         max_redirects and max_retries are coerced with int();
         retry_delay and cookiejar are stored as given. Any additional
         keyword arguments are handed to HTTPClientPool unchanged.
     """
     # Coerce the limits first so invalid values fail before anything
     # else is set up
     self.max_redirects = int(max_redirects)
     self.max_retries = int(max_retries)
     self.retry_delay = retry_delay
     self.cookiejar = cookiejar

     # Start from a copy of the client's defaults and overlay the
     # caller-supplied headers on top of it
     merged_headers = HTTPClient.DEFAULT_HEADERS.copy()
     if headers:
         merged_headers.update(headers)
     self.default_headers = merged_headers

     self.clientpool = HTTPClientPool(**kwargs)
Пример #2
0
 def __init__(self, max_redirects=3, max_retries=3, retry_delay=0, cookiejar=None, headers=None, **kwargs):
     """ Configure the agent: limits, default headers, cookie jar and
         the underlying client pool (which receives **kwargs as is).
     """
     # Integer limits; int() raises here for values it cannot convert
     self.max_redirects = int(max_redirects)
     self.max_retries = int(max_retries)
     self.retry_delay = retry_delay

     # Per-agent header defaults: a copy of HTTPClient.DEFAULT_HEADERS,
     # extended by the optional caller-supplied headers
     base_headers = HTTPClient.DEFAULT_HEADERS.copy()
     if headers:
         base_headers.update(headers)
     self.default_headers = base_headers

     self.cookiejar = cookiejar
     self.clientpool = HTTPClientPool(**kwargs)
Пример #3
0
class UserAgent(object):
    response_type = CompatResponse
    request_type = CompatRequest
    valid_response_codes = set([200, 206, 301, 302, 303, 307])

    def __init__(self, max_redirects=3, max_retries=3, retry_delay=0,
                 cookiejar=None, headers=None, **kwargs):
        self.max_redirects = int(max_redirects)
        self.max_retries = int(max_retries)
        self.retry_delay = retry_delay
        self.default_headers = HTTPClient.DEFAULT_HEADERS.copy()
        if headers:
            self.default_headers.update(headers)
        self.cookiejar = cookiejar
        self.clientpool = HTTPClientPool(**kwargs)

    def _make_request(self, url, method='GET', headers=None, payload=None):
        req_headers = self.default_headers.copy()
        if headers:
            req_headers.update(headers)
        if payload:
            # Adjust headers depending on payload content
            content_type = req_headers.get('content-type', None)
            if not content_type and isinstance(payload, dict):
                req_headers['content-type'] = "application/x-www-form-urlencoded; charset=utf-8"
                payload = urlencode(payload)
                req_headers['content-length'] = len(payload)
            elif not content_type:
                req_headers['content-type'] = 'application/octet-stream'
                payload = payload if isinstance(payload, basestring) else str(payload)
                req_headers['content-length'] = len(payload)
            elif content_type.startswith("multipart/form-data"):
                # See restkit for some example implementation
                # TODO: Implement it
                raise NotImplementedError
            else:
                payload = payload if isinstance(payload, basestring) else str(payload)
                req_headers['content-length'] = len(payload)
        return CompatRequest(url, method=method, headers=req_headers, payload=payload)

    def _urlopen(self, request):
        client = self.clientpool.get_client(request.url_split)
        resp = client.request(request.method, request.url_split.request_uri,
                              body=request.payload, headers=request.headers)
        return CompatResponse(resp, request=request, sent_request=resp._sent_request)

    def _verify_status(self, status_code, url=None):
        """ Hook for subclassing 
        """
        if status_code not in self.valid_response_codes:
            raise BadStatusCode(url, code=status_code)

    def _handle_error(self, e, url=None):
        """ Hook for subclassing. Raise the error to interrupt further retrying,
            return it to continue retries and save the error, when retries
            exceed the limit.
            Temporary errors should be swallowed here for automatic retries.
        """
        if isinstance(e, (socket.timeout, gevent.Timeout)):
            return e
        elif isinstance(e, (socket.error, gevent.dns.DNSError)) and \
                e.errno in set([errno.ETIMEDOUT, errno.ENOLINK, errno.ENOENT, errno.EPIPE]):
            return e
        elif isinstance(e, ssl.SSLError) and 'read operation timed out' in str(e):
            return e
        elif isinstance(e, EmptyResponse):
            return e
        raise e, None, sys.exc_info()[2]

    def _handle_retries_exceeded(self, url, last_error=None):
        """ Hook for subclassing 
        """
        raise RetriesExceeded(url, self.max_retries, original=last_error)

    def urlopen(self, url, method='GET', response_codes=valid_response_codes,
                headers=None, payload=None, to_string=False, debug_stream=None, **kwargs):
        """ Open an URL, do retries and redirects and verify the status code 
        """
        # POST or GET parameters can be passed in **kwargs
        if kwargs:
            if not payload:
                payload = kwargs
            elif isinstance(payload, dict):
                payload.update(kwargs)

        req = self._make_request(url, method=method, headers=headers, payload=payload)
        for retry in xrange(self.max_retries):
            if retry > 0 and self.retry_delay:
                # Don't wait the first time and skip if no delay specified
                gevent.sleep(self.retry_delay)
            for _ in xrange(self.max_redirects):
                if self.cookiejar is not None:
                    # Check against None to avoid issues with empty cookiejars
                    self.cookiejar.add_cookie_header(req)

                try:
                    resp = self._urlopen(req)
                except gevent.GreenletExit:
                    raise
                except BaseException as e:
                    e.request = req
                    e = self._handle_error(e, url=req.url)
                    break # Continue with next retry

                # We received a response
                if debug_stream is not None:
                    debug_stream.write(self._conversation_str(url, resp) + '\n\n')

                try:
                    self._verify_status(resp.status_code, url=req.url)
                except Exception as e:
                    # Basic transmission successful, but not the wished result
                    # Let's collect some debug info
                    e.response = resp
                    e.request = req
                    e.http_log = self._conversation_str(url, resp)
                    resp.release()
                    e = self._handle_error(e, url=req.url)
                    break # Continue with next retry

                if self.cookiejar is not None:
                    # Check against None to avoid issues with empty cookiejars
                    self.cookiejar.extract_cookies(resp, req)

                redirection = resp.headers.get('location')
                if resp.status_code in set([301, 302, 303, 307]) and redirection:
                    resp.release()
                    req.set_url(req.url_split.redirect(redirection))
                    req.method = 'GET' if resp.status_code in set([302, 303]) else req.method
                    for item in ('content-length', 'content-type', 'content-encoding', 'cookie', 'cookie2'):
                        req.headers.discard(item)
                    req.payload = None
                    continue

                if not to_string:
                    return resp
                else:
                    # to_string added as parameter, to handle empty response
                    # bodies as error and continue retries automatically
                    try:
                        ret = resp.content
                    except Exception as e:
                        e = self._handle_error(e, url=url)
                        break
                    else:
                        if not ret:
                            e = EmptyResponse(url, "Empty response body received")
                            e = self._handle_error(e, url=url)
                            break
                        else:
                            return ret
            else:
                e = RetriesExceeded(url, "Redirection limit reached (%s)" % self.max_redirects)
                e = self._handle_error(e, url=url)
        else:
            return self._handle_retries_exceeded(url, last_error=e)

    @classmethod
    def _conversation_str(cls, url, resp):
        header_str = '\n'.join('%s: %s' % item for item in resp.headers.pretty_items())
        ret = 'REQUEST: ' + url + '\n' + resp._sent_request + '\n\n'
        ret += 'RESPONSE: ' + resp._response.version + ' ' + \
                           str(resp.status_code) + '\n' + \
                           header_str + '\n\n' + resp.content
        return ret

    def download(self, url, fpath, chunk_size=16 * 1024, resume=False, **kwargs):
        kwargs.pop('to_string', None)
        headers = kwargs.pop('headers', {})
        headers['Connection'] = 'Keep-Alive'
        if resume and os.path.isfile(fpath):
            offset = os.path.getsize(fpath)
        else:
            offset = 0

        for _ in xrange(self.max_retries):
            if offset:
                headers['Range'] = 'bytes=%d-' % offset
                resp = self.urlopen(url, headers=headers, **kwargs)
                cr = resp.headers.get('Content-Range')
                if resp.status_code != 206 or not cr or not cr.startswith('bytes') or \
                            not cr.split(None, 1)[1].startswith(str(offset)):
                    resp.release()
                    offset = 0
            if not offset:
                headers.pop('Range', None)
                resp = self.urlopen(url, headers=headers, **kwargs)

            with open(fpath, 'ab' if offset else 'wb') as f:
                if offset:
                    f.seek(offset, os.SEEK_SET)
                try:
                    data = resp.read(chunk_size)
                    with resp:
                        while data:
                            f.write(data)
                            data = resp.read(chunk_size)
                except BaseException as e:
                    self._handle_error(e, url=url)
                    if resp.headers.get('accept-ranges') == 'bytes':
                        # Only if this header is set, we can fall back to partial download
                        offset = f.tell()
                    continue
            # All done, break outer loop
            break
        else:
            self._handle_retries_exceeded(url, last_error=e)
        return resp

    def close(self):
        self.clientpool.close()
Пример #4
0
class UserAgent(object):
    response_type = CompatResponse
    request_type = CompatRequest
    valid_response_codes = set([200, 206, 301, 302, 303, 307])

    def __init__(self, max_redirects=3, max_retries=3, retry_delay=0, cookiejar=None, headers=None, **kwargs):
        self.max_redirects = int(max_redirects)
        self.max_retries = int(max_retries)
        self.retry_delay = retry_delay
        self.default_headers = HTTPClient.DEFAULT_HEADERS.copy()
        if headers:
            self.default_headers.update(headers)
        self.cookiejar = cookiejar
        self.clientpool = HTTPClientPool(**kwargs)

    def _make_request(self, url, method="GET", headers=None, payload=None):
        req_headers = self.default_headers.copy()
        if headers:
            req_headers.update(headers)
        if payload:
            # Adjust headers depending on payload content
            content_type = req_headers.get("content-type", None)
            if not content_type and isinstance(payload, dict):
                req_headers["content-type"] = "application/x-www-form-urlencoded; charset=utf-8"
                payload = urlencode(payload)
                req_headers["content-length"] = len(payload)
            elif not content_type:
                req_headers["content-type"] = "application/octet-stream"
                payload = payload if isinstance(payload, basestring) else str(payload)
                req_headers["content-length"] = len(payload)
            elif content_type.startswith("multipart/form-data"):
                # See restkit for some example implementation
                # TODO: Implement it
                raise NotImplementedError
            else:
                payload = payload if isinstance(payload, basestring) else str(payload)
                req_headers["content-length"] = len(payload)
        return CompatRequest(url, method=method, headers=req_headers, payload=payload)

    def _urlopen(self, request):
        client = self.clientpool.get_client(request.url_split)
        resp = client.request(
            request.method, request.url_split.request_uri, body=request.payload, headers=request.headers
        )
        return CompatResponse(resp, request=request, sent_request=resp._sent_request)

    def _verify_status(self, status_code, url=None):
        """ Hook for subclassing 
        """
        if status_code not in self.valid_response_codes:
            raise BadStatusCode(url, code=status_code)

    def _handle_error(self, e, url=None):
        """ Hook for subclassing. Raise the error to interrupt further retrying,
            return it to continue retries and save the error, when retries
            exceed the limit.
            Temporary errors should be swallowed here for automatic retries.
        """
        if isinstance(e, (socket.timeout, gevent.Timeout)):
            return e
        elif isinstance(e, (socket.error, gevent.dns.DNSError)) and e.errno in set(
            [errno.ETIMEDOUT, errno.ENOLINK, errno.ENOENT, errno.EPIPE]
        ):
            return e
        elif isinstance(e, ssl.SSLError) and "read operation timed out" in str(e):
            return e
        elif isinstance(e, EmptyResponse):
            return e
        raise e, None, sys.exc_info()[2]

    def _handle_retries_exceeded(self, url, last_error=None):
        """ Hook for subclassing 
        """
        raise RetriesExceeded(url, self.max_retries, original=last_error)

    def urlopen(
        self,
        url,
        method="GET",
        response_codes=valid_response_codes,
        headers=None,
        payload=None,
        to_string=False,
        debug_stream=None,
        **kwargs
    ):
        """ Open an URL, do retries and redirects and verify the status code 
        """
        # POST or GET parameters can be passed in **kwargs
        if kwargs:
            if not payload:
                payload = kwargs
            elif isinstance(payload, dict):
                payload.update(kwargs)

        req = self._make_request(url, method=method, headers=headers, payload=payload)
        for retry in xrange(self.max_retries):
            if retry > 0 and self.retry_delay:
                # Don't wait the first time and skip if no delay specified
                gevent.sleep(self.retry_delay)
            for _ in xrange(self.max_redirects):
                if self.cookiejar is not None:
                    # Check against None to avoid issues with empty cookiejars
                    self.cookiejar.add_cookie_header(req)

                try:
                    resp = self._urlopen(req)
                except gevent.GreenletExit:
                    raise
                except BaseException as e:
                    e.request = req
                    e = self._handle_error(e, url=req.url)
                    break  # Continue with next retry

                # We received a response
                if debug_stream is not None:
                    debug_stream.write(self._conversation_str(url, resp) + "\n\n")

                try:
                    self._verify_status(resp.status_code, url=req.url)
                except Exception as e:
                    # Basic transmission successful, but not the wished result
                    # Let's collect some debug info
                    e.response = resp
                    e.request = req
                    e.http_log = self._conversation_str(url, resp)
                    e = self._handle_error(e, url=req.url)
                    break  # Continue with next retry

                if self.cookiejar is not None:
                    # Check against None to avoid issues with empty cookiejars
                    self.cookiejar.extract_cookies(resp, req)

                redirection = resp.headers.get("location")
                if resp.status_code in set([301, 302, 303, 307]) and redirection:
                    resp._response.release()
                    req.set_url(req.url_split.redirect(redirection))
                    req.method = "GET" if resp.status_code in set([302, 303]) else req.method
                    for item in ("content-length", "content-type", "content-encoding", "cookie", "cookie2"):
                        req.headers.discard(item)
                    req.payload = None
                    continue

                if not to_string:
                    return resp
                else:
                    # to_string added as parameter, to handle empty response
                    # bodies as error and continue retries automatically
                    try:
                        ret = resp.content
                    except Exception as e:
                        e = self._handle_error(e, url=url)
                        break
                    else:
                        if not ret:
                            e = EmptyResponse(url, "Empty response body received")
                            e = self._handle_error(e, url=url)
                            break
                        else:
                            return ret
            else:
                e = RetriesExceeded(url, "Redirection limit reached (%s)" % self.max_redirects)
                e = self._handle_error(e, url=url)
        else:
            return self._handle_retries_exceeded(url, last_error=e)

    @classmethod
    def _conversation_str(cls, url, resp):
        header_str = "\n".join("%s: %s" % item for item in resp.headers.pretty_items())
        ret = "REQUEST: " + url + "\n" + resp._sent_request + "\n\n"
        ret += (
            "RESPONSE: "
            + resp._response.version
            + " "
            + str(resp.status_code)
            + "\n"
            + header_str
            + "\n\n"
            + resp.content
        )
        return ret

    def download(self, url, fpath, chunk_size=16 * 1024, resume=False, **kwargs):
        kwargs.pop("to_string", None)
        headers = kwargs.pop("headers", {})
        headers["Connection"] = "Keep-Alive"
        if resume and os.path.isfile(fpath):
            offset = os.path.getsize(fpath)
        else:
            offset = 0

        for _ in xrange(self.max_retries):
            if offset:
                headers["Range"] = "bytes=%d-" % offset
                resp = self.urlopen(url, headers=headers, **kwargs)
                cr = resp.headers.get("Content-Range")
                if (
                    resp.status_code != 206
                    or not cr
                    or not cr.startswith("bytes")
                    or not cr.split(None, 1)[1].startswith(str(offset))
                ):
                    resp._response.release()
                    offset = 0
            if not offset:
                headers.pop("Range", None)
                resp = self.urlopen(url, headers=headers, **kwargs)

            with open(fpath, "ab" if offset else "wb") as f:
                if offset:
                    f.seek(offset, os.SEEK_SET)
                try:
                    data = resp.read(chunk_size)
                    while data:
                        f.write(data)
                        data = resp.read(chunk_size)
                except BaseException as e:
                    self._handle_error(e, url=url)
                    if resp.headers.get("accept-ranges") == "bytes":
                        # Only if this header is set, we can fall back to partial download
                        offset = f.tell()
                    continue
            # All done, break outer loop
            break
        else:
            self._handle_retries_exceeded(url, last_error=e)
        return resp

    def close(self):
        self.clientpool.close()