Example #1
    def fetch(self, request, **kwargs):
        """Executes an HTTPRequest, returning an HTTPResponse.

        If an error occurs during the fetch, we raise an HTTPError.
        """
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        buffer = cStringIO.StringIO()
        headers = httputil.HTTPHeaders()
        try:
            _curl_setup_request(self._curl, request, buffer, headers)
            self._curl.perform()
            code = self._curl.getinfo(pycurl.HTTP_CODE)
            effective_url = self._curl.getinfo(pycurl.EFFECTIVE_URL)
            buffer.seek(0)
            response = HTTPResponse(request=request,
                                    code=code,
                                    headers=headers,
                                    buffer=buffer,
                                    effective_url=effective_url)
            if code < 200 or code >= 300:
                raise HTTPError(code, response=response)
            return response
        except pycurl.error, e:
            buffer.close()
            raise CurlError(*e)
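
A minimal usage sketch for this blocking fetch(), assuming the method belongs to Tornado's synchronous httpclient.HTTPClient and that HTTPError is raised for non-2xx responses as shown above:

from tornado import httpclient

client = httpclient.HTTPClient()
try:
    response = client.fetch("http://www.google.com/")  # string URL is wrapped in an HTTPRequest
    print response.headers["Content-Type"]             # response.headers is an httputil.HTTPHeaders
except httpclient.HTTPError, e:
    print "Error:", e                                   # non-2xx status codes raise HTTPError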
Example #2
    def _process_queue(self):
        while True:
            started = 0
            while self._free_list and self._requests:
                started += 1
                curl = self._free_list.pop()
                (request, callback) = self._requests.popleft()
                curl.info = {
                    "headers": httputil.HTTPHeaders(),
                    "buffer": cStringIO.StringIO(),
                    "request": request,
                    "callback": callback,
                    "start_time": time.time(),
                }
                # Disable IPv6 to mitigate the effects of this bug
                # on curl versions <= 7.21.0
                # http://sourceforge.net/tracker/?func=detail&aid=3017819&group_id=976&atid=100976
                if pycurl.version_info()[2] <= 0x71500:  # 7.21.0
                    curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
                _curl_setup_request(curl, request, curl.info["buffer"],
                                    curl.info["headers"])
                self._multi.add_handle(curl)

            if not started:
                break
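
The version guard above relies on libcurl packing its version number as 0xXXYYZZ (major, minor, patch). A quick sketch of that encoding, assuming pycurl.version_info()[2] carries the packed libcurl version:

import pycurl

major, minor, patch = 7, 21, 0
packed = (major << 16) | (minor << 8) | patch
assert packed == 0x71500                     # the constant compared against above
print pycurl.version_info()[2] <= 0x71500    # True only when the linked libcurl is <= 7.21.0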
Example #3
    def __init__(self, method, uri, version="HTTP/1.0", headers=None,
                 body=None, remote_ip=None, protocol=None, host=None,
                 files=None, connection=None):
        self.method = method
        self.uri = uri
        self.version = version
        self.headers = headers or httputil.HTTPHeaders()
        self.body = body or ""
        if connection and connection.xheaders:
            # Squid uses X-Forwarded-For, others use X-Real-Ip
            self.remote_ip = self.headers.get(
                "X-Real-Ip", self.headers.get("X-Forwarded-For", remote_ip))
            self.protocol = self.headers.get("X-Scheme", protocol) or "http"
        else:
            self.remote_ip = remote_ip
            self.protocol = protocol or "http"
        self.host = host or self.headers.get("Host") or "127.0.0.1"
        self.files = files or {}
        self.connection = connection
        self._start_time = time.time()
        self._finish_time = None

        scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
        self.path = path
        self.query = query
        arguments = cgi.parse_qs(query)
        self.arguments = {}
        for name, values in arguments.iteritems():
            values = [v for v in values if v]
            if values: self.arguments[name] = values
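
A small sketch of the proxy-header fallback used above: X-Real-Ip wins, then X-Forwarded-For, then the socket-level remote_ip.

from tornado import httputil

headers = httputil.HTTPHeaders()
headers["X-Forwarded-For"] = "10.0.0.1"
remote_ip = "192.0.2.7"                       # what the socket itself reports
resolved = headers.get("X-Real-Ip", headers.get("X-Forwarded-For", remote_ip))
print resolved                                # "10.0.0.1", the proxy-supplied address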
Example #4
    def __init__(self, url, method="GET", headers=None, body=None,
                 auth_username=None, auth_password=None,
                 connect_timeout=20.0, request_timeout=20.0,
                 if_modified_since=None, follow_redirects=True,
                 max_redirects=5, user_agent=None, use_gzip=True,
                 network_interface=None, streaming_callback=None,
                 header_callback=None, prepare_curl_callback=None,
                 allow_nonstandard_methods=False):
        if headers is None:
            headers = httputil.HTTPHeaders()
        if if_modified_since:
            timestamp = calendar.timegm(if_modified_since.utctimetuple())
            headers["If-Modified-Since"] = email.utils.formatdate(
                timestamp, localtime=False, usegmt=True)
        if "Pragma" not in headers:
            headers["Pragma"] = ""
        self.url = _utf8(url)
        self.method = method
        self.headers = headers
        self.body = body
        self.auth_username = _utf8(auth_username)
        self.auth_password = _utf8(auth_password)
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        self.follow_redirects = follow_redirects
        self.max_redirects = max_redirects
        self.user_agent = user_agent
        self.use_gzip = use_gzip
        self.network_interface = network_interface
        self.streaming_callback = streaming_callback
        self.header_callback = header_callback
        self.prepare_curl_callback = prepare_curl_callback
        self.allow_nonstandard_methods = allow_nonstandard_methods
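
A short sketch of how an if_modified_since datetime becomes the If-Modified-Since header, using the same calendar and email.utils calls as the constructor above:

import calendar
import datetime
import email.utils

if_modified_since = datetime.datetime(2010, 7, 1, 12, 0, 0)
timestamp = calendar.timegm(if_modified_since.utctimetuple())
print email.utils.formatdate(timestamp, localtime=False, usegmt=True)
# -> Thu, 01 Jul 2010 12:00:00 GMT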
Example #5
    def __init__(self, environ):
        """Parses the given WSGI environ to construct the request."""
        self.method = environ["REQUEST_METHOD"]
        self.path = urllib.quote(environ.get("SCRIPT_NAME", ""))
        self.path += urllib.quote(environ.get("PATH_INFO", ""))
        self.uri = self.path
        self.arguments = {}
        self.query = environ.get("QUERY_STRING", "")
        if self.query:
            self.uri += "?" + self.query
            arguments = cgi.parse_qs(self.query)
            for name, values in arguments.iteritems():
                values = [v for v in values if v]
                if values:
                    self.arguments[name] = values
        self.version = "HTTP/1.1"
        self.headers = httputil.HTTPHeaders()
        if environ.get("CONTENT_TYPE"):
            self.headers["Content-Type"] = environ["CONTENT_TYPE"]
        if environ.get("CONTENT_LENGTH"):
            self.headers["Content-Length"] = int(environ["CONTENT_LENGTH"])
        for key in environ:
            if key.startswith("HTTP_"):
                self.headers[key[5:].replace("_", "-")] = environ[key]
        if self.headers.get("Content-Length"):
            self.body = environ["wsgi.input"].read()
        else:
            self.body = ""
        self.protocol = environ["wsgi.url_scheme"]
        self.remote_ip = environ.get("REMOTE_ADDR", "")
        if environ.get("HTTP_HOST"):
            self.host = environ["HTTP_HOST"]
        else:
            self.host = environ["SERVER_NAME"]

        # Parse request body
        self.files = {}
        content_type = self.headers.get("Content-Type", "")
        if content_type.startswith("application/x-www-form-urlencoded"):
            for name, values in cgi.parse_qs(self.body).iteritems():
                self.arguments.setdefault(name, []).extend(values)
        elif content_type.startswith("multipart/form-data"):
            if 'boundary=' in content_type:
                boundary = content_type.split('boundary=', 1)[1]
                if boundary:
                    self._parse_mime_body(boundary)
            else:
                logging.warning("Invalid multipart/form-data")

        self._start_time = time.time()
        self._finish_time = None
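
A minimal sketch of driving this constructor with a hand-built WSGI environ; the class name HTTPRequest here is an assumption about the surrounding code:

import cStringIO

environ = {
    "REQUEST_METHOD": "GET",
    "SCRIPT_NAME": "",
    "PATH_INFO": "/search",
    "QUERY_STRING": "q=tornado&empty=",
    "wsgi.url_scheme": "http",
    "wsgi.input": cStringIO.StringIO(""),
    "HTTP_HOST": "example.com",
    "HTTP_USER_AGENT": "test-agent",
    "REMOTE_ADDR": "127.0.0.1",
}
request = HTTPRequest(environ)                # assumed name of the class defined above
print request.path                            # "/search"
print request.arguments                       # {"q": ["tornado"]}; blank values are dropped
print request.headers["User-Agent"]           # header keys are normalized by HTTPHeaders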
Example #6
class AsyncHTTPClient(object):
    """An non-blocking HTTP client backed with pycurl.

    Example usage:

        import ioloop

        def handle_request(response):
            if response.error:
                print "Error:", response.error
            else:
                print response.body
            ioloop.IOLoop.instance().stop()

        http_client = httpclient.AsyncHTTPClient()
        http_client.fetch("http://www.google.com/", handle_request)
        ioloop.IOLoop.instance().start()

    fetch() can take a string URL or an HTTPRequest instance, which offers
    more options, like executing POST/PUT/DELETE requests.

    The keyword argument max_clients to the AsyncHTTPClient constructor
    determines the maximum number of simultaneous fetch() operations that
    can execute in parallel on each IOLoop.
    """
    _ASYNC_CLIENTS = weakref.WeakKeyDictionary()

    def __new__(cls,
                io_loop=None,
                max_clients=10,
                max_simultaneous_connections=None):
        # There is one client per IOLoop since they share curl instances
        io_loop = io_loop or ioloop.IOLoop.instance()
        if io_loop in cls._ASYNC_CLIENTS:
            return cls._ASYNC_CLIENTS[io_loop]
        else:
            instance = super(AsyncHTTPClient, cls).__new__(cls)
            instance.io_loop = io_loop
            instance._multi = pycurl.CurlMulti()
            instance._curls = [
                _curl_create(max_simultaneous_connections)
                for i in xrange(max_clients)
            ]
            instance._free_list = instance._curls[:]
            instance._requests = collections.deque()
            instance._fds = {}
            instance._events = {}
            instance._added_perform_callback = False
            instance._timeout = None
            instance._closed = False
            cls._ASYNC_CLIENTS[io_loop] = instance
            return instance

    def close(self):
        """Destroys this http client, freeing any file descriptors used.
        Not needed in normal use, but may be helpful in unittests that
        create and destroy http clients.  No other methods may be called
        on the AsyncHTTPClient after close().
        """
        del AsyncHTTPClient._ASYNC_CLIENTS[self.io_loop]
        for curl in self._curls:
            curl.close()
        self._multi.close()
        self._closed = True

    def fetch(self, request, callback, **kwargs):
        """Executes an HTTPRequest, calling callback with an HTTPResponse.

        If an error occurs during the fetch, the HTTPResponse given to the
        callback has a non-None error attribute that contains the exception
        encountered during the request. You can call response.reraise() to
        throw the exception (if any) in the callback.
        """
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        self._requests.append((request, callback))
        self._add_perform_callback()

    def _add_perform_callback(self):
        if not self._added_perform_callback:
            self.io_loop.add_callback(self._perform)
            self._added_perform_callback = True

    def _handle_events(self, fd, events):
        self._events[fd] = events
        self._add_perform_callback()

    def _handle_timeout(self):
        self._timeout = None
        self._perform()

    def _perform(self):
        self._added_perform_callback = False

        if self._closed:
            return

        while True:
            while True:
                ret, num_handles = self._multi.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            # Update the set of active file descriptors.  It is important
            # that this happen immediately after perform() because
            # fds that have been removed from fdset are free to be reused
            # in user callbacks.
            fds = {}
            (readable, writable, exceptable) = self._multi.fdset()
            for fd in readable:
                fds[fd] = fds.get(fd, 0) | 0x1 | 0x2
            for fd in writable:
                fds[fd] = fds.get(fd, 0) | 0x4
            for fd in exceptable:
                fds[fd] = fds.get(fd, 0) | 0x8 | 0x10

            if fds and max(fds.iterkeys()) > 900:
                # Libcurl has a bug in which it behaves unpredictably with
                # file descriptors greater than 1024.  (This is because
                # even though it uses poll() instead of select(), it still
                # uses FD_SET internally) Since curl opens its own file
                # descriptors we can't catch this problem when it happens,
                # and the best we can do is detect that it's about to
                # happen.  Exiting is a lousy way to handle this error,
                # but there's not much we can do at this point.  Exiting
                # (and getting restarted by whatever monitoring process
                # is handling crashed tornado processes) will at least
                # get things working again and hopefully bring the issue
                # to someone's attention.
                # If you run into this issue, you either have a file descriptor
                # leak or need to run more tornado processes (so that none
                # of them are handling more than 1000 simultaneous connections)
                print >> sys.stderr, "ERROR: File descriptor too high for libcurl. Exiting."
                logging.error("File descriptor too high for libcurl. Exiting.")
                sys.exit(1)

            for fd in self._fds:
                if fd not in fds:
                    try:
                        self.io_loop.remove_handler(fd)
                    except (OSError, IOError), e:
                        if e[0] != errno.ENOENT:
                            raise

            for fd, events in fds.iteritems():
                old_events = self._fds.get(fd, None)
                if old_events is None:
                    self.io_loop.add_handler(fd, self._handle_events, events)
                elif old_events != events:
                    try:
                        self.io_loop.update_handler(fd, events)
                    except (OSError, IOError), e:
                        if e[0] == errno.ENOENT:
                            self.io_loop.add_handler(fd, self._handle_events,
                                                     events)
                        else:
                            raise
            self._fds = fds

            # Handle completed fetches
            completed = 0
            while True:
                num_q, ok_list, err_list = self._multi.info_read()
                for curl in ok_list:
                    self._finish(curl)
                    completed += 1
                for curl, errnum, errmsg in err_list:
                    self._finish(curl, errnum, errmsg)
                    completed += 1
                if num_q == 0:
                    break

            # Start fetching new URLs
            started = 0
            while self._free_list and self._requests:
                started += 1
                curl = self._free_list.pop()
                (request, callback) = self._requests.popleft()
                curl.info = {
                    "headers": httputil.HTTPHeaders(),
                    "buffer": cStringIO.StringIO(),
                    "request": request,
                    "callback": callback,
                    "start_time": time.time(),
                }
                _curl_setup_request(curl, request, curl.info["buffer"],
                                    curl.info["headers"])
                self._multi.add_handle(curl)

            if not started and not completed:
                break
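
The literal bit masks in the fdset loop above appear to mirror the epoll event flags that the IOLoop handlers work with; a Linux-only illustration using the constants from the select module:

import select

assert select.EPOLLIN  == 0x1    # readable
assert select.EPOLLPRI == 0x2    # urgent data, folded into the "readable" mask
assert select.EPOLLOUT == 0x4    # writable
assert select.EPOLLERR == 0x8    # error
assert select.EPOLLHUP == 0x10   # hang-up, folded into the "error" mask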
Example #7
    def _perform(self):
        self._added_perform_callback = False

        if self._closed:
            return

        while True:
            while True:
                ret, num_handles = self._multi.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            # Update the set of active file descriptors.  It is important
            # that this happen immediately after perform() because
            # fds that have been removed from fdset are free to be reused
            # in user callbacks.
            fds = {}
            (readable, writable, exceptable) = self._multi.fdset()
            for fd in readable:
                fds[fd] = fds.get(fd, 0) | 0x1 | 0x2
            for fd in writable:
                fds[fd] = fds.get(fd, 0) | 0x4
            for fd in exceptable:
                fds[fd] = fds.get(fd, 0) | 0x8 | 0x10

            if fds and max(fds.iterkeys()) > 900:
                # Libcurl has a bug in which it behaves unpredictably with
                # file descriptors greater than 1024.  (This is because
                # even though it uses poll() instead of select(), it still
                # uses FD_SET internally) Since curl opens its own file
                # descriptors we can't catch this problem when it happens,
                # and the best we can do is detect that it's about to
                # happen.  Exiting is a lousy way to handle this error,
                # but there's not much we can do at this point.  Exiting
                # (and getting restarted by whatever monitoring process
                # is handling crashed tornado processes) will at least
                # get things working again and hopefully bring the issue
                # to someone's attention.
                # If you run into this issue, you either have a file descriptor
                # leak or need to run more tornado processes (so that none
                # of them are handling more than 1000 simultaneous connections)
                print >> sys.stderr, "ERROR: File descriptor too high for libcurl. Exiting."
                logging.error("File descriptor too high for libcurl. Exiting.")
                sys.exit(1)

            for fd in self._fds:
                if fd not in fds:
                    try:
                        self.io_loop.remove_handler(fd)
                    except (OSError, IOError) as e:
                        if e[0] != errno.ENOENT:
                            raise

            for fd, events in fds.iteritems():
                old_events = self._fds.get(fd, None)
                if old_events is None:
                    self.io_loop.add_handler(fd, self._handle_events, events)
                elif old_events != events:
                    try:
                        self.io_loop.update_handler(fd, events)
                    except (OSError, IOError) as e:
                        if e[0] == errno.ENOENT:
                            self.io_loop.add_handler(fd, self._handle_events,
                                                     events)
                        else:
                            raise
            self._fds = fds

            # Handle completed fetches
            completed = 0
            while True:
                num_q, ok_list, err_list = self._multi.info_read()
                for curl in ok_list:
                    self._finish(curl)
                    completed += 1
                for curl, errnum, errmsg in err_list:
                    self._finish(curl, errnum, errmsg)
                    completed += 1
                if num_q == 0:
                    break

            # Start fetching new URLs
            started = 0
            while self._free_list and self._requests:
                started += 1
                curl = self._free_list.pop()
                (request, callback) = self._requests.popleft()
                curl.info = {
                    "headers": httputil.HTTPHeaders(),
                    "buffer": cStringIO.StringIO(),
                    "request": request,
                    "callback": callback,
                    "start_time": time.time(),
                }
                _curl_setup_request(curl, request, curl.info["buffer"],
                                    curl.info["headers"])
                self._multi.add_handle(curl)

            if not started and not completed:
                break

        if self._timeout is not None:
            self.io_loop.remove_timeout(self._timeout)
            self._timeout = None

        if num_handles:
            self._timeout = self.io_loop.add_timeout(
                time.time() + 0.2, self._handle_timeout)