Example #1
0
    def _connect_to_remote_server(self):
        '''
        Connect to the destination server and return the connected socket.

        Note that connection_from_host has hard-coded `scheme='http'`
        to avoid internal urllib3 logic when scheme is https. We handle ssl and
        socks inside the current method.

        self._conn_pool._get_conn() will either return an existing connection
        or a new one. If it's new, it needs initialization.

        Returns:
            the connected (and, for CONNECT requests, tls-wrapped) socket.

        Raises:
            ssl.SSLError: if the fallback tls handshake fails (e.g. the
                python ssl library does not support SNI).
        '''
        self._conn_pool = self.server.remote_connection_pool.connection_from_host(
            host=self.hostname,
            port=int(self.port),
            scheme='http',
            pool_kwargs={
                'maxsize': 12,
                'timeout': self._socket_timeout
            })

        self._remote_server_conn = self._conn_pool._get_conn()
        if is_connection_dropped(self._remote_server_conn):
            if self.onion_tor_socks_proxy_host and self.hostname.endswith(
                    '.onion'):
                self.logger.info(
                    "using tor socks proxy at %s:%s to connect to %s",
                    self.onion_tor_socks_proxy_host,
                    self.onion_tor_socks_proxy_port or 1080, self.hostname)
                self._remote_server_conn.sock = socks.socksocket()
                self._remote_server_conn.sock.set_proxy(
                    socks.SOCKS5,
                    addr=self.onion_tor_socks_proxy_host,
                    port=self.onion_tor_socks_proxy_port,
                    # rdns=True so the tor proxy resolves the .onion name
                    rdns=True)
                self._remote_server_conn.sock.settimeout(self._socket_timeout)
                self._remote_server_conn.sock.connect(
                    (self.hostname, int(self.port)))
            else:
                self._remote_server_conn.connect()

            # Wrap socket if SSL is required
            if self.is_connect:
                try:
                    # Certificate verification is deliberately disabled;
                    # this proxy records traffic rather than vouching for
                    # the remote server's identity.
                    context = ssl.create_default_context()
                    context.check_hostname = False
                    context.verify_mode = ssl.CERT_NONE
                    self._remote_server_conn.sock = context.wrap_socket(
                        self._remote_server_conn.sock,
                        server_hostname=self.hostname)
                except AttributeError:
                    # Old python without SSLContext-based wrapping (no SNI).
                    try:
                        self._remote_server_conn.sock = ssl.wrap_socket(
                            self._remote_server_conn.sock)
                    except ssl.SSLError:
                        self.logger.warning(
                            "failed to establish ssl connection to %s; "
                            "python ssl library does not support SNI, "
                            "consider upgrading to python 2.7.9+ or 3.4+",
                            self.hostname)
                        # BUG FIX: re-raise only when the fallback wrap
                        # failed. Previously this `raise` sat outside the
                        # except block, re-raising the AttributeError even
                        # after a successful ssl.wrap_socket().
                        raise
        return self._remote_server_conn.sock
Example #2
0
    def _connect_to_remote_server(self):
        '''
        Connect to the destination server and return the connected socket.

        Note that connection_from_host has hard-coded `scheme='http'`
        to avoid internal urllib3 logic when scheme is https. We handle ssl and
        socks inside the current method.

        self._conn_pool._get_conn() will either return an existing connection
        or a new one. If it's new, it needs initialization.

        Returns:
            the connected (and, for CONNECT requests, tls-wrapped) socket.

        Raises:
            ssl.SSLError: if the fallback tls handshake fails (e.g. the
                python ssl library does not support SNI).
        '''
        self._conn_pool = self.server.remote_connection_pool.connection_from_host(
            host=self.hostname, port=int(self.port), scheme='http',
            pool_kwargs={'maxsize': 12, 'timeout': self._socket_timeout})

        self._remote_server_conn = self._conn_pool._get_conn()
        if is_connection_dropped(self._remote_server_conn):
            if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'):
                self.logger.info(
                        "using tor socks proxy at %s:%s to connect to %s",
                        self.onion_tor_socks_proxy_host,
                        self.onion_tor_socks_proxy_port or 1080, self.hostname)
                self._remote_server_conn.sock = socks.socksocket()
                # rdns=True so the tor proxy resolves the .onion name
                self._remote_server_conn.sock.set_proxy(
                        socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
                        port=self.onion_tor_socks_proxy_port, rdns=True)
                self._remote_server_conn.sock.settimeout(self._socket_timeout)
                self._remote_server_conn.sock.connect((self.hostname, int(self.port)))
            else:
                self._remote_server_conn.connect()

            # Wrap socket if SSL is required
            if self.is_connect:
                try:
                    # Certificate verification is deliberately disabled;
                    # this proxy records traffic rather than vouching for
                    # the remote server's identity.
                    context = ssl.create_default_context()
                    context.check_hostname = False
                    context.verify_mode = ssl.CERT_NONE
                    self._remote_server_conn.sock = context.wrap_socket(
                            self._remote_server_conn.sock,
                            server_hostname=self.hostname)
                except AttributeError:
                    # Old python without SSLContext-based wrapping (no SNI).
                    try:
                        self._remote_server_conn.sock = ssl.wrap_socket(
                                self._remote_server_conn.sock)
                    except ssl.SSLError:
                        self.logger.warning(
                                "failed to establish ssl connection to %s; "
                                "python ssl library does not support SNI, "
                                "consider upgrading to python 2.7.9+ or 3.4+",
                                self.hostname)
                        # BUG FIX: re-raise only when the fallback wrap
                        # failed. Previously this `raise` sat outside the
                        # except block, re-raising the AttributeError even
                        # after a successful ssl.wrap_socket().
                        raise
        return self._remote_server_conn.sock
Example #3
0
    def _proxy_request(self, extra_response_headers=None):
        '''
        Sends the request to the remote server, then uses a ProxyingRecorder to
        read the response and send it to the proxy client, while recording the
        bytes in transit. Returns a tuple (request, response) where request is
        the raw request bytes, and response is a ProxyingRecorder.

        :param extra_response_headers: generated on warcprox._proxy_request.
        It may contain extra HTTP headers such as ``Warcprox-Meta`` which
        are written in the WARC record for this request.
        '''
        # BUG FIX: was a mutable default argument ({}), which is shared
        # across calls and can leak headers between requests.
        if extra_response_headers is None:
            extra_response_headers = {}

        # Build request line
        req_str = '{} {} {}\r\n'.format(
                self.command, self.path, self.request_version)

        # Swallow headers that don't make sense to forward on, i.e. most
        # hop-by-hop headers, see
        # http://tools.ietf.org/html/rfc2616#section-13.5.
        # self.headers is an email.message.Message, which is case-insensitive
        # and doesn't throw KeyError in __delitem__
        for key in (
                'Connection', 'Proxy-Connection', 'Keep-Alive',
                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
            del self.headers[key]

        self.headers['Via'] = via_header_value(
                self.headers.get('Via'),
                self.request_version.replace('HTTP/', ''))

        # Add headers to the request
        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
        req_str += '\r\n'.join(
                '{}: {}'.format(k,v) for (k,v) in self.headers.items())

        req = req_str.encode('latin1') + b'\r\n\r\n'

        # Append message body if present to the request
        if 'Content-Length' in self.headers:
            req += self.rfile.read(int(self.headers['Content-Length']))

        prox_rec_res = None
        try:
            self.logger.debug('sending to remote server req=%r', req)

            # Send it down the pipe!
            self._remote_server_conn.sock.sendall(req)

            prox_rec_res = ProxyingRecordingHTTPResponse(
                    self._remote_server_conn.sock, proxy_client=self.connection,
                    digest_algorithm=self.server.digest_algorithm,
                    url=self.url, method=self.command,
                    tmp_file_max_memory_size=self._tmp_file_max_memory_size)
            prox_rec_res.begin(extra_response_headers=extra_response_headers)

            # BUG FIX: restructured loop so every chunk, including the
            # first, is followed by the max-resource-size check (the old
            # loop read once before the loop and never checked that chunk).
            buf = None
            while buf != b'':
                buf = prox_rec_res.read(65536)
                if (self._max_resource_size
                        and prox_rec_res.recorder.len > self._max_resource_size):
                    prox_rec_res.truncated = b'length'
                    self.logger.error(
                        'Max resource size %d bytes exceeded for URL %s',
                        self._max_resource_size, self.url)
                    break

            self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
            # Let's close off the remote end. If remote connection is fine,
            # put it back in the pool to reuse it later.
            if not is_connection_dropped(self._remote_server_conn):
                self._conn_pool._put_conn(self._remote_server_conn)
        except Exception:
            # BUG FIX: was a bare `except:` that also caught SystemExit and
            # KeyboardInterrupt and then silently fell through to return a
            # possibly-None response. Narrow to Exception, close the remote
            # socket, and propagate the failure to the caller.
            self._remote_server_conn.sock.close()
            raise
        finally:
            if prox_rec_res:
                prox_rec_res.close()

        return req, prox_rec_res
Example #4
0
    def _inner_proxy_request(self, extra_response_headers=None):
        '''
        Sends the request to the remote server, then uses a ProxyingRecorder to
        read the response and send it to the proxy client, while recording the
        bytes in transit. Returns a tuple (request, response) where request is
        the raw request bytes, and response is a ProxyingRecorder.

        :param extra_response_headers: generated on warcprox._proxy_request.
        It may contain extra HTTP headers such as ``Warcprox-Meta`` which
        are written in the WARC record for this request.
        '''
        # BUG FIX: was a mutable default argument ({}), which is shared
        # across calls and can leak headers between requests.
        if extra_response_headers is None:
            extra_response_headers = {}

        self._swallow_hop_by_hop_headers()
        self.headers['Via'] = via_header_value(
            self.headers.get('Via'), self.request_version.replace('HTTP/', ''))
        req = self._build_request()

        # Append message body if present to the request
        if 'Content-Length' in self.headers:
            req += self.rfile.read(int(self.headers['Content-Length']))

        prox_rec_res = None
        start = time.time()
        try:
            self.logger.debug('sending to remote server req=%r', req)

            # Send it down the pipe!
            self._remote_server_conn.sock.sendall(req)

            prox_rec_res = ProxyingRecordingHTTPResponse(
                self._remote_server_conn.sock,
                proxy_client=self.connection,
                digest_algorithm=self.server.digest_algorithm,
                url=self.url,
                method=self.command,
                tmp_file_max_memory_size=self._tmp_file_max_memory_size)
            prox_rec_res.begin(extra_response_headers=extra_response_headers)

            buf = None
            while buf != b'':
                try:
                    buf = prox_rec_res.read(65536)
                except http_client.IncompleteRead as e:
                    # keep whatever partial bytes we did get
                    self.logger.warning('%s from %s', e, self.url)
                    buf = e.partial

                if (self._max_resource_size and
                        prox_rec_res.recorder.len > self._max_resource_size):
                    prox_rec_res.truncated = b'length'
                    self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                    self._remote_server_conn.sock.close()
                    self.logger.info(
                        'truncating response because max resource size %d '
                        'bytes exceeded for URL %s', self._max_resource_size,
                        self.url)
                    break
                elif ('content-length' not in self.headers
                      and time.time() - start > 3 * 60 * 60):
                    # hard cap: without content-length we could stream forever
                    prox_rec_res.truncated = b'time'
                    self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                    self._remote_server_conn.sock.close()
                    self.logger.info(
                        'reached hard timeout of 3 hours fetching url '
                        'without content-length: %s', self.url)
                    break

            self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
            # Let's close off the remote end. If remote connection is fine,
            # put it back in the pool to reuse it later.
            if not is_connection_dropped(self._remote_server_conn):
                self._conn_pool._put_conn(self._remote_server_conn)
        except Exception as e:
            # A common error is to connect to the remote server successfully
            # but raise a `RemoteDisconnected` exception when trying to begin
            # downloading. Its caused by prox_rec_res.begin(...) which calls
            # http_client._read_status(). The connection fails there.
            # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L275
            # Another case is when the connection is fine but the response
            # status is problematic, raising `BadStatusLine`.
            # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L296
            # In both cases, the host is bad and we must add it to
            # `bad_hostnames_ports` cache.
            if isinstance(e, (http_client.RemoteDisconnected,
                              http_client.BadStatusLine)):
                host_port = self._hostname_port_cache_key()
                with self.server.bad_hostnames_ports_lock:
                    self.server.bad_hostnames_ports[host_port] = 502
                self.logger.info('bad_hostnames_ports cache size: %d',
                                 len(self.server.bad_hostnames_ports))

            # Close the connection only if its still open. If its already
            # closed, an `OSError` "([Errno 107] Transport endpoint is not
            # connected)" would be raised.
            if not is_connection_dropped(self._remote_server_conn):
                self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                self._remote_server_conn.sock.close()
            raise
        finally:
            if prox_rec_res:
                prox_rec_res.close()

        return req, prox_rec_res
Example #5
0
    def _inner_proxy_request(self, extra_response_headers=None):
        '''
        Sends the request to the remote server, then uses a ProxyingRecorder to
        read the response and send it to the proxy client, while recording the
        bytes in transit. Returns a tuple (request, response) where request is
        the raw request bytes, and response is a ProxyingRecorder.

        :param extra_response_headers: generated on warcprox._proxy_request.
        It may contain extra HTTP headers such as ``Warcprox-Meta`` which
        are written in the WARC record for this request.
        '''
        # BUG FIX: was a mutable default argument ({}), which is shared
        # across calls and can leak headers between requests.
        if extra_response_headers is None:
            extra_response_headers = {}

        # Build request line
        req_str = '{} {} {}\r\n'.format(
                self.command, self.path, self.request_version)

        # Swallow headers that don't make sense to forward on, i.e. most
        # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
        # self.headers is an email.message.Message, which is case-insensitive
        # and doesn't throw KeyError in __delitem__
        for key in (
                'Connection', 'Proxy-Connection', 'Keep-Alive',
                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
            del self.headers[key]

        self.headers['Via'] = via_header_value(
                self.headers.get('Via'),
                self.request_version.replace('HTTP/', ''))

        # Add headers to the request
        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
        req_str += '\r\n'.join(
                '{}: {}'.format(k,v) for (k,v) in self.headers.items())

        req = req_str.encode('latin1') + b'\r\n\r\n'

        # Append message body if present to the request
        if 'Content-Length' in self.headers:
            req += self.rfile.read(int(self.headers['Content-Length']))

        prox_rec_res = None
        start = time.time()
        try:
            self.logger.debug('sending to remote server req=%r', req)

            # Send it down the pipe!
            self._remote_server_conn.sock.sendall(req)

            prox_rec_res = ProxyingRecordingHTTPResponse(
                    self._remote_server_conn.sock, proxy_client=self.connection,
                    digest_algorithm=self.server.digest_algorithm,
                    url=self.url, method=self.command,
                    tmp_file_max_memory_size=self._tmp_file_max_memory_size)
            prox_rec_res.begin(extra_response_headers=extra_response_headers)

            buf = None
            while buf != b'':
                try:
                    buf = prox_rec_res.read(65536)
                except http_client.IncompleteRead as e:
                    # FIX: logger.warn is a deprecated alias of warning
                    self.logger.warning('%s from %s', e, self.url)
                    buf = e.partial

                if (self._max_resource_size and
                        prox_rec_res.recorder.len > self._max_resource_size):
                    prox_rec_res.truncated = b'length'
                    self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                    self._remote_server_conn.sock.close()
                    self.logger.info(
                            'truncating response because max resource size %d '
                            'bytes exceeded for URL %s',
                            self._max_resource_size, self.url)
                    break
                elif ('content-length' not in self.headers
                        and time.time() - start > 3 * 60 * 60):
                    # hard cap: without content-length we could stream forever
                    prox_rec_res.truncated = b'time'
                    self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                    self._remote_server_conn.sock.close()
                    self.logger.info(
                            'reached hard timeout of 3 hours fetching url '
                            'without content-length: %s', self.url)
                    break

            self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
            # Let's close off the remote end. If remote connection is fine,
            # put it back in the pool to reuse it later.
            if not is_connection_dropped(self._remote_server_conn):
                self._conn_pool._put_conn(self._remote_server_conn)
        except Exception as e:
            # A common error is to connect to the remote server successfully
            # but raise a `RemoteDisconnected` exception when trying to begin
            # downloading. Its caused by prox_rec_res.begin(...) which calls
            # http_client._read_status(). In that case, the host is also bad
            # and we must add it to `bad_hostnames_ports` cache.
            if isinstance(e, http_client.RemoteDisconnected):
                host_port = self._hostname_port_cache_key()
                with self.server.bad_hostnames_ports_lock:
                    self.server.bad_hostnames_ports[host_port] = 502
                self.logger.info('bad_hostnames_ports cache size: %d',
                                 len(self.server.bad_hostnames_ports))

            # BUG FIX: shut down the socket only if it is still connected.
            # An unconditional shutdown() on an already-closed connection
            # raises OSError "[Errno 107] Transport endpoint is not
            # connected", masking the original exception.
            if not is_connection_dropped(self._remote_server_conn):
                self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                self._remote_server_conn.sock.close()
            raise
        finally:
            if prox_rec_res:
                prox_rec_res.close()

        return req, prox_rec_res