async def use_proxy_request(
        self,
        url: str,
        method: str,
        data: dict,
        headers: dict,
        timeout: int,
        proxy_dict: dict,
        resp_encoding: str,
) -> Union[HTTPResponse, int, None]:
    """
    Check the proxy status.
    :param url:
    :param method:
    :param data:
    :param headers:
    :param timeout:
    :param proxy_dict:
    :param resp_encoding:
    :return:
    """
    client = CurlAsyncHTTPClient(force_instance=True)
    request = self.make_request(url, method, data, headers, timeout, proxy_dict)
    try:
        resp = await client.fetch(request, raise_error=False)
        msg = ('proxy: {}:{}, url: {}, result: {}'.format(
            proxy_dict.get('host'), proxy_dict.get('port'), url, resp.code))
        self.logger.debug(msg)
    except CurlError as e:
        self.logger.error(e)
        resp = 599
    except Exception as e:
        self.logger.error('proxy: {}:{}, url: {}, result: {}'.format(
            proxy_dict.get('host'), proxy_dict.get('port'), url, str(e)))
        resp = None
    else:
        resp = self.get_response_body(resp, resp_encoding)
    finally:
        client.close()
    return resp
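The method above is an excerpt: it assumes the enclosing class provides make_request(), get_response_body(), and a logger attribute. A minimal usage sketch under those assumptions (the checker instance, the httpbin URL, and the proxy address are hypothetical):

import asyncio

async def check_proxy(checker):
    # 'checker' is an instance of the (not shown) class that defines
    # use_proxy_request; proxy host/port here are placeholders.
    proxy = {'host': '127.0.0.1', 'port': 8888}
    result = await checker.use_proxy_request(
        url='https://httpbin.org/ip',
        method='GET',
        data=None,
        headers={'User-Agent': 'proxy-checker/1.0'},
        timeout=10,
        proxy_dict=proxy,
        resp_encoding='utf-8',
    )
    # result is the decoded body on success, 599 on a curl-level
    # error, or None on any other exception.
    return result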
class HTTPClient:

    def __init__(self, *, max_clients=100, connect_timeout=20.0,
                 verify_ssl=True, **kwargs):
        self._httpclient = AsyncHTTPClient(max_clients=max_clients, **kwargs)
        self._connect_timeout = connect_timeout
        self._verify_ssl = verify_ssl

    async def fetch(self, url, *, method="GET", headers=None, body=None,
                    request_timeout=30.0):
        if isinstance(body, (dict, list)):
            if headers is None:
                headers = {'Content-Type': "application/json"}
            elif 'Content-Type' not in headers:
                headers['Content-Type'] = "application/json"
            body = tornado.escape.json_encode(body)
        resp = await self._httpclient.fetch(
            url, method=method, headers=headers, body=body,
            validate_cert=self._verify_ssl,
            request_timeout=request_timeout,
            connect_timeout=self._connect_timeout,
            raise_error=False)
        if resp.code < 200 or resp.code >= 300:
            raise HTTPError(resp.code, message=resp.reason)
        return HTTPResponse(resp.code, resp.body)

    async def close(self):
        self._httpclient.close()
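A short usage sketch for the wrapper above. It assumes the custom HTTPResponse wrapper exposes code and body attributes, matching how it is constructed in fetch():

import asyncio

async def main():
    client = HTTPClient(max_clients=10, connect_timeout=5.0)
    try:
        # dict/list bodies are JSON-encoded automatically and the
        # Content-Type header is set to application/json.
        resp = await client.fetch(
            'https://httpbin.org/post',
            method='POST',
            body={'hello': 'world'},
            request_timeout=15.0,
        )
        print(resp.code, resp.body)
    finally:
        await client.close()

asyncio.run(main())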
class Downloader:

    def __init__(self, max_clients=100, renderer=None, renderer_cores=None):
        self._max_clients = max_clients
        self._http_client = CurlAsyncHTTPClient(max_clients=max_clients,
                                                force_instance=True)
        self._renderer = renderer
        if renderer_cores is None:
            renderer_cores = self._max_clients
        self._renderer_semaphore = Semaphore(renderer_cores)

    @classmethod
    def from_crawler(cls, crawler):
        config = crawler.config
        renderer = ChromeRenderer(
            options=config.get('chrome_renderer_options'))
        downloader = cls(**with_not_none_params(
            max_clients=config.getint('downloader_clients'),
            renderer=renderer,
            renderer_cores=config.getint('renderer_cores')))
        crawler.event_bus.subscribe(downloader.close, events.crawler_shutdown)
        return downloader

    @property
    def max_clients(self):
        return self._max_clients

    async def fetch(self, request):
        log.debug("HTTP request: %s", request)
        try:
            if request.render:
                async with self._renderer_semaphore:
                    response = await self._renderer.fetch(request)
            else:
                req = self._make_request(request)
                resp = await self._http_client.fetch(req)
                response = self._make_response(resp)
        except CancelledError:
            raise
        except HTTPClientError as e:
            if e.response is not None and e.response.code != 599:
                raise HttpError('{} {}'.format(e.response.code, e.message),
                                response=self._make_response(e.response))
            raise ClientError(e.message)
        except Exception as e:
            raise ClientError(e)
        log.debug("HTTP response: %s", response)
        return response

    def _make_request(self, request):
        kwargs = {
            'method': request.method,
            'headers': request.headers,
            'body': request.body,
            'connect_timeout': request.timeout,  # FIXME
            'request_timeout': request.timeout,  # FIXME
            'follow_redirects': request.allow_redirects,
            'validate_cert': request.verify_ssl
        }
        if request.auth is not None:
            auth_username, auth_password = request.auth
            kwargs['auth_username'] = auth_username
            kwargs['auth_password'] = auth_password
        if request.proxy is not None:
            s = urlsplit(request.proxy)
            if s.scheme:
                if s.scheme in ('http', 'socks4', 'socks5'):
                    proxy_host, proxy_port = s.hostname, s.port
                else:
                    raise ValueError('Unsupported proxy scheme: {}'.format(
                        s.scheme))
                if s.scheme == 'socks5':
                    kwargs['prepare_curl_callback'] = prepare_curl_socks5
                elif s.scheme == 'socks4':
                    kwargs['prepare_curl_callback'] = prepare_curl_socks4
            else:
                proxy_host, proxy_port = request.proxy.split(':')
            kwargs['proxy_host'] = proxy_host
            kwargs['proxy_port'] = int(proxy_port)
            if request.proxy_auth is not None:
                proxy_username, proxy_password = request.proxy_auth
                kwargs['proxy_username'] = proxy_username
                kwargs['proxy_password'] = proxy_password
        return HTTPRequest(request.url, **kwargs)

    def _make_response(self, resp):
        return HttpResponse(resp.effective_url, resp.code,
                            headers=resp.headers, body=resp.body)

    def close(self):
        self._http_client.close()
        self._renderer.close()
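_make_request references prepare_curl_socks5 and prepare_curl_socks4 helpers that are not shown. A plausible implementation uses Tornado's prepare_curl_callback hook, which receives the raw pycurl handle, to switch libcurl's proxy type away from the default HTTP proxy:

import pycurl

def prepare_curl_socks5(curl):
    # Treat proxy_host/proxy_port as a SOCKS5 proxy instead of the
    # default HTTP proxy.
    curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)

def prepare_curl_socks4(curl):
    curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)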
__author__ = 'TzAnAnY'

from monkey import patch_socks
patch_socks()

from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.ioloop import IOLoop
from pycurl import PROXYTYPE_SOCKS5


def handle_request(response):
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    IOLoop.instance().stop()


if __name__ == '__main__':
    config = {
        'proxy_type': PROXYTYPE_SOCKS5,
        'proxy_host': '127.0.0.1',
        'proxy_port': 9050,
        'validate_cert': False
    }
    client = CurlAsyncHTTPClient()
    # for i in range(5):
    client.fetch("https://www.dyndns.org/", handle_request, **config)
    IOLoop.instance().start()
    client.close()
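The patch_socks() call above apparently patches an older Tornado so that fetch() accepts a proxy_type keyword; the monkey module itself is not shown, and the callback-style fetch() only works on Tornado versions before 6.0. On current Tornado, the same SOCKS5 routing needs no patch: pass a prepare_curl_callback on the request instead. An untested sketch of the equivalent script:

import pycurl
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop

def use_socks5(curl):
    # Route the request through a SOCKS5 proxy at proxy_host:proxy_port.
    curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)

async def main():
    client = CurlAsyncHTTPClient()
    request = HTTPRequest(
        "https://www.dyndns.org/",
        proxy_host="127.0.0.1",
        proxy_port=9050,
        validate_cert=False,
        prepare_curl_callback=use_socks5,
    )
    resp = await client.fetch(request, raise_error=False)
    print(resp.code, len(resp.body or b""))

IOLoop.current().run_sync(main)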