def __init__(self, crawler):
    """Initialise ban tracking and parse the configured upstream proxy URL."""
    self.crawler = crawler
    # Per-slot ban counters, and the download delays saved before banning.
    self._bans = defaultdict(int)
    self._saved_delays = defaultdict(lambda: None)
    configured_url = self.crawler.settings.get('PROXY_URL')
    (self.proxy_type, self.user,
     self.password, self.hostport) = _parse_proxy(configured_url)
def setUp(self):
    """Seed a config with an authenticated HTTPS proxy and no_proxy hosts."""
    ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
    https_conf = ProxyConf(*_parse_proxy('https://*****:*****@host:3128'))
    self.config = {
        'https': https_conf,
        'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
    }
def parse_proxy(proxy_ip: str) -> str:
    """Return the scheme (proxy type) component of a proxy URL.

    `proxy_ip`: the proxy URL, e.g. ``http://user:pass@host:port``.
    """
    scheme = _parse_proxy(proxy_ip)[0]
    return scheme
def get_proxy(auth_encoding, url: str, orig_type: str) -> "Tuple[Optional[bytes], str]":
    """Split *url* into Basic-Auth credentials and a credential-free proxy URL.

    `auth_encoding`: codec used when encoding the user/password pair.
    `url`: full proxy URL, possibly with embedded ``user:password@``.
    `orig_type`: scheme to fall back to when *url* carries none.

    Returns ``(credentials, proxy_url)`` where *credentials* is a
    Proxy-Authorization header value, or ``None`` when the URL has no user.
    (Annotation fixed: the old ``Tuple[bytes, str]`` was wrong for the
    no-user case; written as a string so no new import is required at
    runtime — add ``Optional`` to the typing import if preferred.)
    """
    proxy_type, user, password, host_port = _parse_proxy(url)
    # Rebuild the URL without the user:password part.
    proxy_url: str = urlunparse((proxy_type or orig_type, host_port, '', '', '', ''))
    credentials: "Optional[bytes]" = (basic_auth_header(user, password, auth_encoding)
                                      if user else None)
    return credentials, proxy_url
def check_proxy(proxy_ip: str, proxy_user_name: str, proxy_pwd: str) -> bool:
    """Synchronously check whether a single proxy works (blocking, via requests).

    `proxy_ip`: proxy URL, may already embed credentials
    `proxy_user_name`: proxy username
    `proxy_pwd`: proxy password
    """
    try:
        url = "http://httpbin.org/ip"
        # The stored headers use single quotes; swap to double quotes so
        # json.loads can parse them.
        headers = json.loads(
            get_scrapy_settings("DEFAULT_REQUEST_HEADERS").replace(r"'", '"'))
        # NOTE(review): despite the name, ``proxy_port`` here is the full
        # "host:port" component returned by _parse_proxy.
        proxy_type, username, password, proxy_port = _parse_proxy(proxy_ip)
        if not username and not password:
            # Username and password are already embedded in proxy_ip.
            proxy = {proxy_type: proxy_ip}
        else:
            # Rebuild the proxy URL with the explicit credentials.
            proxy = {
                proxy_type:
                f"{proxy_type}://{proxy_user_name}:{proxy_pwd}@{proxy_port}"
            }
        response = requests.get(url, headers=headers, proxies=proxy).text
    except (requests.exceptions.ProxyError,
            requests.exceptions.ConnectTimeout):
        return False
    else:
        # The proxy works when httpbin reports our origin as the proxy host
        # (the regex strips the ":port" suffix from "host:port").
        if json.loads(response).get("origin") == re.search(
                r"(.*):", proxy_port).group(1):
            return True
        else:
            return False
def _get_proxy(self, url, orig_type=''):
    """Return (credentials, proxy_url) for *url*, stripping embedded auth."""
    scheme, username, pwd, netloc = _parse_proxy(url)
    rebuilt = urlunparse((scheme or orig_type, netloc, '', '', '', ''))
    if username:
        creds = self._basic_auth_header(username, pwd)
    else:
        creds = None
    return creds, rebuilt
def test_set_tunnel_is_not_called_when_socks(self, mock_set_tunnel):
    """A SOCKS upstream must not trigger CONNECT tunnelling."""
    ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
    self.config = {
        'https': ProxyConf(*_parse_proxy('socks5://username:password@host:3128')),
    }
    ProxyAwareHTTPSConnection(self.config, 'example.com')
    self.assertEqual(mock_set_tunnel.call_count, 0)
def _get_proxy(self, url, orig_type):
    """Split *url* into auth credentials and a credential-free proxy URL.

    Returns ``(creds, proxy_url)``; *creds* is a Basic-Auth header value,
    or ``None`` when the URL carries no user.
    """
    proxy_type, user, password, hostport = _parse_proxy(url)
    proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
    # BUG FIX: the original printed the user and left ``creds`` unbound on
    # that branch (UnboundLocalError on return); build the header instead.
    creds = self._basic_auth_header(user, password) if user else None
    return creds, proxy_url
def test_raises_exception_when_invalid_socks_scheme(self, mock_socks):
    """An unknown socks scheme must raise TypeError on connect()."""
    ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
    self.config = {
        'https': ProxyConf(
            *_parse_proxy('socks6://socks_user:socks_pass@socks_host:3128')),
        'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
    }
    conn = ProxyAwareHTTPSConnection(self.config, 'example.com', context=Mock())
    with self.assertRaises(TypeError):
        conn.connect()
def _sanitise_proxy_config(self, proxy_config):
    """Parse the proxy configuration into something more usable."""
    ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
    # Replace each upstream proxy URL with a (scheme, username, password,
    # hostport) named tuple for ease of access.
    for scheme_key in ('http', 'https'):
        raw_url = proxy_config.get(scheme_key)
        if raw_url is not None:
            proxy_config[scheme_key] = ProxyConf(*_parse_proxy(raw_url))
    return proxy_config
def _sanitise_proxy_config(self, proxy_config):
    """Parse the proxy configuration into something more usable."""
    # Replace each upstream proxy URL with the (scheme, user, password,
    # hostport) tuple returned by _parse_proxy, for ease of access.
    for scheme_key in ('http', 'https'):
        raw_url = proxy_config.get(scheme_key)
        if raw_url is not None:
            proxy_config[scheme_key] = _parse_proxy(raw_url)
    if proxy_config:
        hosts = proxy_config.get('no_proxy', '').split(',')
        proxy_config['no_proxy'] = [host.strip() for host in hosts if host]
    return proxy_config
def process_response(self, request, response, spider):
    """Count failures per proxy; evict a proxy after too many bad responses."""
    cur_proxy = request.meta['proxy']
    if response.status < 400:
        return response
    self.stats[cur_proxy] += 1
    logger.info('%s failed %s' % (cur_proxy, self.stats[cur_proxy]))
    if self.stats[cur_proxy] >= self.max_failed:
        for candidate in self.proxies:
            *_, hostport = _parse_proxy(candidate)
            if cur_proxy.endswith(hostport):
                self.proxies.remove(candidate)
                logger.warning('proxy %s removed from proxies.' % cur_proxy)
                break
    return response
def process_response(self, request, response, spider):
    """Track per-proxy failures and drop proxies that fail too often."""
    cur_proxy = request.meta['proxy']
    print(cur_proxy)
    if response.status >= 400:
        self.stats[cur_proxy] += 1
        if self.stats[cur_proxy] >= self.max_failed:
            for candidate in self.proxies:
                *_, hostport = _parse_proxy(candidate)
                if not cur_proxy.endswith(hostport):
                    continue
                self.proxies.remove(candidate)
                logger.info('proxy {ip} remove from proxy list.'.format(ip=cur_proxy))
                break
    return response
def get_upstream_proxy(options):
    """Get the upstream proxy configuration from the options dictionary.

    Configuration found in the environment variables HTTP_PROXY, HTTPS_PROXY
    and NO_PROXY acts as a default and is overridden by the options.

    The result is a dictionary that may contain the keys 'http' and 'https'
    (named tuples with scheme, username, password, hostport attributes) and
    'no_proxy' (a list). Keys are present only when the corresponding proxy
    configuration exists.

    Args:
        options: The selenium wire options.

    Returns:
        A dictionary.
    """
    proxy_options = (options or {}).pop('proxy', {})

    merged = {}
    # Environment variables seed the configuration...
    for key, env_var in (('http', 'HTTP_PROXY'),
                         ('https', 'HTTPS_PROXY'),
                         ('no_proxy', 'NO_PROXY')):
        env_value = os.environ.get(env_var)
        if env_value:
            merged[key] = env_value
    # ...and explicit options take precedence over them.
    merged.update(proxy_options)

    no_proxy = merged.get('no_proxy')
    if isinstance(no_proxy, str):
        merged['no_proxy'] = [h.strip() for h in no_proxy.split(',')]

    ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
    # Parse each upstream proxy URL into (scheme, username, password,
    # hostport) for ease of access.
    for scheme_key in ('http', 'https'):
        if merged.get(scheme_key) is not None:
            merged[scheme_key] = ProxyConf(*_parse_proxy(merged[scheme_key]))

    return merged
def process_response(self, request, response, spider):
    """Record failures for the proxy used on this request and retire any
    proxy that exceeds ``self.max_failed_times``."""
    # Proxy used for the current request.
    cur_proxy = request.meta['proxy']
    logger.info(cur_proxy)
    if response.status >= 400:
        self.stats[cur_proxy] += 1
        logger.info('%s proxy request has failed' % cur_proxy)
        if self.stats[cur_proxy] >= self.max_failed_times:
            for proxy in self.proxies:
                *_, hostport = _parse_proxy(proxy)
                if cur_proxy.endswith(hostport):
                    # BUG FIX: remove the matching pool entry ``proxy``;
                    # the original removed ``cur_proxy`` (from request.meta),
                    # which may not literally be in ``self.proxies`` and
                    # would raise ValueError.
                    self.proxies.remove(proxy)
                    logger.warning(
                        '%s proxy has beyond max_failed_times,removed' % cur_proxy)
                    break
    return response
def test_connect_uses_remote_dns(self, mock_socks):
    """socks5h:// must enable remote DNS resolution (rdns=True)."""
    ProxyConf = namedtuple('ProxyConf', 'scheme username password hostport')
    self.config = {
        'http': ProxyConf(*_parse_proxy(
            'socks5h://socks_user:socks_pass@socks_host:3128')),
        'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
    }
    mock_socks.PROXY_TYPE_SOCKS5 = socks.PROXY_TYPE_SOCKS5
    conn = ProxyAwareHTTPConnection(self.config, 'example.com')
    conn.connect()
    mock_socks.create_connection.assert_called_once_with(
        ('example.com', 80),
        socket._GLOBAL_DEFAULT_TIMEOUT,
        None,
        socks.PROXY_TYPE_SOCKS5,
        'socks_host',
        3128,
        True,
        'socks_user',
        'socks_pass',
        ((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1), ),
    )
def extract_proxy_hostport(proxy):
    """
    Return the hostport component from a given proxy:

    >>> extract_proxy_hostport('example.com')
    'example.com'
    >>> extract_proxy_hostport('http://www.example.com')
    'www.example.com'
    >>> extract_proxy_hostport('127.0.0.1:8000')
    '127.0.0.1:8000'
    >>> extract_proxy_hostport('127.0.0.1')
    '127.0.0.1'
    >>> extract_proxy_hostport('localhost')
    'localhost'
    >>> extract_proxy_hostport('zot:4321')
    'zot:4321'
    >>> extract_proxy_hostport('http://*****:*****@baz:1234')
    'baz:1234'
    """
    _scheme, _user, _password, hostport = _parse_proxy(proxy)
    return hostport
def __init__(self, proxy_url=None):
    """Create a Chrome driver with images disabled, optionally routed through
    an authenticated proxy installed via a generated extension.

    `proxy_url`: optional proxy URL, e.g. ``http://user:pass@host:port``.
    """
    options = webdriver.ChromeOptions()
    # Disable image loading (content setting 2 = block) to speed up pages.
    prefs = {'profile.default_content_setting_values': {'images': 2}}
    options.add_experimental_option('prefs', prefs)
    if proxy_url:
        proxy_type, user, password, hostport = _parse_proxy(proxy_url)
        # Chrome cannot consume user:password proxy URLs directly, so the
        # credentials are bundled into a proxy-auth extension and loaded.
        proxyauth_plugin_path = create_proxyauth_extension(
            proxy_host=hostport.split(':')[0],
            proxy_port=hostport.split(':')[1],
            proxy_username=user,
            proxy_password=password,
            scheme=proxy_type,
            plugin_path="vimm_chrome_proxyauth_plugin.zip")
        options.add_extension(proxyauth_plugin_path)
    self.display = self.get_display()
    # NOTE(review): ``chrome_options`` is deprecated in newer Selenium in
    # favour of ``options=`` — confirm against the pinned Selenium version.
    self.driver = webdriver.Chrome(chrome_options=options)
    self.driver.set_page_load_timeout(30)
def setUp(self):
    """Seed parsed http/https upstream proxy tuples and a no_proxy string."""
    http_conf = _parse_proxy('http://*****:*****@host:3128')
    https_conf = _parse_proxy('https://*****:*****@host:3128')
    self.config = {
        'http': http_conf,
        'https': https_conf,
        'no_proxy': 'localhost,127.0.0.1,dev_server:8080',
    }
def parse_proxy(proxy):
    """Memoised wrapper around ``request._parse_proxy``.

    Results are cached in the module-level ``parse_proxy_cache`` dict.
    """
    if proxy not in parse_proxy_cache:
        parse_proxy_cache[proxy] = request._parse_proxy(proxy)
    return parse_proxy_cache[proxy]
def _parse(proxy_url):
    """Reduce *proxy_url* to 'scheme://hostport', dropping any credentials."""
    scheme, _user, _password, hostport = _parse_proxy(proxy_url)
    return '%s://%s' % (scheme, hostport)
def reform_url(url):
    """Rebuild the proxy URL without its username/password component."""
    # Split the URL into its parts and keep only scheme and hostport.
    parts = _parse_proxy(url)
    return '%s://%s' % (parts[0], parts[3])
def reform_url(url):
    """Strip any credentials from *url*, returning 'scheme://hostport'."""
    scheme, _user, _pwd, netloc = _parse_proxy(url)
    return '%s://%s' % (scheme, netloc)