def process_request(self, request, spider):
    """Choose a proxy for ``request`` and store it in ``request.meta['proxy']``.

    http/https hosts for which ``proxy_bypass`` is truthy are left untouched.
    Otherwise the request is routed either through ``spider.vpn_proxy``
    (when ``use_vpn_only`` says so for this host) or through the next entry
    of ``spider.http_proxy`` / ``spider.https_proxy``, cycling round-robin
    via ``self.http_index`` / ``self.https_index``.

    Always returns ``None``.
    """
    target = urlparse_cached(request)
    via_vpn = False

    # 'no_proxy' is only supported by http schemes
    if target.scheme in ('http', 'https'):
        if proxy_bypass(target.hostname):
            return
        via_vpn = use_vpn_only(target.hostname, spider.vpn_only)

    url = request.url
    if url.startswith('http://'):
        if via_vpn:
            request.meta['proxy'] = 'http://' + spider.vpn_proxy
        else:
            # Advance the rotating index, wrapping at the end of the pool.
            self.http_index = (self.http_index + 1) % len(spider.http_proxy)
            request.meta['proxy'] = spider.http_proxy[self.http_index]
        # NOTE(review): nothing follows this check, so it currently has no
        # effect on the outcome -- confirm whether more logic was intended.
        if spider.crawlera_enabled:
            return
    elif url.startswith('https://'):
        if via_vpn:
            request.meta['proxy'] = 'https://' + spider.vpn_proxy
        else:
            self.https_index = (self.https_index + 1) % len(spider.https_proxy)
            request.meta['proxy'] = spider.https_proxy[self.https_index]
def process_request(self, request, spider):
    """Attach a proxy (and proxy credentials) to ``request`` when applicable.

    * ``request.meta['proxy']`` already present and ``None``: do nothing.
    * ``request.meta['proxy']`` already present: pass it through
      ``self._get_proxy`` to separate credentials from the URL, store the
      URL back in ``meta`` and add a ``Proxy-Authorization`` header unless
      one is already set.
    * Otherwise, if ``self.proxies`` has an entry for the request's scheme
      and the host is not bypassed, delegate to ``self._set_proxy``.

    Always returns ``None``.
    """
    meta = request.meta
    if 'proxy' in meta:
        preset = meta['proxy']
        if preset is None:
            return
        # extract credentials if present
        creds, proxy_url = self._get_proxy(preset, '')
        meta['proxy'] = proxy_url
        # set the authentication header on the request
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return

    if not self.proxies:
        return

    target = urlparse_cached(request)
    scheme = target.scheme

    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(target.hostname):
        return

    if scheme in self.proxies:
        self._set_proxy(request, scheme)
def process_request(self, request, spider):
    """Attach a proxy to ``request`` for a configurable share of requests.

    A proxy already present in ``request.meta`` is honoured: ``None`` means
    "leave the request alone"; any other value is passed through
    ``self._get_proxy`` and its credentials, if any, become a
    ``Proxy-Authorization`` header unless one is already set.

    For requests without a preset proxy, http/https hosts for which
    ``proxy_bypass`` is truthy are skipped.  Otherwise ``self._set_proxy``
    is called always when ``self.use_proxy_rate`` is not below 1, and only
    when ``random.random()`` falls below the rate when it is.

    Always returns ``None``.
    """
    meta = request.meta
    if 'proxy' in meta:
        preset = meta['proxy']
        if preset is None:
            return
        # extract credentials if present
        creds, proxy_url = self._get_proxy(preset, '')
        meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return

    target = urlparse_cached(request)
    scheme = target.scheme
    if scheme in ('http', 'https') and proxy_bypass(target.hostname):
        return

    rate = self.use_proxy_rate
    # The random draw only happens for rates below 1; losing it means
    # this request goes out without a proxy.
    if rate < 1 and random.random() >= rate:
        return
    self._set_proxy(request, scheme)
def _set_proxy(self, request, proxies): if not proxies: return parsed = urlparse_cached(request) scheme = parsed.scheme if scheme in ('http', 'https') and proxy_bypass(parsed.hostname): return if scheme not in proxies: return creds, proxy = proxies[scheme] request.meta['proxy'] = proxy if creds: request.headers['Proxy-Authorization'] = b'Basic ' + creds
def process_request(self, request, spider):
    """Assign a proxy to ``request`` unless one has already been chosen.

    Requests that already carry a ``'proxy'`` key in ``request.meta`` are
    left untouched, as are http/https hosts for which ``proxy_bypass`` is
    truthy.  For everything else, ``self._set_proxy`` is invoked when
    ``self.proxies`` has an entry for the URL scheme.

    Always returns ``None``.
    """
    # ignore if proxy is already set
    if 'proxy' in request.meta:
        return

    target = urlparse_cached(request)
    scheme = target.scheme

    # 'no_proxy' is only supported by http schemes
    is_http = scheme in ('http', 'https')
    if is_http and proxy_bypass(target.hostname):
        return

    if scheme in self.proxies:
        self._set_proxy(request, scheme)
def process_request(self, request, spider):
    """Route ``request`` through a configured proxy when none is set yet.

    Does nothing if ``request.meta`` already has a ``'proxy'`` key, if the
    URL is http/https and ``proxy_bypass`` is truthy for its host, or if
    ``self.proxies`` has no entry for the URL scheme.  Otherwise delegates
    to ``self._set_proxy(request, scheme)``.

    Always returns ``None``.
    """
    # ignore if proxy is already set
    if 'proxy' in request.meta:
        return

    parts = urlparse_cached(request)
    scheme = parts.scheme

    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https'):
        if proxy_bypass(parts.hostname):
            return

    if scheme not in self.proxies:
        return
    self._set_proxy(request, scheme)
def process_request(self, request, spider):
    """Refresh the proxy pool periodically and pick a proxy for ``request``.

    Steps, in order:

    1. Bump the module-level ``count``; when it hits a multiple of 100 it
       is reset to 1 and ``self._update_proxies()`` is called.
    2. A truthy ``request.meta['direct_connect']`` removes any ``'proxy'``
       entry and returns, so the request goes out without a proxy.  The
       ``'direct_connect'`` key is always consumed.
    3. A preset ``request.meta['proxy']`` of ``None`` asks for a fresh
       proxy via ``self._set_proxy``; any other preset value is passed
       through ``self._get_proxy`` and its credentials, if any, become a
       ``Proxy-Authorization`` header unless one is already set.
    4. With no preset proxy: nothing is done when ``self.proxies`` is
       empty or the http/https host is bypassed; otherwise
       ``self._set_proxy`` is called.

    Always returns ``None``.
    """
    # update proxies
    global count
    count += 1
    if count % 100 == 0:
        count = 1
        self._update_proxies()

    meta = request.meta

    if 'direct_connect' in meta:
        direct = meta.pop('direct_connect')
        if direct:
            meta.pop('proxy', None)
            logger.debug('HTTP_PROXY-->Direct')
            return

    # change proxy
    if 'proxy' in meta:
        preset = meta['proxy']
        if preset is None:
            self._set_proxy(request)
            return
        creds, proxy_url = self._get_proxy(preset, '')
        meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return

    if not self.proxies:
        # local ip
        return

    target = urlparse_cached(request)

    # 'no_proxy' is only supported by http schemes
    if target.scheme in ('http', 'https') and proxy_bypass(target.hostname):
        return

    # add proxy
    self._set_proxy(request)
def process_request(self, request, spider):
    """Attach a proxy (and proxy credentials) to ``request`` when applicable.

    * ``request.meta['proxy']`` already present and ``None``: do nothing.
    * ``request.meta['proxy']`` already present (the original note mentions
      retried requests): pass it through ``self._get_proxy`` to separate
      credentials from the URL, store the URL back in ``meta`` and add a
      ``Proxy-Authorization`` header unless one is already set.
    * Otherwise, if ``self.proxies`` has an entry for the request's scheme
      and the host is not bypassed, delegate to ``self._set_proxy``.

    Returns:
        ``None`` in every case, so the request continues through the
        remaining middlewares.
    """
    if 'proxy' in request.meta:
        if request.meta['proxy'] is None:
            return
        # extract credentials if present
        creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
        request.meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return
    elif not self.proxies:
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme

    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return

    if scheme in self.proxies:
        # Plain call, not ``yield``: a ``yield`` anywhere in the body turns
        # this method into a generator function, so calling it would run
        # none of the code above and hand back a generator object instead
        # of None.
        self._set_proxy(request, scheme)
def process_request(self, request, spider):
    """Ensure ``request`` carries the right proxy settings before download.

    A proxy preset in ``request.meta`` wins: ``None`` disables proxying for
    this request, any other value is passed through ``self._get_proxy`` and
    rewritten in ``meta``, with its credentials (if any) sent as a
    ``Proxy-Authorization`` header unless the request already has one.

    Without a preset, a proxy from ``self.proxies`` matching the URL scheme
    is applied through ``self._set_proxy``, except for http/https hosts for
    which ``proxy_bypass`` is truthy.

    Always returns ``None``.
    """
    preset_given = 'proxy' in request.meta

    if not preset_given:
        if not self.proxies:
            return

        url_parts = urlparse_cached(request)
        scheme = url_parts.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(url_parts.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
        return

    # ignore if proxy is already set (to None)
    if request.meta['proxy'] is None:
        return

    # extract credentials if present
    creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
    request.meta['proxy'] = proxy_url
    has_auth = request.headers.get('Proxy-Authorization')
    if creds and not has_auth:
        request.headers['Proxy-Authorization'] = b'Basic ' + creds