def _extract_proxy(self, page_num):
    """Fetch one page of the proxy-list site via urllib and parse proxies.

    Builds the page URL from ``self.url_template``, requests it through the
    current upstream proxy (``self.cur_proxy``), and extracts host/port pairs
    with the pre-compiled ``re_ip_pattern`` / ``re_port_pattern``.

    :param page_num: 1-based page number substituted into the URL template.
    :return: list of dicts with keys ``host``, ``port``, ``from``,
             ``grab_time`` (one per proxy found on the page).

    On any request/parse problem the error is logged via ``self._log`` and
    ``self._need_retry()`` is invoked (which is expected to abort this call).
    """
    # Build the URL outside the try block so the except-branch log call
    # can never hit an unbound `full_url`.
    full_url = self.url_template.format(page=page_num)
    try:
        req = request.Request(full_url)
        # BUG FIX: the original did `req.add_header = [self._headers]`,
        # which *overwrote* the bound add_header method with a list and
        # never sent any headers. Register each header properly instead.
        for key, value in self._headers.items():
            req.add_header(key, value)
        httpproxy_handler = urllib.request.ProxyHandler(self.cur_proxy)
        opener = request.build_opener(httpproxy_handler)
        # timeout=10 matches the sibling requests-based extractors.
        with opener.open(req, timeout=10) as rp:
            rs = rp.read().decode('utf-8')
            if rp.status != 200:
                self._log(logger, 'unexpected http status code %s' % rp.status, full_url, 'restricted')
                self._need_retry()
    except Exception as e:
        self._log(logger, 'request error', full_url, str(e))
        self._need_retry()
    re_ip_result = self.re_ip_pattern.findall(rs)
    re_port_result = self.re_port_pattern.findall(rs)
    if not re_ip_result or not re_port_result:
        self._log(logger, 'extract data error', full_url, 'find no proxy data in web page')
        self._need_retry()
    if len(re_ip_result) != len(re_port_result):
        self._log(logger, 'extract data error', full_url, 'the number of hosts and ports extracted from web page are different')
        self._need_retry()
    result_list = zip(re_ip_result, re_port_result)
    return [{'host': host, 'port': port, 'from': self.name, 'grab_time': get_current_time_str()} for host, port in result_list]
def _extract_proxy(self):
    """Download the proxy-list page and extract (host, port) pairs.

    Uses ``self.re_ip_port_pattern`` — a single regex capturing both host
    and port — against the response body. Errors are logged via
    ``self._log`` and handed to ``self._need_retry()``.

    :return: list of dicts with keys ``host``, ``port``, ``from``,
             ``grab_time``.
    """
    try:
        full_url = self.url_template
        resp = requests.get(url=full_url, headers=self._headers,
                            proxies=self.cur_proxy, timeout=10)
        if resp.status_code != 200:
            self._log(logger,
                      'unexpected http status code %s' % resp.status_code,
                      full_url, 'restricted')
            self._need_retry()
    except Exception as err:
        self._log(logger, 'request error', full_url, str(err))
        self._need_retry()
    pairs = self.re_ip_port_pattern.findall(resp.text)
    if not pairs:
        self._log(logger, 'extract data error', full_url,
                  'find no proxy data in web page')
        self._need_retry()
    return [
        {
            'host': host,
            'port': port,
            'from': self.name,
            'grab_time': get_current_time_str(),
        }
        for host, port in pairs
    ]
def _extract_proxy(self, page_num):
    """Fetch one page of the proxy list, handling the site's JS challenge.

    Sends the stored cookie with every request. A 521 response carries
    JavaScript that computes a fresh cookie; in that case the cookie is
    refreshed via ``self._set_cookies`` and the fetch is retried on the
    same upstream proxy (``switch_proxy=False``).

    :param page_num: 1-based page number substituted into the URL template.
    :return: list of dicts with keys ``host``, ``port``, ``from``,
             ``grab_time``.
    """
    try:
        full_url = self.url_template.format(page=page_num)
        self._headers.update({'cookie': self._cookie})
        resp = requests.get(url=full_url, headers=self._headers,
                            proxies=self.cur_proxy, timeout=10)
        if resp.status_code == 521:
            # Anti-bot challenge: body is JS that derives a new cookie.
            self._log(logger,
                      'unexpected http status code %s' % resp.status_code,
                      full_url, 'response javascript code')
            self._set_cookies(resp.text)
            self._need_retry(switch_proxy=False)
        elif resp.status_code != 200:
            self._log(logger,
                      'unexpected http status code %s' % resp.status_code,
                      full_url, 'restricted')
            self._need_retry()
    except Exception as err:
        self._log(logger, 'request error', full_url, str(err))
        self._need_retry()
    hosts = self.re_ip_pattern.findall(resp.text)
    ports = self.re_port_pattern.findall(resp.text)
    if not hosts or not ports:
        self._log(logger, 'extract data error', full_url,
                  'find no proxy data in web page')
        self._need_retry()
    if len(hosts) != len(ports):
        self._log(
            logger, 'extract data error', full_url,
            'the number of hosts and ports extracted from web page are different'
        )
        self._need_retry()
    return [
        {
            'host': host,
            'port': port,
            'from': self.name,
            'grab_time': get_current_time_str(),
        }
        for host, port in zip(hosts, ports)
    ]
def _validate_proxy(self, proxy, protocol):
    """Probe one proxy through httpbin and build its quality record.

    Issues a GET to ``{protocol}://httpbin.org/get`` routed through the
    candidate proxy, using the request start time as a cache-busting echo
    token. Returns ``None`` for any proxy that fails, times out, returns
    non-JSON, or echoes back a stale ``cur`` value (i.e. the response was
    served by something other than our live request).

    :param proxy: dict with at least ``host``/``port`` (plus optional
                  ``country``, ``from``, ``grab_time``).
    :param protocol: 'http' or 'https' — the scheme to validate.
    :return: dict describing the working proxy (protocol, anonymity,
             country, response time, export address, timestamps) or
             ``None`` if validation failed.
    """
    host = proxy.get('host')
    port = proxy.get('port')
    request_proxies = {protocol: "%s:%s" % (host, port)}
    request_begin = time.time()
    try:
        response_json = requests.get(
            "%s://httpbin.org/get?show_env=1&cur=%s" % (protocol, request_begin),
            proxies=request_proxies,
            timeout=5).json()
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Keep the deliberate best-effort
        # contract (bad proxy -> None) but only for ordinary exceptions
        # (connection/timeout errors, non-JSON bodies).
        return
    request_end = time.time()
    # Reject cached/injected responses: httpbin must echo our exact token.
    if not isinstance(response_json, dict) or str(request_begin) != response_json.get(
            'args', {}).get('cur', ''):
        return
    anonymity = self._check_proxy_anonymity(response_json)
    country = proxy.get('country')
    # Fall back to a GeoIP lookup when the grabber supplied no country.
    # NOTE(review): _geoip_reader.country() can raise for unknown hosts —
    # callers appear to rely on that propagating; left unchanged.
    country = country or self._geoip_reader.country(host).country.iso_code
    export_address = self._check_export_address(response_json)
    return {
        "protocol": protocol,
        "host": host,
        "export_address": export_address,
        "port": port,
        "anonymity": anonymity,
        "country": country,
        "response_time": round(request_end - request_begin, 2),
        "from": proxy.get('from'),
        'grab_time': proxy.get('grab_time'),
        'check_time': get_current_time_str()
    }