def _check_one_http_proxy(self, proxy):
    # Check the proxy's anonymity level against a dedicated checker page.
    check_anonymity_url = "http://www.xxorg.com/tools/checkproxy/"
    fetch_result = fetch(check_anonymity_url, proxy)
    response = fetch_result['response']
    if response is None:
        if self.recheck:
            delete_proxy_from_db(proxy)
        return
    response.encoding = 'utf-8'
    html = response.text
    result = BeautifulSoup(html, "html5lib").find("div", id="result")
    # Map the checker page's Chinese labels to internal anonymity levels:
    # "透明" = transparent, "普通匿名" = normal anonymity, "高匿名" = high anonymity.
    anonymities = {
        "透明": "transparent",
        "普通匿名": "normal_anonymity",
        "高匿名": "high_anonymity"
    }
    for anonymity in anonymities.keys():
        if anonymity in str(result):
            proxy.anonymity = anonymities[anonymity]
            # Look up the proxy's country via ip-api.com.
            check_address_url = "http://ip-api.com/json/"
            fetch_result = fetch(check_address_url, proxy)
            response = fetch_result['response']
            if response is None:
                if self.recheck:
                    delete_proxy_from_db(proxy)
                return
            try:
                proxy.country = response.json()['country']
                proxy.round_trip_time = fetch_result['round_trip_time']
                save_proxy_to_db(proxy)
            except JSONDecodeError:
                delete_proxy_from_db(proxy)
                return
            break
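# The `fetch` helper used throughout this file is not defined here. Below is a
# minimal sketch of the contract the call sites rely on -- a dict with
# 'response', 'response_status_code', 'round_trip_time' and 'retry_num' keys --
# assuming a requests-based implementation with RETRY_NUM attempts. The real
# helper may differ; this is illustrative only.
import time
import requests

RETRY_NUM = 3  # assumed retry budget; db_proxy() below divides retry_num by this

def fetch(url, proxy=None, proxy_type='http'):
    proxies = None
    if proxy is not None:
        # Accept either a ready-made proxy URL string or a Proxy_IP object.
        proxy_url = proxy if isinstance(proxy, str) else \
            proxy_type + "://" + proxy.ip_and_port
        proxies = {"http": proxy_url, "https": proxy_url}
    for retry_num in range(1, RETRY_NUM + 1):
        try:
            start = time.time()
            response = requests.get(url, proxies=proxies, timeout=10)
            return {'response': response,
                    'response_status_code': response.status_code,
                    'round_trip_time': time.time() - start,
                    'retry_num': retry_num}
        except requests.RequestException:
            continue
    return {'response': None, 'response_status_code': None,
            'round_trip_time': None, 'retry_num': RETRY_NUM}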
def _check_one_https_proxy(self, proxy):
    # First target: the Douban Books front page.
    testURL = "https://book.douban.com/"
    fetch_result = fetch(url=testURL, proxy=proxy, proxy_type='https')
    response = fetch_result['response']
    # Second target: cn.investing.com, the site the spider actually crawls.
    spiderURL = "https://cn.investing.com/"
    spider_fetch_result = fetch(url=spiderURL, proxy=proxy, proxy_type='https')
    spider_response_status = spider_fetch_result['response_status_code']
    if spider_response_status == 200:
        print("proxy cn.investing.com", proxy, spider_response_status)
    else:
        spider_response_status = None
    if response is None or spider_response_status != 200:
        logger.info('proxy failed the https check, proxy:{}'.format(proxy))
        if self.recheck:
            update_proxy_score(proxy, res=0)
        return
    response.encoding = 'utf-8'
    html = response.text
    # The Douban Books title keywords ("Douban Books, new releases, bestsellers,
    # book reviews, book lists") confirm we got the real page, not a proxy error page.
    if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
        proxy.round_trip_time = fetch_result['round_trip_time']
        update_proxy_score(proxy, res=1)  # fixed: record a success; the bare call records a failure
        # save_proxy_to_db(proxy)
    else:
        if self.recheck:
            update_proxy_score(proxy, res=0)
        return
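# `update_proxy_score` is also defined elsewhere. The sketch below captures the
# behaviour implied by the call sites and by the comments in db_proxy(): res=1
# records a success, the default records a failure and decrements the success
# count, and a proxy whose count drops below zero is deleted. An assumption,
# not the project's actual implementation.
def update_proxy_score(proxy, res=0):
    right_times = int(proxy.right_times) + (1 if res == 1 else -1)
    if right_times < 0:
        delete_proxy_from_db(proxy)
        return
    proxy.right_times = str(right_times)
    proxy.all_times = str(int(proxy.all_times) + 1)
    proxy.timestamp = datetime.datetime.now()
    save_proxy_to_db(proxy)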
def parse_ip_proxy(self, url):
    # Fetch the listing page through a random proxy from the pool.
    proxy = random.choice(self.proxypool)
    fetch_result = fetch(url, proxy)
    response = fetch_result['response']
    if not response:
        logger.info('response is None, url:{}, proxy:{}'.format(url, proxy))
        return
    response.encoding = 'utf-8'
    response_status_code = response.status_code
    print('response status_code:{}, url:{}, proxy:{}'.format(
        response_status_code, url, proxy))
    html = response.text
    soup = BeautifulSoup(html, "html5lib")
    # Skip the header row of the ip_list table.
    trs = soup.find('table', id="ip_list").find('tbody').find_all('tr')[1:]
    for tr in trs:
        tds = tr.find_all('td')
        ip_and_port = tds[1].string + ":" + tds[2].string
        # proxy = Proxy_IP(ip_and_port=ip_and_port, type='https')
        proxy = Proxy_IP(ip_and_port=ip_and_port)
        # Column 4 holds the anonymity label: '高匿' = high anonymity,
        # '透明' = transparent.
        if tds[4].string == '高匿':
            proxy.anonymity = 'high_anonymity'
        elif tds[4].string == '透明':
            proxy.anonymity = 'transparent'
        proxy.country = 'China'
        # Column 5 holds the protocol ('HTTP' or 'HTTPS').
        httptype = tds[5].string
        if httptype == 'HTTPS':
            proxy.type = 'https'
        self.proxy_list.add(proxy)
        logger.info(self.__class__.__name__ + " " + ip_and_port + " " +
                    proxy.anonymity)
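# The Proxy_IP model and its save/delete helpers live elsewhere in the project.
# Here is a minimal peewee-based sketch, with field names taken from their
# usage in this file; the field types, defaults and SQLite backend are
# assumptions.
import datetime
from peewee import SqliteDatabase, Model, CharField, DateTimeField

db = SqliteDatabase('proxy.db')  # hypothetical backend

class Proxy_IP(Model):
    ip_and_port = CharField(unique=True)
    type = CharField(default='http')              # 'http' or 'https'
    anonymity = CharField(default='transparent')  # see parse_ip_proxy above
    country = CharField(null=True)
    round_trip_time = CharField(null=True)
    right_times = CharField(default='0')          # successful checks
    all_times = CharField(default='0')            # total checks
    timestamp = DateTimeField(default=datetime.datetime.now)

    class Meta:
        database = db

def save_proxy_to_db(proxy):
    proxy.save()

def delete_proxy_from_db(proxy):
    Proxy_IP.delete().where(
        Proxy_IP.ip_and_port == proxy.ip_and_port).execute()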
def db_proxy():
    data = []
    proxies = Proxy_IP.select().where(Proxy_IP.type == 'https').order_by(
        Proxy_IP.timestamp)
    for proxy in proxies:
        r_times = int(proxy.right_times)
        a_times = int(proxy.all_times)
        # Guard against proxies that have never been checked.
        success_rate = r_times * 1.0 / a_times if a_times else 0.0
        ip_and_port = proxy.ip_and_port
        httptype = proxy.type
        proxyurl = httptype + "://" + ip_and_port
        logger.info("db proxyurl is {}".format(proxyurl))
        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        response = fetch_result['response_status_code']
        retry_num = fetch_result['retry_num']
        retry_success_rate = retry_num * 1.0 / RETRY_NUM
        # Keep proxies whose overall success rate exceeds 60% and whose latest
        # fetch succeeded within 2 attempts (out of RETRY_NUM = 3 retries).
        if success_rate > 0.6 and response == 200 and retry_success_rate < 0.7:
            update_proxy_score(proxy, res=1)
            one_proxy_data_dic = {
                "proxy": proxyurl,
                "proxy_scheme": proxy.type
            }
            data.append(one_proxy_data_dic)
            logger.info("from db add proxyinfo:{} ".format(one_proxy_data_dic))
        # Failing proxies have their success count decremented in the DB;
        # records whose success count drops below 0 are deleted.
        else:
            logger.info(
                "proxy success rate is too low, proxy info:{}, latest response_status_code:{}"
                .format(proxyurl, response))
            # delete_proxy_from_db(proxy)
            update_proxy_score(proxy)
    return data
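# A quick worked example of the db_proxy() thresholds (numbers hypothetical):
right_times, all_times = 7, 10   # 7 successes out of 10 checks
retry_num = 2                    # last fetch succeeded on the 2nd of 3 tries
assert right_times / all_times > 0.6   # overall success rate passes
assert retry_num / 3 < 0.7             # needed at most 2 attempts -> passes
assert not (3 / 3 < 0.7)               # needing all 3 tries would be rejected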
def parse_ip_proxy(self, url):
    fetch_result = fetch(url)
    response = fetch_result['response']
    if response is None:  # fixed: guard against failed fetches before decoding
        logger.info('response is None, url:{}'.format(url))
        return
    # This source serves GBK-encoded pages.
    response.encoding = 'gbk'
    html = response.text
    soup = BeautifulSoup(html, "html5lib")
    # Skip the header row of the listing table.
    trs = soup.find('div', id="main").find('tbody').find_all('tr')[1:]
    for tr in trs:
        tds = tr.find_all('td')
        ip_and_port = tds[0].string + ":" + tds[1].string
        self.proxy_list.add(Proxy_IP(ip_and_port=ip_and_port))
        logger.info(self.__class__.__name__ + " " + ip_and_port)
def _check_one_https_proxy(self, proxy):
    testURL = "https://book.douban.com/"
    fetch_result = fetch(url=testURL, proxy=proxy, proxy_type='https')
    response = fetch_result['response']
    if response is None:
        logger.info('response is None, proxy:{}'.format(proxy))
        if self.recheck:
            delete_proxy_from_db(proxy)
        return
    response.encoding = 'utf-8'
    html = response.text
    if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
        proxy.round_trip_time = fetch_result['round_trip_time']
        save_proxy_to_db(proxy)
    else:
        if self.recheck:
            delete_proxy_from_db(proxy)
        return
def run(self):
    fetch_result = fetch(self.start_url)
    response = fetch_result['response']
    if response:
        self.parse_ip_proxy(response)
def json_proxy():
    data = []
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if proxylist:
        for proxy in proxylist:
            proxyurl = proxy['proxy']
            # Proxies on port 3888 are private; skip them here.
            pattern = ':3888$'
            if not re.search(pattern, proxyurl):
                # if proxyurl != "http://192.168.88.176:3888":
                fetch_result = fetch(url=fetch_url, proxy=proxyurl,
                                     proxy_type='https')
                response = fetch_result['response_status_code']
                # Check whether the proxy IP is already in the DB.
                ip_and_port = proxyurl.split('/')[-1]
                httptype = proxyurl.split(':')[0]
                proxies = Proxy_IP.select().where(
                    Proxy_IP.ip_and_port == ip_and_port,
                    Proxy_IP.type == httptype).first()
                # print("proxies", proxies)
                # Build the record used for scoring / insertion.
                proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
                proxyinfo.timestamp = datetime.datetime.now()
                if proxies:
                    # The IP is already in the DB: adjust its score.
                    if response == 200:
                        update_proxy_score(proxyinfo, res=1)
                        data.append(proxy)
                        logger.info(
                            "from jsonfile add proxyinfo:{} ".format(proxy))
                    else:
                        update_proxy_score(proxyinfo)
                        logger.info(
                            "proxy response is not 200, dropped from jsonfile, proxy info:{} "
                            .format(proxy))
                else:
                    # The IP is not in the DB yet: insert a new record.
                    proxyinfo.type = 'https'
                    proxyinfo.anonymity = 'high_anonymity'
                    proxyinfo.round_trip_time = '1'
                    proxyinfo.country = 'China'
                    proxyinfo.all_times = '1'
                    if response == 200:
                        proxyinfo.right_times = '1'
                        save_proxy_to_db(proxyinfo)
                        data.append(proxy)
                        logger.info(
                            "from jsonfile add proxyinfo:{} ".format(proxy))
                    else:
                        # fixed: a failed check must not count as a success
                        proxyinfo.right_times = '0'
                        save_proxy_to_db(proxyinfo)
                        logger.info(
                            "proxy response is not 200, dropped from jsonfile, proxy info:{} "
                            .format(proxy))
    return data
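# The JSON file read by json_proxy() above is expected to hold a list of dicts
# matching what db_proxy() emits. A hypothetical example (the private proxy on
# port 3888 comes from the commented-out line above):
#
# [
#     {"proxy": "https://1.2.3.4:8080", "proxy_scheme": "https"},
#     {"proxy": "http://192.168.88.176:3888", "proxy_scheme": "http"}
# ]
#
# Only the "proxy" key is read here; entries ending in :3888 are skipped.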