def db_proxy():
    """Collect currently-usable https proxies from the database.

    Re-probes every stored https proxy against ``fetch_url`` and keeps those
    whose historical success rate exceeds 60%, whose latest probe returned
    HTTP 200, and whose retry ratio stayed below 0.7.  DB scores are updated
    as a side effect via ``update_proxy_score``.

    Returns:
        list[dict]: entries of the form
        ``{"proxy": "<scheme>://<ip>:<port>", "proxy_scheme": "<scheme>"}``.
    """
    data = []
    proxies = Proxy_IP.select().where(Proxy_IP.type == 'https').order_by(
        Proxy_IP.timestamp)
    for proxy in proxies:
        r_times = int(proxy.right_times)
        a_times = int(proxy.all_times)
        # Guard against a zero/corrupt all_times counter (the previous
        # unconditional division raised ZeroDivisionError in that case).
        success_rate = r_times * 1.0 / a_times if a_times else 0.0
        ip_and_port = proxy.ip_and_port
        httptype = proxy.type
        proxyurl = httptype + "://" + ip_and_port
        logger.info("db proxyurl is {}".format(proxyurl))
        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        response = fetch_result['response_status_code']
        retry_num = fetch_result['retry_num']
        retry_success_rate = retry_num * 1.0 / RETRY_NUM
        # Keep proxies with >60% overall success whose latest probe
        # succeeded within at most 2 attempts (RETRY_NUM is the total
        # retry budget, 3 per the original comment).
        if success_rate > 0.6 and response == 200 and retry_success_rate < 0.7:
            update_proxy_score(proxy, res=1)
            one_proxy_data_dic = {
                "proxy": proxyurl,
                "proxy_scheme": proxy.type
            }
            data.append(one_proxy_data_dic)
            logger.info("from db add proxyinfo:{} ".format(one_proxy_data_dic))
        else:
            # Proxies below the bar get their score decremented; per the
            # original comment the record is deleted once the success count
            # drops below 0 -- handled inside update_proxy_score (helper not
            # visible here, TODO confirm).
            logger.info(
                "proxy success is too low, proxy info:{}, latest response_status_code:{}"
                .format(proxyurl, response))
            # delete_proxy_from_db(proxy)
            update_proxy_score(proxy)
    return data
def __init__(self):
    """Seed the crawl queue with xicidaili listing pages 1-99 and load
    the http proxies currently stored in the database."""
    super(XicidailiSpider, self).__init__()
    for page in range(1, 100):
        self.url_list.put("http://www.xicidaili.com/wn/{}".format(page))
    self.proxypool = Proxy_IP.select().where(Proxy_IP.type == 'http')
def GET(self):
    """web.py GET handler: return stored proxies as JSON.

    Optional query parameters: ``country``, ``anonymity``, ``type`` and
    ``number`` (max results).  Responds with
    ``{"num": ..., "updatetime": ..., "data": [...]}``.
    """
    get_input = web.input(_method='get')
    # web.Storage raises AttributeError for missing keys; getattr with a
    # default replaces the former four bare try/except blocks.
    query_country = getattr(get_input, 'country', None)
    query_anonymity = getattr(get_input, 'anonymity', None)
    query_number = getattr(get_input, 'number', None)
    query_type = getattr(get_input, 'type', None)
    if query_number is not None:
        # BUG FIX: query parameters arrive as strings; the old code
        # compared the raw string against len(data) and used it as a
        # slice bound, raising TypeError.  Non-numeric values are ignored.
        try:
            query_number = int(query_number)
        except ValueError:
            query_number = None
    proxies = Proxy_IP.select().order_by(Proxy_IP.timestamp)
    # Oldest record's timestamp, without the fractional-seconds part.
    updatetime = str(proxies[0].timestamp).split('.')[0]
    data = []
    # Anonymity ranking used so a query for e.g. "anonymity" also returns
    # higher (more anonymous) levels.
    anonymity_level = {
        "transparent": 0,
        "anonymity": 1,
        "normal_anonymity": 1,
        "high_anonymity": 2
    }
    for proxy in proxies:
        if query_country and proxy.country != query_country:
            continue
        if query_type and proxy.type != query_type:
            continue
        if query_anonymity:
            print(query_anonymity)
            if anonymity_level[
                    proxy.anonymity] < anonymity_level[query_anonymity]:
                continue
        one_proxy_data_dic = {
            "ip_and_port": proxy.ip_and_port,
            "country": proxy.country,
            "type": proxy.type,
            "anonymity": proxy.anonymity,
            "round_trip_time": proxy.round_trip_time
        }
        data.append(one_proxy_data_dic)
    if query_number is not None and query_number < len(data):
        data = data[0:query_number]
    return_dic = {"num": len(data), "updatetime": updatetime, "data": data}
    return json.dumps(return_dic)
def json_proxy():
    """Validate proxies listed in the local json file and sync them into the DB.

    Each proxy is probed against ``fetch_url``; working ones are returned and
    their DB score is increased, failing ones are penalized.  Proxies on port
    3888 are private and skipped entirely.

    Returns:
        list[dict]: the json-file entries that passed the probe.
    """
    data = []
    # `with` guarantees the file handle is closed even if json.load raises.
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if proxylist:
        # Port 3888 marks private proxies; compiled once outside the loop.
        private_port = re.compile(r':3888$')
        for proxy in proxylist:
            proxyurl = proxy['proxy']
            if private_port.search(proxyurl):
                continue
            fetch_result = fetch(url=fetch_url,
                                 proxy=proxyurl,
                                 proxy_type='https')
            response = fetch_result['response_status_code']
            # Look up the proxy in the DB by "<ip>:<port>" and scheme.
            ip_and_port = proxyurl.split('/')[-1]
            httptype = proxyurl.split(':')[0]
            existing = Proxy_IP.select().where(
                Proxy_IP.ip_and_port == ip_and_port,
                Proxy_IP.type == httptype).first()
            # Fresh model object used both for score updates and for
            # inserting a brand-new record.
            proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
            proxyinfo.ip_and_port = ip_and_port
            proxyinfo.timestamp = datetime.datetime.now()
            if existing:
                # Already known: only adjust its score.
                if response == 200:
                    update_proxy_score(proxyinfo, res=1)
                    data.append(proxy)
                    logger.info(
                        "from jsonfile add proxyinfo:{} ".format(proxy))
                else:
                    update_proxy_score(proxyinfo)
                    logger.info(
                        "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                        .format(proxy))
            else:
                # New record: seed it with default metadata.  NOTE(review):
                # type is hard-coded 'https' even though httptype was parsed
                # above -- kept as-is, confirm intent.
                proxyinfo.type = 'https'
                proxyinfo.anonymity = 'high_anonymity'
                proxyinfo.round_trip_time = '1'
                proxyinfo.country = 'China'
                proxyinfo.all_times = '1'
                proxyinfo.timestamp = datetime.datetime.now()
                # BUG FIX: the failure branch previously also stored
                # right_times='1', making a dead proxy look 100% successful
                # to db_proxy()'s success-rate check.
                proxyinfo.right_times = '1' if response == 200 else '0'
                save_proxy_to_db(proxyinfo)
                if response == 200:
                    data.append(proxy)
                    logger.info(
                        "from jsonfile add proxyinfo:{} ".format(proxy))
                else:
                    logger.info(
                        "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                        .format(proxy))
    return data
        # --- Tail of an https-check method whose `def` line is outside this
        # view; `response`, `proxy` and `fetch_result` are bound earlier in
        # that method (presumably by a fetch through the proxy -- confirm).
        response.encoding = 'utf-8'
        html = response.text
        # The probe target is douban's book page: finding its title keywords
        # means the proxy relayed the real content (not hijacked/blocked).
        if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
            proxy.round_trip_time = fetch_result['round_trip_time']
            save_proxy_to_db(proxy)
        else:
            # In recheck mode a failing proxy is evicted from the DB;
            # during normal collection it is simply not saved.
            if self.recheck:
                delete_proxy_from_db(proxy)
        return

    def _check_one_proxy(self, proxy):
        """Dispatch *proxy* to the scheme-specific checker."""
        if proxy.type == 'http':
            self._check_one_http_proxy(proxy)
        else:
            self._check_one_https_proxy(proxy)

    def run(self, ):
        """Check all queued proxies concurrently, then wait for completion."""
        # self.pool has spawn/join -- looks like a gevent.pool.Pool; confirm.
        for proxy in self.proxies:
            self.pool.spawn(self._check_one_proxy, proxy)
        self.pool.join()


if __name__ == "__main__":
    # Standalone entry point: re-validate every proxy currently in the DB.
    logger.info("-------Recheck Start-------")
    check_proxy = Check_proxy()
    check_proxy.recheck = True
    proxies = Proxy_IP.select()
    check_proxy.proxies.extend(proxies)
    check_proxy.run()
    logger.info("-------Recheck Finish-------")