def verify_proxy_ip(self):
    """Verify whether every proxy IP stored in the database has expired; if it has,
    update status=1 to mark it for deletion."""
    p = SqliteHelper()
    result = p.db_select_all_for_verify()
    if result:
        for pro in result:
            pid = pro[0]
            aa_show = 'verify {0}-{1}:{2}'.format(pro[3], pro[1], pro[2])
            print(aa_show)
            proxyman.info(aa_show)
            p_ip = {
                "{0}".format(pro[3]): "http://{0}:{1}".format(pro[1], pro[2])
            }
            res = self.check_proxy_ip(p_ip)
            if not res:
                # This proxy IP is no longer usable
                sign_show = 'proxy ip【{0}】cannot be used, marked for deletion'.format(pro[1])
                print(sign_show)
                proxyman.info(sign_show)
                # Mark it as pending deletion
                p.db_update_for_status(pid, 1)
            Shelper.makeSleep(3, False)
    else:
        res_show = 'no proxy IPs pending verification were found in the database'
        print(res_show)
        proxyman.info(res_show)
def do_get(self):  # note: this was previously named do_GET(self)
    """ """
    ip_dict = {}  # previously this variable was named `dict`
    parsed_path = urlparse.urlparse(self.path)
    # Build the result returned for a successful request
    try:
        query = urllib.unquote(parsed_path.query)
        # print(query)  # debug output
        logger.info("query %s" % query)  # log the query
        # Parse the key=value parameters from the query string
        if query.find('&') != -1:
            param_list = query.split('&')
            for param in param_list:
                ip_dict[param.split('=')[0]] = param.split('=')[1]
        else:
            ip_dict[query.split('=')[0]] = query.split('=')[1]

        sql_helper = SqliteHelper()
        # Handle a request to delete a proxy
        if 'delete' in ip_dict:  # previously written as dict.has_key('delete')
            condition = "ip='" + ip_dict['ip'] + "' AND port=" + ip_dict['port']
            sql_helper.delete(SqliteHelper.tableName, condition)
            self.send_response(200)
            self.end_headers()
            self.wfile.write("Success delete proxy: " + ip_dict['ip'] + ":" + ip_dict['port'])
        else:
            str_count = ''
            conditions = []
            for key in ip_dict:
                if key == 'count':
                    str_count = 'LIMIT 0,%s' % ip_dict[key]
                if key == 'country' or key == 'area':
                    conditions.append(key + " LIKE '" + ip_dict[key] + "%'")
                elif key == 'types' or key == 'protocol' or key == 'country' or key == 'area':
                    conditions.append(key + "=" + ip_dict[key])
            if len(conditions) > 1:
                conditions = ' AND '.join(conditions)
            else:
                conditions = conditions[0]

            result = sql_helper.select(sql_helper.tableName, conditions, str_count)
            # print type(result)
            # for r in result:
            #     print r
            data = [{'ip': item[0], 'port': item[1]} for item in result]  # keep only the valid ip data
            data = json.dumps(data)  # serialize to JSON
            self.send_response(200)
            self.end_headers()
            self.wfile.write(data)
    except Exception as e:
        logger.warning(str(e))
        self.send_response(404)
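# --- Hedged usage sketch (not part of the handler above) ---------------------
# A minimal client-side example of how the query API parsed by do_get might be
# called. It assumes the handler is served by an HTTP server listening on
# 127.0.0.1:8000; that address and the sample ip/port values are assumptions
# for illustration only. Note the handler expects at least one of the
# types/protocol/country/area filters, since it indexes conditions[0].
import requests

BASE = 'http://127.0.0.1:8000'  # assumed address of the proxy-pool API

# Select up to 5 proxies; the handler turns types=0 into "types=0" and
# count=5 into "LIMIT 0,5", then returns a JSON list of {"ip", "port"} dicts.
resp = requests.get(BASE, params={'types': '0', 'count': '5'})
print(resp.json())

# Delete one proxy; the handler builds "ip='1.2.3.4' AND port=8080" and
# removes the matching row, answering with a plain-text confirmation.
requests.get(BASE, params={'delete': '1', 'ip': '1.2.3.4', 'port': '8080'})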
def do_GET(self):
    """ """
    dict = {}
    parsed_path = urlparse.urlparse(self.path)
    try:
        query = urllib.unquote(parsed_path.query)
        logger.info("query %s" % query)
        if query.find('&') != -1:
            params = query.split('&')
            for param in params:
                dict[param.split('=')[0]] = param.split('=')[1]
        else:
            dict[query.split('=')[0]] = query.split('=')[1]

        sqlHelper = SqliteHelper()
        # Handle a request to delete a proxy
        if dict.has_key('delete'):
            condition = "ip='" + dict['ip'] + "' AND port=" + dict['port']
            sqlHelper.delete(SqliteHelper.tableName, condition)
            self.send_response(200)
            self.end_headers()
            self.wfile.write("Success delete proxy: " + dict['ip'] + ":" + dict['port'])
        else:
            str_count = ''
            conditions = []
            for key in dict:
                if key == 'count':
                    str_count = 'LIMIT 0,%s' % dict[key]
                if key == 'country' or key == 'area':
                    conditions.append(key + " LIKE '" + dict[key] + "%'")
                elif key == 'types' or key == 'protocol' or key == 'country' or key == 'area':
                    conditions.append(key + "=" + dict[key])
            if len(conditions) > 1:
                conditions = ' AND '.join(conditions)
            else:
                conditions = conditions[0]

            result = sqlHelper.select(sqlHelper.tableName, conditions, str_count)
            # print type(result)
            # for r in result:
            #     print r
            data = [{'ip': item[0], 'port': item[1]} for item in result]
            data = json.dumps(data)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(data)
    except Exception, e:
        logger.warning(str(e))
        self.send_response(404)
def do_GET(self):
    """ """
    dict = {}
    parsed_path = urlparse.urlparse(self.path)
    try:
        query = urllib.unquote(parsed_path.query)
        print query
        if query.find('&') != -1:
            params = query.split('&')
            for param in params:
                dict[param.split('=')[0]] = param.split('=')[1]
        else:
            dict[query.split('=')[0]] = query.split('=')[1]

        str_count = ''
        conditions = []
        for key in dict:
            if key == 'count':
                str_count = 'LIMIT 0,%s' % dict[key]
            if key == 'country' or key == 'area':
                conditions.append(key + " LIKE '" + dict[key] + "%'")
            elif key == 'types' or key == 'protocol' or key == 'country' or key == 'area':
                conditions.append(key + "=" + dict[key])
        if len(conditions) > 1:
            conditions = ' AND '.join(conditions)
        else:
            conditions = conditions[0]

        sqlHelper = SqliteHelper()
        result = sqlHelper.select(sqlHelper.tableName, conditions, str_count)
        # print type(result)
        # for r in result:
        #     print r
        print result
        data = json.dumps(result)
        self.send_response(200)
        self.end_headers()
        self.wfile.write(data)
    except Exception, e:
        print e
        self.send_response(404)
def run(self):
    while True:
        logger.info("Start to run spider")
        sqlHelper = SqliteHelper()
        logger.info('Start to run validator')
        validator = Validator(sqlHelper)
        count = validator.run_db()
        logger.info('Finished to run validator, count=%s' % count)
        if count[0] < MINNUM:
            proxys = self.crawl_pool.map(self.crawl, parserList)
            # At this point proxys has the shape [[{},{},{}],[{},{},{}]]
            # print proxys
            # Time to de-duplicate; first flatten the nested list:
            proxys_tmp = []
            for proxy in proxys:
                proxys_tmp.extend(proxy)
            proxys = proxys_tmp
            logger.info('first_proxys: %s' % len(proxys))
            # Now proxys has the shape [{},{},{},{},{},{}]
            proxys_tmp = None
            # De-duplicate:
            proxys = [
                dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])
            ]
            logger.info('end_proxy: %s' % len(proxys))
            logger.info('spider proxys: %s' % type(proxys))
            proxys = validator.run_list(proxys)  # proxies that passed validation
            sqlHelper.batch_insert(sqlHelper.tableName, proxys)
            logger.info('success ip: %s' % sqlHelper.selectCount())
        sqlHelper.close()
        logger.info('Finished to run spider')
        time.sleep(UPDATE_TIME)
def run(self):
    while True:
        print 'spider beginning -------'
        sqlHelper = SqliteHelper()
        print 'validator beginning -------'
        validator = Validator(sqlHelper)
        count = validator.run_db()
        print 'validator end ----count=%s' % count
        if count[0] < MINNUM:
            proxys = self.crawl_pool.map(self.crawl, parserList)
            # At this point proxys has the shape [[{},{},{}],[{},{},{}]]
            # print proxys
            # Time to de-duplicate; first flatten the nested list:
            proxys_tmp = []
            for proxy in proxys:
                proxys_tmp.extend(proxy)
            proxys = proxys_tmp
            print 'first_proxys--%s', len(proxys)
            # Now proxys has the shape [{},{},{},{},{},{}]
            proxys_tmp = None
            # De-duplicate:
            proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])]
            print 'end_proxys--%s', len(proxys)
            print 'spider proxys -------%s' % type(proxys)
            proxys = validator.run_list(proxys)  # proxies that passed validation
            sqlHelper.batch_insert(sqlHelper.tableName, proxys)
            print 'success ip =%s' % sqlHelper.selectCount()
        sqlHelper.close()
        print 'spider end -------'
        time.sleep(UPDATE_TIME)
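# --- Hedged illustration (not part of run() above) ---------------------------
# A self-contained sketch of the de-duplication idiom used in both versions of
# run(): each proxy dict becomes a hashable tuple of items so a set can drop
# exact duplicates, then the tuples are converted back into dicts. The sample
# records below are made up for illustration only.
proxys = [
    {'ip': '1.2.3.4', 'port': 8080, 'protocol': 0},
    {'ip': '1.2.3.4', 'port': 8080, 'protocol': 0},  # exact duplicate
    {'ip': '5.6.7.8', 'port': 3128, 'protocol': 1},
]
proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])]
assert len(proxys) == 2  # the duplicate record has been removed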
def get_proxy_ip_mimiip(self, urlFormat, tmpName, maxPageNo=1, urlType=1):
    """[Fetch proxy IPs from mimiip]

    Arguments:
        urlFormat {[type]} -- [URL pattern]
        tmpName {[type]} -- [template directory]

    Keyword Arguments:
        maxPageNo {number} -- [maximum page number] (default: {1})
        urlType {number} -- [1: static page  2: dynamic page] (default: {1})
    """
    extra = Extractor()
    extra.setXsltFromFile(tmpName)
    doSpider = Spider()
    p = SqliteHelper()
    if maxPageNo <= 1:
        maxPageNo = 1
    maxPageNo += 1
    for page in range(1, maxPageNo):
        url = urlFormat.format(page)
        # url = 'http://www.mimiip.com/gngao/{0}'.format(page)
        html_dom = doSpider.getContent(url, urlType)
        op_xml = extra.extractHtmlDomtoXml(html_dom)
        op_json = doSpider.xmlToJson(op_xml)
        # proxyman.info(op_json)
        # print(op_json)
        # return False
        # break
        # Parse the converted JSON
        obj = json.loads(op_json)
        proxy_list = []
        if obj['proxyshow']:
            for ps in obj['proxyshow']['item']:
                proxy_dict = {}
                proxy_dict['xip'] = ps['xip']
                proxy_dict['xport'] = ps['xport']
                proxy_dict['xaddr'] = ps['xaddr'].replace('\n', '')
                proxy_dict['xlevel'] = ps['xlevel']
                proxy_dict['xprotocal'] = ps['xprotocal'].lower()
                proxy_list.append(proxy_dict)
        proxy_list_ok = []
        # Check each crawled proxy for availability
        for pro in proxy_list:
            aa_show = 'the {0}-{1}:{2} for {3}'.format(
                pro['xprotocal'], pro['xip'], pro['xport'], pro['xaddr'].encode('utf-8'))
            print(aa_show)
            proxyman.info(aa_show)
            p_ip = {
                "{0}".format(pro['xprotocal']): "http://{0}:{1}".format(pro['xip'], pro['xport'])
            }
            res = self.check_proxy_ip(p_ip)
            if res:
                proxy_list_ok.append(pro)
        # Insert the verified proxies into the database
        count = p.db_insert_for_proxyip(proxy_list_ok)
        print('insert %d ips success' % (count))
        # Pause briefly after finishing each page
        Shelper.makeSleep(5)
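# --- Hedged usage sketch (not part of the method above) ----------------------
# One way get_proxy_ip_mimiip might be called, assuming it is bound to a class
# such as the ProxySpider defined below. crawl_mimiip_example is a hypothetical
# helper, and 'template/mimiip.xml' is a hypothetical template path; only the
# URL pattern (with '{0}' as the page placeholder) appears in the commented-out
# line inside the method.
def crawl_mimiip_example(spider):
    """Drive get_proxy_ip_mimiip for a few pages of the mimiip listing."""
    spider.get_proxy_ip_mimiip(
        urlFormat='http://www.mimiip.com/gngao/{0}',  # page number substituted into '{0}'
        tmpName='template/mimiip.xml',                # hypothetical XSLT template path
        maxPageNo=3,                                  # crawl pages 1..3
        urlType=1,                                    # 1 = static page
    )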
class ProxySpider(object):
    """docstring for ProxySpider"""

    def __init__(self):
        self.SQLdb = SqliteHelper()
        self.proxyman = shelper.setLog('spider')

    # Fetch proxy IPs from mimiip
    def get_proxy_ip_mimiip(self, urlFormat, tmpName, maxPageNo=1, urlType=1):
        """[Fetch proxy IPs from mimiip]

        [Note: this method is an example that demonstrates the full flow of
        crawling a site's proxy list, verifying the proxies, and saving them
        to the database]

        Arguments:
            urlFormat {[type]} -- [URL pattern]
            tmpName {[type]} -- [template directory]

        Keyword Arguments:
            maxPageNo {number} -- [maximum page number] (default: {1})
            urlType {number} -- [1: static page  2: dynamic page] (default: {1})
        """
        extra = Extractor()
        extra.setXsltFromFile(tmpName)
        doSpider = Spider()
        p = SqliteHelper()
        if maxPageNo <= 1:
            maxPageNo = 1
        maxPageNo += 1
        for page in range(1, maxPageNo):
            url = urlFormat.format(page)
            # url = 'http://www.mimiip.com/gngao/{0}'.format(page)
            html_dom = doSpider.getContent(url, urlType)
            op_xml = extra.extractHtmlDomtoXml(html_dom)
            op_json = doSpider.xmlToJson(op_xml)
            # proxyman.info(op_json)
            # print(op_json)
            # return False
            # break
            # Parse the converted JSON
            obj = json.loads(op_json)
            proxy_list = []
            if obj['proxyshow']:
                for ps in obj['proxyshow']['item']:
                    proxy_dict = {}
                    proxy_dict['xip'] = ps['xip']
                    proxy_dict['xport'] = ps['xport']
                    proxy_dict['xaddr'] = ps['xaddr'].replace('\n', '')
                    proxy_dict['xlevel'] = ps['xlevel']
                    proxy_dict['xprotocal'] = ps['xprotocal'].lower()
                    proxy_list.append(proxy_dict)
            proxy_list_ok = []
            # Check each crawled proxy for availability
            for pro in proxy_list:
                aa_show = 'the {0}-{1}:{2} for {3}'.format(pro['xprotocal'], pro['xip'], pro['xport'], pro['xaddr'].encode('utf-8'))
                print(aa_show)
                self.proxyman.info(aa_show)
                p_ip = {"{0}".format(pro['xprotocal']): "http://{0}:{1}".format(pro['xip'], pro['xport'])}
                res = self.check_proxy_ip(p_ip)
                if res:
                    proxy_list_ok.append(pro)
            # Insert the verified proxies into the database
            count = p.db_insert_for_proxyip(proxy_list_ok)
            print('insert %d ips success' % (count))
            # Pause briefly after finishing each page
            shelper.makeSleep(5)

    # Fetch proxy IPs
    def get_proxy_ip(self, funcSite, urlFormat, tmpName, maxPageNo=1, urlType=1):
        """[Fetch the proxy IPs published by a given site]

        [Given the site URL and an XSLT template file, crawl the usable
        high-anonymity proxy IPs listed on that proxy site]

        Arguments:
            funcSite {[type]} -- [function that parses the JSON data for this site]
            urlFormat {[type]} -- [site URL, with "{0}" as the page-number placeholder]
            tmpName {[type]} -- [XSLT template for this site]

        Keyword Arguments:
            maxPageNo {number} -- [maximum page number] (default: {1})
            urlType {number} -- [HTML type of the site: 1 static, 2 dynamic] (default: {1})
        """
        extra = Extractor()
        extra.setXsltFromFile(tmpName)
        doSpider = Spider()
        if maxPageNo <= 1:
            maxPageNo = 1
        maxPageNo += 1
        try:
            for page in range(1, maxPageNo):
                url = urlFormat.format(page)
                # Fetch the HTML content of this page
                page_html_dom = doSpider.getContent(url, urlType)
                page_xml = extra.extractHtmlDomtoXml(page_html_dom)
                page_json_data = doSpider.xmlToJson(page_xml)
                # **************************************
                # Debug html
                # page_htmlStr = doSpider.htmlStr
                # self.proxyman.info(page_htmlStr)
                # Debug jsondata
                self.proxyman.info(page_json_data)
                # print(page_json_data)
                # **************************************
                # Parse the fetched content with the site-specific parser to get the crawled proxy list
                page_proxy_list = funcSite(page_json_data)
                # Filter the proxies by checking their availability
                page_proxy_list_ok = self.availabile_proxy_ip(page_proxy_list)
                # Insert the verified proxies into the database
                self.save_proxy_ip(page_proxy_list_ok)
                # Pause briefly after finishing each page
                shelper.makeSleep(5)
        except Exception as e:
            err_show = '[get_proxy_ip]--error-{0}'.format(str(e))
            print(err_show)
            self.proxyman.error(err_show)
        finally:
            fina_show = '[get_proxy_ip]--The work is Done'
            print(fina_show)
            self.proxyman.error(fina_show)

    # Proxy IP availability check
    def availabile_proxy_ip(self, proxyList):
        """[Availability check]

        [Iterate over the proxies and check whether each one is usable]

        Arguments:
            proxyList {[list]} -- [proxy IPs to check]

        Returns:
            [list] -- [proxy IPs that passed the check]
        """
        proxy_list_ok = []
        try:
            for pro in proxyList:
                aa_show = 'the {0}-{1}:{2} for {3}'.format(pro['xprotocal'], pro['xip'], pro['xport'], pro['xaddr'].encode('utf-8'))
                print(aa_show)
                self.proxyman.info(aa_show)
                # {"http": "http://102.168.5.103:8080"}
                p_ip = {"{0}".format(pro['xprotocal']): "http://{0}:{1}".format(pro['xip'], pro['xport'])}
                # Decide whether the proxy works by comparing a direct request
                # with a request routed through the proxy
                res = self.check_proxy_ip(p_ip)
                if res:
                    proxy_list_ok.append(pro)
        except Exception as e:
            err_show = '[availabile_proxy_ip]--error-{0}'.format(str(e))
            print(err_show)
            self.proxyman.error(err_show)
        finally:
            return proxy_list_ok

    # Save the verified proxy IPs to the database
    def save_proxy_ip(self, proxyList):
        """[Save to the database]

        [Insert the verified proxy IPs into the database]

        Arguments:
            proxyList {[list]} -- [verified proxy IPs]
        """
        count = self.SQLdb.db_insert_for_proxyip(proxyList)
        print('insert %d ips success' % (count))
        self.proxyman.info('insert %d ips success' % (count))

    # Check whether a given proxy IP works
    def check_proxy_ip(self, proxyip):
        """[Check whether a proxy IP works]

        [proxyip format: {"http": "http://120.52.73.97:8081"}]

        Arguments:
            proxyip {[dict]} -- [proxy IP dict to check]

        Returns:
            bool -- [whether the check passed]
        """
        s = requests.Session()
        a = requests.adapters.HTTPAdapter(max_retries=3)
        b = requests.adapters.HTTPAdapter(max_retries=3)
        s.mount('http://', a)
        s.mount('https://', b)
        the_checked_ip = proxyip.values()[0]
        try:
            MaskedIP = s.get("http://icanhazip.com", timeout=10, proxies=proxyip).content.strip()
            # Use a regex to confirm the response body is an IP address
            pattern = r"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$"
            pattern_res = re.match(pattern, MaskedIP)
            if not pattern_res:
                res_show = 'return result is not ip'
                print(res_show)
                self.proxyman.error('Result Content is Not Ip')
                return False
            mask_ip = pattern_res.group(0)
            # Request the same URL directly, without the proxy
            OrigionalIP = requests.get("http://icanhazip.com", timeout=30).content.strip()
            ip_show = 'origional_ip is [{0}] -- mask_ip is [{1}]'.format(OrigionalIP, mask_ip)
            print(ip_show)
            if OrigionalIP != mask_ip:
                print('Proxy IP ok')
                self.proxyman.info('the mask ip【{0}】and return ip【{1}】is {2}'.format(the_checked_ip, mask_ip, '【OK】'))
                return True
            else:
                print('Not Anonymous')
                self.proxyman.info('the mask ip【{0}】and return ip【{1}】is {2}'.format(the_checked_ip, mask_ip, 'Not Anonymous'))
                return False
        except requests.exceptions.Timeout:
            print('the request timeout')
            self.proxyman.error('Timeout')
            return False
        except Exception as e:
            print('the request error')
            self.proxyman.error('Error')
            return False

    # Check whether the proxies in the database still work and mark the dead ones
    def verify_proxy_ip(self):
        """[Check whether the proxy IPs have expired]

        [Verify every proxy IP stored in the database; if one has expired,
        update status=1 to mark it for deletion]
        """
        # Fetch the full proxy IP list from the database
        result = self.SQLdb.db_select_all_for_verify()
        if result:
            for pro in result:
                pid = pro[0]
                aa_show = 'verify {0}-{1}:{2}'.format(pro[3], pro[1], pro[2])
                print(aa_show)
                self.proxyman.info(aa_show)
                p_ip = {"{0}".format(pro[3]): "http://{0}:{1}".format(pro[1], pro[2])}
                res = self.check_proxy_ip(p_ip)
                if not res:
                    # This proxy IP is no longer usable
                    sign_show = 'proxy ip【{0}】cannot be used, marked for deletion'.format(pro[1])
                    print(sign_show)
                    self.proxyman.info(sign_show)
                    # Mark it as pending deletion in the database
                    self.SQLdb.db_update_for_status(pid, 1)
                shelper.makeSleep(3, False)
        else:
            res_show = 'no proxy IPs pending verification were found in the database'
            print(res_show)
            self.proxyman.info(res_show)
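# --- Hedged usage sketch (not part of the class above) -----------------------
# How ProxySpider might be driven end to end. The parser below mirrors the
# xip/xport/xaddr/xlevel/xprotocal parsing in get_proxy_ip_mimiip; the template
# path is hypothetical, and the sample proxy address is taken from
# check_proxy_ip's docstring example.
import json


def parse_mimiip(page_json_data):
    """Site-specific parser passed to get_proxy_ip as funcSite: turn the JSON
    produced by the XSLT extraction into a list of proxy dicts."""
    obj = json.loads(page_json_data)
    proxy_list = []
    if obj.get('proxyshow'):
        for ps in obj['proxyshow']['item']:
            proxy_list.append({
                'xip': ps['xip'],
                'xport': ps['xport'],
                'xaddr': ps['xaddr'].replace('\n', ''),
                'xlevel': ps['xlevel'],
                'xprotocal': ps['xprotocal'].lower(),
            })
    return proxy_list


if __name__ == '__main__':
    spider = ProxySpider()
    # Quick single-proxy check, using the dict format from the docstring.
    print(spider.check_proxy_ip({"http": "http://120.52.73.97:8081"}))
    # Crawl, verify, and store proxies from one site (template path is hypothetical).
    spider.get_proxy_ip(parse_mimiip, 'http://www.mimiip.com/gngao/{0}',
                        'template/mimiip.xml', maxPageNo=2, urlType=1)
    # Re-verify everything already stored in the database.
    spider.verify_proxy_ip()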