def scan_proxy(): """ 扫描代理资源 :return: """ import requests from pyquery import PyQuery as Pq source_site = 'http://ip.qiaodm.com/' header = { 'Host': 'ip.qiaodm.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36' } s = requests.session() # 抓取页面 file_html = s.get(source_site).content # 保存文件 # with open('test.html', 'a') as f: # f.write(file_html.encode('utf-8')) # # # 读取抓取的页面 # with open('test.html', 'r') as f: # file_html = f.read() text_pq = Pq(file_html) tr_list = text_pq('tbody').find('tr[style="text-align: center;"]') print '单页共 %s 条记录' % len(tr_list) for tr_item in tr_list: # print Pq(tr_item).html() # print('---------------------') td_list = Pq(tr_item).find('td') # print '单条共 %s 列字段' % len(td_list) field_list = [] for td_item in Pq(td_list): field = Pq(td_item).text() field_list.append(field) # print field # print('++++++++++++++++++') # 特殊处理ip地址 ip = Pq(td_list).eq(0).html() # 去除干扰信息 ip = html.replace_html(ip, r'<p style="display:none;"/>') ip = html.replace_html(ip, r'<p style="display: none;"/>') ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>') # 去除标签 ip = html.strip_html(ip) # print ip # 过滤掉非法ip地址 if len(ip.split('.')) != 4: continue # 特殊处理端口 port_key = Pq(td_list).eq(1).attr('class').split()[1] if port_key not in PortDict: print '发现新端口: %s' % port_key continue port = PortDict.get(port_key, '') ProsyItem['Ip'] = ip.replace(' ', '') ProsyItem['Port'] = port ProsyItem['Type'] = field_list[2].strip() ProsyItem['AnonymousDegree'] = field_list[3].strip() ProsyItem['Area'] = field_list[4].strip() ProsyItem['Speed'] = field_list[5].strip() ProsyItem['ScanTime'] = field_list[6].strip() # print ProsyItem proxy_item = json.dumps(ProsyItem, ensure_ascii=False) html.save_file('proxy.json', proxy_item + '\n', 'a')
def scan_proxy_qiaodm():
    """Scrape proxy server listings from ip.qiaodm.com and append each
    record, one JSON object per line, to proxy.json.
    """
    import json
    import requests
    from pyquery import PyQuery as Pq

    source_site = 'http://ip.qiaodm.com/'
    header = {
        'Host': 'ip.qiaodm.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    }
    s = requests.session()

    # Fetch the listing page (pass the headers explicitly; previously they
    # were built but never sent with the request).
    file_html = s.get(source_site, headers=header).content

    # For offline debugging, the fetched page can be cached and re-read:
    # with open('test.html', 'w') as f:
    #     f.write(file_html)
    # with open('test.html', 'r') as f:
    #     file_html = f.read()

    text_pq = Pq(file_html)
    tr_list = text_pq('tbody').find('tr[style="text-align: center;"]')
    print 'Records on this page: %s' % len(tr_list)

    for tr_item in tr_list:
        td_list = Pq(tr_item).find('td')
        field_list = []
        for td_item in Pq(td_list):
            field_list.append(Pq(td_item).text())

        # The IP cell is padded with invisible <p> decoy tags; strip the
        # decoys first, then remove any remaining markup.
        ip = Pq(td_list).eq(0).html()
        ip = html.replace_html(ip, r'<p style="display:none;"/>')
        ip = html.replace_html(ip, r'<p style="display: none;"/>')
        ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>')
        ip = html.strip_html(ip)

        # Skip anything that does not look like a dotted-quad address.
        if len(ip.split('.')) != 4:
            continue

        # The port is obfuscated as the second CSS class on its cell;
        # PortDict maps that class name back to the numeric port.
        port_key = Pq(td_list).eq(1).attr('class').split()[1]
        if port_key not in PortDict:
            print 'Unknown port key: %s' % port_key
            continue
        port = PortDict.get(port_key, '')

        ProsyItem['Ip'] = ip.replace(' ', '')
        ProsyItem['Port'] = port
        ProsyItem['Type'] = field_list[2].strip()
        ProsyItem['AnonymousDegree'] = field_list[3].strip()
        ProsyItem['Area'] = field_list[4].strip()
        ProsyItem['Speed'] = field_list[5].strip()
        ProsyItem['ScanTime'] = field_list[6].strip()

        # Append one JSON record per line.
        proxy_item = json.dumps(ProsyItem, ensure_ascii=False)
        html.save_file('proxy.json', proxy_item + '\n', 'a')
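
# ---------------------------------------------------------------------------
# The scanner above references a local `html` helper module plus the
# module-level PortDict and ProsyItem containers, none of which are defined
# in this file. The block below is a minimal stand-in reconstructed purely
# from the call sites (a sketch, assuming the project's real html.py behaves
# this way); replace it with the project's own helpers where available.
# ---------------------------------------------------------------------------
import re


class _HtmlHelpers(object):
    """Hypothetical reconstruction of the project's `html` helper module."""

    @staticmethod
    def replace_html(text, pattern):
        # Assumed behaviour: delete every regex match of `pattern`.
        return re.sub(pattern, '', text)

    @staticmethod
    def strip_html(text):
        # Assumed behaviour: drop any remaining HTML tags.
        return re.sub(r'<[^>]+>', '', text)

    @staticmethod
    def save_file(path, content, mode='a'):
        # Assumed behaviour: write text to `path`, encoding unicode as UTF-8.
        if isinstance(content, unicode):
            content = content.encode('utf-8')
        with open(path, mode) as f:
            f.write(content)


html = _HtmlHelpers()

# PortDict maps the obfuscating CSS class on a port cell to the real port
# number; it is left empty here as a placeholder, since the true mapping must
# be learned from the site. ProsyItem is the record the scanner fills per row.
PortDict = {}
ProsyItem = {}
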
def clean(s):
    """Strip surrounding quotes and HTML tags from a scraped field."""
    return html.strip_html(s.strip('"')).strip()
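
# Minimal usage sketch: running this module directly scrapes one listing page
# and appends the parsed records to proxy.json. Note that PortDict must first
# be populated with the site's class-to-port mapping for rows to pass the
# port check above.
if __name__ == '__main__':
    scan_proxy_qiaodm()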