import datetime
import json
import random
import re
import time

from lxml import etree
from pyquery import PyQuery as pq

# get_page is the project's fetch helper, defined elsewhere; a hedged sketch
# of what it might look like follows crawl_daxiang below.
# Note: several crawl_* names in this file are duplicated (crawl_daxiang,
# crawl_66ip, crawl_89ip, crawl_data5u, crawl_ip181, crawl_ip3366,
# crawl_xicidaili); inside a class body the later definition shadows the
# earlier one.

def crawl_daxiang(self):
    url = 'http://vxer.daili666api.com/ip/?tid=555397563436240&num=1000&filter=on'
    html = get_page(url)
    if html:
        for ip in html.split('\n'):
            yield ip.replace('\r', '')
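# The definition of get_page is not part of this section; the sketch below is
# only an assumption of what such a helper typically looks like (requests-based,
# returns the body text on HTTP 200, None otherwise), not the project's actual
# implementation.
def get_page(url, options=None):
    """Fetch url and return the response body, or None on any failure."""
    import requests
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; proxy-crawler)'}
    if options:
        headers.update(options)  # callers such as crawl_xicidaili pass extra headers
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp.text
    except requests.RequestException:
        pass
    return None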
def crawl_gaonidaili_free(self):
    start_url = ("http://www.xiladaili.com/api/?uuid=ee4bcc3648ad4df4973ea79146d30278"
                 "&num=100&place=%E4%B8%AD%E5%9B%BD&category=1&protocol=1&sortby=0"
                 "&repeat=1&format=3&position=1")
    res = get_page(start_url)
    if res:
        proxies = re.findall(r"\d+\.\d+\.\d+\.\d+:\d+", res)
        for proxy in proxies:
            yield proxy
def crawl_nimadaili(self):
    urls = [
        "http://www.nimadaili.com/http/{}/",
        "http://www.nimadaili.com/gaoni/{}/",
        "http://www.nimadaili.com/https/{}/",
    ]
    try:
        for url in urls:
            for page in range(100):
                html = get_page(url.format(str(page)))
                if html:
                    html_tree = etree.HTML(html)
                    for ip in html_tree.xpath(
                            '/html/body/div/div[1]/div[2]/table/tbody/tr/td[1]/text()'):
                        yield ip
                else:
                    print("\033[1;31;40m Nima proxy ----> fetched an empty page, skipping! \033[0m")
                    return 0
                time.sleep(2)
    except Exception:
        print("\033[1;41;97m Nima proxy ----> site layout changed, please update this crawler! \033[0m")
        return 0
def crawl_goubanjia(self):
    try:
        url = "http://www.goubanjia.com/"
        html = get_page(url)
        if html:
            tree = etree.HTML(html)
            proxy_list = tree.xpath('//td[@class="ip"]')
            # The site injects hidden decoy digits, so a naive text() grab
            # picks up extra numbers and dots; filter out any element styled
            # with display:none (and the port span) before joining.
            xpath_str = """.//*[not(contains(@style, 'display: none'))
                               and not(contains(@style, 'display:none'))
                               and not(contains(@class, 'port'))]/text()"""
            for each_proxy in proxy_list:
                try:
                    # The ':' sits bare inside the td while the IP fragments
                    # live in div/span/p children: join the fragments to get
                    # the IP first, then read the port from its own span.
                    ip_addr = ''.join(each_proxy.xpath(xpath_str))
                    port = each_proxy.xpath(
                        ".//span[contains(@class, 'port')]/text()")[0]
                    yield '{}:{}'.format(ip_addr, port)
                except Exception:
                    pass
        else:
            print("\033[1;31;40m Goubanjia proxy ----> fetched an empty page, skipping! \033[0m")
            return 0
    except Exception:
        print("\033[1;41;97m Goubanjia proxy ----> site layout changed, please update this crawler! \033[0m")
        return 0
def crawl_superfastip(self, page_count=10):
    try:
        url_list = [
            "http://www.superfastip.com/welcome/freeip/{}".format(page)
            for page in range(1, page_count + 1)
        ]
        for url in url_list:
            html = get_page(url)
            if html:
                html_tree = etree.HTML(html)
                ip_list = html_tree.xpath(
                    "/html/body/div[3]/div/div/div[2]/div/table/tbody//tr/td[1]/text()")
                port_list = html_tree.xpath(
                    "/html/body/div[3]/div/div/div[2]/div/table/tbody//tr/td[2]/text()")
                for ip in zip(ip_list, port_list):
                    yield ":".join(ip)
            else:
                print("\033[1;31;40m Superfastip proxy ----> fetched an empty page, skipping! \033[0m")
                return 0
    except Exception:
        print("\033[1;41;97m Superfastip proxy ----> site layout changed, please update this crawler! \033[0m")
        return 0
def crawl_iphai(self):
    start_url = 'http://www.iphai.com/free/wg'
    html = get_page(start_url)
    if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):  # skip the header row
            find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            find_protocol = re.compile(
                r'<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>(.*?)</td>',
                re.S)
            re_pro = find_protocol.findall(trs[s])
            # An empty fourth column is taken to mean plain HTTP, otherwise HTTPS.
            if re_pro[0][3].strip() == '':
                re_pro = ["HTTP"]
            else:
                re_pro = ["HTTPS"]
            for address, port, protocol in zip(re_ip_address, re_port, re_pro):
                address_port = address + ':' + port
                yield (address_port, protocol)
def crawl_xicidaili(self):
    for i in range(1, 3):
        start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = get_page(start_url, options=headers)
        if html:
            find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
            trs = find_trs.findall(html)
            for tr in trs:
                find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(tr)
                find_port = re.compile(r'<td>(\d+)</td>')
                re_port = find_port.findall(tr)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
def crawl_mayidaili(self):
    try:
        # Page ids advance by one per day; id 1325 corresponds to the
        # baseline date 2018-12-31, so today's id is 1325 + days elapsed.
        d1 = datetime.datetime(2018, 12, 31)
        year = int(time.strftime("%Y", time.localtime()))
        month = int(time.strftime("%m", time.localtime()))
        day = int(time.strftime("%d", time.localtime()))
        d2 = datetime.datetime(year, month, day)
        offset = (d2 - d1).days
        for i in range(1325 + offset, 1325 + offset + 1):
            html = get_page(
                "http://www.mayidaili.com/share/view/{name}/".format(name=i))
            if html:
                doc = pq(html)
                iptables = doc('body > div:nth-child(4) > p').text()
                for entry in iptables.split("#"):
                    if not entry.isalpha():
                        # Trim any leading text so the entry starts at the IP.
                        result = re.search(r'(\d+)', entry)
                        ips = entry[result.start():]
                        yield ips.replace(' ', '')
            else:
                print("\033[1;31;40m Mayi proxy ----> fetched an empty page, skipping! \033[0m")
                return 0
    except Exception:
        print("\033[1;41;97m Mayi proxy ----> site layout changed, please update this crawler! \033[0m")
        return 0
def crawl_data5u(self):
    start_url = 'http://www.data5u.com/free/gngn/index.shtml'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
        'Host': 'www.data5u.com',
        'Referer': 'http://www.data5u.com/free/index.shtml',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    }
    html = get_page(start_url, options=headers)
    if html:
        ip_address = re.compile(
            r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>',
            re.S)
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
def crawl_89ip(self):
    start_url = 'http://www.89ip.cn/apijk/?&tqsl=1000&sxa=&sxb=&tta=&ports=&ktip=&cf=1'
    html = get_page(start_url)
    if html:
        find_ips = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
        for address_port in find_ips.findall(html):
            yield address_port
def crawl_66ip(self):
    start_url = ("http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype=2"
                 "&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip")
    res = get_page(start_url)
    if res:
        proxies = re.findall(r"(\d+.*?)<br />", res)
        for proxy in proxies:
            yield proxy
    time.sleep(random.randint(1, 4))
def crawl_daxiang(self):
    url = 'http://vtp.daxiangdaili.com/ip/?tid=556488478479034&num=1000'
    data = get_page(url)
    if data:
        for proxy in data.split('\r\n'):
            if proxy:
                yield proxy
def crawl_ipjai(self):
    url = 'http://www.iphai.com/free/ng'
    html = get_page(url)
    if html:
        # Original pattern '(.?*)' was an invalid regex; '(.*?)' is intended.
        ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
def crawl_ip181(self):
    start_url = 'http://www.ip181.com/'
    html = get_page(start_url)
    if html:
        # \s* matches the whitespace (including newlines) between the tds
        ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
def crawl_premproxy(self):
    for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:
        start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
        html = get_page(start_url)
        if html:
            ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address_port in re_ip_address:
                yield address_port.replace(' ', '')
def crawl_ip3366(self):
    for page in range(1, 4):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        html = get_page(start_url)
        if html:
            # \s* matches the whitespace (including newlines) between the tds
            ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')
def crawl_kxdaili(self):
    for i in range(1, 11):
        start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
        html = get_page(start_url)
        if html:
            # \s* matches the whitespace (including newlines) between the tds
            ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')
def crawl_66ip(self):
    start_url = ('http://www.66ip.cn/nmtq.php?getnum=500&isp=0&anonymoustype=0'
                 '&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip')
    html = get_page(start_url)
    if html:
        # Carve the proxy list out of the surrounding markup: the chained
        # splits isolate the block between the ad script and the trailing
        # jquery include.
        a = html.split("var mediav_ad_height = '60';")[-1].split(
            '<script type="text/javascript" src="http://www.66ip.cn/ggg/jquery.min.js"></script>')[0].split(
            '</div>')[0].split('</script>')[-1]
        for i in a.split('<br />'):
            result = i.strip()
            if result:
                yield result.replace(' ', '')
def crawl_ip181(self):
    start_url = 'http://www.ip181.com/'
    text = get_page(start_url)
    try:
        result_list = json.loads(text)['RESULT']
        for item in result_list:
            result = item['ip'] + ':' + item['port']
            yield result.replace(' ', '')
    except Exception:
        return None
def crawl_yqie(self):
    start_url = 'http://ip.yqie.com/ipproxy.htm'
    html = get_page(start_url)
    if html:
        # \s* matches the whitespace (including newlines) between the tds
        ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
def crawl_mianfei(self):
    for i in range(1, 4):
        start_url = 'http://ip.jiangxianli.com/?page={}'.format(i)
        html = get_page(start_url)
        if html:
            find_ips = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
            re_ip_address = list(set(find_ips.findall(html)))  # de-duplicate
            for eve_ip_address in re_ip_address:
                yield eve_ip_address
def crawl_data5u(self):
    start_url = 'http://www.data5u.com/'
    html = get_page(start_url)
    if html:
        doc = pq(html)
        uls = doc('body ul>li>ul.l2').items()
        for ul in uls:
            host = ul.find('span:nth-child(1)').text()
            port = ul.find('span:nth-child(2)').text()
            yield ':'.join([host, port])
def crawl_89ip(self):
    start_url = "http://www.89ip.cn/index_{}.html"
    for i in range(1, 7):
        res = get_page(start_url.format(i))
        if res:
            doc = pq(res)
            trs = doc("tbody tr").items()
            for tr in trs:
                ip = tr.find("td:nth-child(1)").text()
                port = tr.find("td:nth-child(2)").text()
                yield ":".join([ip, port])
        time.sleep(random.randint(1, 4))
def crawl_89ip(self):
    start_url = 'http://www.89ip.cn/tqdl.html?api=1&num=100&port=&address=&isp='
    text = get_page(start_url)
    try:
        for i in re.findall(
                r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,5}', text):
            yield i
    except Exception:
        return None
def crawl_89ip(self):
    for i in range(1, 10):
        start_url = 'http://www.89ip.cn/index_{}.html'.format(i)
        html = get_page(start_url)
        if html:
            # \s* matches the whitespace (including newlines) between the tds
            ip_address = re.compile(
                r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                yield address.strip() + ':' + port.strip()
def crawl_xroxy(self):
    for i in ['CN', 'TW']:
        start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
        html = get_page(start_url)
        if html:
            ip_address1 = re.compile(r"title='View this Proxy details'>\s*(.*).*")
            re_ip_address1 = ip_address1.findall(html)
            ip_address2 = re.compile(r"title='Select proxies with port number .*'>(.*)</a>")
            re_ip_address2 = ip_address2.findall(html)
            for address, port in zip(re_ip_address1, re_ip_address2):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_kuaidaili(self):
    for i in range(1, 4):
        start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
        html = get_page(start_url)
        if html:
            ip_address = re.compile('<td data-title="IP">(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            find_port = re.compile('<td data-title="PORT">(.*?)</td>')
            re_port = find_port.findall(html)
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_data5u(self):
    for i in ['gngn', 'gnpt']:
        start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
        html = get_page(start_url)
        if html:
            # \s* matches the whitespace (including newlines) between the spans
            ip_address = re.compile(
                r' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*'
                r'<span style="width: 100px;"><li class=".*">(.*?)</li></span>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')
def crawl_xicidaili(self):
    for page in range(1, 4):
        start_url = 'http://www.xicidaili.com/wt/{}'.format(page)
        html = get_page(start_url)
        if html:
            # \s* matches the whitespace (including newlines) between the tds
            ip_address = re.compile(
                r'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')
def crawl_ip3366(self):
    start_url = "http://www.ip3366.net/?stype=1&page={}"
    for i in range(1, 11):
        res = get_page(start_url.format(i))
        if res:
            doc = pq(res)
            trs = doc("tbody tr").items()
            for tr in trs:
                ip = tr.find("td:nth-child(1)").text()
                port = tr.find("td:nth-child(2)").text()
                yield ":".join([ip, port])
        time.sleep(random.randint(1, 4))
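# A minimal sketch of how these generators might be driven, assuming the
# crawl_* methods above live on a single Crawler class. The class name, the
# dir()-based discovery, and run_crawlers itself are illustrative assumptions;
# only the crawl_* methods appear in this file. Note that crawl_iphai yields
# (address:port, protocol) tuples while the rest yield plain strings.
class Crawler(object):
    # ... crawl_* methods from above ...

    def run_crawlers(self):
        """Invoke every crawl_* method and collect whatever it yields."""
        proxies = []
        for name in dir(self):
            if name.startswith('crawl_'):
                proxies.extend(getattr(self, name)())
        return proxies

# Usage: proxies = Crawler().run_crawlers()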