示例#1
0
 def freeProxyRay01(page_count=1):
     """
     Free proxy list at http://www.xsdaili.com/dayProxy/2020/1/1.html

     Walks today's article links on the month index page and extracts
     ip:port pairs from the article bodies.

     :param page_count: kept for signature compatibility; not used.
     :return: generator of 'ip:port' strings
     """
     # ip:port pattern; raw string so '\.' is a regex escape rather than a
     # (deprecated) string escape.  Compiled once, reused for every <br>.
     ip_port_re = re.compile(
         r'[1-9][0-9]{0,2}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,5}')
     try:
         base_url = 'http://www.xsdaili.com'
         nowt = time.localtime(time.time())
         url = '{}/dayProxy/{}/{}/1.html'.format(base_url, nowt.tm_year,
                                                 nowt.tm_mon)
         html_tree = getHtmlTree(url)
         # hoisted out of the loop: today's date tag is loop-invariant
         today_tag = time.strftime("%Y年%m月%d日", nowt)
         today_urls = []
         for aurl in html_tree.xpath("//a"):
             # keep only links whose text mentions proxies and today's date
             if isinstance(aurl.text, str) and "代理IP" in aurl.text \
                     and today_tag in aurl.text:
                 today_urls.append('/'.join(
                     [base_url.strip('/'), aurl.attrib['href'].strip('/')]))
         for purl in today_urls:
             html_tree = getHtmlTree(purl)
             for abr in html_tree.xpath("//br"):
                 # proxies sit in the tail text after each <br>; tail may
                 # be None, which the old per-item try/except papered over
                 tail = abr.tail
                 if not tail:
                     continue
                 searchObj = ip_port_re.search(tail.strip())
                 if searchObj:
                     yield searchObj.group()
     except Exception:
         # best-effort scraper: network/parse failure ends the generator
         pass
示例#2
0
    def freeProxyNinth():
        """
        zdaye proxies http://ip.zdaye.com/
        :return:
        """
        url = 'http://ip.zdaye.com/'
        tree = getHtmlTree(url)
        # proxies listed directly on the front page
        front_items = tree.xpath('//div[@class="Loglist"]/div[2]/div[@class="panel-body"]//a/text()')
        for entry in front_items:
            try:
                yield entry.split('@')[0].strip()
            except Exception as err:
                print(err)

        header = {
            'Referer': 'http://ip.zdaye.com/',
        }
        # follow links into the daily detail pages
        detail_links = tree.xpath('//div[@class="Loglist"]/div[1]/div[@class="panel-body"]//a/@href')
        for link in detail_links:
            try:
                detail_tree = getHtmlTree(url + link, header=header)
                for entry in detail_tree.xpath('//div[@class="cont"]/text()'):
                    try:
                        yield entry.split('@')[0].strip()
                    except Exception as err:
                        print(err)
            except Exception as err:
                print(err)
示例#3
0
 def freeProxyEight():
     """
     xsdaili http://www.xsdaili.com/
     """
     base = 'http://www.xsdaili.com/'
     index_tree = getHtmlTree(base)
     # the first article link on the index page points at today's list
     first_href = index_tree.xpath('//div[@class="col-md-12"]/div[1]//a[1]/@href')[0]
     detail_tree = getHtmlTree(base + first_href)
     for line in detail_tree.xpath('//div[@class="cont"]/text()'):
         try:
             yield line.split('@')[0].strip()
         except Exception as err:
             print(err)
 def freeProxyFifth():
     """
     goubanjia http://www.goubanjia.com/free/gngn/index.shtml

     The site injects hidden decoy digits: nodes styled display:none and
     a fake 'port' span must be filtered out of the ip text.
     :return: generator of 'ip:port' strings
     """
     url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
     # hoisted: the filter expression is loop-invariant, no need to
     # rebuild it for every page
     xpath_str = """.//*[not(contains(@style, 'display: none'))
                         and not(contains(@style, 'display:none'))
                         and not(contains(@class, 'port'))
                         ]/text()
                 """
     for page in range(1, 10):
         page_url = url.format(page=page)
         tree = getHtmlTree(page_url)
         proxy_list = tree.xpath('//td[@class="ip"]')
         for each_proxy in proxy_list:
             try:
                 # ip fragments are split across child div/span/p nodes;
                 # ':' sits bare under the td, the real port in its span
                 ip_addr = ''.join(each_proxy.xpath(xpath_str))
                 port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
                 yield '{}:{}'.format(ip_addr, port)
             except Exception:
                 # malformed row: skip it
                 pass
示例#5
0
 def freeProxyFirst(page=10):
     """
     data5u http://www.data5u.com/
     The site obfuscates ports: each character of the second class token
     is an index into 'ABCDEFGHIZ'; the decimal number those indices form,
     shifted right by 3 bits, is the real port.
     :param page: page count (not used by the current layout)
     :return:
     """
     url_list = [
         'http://www.data5u.com/',
         'http://www.data5u.com/free/gngn/index.shtml',
         'http://www.data5u.com/free/gnpt/index.shtml'
     ]
     key = 'ABCDEFGHIZ'
     for page_url in url_list:
         tree = getHtmlTree(page_url)
         for row in tree.xpath('//ul[@class="l2"]'):
             try:
                 ip = row.xpath('./span[1]/li/text()')[0]
                 cls_attr = row.xpath('./span[2]/li/attribute::class')[0]
                 encoded = cls_attr.split(' ')[1]
                 # decode: each letter contributes one decimal digit
                 port_val = 0
                 for ch in encoded:
                     port_val = port_val * 10 + key.index(ch)
                 yield '{}:{}'.format(ip, port_val >> 3)
             except Exception as err:
                 print(err)
示例#6
0
 def freeProxyFifth():
     """
     goubanjia http://www.goubanjia.com/free/gngn/index.shtml
     :return: generator of 'ip:port' strings
     """
     url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
     for page in range(1, 10):
         page_url = url.format(page=page)
         tree = getHtmlTree(page_url)
         proxy_list = tree.xpath('//td[@class="ip"]')
         # hidden decoy digits/dots are injected; filter out nodes styled
         # display:none and the fake 'port' span from the ip text
         xpath_str = """.//*[not(contains(@style, 'display: none'))
                             and not(contains(@style, 'display:none'))
                             and not(contains(@class, 'port'))
                             ]/text()
                     """
         for each_proxy in proxy_list:
             try:
                 # ip pieces live in child div/span/p nodes; join them,
                 # then read the real port from the port span
                 ip_addr = ''.join(each_proxy.xpath(xpath_str))
                 port = each_proxy.xpath(
                     ".//span[contains(@class, 'port')]/text()")[0]
                 yield '{}:{}'.format(ip_addr, port)
             except Exception as e:
                 # BUG FIX: this is a plain function, so `self` was
                 # undefined here and the handler itself raised NameError;
                 # report the failure without `self`
                 print("fetch proxy failed: " + str(e))
示例#7
0
    def freeProxy04():
        """
        guobanjia http://www.goubanjia.com/
        :return: generator of 'ip:port' strings
        """
        url = "http://www.goubanjia.com/"
        tree = getHtmlTree(url)
        proxy_list = tree.xpath('//td[@class="ip"]')
        # The site injects hidden decoy digits/dots; filter out nodes with
        # display:none and the fake 'port' span when joining the ip text.
        xpath_str = """.//*[not(contains(@style, 'display: none'))
                                        and not(contains(@style, 'display:none'))
                                        and not(contains(@class, 'port'))
                                        ]/text()
                                """
        for each_proxy in proxy_list:
            try:
                ip_addr = ''.join(each_proxy.xpath(xpath_str))

                # The displayed port is random noise; the real port is
                # encoded in the letters after "port " in the span's class.
                # e.g. <span class="port CFACE">9054</span>: CFACE -> 3128.
                # Each letter maps to a digit (offset from 'A'); the decimal
                # number formed is 8 times the real port.
                port = 0
                for letter in each_proxy.xpath(
                        ".//span[contains(@class, 'port')]"
                        "/attribute::class")[0].replace("port ", ""):
                    port = port * 10 + (ord(letter) - ord('A'))
                # integer division instead of float `/= 8` + int(): same
                # result, no float round-trip
                port //= 8

                yield '{}:{}'.format(ip_addr, port)
            except Exception:
                # malformed row: skip it
                pass
示例#8
0
 def free_proxy_xici():
     """
     xici proxies http://www.xicidaili.com
     :return:
     """
     tasks = [
         {"url": "http://www.xicidaili.com/nn/", "page_count": 10},
         {"url": "http://www.xicidaili.com/nt/", "page_count": 10},
         {"url": "http://www.xicidaili.com/wn/", "page_count": 10},
     ]
     for task in tasks:
         base = task['url']
         for page_no in range(1, task['page_count'] + 1):
             tree = getHtmlTree(base + str(page_no))
             # skip the header row via position()>1
             rows = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]')
             for row in rows:
                 try:
                     yield ':'.join(row.xpath('./td/text()')[0:2])
                 except Exception:
                     pass
示例#9
0
 def freeProxy01():
     """
     data5u http://www.data5u.com/
     (almost none of these work)
     :return:
     """
     pages = [
         'http://www.data5u.com/',
         'http://www.data5u.com/free/gngn/index.shtml',
         'http://www.data5u.com/free/gnpt/index.shtml'
     ]
     key = 'ABCDEFGHIZ'
     for page in pages:
         doc = getHtmlTree(page)
         for entry in doc.xpath('//ul[@class="l2"]'):
             try:
                 addr = entry.xpath('./span[1]/li/text()')[0]
                 # the second class token encodes the port: each letter's
                 # position in `key` is one decimal digit
                 token = entry.xpath('./span[2]/li/attribute::class')[0].split(' ')[1]
                 value = 0
                 for letter in token:
                     value = value * 10 + key.index(letter)
                 # the real port is the decoded number shifted right 3 bits
                 yield '{}:{}'.format(addr, value >> 3)
             except Exception as err:
                 print(err)
示例#10
0
    def freeProxy_xiladaili():
        """
        xiladaili http://www.xiladaili.com/
        Yields proxies prefixed with an explicit scheme derived from the
        protocol column.
        :return: generator of 'http://ip' / 'https://ip' strings
        """
        urls = [
            'http://www.xiladaili.com/gaoni/%d/',
            'http://www.xiladaili.com/http/%d/',
            'http://www.xiladaili.com/https/%d/'
        ]

        for url in urls:
            # scrape 20 pages per listing
            for i in range(1, 21):
                new_url = url % i
                dom = getHtmlTree(new_url)
                for item in dom.xpath('//tr'):
                    # reuse this query result instead of re-running the
                    # same xpath a second time for the value
                    cell = item.xpath('./td[1]/text()')
                    if not cell:
                        # header or empty row
                        continue
                    ip = cell[0]
                    protocol = item.xpath('./td[2]/text()')[0]
                    if "," in protocol:
                        # supports both http and https
                        yield 'http://' + ip.strip()
                        yield 'https://' + ip.strip()
                    elif "HTTPS" in protocol:
                        yield 'https://' + ip.strip()
                    else:
                        yield 'http://' + ip.strip()
示例#11
0
文件: 89ip.py 项目: 23233/sproxy
 def run(self):
     """Scrape the 89ip bulk list and yield 'ip:port' strings."""
     url = 'http://www.89ip.cn/tqdl.html?num=2500&address=&kill_address=&port=&kill_port=&isp='
     tree = getHtmlTree(url)
     text_nodes = tree.xpath(
         "//div[@class='fly-panel']/div[@style='padding-left:20px;']//text()"
     )
     for node in text_nodes:
         # proxy lines contain a ':' between ip and port
         if ':' in node:
             yield node.strip()
示例#12
0
 def freeProxyTwelve(page_count=8):
     """jiangxianli http://ip.jiangxianli.com/ — yields 'ip:port' strings."""
     for page_no in range(1, page_count + 1):
         tree = getHtmlTree('http://ip.jiangxianli.com/?page={}'.format(page_no))
         rows = tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
         if not rows:
             # past the last populated page
             continue
         for row in rows:
             ip = row.xpath("./td[2]/text()")[0]
             port = row.xpath("./td[3]/text()")[0]
             yield ip + ":" + port
示例#13
0
 def freeProxyThird(days=1):
     """ip181 http://www.ip181.com/ — yields 'ip:port' strings."""
     html_tree = getHtmlTree('http://www.ip181.com/')
     try:
         # skip the header row
         for row in html_tree.xpath('//tr')[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
     except Exception:
         pass
示例#14
0
 def freeProxyFifth():
     """
     goubanjia http://www.goubanjia.com/free/gngn/index.shtml
     :return:
     """
     tree = getHtmlTree("http://www.goubanjia.com/free/gngn/index.shtml")
     for cell in tree.xpath('.//td[@class="ip"]'):
         # the ip/port text is scattered across child nodes; join it all
         yield ''.join(cell.xpath('.//text()'))
示例#15
0
 def freeProxyKuaidaili():
     """
     kuaidaili http://www.kuaidaili.com/free/inha/1/
     :return: generator of 'ip:port' strings
     """
     url = "http://www.kuaidaili.com/free/inha/{}/"
     for page in range(1, 10):
         page_url = url.format(page)
         tree = getHtmlTree(page_url)
         ip_list = tree.xpath('//td[@data-title="IP"]/text()')
         port_list = tree.xpath('//td[@data-title="PORT"]/text()')
         # zip instead of port_list[index]: a page with mismatched column
         # counts no longer kills the generator with IndexError
         for ip, port in zip(ip_list, port_list):
             yield '{}:{}'.format(ip, port)
示例#16
0
 def freeProxyThirteen():
     """
     feiyi proxy http://www.feiyiproxy.com/?page_id=1457
     :return:
     """
     tree = getHtmlTree('http://www.feiyiproxy.com/?page_id=1457')
     rows = tree.xpath('//div[@class="et_pb_code et_pb_module  et_pb_code_1"]//tr[position()>1]')
     for row in rows:
         ip = row.xpath('./td[1]/text()')[0].strip()
         port = row.xpath('./td[2]/text()')[0].strip()
         yield ip + ':' + port
示例#17
0
 def freeProxySeventh():
     """
     kuaidaili free https://www.kuaidaili.com/free/inha/1/
     """
     base = 'https://www.kuaidaili.com/free/inha/{page}/'
     for page_no in range(1, 10):
         tree = getHtmlTree(base.format(page=page_no))
         rows = tree.xpath('.//table//tr')
         # the first row is the table header
         for row in rows[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
示例#18
0
 def run(self):
     """nimadaili http://www.nimadaili.com/ — yields 'ip:port' strings."""
     url = 'http://www.nimadaili.com/{col}/{page}/'
     for col in ('gaoni', 'http', 'https'):
         # pages 1..9 for each column
         for page_no in range(1, 10):
             tree = getHtmlTree(url.format(page=page_no, col=col))
             for text in tree.xpath('//tr//td[1]//text()'):
                 if ':' in text:
                     yield text.strip()
示例#19
0
 def freeProxyThird(self, days=1):
     """
     ip181 http://www.ip181.com/
     :param days: unused; kept for signature compatibility
     :return:
     """
     tree = getHtmlTree('http://www.ip181.com/')
     rows = tree.xpath('//tr')[1:]  # drop the header row
     for row in rows:
         yield ':'.join(row.xpath('./td/text()')[0:2])
示例#20
0
 def freeProxyFourteen():
     """
     qydaili http://www.qydaili.com/free/?action=china&page=
     :return:
     """
     for page in range(1, 4):
         url = 'http://www.qydaili.com/free/?action=china&page={}'.format(page)
         tree = getHtmlTree(url)
         rows = tree.xpath('//table[@class="table table-bordered table-striped"]//tbody//tr')
         for row in rows:
             ip = row.xpath('./td[1]/text()')[0].strip()
             port = row.xpath('./td[2]/text()')[0].strip()
             yield ip + ':' + port
示例#21
0
 def freeProxySeventh():
     """
     kuaidaili free https://www.kuaidaili.com/free/inha/1/
     """
     template = 'https://www.kuaidaili.com/free/inha/{page}/'
     page = 1
     while page < 10:
         tree = getHtmlTree(template.format(page=page))
         table_rows = tree.xpath('.//table//tr')[1:]  # header dropped
         for tr in table_rows:
             cells = tr.xpath('./td/text()')
             yield ':'.join(cells[0:2])
         page += 1
示例#22
0
 def freeProxy06():
     """
     coderbusy https://proxy.coderbusy.com/
     :return:
     """
     for page_url in ['https://proxy.coderbusy.com/']:
         doc = getHtmlTree(page_url)
         # skip the listing table's header row
         for row in doc.xpath('.//table//tr')[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
示例#23
0
 def freeProxySecond(area=33, page=1):
     """66ip http://www.66ip.cn/ — yields 'ip:port' per area listing."""
     area = min(area, 33)
     for area_index in range(1, area + 1):
         for page_no in range(1, page + 1):
             url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, page_no)
             rows = getHtmlTree(url).xpath("//*[@id='footer']/div/table/tr[position()>1]")
             if not rows:
                 # empty page: try the next page of this area
                 continue
             for row in rows:
                 yield row.xpath("./td[1]/text()")[0] + ":" + row.xpath("./td[2]/text()")[0]
             # stop after the first non-empty page of each area
             break
示例#24
0
 def freeProxySeventh():
     """kuaidaili https://www.kuaidaili.com/free/ (inha and intr lists)."""
     for template in ('https://www.kuaidaili.com/free/inha/{page}/',
                      'https://www.kuaidaili.com/free/intr/{page}/'):
         for page_no in range(1, 5):
             doc = getHtmlTree(template.format(page=page_no))
             # the first table row is the header
             for row in doc.xpath('.//table//tr')[1:]:
                 yield ':'.join(row.xpath('./td/text()')[0:2])
示例#25
0
 def freeProxyFifth(self):
     """
     goubanjia http://www.goubanjia.com/free/gngn/index.shtml
     :return:
     """
     template = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
     for page_no in range(1, 10):
         doc = getHtmlTree(template.format(page=page_no))
         for cell in doc.xpath('//td[@class="ip"]'):
             # ip text is split across child nodes; concatenate everything
             yield ''.join(cell.xpath('.//text()'))
示例#26
0
 def freeProxyThirteen():
     """
     https://ip.ihuan.me/address/5Lit5Zu9.html (aggressive anti-scraping)
     Large free proxy library.
     :return:
     """
     for page_url in ['https://ip.ihuan.me/']:
         doc = getHtmlTree(page_url)
         rows = doc.xpath('.//table/tbody/tr')
         # drop the header row
         for row in rows[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
示例#27
0
 def freeProxyFifth():
     """
     goubanjia http://www.goubanjia.com/free/gngn/index.shtml
     :return: generator of 'ip:port' strings
     """
     url = "http://www.goubanjia.com/free/gngn/index.shtml"
     tree = getHtmlTree(url)
     # the site currently lists at most 15 proxies (a single page)
     # BUG FIX: xrange is Python 2 only and raises NameError on Python 3
     for i in range(15):
         d = tree.xpath(
             './/table[@class="table"]/tbody/tr[{}]/td'.format(i + 1))[0]
         o = d.xpath('.//span/text() | .//div/text()')
         # all but the last text node form the ip; the last is the port
         yield ''.join(o[:-1]) + ':' + o[-1]
示例#28
0
 def freeProxyProxydb():
     """
     Proxydb http://proxydb.net/?protocol=http&protocol=https&country=&offset=0
     The ip is stored reversed inside an inline <script>; the port is a
     small arithmetic expression that has to be evaluated.
     :return: generator of 'ip:port' strings
     """
     url = "http://proxydb.net/?protocol=http&country=&offset={}"
     for offset in range(0, 150, 15):
         page_url = url.format(offset)
         tree = getHtmlTree(page_url)
         proxy_list = tree.xpath('//table//script/text()')
         for item in proxy_list:
             # renamed from `list`, which shadowed the builtin
             parts = re.split('=|;|\'', "".join(item.split()))
             # first fragment is reversed in the page source
             ip = parts[2][::-1] + parts[10]
             # SECURITY: eval() executes scraped, untrusted page content
             # as Python code — should be replaced with a restricted
             # arithmetic parser (e.g. ast.literal_eval won't cover the
             # expressions, so a tiny custom evaluator is needed).
             port = eval(parts[13])
             yield '{}:{}'.format(ip, port)
         # throttle between result pages
         time.sleep(2)
示例#29
0
 def freeProxyFourth():
     """
     xici proxies http://api.xicidaili.com/free2016.txt
     :return:
     """
     pages = [
         'http://www.xicidaili.com/nn',  # high anonymity
         'http://www.xicidaili.com/nt',  # transparent
     ]
     for page_url in pages:
         doc = getHtmlTree(page_url)
         for row in doc.xpath('.//table[@id="ip_list"]//tr'):
             yield ':'.join(row.xpath('./td/text()')[0:2])
示例#30
0
    def freeProxyPingRui():
        """PingRui http://pingrui.net/ — yields 'ip:port' strings."""
        for page_url in ('http://pingrui.net/wn/', 'http://pingrui.net/wt/'):
            doc = getHtmlTree(page_url)
            for row in doc.xpath('.//table[@id="ip_list"]//tr'):
                try:
                    yield ':'.join(row.xpath('./td/text()')[0:2])
                except Exception:
                    pass
            # throttle between listing pages
            time.sleep(2)
示例#31
0
 def freeProxy09(page_count=1):
     """
     jiangxianli http://ip.jiangxianli.com/?page=
     Free proxy library, filtered to China.
     :param page_count: number of listing pages to scrape
     :return: generator of 'ip:port' strings
     """
     for i in range(1, page_count + 1):
         # BUG FIX: the query string was '?country=中国&?page={}' — the
         # stray second '?' made the page parameter malformed
         url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
         html_tree = getHtmlTree(url)
         for index, tr in enumerate(html_tree.xpath("//table//tr")):
             if index == 0:
                 # header row
                 continue
             yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
示例#32
0
 def freeProxySeventh():
     """
     kuaidaili https://www.kuaidaili.com
     """
     pages = [
         'https://www.kuaidaili.com/free/inha/',
         'https://www.kuaidaili.com/free/intr/'
     ]
     for page_url in pages:
         doc = getHtmlTree(page_url)
         rows = doc.xpath('.//table//tr')
         # rows[0] is the table header
         for row in rows[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
示例#33
0
 def freeProxyThird(days=1):
     """
     ip181 http://www.ip181.com/
     :param days: unused; kept for signature compatibility
     :return:
     """
     doc = getHtmlTree('http://www.ip181.com/')
     try:
         # first //tr is the header, skip it
         for row in doc.xpath('//tr')[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
     except Exception:
         pass
示例#34
0
 def freeProxyFourth():
     """
     xici proxies http://api.xicidaili.com/free2016.txt
     :return:
     """
     for listing in ('http://www.xicidaili.com/nn',   # high anonymity
                     'http://www.xicidaili.com/nt'):  # transparent
         tree = getHtmlTree(listing)
         for row in tree.xpath('.//table[@id="ip_list"]//tr'):
             try:
                 yield ':'.join(row.xpath('./td/text()')[0:2])
             except Exception:
                 pass
示例#35
0
 def freeProxyFirst(page=10):
     """
     data5u http://www.data5u.com/
     :param page: page count (not used by the current layout)
     :return:
     """
     listings = ['http://www.data5u.com/',
                 'http://www.data5u.com/free/',
                 'http://www.data5u.com/free/gngn/index.shtml',
                 'http://www.data5u.com/free/gnpt/index.shtml']
     for listing in listings:
         doc = getHtmlTree(listing)
         for entry in doc.xpath('//ul[@class="l2"]'):
             try:
                 yield ':'.join(entry.xpath('.//li/text()')[0:2])
             except Exception:
                 pass
示例#36
0
 def freeProxySecond(area=33, page=1):
     """
     66ip http://www.66ip.cn/
     :param area: number of area index pages to scan (capped at 33)
     :param page: pages per area
     :return:
     """
     if area > 33:
         area = 33
     for area_index in range(1, area + 1):
         for page_no in range(1, page + 1):
             page_url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, page_no)
             tree = getHtmlTree(page_url)
             rows = tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")
             if len(rows) == 0:
                 # empty page: try the next page of this area
                 continue
             for row in rows:
                 ip = row.xpath("./td[1]/text()")[0]
                 port = row.xpath("./td[2]/text()")[0]
                 yield ip + ":" + port
             # the first non-empty page per area is enough
             break
示例#37
0
 def freeProxyFourth(page_count=2):
     """
     xici proxies http://www.xicidaili.com
     :return:
     """
     listings = [
         'http://www.xicidaili.com/nn/',  # high anonymity
         'http://www.xicidaili.com/nt/',  # transparent
     ]
     for listing in listings:
         for page_no in range(1, page_count + 1):
             tree = getHtmlTree(listing + str(page_no))
             # position()>1 drops the header row
             for row in tree.xpath('.//table[@id="ip_list"]//tr[position()>1]'):
                 try:
                     yield ':'.join(row.xpath('./td/text()')[0:2])
                 except Exception:
                     pass