Example #1
    def crawData5u(self, pageCount = 1):
        startUrl = 'http://www.data5u.com/free/gngn/index.shtml'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=694DB8BC18C0697975ABD4D10A216C38',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }

        for count in range(pageCount):
            print("开始爬取 {} 第 {} 页".format(startUrl, count+1))
            source = getPage(startUrl, option=headers)
            html = etree.HTML(source)

            items = html.xpath("//div[@class='wlist']//li//ul")
            for item in items[1:]:
                speed = item.xpath(".//span[8]/li/text()")
                if float(speed[0].replace('秒', "").strip()) > 4.0:  # strip the '秒' (seconds) suffix and skip slow proxies
                    continue
                ip = item.xpath(".//span[1]/li/text()")
                port = item.xpath(".//span[2]/li/text()")
                yield ":".join([ip[0], port[0]])
Example #2
    def crawIp66DL(self, pageCount = 4):
        startUrl = "http://www.66ip.cn"  ## HTTP && HTTPS
        for count in range(pageCount):
            print("开始爬取 {} 第 {} 页".format(startUrl, count+1))
            source = getPage(startUrl)
            html = etree.HTML(source)

            items = html.xpath("//div[@id='main']//tbody//tr")
            for item in items:
                ip = item.xpath(".//td[1]/text()")
                port = item.xpath(".//td[2]/text()")
                yield ":".join([ip[0], port[0]])

            # Move on to the next page: cast to str (concatenating an int raises a TypeError)
            # and use count + 2, since the index page just crawled is page 1.
            startUrl = "http://www.66ip.cn/" + str(count + 2)
Example #3
    def crawKuaiDL(self, pageCount = 4):
        startUrl = "https://www.kuaidaili.com/free/inha/"  ## HTTP
        for count in range(pageCount):
            print("开始爬取 {} 第 {} 页".format(startUrl, count+1))
            source = getPage(startUrl)
            html = etree.HTML(source)

            items = html.xpath("//div//div[@id='list']//tbody/tr")
            for item in items:
                speed = item.xpath(".//td[6]/text()")
                if float(speed[0].replace('秒', "").strip()) > 4.0:
                    continue
                ip = item.xpath(".//td[1]/text()")
                port = item.xpath(".//td[2]/text()")
                yield ":".join([ip[0], port[0]])

            # Same pagination fix as above: cast the page number to str and start from page 2,
            # because the first loop iteration already covered page 1.
            startUrl = "https://www.kuaidaili.com/free/inha/" + str(count + 2)
Example #4
    def crawXici(self, pageCount = 4):
        startUrl = "https://www.xicidaili.com/wt/"  ## HTTP
        for count in range(pageCount):
            print("开始爬取 {} 第 {} 页".format(startUrl, count+1))
            source = getPage(startUrl)
            html = etree.HTML(source)

            items = html.xpath("//table[@id='ip_list']//tr")
            for item in items[1:]:
                speed = item.xpath(".//td[7]/div/@title")
                if float(speed[0].replace('秒', "").strip()) > 4.0:
                    continue
                ip = item.xpath(".//td[2]/text()")
                port = item.xpath(".//td[3]/text()")
                yield ":".join([ip[0], port[0]])

            # Follow the site's own "next page" link when one exists; otherwise the current URL is re-used.
            nextLink = html.xpath("//div[@class='pagination']//a[@class='next_page']/@href")
            if nextLink:
                startUrl = "https://www.xicidaili.com" + nextLink[0]
Example #5
    def crawYunDL(self, pageCount = 4):
        if pageCount > 7:
            print("Maximum is 7 pages; pageCount has been set to 7!")
            pageCount = 7
        startUrl = "http://www.ip3366.net/free/?stype=1"  ## HTTP && HTTPS
        for count in range(pageCount):
            print("开始爬取 {} 第 {} 页".format(startUrl, count+1))
            source = getPage(startUrl)
            html = etree.HTML(source)

            items = html.xpath("//div[@id='list']//tbody//tr")
            for item in items:
                speed = item.xpath(".//td[6]/text()")
                if float(speed[0].replace('秒', "").strip()) > 4.0:
                    continue
                ip = item.xpath(".//td[1]/text()")
                port = item.xpath(".//td[2]/text()")
                yield ":".join([ip[0], port[0]])

            # Same pagination fix as above: cast the page number to str and start from page 2.
            startUrl = "http://www.ip3366.net/free/?stype=1&page=" + str(count + 2)