Example #1
    def parse(self, response):
        # Each proxy occupies one table row; the first row is the header
        # and the last row holds the pagination controls.
        iplist = response.xpath('//table/tr')
        # The second-to-last pagination link holds the total page count.
        page_number = response.xpath(
            "//div[@class='pagination']/a[last()-1]/text()").extract_first()
        level = response.meta['level']

        for x in iplist[1:-1]:
            ips = x.xpath('td[1]/text()').extract_first()
            ports = x.xpath('td[2]/text()').extract_first()
            protocols = x.xpath('td[5]/text()').extract_first()
            types = x.xpath('td[4]/text()').extract_first()

            # Province and city are separate links; the city link may be absent.
            province = x.xpath('td[3]/a[1]/text()').extract_first()
            city = x.xpath('td[3]/a[2]/text()').extract_first()
            address = province + city if city is not None else province

            yield ProxyPoolItem({
                'ip': ips,
                'protocol': protocols,
                'port': ports,
                'types': types,
                'address': address,
                'website': 'www.mimiip.com'
            })

        # Only the first (level-1) page schedules the remaining pages, so
        # the pagination requests are not re-issued from every page.
        if level == 1 and page_number is not None:
            url = response.url
            for i in range(2, int(page_number) + 1):
                yield scrapy.Request("{0}/{1}".format(url, i),
                                     meta={'level': 2})
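
Every snippet in this section yields a ProxyPoolItem, whose definition is not shown. A minimal sketch of an Item carrying the fields these parse() methods actually populate (field names are taken from the snippets; everything else is an assumption) might look like:

    import scrapy


    class ProxyPoolItem(scrapy.Item):
        # Fields inferred from the examples in this section; the real
        # project may define more.
        ip = scrapy.Field()
        port = scrapy.Field()
        protocol = scrapy.Field()   # e.g. 'HTTP' / 'HTTPS'
        types = scrapy.Field()      # anonymity level as shown on the site
        address = scrapy.Field()    # geographic location of the proxy
        website = scrapy.Field()    # source site the proxy came from
        http_type = scrapy.Field()  # used by Examples #3 and #14
        country = scrapy.Field()    # used by Examples #3 and #14
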
Example #2
    def parse(self, response):
        # The "<current>/<total>" page counter at the bottom of the list.
        page = re.findall(
            r'<strong><font color="red">(\d+)</font>/(\d+)</strong>',
            response.text)
        ips = re.findall(r'<td class="style1">(\d+\.\d+\.\d+\.\d+)</td>',
                         response.text)
        ports = re.findall(r'<td class="style2">(\d+)</td>', response.text)
        types = re.findall(r'<td class="style3">([^<]+)</td>', response.text)
        addresses = re.findall(r'<td class="style5">([^<]+)</td>',
                               response.text)
        protocols = re.findall(r'<td class="style4">([^<]+)</td>',
                               response.text)
        if page:
            now_page, count_all = page[0]
            for ip, port, typ, address, protocol in zip(
                    ips, ports, types, addresses, protocols):
                yield ProxyPoolItem({
                    'ip': ip,
                    'protocol': protocol,
                    'types': typ,
                    'port': port,
                    'address': address,
                    'website': 'www.yun-daili.com'
                })
            # Stop at the last page instead of requesting past the end;
            # count_all is captured for exactly this comparison.
            if int(now_page) < int(count_all):
                _next = int(now_page) + 1
                _next_url = re.sub(r'page=(\d+)', 'page=' + str(_next),
                                   response.url)
                yield scrapy.Request(_next_url, self.parse)
Example #3
    def parse_list(self, response):
        ip_list = response.css('#footer > div > table > tr')

        for _ip_info in ip_list:
            _ip = _ip_info.css('td:nth-child(1)::text').extract()
            _port = _ip_info.css('td:nth-child(2)::text').extract()

            if not _ip or not _port:
                continue

            # Skip the header row, whose first cell is the literal text 'ip'.
            if _ip[0].strip() == 'ip':
                continue
            item = ProxyPoolItem()
            item['ip'] = _ip[0].strip()
            item['port'] = int(_port[0].strip())
            item['http_type'] = 3
            item['country'] = 'CN'
            yield item

        # Follow pages 1..5; the page number is embedded in the URL as
        # "<n>.html". Use the public response.url, not response._url.
        now_page = re.findall(r'.*?/(\d+)\.html', response.url)
        if now_page:
            now_page = int(now_page[0])
            if now_page < 5:
                yield scrapy.Request(
                    response.url.replace('%d.html' % now_page,
                                         '%d.html' % (now_page + 1)),
                    callback=self.parse_list)
            # Rate limiting belongs in the DOWNLOAD_DELAY setting;
            # time.sleep() here would block Scrapy's event loop.
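
These parse methods are excerpts; the spider classes, module-level imports (re, scrapy, ProxyPoolItem), and start URLs around them are not shown. A minimal hypothetical skeleton for a snippet like Example #3 above (the spider name and URL are placeholders) could be:

    import scrapy


    class ExampleProxySpider(scrapy.Spider):
        # Hypothetical wiring; the real spider's name and start URL
        # are not part of the excerpt.
        name = 'example_proxy'

        def start_requests(self):
            # Route the first response into parse_list explicitly.
            yield scrapy.Request('http://example.com/proxylist/1.html',
                                 callback=self.parse_list)

        def parse_list(self, response):
            ...  # body as in Example #3
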
Example #4
    def parse(self, response):
        # Each proxy is one <ul class="l2">; spans 1-6 hold the columns.
        iplist = response.xpath('//ul[@class="l2"]')

        for x in iplist[1:-1]:
            ip = x.xpath('span[1]/li/text()').extract_first()
            port = x.xpath('span[2]/li/text()').extract_first()
            protocol = str(
                x.xpath('span[4]/li/a/text()').extract_first()).upper()
            proxy_type = x.xpath('span[3]/li/a/text()').extract_first()

            city = x.xpath('span[6]/li/a/text()').extract_first()
            # Relative path so each row reads its own country cell; an
            # absolute //ul[@class="l2"][1] path would always hit row 1.
            country = x.xpath('span[5]/li/a/text()').extract_first()
            address = country + city if city is not None else country

            yield ProxyPoolItem({
                'ip': ip,
                'protocol': protocol,
                'port': port,
                'types': proxy_type,
                'address': address,
                'website': 'www.data5u.com'
            })
Example #5
    def parse(self, response):
        info = response.xpath('//div[@class="daililist"]/table/tr')
        # Data rows alternate with spacer rows, so step by 2.
        for x in info[1::2]:
            item = ProxyPoolItem()
            data = x.xpath('td/text()').extract()
            # The first cell holds "ip:port" in a single string.
            item['ip'], item['port'] = data[0].split(':')
            item['protocol'] = 'HTTP'
            item['address'] = data[1]
            item['website'] = 'baizhongsou.com'
            item['types'] = '普通'  # "normal" (transparent) anonymity
            yield item
Example #6
    def parse(self, response):
        s = response.text
        # Split the page into <tr> rows, then pull columns from each row.
        row_re = re.compile(r'<tr>[\s\S]*?</tr>', re.I | re.M)
        for each in row_re.findall(s)[1:]:
            tds = re.findall(r'<td>([\s\S]*?)</td>', each)
            yield ProxyPoolItem({
                'ip': re.findall(r'\d+\.\d+\.\d+\.\d+', each)[0],
                'port': re.findall(r'<td>(\d+)</td>', each)[0],
                'types': tds[6],
                'protocol': tds[4],
                'address': re.findall(r'\.aspx">([^<]+)</a>', each)[0],
                'website': 'https://proxy.coderbusy.com/'
            })
Example #7
    def parse(self, response):
        # One regex captures all six columns of a row: ip, port,
        # anonymity, protocol, a skipped column, and location.
        all_data = re.findall(
            r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>\s+'
            r'<td>([^<]+)</td>\s+<td>([^<]+)</td>\s+'
            r'<td>([^<]+)</td>\s+<td>([^<]+)</td>',
            response.text)
        for each in all_data:
            yield ProxyPoolItem({
                'ip': each[0],
                'port': each[1],
                'types': each[2],
                'protocol': each[3],
                'address': each[5],
                'website': 'www.kxdaili.com'
            })
Example #8
    def parse(self, response):
        ips = re.findall(r'<td>(\d+\.\d+\.\d+\.\d+)</td>', response.text)
        ports = re.findall(r'<td>(\d+)</td>', response.text)
        types = re.findall(r'<td class="country">([^<]+)</td>', response.text)
        protocols = re.findall(r'<td>(HTTPS?)</td>', response.text)
        # zip() truncates to the shortest list, so a row with a missing
        # column silently shifts the pairing; the patterns above must
        # match exactly once per row for the columns to stay aligned.
        for ip, port, _type, protocol in zip(ips, ports, types, protocols):
            yield ProxyPoolItem({
                'website': 'xicidaili',
                'ip': ip,
                'protocol': protocol,
                'port': port,
                'types': _type
            })
Example #9
    def parse(self, response):
        data = response.xpath('//*[@id="list"]/table/tbody/tr')
        if data:
            for x in data:
                item = ProxyPoolItem()
                info = x.xpath('td')
                # goubanjia hides decoy digits in <p> elements inside the
                # IP cell; keep only text from non-<p> descendants.
                ipport = x.xpath('td[1]//*[name(.)!="p"]/text()').extract()
                address = x.xpath('td[4]//*[name(.)="a"]/text()').extract()
                # The last text node is the port; the rest join into the IP.
                item['ip'] = "".join(ipport[:-1])
                item['port'] = ipport[-1]
                item['protocol'] = info[2].xpath('string(.)').extract()[0]
                item['types'] = info[1].xpath('string(.)').extract()[0]
                item['address'] = "".join(address)
                item['website'] = 'goubanjia.com'
                yield item
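
For context on the name(.)!="p" filter in Example #9: the site interleaves the visible IP fragments with hidden decoy digits in <p> elements, and excluding those elements reassembles the real address. A standalone illustration against simplified, assumed markup (the real page structure varies):

    from scrapy.selector import Selector

    # Assumed, simplified markup: decoy digits live in a hidden <p>.
    html = '''<table><tr><td>
    <span>123</span><p style="display:none">999</p><span>.45</span>
    <span>.67</span><span>.89</span><span>8080</span>
    </td></tr></table>'''

    parts = Selector(text=html).xpath('//td//*[name(.)!="p"]/text()').extract()
    # The last text node is the port; the rest join into the IP.
    ip, port = ''.join(parts[:-1]), parts[-1]
    print(ip, port)  # -> 123.45.67.89 8080
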
Example #10
    def parse(self, response):
        data = response.xpath(
            '//table[@class="table table-bordered table-striped table-hover"]/tr'
        )
        if data:
            # Skip the header row.
            for x in data[1:]:
                item = ProxyPoolItem()
                info = x.xpath('td/text()').extract()
                item['ip'] = info[0].strip()
                item['protocol'] = info[3].strip()
                item['port'] = info[1].strip()
                item['types'] = info[2].strip()
                item['address'] = info[4].strip()
                item['website'] = 'iphai.com'
                yield item
Example #11
    def parse(self, response):
        info = response.xpath(
            '//div[@class="CommonBody"]/table[6]/tr[4]/td/table/tr')
        for x in info[1:]:
            item = ProxyPoolItem()
            data = x.xpath('td/text()').extract()
            try:
                item['ip'] = data[0]
            except IndexError:
                # Skip separator rows that have no text cells.
                continue
            item['protocol'] = 'HTTP'
            item['port'] = data[1]
            item['address'] = x.xpath('td/div/text()').extract()[0]
            item['types'] = data[2]
            item['website'] = '3464.com'
            yield item
Example #12
    def parse(self, response):
        info = response.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr')
        if info:
            for d in info:
                item = ProxyPoolItem()
                data = d.xpath('td/text()').extract()
                item['ip'] = data[0]
                item['protocol'] = data[3]
                item['port'] = data[1]
                item['types'] = data[2]
                item['address'] = data[4]
                item['website'] = 'httpsdaili.com'
                yield item
            # The second-to-last pagination link is taken as the next page.
            # Request here is scrapy.Request, imported in the source module.
            next_url = response.xpath(
                '//div[@id="listnav"]/ul/a/@href').extract()[-2]
            yield Request('http://www.httpsdaili.com/' + next_url, self.parse)
Example #13
    def parse(self, response):
        # The URL encodes both the list type and the current page,
        # e.g. .../free/inha/3/ -> ('inha', '3').
        typee, now_page = re.findall(r'com/free/(.*?)/(\d+)/', response.url)[0]
        iplist = response.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr')
        if iplist:
            for x in iplist:
                data = x.xpath('td/text()').extract()
                item = ProxyPoolItem()
                item['ip'] = data[0]
                item['protocol'] = data[3]
                item['port'] = data[1]
                item['types'] = data[2]
                item['address'] = data[4]
                item['website'] = 'www.kuaidaili.com'
                yield item
            # Crawling stops naturally on the first page with no rows,
            # since this whole block is guarded by `if iplist`.
            next_page = int(now_page) + 1
            next_url = 'http://www.kuaidaili.com/free/{}/{}/'.format(
                typee, next_page)
            yield Request(next_url, self.parse)
Example #14
    def parse(self, response):
        ip_list = response.css('#ip_list > tr')
        for _ip_info in ip_list:
            _ip = _ip_info.css('td:nth-child(2)::text').extract()
            _port = _ip_info.css('td:nth-child(3)::text').extract()
            _http_type = _ip_info.css('td:nth-child(6)::text').extract()
            # The header and malformed rows lack one of the cells; skip them.
            if not _ip or not _port or not _http_type:
                continue
            item = ProxyPoolItem()
            item['ip'] = _ip[0].strip()
            item['port'] = int(_port[0].strip())
            item['http_type'] = 1 if _http_type[0].strip() == 'HTTP' else 2
            item['country'] = 'CN'
            yield item
        # Page counter kept as a class attribute on the spider.
        if self.CURRENT_PAGE < 5:
            self.CURRENT_PAGE += 1
            yield scrapy.Request('http://www.xicidaili.com/nn/' +
                                 str(self.CURRENT_PAGE))
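
Example #14 references self.CURRENT_PAGE, a counter kept on the spider class rather than passed through request meta. A minimal hypothetical context (the name and start URL are assumptions):

    import scrapy


    class XiciSpider(scrapy.Spider):
        # Hypothetical context: parse() increments CURRENT_PAGE until
        # page 5, issuing one request per page.
        name = 'xici'
        CURRENT_PAGE = 1
        start_urls = ['http://www.xicidaili.com/nn/1']

        def parse(self, response):
            ...  # body as in Example #14
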
Example #15
    def parse(self, response):
        info = response.xpath('/html/body/div[2]/table/tbody/tr')
        for x in info:
            # proxydb.net renders the ip:port via an inline <script>,
            # which the helper parse_script() decodes.
            ip_script = x.xpath('td[1]/script/text()').extract()[0]
            item = ProxyPoolItem()
            item['ip'], item['port'] = parse_script(ip_script)
            item['protocol'] = x.xpath('td[2]/text()').extract()[0].strip()
            item['address'] = x.xpath('td[3]/abbr/text()').extract()[0]
            item['types'] = x.xpath('td[4]/span/text()').extract()[0]
            item['website'] = 'proxydb.net'
            yield item
        next_url = response.xpath(
            '/html/body/div[2]/nav/a[2]/@href').extract()[0]
        if self.index_page < self.max_page and next_url:
            self.index_page += 1
            yield Request('http://proxydb.net' + next_url,
                          self.parse,
                          headers={'user-agent': USER_AGENT})
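
parse_script in Example #15 is a helper defined elsewhere in the source project; its contract, inferred from the call site, is to take the inline script's text and return an (ip, port) pair. The stand-in below only demonstrates that contract; its decoding logic is a placeholder, not proxydb.net's actual scheme:

    import re


    def parse_script(script_text):
        # Hypothetical stand-in: the real helper decodes proxydb.net's
        # obfuscated inline script. Here we just pull the first things
        # that look like an IP and a port out of the raw text.
        ip = re.search(r'\d{1,3}(?:\.\d{1,3}){3}', script_text)
        port = re.search(r':(\d{2,5})', script_text)
        if not (ip and port):
            raise ValueError('could not decode ip:port from script')
        return ip.group(0), port.group(1)
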
Example #16
    def parse(self, response):
        # The first and last rows are the header and the pager.
        iplist = response.xpath('//*[@id="footer"]/div/table//tr')

        for x in iplist[1:-1]:
            ips = x.xpath('td[1]/text()').extract_first()
            ports = x.xpath('td[2]/text()').extract_first()
            protocols = 'HTTP'  # hard-coded; not read from the page
            address = x.xpath('td[3]/text()').extract_first()
            types = x.xpath('td[4]/text()').extract_first()

            yield ProxyPoolItem({
                'ip': ips,
                'protocol': protocols,
                'port': ports,
                'types': types,
                'address': address,
                'website': 'www.66ip.cn'
            })
Example #17
    def parse(self, response):
        # The "<current>/<total>" page counter, styled in blue here.
        page = re.findall(
            r'<strong><font color="#49afcd">(\d+)</font>/(\d+)</strong>',
            response.text)
        ips = re.findall(r'<td style="WIDTH:110PX">(\d+\.\d+\.\d+\.\d+)</td>',
                         response.text)
        ports = re.findall(r'<td style="WIDTH:40PX">(\d+)</td>', response.text)
        # Two different 55PX columns are told apart by content:
        # [\u4e00-\u9fa5] matches the Chinese anonymity label, while the
        # protocol column only ever contains HTTP or HTTPS.
        types = re.findall(r'<td style="WIDTH:55PX">([\u4e00-\u9fa5]+)</td>',
                           response.text)
        addresses = re.findall(r'<td style="WIDTH:135PX">([\u4e00-\u9fa5]+)</td>',
                               response.text)
        protocols = re.findall(r'<td style="WIDTH:55PX">(HTTPS?)</td>',
                               response.text)
        if page:
            now_page, count_all = page[0]
            for ip, port, typ, address, protocol in zip(
                    ips, ports, types, addresses, protocols):
                yield ProxyPoolItem({
                    'ip': ip,
                    'protocol': protocol,
                    'types': typ,
                    'port': port,
                    'address': address,
                    'website': 'www.nianshao.me'
                })
            # Only request the next page while one exists.
            if int(now_page) < int(count_all):
                _next = int(now_page) + 1
                _next_url = re.sub(r'page=(\d+)', 'page=' + str(_next),
                                   response.url)
                yield scrapy.Request(_next_url, self.parse)
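
All of these spiders hand their items to the project's item pipelines, which are not included in the excerpts. As a hedged sketch only, a pipeline that de-duplicates the yielded proxies by their ip:port pair could look like:

    from scrapy.exceptions import DropItem


    class ProxyDedupPipeline(object):
        # Hypothetical sketch; the real project's pipelines are not shown.

        def open_spider(self, spider):
            self.seen = set()

        def process_item(self, item, spider):
            key = '{}:{}'.format(item.get('ip'), item.get('port'))
            if key in self.seen:
                raise DropItem('duplicate proxy: ' + key)
            self.seen.add(key)
            return item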