def parse(self, response):
    # www.mimiip.com: proxy rows live in a plain <table>; the [1:-1] slice
    # skips the non-data first and last rows.
    iplist = response.xpath('//table/tr')
    page_number = response.xpath(
        "//div[@class='pagination']/a[last()-1]/text()").extract_first()
    level = response.meta['level']
    for x in iplist[1:-1]:
        ips = x.xpath('td[1]/text()').extract_first()
        ports = x.xpath('td[2]/text()').extract_first()
        protocols = x.xpath('td[5]/text()').extract_first()
        types = x.xpath('td[4]/text()').extract_first()
        province = x.xpath('td[3]/a[1]/text()').extract_first()
        city = x.xpath('td[3]/a[2]/text()').extract_first()
        address = province + city if city is not None else province
        yield ProxyPoolItem({
            'ip': ips,
            'protocol': protocols,
            'port': ports,
            'types': types,
            'address': address,
            'website': 'www.mimiip.com'
        })
    # Only the level-1 (seed) page spawns the remaining pages, so the
    # pagination requests are not re-issued from every response.
    if level == 1 and page_number is not None:
        url = response.url
        for i in range(2, int(page_number) + 1):
            yield scrapy.Request("{0}/{1}".format(url, i), meta={'level': 2})

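# The snippets in this file assume module-level imports (import re,
# import scrapy, from scrapy import Request) plus a shared item class.
# A minimal sketch of those shared pieces follows: the ProxyPoolItem
# fields are inferred from the yields in this file, while the spider
# scaffold (class name, start_urls, and the meta={'level': 1} seeding
# that the mimiip parser reads) is an assumption, not taken from the
# source.
import scrapy


class ProxyPoolItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    protocol = scrapy.Field()
    types = scrapy.Field()
    address = scrapy.Field()
    website = scrapy.Field()
    # A couple of parsers below use these two fields instead:
    http_type = scrapy.Field()
    country = scrapy.Field()


class MimiipSpider(scrapy.Spider):
    name = 'mimiip'  # hypothetical spider name
    start_urls = ['http://www.mimiip.com/gngao']  # hypothetical seed URL

    def start_requests(self):
        # Mark the seed page as level 1 so parse() spawns pagination
        # requests exactly once.
        for url in self.start_urls:
            yield scrapy.Request(url, meta={'level': 1})
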
def parse(self, response):
    # www.yun-daili.com: the "current/total" marker, e.g.
    # <strong><font color="red">3</font>/10</strong>, drives pagination.
    page = re.findall(
        r'<strong><font color="red">(\d+)</font>/(\d+)</strong>',
        response.text)
    ips = re.findall(r'<td class="style1">(\d+\.\d+\.\d+\.\d+)</td>',
                     response.text)
    ports = re.findall(r'<td class="style2">(\d+)</td>', response.text)
    types = re.findall(r'<td class="style3">([^<]+)</td>', response.text)
    addresses = re.findall(r'<td class="style5">([^<]+)</td>', response.text)
    protocols = re.findall(r'<td class="style4">([^<]+)</td>', response.text)
    if page:
        now_page, count_all = page[0]
        for ip, port, typ, address, protocol in zip(
                ips, ports, types, addresses, protocols):
            yield ProxyPoolItem({
                'ip': ip,
                'protocol': protocol,
                'types': typ,
                'port': port,
                'address': address,
                'website': 'www.yun-daili.com'
            })
        # Stop at the last page instead of incrementing forever (count_all
        # was captured but never used in the original).
        if int(now_page) < int(count_all):
            _next = int(now_page) + 1
            _next_url = re.sub(r'page=(\d+)', 'page=' + str(_next),
                               response.url)
            yield scrapy.Request(_next_url, self.parse)

def parse_list(self, response):
    ip_list = response.css('#footer > div > table > tr')
    for _ip_info in ip_list:
        _ip = _ip_info.css('td:nth-child(1)::text').extract()
        _port = _ip_info.css('td:nth-child(2)::text').extract()
        if not _ip or not _port:
            continue
        if _ip[0].strip() == 'ip':
            continue  # header row: its first cell is the literal text "ip"
        item = ProxyPoolItem()
        item['ip'] = _ip[0].strip()
        item['port'] = int(_port[0].strip())
        item['http_type'] = 3
        item['country'] = 'CN'
        yield item
    # Follow pages up to 5.html; response.url replaces the private
    # response._url attribute the original read.
    now_page = re.findall(r'.*?/(\d+)\.html', response.url)
    if now_page:
        now_page = int(now_page[0])
        if now_page < 5:
            yield scrapy.Request(
                response.url.replace('%d.html' % now_page,
                                     '%d.html' % (now_page + 1)),
                callback=self.parse_list)
    # The original ended with a blocking time.sleep(1); throttling belongs
    # in settings (DOWNLOAD_DELAY = 1) so the Twisted reactor is not stalled.

def parse(self, response):
    iplist = response.xpath('//ul[@class="l2"]')
    for x in iplist[1:-1]:
        ip = x.xpath('span[1]/li/text()').extract_first()
        port = x.xpath('span[2]/li/text()').extract_first()
        protocol = str(
            x.xpath('span[4]/li/a/text()').extract_first()).upper()
        types = x.xpath('span[3]/li/a/text()').extract_first()
        city = x.xpath('span[6]/li/a/text()').extract_first()
        # Relative path: the original used an absolute XPath
        # ('//ul[@class="l2"][1]/span[5]/...'), which re-read the country
        # from the first row on every iteration.
        country = x.xpath('span[5]/li/a/text()').extract_first()
        address = country + city if city is not None else country
        yield ProxyPoolItem({
            'ip': ip,
            'protocol': protocol,
            'port': port,
            'types': types,
            'address': address,
            'website': 'www.data5u.com'
        })

def parse(self, response):
    info = response.xpath('//div[@class="daililist"]/table/tr')
    # Data sits on every other row, starting from the second one.
    for x in info[1::2]:
        item = ProxyPoolItem()
        data = x.xpath('td/text()').extract()
        ip, port = data[0].split(':', 1)  # first cell holds "ip:port"
        item['ip'] = ip
        item['port'] = port
        item['protocol'] = 'HTTP'
        item['address'] = data[1]
        item['website'] = 'baizhongsou.com'
        item['types'] = '普通'  # the site's label for "ordinary" proxies
        yield item

def parse(self, response):
    s = response.text
    # Grab each <tr>...</tr> block, then pick fields out of it by position.
    r = re.compile(r'<tr>[\s\S]*?</tr>', re.I | re.M)
    d = r.findall(s)
    for each in d[1:]:  # d[0] is the header row
        yield ProxyPoolItem({
            'ip': re.findall(r'\d+\.\d+\.\d+\.\d+', each)[0],
            'port': re.findall(r'<td>(\d+)</td>', each)[0],
            'types': re.findall(r'<td>([\s\S]*?)</td>', each)[6],
            'protocol': re.findall(r'<td>([\s\S]*?)</td>', each)[4],
            'address': re.findall(r'\.aspx">([^<]+)</a>', each)[0],
            'website': 'https://proxy.coderbusy.com/'
        })

def parse(self, response):
    # One regex captures a whole six-column row at a time; the fifth
    # captured column (each[4]) is unused.
    all_data = re.findall(
        r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>\s+<td>([^<]+)</td>'
        r'\s+<td>([^<]+)</td>\s+<td>([^<]+)</td>\s+<td>([^<]+)</td>',
        response.text)
    for each in all_data:
        yield ProxyPoolItem({
            'ip': each[0],
            'port': each[1],
            'types': each[2],
            'protocol': each[3],
            'address': each[5],
            'website': 'www.kxdaili.com'
        })

def parse(self, response):
    # Field lists are collected independently and zipped back together,
    # which assumes every pattern matches exactly once per row, in order.
    ips = re.findall(r'<td>(\d+\.\d+\.\d+\.\d+)</td>', response.text)
    ports = re.findall(r'<td>(\d+)</td>', response.text)
    types = re.findall(r'<td class="country">([^<]+)</td>', response.text)
    protocols = re.findall(r'<td>(HTTPS?)</td>', response.text)
    for ip, port, _type, protocol in zip(ips, ports, types, protocols):
        yield ProxyPoolItem({
            'website': 'xicidaili',
            'ip': ip,
            'protocol': protocol,
            'port': port,
            'types': _type
        })

def parse(self, response):
    data = response.xpath('//*[@id="list"]/table/tbody/tr')
    if data:
        for x in data:
            item = ProxyPoolItem()
            info = x.xpath('td')
            # goubanjia.com hides junk fragments in decoy <p> tags inside
            # the ip cell; keeping only non-<p> descendants reassembles
            # the real ip:port.
            ipport = x.xpath('td[1]//*[name(.)!="p"]/text()').extract()
            address = x.xpath('td[4]//*[name(.)="a"]/text()').extract()
            item['ip'] = ''.join(ipport[:-1])  # last fragment is the port
            item['port'] = ipport[-1]
            item['protocol'] = info[2].xpath('string(.)').extract()[0]
            item['types'] = info[1].xpath('string(.)').extract()[0]
            item['address'] = ''.join(address)
            item['website'] = 'goubanjia.com'
            yield item

def parse(self, response):
    data = response.xpath(
        '//table[@class="table table-bordered table-striped table-hover"]/tr')
    if data:
        for x in data[1:]:  # skip the header row
            item = ProxyPoolItem()
            info = x.xpath('td/text()').extract()
            item['ip'] = info[0].strip()
            item['protocol'] = info[3].strip()
            item['port'] = info[1].strip()
            item['types'] = info[2].strip()
            item['address'] = info[4].strip()
            item['website'] = 'iphai.com'
            yield item

def parse(self, response):
    info = response.xpath(
        '//div[@class="CommonBody"]/table[6]/tr[4]/td/table/tr')
    for x in info[1:]:
        item = ProxyPoolItem()
        data = x.xpath('td/text()').extract()
        try:
            item['ip'] = data[0]
        except IndexError:
            continue  # spacer rows have no <td> text
        item['protocol'] = 'HTTP'
        item['port'] = data[1]
        item['address'] = x.xpath('td/div/text()').extract()[0]
        item['types'] = data[2]
        item['website'] = '3464.com'
        yield item

def parse(self, response):
    info = response.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    if info:
        for d in info:
            item = ProxyPoolItem()
            data = d.xpath('td/text()').extract()
            item['ip'] = data[0]
            item['protocol'] = data[3]
            item['port'] = data[1]
            item['types'] = data[2]
            item['address'] = data[4]
            item['website'] = 'httpsdaili.com'
            yield item
    # The second-to-last pager link points at the next page; guard against
    # pages where the pager is missing (the original indexed [-2] blindly).
    nav = response.xpath('//div[@id="listnav"]/ul/a/@href').extract()
    if len(nav) >= 2:
        yield Request('http://www.httpsdaili.com/' + nav[-2], self.parse)

def parse(self, response):
    # URL shape: http://www.kuaidaili.com/free/<type>/<page>/
    typee, now_page = re.findall(r'com/free/(.*?)/(\d+)/', response.url)[0]
    iplist = response.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    if iplist:
        for x in iplist:
            data = x.xpath('td/text()').extract()
            item = ProxyPoolItem()
            item['ip'] = data[0]
            item['protocol'] = data[3]
            item['port'] = data[1]
            item['types'] = data[2]
            item['address'] = data[4]
            item['website'] = 'www.kuaidaili.com'
            yield item
        # Paginate only while rows keep coming; an empty table means the
        # crawl ran past the last page.
        next_page = int(now_page) + 1
        next_url = 'http://www.kuaidaili.com/free/{}/{}/'.format(
            typee, next_page)
        yield Request(next_url, self.parse)

def parse(self, response):
    ip_list = response.css('#ip_list > tr')
    for _ip_info in ip_list:
        _ip = _ip_info.css('td:nth-child(2)::text').extract()
        _port = _ip_info.css('td:nth-child(3)::text').extract()
        _http_type = _ip_info.css('td:nth-child(6)::text').extract()
        if not _ip or not _port or not _http_type:
            continue  # header and decoration rows lack these cells
        item = ProxyPoolItem()
        item['ip'] = _ip[0].strip()
        item['port'] = int(_port[0].strip())
        item['http_type'] = 1 if _http_type[0].strip() == 'HTTP' else 2
        item['country'] = 'CN'
        yield item
    # Walk the first five listing pages, one per response.
    if self.CURRENT_PAGE < 5:
        self.CURRENT_PAGE += 1
        yield scrapy.Request('http://www.xicidaili.com/nn/' +
                             str(self.CURRENT_PAGE))

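# CURRENT_PAGE above is read before it is ever assigned in parse(), so it
# must exist as a class attribute. A minimal scaffold sketch; the class
# name and start_urls are assumptions, not taken from the source:
class XiciSpider(scrapy.Spider):
    name = 'xici'  # hypothetical
    start_urls = ['http://www.xicidaili.com/nn/1']
    CURRENT_PAGE = 1  # page counter consumed by parse() above
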
def parse(self, response):
    info = response.xpath('/html/body/div[2]/table/tbody/tr')
    for x in info:
        # proxydb.net renders the ip:port through an inline <script>;
        # parse_script() decodes it (see the sketch below).
        ip_script = x.xpath('td[1]/script/text()').extract()[0]
        item = ProxyPoolItem()
        item['ip'], item['port'] = parse_script(ip_script)
        item['protocol'] = x.xpath('td[2]/text()').extract()[0].strip()
        item['address'] = x.xpath('td[3]/abbr/text()').extract()[0]
        item['types'] = x.xpath('td[4]/span/text()').extract()[0]
        item['website'] = 'proxydb.net'
        yield item
    next_url = response.xpath(
        '/html/body/div[2]/nav/a[2]/@href').extract()[0]
    if self.index_page < self.max_page and next_url:
        self.index_page += 1
        yield Request('http://proxydb.net' + next_url, self.parse,
                      headers={'user-agent': USER_AGENT})

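# parse_script() above is not defined in this file. A minimal sketch,
# assuming the inline <script> merely concatenates quoted fragments into
# an "ip:port" string; proxydb.net has changed its obfuscation over time,
# so treat this as a placeholder rather than a faithful decoder. It relies
# on the module-level re import.
def parse_script(script_text):
    # Collect every single-quoted fragment from the script body.
    fragments = re.findall(r"'([^']*)'", script_text)
    # Join and split on the last colon to separate ip from port.
    ip, port = ''.join(fragments).rsplit(':', 1)
    return ip, port
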
def parse(self, response):
    iplist = response.xpath('//*[@id="footer"]/div/table//tr')
    for x in iplist[1:-1]:  # first and last rows are not data rows
        ips = x.xpath('td[1]/text()').extract_first()
        ports = x.xpath('td[2]/text()').extract_first()
        protocols = 'HTTP'  # the site exposes no per-row protocol column
        address = x.xpath('td[3]/text()').extract_first()
        types = x.xpath('td[4]/text()').extract_first()
        yield ProxyPoolItem({
            'ip': ips,
            'protocol': protocols,
            'port': ports,
            'types': types,
            'address': address,
            'website': 'www.66ip.cn'
        })

def parse(self, response):
    page = re.findall(
        r'<strong><font color="#49afcd">(\d+)</font>/(\d+)</strong>',
        response.text)
    ips = re.findall(r'<td style="WIDTH:110PX">(\d+\.\d+\.\d+\.\d+)</td>',
                     response.text)
    ports = re.findall(r'<td style="WIDTH:40PX">(\d+)</td>', response.text)
    types = re.findall(u'<td style="WIDTH:55PX">([\u4e00-\u9fa5]+)</td>',
                       response.text)
    # The original character class had a stray '[' ("([[\u4e00-...") that
    # also matched literal brackets; fixed here.
    addresses = re.findall(u'<td style="WIDTH:135PX">([\u4e00-\u9fa5]+)</td>',
                           response.text)
    protocols = re.findall(r'<td style="WIDTH:55PX">(HTTPS?)</td>',
                           response.text)
    if page:
        now_page, count_all = page[0]
        for ip, port, typ, address, protocol in zip(
                ips, ports, types, addresses, protocols):
            yield ProxyPoolItem({
                'ip': ip,
                'protocol': protocol,
                'types': typ,
                'port': port,
                'address': address,
                'website': 'www.nianshao.me'
            })
        # Stop at the last page rather than requesting pages forever.
        if int(now_page) < int(count_all):
            _next = int(now_page) + 1
            _next_url = re.sub(r'page=(\d+)', 'page=' + str(_next),
                               response.url)
            yield scrapy.Request(_next_url, self.parse)