def parse_item(self, response):
    """Parse one page of the proxy table and schedule the next page.

    Every ``//table[@class='table']/tbody/tr`` row is turned into an
    ``Item``; the items are sorted newest-first on ``last_verify_time`` and
    emitted as ``IPItem`` objects until ``self.daysDelta`` rejects one.
    The following page is requested only when every parsed row was accepted.

    Args:
        response: Scrapy response; ``response.meta['page']`` holds the
            number of the page being parsed.

    Yields:
        ``IPItem`` objects, followed by at most one ``Request`` for
        ``page + 1``.
    """
    import logging
    import operator

    page = response.meta['page']
    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (utils.get_time_now(), "Target ==> " + response.url))
    if page > MAX_PAGE_INDEX_NS:
        return

    proxies = []
    for row in response.xpath("//table[@class='table']/tbody/tr"):
        try:
            proxy = Item(
                row.xpath('td[1]/text()').extract()[0],
                row.xpath('td[2]/text()').extract()[0],
                row.xpath('td[4]/text()').extract()[0],
                row.xpath('td[5]/text()').extract()[0],
                row.xpath('td[3]//text()').extract()[0],
                '0',  # constant; presumably the latency slot -- confirm against Item
                row.xpath('td[8]/text()').extract()[0])
        except IndexError:
            # A row with a missing cell must not abort the whole page (and
            # with it the pagination); report it and move on.
            logging.getLogger(__name__).warning(
                "Skipping malformed proxy row on %s", response.url)
            continue
        proxies.append(proxy)

    # Whether to stop crawling: an empty page (or one without a single
    # usable row) ends the pagination.
    if not proxies:
        return

    # Sort newest first.  The timestamps are compared as strings.
    # NOTE(review): assumes the site's timestamp text sorts chronologically.
    proxies.sort(key=operator.attrgetter("last_verify_time"), reverse=True)

    for proxy in proxies:
        if not self.daysDelta(proxy.last_verify_time):
            # The list is sorted, so everything after this row is rejected
            # as well; stop without requesting another page.
            return
        record = IPItem()
        record['ip'] = proxy.ip
        record['port'] = proxy.port
        record['anonymous'] = proxy.anonymous
        record['http_type'] = proxy.http_type
        record['location'] = proxy.location
        record['latency'] = proxy.latency
        record['last_verify_time'] = proxy.last_verify_time
        record['source'] = self.allowed_domains[0]
        yield record

    next_page = page + 1
    yield Request(self.start_urls[0] + '?page=' + str(next_page),
                  callback=self.parse_item,
                  meta={'page': next_page})
def parse_item(self, response):
    """Parse one page of the ``ip_list`` proxy table and schedule the next.

    The first ``<tr>`` of ``//table[@id='ip_list']`` is skipped; each
    remaining row becomes an ``Item``.  Items are sorted newest-first on
    ``last_verify_time`` and emitted as ``IPItem`` objects until
    ``self.daysDelta`` rejects one.  The following page is requested only
    when every parsed row was accepted.

    Args:
        response: Scrapy response; ``response.meta['page']`` is the current
            page number and ``response.meta['url']`` the URL prefix to which
            a page number is appended.

    Yields:
        ``IPItem`` objects, followed by at most one ``Request`` for
        ``page + 1``.

    Raises:
        ValueError: if an accepted row's timestamp does not match
            ``'%y-%m-%d %H:%M'``.
    """
    import logging
    import operator

    page = response.meta['page']
    url = response.meta['url']
    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (utils.get_time_now(), "Target ==> " + response.url))
    # NOTE: no page cap is applied here; crawling stops only on an empty
    # page or on the first rejected row.

    proxies = []
    for row in response.xpath("//table[@id='ip_list']/tr")[1:]:
        try:
            location = "".join(
                row.xpath('td[5]//a/text()').extract() +
                row.xpath('td[5]/text()').extract()).strip()
            proxy = Item(
                row.xpath('td[3]/text()').extract()[0],
                row.xpath('td[4]/text()').extract()[0],
                row.xpath('td[6]/text()').extract()[0],
                row.xpath('td[7]/text()').extract()[0],
                location,
                row.xpath('td[9]/div/@title').extract()[0],
                row.xpath('td[10]/text()').extract()[0])
        except IndexError:
            # A row with a missing cell must not abort the whole page (and
            # with it the pagination); report it and move on.
            logging.getLogger(__name__).warning(
                "Skipping malformed proxy row on %s", response.url)
            continue
        proxies.append(proxy)

    # Whether to stop crawling: an empty page (or one without a single
    # usable row) ends the pagination.
    if not proxies:
        return

    # Sort newest first.  The timestamps are compared as strings; the
    # '%y-%m-%d %H:%M' layout sorts chronologically when zero-padded.
    proxies.sort(key=operator.attrgetter("last_verify_time"), reverse=True)

    for proxy in proxies:
        if not self.daysDelta(proxy.last_verify_time):
            # The list is sorted, so everything after this row is rejected
            # as well; stop without requesting another page.
            return
        record = IPItem()
        record['ip'] = proxy.ip
        record['port'] = proxy.port
        record['anonymous'] = proxy.anonymous
        record['http_type'] = proxy.http_type
        record['location'] = proxy.location
        record['latency'] = proxy.latency
        record['last_verify_time'] = datetime.datetime.strptime(
            proxy.last_verify_time, '%y-%m-%d %H:%M')
        record['source'] = url
        yield record

    next_page = page + 1
    yield Request(url + str(next_page),
                  callback=self.parse_item,
                  meta={'page': next_page, 'url': url})
def parse_item(self, response):
    """Parse a proxy listing whose port numbers are produced by JavaScript.

    The page's first ``<body>`` script is evaluated in a PyV8 context, then
    each data row's port expression is evaluated term by term in that same
    context and the results are concatenated into the port string.  Rows
    are sorted newest-first on their parsed timestamp and emitted as
    ``IPItem`` objects until ``self.daysDelta`` rejects one.

    Args:
        response: Scrapy response for the listing page.

    Yields:
        ``IPItem`` objects for the accepted rows.

    Raises:
        IndexError: if the page has no ``//body/script[1]`` text.
    """
    import logging
    import operator

    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (utils.get_time_now(), "Target ==> " + response.url))

    main_js = response.xpath("//body/script[1]/text()").extract()
    rows = response.xpath("//table[2]/tr[4]/td/table/tr")

    dated_proxies = []  # (parsed timestamp, Item) pairs
    js = PyV8.JSContext()
    js.enter()
    try:
        # Evaluate the page script first so the port expressions below run
        # against whatever it defines.
        js.eval(main_js[0])
        # The first four rows and the last one are not parsed as data.
        for row in rows[4:-1]:
            try:
                ip_port = row.xpath("td[1]/font[2]")
                ip = ip_port.xpath("text()").extract()[0]
                port_script = ip_port.xpath("script/text()").extract()[0]
                # '\\/' is the same two characters the page contains
                # (backslash, slash), spelled without an invalid escape.
                port_expr = port_script.split('<\\/font>"+')[1]
                # Drop the trailing character, then evaluate each
                # '+'-separated term and concatenate the results.
                port = "".join(
                    str(js.eval(term)) for term in port_expr[:-1].split("+"))
                proxy = Item(
                    ip,
                    port,
                    row.xpath("td[3]/font/text()").extract()[0],
                    "".join(
                        row.xpath(
                            "td[2]/a/font[@class='spy1']/text()").extract() +
                        row.xpath(
                            "td[2]/a/font[@class='spy14']/text()").extract()),
                    "".join(
                        row.xpath("td[4]/font/text()").extract() +
                        row.xpath("td[4]/font/font/text()").extract()),
                    row.xpath("td[6]/font/text()").extract()[0],
                    row.xpath(
                        "td[9]/font/font[@class='spy14']/text()").extract()[0]
                    + row.xpath("td[9]/font/text()").extract()[0])
                # The last character is replaced by ':00' so the text
                # matches the seconds-bearing format below.
                verified_at = datetime.datetime.strptime(
                    proxy.last_verify_time[:-1] + ':00',
                    '%d-%b-%Y %H:%M:%S')
            except (IndexError, ValueError):
                # A row with a missing cell or an unparsable timestamp must
                # not abort the whole page; report it and move on.
                logging.getLogger(__name__).warning(
                    "Skipping malformed proxy row on %s", response.url)
                continue
            dated_proxies.append((verified_at, proxy))
    finally:
        # Always leave the context that was entered above, including when
        # evaluation fails, so entered contexts do not pile up per page.
        js.leave()

    # Sort newest first on the parsed datetime: the raw 'dd-Mon-YYYY' text
    # does not sort chronologically, which would let the early exit below
    # discard fresh rows.
    dated_proxies.sort(key=operator.itemgetter(0), reverse=True)

    for verified_at, proxy in dated_proxies:
        if not self.daysDelta(str(proxy.last_verify_time)):
            # Sorted newest first, so every remaining row is older.
            break
        record = IPItem()
        record['ip'] = proxy.ip
        record['port'] = proxy.port
        record['anonymous'] = proxy.anonymous
        record['http_type'] = proxy.http_type
        record['location'] = proxy.location
        record['latency'] = proxy.latency
        record['last_verify_time'] = verified_at
        record['source'] = self.allowed_domains[0]
        yield record
def parse_item(self, response):
    """Parse one page of the proxy table and schedule the next page.

    Every ``//table[@class='table']/tbody/tr`` row is turned into an
    ``Item``; the items are sorted newest-first on ``last_verify_time`` and
    emitted as ``IPItem`` objects until ``self.daysDelta`` rejects one.
    The following page is requested only when every parsed row was accepted.

    Args:
        response: Scrapy response; ``response.meta['page']`` holds the
            number of the page being parsed.

    Yields:
        ``IPItem`` objects, followed by at most one ``Request`` for
        ``page + 1``.
    """
    import logging
    import operator

    page = response.meta['page']
    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (utils.get_time_now(), "Target ==> " + response.url))
    if page > MAX_PAGE_INDEX_NS:
        return

    proxies = []
    for row in response.xpath("//table[@class='table']/tbody/tr"):
        try:
            proxy = Item(
                row.xpath('td[1]/text()').extract()[0],
                row.xpath('td[2]/text()').extract()[0],
                row.xpath('td[4]/text()').extract()[0],
                row.xpath('td[5]/text()').extract()[0],
                row.xpath('td[3]//text()').extract()[0],
                '0',  # constant; presumably the latency slot -- confirm against Item
                row.xpath('td[8]/text()').extract()[0])
        except IndexError:
            # A row with a missing cell must not abort the whole page (and
            # with it the pagination); report it and move on.
            logging.getLogger(__name__).warning(
                "Skipping malformed proxy row on %s", response.url)
            continue
        proxies.append(proxy)

    # Whether to stop crawling: an empty page (or one without a single
    # usable row) ends the pagination.
    if not proxies:
        return

    # Sort newest first.  The timestamps are compared as strings.
    # NOTE(review): assumes the site's timestamp text sorts chronologically.
    proxies.sort(key=operator.attrgetter("last_verify_time"), reverse=True)

    for proxy in proxies:
        if not self.daysDelta(proxy.last_verify_time):
            # The list is sorted, so everything after this row is rejected
            # as well; stop without requesting another page.
            return
        record = IPItem()
        record['ip'] = proxy.ip
        record['port'] = proxy.port
        record['anonymous'] = proxy.anonymous
        record['http_type'] = proxy.http_type
        record['location'] = proxy.location
        record['latency'] = proxy.latency
        record['last_verify_time'] = proxy.last_verify_time
        record['source'] = self.allowed_domains[0]
        yield record

    next_page = page + 1
    yield Request(self.start_urls[0] + '?page=' + str(next_page),
                  callback=self.parse_item,
                  meta={'page': next_page})
def parse_item(self, response):
    """Parse one page of the ``ip_list`` proxy table and schedule the next.

    The first ``<tr>`` of ``//table[@id='ip_list']`` is skipped; each
    remaining row becomes an ``Item``.  Items are sorted newest-first on
    ``last_verify_time`` and emitted as ``IPItem`` objects until
    ``self.daysDelta`` rejects one.  The following page is requested only
    when every parsed row was accepted.

    Args:
        response: Scrapy response; ``response.meta['page']`` is the current
            page number and ``response.meta['url']`` the URL prefix to which
            a page number is appended.

    Yields:
        ``IPItem`` objects, followed by at most one ``Request`` for
        ``page + 1``.

    Raises:
        ValueError: if an accepted row's timestamp does not match
            ``'%y-%m-%d %H:%M'``.
    """
    import logging
    import operator

    page = response.meta['page']
    url = response.meta['url']
    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (utils.get_time_now(), "Target ==> " + response.url))
    # NOTE: no page cap is applied here; crawling stops only on an empty
    # page or on the first rejected row.

    proxies = []
    for row in response.xpath("//table[@id='ip_list']/tr")[1:]:
        try:
            location = "".join(
                row.xpath('td[5]//a/text()').extract() +
                row.xpath('td[5]/text()').extract()).strip()
            proxy = Item(
                row.xpath('td[3]/text()').extract()[0],
                row.xpath('td[4]/text()').extract()[0],
                row.xpath('td[6]/text()').extract()[0],
                row.xpath('td[7]/text()').extract()[0],
                location,
                row.xpath('td[9]/div/@title').extract()[0],
                row.xpath('td[10]/text()').extract()[0])
        except IndexError:
            # A row with a missing cell must not abort the whole page (and
            # with it the pagination); report it and move on.
            logging.getLogger(__name__).warning(
                "Skipping malformed proxy row on %s", response.url)
            continue
        proxies.append(proxy)

    # Whether to stop crawling: an empty page (or one without a single
    # usable row) ends the pagination.
    if not proxies:
        return

    # Sort newest first.  The timestamps are compared as strings; the
    # '%y-%m-%d %H:%M' layout sorts chronologically when zero-padded.
    proxies.sort(key=operator.attrgetter("last_verify_time"), reverse=True)

    for proxy in proxies:
        if not self.daysDelta(proxy.last_verify_time):
            # The list is sorted, so everything after this row is rejected
            # as well; stop without requesting another page.
            return
        record = IPItem()
        record['ip'] = proxy.ip
        record['port'] = proxy.port
        record['anonymous'] = proxy.anonymous
        record['http_type'] = proxy.http_type
        record['location'] = proxy.location
        record['latency'] = proxy.latency
        record['last_verify_time'] = datetime.datetime.strptime(
            proxy.last_verify_time, '%y-%m-%d %H:%M')
        record['source'] = url
        yield record

    next_page = page + 1
    yield Request(url + str(next_page),
                  callback=self.parse_item,
                  meta={'page': next_page, 'url': url})
def parse_item(self, response):
    """Parse a proxy listing whose port numbers are produced by JavaScript.

    The page's first ``<body>`` script is evaluated in a PyV8 context, then
    each data row's port expression is evaluated term by term in that same
    context and the results are concatenated into the port string.  Rows
    are sorted newest-first on their parsed timestamp and emitted as
    ``IPItem`` objects until ``self.daysDelta`` rejects one.

    Args:
        response: Scrapy response for the listing page.

    Yields:
        ``IPItem`` objects for the accepted rows.

    Raises:
        IndexError: if the page has no ``//body/script[1]`` text.
    """
    import logging
    import operator

    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (utils.get_time_now(), "Target ==> " + response.url))

    main_js = response.xpath("//body/script[1]/text()").extract()
    rows = response.xpath("//table[2]/tr[4]/td/table/tr")

    dated_proxies = []  # (parsed timestamp, Item) pairs
    js = PyV8.JSContext()
    js.enter()
    try:
        # Evaluate the page script first so the port expressions below run
        # against whatever it defines.
        js.eval(main_js[0])
        # The first four rows and the last one are not parsed as data.
        for row in rows[4:-1]:
            try:
                ip_port = row.xpath("td[1]/font[2]")
                ip = ip_port.xpath("text()").extract()[0]
                port_script = ip_port.xpath("script/text()").extract()[0]
                # '\\/' is the same two characters the page contains
                # (backslash, slash), spelled without an invalid escape.
                port_expr = port_script.split('<\\/font>"+')[1]
                # Drop the trailing character, then evaluate each
                # '+'-separated term and concatenate the results.
                port = "".join(
                    str(js.eval(term)) for term in port_expr[:-1].split("+"))
                proxy = Item(
                    ip,
                    port,
                    row.xpath("td[3]/font/text()").extract()[0],
                    "".join(
                        row.xpath(
                            "td[2]/a/font[@class='spy1']/text()").extract() +
                        row.xpath(
                            "td[2]/a/font[@class='spy14']/text()").extract()),
                    "".join(
                        row.xpath("td[4]/font/text()").extract() +
                        row.xpath("td[4]/font/font/text()").extract()),
                    row.xpath("td[6]/font/text()").extract()[0],
                    row.xpath(
                        "td[9]/font/font[@class='spy14']/text()").extract()[0]
                    + row.xpath("td[9]/font/text()").extract()[0])
                # The last character is replaced by ':00' so the text
                # matches the seconds-bearing format below.
                verified_at = datetime.datetime.strptime(
                    proxy.last_verify_time[:-1] + ':00',
                    '%d-%b-%Y %H:%M:%S')
            except (IndexError, ValueError):
                # A row with a missing cell or an unparsable timestamp must
                # not abort the whole page; report it and move on.
                logging.getLogger(__name__).warning(
                    "Skipping malformed proxy row on %s", response.url)
                continue
            dated_proxies.append((verified_at, proxy))
    finally:
        # Always leave the context that was entered above, including when
        # evaluation fails, so entered contexts do not pile up per page.
        js.leave()

    # Sort newest first on the parsed datetime: the raw 'dd-Mon-YYYY' text
    # does not sort chronologically, which would let the early exit below
    # discard fresh rows.
    dated_proxies.sort(key=operator.itemgetter(0), reverse=True)

    for verified_at, proxy in dated_proxies:
        if not self.daysDelta(str(proxy.last_verify_time)):
            # Sorted newest first, so every remaining row is older.
            break
        record = IPItem()
        record['ip'] = proxy.ip
        record['port'] = proxy.port
        record['anonymous'] = proxy.anonymous
        record['http_type'] = proxy.http_type
        record['location'] = proxy.location
        record['latency'] = proxy.latency
        record['last_verify_time'] = verified_at
        record['source'] = self.allowed_domains[0]
        yield record