def parse(self, response):
    """Extract proxy entries from a fetched page.

    Yields one ``{'ip': ..., 'port': ...}`` dict per table row
    (skipping the header row). Yields nothing on a non-200 response.
    """
    # Guard clause: a failed fetch produces no items.
    if response.status_code != 200:
        return
    log('{}抓取成功'.format(response.url))
    doc = PyQuery(response.text)
    # nth-of-type(n+2) skips the header row of the proxy table.
    rows = doc('.boxindex table tr:nth-of-type(n+2)')
    for row in rows.items():
        cells = row('td')
        yield {
            'ip': cells.eq(0).text(),
            'port': cells.eq(1).text()
        }
def spider(self):
    """Crawl both proxy categories ('nn' = anonymous, 'nt' = transparent
    — NOTE(review): category meaning inferred from the site, confirm).

    Yields parsed proxy dicts on success; on a failed fetch, logs the
    failure and yields a ``Request`` so the URL can be retried later.
    """
    # Renamed from `type` to avoid shadowing the builtin.
    for proxy_type in ('nn', 'nt'):
        for page in range(self.begin_page, int(self.total_page) + 1):
            url = self.url.format(page=page, type=proxy_type)
            try:
                response = requests_get(url)
                if response.status_code != 200:
                    raise RequestException
                # Delegate directly to the parse generator.
                yield from self.parse(response)
            except RequestException:
                log('{}抓取失败, 加入重试的队列中'.format(url))
                # Hand back a retryable request instead of dropping the URL.
                yield Request('GET', url)
def spider(self):
    """Crawl all pages of a cookie-protected proxy listing.

    Refreshes the session cookie first, then fetches each page with the
    instance headers and the TIME_OUT env-configured timeout. Yields
    parsed proxy dicts on success; on failure, logs and yields a
    ``Request`` for later retry.
    """
    # The site rejects requests without a fresh cookie.
    self.set_cookie()
    for page in range(self.begin_page, int(self.total_page) + 1):
        url = self.url.format(page=page)
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=env('TIME_OUT'))
            if response.status_code != 200:
                raise RequestException
            # Delegate directly to the parse generator.
            yield from self.parse(response)
        except RequestException:
            log('{}抓取失败, 加入重试的队列中'.format(url))
            # Hand back a retryable request instead of dropping the URL.
            yield Request('GET', url)