from urllib.parse import urljoin

from xpaw import Spider, HttpRequest, Selector, run_spider


class XPathQuotesSpider(Spider):
    """Scrape quotes.toscrape.com with XPath selectors and log each quote.

    Demonstrates element extraction (`.text`), attribute extraction
    (`@href`), and resolving a relative link against the response URL.
    """

    def start_requests(self):
        yield HttpRequest('http://quotes.toscrape.com/', callback=self.parse)

    def parse(self, response):
        selector = Selector(response.text)
        for quote in selector.xpath('//div[@class="quote"]'):
            text = quote.xpath('.//span[@itemprop="text"]')[0].text
            author = quote.xpath('.//small[@itemprop="author"]')[0].text
            author_url = quote.xpath('.//span/a/@href')[0].text
            # The author link is site-relative; make it absolute.
            author_url = urljoin(str(response.url), author_url)
            tags = quote.xpath('.//div[@class="tags"]/a').text
            self.log('quote: %s',
                     dict(text=text, tags=tags, author=author,
                          author_url=author_url))
        # BUGFIX: the last page has no "next" link, so indexing [0] on an
        # empty selection raised IndexError. Guard before dereferencing.
        next_links = selector.xpath('//li[@class="next"]/a/@href')
        if next_links:
            self.log('next page url: %s', next_links[0].text)


if __name__ == '__main__':
    run_spider(XPathQuotesSpider, log_level='DEBUG')
# coding=utf-8
from xpaw import Spider, HttpRequest, Selector, every, run_spider


class CronJobSpider(Spider):
    """Fetch Baidu News every 10 seconds and log the hot-news headlines.

    Demonstrates the `@every` decorator for periodic (cron-like) crawling.
    """

    @every(seconds=10)
    def start_requests(self):
        # dont_filter=True: the same URL is fetched on every tick, so it
        # must bypass the duplicate-request filter.
        yield HttpRequest("http://news.baidu.com/", callback=self.parse,
                          dont_filter=True)

    def parse(self, response):
        selector = Selector(response.text)
        hot = selector.css("div.hotnews a").text
        self.log("Hot News:")
        # IDIOM: enumerate(hot, 1) replaces `for i in range(len(hot))`
        # with i + 1 / hot[i] indexing — identical log output.
        for rank, headline in enumerate(hot, 1):
            self.log("%s: %s", rank, headline)


if __name__ == '__main__':
    run_spider(CronJobSpider)
# coding=utf-8
from xpaw import Spider, HttpRequest, Selector, run_spider


class BaiduNewsSpider(Spider):
    """Fetch the Baidu News front page once and log the hot-news headlines."""

    def start_requests(self):
        yield HttpRequest("http://news.baidu.com/", callback=self.parse)

    def parse(self, response):
        selector = Selector(response.text)
        hot = selector.css("div.hotnews a").text
        self.log("Hot News:")
        # IDIOM: enumerate(hot, 1) replaces `for i in range(len(hot))`
        # with i + 1 / hot[i] indexing — identical log output.
        for rank, headline in enumerate(hot, 1):
            self.log("%s: %s", rank, headline)


if __name__ == '__main__':
    run_spider(BaiduNewsSpider)
# NOTE(review): this chunk is truncated — it opens mid-statement
# (`callback=self.login)` closes an HttpRequest(...) call whose start, along
# with the TestSpider class header, lies outside the visible range), and the
# whitespace has been mangled onto a single line. Reconstructing indentation
# would require guessing the missing context, so the line is kept verbatim.
# Visible pieces: login() copies the first Set-Cookie fragment into
# self.headers['Cookie']; parse_html() is a no-op (its body — a thesis-review
# status check via re.findall — is commented out); close() prints a random
# two-digit number wrapped in <acq_num> tags. TODO: restore original line
# breaks from the source file before editing further.
callback=self.login) def login(self, response): try: self.headers['Cookie'] = response.headers['Set-Cookie'].split( ";")[0] print(self.headers) except Exception as e: print("<error>" + str(e) + "</error>") def parse_html(self, response): pass # try: # html = response.text # print(html) # res = re.findall("论文外审结果.*?</td>",html) # if "待专家评审" not in res[0]: # print("出结果了。。。") # else: # print("还没结果") # # raise Exception("test") # except Exception as e: # print("<error>" + str(e) + "</error>") def close(self): print("<acq_num>" + str(int(random.random() * 100)) + "</acq_num>") if __name__ == '__main__': run_spider(TestSpider, log_level='INFO')
from xpaw import Spider, HttpRequest, run_spider
from xpaw.errors import HttpError, ClientError


class ErrorHandlingSpider(Spider):
    """Demonstrate per-request error callbacks (errback) for failed requests."""

    start_urls = [
        "http://www.python.org/",             # 200 OK
        "http://www.httpbin.org/status/404",  # 404 Not Found
        "http://www.httpbin.org/status/500",  # 500 Internal Server Error (comment fixed)
        "http://unknown/",                    # unresolvable host -> ClientError
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield HttpRequest(url, errback=self.handle_error)

    def parse(self, response):
        self.logger.info('Successful response: %s', response)

    def handle_error(self, request, error):
        if isinstance(error, HttpError):
            # HttpError carries the non-2xx response that triggered it.
            response = error.response
            self.logger.error('HttpError on %s: HTTP status=%s',
                              request.url, response.status)
        elif isinstance(error, ClientError):
            self.logger.error('ClientError on %s: %s', request.url, error)
        else:
            # ROBUSTNESS: previously any other error type was silently
            # dropped; log it so failures are never invisible.
            self.logger.error('Unexpected error on %s: %s', request.url, error)


if __name__ == '__main__':
    # Retries disabled so every failure reaches the errback immediately.
    run_spider(ErrorHandlingSpider, retry_enabled=False)
# coding=utf-8
from xpaw import Spider, HttpRequest, Selector, run_spider


class AsyncGeneratorSpider(Spider):
    """Spider whose callbacks are native async generators.

    Requires Python 3.6+ (async generator support).
    """

    async def start_requests(self):
        yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)

    async def parse(self, response):
        page = Selector(response.text)
        top_tags = page.xpath("//div[contains(@class, 'tags-box')]//a").text
        self.log("Top ten tags: %s", top_tags)
        # Yield a follow-up request from inside the async callback.
        yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)


if __name__ == '__main__':
    run_spider(AsyncGeneratorSpider)
# coding=utf-8
from xpaw import Spider, HttpRequest, Selector, run_spider


class RenderingSpider(Spider):
    """Scrape a JavaScript-rendered page by requesting it with render=True."""

    def start_requests(self):
        yield HttpRequest('http://quotes.toscrape.com/js/',
                          callback=self.parse,
                          render=True)

    def parse(self, response):
        page = Selector(response.text)
        for block in page.css('div.quote'):
            quote_text = block.css('span.text')[0].text
            quote_author = block.css('small.author')[0].text
            self.log(quote_author + ": " + quote_text)


if __name__ == '__main__':
    run_spider(RenderingSpider)