def parse(self, response):
    """Yield one loaded ``MovieItem`` for every data row of the ranking table.

    Each ``<tr>`` of the ``left rt_table`` table is fed to an ``ItemLoader``
    that extracts category, rank, tomato rating, title, review count and year.
    Header rows (rows containing a ``<th>``) are skipped.

    Args:
        response: The downloaded listing page.

    Yields:
        The item produced by ``loader.load_item()`` for each movie row.

    Raises:
        Exception: Any error raised while parsing is logged at ERROR level
            and then re-raised unchanged.
    """
    try:
        sel = Selector(response)
        # NOTE(review): the original author was unsure whether this XPath
        # still matches the page layout -- confirm against a live page.
        movies = sel.xpath('//table[@class="left rt_table"]/tbody/tr')

        # Make sure we have a list of movies; nothing to yield otherwise.
        if not movies:
            self.log(
                "Unable to find list of movies in {:s}.".format(response.request.url),
                level=log.ERROR,
            )
            return

        for movie in movies:
            # Ignore the header row(s): they contain <th> cells.
            if movie.xpath('th'):
                continue

            # Gather information about the movie, relative to its row.
            loader = ItemLoader(MovieItem(), response=response, selector=movie)
            # Absolute XPath: the category comes from the page-level form,
            # not from the row itself.
            loader.add_xpath(
                'category',
                '//form[@action="/top/bestofrt/"]/p/select/option[@selected="selected"]/text()',
            )
            loader.add_xpath('rank', 'td[1]/text()', re=r'\d+')  # ignore the '.'
            loader.add_xpath('rating_tomatoes', 'td[2]/span/span[2]/text()', re=r'\d+')
            # Title is everything before the trailing " (YYYY)".
            loader.add_xpath('title', 'td[3]/a/text()', re=r'.*(?= \([0-9]{4}\))')
            loader.add_xpath('review_count', 'td[4]/text()')
            # Year is the four digits right before the closing parenthesis.
            loader.add_xpath('year', 'td[3]/a/text()', re=r'\d{4}(?=\)$)')

            # NOTE: following the per-movie details page (td[3]/a/@href ->
            # parse_movie_details) is not wired up; items are emitted directly.
            yield loader.load_item()
    except Exception:
        # Log through the spider (``log`` is a module, not a callable),
        # then re-raise so the failure is not silently swallowed.
        self.log(
            "Could not parse URL '{:s}'".format(response.request.url),
            level=log.ERROR,
        )
        raise


def parse_movie_details(self, response, loader):
    """Yield the item built by *loader*.

    Args:
        response: The details-page response (currently unused).
        loader: An object exposing ``load_item()``.

    Yields:
        The result of ``loader.load_item()``.
    """
    # Process further details here
    yield loader.load_item()
* Scrapy提供5个logging级别 scrapy.log.CRITICAL 严重错误的Log级别 scrapy.log.ERROR 错误的Log级别 (Log level for errors) scrapy.log.WARNING 警告的Log级别 (Log level for warnings) scrapy.log.INFO 记录信息的Log级别(生产部署时推荐的Log级别) scrapy.log.DEBUG 调试信息的Log级别(开发时推荐的Log级别) * 使用 from scrapy import log log.msg("This is a warning", level=log.WARNING) * 在spider中添加log的推荐方式是使用Spider的 log() 方法 * 该方法会在调用 scrapy.log.msg() 时自动传入 spider 参数,其他的参数则直接传递给 msg() 方法 * 以上方式(scrapy.log)都已被标记为过时(废弃) ---------------------------- logging | ---------------------------- import logging logging.warning("This is a warning") logging.debug("This is a debug message") logging.log(logging.WARNING, "This is a warning") * 在scrapy的spider中使用 self.logger.warning('警告信息')
def process_response(self, request, response, spider):
    """Log the URL whose response arrived and pass the response on.

    Args:
        request: The request that produced *response*; its ``url`` is logged.
        response: The response being processed.
        spider: The spider the request belongs to; used for logging.

    Returns:
        The unmodified *response*. Presumably this is a Scrapy downloader
        middleware hook, which must return a Response or Request rather
        than ``None`` -- confirm against the enclosing class.
    """
    # ``log`` is a module, not a callable; log through the spider instead.
    spider.log('Response received from request url %s ' % (request.url))
    return response
def process_request(self, request, spider):
    """Log the URL that is about to be requested.

    Args:
        request: The outgoing request; its ``url`` is logged.
        spider: The spider the request belongs to; used for logging.

    Returns:
        None.
    """
    # ``log`` is a module, not a callable; log through the spider instead.
    spider.log('Requesting url %s with ' % (request.url))