def parse(self, response):
    """Handle a listing page and yield the follow-up requests for it.

    Steps, in order:

    1. Extract the detail-page links and the "next page" link from the
       response.
    2. On the first page of a listing, add the listing's reported total to
       ``self.total``.  When that total exceeds 999 the listing cannot be
       paginated to the end, so one request per entry in
       ``GOURMET_CATEGORY`` is yielded instead (only when the page has no
       category or is category ``"01"``; otherwise the overflow is just
       logged).
    3. When ``self.scan_mode`` is truthy, stop here: only counting and
       category fan-out happen, no detail pages are requested.
    4. Yield a ``parse_salon`` request for every link whose canonical URL
       (query string stripped) is not already known to
       ``LocationRestaurantEs.check_by_url``.
    5. Yield a ``parse`` request for the next listing page, if any.

    Every yielded request uses ``self.parse_err`` as errback and carries
    ``meta['page_kind']`` set to ``'list'`` or ``'salon'``.

    :param response: the downloaded listing page.
    :returns: generator of requests.
    """

    def make_request(url, callback, page_kind):
        # Single place that builds requests so the errback and the
        # ``page_kind`` tag stay consistent across all three call sites.
        request = Request(url, callback=callback, errback=self.parse_err)
        request.meta['page_kind'] = page_kind
        return request

    sel = Selector(response)
    links = sel.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, LocationHotelSelectors.NEXT_URL)

    # Single-argument ``print(...)`` emits the same text under the Python 2
    # print statement and the Python 3 print function.
    summary = u'links: %s, %s' % (len(links), response.url)
    print(summary)
    if not links:
        # An empty listing is worth a persistent log entry, not just stdout.
        self.log_message(summary)

    if LocationHotelSelectors.is_first_page(sel):
        total = LocationHotelSelectors.get_list_total(sel)
        self.total += total
        if total > 999:
            # yahoo search can not paginate beyond 1000 items,
            # so need to run crawler for smaller areas or categories
            page_cat = LocationHotelSelectors.get_category(sel)
            if page_cat and page_cat != "01":
                # Already narrowed to a specific category and still too
                # large: nothing further to split on here, just report it.
                self.log_message(u'Pagination overflow: %s' % response.url)
            else:
                for category in GOURMET_CATEGORY:
                    # Use a dedicated name: the original reused
                    # ``next_page`` here, which discarded the real
                    # pagination link and re-requested the last category
                    # URL at the end of this method.
                    category_url = response.url.replace(
                        'genrecd=01', 'genrecd=%s' % category)
                    print(u'new links --> %s' % category_url)
                    yield make_request(category_url, self.parse, 'list')
        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; this check is assumed to belong to the first-page
        # branch (where ``self.total`` is updated) -- confirm.
        if self.start_urls[-1] == response.url:
            self.log_message(u'Counted this many places: %s' % self.total)

    if self.scan_mode:
        return

    for link in links:
        # Known-URL lookup is done on the link without its query string.
        canonical = link.split('?')[0]
        if LocationRestaurantEs.check_by_url(canonical):
            continue
        yield make_request(link, self.parse_salon, 'salon')

    if next_page:
        yield make_request(next_page, self.parse, 'list')