def parse_html(self, response, lru):
    """Yield follow-up requests for the links extracted from an HTML response.

    Each link returned by ``self.link_extractor.extract_links(response)`` is
    converted to an LRU with ``url_to_lru``.  Links whose URL cannot be
    converted (``ValueError``) are logged at ERROR level and skipped.  A
    ``Request`` with ``self.parse`` as callback is yielded for every remaining
    link that ``self._should_follow`` accepts and whose URL does not carry one
    of the extensions in ``self.ignored_exts``.

    :param response: the response being processed; ``response.meta['depth']``
        is read and forwarded to ``self._should_follow``.
    :param lru: LRU of the page the response belongs to, forwarded to
        ``self._should_follow`` together with each link's LRU.
    :returns: a generator of ``Request`` objects.
    """
    # NOTE(review): only follow-up requests are emitted here; no item
    # describing the page itself (or its outgoing links) is produced by this
    # method -- confirm that this is intended.
    depth = response.meta['depth']
    for link in self.link_extractor.extract_links(response):
        try:
            lrulink = url_to_lru(link.url)
        except ValueError as e:
            # A single malformed link must not abort the whole page.
            self.log("Error converting URL to LRU: %s" % e, log.ERROR)
            continue
        # Keep the follow-policy check first so the extension test only runs
        # for links that would otherwise be crawled.
        if (self._should_follow(depth, lru, lrulink)
                and not url_has_any_extension(link.url, self.ignored_exts)):
            yield Request(link.url, callback=self.parse)
def parse(self, response):
    """Dispatch a downloaded response to the matching page handler.

    The response URL is converted to an LRU with ``url_to_lru`` (any
    exception it raises propagates to the caller).  ``HtmlResponse``
    instances are handed to ``self.parse_html``; every other response type is
    handed to ``self._make_raw_page``.  The chosen handler's return value is
    returned unchanged.
    """
    page_lru = url_to_lru(response.url)
    # Select the handler from the response type, then make a single call.
    handler = (
        self.parse_html
        if isinstance(response, HtmlResponse)
        else self._make_raw_page
    )
    return handler(response, page_lru)