def parse(self):
    """Collect the links found in ``self.doc`` and queue them for crawling.

    For every ``a``/``link`` element the ``href`` attribute is read, and
    for every ``img``/``iframe`` element the ``src`` attribute. Each value
    is resolved against ``self.url`` with ``urljoin`` and passed through
    ``normalize_url``; results that come back as ``None`` are dropped.

    The de-duplicated set of URLs is sent via ``on_parse.send`` (with this
    object as sender) and each URL is then handed to ``self.state.crawl``
    with ``path=self.next_path``.
    """
    tags = [('a', 'href'), ('img', 'src'), ('link', 'href'),
            ('iframe', 'src')]
    # TODO: check rel="canonical"

    # The base URL does not change while parsing, so look it up once
    # instead of once per link.
    base_url = self.url
    urls = set()
    for tag_name, attr_name in tags:
        for tag in self.doc.findall('.//%s' % tag_name):
            attr = tag.get(attr_name)
            if attr is None:
                continue
            try:
                joined = urljoin(base_url, attr)
            except ValueError:
                # urljoin rejects some malformed references (for example
                # an unterminated IPv6 bracket). Skip just this link so
                # one bad attribute does not discard the rest of the page.
                continue
            url = normalize_url(joined)
            if url is not None:
                urls.add(url)

    on_parse.send(self, urls=urls)
    for url in urls:
        self.state.crawl(url, path=self.next_path)
def url(self):
    """Return the URL to use for this object.

    If ``self._has_response()`` is true, the URL reported by
    ``self._response`` is passed through ``normalize_url`` and returned.
    Otherwise ``self.raw_url`` is returned unchanged.
    """
    # Read the fallback first, matching the original evaluation order.
    fallback = self.raw_url
    if not self._has_response():
        return fallback
    return normalize_url(self._response.url)
def seeds(self):
    """Return the list of usable seed URLs, computing it on first access.

    On the first call the values of ``self.get_list('seed')`` are passed
    through ``normalize_url``; entries that come back as ``None`` are
    discarded. The resulting list is stored on ``self._seeds`` and that
    stored list is returned on every later call.
    """
    if hasattr(self, '_seeds'):
        return self._seeds

    usable = []
    for raw_seed in self.get_list('seed'):
        candidate = normalize_url(raw_seed)
        if candidate is not None:
            usable.append(candidate)
    self._seeds = usable
    return self._seeds