def _process_html(self, path, url, task, site):
    """Crawl the HTML document at *path*, rewrite its links, and queue follow-ups.

    Links accepted for download are rewritten to local paths (only
    scheme-less, relative links are rewritten; anything containing a
    ``:`` — e.g. ``mailto:``, ``javascript:``, ``http:`` — is left
    untouched).  After crawling, each rewritten relative link is turned
    into an absolute URL and registered on *task* as a spider action.

    :param path: site-relative path of the HTML document being processed
    :param url:  URL the document was fetched from
    :param task: receives ``add_action("<url>==><local path>")`` calls
    :param site: used to resolve *path* to a real filesystem path
    """
    url_mapping = {}

    def _process_link(link):
        # Links we refuse to download pass through unchanged.
        if not self._accept_download(link):
            return link
        # 'has_key' is Python-2-only; 'in' is equivalent and portable.
        if link not in url_mapping:
            if ":" in link:  # mailto:, javascript:, http: — keep as-is
                url_mapping[link] = link
            else:
                url_mapping[link] = utils.absolute_path(path, link)
        return url_mapping[link]

    crawler = LinkCrawler()
    crawler.crawling(site.real_path(path), url, _process_link)

    # Use a distinct loop variable instead of shadowing the 'path'
    # parameter (which _process_link above closes over).
    # 'items' replaces Python-2-only 'iteritems'.
    for link, local_path in url_mapping.items():
        if ":" in link:
            continue  # scheme'd links were never rewritten; nothing to spider
        abs_url = utils.absolute_url(url, link)
        self.logger.info("add spider:%s==>%s" % (abs_url, local_path))
        task.add_action("%s==>%s" % (abs_url, local_path))