示例#1
0
文件: Site.py 项目: emop/task-worker
    def _process_html(self, path, url, task, site):
        """Rewrite the links of a downloaded page and queue the linked files.

        A callback is handed to ``LinkCrawler.crawling`` (presumably invoked
        once per link found in the page -- confirm against LinkCrawler).
        Each accepted link is mapped to a replacement value, and afterwards
        one task action of the form ``"<absolute url>==><local path>"`` is
        added for every accepted link that contains no ``:``.

        Args:
            path: Page path; passed to ``site.real_path`` and used as the
                base for ``utils.absolute_path``.
            url: Page URL; passed to the crawler and used as the base for
                ``utils.absolute_url``.
            task: Object whose ``add_action`` receives the queued actions.
            site: Object providing ``real_path(path)``.
        """
        url_mapping = {}

        def _process_link(l):
            """Return the replacement for link ``l``, caching it per link."""
            if not self._accept_download(l):
                return l

            # `in` instead of dict.has_key(), which no longer exists on
            # Python 3 dictionaries.
            if l not in url_mapping:
                if ":" in l:  # mailto:, javascript:, http:
                    url_mapping[l] = l
                else:
                    url_mapping[l] = utils.absolute_path(path, l)

            return url_mapping[l]

        crawler = LinkCrawler()
        crawler.crawling(site.real_path(path), url, _process_link)

        # The loop variable must not be called `path`: the closure above
        # reads `path` late, so rebinding it here would change what
        # _process_link resolves against.
        for link, local_path in url_mapping.items():
            if ":" in link:
                # Links with a scheme were kept as-is; nothing to download.
                continue

            action = "%s==>%s" % (utils.absolute_url(url, link), local_path)
            self.logger.info("add spider:%s" % action)
            task.add_action(action)
示例#2
0
 def _process_html(self, path, url, task, site):
     """Rewrite the links of a downloaded page and queue the linked files.

     A callback is handed to ``LinkCrawler.crawling`` (presumably invoked
     once per link found in the page -- confirm against LinkCrawler).
     Each accepted link is mapped to a replacement value, and afterwards
     one task action of the form ``"<absolute url>==><local path>"`` is
     added for every accepted link that contains no ``:``.

     Args:
         path: Page path; passed to ``site.real_path`` and used as the
             base for ``utils.absolute_path``.
         url: Page URL; passed to the crawler and used as the base for
             ``utils.absolute_url``.
         task: Object whose ``add_action`` receives the queued actions.
         site: Object providing ``real_path(path)``.
     """
     url_mapping = {}

     def _process_link(l):
         """Return the replacement for link ``l``, caching it per link."""
         if not self._accept_download(l):
             return l

         # `in` instead of dict.has_key(), which no longer exists on
         # Python 3 dictionaries.
         if l not in url_mapping:
             if ":" in l:  # mailto:, javascript:, http:
                 url_mapping[l] = l
             else:
                 url_mapping[l] = utils.absolute_path(path, l)

         return url_mapping[l]

     crawler = LinkCrawler()
     crawler.crawling(site.real_path(path), url, _process_link)

     # The loop variable must not be called `path`: the closure above
     # reads `path` late, so rebinding it here would change what
     # _process_link resolves against.
     for link, local_path in url_mapping.items():
         if ":" in link:
             # Links with a scheme were kept as-is; nothing to download.
             continue

         action = "%s==>%s" % (utils.absolute_url(url, link), local_path)
         self.logger.info("add spider:%s" % action)
         task.add_action(action)