예제 #1
0
    def craw(self):
        """Crawl starting from ``self.root``.

        Repeats until the URL manager is drained: pops up to
        ``self.threadNum`` pending URLs, downloads each one in its own
        thread (the downloader appends a ``{"url": ..., "html": ...}``
        dict to ``_content``), then parses every fetched page for new
        links and runs the script plugins against it.

        :return: None
        """
        self.urls.add_new_url(self.root)
        while self.urls.has_new_url():
            _content = []
            th = []
            # Dispatch up to threadNum concurrent downloads.
            for _ in range(self.threadNum):
                if not self.urls.has_new_url():
                    break
                new_url = self.urls.get_new_url()
                print("craw:" + new_url)
                t = threading.Thread(target=self.download.download,
                                     args=(new_url, _content))
                t.start()
                th.append(t)
            # Wait for the whole batch before touching _content.
            for t in th:
                t.join()

            for _str in _content:
                if _str is None:
                    continue
                # Fix: resolve links against the page's own URL, not the
                # last URL popped by the dispatch loop above.
                new_urls = self._parse(_str["url"], _str["html"])
                disallow = ["sqlcheck"]
                _plugin = plugin.spiderplus("script", disallow)
                _plugin.work(_str["url"], _str["html"])
                self.urls.add_new_urls(new_urls)
예제 #2
0
 def craw(self):
     """
     Crawler entry point.

     Drains the URL manager in batches of ``self.threadNum``: each
     pending URL is downloaded in its own thread (the downloader
     appends a ``{"url": ..., "html": ...}`` dict to ``_content``),
     then every fetched page is parsed for new links and passed to
     the script plugins.

     :return: None
     """
     self.urls.add_new_url(self.root)
     while self.urls.has_new_url():
         _content = []
         th = []
         # Dispatch up to threadNum concurrent downloads.
         for _ in range(self.threadNum):
             if not self.urls.has_new_url():
                 break
             new_url = self.urls.get_new_url()
             # Fix: log message said "crwa" — typo for "craw".
             print("craw: %s" % new_url)
             t = threading.Thread(target=self.download.download,
                                  args=(new_url, _content))
             t.start()
             th.append(t)
         # Wait for the whole batch before touching _content.
         for t in th:
             t.join()
         for _str in _content:
             if _str is None:
                 continue
             # Fix: resolve links against the page's own URL, not the
             # last URL popped by the dispatch loop above.
             new_urls = self._parse(_str['url'], _str['html'])
             disallow = []
             _plugin = plugin.spiderplus('script', disallow)
             _plugin.work(_str['url'], _str['html'])
             self.urls.add_new_urls(new_urls)