# Needs "import threading" and the project's "plugin" module at the top of the file.
def craw(self):
    """
    Crawler entry point.
    :return:
    """
    self.urls.add_new_url(self.root)
    while self.urls.has_new_url():
        _content = []
        th = []
        # Dispatch up to threadNum downloader threads; each one appends
        # its result to the shared _content list.
        for i in range(self.threadNum):
            if not self.urls.has_new_url():
                break
            new_url = self.urls.get_new_url()
            print("craw: %s" % new_url)
            t = threading.Thread(target=self.download.download,
                                 args=(new_url, _content))
            t.start()
            th.append(t)
        for t in th:
            t.join()
        # Parse each downloaded page against its own URL, run the scan
        # plugins on it, and queue any newly discovered links.
        for _str in _content:
            if _str is None:
                continue
            new_urls = self._parse(_str["url"], _str["html"])
            disallow = ["sqlcheck"]
            _plugin = plugin.spiderplus("script", disallow)
            _plugin.work(_str["url"], _str["html"])
            self.urls.add_new_urls(new_urls)