def url_manage_proc(self, url_q, conn_q, root_url, page_num):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print('url_manager is working...')
    while True:
        while url_manager.has_new_url():
            # Get a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to the worker nodes
            url_q.put(new_url)
            # Once more than page_num links have been crawled, stop the crawl and save progress
            if url_manager.old_urls_size() > page_num:
                # Notify the crawler nodes to finish their work
                url_q.put('end')
                print('Control node sent the stop notification!')
                # Shut the node down and save its state at the same time
                url_manager.save_process('new_urls.txt', url_manager.new_urls)
                url_manager.save_process('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        print('url control working..., solve result')
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except Exception as e:
            time.sleep(1)  # brief pause
        print('pages crawled so far:', url_manager.old_urls_size())
        time.sleep(5)

def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    # url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Get a new URL from the URL manager
            new_url = url_manager.get_new_url()
            print(new_url)
            # Send the new URL to the worker nodes
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Once more than 2000 links have been crawled, shut down and save progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that work is finished
                url_q.put('end')
                print('Control node sent the stop notification!')
                # Shut down the manager node and save the state of the URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)  # brief pause
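
# Both control-node variants above lean on a UrlManager helper whose interface is only
# implied by the calls they make (add_new_url, add_new_urls, has_new_url, get_new_url,
# old_url_size / old_urls_size, save_progress / save_process, plus the new_urls and
# old_urls sets). The sketch below is a minimal, assumed implementation for reference,
# not the project's actual class: the two in-memory sets for de-duplication and the
# pickle-based persistence are assumptions.
import pickle


class UrlManager(object):
    """Minimal sketch of the URL manager assumed by the functions above."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already handed out

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self, path, data):
        # Persist a URL set so an interrupted crawl can be resumed later
        with open(path, 'wb') as f:
            pickle.dump(data, f)
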
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # Keep crawling while there are new URLs and fewer than 100 have been fetched
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
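
# A minimal way to drive the single-process SpiderMan above. The seed URL is a
# placeholder, and UrlManager, HtmlDownloader, HtmlParser and DataOutput are assumed
# to be importable from the surrounding project.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://example.com/start-page')  # hypothetical seed URL
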
class Spiderman(object):
    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        while self.manage.has_new_url() and self.manage.old_url_size() < 100:
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('Crawled %s links so far' % self.manage.old_url_size())
            except Exception:
                print('crawl failed')
        self.output.output_html()
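
# Both SpiderMan variants delegate page fetching to HtmlDownloader.download(url).
# The sketch below is an assumed implementation based on requests, not the project's
# actual downloader; the User-Agent string, timeout and encoding handling are guesses.
import requests


class HtmlDownloader(object):
    """Minimal sketch of the downloader assumed by the crawler classes above."""

    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # assumed User-Agent
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Let requests guess the encoding before returning the page text
            response.encoding = response.apparent_encoding
            return response.text
        return None
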
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        if url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                url_q.put('end')
                print('Manager notify ending!')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)
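
# The url_manager_proc variants above assume a control node that runs them in their own
# process and talks to the crawler nodes through two queues: url_q carries URLs (and the
# 'end' marker) to the workers, while conn_q brings newly discovered URLs back. The
# sketch below is a hypothetical, local demonstration of that queue protocol using
# multiprocessing; crawler_proc and the placeholder URLs are not part of the original
# project.
import multiprocessing
import time


def crawler_proc(url_q, conn_q):
    # Hypothetical worker loop: consume URLs until the 'end' marker arrives
    while True:
        url = url_q.get()
        if url == 'end':
            break
        # A real worker would download and parse `url` here, then report the
        # links it found back to the manager through conn_q.
        conn_q.put([url + '/next'])  # placeholder payload
        time.sleep(0.1)


if __name__ == '__main__':
    url_q = multiprocessing.Queue()   # manager -> crawler nodes
    conn_q = multiprocessing.Queue()  # results -> manager
    worker = multiprocessing.Process(target=crawler_proc, args=(url_q, conn_q))
    worker.start()
    url_q.put('http://example.com/start-page')
    url_q.put('end')
    worker.join()
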