import time
from os import path
from multiprocessing import Queue


def url_manager_proc(self, url_q: Queue, conn_q: Queue, root_url):
    print('url manager process start...')
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print('url manager process started...')
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            print('new_url', new_url)
            # Send the new URL to the crawler (worker) nodes
            url_q.put(new_url)
            # Stop after more than 2000 links have been crawled
            # and save progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that work is finished
                url_q.put('end')
                print('Control node issued the stop notification')
                # Shut down the manager node while persisting the URL sets
                url_manager.save_progress(path.join('dist', 'new_urls.txt'),
                                          url_manager.new_urls)
                url_manager.save_progress(path.join('dist', 'old_urls.txt'),
                                          url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)
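The function above relies on a UrlManager that provides add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size, save_progress, and the new_urls/old_urls sets. Its real implementation is not shown in this section, so the following is only a minimal sketch; the two-set design and the pickle-based save_progress are assumptions.

import pickle


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        # Skip URLs that are empty or already known
        if url is not None and url not in self.new_urls \
                and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        # Move a URL from the pending set to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self, file_name, data):
        # Persist a URL set so an interrupted crawl can be resumed
        with open(file_name, 'wb') as f:
            pickle.dump(data, f)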
class Scheduler(object):
    def __init__(self):
        self.url_manager = UrlManager()
        self.downloader = Downloader()
        self.parser = Parser()
        self.data_output = DataOutput()

    def crawl(self, start_url, max_page):
        self.url_manager.add_new_url(start_url)
        while (self.url_manager.has_new_url()
               and self.url_manager.old_url_size() < max_page):
            page_url = self.url_manager.get_new_url()
            page_html = self.downloader.down(page_url)
            new_urls, new_data = self.parser.parse(start_url, page_html)
            self.url_manager.add_new_urls(new_urls)
            self.data_output.store_data(new_data)
            self.data_output.output_html()
            print('Entry %s written' % self.url_manager.old_url_size())
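A minimal sketch of driving the single-machine Scheduler above; the seed URL and the 100-page cap are placeholder assumptions, not values from the original code.

if __name__ == '__main__':
    scheduler = Scheduler()
    # Crawl at most 100 pages starting from a placeholder seed URL
    scheduler.crawl('https://example.com/', 100)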
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print(url_q)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=%s' % url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                url_q.put('end')
                print('Control node issued the stop notification!')
                url_manager.save_progress('new_urls.txt',
                                          url_manager.new_urls)
                url_manager.save_progress('old_urls.txt',
                                          url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)
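For context, this is one way the URL-manager process might be started on the control node. The NodeManager wrapper class and the seed URL are hypothetical, and plain multiprocessing.Queue objects are used here for simplicity; a real distributed setup would share the queues between machines.

from multiprocessing import Process, Queue

# NodeManager is a hypothetical class assumed to contain url_manager_proc
# as a method, matching the self parameter in its signature.
if __name__ == '__main__':
    url_q = Queue()    # manager -> crawler nodes: URLs to fetch
    conn_q = Queue()   # result process -> manager: newly found URLs
    node = NodeManager()
    p = Process(target=node.url_manager_proc,
                args=(url_q, conn_q, 'https://example.com/'))
    p.start()
    p.join()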