def __init__(self,
             to_download_q: PriorityQueue,
             downloader_parser_q: PriorityQueue,
             result_q: Queue,
             parser_worker_count,
             downloader_worker_count,
             resulter_worker_count,
             speed=None,
             session=None):
    """Store the queues, prepare worker bookkeeping and build the task manager.

    Args:
        to_download_q: Queue handed to the task manager and stored for the
            workers.
        downloader_parser_q: Queue stored for the workers.
        result_q: Queue stored for the workers.
        parser_worker_count: Number of parser workers; coerced with ``int``.
        downloader_worker_count: Number of downloader workers; coerced with
            ``int``.
        resulter_worker_count: Number of resulter workers; coerced with
            ``int``.
        speed: When not ``None``, ``TaskManager.download_wait`` is set to
            ``1 / speed``. The attribute lives on the class, so the new
            value is visible to every user of ``TaskManager``.
        session: ``requests`` session to keep on the instance. When omitted,
            a new session is created for this instance.

    Raises:
        ZeroDivisionError: If ``speed`` is ``0``.
    """
    self.parser_worker_count = int(parser_worker_count)
    self.downloader_worker_count = int(downloader_worker_count)
    self.resulter_worker_count = int(resulter_worker_count)

    # Lists that will hold the started worker objects.
    self.downloader_worker = []
    self.parser_worker = []
    self.resulter_worker = []

    self.log = Log("Crawler")

    self.to_download_q = to_download_q
    self.downloader_parser_q = downloader_parser_q
    self.result_q = result_q

    if speed is not None:
        TaskManager.download_wait = 1 / speed

    self.task_manager = TaskManager(self.to_download_q)

    # Default values are evaluated once, when the function is defined. A
    # session built in the signature would be created at import time and
    # shared by every instance that omits the argument, so build it here.
    self.session = requests.session() if session is None else session
    self.lock = LOCK
    self.task_manager_thread = Thread(target=self.task_manager.run)
def __init__(self,
             to_download_q: PriorityQueue,
             downloader_parser_q: PriorityQueue,
             result_q: Queue,
             name: str):
    """Initialise the worker with its three queues and a named logger.

    Args:
        to_download_q: Queue stored as ``self.to_download_q``.
        downloader_parser_q: Queue stored as ``self.downloader_parser_q``.
        result_q: Queue stored as ``self.result_q``.
        name: Forwarded to the base-class initialiser as ``name``.
    """
    # The base class must be initialised first: ``self.name`` is read below.
    super().__init__(name=name)

    (self.to_download_q,
     self.downloader_parser_q,
     self.result_q) = (to_download_q, downloader_parser_q, result_q)

    # Exit flag, initially cleared.
    self._exit = False
    self.log = Log(self.name)
class Crawler:
    """Start the downloader, parser and resulter workers and shut them down."""

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 parser_worker_count,
                 downloader_worker_count,
                 resulter_worker_count,
                 speed=None,
                 session=None):
        """Store the queues, prepare worker bookkeeping and build the task manager.

        Args:
            to_download_q: Queue passed to the task manager and to every
                worker.
            downloader_parser_q: Queue passed to every worker.
            result_q: Queue passed to every worker.
            parser_worker_count: Number of ``Parser`` threads; coerced with
                ``int``.
            downloader_worker_count: Number of ``Downloader`` threads;
                coerced with ``int``.
            resulter_worker_count: Number of ``Resulter`` threads; coerced
                with ``int``.
            speed: When not ``None``, ``TaskManager.download_wait`` is set
                to ``1 / speed``. The attribute lives on the class, so the
                new value is visible to every user of ``TaskManager``.
            session: ``requests`` session handed to every ``Downloader``.
                When omitted, a new session is created for this crawler.

        Raises:
            ZeroDivisionError: If ``speed`` is ``0``.
        """
        self.parser_worker_count = int(parser_worker_count)
        self.downloader_worker_count = int(downloader_worker_count)
        self.resulter_worker_count = int(resulter_worker_count)

        # Started worker threads, filled in by ``start``.
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []

        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        if speed is not None:
            TaskManager.download_wait = 1 / speed

        self.task_manager = TaskManager(self.to_download_q)

        # Default values are evaluated once, when the function is defined.
        # A session built in the signature would be created at import time
        # and shared by every crawler that omits the argument.
        self.session = requests.session() if session is None else session
        self.lock = LOCK
        self.task_manager_thread = Thread(target=self.task_manager.run)

    def _launch(self, worker_cls, label, count, pool, *extra_args):
        """Create, record, announce and start ``count`` workers.

        Each worker receives the three queues, the name ``"<label> <i>"``
        and any ``extra_args``; it is appended to ``pool`` before it is
        started.
        """
        for index in range(count):
            name = "{} {}".format(label, index)
            worker = worker_cls(
                self.to_download_q,
                self.downloader_parser_q,
                self.result_q,
                name,
                *extra_args
            )
            pool.append(worker)
            self.log.log_it("启动 {}".format(name), 'INFO')
            worker.start()

    def start(self):
        """Run the crawl and block until it has finished.

        Starts the task-manager thread and all workers, polls once per
        second until ``task_manager.is_empty()`` is true, then asks the
        downloaders and parsers to exit, waits for every resulter thread to
        stop, and finally stops the task manager.
        """
        self.task_manager_thread.start()

        self._launch(Downloader, "Downloader", self.downloader_worker_count,
                     self.downloader_worker, self.session)
        self._launch(Parser, "Parser", self.parser_worker_count,
                     self.parser_worker)
        self._launch(Resulter, "Resulter", self.resulter_worker_count,
                     self.resulter_worker)

        # Wait (1 s granularity) for the task manager to report no work.
        while True:
            time.sleep(1)
            if self.task_manager.is_empty():
                break

        # NOTE(review): exit() only sets a flag; a Downloader blocked in an
        # untimed COND.wait() sees it only after another thread notifies.
        for worker in self.downloader_worker + self.parser_worker:
            worker.exit()

        # Resulters stop by themselves; sleep first, then check liveness.
        while True:
            time.sleep(1)
            if not any(worker.is_alive() for worker in self.resulter_worker):
                break

        for worker in self.resulter_worker:
            worker.exit()

        self.task_manager.exit()
        # Reset the class-level flag (presumably so a later crawl in the
        # same process starts clean - confirm against TaskManager).
        TaskManager.ALLDONE = False
        return
class Resulter(Thread):
    """Worker thread that runs the ``resulter`` callback of queued tasks."""

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str):
        """Store the three queues and create a logger named after the thread.

        Args:
            to_download_q: Queue that download retries are put back on.
            downloader_parser_q: Queue that parse retries are put back on.
            result_q: Queue this worker consumes; result retries go back on
                it.
            name: Thread name, also used as the logger name.
        """
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Set the exit flag.

        NOTE(review): ``run`` never reads ``_exit``; the loop ends only on
        ``TaskManager.ALLDONE`` plus empty queues. Confirm this is intended.
        """
        self._exit = True

    def _log_retry(self, label, task):
        """Log at INFO level that ``task`` raised the retry exception ``label``."""
        self.log.log_it("{} Exception.Task{}".format(label, task), 'INFO')

    def result(self):
        """Handle at most one task from ``result_q``.

        Notifies every thread waiting on ``COND``, then takes one task
        without blocking; if the queue is empty it sleeps 0.1 s and returns.
        Otherwise ``task['resulter'](task)`` is called. A retry exception
        raised by the callback puts the task back on the matching queue:
        via ``retry`` / ``retry_nodelay`` for the plain and ``Nodelay``
        variants, or directly with ``put`` for the ``EnForceNodelay``
        variants. Any other exception is printed, logged as a warning, and
        the task is retried on ``result_q``.
        """
        with COND:
            COND.notify_all()

        try:
            task = self.result_q.get_nowait()
        except Empty:
            # Nothing to do; back off briefly so run() does not spin.
            time.sleep(0.1)
            return

        try:
            self.log.log_it("正在处理{}".format(task['tid']))
            task['resulter'](task)
        except RetryDownload:
            self._log_retry("RetryDownload", task)
            retry(task, self.to_download_q)
        except RetryDownloadEnForceNodelay:
            self._log_retry("RetryDownloadEnForce", task)
            self.to_download_q.put(task)
        except RetryDownloadNodelay:
            self._log_retry("RetryDownloadNodelay", task)
            retry_nodelay(task, self.to_download_q)
        except RetryParse:
            self._log_retry("RetryParse", task)
            retry(task, self.downloader_parser_q)
        except RetryParseEnForceNodelay:
            # Used to log "RetryParse", which was indistinguishable from
            # the plain RetryParse branch above.
            self._log_retry("RetryParseEnForce", task)
            self.downloader_parser_q.put(task)
        except RetryParseNodelay:
            self._log_retry("RetryParseNodelay", task)
            retry_nodelay(task, self.downloader_parser_q)
        except RetryResult:
            self._log_retry("RetryResult", task)
            retry(task, self.result_q)
        except RetryResultEnForceNodelay:
            self._log_retry("RetryResultEnForce", task)
            self.result_q.put(task)
        except RetryResultNodelay:
            self._log_retry("RetryResultNodelay", task)
            retry_nodelay(task, self.result_q)
        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "Resulter函数错误。错误信息:{}。Task:{}".format(str(e), task),
                'WARN')
            retry(task, self.result_q)

    def run(self):
        """Process results until the crawl is done and the queues are drained.

        The loop continues while ``TaskManager.ALLDONE`` is false, or
        ``result_q`` is non-empty, or ``to_download_q`` is non-empty.
        """
        while (not TaskManager.ALLDONE) or (not self.result_q.empty()) or (
                not self.to_download_q.empty()):
            self.result()
        self.log.logd("Exit")
class Parser(Thread):
    """Worker thread that runs the ``parser`` callback of downloaded tasks."""

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str):
        """Store the three queues and create a logger named after the thread.

        Args:
            to_download_q: Queue that new tasks and download retries are
                put on.
            downloader_parser_q: Queue this worker consumes; parse retries
                go back on it.
            result_q: Queue that parsed tasks are put on.
            name: Thread name, also used as the logger name.
        """
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Ask ``run`` to stop after its current iteration."""
        self._exit = True

    def parser(self):
        """Parse at most one task from ``downloader_parser_q``.

        Returns:
            The first element of the tuple returned by
            ``task['parser'](task)`` on success. ``None`` when the queue was
            empty, the task was re-queued by a retry exception, or parsing
            failed.
        """
        # Wake waiters once up front; otherwise a deadlock can occur.
        with COND:
            COND.notify_all()

        try:
            task = self.downloader_parser_q.get_nowait()
        except Empty:
            # Back off briefly, otherwise run() would spin.
            time.sleep(0.1)
            with COND:
                COND.notify_all()
            return

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                # The callback may return a single task instead of a list.
                if not isinstance(tasks, list):
                    tasks = [tasks]
                self.log.log_it("获取新任务{}个。".format(len(tasks)), 'INFO')
                for each_task in tasks:
                    # Register the new task, then queue it for download.
                    TaskManager.register(each_task['tid'])
                    self.to_download_q.put(each_task)
        # Retry exceptions raised from inside the crawler scripts.
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            self.log.log_it(
                "RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            # Used to log "RetryParse", which was indistinguishable from
            # the plain RetryParse branch above.
            self.log.log_it(
                "RetryParseEnForce Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            self.log.log_it(
                "解析错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')
            traceback.print_exc()
            # NOTE(review): the task is dropped here without
            # TaskManager.unregister - confirm the task manager can still
            # become empty after a parse failure.
            return

        # The task is unregistered here, in the Parser.
        TaskManager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        """Parse tasks until ``exit`` is called, forwarding results to ``result_q``."""
        while not self._exit:
            task_with_parsed_data = self.parser()
            # Hand truthy parse results to the Resulter queue.
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)
        self.log.logd("Exit")
class Downloader(Thread):
    """Worker thread that performs the HTTP request of queued tasks."""

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 session=None):
        """Store the queues and session and create a logger named after the thread.

        Args:
            to_download_q: Queue this worker consumes; failed requests are
                retried on it.
            downloader_parser_q: Queue that finished tasks are put on.
            result_q: Stored on the instance; not used by this class.
            name: Thread name, also used as the logger name.
            session: ``requests`` session used for every request. When
                omitted, a new session is created for this worker.
        """
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        # Default values are evaluated once, when the function is defined.
        # A session built in the signature would be created at import time
        # and shared by every worker that omits the argument.
        self.session = requests.session() if session is None else session
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Ask ``run`` to stop after its current iteration."""
        self._exit = True

    def request(self):
        """Download at most one task from ``to_download_q``.

        Returns immediately (after a short sleep) while the time since
        ``TaskManager.last_download_time`` is below
        ``TaskManager.download_wait``. If the queue is empty the thread
        waits on ``COND``. Otherwise the request is sent; on any exception
        the task is handed to ``retry``, else the task (with a ``response``
        entry) is put on ``downloader_parser_q``.
        """
        response = None

        # Rate limit: too soon after the last download, so back off.
        elapsed = time.time() - TaskManager.last_download_time
        if elapsed < TaskManager.download_wait:
            time.sleep(TaskManager.download_wait / 4)
            return

        try:
            # Fetch a task and register it with the TaskManager.
            task = self.to_download_q.get_nowait()
            TaskManager.register(task['tid'])
        except Empty:
            self.log.log_it(
                "Scheduler to Downloader队列为空,{}等待中。".format(self.name),
                'DEBUG')
            # Wait to be woken by another worker.
            # NOTE(review): untimed wait - after exit() this thread stops
            # only once some other thread notifies COND.
            with COND:
                COND.wait()
                self.log.log_it(
                    "Downloader to Parser队列不为空。{}被唤醒。".format(self.name),
                    'DEBUG')
            return

        self.log.log_it("请求 {}".format(task['url']), 'INFO')
        try:
            # Record the download time before sending the request.
            TaskManager.mark_download_time()
            response = self.session.request(task['method'], task['url'],
                                            **task.get('meta', {}))
        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "网络请求错误。错误信息:{} URL:{} Response:{}".format(
                    str(e), task['url'], response), 'INFO')
            # Put the task back for another attempt.
            retry(task, self.to_download_q)
            return

        # A falsy response is stored as None (a requests Response is falsy
        # when its status code is 4xx/5xx).
        task['response'] = response if response else None

        # Hand the task over to the Parser queue.
        self.downloader_parser_q.put(task)

    def run(self):
        """Issue requests until ``exit`` is called."""
        while not self._exit:
            self.request()
        self.log.logd("Exit")