class Scheduler(object):
    """Run the crawl batches on an asyncio loop, save results and retry failures.

    Collaborators are created in ``__init__`` and stored on the instance:
    ``proxies`` supplies the proxy list, ``tasker`` supplies URL batches and
    reports whether retries are pending, ``saver`` persists results,
    ``log_handler`` provides the logger, and ``tieba_task`` is the factory
    that turns one URL batch into awaitables.
    """

    def __init__(self):
        self.proxies = ProxyHandler()
        self.tasker = Tasker()
        self.saver = Saver()
        self.log_handler = LogHandler()
        # Called as tieba_task(tag, urls, proxy_list); the result is unpacked
        # into asyncio.gather(), so it must be an iterable of awaitables.
        self.tieba_task = tieba_task
        # Number of rounds per task; the original note says this is split up
        # to avoid the Semaphore count getting too large.
        self.loop_num = LOOP_NUM

    def _run_batch(self, loop, tag, urls, proxy_list):
        """Build the awaitables for one URL batch and run them to completion."""
        # Go through the instance attribute so a factory assigned to
        # ``self.tieba_task`` is the one actually used.
        tasks = self.tieba_task(tag, urls, proxy_list)
        loop.run_until_complete(asyncio.gather(*tasks))

    def _run_retry_round(self, loop):
        """Run every pending retry batch once, without saving or closing."""
        # URL lists already sliced into one batch per retry step.
        per_step_urls_list = self.tasker.get_perloop_retry()
        # Fetch the proxies from the proxy pool.
        proxy_list = self.proxies.get_proxies()
        for per_step_urls in per_step_urls_list:
            self._run_batch(loop, 'retry', per_step_urls, proxy_list)

    def end_task(self, loop):
        """Save the results, drain pending retries, then close ``loop``.

        After each save, ``tasker.check_retries()`` decides whether another
        retry round runs. Rounds are driven by a ``while`` loop instead of
        ``end_task``/``retry_task`` calling each other, so the number of
        retry rounds is not limited by the interpreter recursion depth.

        Args:
            loop: The event loop the batches run on; it is closed once no
                retries remain.
        """
        # Save data to mongodb.
        self.saver.run()
        while self.tasker.check_retries():
            self._run_retry_round(loop)
            # Persist what the retry round produced before re-checking.
            self.saver.run()
        loop.close()
        self.log_handler.logger().info('完成循环')
        print('Finish!!')

    def retry_task(self, loop):
        """Run one retry round, then finish via :meth:`end_task`.

        Args:
            loop: The event loop to run the retry batches on. It is closed by
                ``end_task`` once no retries remain.
        """
        self._run_retry_round(loop)
        self.end_task(loop)

    # @run_time
    def start_task(self):
        """Run ``self.loop_num`` rounds of batches, then save, retry and close."""
        loop = asyncio.get_event_loop()
        # Fetch the proxies from the proxy pool.
        proxy_list = self.proxies.get_proxies()
        # The full task for this run.
        one_task = self.tasker.get_task()
        for index in range(self.loop_num):
            # URLs for this round.
            per_step_urls = self.tasker.get_perloop(one_task)
            self._run_batch(loop, index, per_step_urls, proxy_list)
        self.end_task(loop)
def run():
    """Run ``LOOP_NUM`` rounds of crawl batches, save the results, close the loop.

    The event loop is closed in a ``finally`` clause so it is released even
    when a batch or the save step raises; the exception still propagates.
    """
    # Event loop that drives the coroutines.
    loop = asyncio.get_event_loop()
    try:
        tasker = Tasker()
        each_task = tasker.get_task()
        # LOOP_NUM: number of rounds per task; the original note says this is
        # split up to avoid the Semaphore count getting too large.
        for i in range(LOOP_NUM):
            # URLs for this round.
            per_step_urls = tasker.get_urls(each_task)
            # Awaitables to process.
            tasks = tasker.create_task(i, per_step_urls)
            # Run the whole batch to completion.
            loop.run_until_complete(asyncio.gather(*tasks))
        # Save data to mongodb.
        saver = Saver()
        saver.run()
    finally:
        loop.close()