Example #1
 def get_task(self):
     """
     Periodically fetch pending tasks from the database and send them to the download queue.
     :return:
     """
     task_cell = self.crawler_setting.get("task_cell") or 10
     mq_queue = get_queue(self.crawler_setting, 'download')
     mq_conn = connect(mq_queue, self.mq_params[0], self.mq_params[1],
                       self.mq_params[2], self.mq_params[3])
     while True:
         if RedisUtil.get_lock():
             tasks = SqlUtil.get_task()
             if tasks:
                 for task in tasks:
                     task_id = task.get("task_id")
                     RedisUtil.monitor_task(task_id)
                     task["main_task_flag"] = 1
                     message = repr(task)
                     # Check whether the queue exceeds its size limit; if it does, don't dispatch
                     is_send(self.mq_params, self.crawler_setting, mq_queue)
                     send_data(mq_conn, '', message, mq_queue)
                     SqlUtil.update_task(
                         1, "'{}'".format(task_id),
                         "'{}'".format(task.get("exec_time")),
                         "'{}'".format(task.get("pre_exec_time")))
                 Logger.logger.info(
                     "任务发送完成, 开始进行休眠, 休眠..{}s..".format(task_cell))
             else:
                 Logger.logger.info(
                     "没有可提取的任务,开始进行休眠,休眠..{}s..".format(task_cell))
             RedisUtil.release_lock()
         else:
             Logger.logger.info("未抢到锁,休眠..{}s..".format(task_cell))
         time.sleep(task_cell)
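The scheduler serializes database access across workers with RedisUtil.get_lock/release_lock, whose implementation is not shown here. A minimal sketch of such a lock with redis-py's SET NX EX (the key name, TTL, and connection details are assumptions, not the project's actual code):

    import redis

    _client = redis.Redis(host="127.0.0.1", port=6379)

    def get_lock(name="crawler4py:lock", ttl=30):
        # SET key value NX EX ttl: succeeds only if the key does not exist yet,
        # and expires automatically so a crashed holder cannot block others forever.
        return _client.set(name, "locked", nx=True, ex=ttl)

    def release_lock(name="crawler4py:lock"):
        # Naive release: deletes the key unconditionally; a production version
        # would verify ownership (e.g. via a token) before deleting.
        _client.delete(name)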
Example #2
 def call_back(ch, method, properties, body):
     """
     RabbitMQ consumer callback: if the message carries next_pages, it generates new tasks; otherwise the task is recycled.
     :param ch:
     :param method:
     :param properties:
     :param body:
     :return:
     """
     ch.basic_ack(delivery_tag=method.delivery_tag)
     message: dict = eval(body.decode())
     if message.get("next_pages"):
         next_pages = copy.deepcopy(message.get("next_pages"))
         del message["next_pages"]
         for result in next_pages:
             url = result.get("url")
             header = result.get("header")
             message["task_url"] = url
             message["main_task_flag"] = 0
             message["is_detail"] = result.get("is_detail")
             if header:
                 message["header"] = header
             Logger.logger.info("新任务:{}".format(message))
             mq_queue = get_queue(Dispatch.crawler_setting, 'download')
             mq_params = get_login_info(Dispatch.crawler_setting)
             is_send(mq_params, Dispatch.crawler_setting, mq_queue)
             send_data(ch, '', repr(message), mq_queue)
     else:
         send_data(ch, '', repr(message), 'download')
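Messages travel through RabbitMQ as repr() strings and are rebuilt with eval(), which will execute arbitrary code if the queue is ever fed untrusted input. When the payload is made of plain Python literals, ast.literal_eval is a safer drop-in decoder; a sketch:

    import ast

    def decode_message(body: bytes) -> dict:
        # literal_eval accepts only literals (dict/list/str/num/bool/None),
        # so a malicious payload raises ValueError instead of executing.
        return ast.literal_eval(body.decode())

Note that Example #4 adds a timedelta to the message's exec_time, so if datetime objects ride along in these payloads, switching to literal_eval would first require serializing them, e.g. as ISO-8601 strings.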
Example #3
 def call_back(ch, method, properties, body):
     ch.basic_ack(delivery_tag=method.delivery_tag)
     message: dict = eval(body.decode())
     path = get_plugin_path(Extractor.crawler_setting, 'extract')
     result = process(message, path)
     mq_queue = get_queue(Extractor.crawler_setting, "storage_dup")
     send_data(ch, '', repr(result), mq_queue)
     Logger.logger.info("发送任务至排重入库")
Example #4
    def call_back(ch, method, properties, body):
        ch.basic_ack(delivery_tag=method.delivery_tag)
        message: dict = eval(body.decode())
        path = get_plugin_path(Downloader.crawler_setting, 'download')
        result = process(message, path)
        if result.get("recovery_flag"):
            if result.get("recovery_flag") < 3:
                mq_queue = get_queue(Downloader.crawler_setting, "recovery")
                send_data(ch, '', repr(result), mq_queue)
                Logger.logger.info("回收--{}--成功".format(result.get("task_url")))
            else:
                # Any failed download must have its URL removed from the temporary task set in Redis
                RedisUtil.del_exist(message.get("task_id"),
                                    hashlib.md5(message.get("task_url").encode("utf-8")).hexdigest())
                # Recycling the main task: update its status and clear the temporary task set created in Redis
                if message.get("main_task_flag"):
                    while True:
                        if RedisUtil.get_lock():
                            pre_exec_time = message.get("exec_time")
                            exec_time = message.get("exec_time") + datetime.timedelta(seconds=message.get("task_cell"))
                            SqlUtil.update_task(0, "'{}'".format(message.get("task_id")), "'{}'".format(str(exec_time)),
                                                "'{}'".format(str(pre_exec_time)))
                            RedisUtil.release_lock()
                            RedisUtil.release_monitor(message.get("task_id"))
                            break
                        time.sleep(0.3)

                # If no entry in the Redis temporary task set has a score of 10, close the task (a score of 10 marks a detail page, 100 a list page)
                if not RedisUtil.monitor_score(message.get("task_id")):
                    RedisUtil.release_monitor(message.get("task_id"))
                    while True:
                        if RedisUtil.get_lock():
                            pre_exec_time = message.get("exec_time")
                            exec_time = datetime.datetime.now() + datetime.timedelta(seconds=message.get("task_cell"))
                            SqlUtil.update_task(0, "'{}'".format(message.get("task_id")), "'{}'".format(str(exec_time)),
                                                "'{}'".format(str(pre_exec_time)))
                            RedisUtil.release_lock()
                            break
                        time.sleep(0.3)
                Logger.logger.info("{}--超出回收次数上限, 不做回收".format(result.get("task_url")))
        else:
            mq_queue = get_queue(Downloader.crawler_setting, "extract")
            send_data(ch, '', repr(result), mq_queue)
            Logger.logger.info(result)
            Logger.logger.info("发送任务至提取中心")
Example #5
 def back_task(self):
     """
     Recycle tasks
     :return:
     """
     mq_queue = get_queue(self.crawler_setting, "recovery")
     mq_conn_recovery = connect(mq_queue, self.mq_params[0],
                                self.mq_params[1], self.mq_params[2],
                                self.mq_params[3])
     self.call_back(**{
         "no_ack": None,
         "channel": mq_conn_recovery,
         "routing_key": mq_queue
     })
Example #6
 def generate_task(self):
     """
     Generate tasks
     :return:
     """
     mq_queue = get_queue(self.crawler_setting, "dispatch")
     mq_conn_download = connect(mq_queue, self.mq_params[0],
                                self.mq_params[1], self.mq_params[2],
                                self.mq_params[3])
     self.call_back(**{
         "no_ack": None,
         "channel": mq_conn_download,
         "routing_key": mq_queue
     })
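Examples #5 and #6 are identical apart from the queue they bind; both feed self.mq_params positionally into connect, whose implementation is not shown. Presumably it opens a pika channel bound to the named queue, along these lines (a sketch, not the project's actual helper):

    import pika

    def connect(queue, user, pwd, host, port):
        credentials = pika.PlainCredentials(user, pwd)
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=host, port=int(port),
                                      credentials=credentials))
        channel = connection.channel()
        channel.queue_declare(queue=queue, durable=True)
        return channel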
Example #7
    def process(self):
        crawler_mode = self.crawler_setting.get("crawler_mode")
        if not crawler_mode:
            self.simple()
        else:
            try:
                mq = self.crawler_setting.get("mq")
                user = mq.get("user")
                pwd = mq.get("pwd")
                host = mq.get("host")
                port = mq.get("port")
                mq_queue = get_queue(self.crawler_setting, "extract")
            except AttributeError:
                user = "******"
                pwd = "crawler4py"
                host = "127.0.0.1"
                port = 5672
                mq_queue = "extract"

            mq_conn = connect(mq_queue, user, pwd, host, port)
            self.call_back(**{"no_ack": None, "channel": mq_conn, "routing_key": mq_queue})
Example #8
    def call_back(ch, method, properties, body):
        ch.basic_ack(delivery_tag=method.delivery_tag)
        message: dict = eval(body.decode())
        Logger.logger.info(message)
        path = get_plugin_path(BaseStorageDup.crawler_setting, 'storage_dup')
        del message["view_source"]
        if not message.get("next_pages"):
            process(message, path)
        else:
            # Not a detail page: first check that the temporary task set still exists; process only if it does
            if RedisUtil.monitor_is_exist(
                    message.get("task_id")) and RedisUtil.monitor_ttl(
                        message.get("task_id")) > 10:
                result = process(message, path)

                if len(message.get("next_pages")):
                    mq_queue = get_queue(BaseStorageDup.crawler_setting,
                                         'dispatch')
                    send_data(ch, '', repr(result), mq_queue)
                    Logger.logger.info("发送数据至dispatch进行构造任务")
                else:
                    Logger.logger.info("所有数据都被排掉, 不添加数据")
            else:
                Logger.logger.info("监控集合已经消失或者超出监控时间, 不再发送任务")
        # After each message, check the temporary task's state to decide whether to close it
        if not RedisUtil.monitor_score(message.get("task_id")):
            RedisUtil.release_monitor(message.get("task_id"))
            while True:
                if RedisUtil.get_lock():
                    pre_exec_time = message.get("exec_time")
                    exec_time = datetime.datetime.now() + datetime.timedelta(
                        seconds=message.get("task_cell"))
                    SqlUtil.update_task(0,
                                        "'{}'".format(message.get("task_id")),
                                        "'{}'".format(str(exec_time)),
                                        "'{}'".format(str(pre_exec_time)))
                    RedisUtil.release_lock()
                    break
                time.sleep(0.3)
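RedisUtil.monitor_score drives the close decision here and in Example #4: a task is shut down once its temporary set no longer holds any score-10 (detail page) entries. If the monitor is a Redis sorted set keyed per task, the check could be written as follows (key naming and connection are assumptions):

    import redis

    _client = redis.Redis(host="127.0.0.1", port=6379)

    def monitor_score(task_id):
        # True while at least one detail-page entry (score 10) is still pending.
        return _client.zcount("monitor:{}".format(task_id), 10, 10) > 0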
Example #9
 def process(self):
     crawler_mode = self.crawler_setting.get("crawler_mode")
     if not crawler_mode:
         self.simple()
     else:
         try:
             mq = self.crawler_setting.get("mq")
             user = mq.get("user")
             pwd = mq.get("pwd")
             host = mq.get("host")
             port = mq.get("port")
             mq_queue = get_queue(self.crawler_setting, "download")
         except AttributeError:
             user = "******"
             pwd = "crawler4py"
             host = "127.0.0.1"
             port = 5672
             mq_queue = "download"
         mq_conn = connect(mq_queue, user, pwd, host, port)
         try:
             plugin_path = self.crawler_setting.get("plugins").get("download")
         except AttributeError:  # raised when "plugins" is missing and get() returns None
             plugin_path = None
         self.call_back(
             **{"no_ack": None, "channel": mq_conn, "routing_key": mq_queue, "plugin_path": plugin_path})