Example #1
import importlib
import inspect

# Spider and logger are provided by the surrounding mioji project.

def init_spider(module_name):
    """
    :param module_name: e.g. spider.booking.hotel_list_spider
    :return: in theory the module holds exactly one spider
    """
    print(module_name)
    spider_module = importlib.import_module('.' + module_name, 'mioji')
    spider_list = []
    for attr in inspect.getmembers(spider_module):
        if inspect.isclass(attr[1]) and attr[1].__module__.endswith(
                '_spider') and attr[1].__module__.endswith(module_name):
            if issubclass(attr[1].__bases__[0], Spider):
                # load the class when it is a Spider subclass (or of the same kind)
                try:
                    spider_clazz = getattr(spider_module, attr[0])
                    spider = spider_clazz()
                    if isinstance(spider, Spider):
                        spider_desc = {}
                        spider_desc['source_type'] = spider.source_type
                        spider_desc['spider_class'] = spider_clazz
                        spider_desc['targets'] = spider.targets.keys()
                        spider_list.append(spider_desc)
                except Exception:
                    logger.exception('failed to instantiate spider[%s]', attr[1])

    return spider_list
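
A minimal usage sketch (hypothetical: it assumes mioji.spider.booking.hotel_list_spider defines a Spider subclass); each descriptor in the returned list carries the source type, the spider class, and its data targets:

# Hypothetical call site; the module path is an assumption.
spiders = init_spider('spider.booking.hotel_list_spider')
for desc in spiders:
    print(desc['source_type'], desc['spider_class'].__name__, list(desc['targets']))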
Example #2
    def crawl(self):
        """
        外部启动爬虫的入口方法
        当调用这个方法时才能开始爬虫工作~
        :return:
        """
        # todo
        self.__create_browser()
        cur_id = str(uuid.uuid1())
        if hasattr(self.task, 'new_task_id'):
            cur_id = self.task.new_task_id
        self.spider_taskinfo = {'task_id': cur_id}
        for k, v in self.task.__dict__.items():
            self.spider_taskinfo[k] = v
            try:
                logger.info(current_log_tag() + '[task info][%s][%s]' %
                            (k, json.dumps(v)))
            except Exception:
                continue
        chains = self.targets_request()
        try:
            self.code = self.__crawl_by_chain(chains)
        except parser_except.ParserException as e:
            logger.exception(e)
            self.code = e.code
            self.exception = e.msg
            if e.retry_from_first:
                raise e

        # derive the final error code from all collected results
        self.check_all_result()
        return self.code
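
Since crawl() is the external entry point, a caller sketch might look like the following (SomeSpider and its construction from a task are assumptions, not the project's actual API):

spider = SomeSpider(task)    # hypothetical Spider subclass
code = spider.crawl()        # runs the whole request chain
if code != 0:
    print(spider.exception)  # message captured from the ParserException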
Example #3
def call(*args, **kwargs):
    try:
        return func(*args, **kwargs), True
    except Exception as exc:
        logger.exception('[new framework][page parse exception][ {0} ]'.format(
            traceback.format_exc().replace('\n', '\t')))
        return (args, kwargs, exc), False
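
call reads like the inner wrapper of an exception-catching decorator; a self-contained reconstruction under that assumption (the name catch_parse_exception is hypothetical) would be:

import functools
import logging
import traceback

logger = logging.getLogger(__name__)

def catch_parse_exception(func):  # hypothetical name for the enclosing decorator
    """Make a parser return (value, ok) instead of raising."""
    @functools.wraps(func)
    def call(*args, **kwargs):
        try:
            return func(*args, **kwargs), True
        except Exception as exc:
            logger.exception('[new framework][page parse exception][ {0} ]'.format(
                traceback.format_exc().replace('\n', '\t')))
            return (args, kwargs, exc), False
    return call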
Example #4
        def write_message(max_try):
            """
            :param max_try:
            :return:
            """
            msg = ''  # predefine so the except handler can always reference it
            try:
                max_try -= 1
                msg = json.dumps({
                    'qid': task.req_qid,
                    'type': task.callback_type,
                    'uid': task.req_uid,
                    'query': json.dumps(query),
                    'status': spider_status
                })
                credentials = pika.PlainCredentials(
                    username=task.master_info['spider_mq_user'],
                    password=task.master_info['spider_mq_passwd'])
                connection = pika.BlockingConnection(
                    pika.ConnectionParameters(
                        host=task.master_info['spider_mq_host'],
                        virtual_host=task.master_info['spider_mq_vhost'],
                        credentials=credentials,
                        # heartbeat_interval=0
                    ))
                channel = connection.channel()

                res = channel.basic_publish(
                    exchange=task.master_info['spider_mq_exchange'],
                    routing_key=task.master_info['spider_mq_routerKey'],
                    properties=pika.BasicProperties(delivery_mode=2),
                    body=msg,
                )
                connection.process_data_events()

                connection.close()
                if not res:
                    warn_msg = 'RabbitMQ Result False: {0}'.format(msg)
                    info = warn(str(task.req_qid), 'ex_RabbitMQ',
                                get_local_ip(), warn_msg)
                    logger.debug("\n" + info)
                    raise Exception('RabbitMQ Result False')
                logger.debug(
                    '[callback a verifytask done] qid:{}, source: {}, task_info: {}, status: {}'
                    .format(str(task.req_qid), str(task.source), task.content,
                            spider_status))
                return max_try
            except Exception as exc:
                if max_try > 0:
                    return write_message(max_try)
                else:
                    warn_msg = 'RabbitMQ Result False qid : {}, e_info: {}, msg: {}'.format(
                        task.req_qid, traceback.format_exc(), msg)
                    info = warn(task.req_qid, 'ex_SpiderMQ', get_local_ip(),
                                warn_msg)
                    logger.exception("\n" + info)
                    return max_try
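
The function consumes one unit of the retry budget per attempt and recurses on failure, so a call site might be (the budget of 3 is an assumption):

# Hypothetical call site: retry publishing up to 3 times.
remaining = write_message(max_try=3)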
Example #5
    def __crawl_by_chain(self, chains):
        """
        根据请求链的类型,进入不同的抓取顺序进行抓取
        :param chains:
        :return:
        """
        code = 0
        try:
            for reqParse in chains:
                # gevent.sleep(0)
                browser = self.__create_browser(reqParse.new_session)
                reqParse.spider = self
                t_req = reqParse.request()
                new_result = None  # guard against branches that produce no result

                if isinstance(t_req, dict):  # a single request
                    new_result = self.__single_crawl(reqParse, browser, t_req,
                                                     0)

                elif isinstance(t_req, list):
                    # the spider may legitimately return an empty list!
                    if t_req:
                        if reqParse.asynchronous:  # crawl in parallel
                            list_result = self.__async_crawl_list(
                                reqParse, browser, t_req)
                        else:  # crawl sequentially
                            list_result = self.__crawl_list(
                                reqParse, browser, t_req)
                        new_result, code = self.check_list_result(
                            list_result, code)  # $$$ could be optimized

                elif isinstance(t_req,
                                types.GeneratorType):  # requests produced via yield
                    list_result = self.__crawl_list(reqParse, browser, t_req)
                    new_result, code = self.check_list_result(
                        list_result, code)

                if new_result is not None:
                    self.__spider_append_result(new_result)

            if self.use_selenium and browser.br:
                browser.close()
        except parser_except.ParserException as e:
            if self.use_selenium and browser.br:
                browser.close()
            logger.error(e)
            raise e
        except Exception:
            if self.use_selenium and browser.br:
                browser.close()
            logger.exception(current_log_tag() +
                             '[new framework][chained requests][unknown error][ {0} ]'.format(
                                 traceback.format_exc().replace('\n', '\t')))
            raise parser_except.ParserException(
                parser_except.UNKNOWN_ERROR,
                'e:{0}'.format(traceback.format_exc()))

        return code
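
The dispatch above implies that reqParse.request() may return a single dict, a list of dicts, or a generator; a toy illustration of the three shapes (URLs and names are assumptions):

def single_request():
    return {'url': 'http://example.com/page/1'}  # crawled once

def list_requests():  # crawled in parallel or sequentially
    return [{'url': 'http://example.com/page/%d' % i} for i in range(1, 4)]

def generated_requests():  # crawled sequentially as the generator is consumed
    for i in range(1, 4):
        yield {'url': 'http://example.com/page/%d' % i}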
Example #6
    def doCallback(self, task, error_code, spider, result_type):
        """
        执行回调工作
        """
        def get_ticket_num():
            ticket_num = 0
            for per_data_type in spider.crawl_targets_required:
                ticket_num += len(spider._asy_temp_result[per_data_type])
            return ticket_num

        def get_result(_result):
            _proxy_or_ticket = []
            for per_data_type in spider.crawl_targets_required:
                _proxy_or_ticket.extend(_result[per_data_type])
            return _proxy_or_ticket

        # If the spider is still RUNNING, wait a second and check again.
        if result_type == "RUNNING":
            num1 = get_ticket_num()
            time.sleep(1)
            # ticket count after the settle interval
            num2 = get_ticket_num()
            if num1 != num2 or spider.spider_frame_status:
                return

        task.other_info['parser_error'] = int(error_code)
        query = {"other_info": task.other_info}
        result = None
        redis_mq_logger = RedisMQCostLogger()
        extra = {}
        if spider:
            result = spider._asy_temp_result if result_type == 'RUNNING' else spider.result
            result = get_result(result)
            extra = spider.extra
            redis_mq_logger.ticket_num = len(spider._asy_temp_result)

        try:
            redis_mq_logger.qid = task.req_qid
            redis_mq_logger.source = task.source
            redis_mq_logger.task_id = task.new_task_id
            redis_mq_logger.task_info = task.content
            redis_mq_logger.error_code = error_code
            if result_type == 'END':
                redis_mq_logger.is_end = 1
            # write the result to redis
            redis_cost = self.write_redis_ticket(task, result, error_code,
                                                 extra)
            if isinstance(redis_cost, tuple):
                redis_mq_logger.conn_redis = redis_cost[0]
                redis_mq_logger.write_redis = redis_cost[1]
            else:
                redis_mq_logger.exception = redis_cost

        except Exception as e:
            logger.exception('no redis connection: ' + str(e))
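
The RUNNING branch is a settle check: sample the ticket count, wait, and resample, reporting only if nothing changed in between. The same pattern in isolation (a sketch, not project code):

import time

def has_settled(sample, interval=1.0):
    """True when two samples taken interval seconds apart are equal."""
    before = sample()
    time.sleep(interval)
    return sample() == before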
Example #7
    def __crawl_list(self, reqParse, browser, req_list):
        """
        串行抓取分页
        """
        result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        total_count = 0
        success_count = 0
        error_req = []
        for req in req_list:
            # the flip limit for sequential paging was removed
            # if NEED_FLIP_LIMIT:
            #     if total_count >= MAX_FLIP:
            #         break
            total_count += 1
            try:
                res = self.__single_crawl(reqParse,
                                          browser,
                                          req,
                                          page_count=total_count)
                success_count += 1  # count a page only after it actually succeeded
                self.__target_append_result(result, res)
                all_except = False
            except Exception as e:
                all_ok = False
                one_exception = e
                error_req.append((req, str(one_exception)))
                logger.exception(
                    current_log_tag() + '[new framework][page parse exception][ {0} ]'.format(
                        traceback.format_exc().replace('\n', '\t')))

                # re-raise exceptions coming from a generator request
                if isinstance(req, types.GeneratorType):
                    raise e
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[pagination][sequential][ success {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[pagination][sequential][ failed page requests {0} ]'.format(str(error_req)))
        return result, all_except, all_ok, one_exception
Example #8
    def write_redis_ticket(self, task, result, error_code, extra):
        try:
            begin = time.time()
            params = (task.redis_host, task.redis_port, int(task.redis_db),
                      task.redis_passwd)
            rds = self.get_redis_pool(params)
            conn_cost_time = round(time.time() - begin, 3)
            # Temporary compatibility shim for the retrieval side: force-convert
            # the Google API response format.
            if task.source == "Realtraffic" and result:
                result = json.dumps(result[0])
            result = {"err_code": error_code, "data": result, "extra": extra}
            begin = time.time()
            # note: (key, value, ttl) is the legacy redis-py Redis.setex argument order
            if task.ticket_info.get("auth"):
                rds.setex(task.redis_key, json.dumps(result), 1800)
            else:
                rds.setex(task.redis_key, json.dumps(result), 600)
            write_cost_time = round(time.time() - begin, 3)
            return conn_cost_time, write_cost_time
        except Exception as e:
            warn_msg = 'redis_host:' + task.redis_host + ' ' + str(e)
            info = warn(task.req_qid, 'ex_SpiderRedis', msg=warn_msg)
            logger.exception("\n" + info)
            return str(e)
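
get_redis_pool is not shown here; a minimal sketch assuming redis-py, caching one ConnectionPool per parameter tuple (the cache itself is an assumption). Given the (key, value, ttl) setex order above, the legacy redis-py Redis client (pre-3.0) is assumed:

import redis

_pools = {}  # hypothetical process-level pool cache

def get_redis_pool(params):
    host, port, db, passwd = params
    if params not in _pools:
        _pools[params] = redis.ConnectionPool(
            host=host, port=port, db=db, password=passwd)
    return redis.Redis(connection_pool=_pools[params])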
Example #9
File: slave.py  Project: gitPff/sb-
def doTask(task):
    """ 此方法用于调用spider 并接收最终的code和result信息
    """
    spider = g_spider_factory.get_spider(task.source)
    if not spider:
        logger.error('no spider found for source [%s]' % task.source)
        callback.CallbackResult(task=task, error_code=11, result_type="END")
        code = 11
    else:
        spider = spider(task)
        spider.task = task
        print(task)
        spider.debug = False
        spider.process_callback = callback.CallbackResult  # callback hook
        spider.task_post_process_queue = g_task_post_process_queue  # pool used for uploads to ucloud
        spider.need_proxy = g_config.need_proxy
        spider.machine_type = g_config.machine_type
        spider.env = g_config.env
        spider.local_ip = g_config.local_ip
        spider.is_verify = g_config.need_post_process
        crawl_time = time.time()
        try:
            spider = spider_crawl(spider, task)  # run the spider, restarting from the first request when asked
        except ParserException as e:
            error_info = e.msg
            error = e.code
            logger.exception('new framework: spider raised: task:{0}, error:{1}, msg: {2}'.format(task, error, error_info))
        except Exception as e:
            logger.exception('new framework: spider raised: task_data:{0}  error:{1}'.format(task, e))
            error = SLAVE_ERROR
        spider.last_time = int((time.time() - crawl_time) * 1000)
        check_all_result(spider)  # final check over all returned data
        spider.spider_frame_status = 1
        callback.CallbackResult(task=task, error_code=spider.code, spider=spider, result_type="END")  # run the callback; END performs a synchronous callback
        error_logger(spider)  # write the error log

        code = spider.code
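
spider_crawl is referenced but not shown; combined with Example #2, where crawl() re-raises a ParserException whose retry_from_first flag is set, a plausible sketch of the retry-from-first driver is (max_retries and the rebuild step are assumptions):

def spider_crawl(spider, task, max_retries=1):
    for attempt in range(max_retries + 1):
        try:
            spider.code = spider.crawl()
            return spider
        except ParserException:
            if attempt == max_retries:
                raise
            spider = spider.__class__(task)  # rebuild and restart from the first request
    return spider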