Пример #1
0
def sync_data2mongodb(logger=None):
    """Move one batch of queued records from redis into mongodb.

    Pops up to 500 serialized items from the ``STORE_TOPIC`` redis set,
    decodes each one as JSON, inserts it with ``insert_data`` into the
    collection named by the second ``AT``-separated part of its
    ``data_table`` field, and finally adds the ids of the inserted
    documents to the matching ``PUBLISHED`` set through ``rdb_publish``.

    An item whose processing raises is pushed back onto ``STORE_TOPIC`` so
    a later round can retry it.

    :param logger: logger used for progress and error messages. When
        omitted, ``logging.getLogger(__name__)`` is used.
    :return: None
    """
    if logger is None:
        # The signature advertises ``logger=None`` but every code path logs,
        # so fall back to a real logger instead of failing on ``None.info``.
        import logging
        logger = logging.getLogger(__name__)

    logger.info(Colored.green("[Data2MongoDB]: 正在从 redis 同步数据到 mongodb----"))
    # Treat a falsy reply (no items) as an empty batch.
    items = rdb.spop(STORE_TOPIC, count=500) or []
    index = 0
    wait_publish_ids = []
    for item in items:
        try:
            json_dict = json.loads(item)
            collection = json_dict.get('data_table', "").split(AT)[1]
            del json_dict['data_table']
            _id = json_dict['_id']
            # NOTE(review): this single document id is always skipped; the
            # reason is not visible here -- confirm it is still needed.
            if _id == '1d33c55f57dd60237f2657028814b638':
                continue
            if insert_data(collection, json_dict):
                logger.info(
                    Colored.blue(
                        "[Data2MongoDB]: db:{1} >> content 的 id==>{0}".format(
                            _id, collection)))
                wait_publish_ids.append((PUBLISHED.format(collection), _id))
                index += 1
        except Exception as e:
            # Re-queue the raw item so it is not lost.
            # NOTE(review): a permanently malformed item is re-queued on
            # every round -- consider a dead-letter set.
            rdb.sadd(STORE_TOPIC, item)
            logger.error(Colored.red("[Data2MongoDB]: 发生错误>>>{0},".format(e)),
                         exc_info=True)
    try:
        # Send all publish markers in one non-transactional pipeline.
        pipeline = rdb_publish.pipeline(transaction=False)
        for key, value in wait_publish_ids:
            pipeline.sadd(key, value)
        pipeline.execute()
    except Exception as e:
        logger.error(Colored.red(
            "[Data2MongoDB]: 推送到 redis 发生错误>>>{0},".format(e)),
                     exc_info=True)
    logger.info(Colored.blue("[Data2MongoDB]: 当前轮次共同步条数==>{0}".format(index)))
Пример #2
0
 def _parameter_expression_handler(self, expression, parameters):
     """
     Resolve a parameter expression.

     An expression that contains ``DOT`` (and does not contain ``'http'``)
     is split once on ``DOT`` and dispatched on its left part:

     * ``PROCESSES``: load the stored process template and return an
       ``HTTPProcessTemplate`` built from it.
     * ``RESULTS``: with no parameter name after the number, load the
       stored result template and return an ``HTTPResultTemplate``;
       otherwise return ``{name: parameters.get(name)}``.
     * ``DOLLAR``: return ``{name: parameters.get(name)}``.

     Any other left part yields an empty dict. An expression without
     ``DOT`` (or containing ``'http'``) maps to itself.

     :param expression: parameter expression to resolve
     :param parameters: mapping of known parameters; ``spider_name`` is
         read for the process/result lookups
     :return: the resolved value as described above. If resolution raises
         an ``Exception`` it is logged and the value assigned so far
         (``{}`` by default) is returned.
     """
     expression_result = {}
     self.logger.info(
         Colored.blue(
             "[Liz2Bird-_parameter_expression_handler()]: 参数处理模板 =>{0}".
             format(expression)))
     try:
         if DOT in expression and 'http' not in expression:
             left, rights = expression.split(DOT, 1)
             if left == PROCESSES:
                 # NOTE(review): only the first character of the right-hand
                 # side is used as the lookup key -- confirm that
                 # multi-character keys are not expected here.
                 number = rights[0]
                 process = self.queue.find(
                     PROCESSES_TOPIC.format(parameters['spider_name']),
                     way=WayType.HGET,
                     key=number)
                 process_template = self.middlewares[
                     2].middle_origin2engine(process)[0]
                 expression_result = HTTPProcessTemplate(**process_template)
             elif left == RESULTS:
                 number, *parameter_name = rights.split(DOT)
                 if not parameter_name:
                     result = self.queue.find(
                         RESULTS_TOPIC.format(parameters['spider_name']),
                         way=WayType.HGET,
                         key=number)
                     result_template = self.middlewares[
                         2].middle_origin2engine(result)[0]
                     expression_result = HTTPResultTemplate(
                         **result_template)
                 else:
                     name = parameter_name[0]
                     expression_result = {name: parameters.get(name)}
             elif left == DOLLAR:
                 expression_result = {rights: parameters.get(rights)}
         else:
             expression_result = {expression: expression}
     except Exception as e:
         # The original re-raised here but then returned from ``finally``,
         # which discarded the exception. Callers therefore always got a
         # value back; that contract is kept, without the dead ``raise``
         # and without swallowing KeyboardInterrupt/SystemExit.
         self.logger.error(Colored.red(
             "[Liz2Bird-_parameter_expression_handler()]: 出现处理错误>>>{0}".
             format(e)),
                           exc_info=True)
     return expression_result
Пример #3
0
    def _result_template_handler(self, result_template, data, parameters):
        """
        Evaluate every field expression of a result template.

        Each ``(field, expression)`` pair of ``result_template.field_dict``
        is evaluated according to the form of the field name / expression
        (see the inline comments). The result of every field is also stored
        in a working copy of ``parameters`` so later fields can refer to it.

        Any exception raised while processing is logged and swallowed; the
        results collected up to that point are still returned.

        :param result_template: template object providing ``field_dict`` and
            ``global_parameter``
        :param data: data handed to ``format_data`` for extraction expressions
        :param parameters: mapping of known parameters; it is shallow-copied
            and only the copy is updated
        :return: tuple ``(temp_results, temp_generator_results)``: fields
            whose result is not a list, and fields whose result is a list
        """
        temp_results = {}
        temp_generator_results = {}
        # Work on a shallow copy so field results can be added without
        # touching the caller's mapping.
        temp_parameters = {k: v for k, v in parameters.items()}
        try:
            # Process each field of the result template
            field_dict = result_template.field_dict
            # global_parameter, when it is a string, is a list of names
            # separated by AND.
            if result_template.global_parameter is not None and isinstance(
                    result_template.global_parameter, str):
                global_parameters = [
                    p.strip()
                    for p in result_template.global_parameter.split(AND)
                ]
                self.logger.info(
                    Colored.blue(
                        "[Liz2Bird-_result_template_handler()]: 全局参数>>>>{0}".
                        format(', '.join(global_parameters))))
            else:
                global_parameters = []
            for field, expression in field_dict.items():
                if field in EXCLUDE_FIELD:
                    continue
                self.logger.info(
                    Colored.blue(
                        "Liz2Bird-_result_template_handler()]: result 处理字典>>>> (<{0}::{1}>)"
                        .format(field, expression)))
                # Field name starts with an underscore: eval the expression directly
                # NOTE(review): eval executes template-supplied text -- confirm
                # that templates only come from trusted sources.
                if field.startswith(UNDER_LINE):
                    expression_result = eval(expression)
                elif isinstance(expression, str):
                    # Starts with DIAGONAL ("//" per the original comment): use format_data
                    if expression.startswith(DIAGONAL):
                        expression_result = format_data(data, expression)
                    # Contains COLON ("::" per the original comment): CSS extraction
                    elif COLON in expression:
                        expression_result = format_data(data, expression)
                    # Contains GREATER_THAN (">" per the original comment): use format_data
                    elif GREATER_THAN in expression:
                        expression_result = format_data(data, expression)
                    # Contains AT ("@" per the original comment): regex matching
                    # against a previously computed parameter, written as
                    # <expression> AT <parameter name> HASH <index>
                    elif AT in expression:
                        exp, temp_data_field = expression.split(AT)
                        data_field, reg_idx = temp_data_field.split(HASH)
                        temp_data = temp_parameters.get(data_field)
                        # A list parameter is matched element by element.
                        if isinstance(temp_data, list):
                            expression_result = []
                            for temp_da in temp_data:
                                expression_one = format_data(
                                    temp_da, exp, reg_idx)
                                expression_result.append(expression_one)
                        else:
                            expression_result = format_data(
                                temp_data, exp, reg_idx)
                    # Starts with DOLLAR ("$" per the original comment): field
                    # built by formatting parameter values into the expression
                    elif expression.startswith(DOLLAR):
                        at_parameters_name = PARAMETER.findall(expression)
                        # Split the referenced parameters into scalar values
                        # (one_parameters) and list values (dict_parameter).
                        one_parameters = {}
                        dict_parameter = {}
                        for at_parameter_name in at_parameters_name:
                            parameter_value = temp_parameters.get(
                                at_parameter_name, None)
                            if isinstance(parameter_value, list):
                                dict_parameter.setdefault(
                                    at_parameter_name, parameter_value)
                            else:
                                one_parameters.setdefault(
                                    at_parameter_name, parameter_value)
                        # Handle the format expression
                        extend_parameters_group = []
                        # Number of parameter groups created so far.
                        one_round = 0
                        if len(dict_parameter) > 0:
                            for key, value_list in dict_parameter.items():
                                # Expand the parameter groups: a new group
                                # (copy of the scalar parameters) is appended
                                # when the index equals the running group
                                # count; otherwise the existing group at that
                                # index is updated.
                                for index, value in enumerate(value_list):
                                    if index == one_round:
                                        copy_one = deepcopy(one_parameters)
                                        copy_one.update({key: value})
                                        extend_parameters_group.append(
                                            copy_one)
                                        one_round += 1
                                    else:
                                        extend_parameters_group[index].update(
                                            {key: value})

                        # With list parameters the result is one formatted
                        # string per group; otherwise a single string.
                        if len(extend_parameters_group) > 0:
                            expression_result = [
                                expression.replace(
                                    DOLLAR, "").format(**parameters_group)
                                for parameters_group in extend_parameters_group
                            ]
                        else:
                            expression_result = expression.replace(
                                DOLLAR, "").format(**one_parameters)
                    elif LEFT_BRACE in expression and RIGHT_BRACE in expression:
                        # Contains {xxx} parameters: substitute them and evaluate
                        at_parameters_name = PARAMETER.findall(expression)
                        total_parameters = {}
                        for at_parameter_name in at_parameters_name:
                            # Global parameters are read back from the queue
                            # under "<site>:<spider>:<name>"; all others come
                            # from the working parameter copy.
                            if at_parameter_name in global_parameters:
                                site = HASH_MAP.format(
                                    temp_parameters['site_name'])
                                parameter_value = self.queue.find(
                                    "{site}:{spider}:{key}".format(
                                        site=site,
                                        spider=temp_parameters['spider_name'],
                                        key=at_parameter_name),
                                    way=WayType.GET,
                                    key=at_parameter_name)
                                if isinstance(parameter_value, bytes):
                                    parameter_value = parameter_value.decode(
                                        'utf-8')
                                total_parameters.setdefault(
                                    at_parameter_name, parameter_value)
                            else:
                                parameter_value = temp_parameters.get(
                                    at_parameter_name, None)
                            # For global parameters this repeats the
                            # setdefault above and therefore has no effect.
                            total_parameters.setdefault(
                                at_parameter_name, parameter_value)
                        # NOTE(review): eval runs the formatted expression,
                        # which includes parameter values -- confirm these
                        # cannot carry untrusted content.
                        expression_result = eval(
                            expression.format(**total_parameters))
                    else:
                        # Otherwise return the expression unchanged
                        expression_result = expression
                else:
                    # Non-string expression: return it unchanged
                    expression_result = expression

                # Update the global parameter: str results are stored
                # utf-8 encoded, anything else as-is
                if field in global_parameters:
                    if isinstance(expression_result, str):
                        expression_result_ = expression_result.encode('utf-8')
                    else:
                        expression_result_ = expression_result
                    site = HASH_MAP.format(temp_parameters['site_name'])
                    self.queue.commit_data("{site}:{spider}:{key}".format(
                        site=site,
                        spider=temp_parameters['spider_name'],
                        key=field),
                                           key=field,
                                           value=expression_result_,
                                           way=WayType.SET)
                    self.logger.info(
                        Colored.white(
                            "Liz2Bird-_result_template_handler()]: 提交全局参数>>>> (<{0}: {1}>)"
                            .format(field, expression_result)))

                # Update the template results: list results go to the
                # generator mapping, everything else to the plain mapping
                if isinstance(expression_result, list):
                    temp_generator_results.update({field: expression_result})
                else:
                    temp_results.update({field: expression_result})
                # Make this field's result available to later fields.
                temp_parameters.update({field: expression_result})
        except Exception as e:
            # Errors are logged, not re-raised; partial results are returned.
            self.logger.error(Colored.red(
                "[Liz2Bird-_result_template_handler()]: 出现处理错误>>>{0}....(@ $ _ $ @)----"
                .format(e)),
                              exc_info=True)
        return temp_results, temp_generator_results
Пример #4
0
 def __str__(self):
     """Return the ``<Process [id]:[method]>`` label passed through ``Colored.blue``."""
     label = '<Process [{0}]:[{1}]>'.format(self.process_id,
                                            self.process_method)
     return Colored.blue(label)