def sync_data2mongodb(logger=None):
    """Move one batch of items from the redis store set into MongoDB.

    Pops up to 500 members from the ``STORE_TOPIC`` redis set, decodes each
    one as JSON and inserts it with ``insert_data`` into the collection named
    by the part of its ``data_table`` field that follows ``AT``.  The ids of
    the documents that were inserted are then added to the matching
    ``PUBLISHED`` set through a non-transactional pipeline on ``rdb_publish``.

    An item that raises while being processed is pushed back onto
    ``STORE_TOPIC`` and the error is logged; the batch continues.

    :param logger: logger used for progress and error messages.  When
        ``None`` (the default) a ``logging`` logger for this module is used,
        so the function can actually be called without arguments.
    """
    if logger is None:
        # The declared default used to crash on the very first log call.
        import logging
        logger = logging.getLogger(__name__)

    logger.info(Colored.green("[Data2MongoDB]: 正在从 redis 同步数据到 mongodb----"))

    # ``or []`` keeps the loop safe if the client hands back None for an
    # empty set.
    items = rdb.spop(STORE_TOPIC, count=500) or []
    index = 0
    wait_publish_ids = []
    for item in items:
        try:
            json_dict = json.loads(item)
            collection = json_dict.get('data_table', "").split(AT)[1]
            del json_dict['data_table']
            _id = json_dict['_id']
            # Hard-coded id that is always skipped (and therefore dropped,
            # since it has already been popped).
            # NOTE(review): the reason for this exclusion is not visible here.
            if _id == '1d33c55f57dd60237f2657028814b638':
                continue
            if insert_data(collection, json_dict):
                logger.info(
                    Colored.blue(
                        "[Data2MongoDB]: db:{1} >> content 的 id==>{0}".format(
                            json_dict.get('_id'), collection)))
                wait_publish_ids.append(
                    (PUBLISHED.format(collection), json_dict['_id']))
                # Count only documents that were actually inserted.
                index += 1
        except Exception as e:
            # Put the item back so it is not lost.
            # NOTE(review): a permanently malformed item will be retried on
            # every round — confirm this is intended.
            rdb.sadd(STORE_TOPIC, item)
            logger.error(Colored.red("[Data2MongoDB]: 发生错误>>>{0},".format(e)),
                         exc_info=True)

    try:
        pipeline = rdb_publish.pipeline(transaction=False)
        for key, value in wait_publish_ids:
            pipeline.sadd(key, value)
        pipeline.execute()
    except Exception as e:
        logger.error(Colored.red(
            "[Data2MongoDB]: 推送到 redis 发生错误>>>{0},".format(e)),
            exc_info=True)

    logger.info(Colored.blue("[Data2MongoDB]: 当前轮次共同步条数==>{0}".format(index)))
def _parameter_expression_handler(self, expression, parameters):
    """Resolve a parameter expression into its result.

    Expressions that contain ``DOT`` (and do not contain ``'http'``) are split
    once on ``DOT`` into a left-hand keyword and a remainder:

    * ``PROCESSES`` - the first character of the remainder is used as the key
      to look up a process in the queue; the result is an
      ``HTTPProcessTemplate``.
    * ``RESULTS`` - with only a number, the result template is looked up in
      the queue and returned as an ``HTTPResultTemplate``; with a trailing
      parameter name, ``{name: parameters.get(name)}`` is returned.
    * ``DOLLAR`` - the remainder is a parameter name and
      ``{name: parameters.get(name)}`` is returned.

    Any other expression is returned as ``{expression: expression}``.

    Errors are logged and swallowed: the method returns whatever had been
    resolved so far (an empty dict if nothing).  This matches what callers
    have always observed, because the former ``return`` inside ``finally``
    discarded the re-raised exception.

    :param expression: the parameter expression to resolve
    :param parameters: mapping of current parameters; must contain
        ``'spider_name'`` for the queue lookups
    :return: a template object or a single-entry dict, or ``{}`` on failure
    """
    expression_result = {}
    self.logger.info(
        Colored.blue(
            "[Liz2Bird-_parameter_expression_handler()]: 参数处理模板 =>{0}".
            format(expression)))
    try:
        # The expression contains a dot (URLs are excluded).
        if DOT in expression and 'http' not in expression:
            left, rights = expression.split(DOT, 1)
            if left == PROCESSES:
                # Process reference.
                # NOTE(review): only the first character of the remainder is
                # used as the key, so multi-character keys are truncated —
                # confirm this is intended.
                number = rights[0]
                process = self.queue.find(
                    PROCESSES_TOPIC.format(parameters['spider_name']),
                    way=WayType.HGET,
                    key=number)
                process_template = self.middlewares[2].middle_origin2engine(
                    process)[0]
                expression_result = HTTPProcessTemplate(**process_template)
            elif left == RESULTS:
                # Result reference, optionally followed by a parameter name.
                number, *parameter_name = rights.split(DOT)
                if not parameter_name:
                    result = self.queue.find(
                        RESULTS_TOPIC.format(parameters['spider_name']),
                        way=WayType.HGET,
                        key=number)
                    result_template = self.middlewares[
                        2].middle_origin2engine(result)[0]
                    expression_result = HTTPResultTemplate(**result_template)
                else:
                    value = parameters.get(parameter_name[0], None)
                    expression_result = {parameter_name[0]: value}
            elif left == DOLLAR:
                # Plain parameter reference.
                parameter_name = rights
                value = parameters.get(parameter_name, None)
                expression_result = {parameter_name: value}
        else:
            # Literal value: map the expression onto itself.
            expression_result = {expression: expression}
    except Exception as e:
        # Best effort: log and fall through to return what we have.  The old
        # ``raise e`` here never reached the caller because ``finally``
        # returned; returning after the try keeps that contract without also
        # swallowing KeyboardInterrupt/SystemExit.
        self.logger.error(Colored.red(
            "[Liz2Bird-_parameter_expression_handler()]: 出现处理错误>>>{0}".
            format(e)), exc_info=True)
    return expression_result
def _result_template_handler(self, result_template, data, parameters):
    """
    Evaluate every field expression of a result template.

    Fields are processed in ``field_dict`` order; each computed value is also
    written into a working copy of ``parameters`` so that later fields can
    refer to earlier ones.  Fields listed in the template's
    ``global_parameter`` string (split on ``AND``) are additionally committed
    to the queue.  Any exception is logged and stops processing; whatever was
    computed up to that point is still returned.

    :param result_template: the result template (its ``field_dict`` and
        ``global_parameter`` attributes are read)
    :param data: the data handed to ``format_data`` for extraction expressions
    :param parameters: mapping of current parameters (copied, not mutated)
    :return: a tuple ``(results, generator_results)`` - the plain field
        results, and the field results whose value is a list
    """
    temp_results = {}
    temp_generator_results = {}
    temp_parameters = {k: v for k, v in parameters.items()}
    try:
        # Process each field of the result template.
        field_dict = result_template.field_dict
        if result_template.global_parameter is not None and isinstance(
                result_template.global_parameter, str):
            global_parameters = [
                p.strip()
                for p in result_template.global_parameter.split(AND)
            ]
            self.logger.info(
                Colored.blue(
                    "[Liz2Bird-_result_template_handler()]: 全局参数>>>>{0}".
                    format(', '.join(global_parameters))))
        else:
            global_parameters = []
        for field, expression in field_dict.items():
            if field in EXCLUDE_FIELD:
                continue
            self.logger.info(
                Colored.blue(
                    "Liz2Bird-_result_template_handler()]: result 处理字典>>>> (<{0}::{1}>)"
                    .format(field, expression)))
            # Field name starts with an underscore: run the expression
            # directly through eval.
            # NOTE(review): eval() executes the template text as Python code —
            # confirm templates only come from trusted sources.
            if field.startswith(UNDER_LINE):
                expression_result = eval(expression)
            elif isinstance(expression, str):
                # Starts with "//": get the result via format_data.
                if expression.startswith(DIAGONAL):
                    expression_result = format_data(data, expression)
                # Contains "::": CSS extraction.
                elif COLON in expression:
                    expression_result = format_data(data, expression)
                # Contains ">": get the result via format_data.
                elif GREATER_THAN in expression:
                    expression_result = format_data(data, expression)
                # Contains "@": regex match against an earlier parameter.
                elif AT in expression:
                    exp, temp_data_field = expression.split(AT)
                    data_field, reg_idx = temp_data_field.split(HASH)
                    temp_data = temp_parameters.get(data_field)
                    if isinstance(temp_data, list):
                        # Apply the expression to every element of the list.
                        expression_result = []
                        for temp_da in temp_data:
                            expression_one = format_data(
                                temp_da, exp, reg_idx)
                            expression_result.append(expression_one)
                    else:
                        expression_result = format_data(
                            temp_data, exp, reg_idx)
                # Starts with "$": a field concatenated from parameters.
                elif expression.startswith(DOLLAR):
                    at_parameters_name = PARAMETER.findall(expression)
                    # Scalar-valued parameters vs. list-valued parameters.
                    one_parameters = {}
                    dict_parameter = {}
                    for at_parameter_name in at_parameters_name:
                        parameter_value = temp_parameters.get(
                            at_parameter_name, None)
                        if isinstance(parameter_value, list):
                            dict_parameter.setdefault(
                                at_parameter_name, parameter_value)
                        else:
                            one_parameters.setdefault(
                                at_parameter_name, parameter_value)
                    # Handle the format expression.
                    extend_parameters_group = []
                    one_round = 0
                    if len(dict_parameter) > 0:
                        for key, value_list in dict_parameter.items():
                            # Extend the parameter groups: an element whose
                            # index equals the current group count appends a
                            # new group (a copy of the scalar parameters);
                            # any other element updates the existing group at
                            # its index.
                            for index, value in enumerate(value_list):
                                if index == one_round:
                                    copy_one = deepcopy(one_parameters)
                                    copy_one.update({key: value})
                                    extend_parameters_group.append(
                                        copy_one)
                                    one_round += 1
                                else:
                                    extend_parameters_group[index].update(
                                        {key: value})
                    if len(extend_parameters_group) > 0:
                        # One formatted string per parameter group.
                        expression_result = [
                            expression.replace(
                                DOLLAR, "").format(**parameters_group)
                            for parameters_group in extend_parameters_group
                        ]
                    else:
                        expression_result = expression.replace(
                            DOLLAR, "").format(**one_parameters)
                elif LEFT_BRACE in expression and RIGHT_BRACE in expression:
                    # Contains {xxx} parameters: substitute, then evaluate.
                    at_parameters_name = PARAMETER.findall(expression)
                    total_parameters = {}
                    for at_parameter_name in at_parameters_name:
                        if at_parameter_name in global_parameters:
                            # Global parameters are read back from the queue.
                            site = HASH_MAP.format(
                                temp_parameters['site_name'])
                            parameter_value = self.queue.find(
                                "{site}:{spider}:{key}".format(
                                    site=site,
                                    spider=temp_parameters['spider_name'],
                                    key=at_parameter_name),
                                way=WayType.GET,
                                key=at_parameter_name)
                            if isinstance(parameter_value, bytes):
                                parameter_value = parameter_value.decode(
                                    'utf-8')
                            total_parameters.setdefault(
                                at_parameter_name, parameter_value)
                        else:
                            parameter_value = temp_parameters.get(
                                at_parameter_name, None)
                            total_parameters.setdefault(
                                at_parameter_name, parameter_value)
                    # NOTE(review): eval() on the formatted expression —
                    # parameter values become part of the evaluated code;
                    # confirm they are trusted.
                    expression_result = eval(
                        expression.format(**total_parameters))
                else:
                    # Otherwise return the original value unchanged.
                    expression_result = expression
            else:
                # Otherwise return the original value unchanged.
                expression_result = expression
            # Update the global parameters.
            if field in global_parameters:
                if isinstance(expression_result, str):
                    expression_result_ = expression_result.encode('utf-8')
                else:
                    expression_result_ = expression_result
                site = HASH_MAP.format(temp_parameters['site_name'])
                self.queue.commit_data("{site}:{spider}:{key}".format(
                    site=site,
                    spider=temp_parameters['spider_name'],
                    key=field),
                    key=field,
                    value=expression_result_,
                    way=WayType.SET)
                self.logger.info(
                    Colored.white(
                        "Liz2Bird-_result_template_handler()]: 提交全局参数>>>> (<{0}: {1}>)"
                        .format(field, expression_result)))
            # Update the template results: list values go to the generator
            # results, everything else to the plain results.
            if isinstance(expression_result, list):
                temp_generator_results.update({field: expression_result})
            else:
                temp_results.update({field: expression_result})
            # Make this field's value available to later expressions.
            temp_parameters.update({field: expression_result})
    except Exception as e:
        # Errors are logged, not raised; partial results are returned below.
        self.logger.error(Colored.red(
            "[Liz2Bird-_result_template_handler()]: 出现处理错误>>>{0}....(@ $ _ $ @)----"
            .format(e)), exc_info=True)
    return temp_results, temp_generator_results
def __str__(self):
    """Return a blue-colored label showing the process id and method."""
    label = '<Process [{0}]:[{1}]>'.format(self.process_id,
                                           self.process_method)
    return Colored.blue(label)