def fun_output_in_rdd_mapPartitions(iter_x, step_conf, pre_output_conf):
    """Merge each row with the configured statistic fields and upsert it into ES.

    For every row, the merged dict without the ``value.key`` field is
    JSON-serialised and hashed into the document id. The document with that
    id is fetched first; when it already exists, the stored value is added to
    the row's value before the document is (re)indexed.

    Args:
        iter_x: Iterable of rows exposing ``asDict()`` (Row, per the original
            docstring).
        step_conf: Not used by this function.
        pre_output_conf: Mapping providing 'index', 'type', 'serverPort.list'
            (comma-separated), 'value.key', and any number of 'statistic.*'
            entries that are copied into every document with the prefix
            stripped.

    Yields:
        The dict that was indexed for each row (including 'raw_data').
    """
    import json

    STATISTIC_INFO_PREFIX_KEY = 'statistic.'
    # Derive the slice offset from the prefix instead of a hard-coded 10.
    prefix_len = len(STATISTIC_INFO_PREFIX_KEY)
    statistic_items = [(k, v) for k, v in pre_output_conf.items()
                       if k.startswith(STATISTIC_INFO_PREFIX_KEY)]
    print('= = ' * 5, 'pre_output_conf = ', statistic_items)
    added_info_dict = {k[prefix_len:]: v for k, v in statistic_items}
    print('= = ' * 10, 'added_info_dict =', added_info_dict)

    idx = pre_output_conf['index']
    typ = pre_output_conf['type']
    es_host = pre_output_conf['serverPort.list'].split(',')
    es = EsConnectionPool.get_connection(es_host)
    value_key = pre_output_conf['value.key']

    for row in iter_x:
        obj = dict_merge(row.asDict(), added_info_dict)

        # The id is built from every field except the accumulated value, so
        # rows that differ only in value_key map to the same document.
        # NOTE(review): json.dumps is called without sort_keys, so the id
        # presumably depends on dict ordering being stable - confirm before
        # changing, since existing ids would shift.
        id_fields = {k: v for k, v in obj.items() if k != value_key}
        md5id = md5(json.dumps(id_fields))

        # Query first, then update. Typical response when the index is
        # missing:
        # {u'status': 404, u'error': u'IndexMissingException[[opkpi] missing]'}
        # NOTE(review): get-then-index is not atomic; concurrent writers to
        # the same id could lose an update - confirm whether that can happen.
        resp = es.get(index=idx, doc_type=typ, id=md5id, ignore=404)

        # Older Elasticsearch flags a hit with 'exists'; 1.0+ renamed the
        # field to 'found'. Accept both so accumulation works on either.
        if resp.get('exists') or resp.get('found'):
            obj[value_key] = obj[value_key] + resp['_source'][value_key]

        # Every path (existing doc, missing doc, missing index) stores
        # raw_data, so documents written while the index is still missing are
        # shaped like all the others.
        # NOTE(review): the missing-index case presumably relies on
        # Elasticsearch auto-creating the index on first write - confirm.
        obj['raw_data'] = json.dumps(obj)

        es.index(index=idx, doc_type=typ, id=md5id, body=obj)
        print('= = ' * 10
              + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ',
              obj)
        yield obj
def fun_output_in_rdd_mapPartitions(iter_x, step_conf, pre_output_conf):
    """Merge each row with the configured statistic fields and upsert it into ES.

    For every row, the merged dict without the ``value.key`` field is
    JSON-serialised and hashed into the document id. The document with that
    id is fetched first; when it already exists, the stored value is added to
    the row's value before the document is (re)indexed.

    Args:
        iter_x: Iterable of rows exposing ``asDict()`` (Row, per the original
            docstring).
        step_conf: Not used by this function.
        pre_output_conf: Mapping providing 'index', 'type', 'serverPort.list'
            (comma-separated), 'value.key', and any number of 'statistic.*'
            entries that are copied into every document with the prefix
            stripped.

    Yields:
        The dict that was indexed for each row (including 'raw_data').
    """
    import json

    STATISTIC_INFO_PREFIX_KEY = 'statistic.'
    # Derive the slice offset from the prefix instead of a hard-coded 10.
    prefix_len = len(STATISTIC_INFO_PREFIX_KEY)
    statistic_items = [(k, v) for k, v in pre_output_conf.items()
                       if k.startswith(STATISTIC_INFO_PREFIX_KEY)]
    print('= = ' * 5, 'pre_output_conf = ', statistic_items)
    added_info_dict = {k[prefix_len:]: v for k, v in statistic_items}
    print('= = ' * 10, 'added_info_dict =', added_info_dict)

    idx = pre_output_conf['index']
    typ = pre_output_conf['type']
    es_host = pre_output_conf['serverPort.list'].split(',')
    es = EsConnectionPool.get_connection(es_host)
    value_key = pre_output_conf['value.key']

    for row in iter_x:
        obj = dict_merge(row.asDict(), added_info_dict)

        # The id is built from every field except the accumulated value, so
        # rows that differ only in value_key map to the same document.
        # NOTE(review): json.dumps is called without sort_keys, so the id
        # presumably depends on dict ordering being stable - confirm before
        # changing, since existing ids would shift.
        id_fields = {k: v for k, v in obj.items() if k != value_key}
        md5id = md5(json.dumps(id_fields))

        # Query first, then update. Typical response when the index is
        # missing:
        # {u'status': 404, u'error': u'IndexMissingException[[opkpi] missing]'}
        # NOTE(review): get-then-index is not atomic; concurrent writers to
        # the same id could lose an update - confirm whether that can happen.
        resp = es.get(index=idx, doc_type=typ, id=md5id, ignore=404)

        # Older Elasticsearch flags a hit with 'exists'; 1.0+ renamed the
        # field to 'found'. Accept both so accumulation works on either.
        if resp.get('exists') or resp.get('found'):
            obj[value_key] = obj[value_key] + resp['_source'][value_key]

        # Every path (existing doc, missing doc, missing index) stores
        # raw_data, so documents written while the index is still missing are
        # shaped like all the others.
        # NOTE(review): the missing-index case presumably relies on
        # Elasticsearch auto-creating the index on first write - confirm.
        obj['raw_data'] = json.dumps(obj)

        es.index(index=idx, doc_type=typ, id=md5id, body=obj)
        print('= = ' * 10
              + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ',
              obj)
        yield obj
def fun_output_in_rdd_mapPartitions_new(iter_x, step_conf, pre_output_conf):
    """Describe, for each row, the Elasticsearch write to be performed later.

    Nothing is sent to Elasticsearch here. Per the original note, the yielded
    structures are meant to be collected back to the driver and executed
    there in one place.

    Input: Row
    Output: obj_dict

    ``step_conf`` is not used by this function.
    """
    STATISTIC_INFO_PREFIX_KEY = 'statistic.'
    prefix_len = len(STATISTIC_INFO_PREFIX_KEY)

    # Collect the 'statistic.*' settings once; they are logged as-is and then
    # re-keyed without the prefix.
    statistic_pairs = [(name, val) for name, val in pre_output_conf.iteritems()
                       if name.startswith(STATISTIC_INFO_PREFIX_KEY)]
    print('= = ' * 5, 'pre_output_conf = ', statistic_pairs)
    added_info_dict = dict((name[prefix_len:], val) for name, val in statistic_pairs)
    print('= = ' * 10, 'added_info_dict =', added_info_dict)

    target_index = pre_output_conf['index']
    target_type = pre_output_conf['type']
    value_key = pre_output_conf['value.key']

    for row in iter_x:
        merged = dict_merge(row.asDict(), added_info_dict)

        # Everything except the value field; the key name says this is the
        # raw material for the document id.
        id_fields = {}
        for field, val in merged.iteritems():
            if field != value_key:
                id_fields[field] = val

        # NOTE(review): 'doct_type' looks like a misspelling of 'doc_type';
        # kept verbatim because whatever consumes these dicts is outside this
        # view - confirm before renaming.
        descriptor = {
            'serverPort.list': pre_output_conf['serverPort.list'],
            'index': target_index,
            'doct_type': target_type,
            'id_raw_data_without_value_key': id_fields,
            'value_key': value_key,
            'body': merged,
        }

        # The log tag names the non-"_new" function; reproduced unchanged.
        print('= = ' * 10
              + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ',
              merged)
        print('= = ' * 10
              + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] ret = ',
              descriptor)
        yield descriptor
def fun_output_in_rdd_mapPartitions_new(iter_x, step_conf, pre_output_conf):
    """Describe, for each row, the Elasticsearch write to be performed later.

    Nothing is sent to Elasticsearch here. Per the original note, the yielded
    structures are meant to be collected back to the driver and executed
    there in one place.

    Input: Row
    Output: obj_dict

    ``step_conf`` is not used by this function.
    """
    STATISTIC_INFO_PREFIX_KEY = 'statistic.'
    prefix_len = len(STATISTIC_INFO_PREFIX_KEY)

    # Collect the 'statistic.*' settings once; they are logged as-is and then
    # re-keyed without the prefix.
    statistic_pairs = [(name, val) for name, val in pre_output_conf.iteritems()
                       if name.startswith(STATISTIC_INFO_PREFIX_KEY)]
    print('= = ' * 5, 'pre_output_conf = ', statistic_pairs)
    added_info_dict = dict((name[prefix_len:], val) for name, val in statistic_pairs)
    print('= = ' * 10, 'added_info_dict =', added_info_dict)

    target_index = pre_output_conf['index']
    target_type = pre_output_conf['type']
    value_key = pre_output_conf['value.key']

    for row in iter_x:
        merged = dict_merge(row.asDict(), added_info_dict)

        # Everything except the value field; the key name says this is the
        # raw material for the document id.
        id_fields = {}
        for field, val in merged.iteritems():
            if field != value_key:
                id_fields[field] = val

        # NOTE(review): 'doct_type' looks like a misspelling of 'doc_type';
        # kept verbatim because whatever consumes these dicts is outside this
        # view - confirm before renaming.
        descriptor = {
            'serverPort.list': pre_output_conf['serverPort.list'],
            'index': target_index,
            'doct_type': target_type,
            'id_raw_data_without_value_key': id_fields,
            'value_key': value_key,
            'body': merged,
        }

        # The log tag names the non-"_new" function; reproduced unchanged.
        print('= = ' * 10
              + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ',
              merged)
        print('= = ' * 10
              + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] ret = ',
              descriptor)
        yield descriptor