Exemplo n.º 1
0
    def fun_output_in_rdd_mapPartitions(iter_x, step_conf, pre_output_conf):
        """
        Merge every input Row with the configured statistic fields, write the
        result through the ``es`` connection and yield the written document.

        The function is a generator: nothing runs until it is iterated.

        Input: Row (each element of ``iter_x`` must provide ``asDict()``)
        Output: obj_dict (the dict handed to ``es.index``; it always carries
        a ``'raw_data'`` entry holding its own JSON dump)

        Args:
            iter_x: iterable of rows.
            step_conf: not used by this function.
            pre_output_conf: mapping read for the keys ``'index'``,
                ``'type'``, ``'serverPort.list'`` (comma separated) and
                ``'value.key'``. Entries whose key starts with
                ``'statistic.'`` are copied, prefix removed, into every
                document.

        Raises:
            KeyError: when one of the keys above is missing (assuming
                ``pre_output_conf`` behaves like a dict), or when a merged
                row lacks the ``value.key`` field while a stored document
                exists.
        """
        import json

        STATISTIC_INFO_PREFIX_KEY = 'statistic.'
        # Slice by the real prefix length instead of a hard-coded 10 so the
        # two cannot drift apart if the prefix is ever changed.
        prefix_len = len(STATISTIC_INFO_PREFIX_KEY)

        print('= = ' * 5, 'pre_output_conf = ',
              [(k, v) for k, v in pre_output_conf.iteritems()
               if k.startswith(STATISTIC_INFO_PREFIX_KEY)])
        added_info_dict = dict([(k[prefix_len:], v)
                                for k, v in pre_output_conf.iteritems()
                                if k.startswith(STATISTIC_INFO_PREFIX_KEY)])

        print('= = ' * 10, 'added_info_dict =', added_info_dict)

        idx = pre_output_conf['index']
        typ = pre_output_conf['type']
        es_host = pre_output_conf['serverPort.list'].split(',')
        es = EsConnectionPool.get_connection(es_host)

        # Name of the field whose value is accumulated; it is excluded from
        # the document id.
        value_key = pre_output_conf['value.key']

        for row in iter_x:
            # The value_key field of obj may be updated further down.
            obj = dict_merge(row.asDict(), added_info_dict)

            # The id is derived from every field except value_key.
            # NOTE(review): json.dumps is called without sort_keys, so the id
            # depends on dict iteration order; left unchanged because
            # altering it would change the ids of documents already stored.
            raw_data_tmp = dict([(k, v) for k, v in obj.iteritems()
                                 if k != value_key])
            md5id = md5(json.dumps(raw_data_tmp))

            # Query first, then update.
            resp = es.get(index=idx, doc_type=typ, id=md5id, ignore=404)
            # A commonly seen response when the index does not exist:
            # {u'status': 404, u'error': u'IndexMissingException[[opkpi] missing]'}

            # TODO: create the index explicitly when status == 404; for now
            # that case is simply written through es.index below.
            # NOTE(review): newer Elasticsearch versions may report 'found'
            # instead of 'exists' -- confirm against the deployed cluster.
            if resp.get('status') != 404 and resp.get('exists'):
                # Accumulate onto the value that is already stored.
                obj[value_key] = obj[value_key] + resp['_source'][value_key]

            # Fix: raw_data used to be skipped on the 404 path, so that
            # document was indexed and yielded without it. It is now set on
            # every path, after value_key has its final value.
            obj['raw_data'] = json.dumps(obj)

            es.index(index=idx, doc_type=typ, id=md5id, body=obj)

            print(
                '= = ' * 10 +
                '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ',
                obj)

            yield obj
Exemplo n.º 2
0
    def fun_output_in_rdd_mapPartitions(iter_x, step_conf, pre_output_conf):
        """
        Generator that combines each Row with the ``statistic.*`` settings,
        indexes the combined document via ``es.index`` and yields it.

        Input: Row (anything exposing ``asDict()``)
        Output: obj_dict, always including a ``'raw_data'`` JSON dump of the
        document as it stood just before that key was added.

        Args:
            iter_x: iterable of rows.
            step_conf: unused here.
            pre_output_conf: mapping supplying ``'index'``, ``'type'``,
                ``'serverPort.list'`` (split on ``','``) and ``'value.key'``;
                keys beginning with ``'statistic.'`` contribute extra fields
                (prefix stripped) to every document.

        Raises:
            KeyError: if a required key is absent (assuming dict-like
                ``pre_output_conf``), or if the merged row has no
                ``value.key`` field when a stored document exists.
        """
        import json

        STATISTIC_INFO_PREFIX_KEY = 'statistic.'
        # Derived from the prefix itself rather than the literal 10.
        prefix_len = len(STATISTIC_INFO_PREFIX_KEY)

        print('= = ' * 5, 'pre_output_conf = ', [(k, v) for k, v in pre_output_conf.iteritems()
                                                 if k.startswith(STATISTIC_INFO_PREFIX_KEY)])
        added_info_dict = dict([(k[prefix_len:], v) for k, v in pre_output_conf.iteritems()
                                if k.startswith(STATISTIC_INFO_PREFIX_KEY)])

        print('= = ' * 10, 'added_info_dict =', added_info_dict)

        idx = pre_output_conf['index']
        typ = pre_output_conf['type']
        es_host = pre_output_conf['serverPort.list'].split(',')
        es = EsConnectionPool.get_connection(es_host)

        # Field holding the accumulated value; left out of the id.
        value_key = pre_output_conf['value.key']

        for row in iter_x:
            obj = dict_merge(row.asDict(), added_info_dict)  # value_key may be updated below

            # Build the id from all fields except value_key.
            # NOTE(review): no sort_keys here, so the id follows dict iteration order;
            # kept as is because changing it would alter ids of already stored documents.
            raw_data_tmp = dict([(k, v) for k, v in obj.iteritems() if k != value_key])
            md5id = md5(json.dumps(raw_data_tmp))

            # Query first, then update.
            resp = es.get(index=idx, doc_type=typ, id=md5id, ignore=404)
            # A commonly seen response when the index is missing:
            # {u'status': 404, u'error': u'IndexMissingException[[opkpi] missing]'}

            # TODO: create the index explicitly on status == 404; that case currently
            # just falls through to es.index below.
            # NOTE(review): newer Elasticsearch versions may use 'found' instead of
            # 'exists' -- confirm against the deployed cluster.
            already_stored = resp.get('status') != 404 and resp.get('exists')
            if already_stored:
                # Add the stored value to the incoming one.
                obj[value_key] = obj[value_key] + resp['_source'][value_key]

            # Fix: the 404 path previously skipped raw_data, producing a document
            # without it; raw_data is now attached on every path.
            obj['raw_data'] = json.dumps(obj)

            es.index(index=idx, doc_type=typ, id=md5id, body=obj)

            print('= = ' * 10 + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ', obj)

            yield obj
Exemplo n.º 3
0
    def fun_output_in_rdd_mapPartitions_new(iter_x, step_conf,
                                            pre_output_conf):
        """
        Build the structure to be written to es for every row; per the
        original note, the results are meant to be collected back to the
        driver and executed there in one place.

        Input: Row
        Output: dict with the keys 'serverPort.list', 'index', 'doct_type',
        'id_raw_data_without_value_key', 'value_key' and 'body'

        ``step_conf`` is not used by this function.
        """
        STATISTIC_INFO_PREFIX_KEY = 'statistic.'
        prefix_len = len(STATISTIC_INFO_PREFIX_KEY)

        # Configuration entries that carry extra statistic fields.
        stat_pairs = [(name, val)
                      for name, val in pre_output_conf.iteritems()
                      if name.startswith(STATISTIC_INFO_PREFIX_KEY)]
        print('= = ' * 5, 'pre_output_conf = ', stat_pairs)

        # Same entries with the prefix removed from the key.
        added_info_dict = dict([(name[prefix_len:], val)
                                for name, val in stat_pairs])
        print('= = ' * 10, 'added_info_dict =', added_info_dict)

        es_index = pre_output_conf['index']
        es_type = pre_output_conf['type']
        value_key = pre_output_conf['value.key']

        banner = '= = ' * 10

        for row in iter_x:
            body = dict_merge(row.asDict(), added_info_dict)

            # Everything except value_key; used later to construct the id.
            id_fields = dict([(field, val)
                              for field, val in body.iteritems()
                              if field != value_key])

            # Filled key by key, in this order, so the dict is built exactly
            # as before.
            ret = {}
            ret['serverPort.list'] = pre_output_conf['serverPort.list']
            ret['index'] = es_index
            ret['doct_type'] = es_type
            ret['id_raw_data_without_value_key'] = id_fields
            ret['value_key'] = value_key
            ret['body'] = body

            print(
                banner +
                '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ',
                body)
            print(
                banner +
                '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] ret = ',
                ret)

            yield ret
Exemplo n.º 4
0
    def fun_output_in_rdd_mapPartitions_new(iter_x, step_conf, pre_output_conf):
        """
        Generator producing, per row, a description of the es write to perform
        (per the original note: collected to the driver, then executed together).

        Input: Row
        Output: dict holding 'serverPort.list', 'index', 'doct_type',
        'id_raw_data_without_value_key', 'value_key' and 'body'

        ``step_conf`` is accepted but not read.
        """
        STATISTIC_INFO_PREFIX_KEY = 'statistic.'
        strip_at = len(STATISTIC_INFO_PREFIX_KEY)

        def _is_statistic(conf_key):
            # True for configuration keys carrying an extra statistic field.
            return conf_key.startswith(STATISTIC_INFO_PREFIX_KEY)

        print('= = ' * 5, 'pre_output_conf = ',
              [(ck, cv) for ck, cv in pre_output_conf.iteritems() if _is_statistic(ck)])

        # Statistic entries keyed by their name without the prefix.
        added_info_dict = {}
        for ck, cv in pre_output_conf.iteritems():
            if _is_statistic(ck):
                added_info_dict[ck[strip_at:]] = cv

        print('= = ' * 10, 'added_info_dict =', added_info_dict)

        target_index = pre_output_conf['index']
        target_type = pre_output_conf['type']
        value_key = pre_output_conf['value.key']

        for row in iter_x:
            ret = {}
            merged = dict_merge(row.asDict(), added_info_dict)

            # Drop value_key; the remaining fields are what the id is built from.
            without_value = {}
            for field, val in merged.iteritems():
                if field != value_key:
                    without_value[field] = val

            # Keys are assigned one at a time in the original order.
            ret['serverPort.list'] = pre_output_conf['serverPort.list']
            ret['index'] = target_index
            ret['doct_type'] = target_type
            ret['id_raw_data_without_value_key'] = without_value
            ret['value_key'] = value_key
            ret['body'] = merged

            print('= = ' * 10 + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] obj = ', merged)
            print('= = ' * 10 + '[myapp ESWriter.output.fun_output_in_rdd_mapPartitions] ret = ', ret)

            yield ret