Python HBaseUtils 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: server.utils

클래스/타입: HBaseUtils

hotexamples.com에서의 예제들: 11

Python HBaseUtils - 11개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 server.utils.HBaseUtils에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

get_table_row(8)

get_table_cells(7)

HBaseUtils(4)

get_table_put(3)

get_table_delete(2)

예제 #1

파일 보기

파일: recall_service.py 프로젝트: nameli0722/git-

class ReadRecall(object):
    """Read recall (candidate article) results from HBase and Redis.

    The Redis client, the HBase connection pool, ``HBaseUtils``, ``logger``
    and ``datetime`` are expected to be provided by the enclosing module.
    """

    def __init__(self):
        self.client = redis_client
        self.hbu = HBaseUtils(pool)
        # Upper bound on the number of ids read_redis_hot_article() returns.
        self.hot_num = 10

    def read_hbase_recall(self, table_name, key_format, column_format):
        """Read, merge and then delete the recall results stored in one column.

        Every cell returned by ``get_table_cells`` is evaluated to an iterable
        of ids and the ids are merged without duplicates. After a successful
        read the cells are removed with ``get_table_delete``.

        :param table_name: HBase table to read from.
        :param key_format: row key, passed through to ``HBaseUtils``.
        :param column_format: column, passed through to ``HBaseUtils``.
        :return: list of distinct ids in unspecified order. If an exception
            occurs, whatever was merged before the failure (possibly nothing)
            is returned and the failure is logged as a warning.
        """
        merged = set()
        try:
            cells = self.hbu.get_table_cells(table_name, key_format,
                                             column_format)
            for cell in cells:
                # SECURITY: eval() executes whatever expression is stored in
                # the cell. Only acceptable while the table is written solely
                # by trusted jobs; consider ast.literal_eval on decoded text.
                merged.update(eval(cell))

            # The recall results are consumed once: drop them after reading.
            self.hbu.get_table_delete(table_name, key_format, column_format)
        except Exception as e:
            logger.warning("{} WARN read {} recall exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), table_name, e))
        return list(merged)

    def read_redis_new_article(self, channel_id):
        """Read the ids stored in the ``ch:<channel_id>:new`` sorted set.

        :param channel_id: channel whose new articles are requested.
        :return: list of ids as ``int`` in the order Redis returns them for
            ``zrevrange(key, 0, -1)``; empty if the Redis call fails.
        """
        # Routine trace message: logged at INFO level to match its text.
        logger.info("{} INFO read channel {} redis new article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        key = 'ch:{}:new'.format(channel_id)
        try:
            reco_list = self.client.zrevrange(key, 0, -1)
        except Exception as e:
            logger.warning("{} WARN read new article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            reco_list = []

        return list(map(int, reco_list))

    def read_redis_hot_article(self, channel_id):
        """Read at most ``self.hot_num`` ids from ``ch:<channel_id>:hot``.

        :param channel_id: channel whose hot articles are requested.
        :return: the first ``self.hot_num`` ids (as ``int``) of
            ``zrevrange(key, 0, -1)``; empty if the Redis call fails.
        """
        # Routine trace message: logged at INFO level to match its text.
        logger.info("{} INFO read channel {} redis hot article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        _key = "ch:{}:hot".format(channel_id)
        try:
            res = self.client.zrevrange(_key, 0, -1)
        except Exception as e:
            # Message now names the hot-article read it actually reports on.
            logger.warning("{} WARN read hot article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []

        # A channel can hold many hot articles; keep only the leading ones.
        return list(map(int, res))[:self.hot_num]


# if __name__ == '__main__':
# rr = ReadRecall()
# print(rr.read_hbase_recall('cb_recall', b'recall:user:1115629498121846784', b'als:18'))
# print(rr.read_redis_new_article(18))

예제 #2

파일 보기

파일: recall_service.py 프로젝트: nameli0722/git-

 def __init__(self):
     """Store the shared Redis client, an HBase helper and the hot-article cap."""
     # presumably the maximum number of hot articles to return; confirm
     # against the methods that read self.hot_num
     self.hot_num = 10
     # HBase helper built on the module-level connection pool
     self.hbu = HBaseUtils(pool)
     # module-level Redis client
     self.client = redis_client

예제 #3

파일 보기

 def __init__(self):
     """Create the HBase helper and the recall reader used by this object."""
     # Construction order is kept: HBase helper first, recall reader second.
     hbase_helper = HBaseUtils(pool)
     recall_reader = ReadRecall()
     self.hbu = hbase_helper
     self.recall_service = recall_reader

예제 #4

파일 보기

파일: sort_service.py 프로젝트: meelement/Headline_Recommendation

def wdl_sort_service():
    """Send a fixed demo batch to the Wide&Deep model and print the reply.

    The function takes no arguments: the user id, channel id and article ids
    are hard-coded. It reads the user's feature list from the
    ``ctr_feature_user`` HBase table, builds one ``tf.train.Example`` per
    article from ``ctr_feature_article`` and sends them in a single
    classification request to the ``wdl`` model at ``127.0.0.1:8500``.
    Nothing is sent when the user features are missing or empty.

    :return: always ``None``; the prediction response is only printed because
        its structure had not been inspected when this was written.
    """
    hbu = HBaseUtils(pool)

    # 1. Read the user's features from the feature store.
    try:
        # SECURITY: eval() executes whatever expression is stored in HBase;
        # only acceptable while the table is written solely by trusted jobs.
        user_feature = eval(hbu.get_table_row('ctr_feature_user',
                                              '{}'.format(1115629498121846784).encode(),
                                              'channel:{}'.format(18).encode()))
    except Exception:
        # Best effort: any read/parse failure is treated as "no features".
        user_feature = []

    if user_feature:
        # Same reduction as at training time (mean of the user weights).
        # It does not depend on the article, so compute it once.
        user_weights = np.mean(user_feature)

        # 2. Build one example per article from the article feature store.
        examples = []
        for article_id in [17749, 17748, 44371, 44368]:
            try:
                article_feature = eval(hbu.get_table_row('ctr_feature_article',
                                                         '{}'.format(article_id).encode(),
                                                         'article:{}'.format(article_id).encode()))
            except Exception:
                # Fall back to an all-zero feature row of the expected length.
                article_feature = [0.0] * 111

            # article_feature layout: [channel, 10 weights, 100-dim vector].
            # The example must use the same fields and the same averaging as
            # the training samples built in wide_and_deep.py.
            channel_id = int(article_feature[0])
            vector = np.mean(article_feature[11:])
            article_weights = np.mean(article_feature[1:11])

            # One Example per (user, article) pair.
            example = tf.train.Example(features=tf.train.Features(feature={
                "channel_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[channel_id])),
                "vector": tf.train.Feature(float_list=tf.train.FloatList(value=[vector])),
                'user_weights': tf.train.Feature(float_list=tf.train.FloatList(value=[user_weights])),
                'article_weights': tf.train.Feature(float_list=tf.train.FloatList(value=[article_weights])),
            }))

            examples.append(example)

        # 3. Call the TensorFlow Serving model with all examples at once.
        with grpc.insecure_channel("127.0.0.1:8500") as channel:
            stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

            request = classification_pb2.ClassificationRequest()
            request.model_spec.name = 'wdl'  # model name
            request.input.example_list.examples.extend(examples)

            # Second positional argument is the call timeout (10.0).
            response = stub.Classify(request, 10.0)
            print(response)

    # The predictions should eventually be returned; None is a placeholder.
    return None

예제 #5

파일 보기

class ReadRecall(object):
    """Read recall (candidate article) results from HBase and Redis.

    The Redis client, the HBase connection pool, ``HBaseUtils``, ``logger``
    and ``datetime`` are expected to be provided by the enclosing module.
    """

    def __init__(self):
        self.client = redis_client
        self.hbu = HBaseUtils(pool)

    def read_hbase_recall_data(self, table_name, key_format, column_format):
        """Read, merge and then delete the recall results stored in one column.

        According to the original author's notes, the online content recall,
        offline content recall and offline ALS recall all live in the
        multi-version, expiring ``cb_recall`` table and must be deleted once
        they have been fetched for merging.

        :param table_name: HBase table to read from.
        :param key_format: row key, passed through to ``HBaseUtils``.
        :param column_format: column, passed through to ``HBaseUtils``.
        :return: list of distinct ids in unspecified order. If an exception
            occurs, whatever was merged before the failure (possibly nothing)
            is returned and the failure is logged as a warning.
        """
        merged = set()
        try:
            # All stored versions of the cell.
            cells = self.hbu.get_table_cells(table_name, key_format,
                                             column_format)
            for cell in cells:
                # SECURITY: eval() executes whatever expression is stored in
                # the cell. Only acceptable while the table is written solely
                # by trusted jobs; consider ast.literal_eval on decoded text.
                merged.update(eval(cell))

            # Delete the consumed recall results (comment out while testing).
            self.hbu.get_table_delete(table_name, key_format, column_format)
        except Exception as e:
            logger.warning("{} WARN read {} recall exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), table_name, e))

        return list(merged)

    def read_redis_new_article(self, channel_id):
        """Read the ids stored in the ``ch:<channel_id>:new`` sorted set.

        :param channel_id: channel whose new articles are requested.
        :return: list of ids as ``int`` in the order Redis returns them for
            ``zrevrange(key, 0, -1)``; empty if the Redis call fails.
        """
        logger.info("{} INFO read channel {} redis new article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        key = 'ch:{}:new'.format(channel_id)
        try:
            reco_list = self.client.zrevrange(key, 0, -1)
        except Exception as e:
            logger.warning("{} WARN read new article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            reco_list = []

        return list(map(int, reco_list))

    def read_redis_hot_article(self, channel_id, hot_num=10):
        """Read at most ``hot_num`` ids from ``ch:<channel_id>:hot``.

        :param channel_id: channel whose hot articles are requested.
        :param hot_num: maximum number of ids to return.
        :return: the first ``hot_num`` ids (as ``int``) of
            ``zrevrange(key, 0, -1)``; empty if the Redis call fails.
        """
        logger.info("{} INFO read channel {} redis hot article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        _key = "ch:{}:hot".format(channel_id)
        try:
            res = self.client.zrevrange(_key, 0, -1)
        except Exception as e:
            # Message now names the hot-article read it actually reports on.
            logger.warning("{} WARN read hot article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []

        # A channel can hold many hot articles; keep only the leading ones.
        return list(map(int, res))[:hot_num]

    def read_hbase_article_similar(self,
                                   table_name,
                                   key_format,
                                   article_num=10):
        """Return the ids of the articles most similar to one article.

        The row is expected to map columns named ``<family>:<article_id>`` to
        a similarity score, e.g. (HBase shell)::

            create 'article_similar', 'similar'
            put 'article_similar', '1', 'similar:1', 0.2
            put 'article_similar', '1', 'similar:2', 0.34

        :param table_name: HBase table holding the similarity rows.
        :param key_format: row key of the reference article.
        :param article_num: maximum number of ids to return.
        :return: article ids ordered by descending score; empty list if the
            row cannot be read or parsed (the error is logged).
        """
        try:
            row = self.hbu.get_table_row(table_name, key_format)

            # Scores are compared numerically. Comparing the raw stored
            # values would order byte strings lexicographically, which ranks
            # e.g. b'1e-05' above b'0.9'.
            ranked = sorted(row.items(),
                            key=lambda item: float(item[1]),
                            reverse=True)
            res = [int(column.decode().split(':')[1])
                   for column, _score in ranked[:article_num]]
        except Exception as e:
            logger.error("{} ERROR read similar article exception: {}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []

        return res


# 测试：
# if __name__ == '__main__':
#     rr = ReadRecall()
#     print("离线ALS模型召回" + str(rr.read_hbase_recall_data('cb_recall', b'recall:user:2', b'als:18')))
#     print("离线内容召回" + str(rr.read_hbase_recall_data('cb_recall', b'recall:user:2', b'content:18')))
#     print("在线内容召回" + str(rr.read_hbase_recall_data('cb_recall', b'recall:user:2', b'online:18')))
#
#     print("在线新文章召回" + str(rr.read_redis_new_article(18)))
#     print("在线热门文章召回" + str(rr.read_redis_hot_article(18)))
#
#     print(rr.read_hbase_article_similar('article_similar', b'1', 10))

예제 #6

파일 보기

class RecoCenter(object):
    """Recommendation center.

    1. Decides from the request timestamp whether the user is refreshing the
       feed or paging back through history.
    2. Orchestrates recall, sorting and caching for a refresh.
    """

    def __init__(self):
        self.hbu = HBaseUtils(pool)
        self.recall_service = ReadRecall()

    def feed_recommend_time_stamp_logic(self, temp):
        """Serve a feed request as either a refresh or a history page.

        :param temp: request parameters passed in by the AB-test layer. This
            method reads ``user_id``, ``channel_id`` and ``time_stamp`` and
            overwrites ``time_stamp`` with the value to hand back to the
            client.
        :return: the track structure built by ``add_track``; for a history
            page its ``'param'`` entry is set to ``''``.
        """
        # 1. Timestamp of the user's most recent history record. Any failure
        #    (e.g. no history yet) is treated as "no history" (0).
        try:
            last_stamp = self.hbu.get_table_row('history_recommend1',
                                                'reco:his:{}'.format(temp.user_id).encode(),
                                                'channel:{}'.format(temp.channel_id).encode(),
                                                include_timestamp=True)[1]
            logger.info("{} INFO get user_id:{} channel:{} history last_stamp".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id, temp.channel_id))
        except Exception as e:
            logger.info("{} INFO get user_id:{} channel:{} history last_stamp, exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id, temp.channel_id, e))
            last_stamp = 0

        if last_stamp < temp.time_stamp:
            # 2. Request is newer than the last record: a refresh.
            #    Try the cache first, fall back to recall + sort.
            res = get_cache_from_redis_hbase(temp, self.hbu)
            if not res:
                logger.info("{} INFO cache is null get user_id:{} channel:{} recall/sort data".
                            format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id, temp.channel_id))
                res = self.user_reco_list(temp)

            # The client receives the previous record's timestamp.
            temp.time_stamp = last_stamp
            _track = add_track(res, temp)
        else:
            # 3. Request is not newer than the last record: history paging.
            logger.info("{} INFO read user_id:{} channel:{} history recommend data".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id, temp.channel_id))
            try:
                # Query with time_stamp + 1 so that the record stamped exactly
                # time_stamp is included (per the original author's note).
                # Rows are (value, timestamp) pairs, e.g.
                #   (b'[15140, 16421, ...]', 1559148615353)
                row = self.hbu.get_table_cells('history_recommend1',
                                               'reco:his:{}'.format(temp.user_id).encode(),
                                               'channel:{}'.format(temp.channel_id).encode(),
                                               timestamp=temp.time_stamp + 1,
                                               include_timestamp=True)
            except Exception as e:
                logger.warning("{} WARN read history recommend exception:{}".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
                row = []

            # Stored (still encoded) article list to return, if any.
            raw = None
            if not row:
                # 3.1 No history at all: empty result, timestamp 0.
                temp.time_stamp = 0
            elif len(row) >= 2:
                # 3.2 Several records: return the first one and hand back the
                #     second record's timestamp for the next history request.
                raw = row[0][0]
                temp.time_stamp = int(row[1][1])
            elif row[0][1] == temp.time_stamp:
                # 3.3 Exactly the requested record and nothing after it:
                #     return it and signal the end of history with 0.
                raw = row[0][0]
                temp.time_stamp = 0
            else:
                # NOTE(review): a single record whose timestamp differs from
                # the request previously left the result unbound and crashed.
                # It is now treated as "no history"; confirm this is the
                # desired behaviour.
                temp.time_stamp = 0

            # Decode only when there is something stored: eval() rejects a
            # list, so the empty case must not go through it.
            # SECURITY: eval() executes whatever expression is stored in
            # HBase; only acceptable while the table is written by this
            # service alone.
            res = list(map(int, eval(raw))) if raw else []

            logger.info(
                "{} INFO history:{}, {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), res, temp.time_stamp))

            _track = add_track(res, temp)
            _track['param'] = ''

        return _track

    def user_reco_list(self, temp):
        """Build a fresh recommendation list for a pull-to-refresh.

        1. Merge the recall sources configured for ``temp.algo``.
        2. Remove everything already recommended in this channel and in
           channel 0.
        3. Sort, return up to ``temp.article_num`` ids, store them as history
           and store the remainder as pending recommendations.

        :param temp: request parameters; reads ``algo``, ``user_id``,
            ``channel_id``, ``article_num`` and ``time_stamp``.
        :return: list of recommended ids; empty when nothing is left after
            filtering (in which case nothing is written to HBase).
        """
        # 1. Merge all recall sources, e.g. (1, [100, 101, 102, 103, 104], []).
        reco_set = []
        for number in RAParam.COMBINE[temp.algo][1]:
            if number == 103:
                _res = self.recall_service.read_redis_new_article(temp.channel_id)
            elif number == 104:
                _res = self.recall_service.read_redis_hot_article(temp.channel_id)
            else:
                # Remaining sources are read from their HBase recall table.
                _res = self.recall_service.read_hbase_recall_data(RAParam.RECALL[number][0],
                                                                  'recall:user:{}'.format(temp.user_id).encode(),
                                                                  '{}:{}'.format(RAParam.RECALL[number][1],
                                                                                 temp.channel_id).encode())
            reco_set = list(set(reco_set).union(set(_res)))

        # 2. Collect the history of the requested channel and of channel 0 so
        #    that other channels do not repeat channel-0 recommendations.
        #    A failure on one channel is logged and does not stop the other.
        history_list = []
        for channel in (temp.channel_id, 0):
            try:
                data = self.hbu.get_table_cells('history_recommend1',
                                                'reco:his:{}'.format(temp.user_id).encode(),
                                                'channel:{}'.format(channel).encode())

                for _ in data:
                    # SECURITY: eval() on stored data, see note above.
                    history_list = list(set(history_list).union(set(eval(_))))

                logger.info("{} INFO read user_id:{} channel_id:{} history data".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id, channel))
            except Exception as e:
                logger.warning(
                    "{} WARN filter history article exception:{}".format(datetime.now().
                                                                         strftime('%Y-%m-%d %H:%M:%S'), e))

        # 3. Filter out what the user has already seen.
        reco_set = list(set(reco_set).difference(set(history_list)))
        print("召回数据：", reco_set)
        logger.info(
            "{} INFO after filter history:{}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), reco_set))

        # Nothing left: return immediately, no sorting and no writes.
        if not reco_set:
            return reco_set

        # Sort with the model configured for this algorithm combination.
        _sort_num = RAParam.COMBINE[temp.algo][2][0]
        reco_set = sort_dict[RAParam.SORT[_sort_num]](reco_set, temp, self.hbu)
        print("排序数据：", reco_set)

        if len(reco_set) <= temp.article_num:
            # Not more than requested: recommend everything.
            res = reco_set
        else:
            res = reco_set[:temp.article_num]

            # Keep the surplus as pending recommendations.
            self.hbu.get_table_put('wait_recommend',
                                   'reco:{}'.format(temp.user_id).encode(),
                                   'channel:{}'.format(temp.channel_id).encode(),
                                   str(reco_set[temp.article_num:]).encode(),
                                   timestamp=temp.time_stamp)

        # Record what is being recommended now as history.
        self.hbu.get_table_put('history_recommend1',
                               'reco:his:{}'.format(temp.user_id).encode(),
                               'channel:{}'.format(temp.channel_id).encode(),
                               str(res).encode(),
                               timestamp=temp.time_stamp)

        return res

예제 #7

파일 보기

파일: reco_center.py 프로젝트: meelement/Headline_Recommendation

class RecoCenter(object):
    """Recommendation center.

    1. Decides from the request timestamp whether the user is refreshing the
       feed or paging back through history.
    2. Orchestrates recall, sorting and caching for a refresh.
    """

    def __init__(self):
        self.hbu = HBaseUtils(pool)
        self.recall_service = ReadRecall()

    def feed_recommend_time_stamp_logic(self, temp):
        """Serve a feed request as either a refresh or a history page.

        :param temp: request parameters passed in by the AB-test layer. This
            method reads ``user_id``, ``channel_id`` and ``time_stamp`` and
            overwrites ``time_stamp`` with the value to hand back to the
            client (it ends up in the returned track).
        :return: the track structure built by ``add_track``; for a history
            page its ``'param'`` entry is set to ``''``.
        """
        # 1. Timestamp of the user's most recent history record.
        try:
            last_stamp = self.hbu.get_table_row(
                'history_recommend',
                'reco:his:{}'.format(temp.user_id).encode(),
                'channel:{}'.format(temp.channel_id).encode(),
                include_timestamp=True)[1]  # index 1 is the timestamp
            logger.info(
                "{} INFO get user_id:{} channel:{} history last_stamp".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id,
                    temp.channel_id))
        except Exception as e:
            # A user without history ends up here; treat it as timestamp 0.
            logger.info(
                "{} INFO get user_id:{} channel:{} history last_stamp, exception:{}"
                .format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        temp.user_id, temp.channel_id, e))
            last_stamp = 0

        logger.info(str(last_stamp) + "___" + str(temp.time_stamp))

        if last_stamp < temp.time_stamp:
            # 2. Request is newer than the last record: a refresh.
            #    Try the cache first, fall back to recall + sort.
            res = get_cache_from_redis_hbase(temp, self.hbu)
            if not res:
                # The exception variable of the except clause above is not
                # available here (Python unbinds it when the clause ends), so
                # it must not be passed to this message.
                logger.info(
                    "{} INFO cache is Null get user_id:{} channel:{} recall/sort data"
                    .format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            temp.user_id, temp.channel_id))
                res = self.user_reco_list(temp)
            # The client receives the previous record's timestamp.
            temp.time_stamp = last_stamp
            _track = add_track(res, temp)
        else:
            # 3. Request is not newer than the last record: history paging.
            #    Per the original author's note, the record stamped exactly T
            #    cannot be fetched by asking for T; the query must use T + 1
            #    through get_table_cells to include it.
            logger.info(
                "{} INFO read user_id:{} channel:{} history recommend data".
                format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                       temp.user_id, temp.channel_id))
            try:
                row = self.hbu.get_table_cells(
                    'history_recommend',
                    'reco:his:{}'.format(temp.user_id).encode(),
                    'channel:{}'.format(temp.channel_id).encode(),
                    timestamp=temp.time_stamp + 1,
                    include_timestamp=True)
            except Exception as e:
                # A user without history ends up here.
                logger.warning(
                    "{} WARN read history recommend exception:{}".format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
                row = []

            # Stored (still encoded) article list to return, if any.
            raw = None
            if not row:
                # 3.1 No history at all: empty result, timestamp 0.
                temp.time_stamp = 0
            elif len(row) >= 2:
                # 3.2 Several records: return the first one and hand back the
                #     second record's timestamp for the next history request.
                raw = row[0][0]
                temp.time_stamp = int(row[1][1])
            elif row[0][1] == temp.time_stamp:
                # 3.3 Exactly the requested record and nothing after it:
                #     return it and signal the end of history with 0.
                raw = row[0][0]
                temp.time_stamp = 0
            else:
                # NOTE(review): a single record whose timestamp differs from
                # the request previously left the result unbound and crashed.
                # It is now treated as "no history"; confirm this is the
                # desired behaviour.
                temp.time_stamp = 0

            # Decode only when there is something stored: eval() rejects a
            # list, so the empty case must not go through it.
            # SECURITY: eval() executes whatever expression is stored in
            # HBase; only acceptable while the table is written by this
            # service alone.
            res = list(map(int, eval(raw))) if raw else []
            logger.info("{} INFO history:{}, {}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), res,
                temp.time_stamp))
            _track = add_track(res, temp)
            # History pages carry no exposure parameters.
            _track['param'] = ''

        return _track

    def user_reco_list(self, temp):
        """Build a fresh recommendation list for a pull-to-refresh.

        1. Merge the recall sources configured for ``temp.algo``.
        2. Remove everything already recommended in this channel and in
           channel 0 (the "recommended" channel).
        3. Sort, return up to ``temp.article_num`` ids, store them in
           ``history_recommend`` and store the remainder in
           ``wait_recommend``.

        :param temp: request parameters; reads ``algo``, ``user_id``,
            ``channel_id``, ``article_num`` and ``time_stamp``.
        :return: list of recommended ids; empty when nothing is left after
            filtering (in which case nothing is written to HBase).
        """
        # RAParam.COMBINE example, as given by the original author:
        #   'Algo-1': (1, [100, 101, 102, 103, 104], [200])
        #   i.e. name: (index, [recall source ids], [sort model ids])
        # 1. Merge all recall sources.
        reco_set = []
        for _num in RAParam.COMBINE[temp.algo][1]:
            if _num == 103:
                # New-article recall.
                _res = self.recall_service.read_redis_new_article(
                    temp.channel_id)
            elif _num == 104:
                # Hot-article recall.
                _res = self.recall_service.read_redis_hot_article(
                    temp.channel_id)
            else:
                # Remaining sources are read from their HBase recall table.
                _res = self.recall_service.read_hbase_recall_data(
                    RAParam.RECALL[_num][0],
                    'recall:user:{}'.format(temp.user_id).encode(),
                    '{}:{}'.format(RAParam.RECALL[_num][1],
                                   temp.channel_id).encode())
            reco_set = list(set(reco_set).union(set(_res)))

        # 2. Collect the history of the requested channel and of channel 0.
        #    A failure on one channel is logged and does not stop the other.
        history_list = []
        for channel in (temp.channel_id, 0):
            try:
                # All stored versions.
                data = self.hbu.get_table_cells(
                    'history_recommend',
                    'reco:his:{}'.format(temp.user_id).encode(),
                    'channel:{}'.format(channel).encode())
                for _ in data:
                    # SECURITY: eval() on stored data, see note above.
                    history_list = list(set(history_list).union(set(eval(_))))

                logger.info(
                    "{} INFO filter user_id:{} channel:{} history data".format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        temp.user_id, channel))
            except Exception as e:
                logger.warning(
                    "{} WARN filter history article exception:{}".format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))

        # Filter out what the user has already seen.
        reco_set = list(set(reco_set).difference(set(history_list)))
        logger.info("{} INFO after filter history is {}".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), reco_set))

        # 3. Nothing left: return immediately, no sorting and no writes.
        if not reco_set:
            return reco_set

        # 3.1 Sort with the model configured for this algorithm combination.
        _sort_num = RAParam.COMBINE[temp.algo][2][0]
        reco_set = sort_dict[RAParam.SORT[_sort_num]](reco_set, temp,
                                                      self.hbu)
        reco_set = list(map(int, reco_set))

        if len(reco_set) <= temp.article_num:
            # 3.2 Not more than requested: recommend everything.
            res = reco_set
        else:
            # 3.3 More than requested: take the first article_num ids and
            #     keep the surplus for the next refresh.
            res = reco_set[:temp.article_num]
            self.hbu.get_table_put(
                'wait_recommend',
                'reco:{}'.format(temp.user_id).encode(),
                'channel:{}'.format(temp.channel_id).encode(),
                str(reco_set[temp.article_num:]).encode(),
                timestamp=temp.time_stamp)
            logger.info(
                "{} INFO put user_id:{} channel:{} wait data".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    temp.user_id, temp.channel_id))

        # 4. Record what is being recommended now as history.
        self.hbu.get_table_put('history_recommend',
                               'reco:his:{}'.format(temp.user_id).encode(),
                               'channel:{}'.format(
                                   temp.channel_id).encode(),
                               str(res).encode(),
                               timestamp=temp.time_stamp)
        logger.info(
            "{} INFO store recall/sorted user_id:{} channel:{} history_recommend data"
            .format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    temp.user_id, temp.channel_id))

        return res

예제 #8

파일 보기

class ReadRecall(object):
    """Read recall (candidate article) results from HBase and Redis.

    The Redis client, the HBase connection pool, ``HBaseUtils``, ``logger``
    and ``datetime`` are expected to be provided by the enclosing module.
    """

    def __init__(self):
        self.client = redis_client
        self.hbu = HBaseUtils(pool)
        # Upper bound on the number of ids read_redis_hot_article() returns.
        self.hot_num = 10

    def read_hbase_recall_data(self, table_name, key_format, column_format):
        """Read and merge the recall results stored in one column.

        :param table_name: name of the recall table.
        :param key_format: row key (identifies the user).
        :param column_format: column (identifies the channel).
        :return: list of distinct ids in unspecified order. If an exception
            occurs, whatever was merged before the failure (possibly nothing)
            is returned and the failure is logged as a warning.
        """
        merged = set()
        try:
            # Read every stored version for this channel and merge them.
            cells = self.hbu.get_table_cells(table_name, key_format,
                                             column_format)

            for cell in cells:
                # SECURITY: eval() executes whatever expression is stored in
                # the cell. Only acceptable while the table is written solely
                # by trusted jobs; consider ast.literal_eval on decoded text.
                merged.update(eval(cell))

            # Deleting the consumed recall results is intentionally disabled
            # for testing, so that the test data does not need recreating:
            # self.hbu.get_table_delete(table_name, key_format, column_format)

        except Exception as e:
            logger.warning("{} WARN read {} recall exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), table_name, e))
        return list(merged)

    def read_redis_new_article(self, channel_id):
        """Read the ids stored in the ``ch:<channel_id>:new`` sorted set.

        :param channel_id: channel whose new articles are requested.
        :return: list of ids as ``int`` in the order Redis returns them for
            ``zrevrange(key, 0, -1)``; empty if the Redis call fails.
        """
        _key = "ch:{}:new".format(channel_id)
        try:
            res = self.client.zrevrange(_key, 0, -1)
            logger.info("{} INFO read channel_id:{} new article:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id, res))
        except Exception as e:
            res = []
            # Failure report: logged at WARNING level to match its text.
            logger.warning("{} WARN exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))

        # Redis returns the ids as strings/bytes; convert them to int.
        return list(map(int, res))

    def read_redis_hot_article(self, channel_id):
        """Read at most ``self.hot_num`` ids from ``ch:<channel_id>:hot``.

        :param channel_id: channel whose hot articles are requested.
        :return: the first ``self.hot_num`` ids (as ``int``) of
            ``zrevrange(key, 0, -1)``; empty if the Redis call fails.
        """
        logger.warning("{} WARN read channel {} redis hot article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        _key = "ch:{}:hot".format(channel_id)
        try:
            res = self.client.zrevrange(_key, 0, -1)
        except Exception as e:
            logger.warning("{} WARN read new article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []

        # Keep only the top-K entries.
        return list(map(int, res))[:self.hot_num]

    def read_hbase_article_similar(self, table_name, key_format, article_num):
        """Return the ids of the articles most similar to one article.

        The row is expected to map columns named ``<family>:<article_id>`` to
        a similarity score, e.g. (HBase shell)::

            create 'article_similar', 'similar'
            put 'article_similar', '1', 'similar:1', 0.2
            put 'article_similar', '1', 'similar:2', 0.34

        :param table_name: HBase table holding the similarity rows.
        :param key_format: row key of the reference article.
        :param article_num: maximum number of ids to return.
        :return: article ids ordered by descending score; empty list if the
            row cannot be read or parsed (the error is logged).
        """
        try:
            row = self.hbu.get_table_row(table_name, key_format)

            # Scores are compared numerically. Comparing the raw stored
            # values would order byte strings lexicographically, which ranks
            # e.g. b'1e-05' above b'0.9'.
            ranked = sorted(row.items(),
                            key=lambda item: float(item[1]),
                            reverse=True)
            res = [int(column.decode().split(':')[1])
                   for column, _score in ranked[:article_num]]
        except Exception as e:
            logger.error("{} ERROR read similar article exception: {}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []
        return res

예제 #9

파일 보기

파일: sort_service.py 프로젝트: yuhuofei/Bigdata

def wdl_sort_service():
    """Run a wide&deep ranking prediction for a fixed set of sample articles.

    Reads one hard-coded user's features and four hard-coded articles'
    features from HBase, turns each article into a ``tf.train.Example`` and
    sends all of them in a single classification request to the model named
    ``wdl`` at ``127.0.0.1:8500``. The response is printed. Nothing is sent
    when the user features cannot be read or are empty.

    :return: always ``None``
    """
    hbu = HBaseUtils(pool)

    # 1. User features from the user feature table.
    try:
        # NOTE(review): eval() executes whatever text is stored in the cell;
        # this is only acceptable while the table is written by trusted jobs.
        user_feature = eval(hbu.get_table_row('ctr_feature_user',
                                              '{}'.format(1113244157343694848).encode(),
                                              'channel:{}'.format(18).encode()))
    except Exception:
        # Best effort: without user features no prediction is attempted.
        user_feature = []
    if not user_feature:
        return None

    # The user weight is identical for every article, so it is averaged once
    # here instead of re-binding ``user_feature`` on every loop iteration.
    user_weight = np.mean(user_feature)

    # 2. Article features, one Example per article.
    examples = []
    for article_id in [22324, 22325, 22326, 22327]:
        try:
            article_feature = eval(hbu.get_table_row('ctr_feature_article',
                                                     '{}'.format(article_id).encode(),
                                                     'article:{}'.format(article_id).encode()))
        except Exception:
            # Unreadable article: fall back to an all-zero feature list.
            article_feature = [0.0] * 111

        # Index 0 is used as the channel id, indices 1..10 as the article
        # weights and everything from index 11 on as the article vector.
        channel_id = int(article_feature[0])
        vector = np.mean(article_feature[11:])
        article_weight = np.mean(article_feature[1:11])

        # The feature keys (including the 'user_weigths' spelling) are kept
        # exactly as they are; they are runtime strings sent to the model.
        example = tf.train.Example(features=tf.train.Features(feature={
            "channel_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[channel_id])),
            "vector": tf.train.Feature(float_list=tf.train.FloatList(value=[vector])),
            'user_weigths': tf.train.Feature(float_list=tf.train.FloatList(value=[user_weight])),
            'article_weights': tf.train.Feature(float_list=tf.train.FloatList(value=[article_weight])),
        }))
        examples.append(example)

    with grpc.insecure_channel('127.0.0.1:8500') as channel:
        stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

        # Build the classification request for the served model.
        request = classification_pb2.ClassificationRequest()
        request.model_spec.name = 'wdl'
        request.input.example_list.examples.extend(examples)

        # Send the request; 10.0 is passed as the second positional argument
        # of the stub call (presumably the timeout in seconds -- confirm).
        response = stub.Classify(request, 10.0)
        print(response)

    return None

예제 #10

파일 보기

파일: recall_service.py 프로젝트: yuhuofei/rcmd

class ReadRecall(object):
    """Read recall results from HBase and Redis."""

    def __init__(self):
        self.client = redis_client
        self.hbu = HBaseUtils(pool)
        # Maximum number of hot articles returned per channel.
        self.hot_num = 10

    def read_hbase_recall_data(self, table_name, key_format, column_format):
        """Read and merge the multi-way recall results (als, content, online).

        Every cell returned for the row/column is passed to ``eval`` and is
        expected to yield an iterable of article ids; the ids of all cells are
        merged without duplicates. A failure is logged, and ids merged before
        the failure are still returned.

        :param table_name: HBase recall table
        :param key_format: row key
        :param column_format: column to read
        :return: list of distinct article ids, in no particular order
        """
        merged = set()
        try:
            cells = self.hbu.get_table_cells(table_name, key_format,
                                             column_format)
            for cell in cells:
                # NOTE(review): eval() executes the stored text; safe only
                # while the recall tables are written by trusted jobs.
                merged.update(eval(cell))
        except Exception as e:
            logger.warning("{} WARN read {} recall exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), table_name, e))
        return list(merged)

    def read_redis_new_article(self, channel_id):
        """Read the new-article recall result of a channel from Redis.

        Fetches every member of the sorted set ``ch:<channel_id>:new`` with
        ``zrevrange``. A failing Redis call is logged and treated as an empty
        result.

        :param channel_id: channel whose new articles are requested
        :return: list of article ids as ``int``
        """
        # NOTE(review): the message is labelled INFO but emitted at WARNING
        # level; left unchanged because log filtering may rely on it.
        logger.warning("{} INFO read channel {} redis new article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        key = 'ch:{}:new'.format(channel_id)
        try:
            reco_list = self.client.zrevrange(key, 0, -1)
        except Exception as e:
            logger.warning("{} WARN read new article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            reco_list = []

        return list(map(int, reco_list))

    def read_redis_hot_article(self, channel_id):
        """Read the hot-article recall result of a channel from Redis.

        Fetches every member of the sorted set ``ch:<channel_id>:hot`` with
        ``zrevrange`` and keeps at most ``self.hot_num`` of them, because a
        channel can hold many hot articles. A failing Redis call is logged and
        treated as an empty result.

        :param channel_id: channel whose hot articles are requested
        :return: list of article ids as ``int``, truncated to ``self.hot_num``
        """
        logger.warning("{} INFO read channel {} redis hot article".format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), channel_id))
        _key = "ch:{}:hot".format(channel_id)
        try:
            res = self.client.zrevrange(_key, 0, -1)
        except Exception as e:
            # Label the failure with the hot-article path so it can be told
            # apart from failures of read_redis_new_article in the logs.
            logger.warning("{} WARN read hot article exception:{}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []

        # Slicing is a no-op when fewer than ``hot_num`` entries exist.
        return list(map(int, res))[:self.hot_num]

    def read_hbase_article_similar(self, table_name, key_format, article_num):
        """Recall the articles most similar to one article.

        The row's columns are ranked by their stored value, highest first, and
        the article id is taken from the column name (the part after ``:``).
        Any failure is logged and yields an empty list.

        :param table_name: HBase table holding the similarity rows
        :param key_format: row key of the article to look up
        :param article_num: maximum number of similar articles to return
        :return: list of article ids as ``int``, highest value first
        """
        try:
            row = self.hbu.get_table_row(table_name, key_format)
            ranked = sorted(row.items(), key=lambda pair: pair[1], reverse=True)
            top = ranked[:article_num] if len(ranked) > article_num else ranked
            res = [int(pair[0].decode().split(':')[1]) for pair in top]
        except Exception as e:
            logger.error("{} ERROR read similar article exception: {}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            res = []
        return res

예제 #11

파일 보기

파일: reco_cent.py 프로젝트: yuhuofei/Bigdata

class RecoCenter(object):
    """Recommendation center.

    Serves feed requests either with fresh recommendations (cache or
    recall + sort) or with previously recommended history.
    """

    def __init__(self):
        self.hbu = HBaseUtils(pool)
        self.recall_service = recall_server.ReadRecall()

    def feed_recommend_logic(self, temp):
        """Serve one feed request, either a refresh or a history page.

        The newest timestamp stored in ``history_recommend`` for the user and
        channel is compared with ``temp.time_stamp``:

        * stored timestamp older than the request (pull-to-refresh):
          recommendations come from the second-level cache, or from
          ``user_reco_list`` when the cache yields nothing;
          ``temp.time_stamp`` is then set to the stored history timestamp.
        * otherwise (scrolling back through history): stored history is read
          and ``temp.time_stamp`` is moved to the next older entry, or to 0
          when there is nothing older.

        :param temp: request object; ``user_id``, ``channel_id`` and
            ``time_stamp`` are read here and ``time_stamp`` is overwritten
        :return: the track structure built by ``add_track``
        """
        # Newest history timestamp for this user/channel; 0 when unavailable.
        try:
            last_stamp = self.hbu.get_table_row(
                'history_recommend',
                'reco:his:{}'.format(temp.user_id).encode(),
                'channel:{}'.format(temp.channel_id).encode(),
                include_timestamp=True)[1]
            logger.info(
                "{} INFO get user_id:{} channel:{} history last_stamp".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id,
                    temp.channel_id))
        except Exception as e:
            logger.warning(
                "{} WARN read history recommend exception:{}".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            last_stamp = 0

        if last_stamp < temp.time_stamp:
            # Refresh: try the second-level cache first.
            res = get_cache_from_redis_hbase(temp, self.hbu)
            if not res:
                # Cache is empty: run recall + sort.
                logger.info(
                    "{} INFO get user_id:{} channel:{} recall/sort data".
                    format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                           temp.user_id, temp.channel_id))
                res = self.user_reco_list(temp)

            # Hand the previous history timestamp back to the caller.
            temp.time_stamp = int(last_stamp)
            return add_track(res, temp)

        # History: read all stored versions up to the requested timestamp.
        logger.info(
            "{} INFO read user_id:{} channel:{} history recommend data".
            format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                   temp.user_id, temp.channel_id))
        try:
            # +1 so that the entry written at the requested timestamp itself
            # is included in the result.
            row = self.hbu.get_table_cells(
                'history_recommend',
                'reco:his:{}'.format(temp.user_id).encode(),
                'channel:{}'.format(temp.channel_id).encode(),
                timestamp=temp.time_stamp + 1,
                include_timestamp=True)
        except Exception as e:
            logger.warning(
                "{} WARN read history recommend exception:{}".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
            row = []

        # Each entry of ``row`` is indexed as (value, timestamp).
        # 1. No history: empty result, timestamp 0 marks the end of history.
        # 2. Exactly one entry at the requested timestamp: return it and set
        #    the timestamp to 0 because nothing older exists.
        # 3. Several entries: return the newest and hand back the timestamp
        #    of the second one for the next request.
        # A single entry with a different timestamp matches none of these, so
        # the result stays empty and the timestamp is left untouched.
        res = []
        if not row:
            temp.time_stamp = 0
        elif len(row) == 1 and row[0][1] == temp.time_stamp:
            # NOTE(review): eval() executes the stored text; safe only while
            # history_recommend is written by this service.
            res = eval(row[0][0])
            temp.time_stamp = 0
        elif len(row) >= 2:
            res = eval(row[0][0])
            temp.time_stamp = int(row[1][1])

        res = list(map(int, res))

        track = add_track(res, temp)
        # History pages are not a new recall/sort refresh, so no param is set.
        track['param'] = ''
        return track

    def _history_article_ids(self, temp, channel_id):
        """Collect the article ids already recommended to the user on a channel.

        Failures are logged; ids collected before the failure are returned.

        :param temp: request object providing ``user_id``
        :param channel_id: channel whose history is read
        :return: set of article ids
        """
        seen = set()
        try:
            cells = self.hbu.get_table_cells(
                'history_recommend',
                'reco:his:{}'.format(temp.user_id).encode(),
                'channel:{}'.format(channel_id).encode())
            for cell in cells:
                # NOTE(review): eval() executes the stored text; safe only
                # while history_recommend is written by this service.
                seen.update(eval(cell))

            logger.info(
                "{} INFO filter user_id:{} channel:{} history data".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), temp.user_id,
                    channel_id))
        except Exception as e:
            logger.warning(
                "{} WARN filter history article exception:{}".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e))
        return seen

    def user_reco_list(self, temp):
        """Build, filter, sort and persist a fresh recommendation list.

        Steps: merge the recall sources configured for ``temp.algo``, drop
        everything already present in the user's history (current channel and
        channel 0), sort with the configured sort service, cut to
        ``temp.article_num`` and store the result in ``history_recommend``.
        Anything beyond ``temp.article_num`` is stored in ``wait_recommend``
        for the next refresh.

        :param temp: request object (``user_id``, ``channel_id``, ``algo``,
            ``article_num`` and ``time_stamp`` are read)
        :return: list of recommended article ids as ``int``; the falsy value
            produced by filtering/sorting when nothing is left to recommend
        """
        # 1. Merge every recall source of the configured algorithm combination
        #    (recall ids 100..104; 103 = new articles, 104 = hot articles).
        recalled = set()
        for _num in RAParam.COMBINE[temp.algo][1]:
            if _num == 103:
                _res = self.recall_service.read_redis_new_article(
                    temp.channel_id)
            elif _num == 104:
                _res = self.recall_service.read_redis_hot_article(
                    temp.channel_id)
            else:
                _res = self.recall_service.read_hbase_recall_data(
                    RAParam.RECALL[_num][0],
                    'recall:user:{}'.format(temp.user_id).encode(),
                    '{}:{}'.format(RAParam.RECALL[_num][1],
                                   temp.channel_id).encode())
            recalled.update(_res)

        # 2. Filter out what the user has already been shown, on this channel
        #    and on channel 0.
        history = self._history_article_ids(temp, temp.channel_id)
        history |= self._history_article_ids(temp, 0)
        reco_set = list(recalled - history)

        # 3. Sort with the configured sort algorithm.
        if reco_set:
            _sort_num = RAParam.COMBINE[temp.algo][2][0]
            reco_set = sort_dict[RAParam.SORT[_sort_num]](reco_set, temp,
                                                          self.hbu)
        # Nothing to recommend: return the empty result as is.
        if not reco_set:
            return reco_set

        reco_set = list(map(int, reco_set))
        if len(reco_set) <= temp.article_num:
            res = reco_set
        else:
            # Return only the requested amount and park the remainder in
            # wait_recommend so the next refresh can serve it directly.
            res = reco_set[:temp.article_num]
            self.hbu.get_table_put(
                'wait_recommend',
                'reco:{}'.format(temp.user_id).encode(),
                'channel:{}'.format(temp.channel_id).encode(),
                str(reco_set[temp.article_num:]).encode(),
                timestamp=temp.time_stamp)
            logger.info(
                "{} INFO put user_id:{} channel:{} wait data".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    temp.user_id, temp.channel_id))

        # Record what is being recommended in the history table.
        self.hbu.get_table_put('history_recommend',
                               'reco:his:{}'.format(temp.user_id).encode(),
                               'channel:{}'.format(
                                   temp.channel_id).encode(),
                               str(res).encode(),
                               timestamp=temp.time_stamp)

        logger.info(
            "{} INFO store recall/sorted user_id:{} channel:{} history_recommend data"
            .format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    temp.user_id, temp.channel_id))

        return res