Example #1
"""
# @file tags_gen.py
# @Synopsis  generate tags
# @author Ming Gu ([email protected])
# @version 1.0
# @date 2015-09-20
"""

import sys
sys.path.append('..')
import commands
from conf.env_config import EnvConfig
from conf.init_logger import InitLogger
from dal.spark_submit import SparkSubmit
from bll.gen_rec_list import genRecList
import logging
import os
from datetime import datetime

if __name__ == '__main__':
    start_time = datetime.now()
    InitLogger()
    logger = logging.getLogger(EnvConfig.LOG_NAME)
    main_program_path = '../bll/item_cf.py'
    SparkSubmit.sparkSubmit(main_program_path, run_locally=False)
    end_time = datetime.now()
    time_span = end_time - start_time
    minutes = time_span.total_seconds() / 60
    logger.debug('cal_itemcf spent {0} minutes'.format(minutes))
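
# A minimal sketch of what SparkSubmit.sparkSubmit plausibly wraps (hypothetical;
# the repo's real implementation may differ): shelling out to spark-submit,
# which would also explain the otherwise-unused `commands` import above.
#
#     def sparkSubmit(main_program_path, run_locally=False):
#         master = 'local[*]' if run_locally else 'yarn'
#         cmd = 'spark-submit --master {0} {1}'.format(master, main_program_path)
#         status, output = commands.getstatusoutput(cmd)
#         return status == 0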

Example #2
# snippet from the Spark channel-content job; assumes the enclosing module
# imports pyspark as ps plus repo helpers such as EnvConfig, InitLogger, HDFS,
# LongVideoInfo, Util, RddFuncs, Mola and GenTagContentTitle
def main():
    """
    # @Synopsis  spark main program
    #
    # @Returns   succeeded or not (to be added)
    """
    works_type = 'movie'
    InitLogger()
    logger = logging.getLogger(EnvConfig.LOG_NAME)
    HDFS.overwrite(EnvConfig.LOCAL_PM_REVIEWED_CHANNEL_DICT[works_type],
            EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type])
    LongVideoInfo.genVideoInfo(works_type)
    sc = ps.SparkContext()

    play_log = Util.genLogList(3, 1, works_type=works_type)
    # play_log = Util.getOneHourSampleLog(works_type=works_type, platform='PC')
    logger.debug('play_log: {0}'.format(play_log))

    play_log_rdd = sc.textFile(play_log)\
            .map(RddFuncs.parsePlayLog)\
            .map(lambda x: (x[0], x[1]))
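    # parsePlayLog evidently yields at least (uid, vid, play_source); the
    # commented-out variant below keeps only plays coming from 'search'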
    # play_log_rdd = sc.textFile(play_log)\
    #         .map(RddFuncs.parsePlayLog)\
    #         .filter(lambda x: x[2] == 'search')\
    #         .map(lambda x: (x[0], x[1]))
    Util.debugRdd(play_log_rdd, 'play_log_rdd', logger)

    #(vid, uid)
    item_user_rdd = play_log_rdd \
            .map(lambda x: (x[1], x[0]))

    # (group_id, group_name)
    group_rdd = sc.textFile(EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type])\
            .map(RddFuncs.parseGroupFile)
    Util.debugRdd(group_rdd, 'group_rdd', logger)

    # (group_id, feature), a group_id may contain more than one feature
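    # e.g. a composite group_id like u'科幻$$美国' yields two pairs, each keyed
    # by the full id: (u'科幻$$美国', u'科幻') and (u'科幻$$美国', u'美国')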
    group_feature_rdd = group_rdd\
            .flatMap(lambda x: [(x[0], i) for i in x[0].split('$$')])
    Util.debugRdd(group_feature_rdd, 'group_feature_rdd', logger)

    # (feature, set(vid))
    feature_vids_rdd = \
            sc.textFile(EnvConfig.HDFS_LONG_VIDEO_INFO_PATH_DICT[works_type])\
            .flatMap(lambda x: RddFuncs.parseVideoFeature(x, works_type))\
            .distinct()\
            .filter(lambda x: x is not None)\
            .map(lambda x: (x[1], x[0]))\
            .groupByKey()\
            .map(lambda x: (x[0], set(x[1])))
    Util.debugRdd(feature_vids_rdd, 'feature_vids_rdd', logger)

    # (feature, (group_id, vids)) => (group_id, vids)
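    # the reduceByKey below intersects the vid sets (a & b), keeping only vids
    # that carry every feature of the group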
    group_vids_rdd = group_feature_rdd\
            .map(lambda x: (x[1], x[0]))\
            .join(feature_vids_rdd)\
            .map(lambda x: (x[1][0], x[1][1]))\
            .reduceByKey(lambda a, b: a & b)
    Util.debugRdd(group_vids_rdd, 'group_vids_rdd', logger)

    # (group_id, (vids, group_name)) => (group_name, vids) => (group_name, vid)
    group_vid_rdd = group_vids_rdd\
            .join(group_rdd)\
            .map(lambda x: (x[1][1], x[1][0]))\
            .flatMap(lambda x: [(x[0], i) for i in x[1]])
    Util.debugRdd(group_vid_rdd, 'group_vid_rdd', logger)

    #(uid, (group_name, cnt))
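    # via (vid, (group_name, uid)) => ((group_name, uid), 1) =>
    # ((group_name, uid), cnt) => (uid, (group_name, cnt))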
    user_played_movie_cnt_in_group_rdd = group_vid_rdd \
            .map(lambda x: (x[1], x[0])) \
            .join(item_user_rdd) \
            .map(lambda x:((x[1][0], x[1][1]), 1)) \
            .reduceByKey(lambda a, b: a + b)\
            .map(lambda x: (x[0][1], (x[0][0], x[1])))
    Util.debugRdd(user_played_movie_cnt_in_group_rdd,
            'user_played_movie_cnt_in_group_rdd', logger)

    #(uid, vid) => (uid, (vid, (group_name, cnt))) => ((group_name, vid), cnt)
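    # the cnt - 1 below excludes the joined play itself, so a vid's similarity
    # to its group counts only the *other* vids of that group the user played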
    group_item_sim_rdd = play_log_rdd \
            .join(user_played_movie_cnt_in_group_rdd) \
            .map(lambda x: ((x[1][1][0], x[1][0]), x[1][1][1] - 1)) \
            .reduceByKey(lambda a, b: a + b)
    Util.debugRdd(group_item_sim_rdd, 'group_item_sim_rdd', logger)

    group_vid_key_rdd = group_vid_rdd \
            .map(lambda x: (x, 1))
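    # ((group_name, vid), 1), keys for the rightOuterJoin below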

    # we only need the similarity between a vid and the group it belongs to,
    # and ignore its similarity to other groups; a vid in the group with no
    # play history should still appear in the group, with similarity 0
    # ((group, vid), sim)
    group_item_sim_rdd = group_item_sim_rdd \
            .rightOuterJoin(group_vid_key_rdd) \
            .map(RddFuncs.fillZeroSim)
    Util.debugRdd(group_item_sim_rdd, 'group_item_sim_rdd', logger)
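    # hypothetical sketch of RddFuncs.fillZeroSim (the repo's real helper may
    # differ): after the rightOuterJoin the left value is None for vids with no
    # play history, and those get similarity 0, e.g.
    #
    #     def fillZeroSim(x):
    #         (group_vid, (sim, _)) = x
    #         return (group_vid, sim if sim is not None else 0)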

    # ((group, vid), sim) => (group, (vid, sim)) => (group, [(vid, sim)]) =>
    #(group_name\t vid:sim$$vid:sim...)
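    # e.g. a finished line might read: group_name\tvid1:8$$vid2:3$$vid3:0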
    group_content_rdd = group_item_sim_rdd\
            .map(lambda x: (x[0][0], (x[0][1], x[1])))\
            .groupByKey()\
            .map(lambda x: (x[0], sorted(x[1], key=lambda a: -a[1])))\
            .map(lambda x: RddFuncs.video_info_map_func(x, works_type))\
            .map(lambda x: '%s\t%s' % (x[0], x[1]))\
            .repartition(1)
    Util.debugRdd(group_content_rdd, 'group_content_rdd', logger)

    hdfs_path = EnvConfig.HDFS_CHANNEL_CONTENT_PATH_DICT[works_type]
    local_path = EnvConfig.LOCAL_CHANNEL_CONTENT_DICT[works_type]
    succeeded = Util.save_rdd(group_content_rdd, hdfs_path, local_path)
    if succeeded:
        if not EnvConfig.DEBUG:
            succeeded = Mola.updateDb('movie_channel_channel2id:', local_path)
    if not succeeded:
        logger.fatal('{0} channel_content_gen failed'.format(works_type))

    GenTagContentTitle.replaceIdWithTitle(works_type)

    return succeeded

Example #3
# snippet from the short-video candidate job; assumes the enclosing module
# imports pyspark as ps, urllib's unquote, and repo helpers such as EnvConfig,
# InitLogger, LogFileNameGen, LogParser, RddUtils and VideoInfoUtil
def main():
    """
    # @Synopsis  get short video candidates
    #
    # @Returns   nothing
    """
    InitLogger()
    logger = logging.getLogger(EnvConfig.LOG_NAME)
    sc = ps.SparkContext()
    log_day_cnt = 28
    MAX_USER_SHORT_VIDEO_PER_DAY = 50
    # MAX_SHORT_VIDEO_UV_PER_DAY = 1000
    MIN_SHORT_VIDEO_UV_PER_DAY = 1

    # short_video_play_log = LogFileNameGen.genOneHourSampleLog('short',
    #         'play', 'PC')
    short_video_play_log = LogFileNameGen.genLogList(log_day_cnt, 1, 'short',
                                                     'play', 'PC')
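    # presumably the last log_day_cnt days of PC short-video play logs, with a
    # one-day offset back from today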

    short_video_play_log_rdd = sc.textFile(short_video_play_log)\
            .map(LogParser.parseWithoutException)\
            .filter(lambda x: x is not None)\
            .filter(lambda x: 'uid' in x and 'url' in x and 'playType' in x)\
            .map(lambda x: (x['uid'], unquote(x['url'])))\
            .filter(lambda x: 'v.baidu.com' not in x[1])\
            .filter(lambda x: 'video.baidu.com' not in x[1])\
            .distinct()

    def filterTooActiveUser(log_rdd, max_item_cnt):
        """
        # @Synopsis  filter users who watches too many videos(long or short) during
        # the time window, these users are probobaly robots
        #
        # @Args log_rdd
        # @Args max_item_cnt max number of videos a user should watch
        #
        # @Returns   filtered rdd
        """
        user_item_cnt = log_rdd\
                .map(lambda x: (x[0], 1))\
                .reduceByKey(lambda a, b: a + b)\
                .filter(lambda x: x[1] <= max_item_cnt)
        filtered_rdd = log_rdd\
                .join(user_item_cnt)\
                .filter(lambda x: x[1][1] is not None)\
                .map(lambda x: (x[0], x[1][0]))
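        # note: join() is an inner join, so plays from users above max_item_cnt
        # are already gone; the `is not None` filter is redundant but harmless
        # (it would only matter after a leftOuterJoin)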
        return filtered_rdd

    short_video_play_log_rdd = filterTooActiveUser(
        short_video_play_log_rdd,
        MAX_USER_SHORT_VIDEO_PER_DAY * log_day_cnt).cache()
    RddUtils.debugRdd(short_video_play_log_rdd, 'short_video_play_log_rdd',
                      logger)

    short_video_uv_rdd = short_video_play_log_rdd\
            .map(lambda x: (x[1], 1)) \
            .reduceByKey(lambda a, b: a + b)\
            .filter(lambda x: x[1] >= MIN_SHORT_VIDEO_UV_PER_DAY * log_day_cnt)

    short_video_meta_rdd = RddUtils.loadGBKFile(sc, EnvConfig.HDFS_SHORT_VIDEO_META_PATH)\
            .map(VideoInfoUtil.parseShortVideoMeta)\
            .filter(lambda x: x is not None)\
            .filter(lambda x: x['duration'] < 20 * 60)\
            .map(lambda x: (x.pop('url', None), x))
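    # keyed by url; dict.pop removes 'url' from the meta dict so the key is not
    # duplicated inside the value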

    # RddUtils.debugRdd(short_video_title_rdd, 'short_video_title_rdd', logger)

    def output_mapper(x):
        """
        # @Synopsis  output_mapper
        # @Args x
        # @Returns   output string in utf8
        """
        return u'\t'.join([
            str(x[0]), x[1][1]['title'], x[1][1]['image_link'],
            str(x[1][1]['duration']),
            str(x[1][0])
        ]).encode('utf8')
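    # output columns: url \t title \t image_link \t duration \t uv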
    candidate_video_meta_rdd = short_video_uv_rdd\
            .join(short_video_meta_rdd)\
            .map(output_mapper)
    # RddUtils.debugRdd(candidate_video_meta_rdd, 'candidate_video_meta_rdd', logger)

    hdfs_path = os.path.join(EnvConfig.HDFS_DERIVANT_PATH, 'candidates')
    local_path = os.path.join(EnvConfig.LOCAL_DATA_BASE_PATH, 'short',
                              'source', 'candidates')
    RddUtils.saveRdd(candidate_video_meta_rdd, hdfs_path, local_path)

Example #4
# snippet from the tag/channel generation job for tv; assumes the enclosing
# module imports pyspark as ps and repo helpers such as EnvConfig, InitLogger,
# RddFuncs, Util, TagNameGen, LongVideoInfo, GroupFilter, HDFS and featureFilter
def main():
    """
    # @Synopsis  main program
    #
    # @Returns succeeded or not
    """
    InitLogger()
    logger = logging.getLogger(EnvConfig.LOG_NAME)
    sc = ps.SparkContext()
    works_type = 'tv'
    logger.debug('works_type is {0}'.format(works_type))
    LongVideoInfo.genVideoInfo(works_type)
    VIDEO_CNT_LOWER_BOUND = 10

    name_gen = TagNameGen()

    # (group_id, [vids])
    one_feature_group_rdd = \
            sc.textFile(EnvConfig.HDFS_LONG_VIDEO_INFO_PATH_DICT[works_type])\
            .flatMap(lambda x: RddFuncs.parseVideoFeature(x, works_type))\
            .distinct()\
            .filter(lambda x: x is not None)\
            .filter(lambda x: featureFilter(x, works_type))\
            .map(lambda x: (x[1], x[0]))\
            .groupByKey()\
            .filter(lambda x: len(x[1]) >= VIDEO_CNT_LOWER_BOUND)
    Util.debugRdd(one_feature_group_rdd, 'one_feature_group_rdd', logger)

    group_item_rdd = one_feature_group_rdd\
            .flatMap(lambda x: [(x[0], i) for i in x[1]])
    Util.debugRdd(group_item_rdd, 'group_item_rdd', logger)

    play_log = Util.genLogList(7, 1, works_type=works_type, platform='PC')
    # play_log = Util.getOneHourSampleLog('comic', 'PC')
    user_item_rdd = sc.textFile(play_log)\
            .map(RddFuncs.parsePlayLog)\
            .map(lambda x: (x[0], x[1]))\
            .distinct()
    # user_item_rdd = sc.textFile(play_log)\
    #         .map(RddFuncs.parsePlayLog)\
    #         .filter(lambda x: x[2] == 'search')\
    #         .map(lambda x: (x[0], x[1]))\
    #         .distinct()
    Util.debugRdd(user_item_rdd, 'user_item_rdd', logger)

    item_uv_rdd = user_item_rdd\
            .map(lambda x: (x[1], 1))\
            .reduceByKey(lambda a, b: a + b)\
            .filter(lambda x: x[1] > 1)
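    # user_item_rdd is distinct, so the count above is unique viewers (uv) per
    # item; items with only one viewer were dropped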
    Util.debugRdd(item_uv_rdd, 'item_uv_rdd', logger)
    #(item, (group, uv)) => (group, total_uv)
    group_uv_rdd = group_item_rdd\
            .map(lambda x: (x[1], x[0]))\
            .join(item_uv_rdd)\
            .map(lambda x: (x[1][0], x[1][1]))\
            .reduceByKey(lambda a, b: a + b)
    Util.debugRdd(group_uv_rdd, 'group_uv_rdd', logger)


    output_rdd = one_feature_group_rdd\
            .leftOuterJoin(group_uv_rdd)\
            .map(RddFuncs.fillZeroUV)\
            .repartition(1)\
            .sortBy(lambda x: x[1][1], ascending=False)\
            .map(lambda x: (x[0], '$$'.join(x[1][0])))\
            .map(lambda x: (x[0], name_gen.genTagName(x[0], works_type), x[1]))\
            .filter(lambda x: x[1] is not None)\
            .map(lambda x: '{0}\t{1}\t{2}'.format(x[0], '\t'.join(x[1]), x[2]))
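    # hypothetical sketch of RddFuncs.fillZeroUV (the real helper may differ):
    # after the leftOuterJoin, uv is None for groups with no qualifying plays;
    # those should get uv 0 so the sortBy works, e.g.
    #
    #     def fillZeroUV(x):
    #         group, (vids, uv) = x
    #         return (group, (vids, uv if uv is not None else 0))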
    hdfs_path = EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type]
    local_path = os.path.join(EnvConfig.LOCAL_DATA_PATH_DICT[works_type],
                              'channel_tmp')
    Util.save_rdd(output_rdd, hdfs_path, local_path)

    GroupFilter.similar_filter(works_type)
    succeeded = HDFS.overwrite(EnvConfig.LOCAL_CHANNEL_PATH_DICT[works_type],
                               EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type])

    return succeeded