""" # @file tags_gen.py # @Synopsis gen tags # @author Ming Gu([email protected])) # @version 1.0 # @date 2015-09-20 """ import sys sys.path.append('..') import commands from conf.env_config import EnvConfig from conf.init_logger import InitLogger from dal.spark_submit import SparkSubmit from bll.gen_rec_list import genRecList import logging import os from datetime import datetime if __name__ == '__main__': start_time = datetime.now() InitLogger() logger = logging.getLogger(EnvConfig.LOG_NAME) main_program_path = '../bll/item_cf.py' SparkSubmit.sparkSubmit(main_program_path, run_locally=False) end_time = datetime.now() time_span = end_time - start_time minutes = time_span.total_seconds() / 60 logger.debug('cal_itemcf spent {0} minutes'.format(minutes))
def main(): """ # @Synopsis spark main program # # @Returns succeeded or not(to be added) """ works_type = 'movie' InitLogger() logger = logging.getLogger(EnvConfig.LOG_NAME) HDFS.overwrite(EnvConfig.LOCAL_PM_REVIEWED_CHANNEL_DICT[works_type], EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type]) LongVideoInfo.genVideoInfo(works_type) sc = ps.SparkContext() play_log = Util.genLogList(3, 1, works_type=works_type) # play_log = Util.getOneHourSampleLog(works_type=works_type, platform='PC') logger.debug('play_log: {0}'.format(play_log)) play_log_rdd = sc.textFile(play_log)\ .map(RddFuncs.parsePlayLog)\ .map(lambda x: (x[0], x[1])) # play_log_rdd = sc.textFile(play_log)\ # .map(RddFuncs.parsePlayLog)\ # .filter(lambda x: x[2] == 'search')\ # .map(lambda x: (x[0], x[1])) Util.debugRdd(play_log_rdd, 'play_log_rdd', logger) #(vid, uid) item_user_rdd = play_log_rdd \ .map(lambda x: (x[1], x[0])) # (group_id, group_name) group_rdd = sc.textFile(EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type])\ .map(RddFuncs.parseGroupFile) Util.debugRdd(group_rdd, 'group_rdd', logger) # (group_id, feature), a group_id may contain more than one feature group_feature_rdd = group_rdd\ .flatMap(lambda x: [(x[0], i) for i in x[0].split('$$')]) Util.debugRdd(group_feature_rdd, 'group_feature_rdd', logger) # (feature, set(vid)) feature_vids_rdd = \ sc.textFile(EnvConfig.HDFS_LONG_VIDEO_INFO_PATH_DICT[works_type])\ .flatMap(lambda x: RddFuncs.parseVideoFeature(x, works_type))\ .distinct()\ .filter(lambda x: x is not None)\ .map(lambda x: (x[1], x[0]))\ .groupByKey()\ .map(lambda x: (x[0], set(x[1]))) Util.debugRdd(feature_vids_rdd, 'feature_vids_rdd', logger) # (feature, (group_id, vids)) => (group_id, vids) group_vids_rdd = group_feature_rdd\ .map(lambda x: (x[1], x[0]))\ .join(feature_vids_rdd)\ .map(lambda x: (x[1][0], x[1][1]))\ .reduceByKey(lambda a, b: a & b) Util.debugRdd(group_vids_rdd, 'group_vids_rdd', logger) # (group_id, (vids, group_name)) => (group_name, vids) => (group_name, vid) group_vid_rdd = 
group_vids_rdd\ .join(group_rdd)\ .map(lambda x: (x[1][1], x[1][0]))\ .flatMap(lambda x: [(x[0], i) for i in x[1]]) Util.debugRdd(group_vid_rdd, 'group_vid_rdd', logger) #(uid, (group_name, cnt)) user_played_movie_cnt_in_group_rdd = group_vid_rdd \ .map(lambda x: (x[1], x[0])) \ .join(item_user_rdd) \ .map(lambda x:((x[1][0], x[1][1]), 1)) \ .reduceByKey(lambda a, b: a + b)\ .map(lambda x: (x[0][1], (x[0][0], x[1]))) Util.debugRdd(user_played_movie_cnt_in_group_rdd, 'user_played_movie_cnt_in_group_rdd', logger) #(uid, vid) => (uid, (vid, (group_name, cnt))) => ((group_name, vid), cnt) group_item_sim_rdd = play_log_rdd \ .join(user_played_movie_cnt_in_group_rdd) \ .map(lambda x: ((x[1][1][0], x[1][0]), x[1][1][1] - 1)) \ .reduceByKey(lambda a, b: a + b) Util.debugRdd(group_item_sim_rdd, 'group_item_sim_rdd', logger) group_vid_key_rdd = group_vid_rdd \ .map(lambda x: (x, 1)) # we only need the similarity between a vid and the group it belongs, # ignore the similarity between vid and other groups, and vid in the group # but with no play history should still be in the group, with 0 similarity # ((group, vid), sim) group_item_sim_rdd = group_item_sim_rdd \ .rightOuterJoin(group_vid_key_rdd) \ .map(RddFuncs.fillZeroSim) Util.debugRdd(group_item_sim_rdd, 'group_item_sim_rdd', logger) # ((group, vid), sim) => (group, (vid, sim)) => (group, [(vid, sim)]) => #(group_name\t vid:sim$$vid:sim...) 
group_content_rdd = group_item_sim_rdd\ .map(lambda x: (x[0][0], (x[0][1], x[1])))\ .groupByKey()\ .map(lambda x: (x[0], sorted(x[1], key=lambda a: -a[1])))\ .map(lambda x: RddFuncs.video_info_map_func(x, works_type))\ .map(lambda x: '%s\t%s' % (x[0], x[1]))\ .repartition(1) Util.debugRdd(group_content_rdd, 'group_content_rdd', logger) hdfs_path = EnvConfig.HDFS_CHANNEL_CONTENT_PATH_DICT[works_type] local_path = EnvConfig.LOCAL_CHANNEL_CONTENT_DICT[works_type] succeeded = Util.save_rdd(group_content_rdd, hdfs_path, local_path) if succeeded: if not EnvConfig.DEBUG: succeeded = Mola.updateDb('movie_channel_channel2id:', local_path) if not succeeded: logger.fatal('{0} channel_content_gen failed'.format(works_type)) GenTagContentTitle.replaceIdWithTitle(works_type) return succeeded
def main():
    """
    # @Synopsis get short video candidates
    #
    # Reads PC short-video play logs, drops hyper-active (likely robot)
    # users, keeps videos with enough distinct viewers, joins in video meta
    # info and saves "url\ttitle\timage_link\tduration\tuv" lines.
    #
    # @Returns nothing
    """
    InitLogger()
    logger = logging.getLogger(EnvConfig.LOG_NAME)
    sc = ps.SparkContext()
    log_day_cnt = 28
    # per-day thresholds, scaled by log_day_cnt where they are applied
    MAX_USER_SHORT_VIDEO_PER_DAY = 50
    # MAX_SHORT_VIDEO_UV_PER_DAY = 1000
    MIN_SHORT_VIDEO_UV_PER_DAY = 1
    short_video_play_log = LogFileNameGen.genLogList(log_day_cnt, 1, 'short',
            'play', 'PC')
    # distinct (uid, url) pairs; Baidu portal urls are excluded
    short_video_play_log_rdd = sc.textFile(short_video_play_log)\
        .map(LogParser.parseWithoutException)\
        .filter(lambda x: x is not None)\
        .filter(lambda x: 'uid' in x and 'url' in x and 'playType' in x)\
        .map(lambda x: (x['uid'], unquote(x['url'])))\
        .filter(lambda x: 'v.baidu.com' not in x[1])\
        .filter(lambda x: 'video.baidu.com' not in x[1])\
        .distinct()

    def filterTooActiveUser(log_rdd, max_item_cnt):
        """
        # @Synopsis filter users who watch too many videos (long or short)
        # during the time window; these users are probably robots
        #
        # @Args log_rdd (uid, url) pairs
        # @Args max_item_cnt max number of videos a user should watch
        #
        # @Returns filtered rdd of (uid, url)
        """
        user_item_cnt = log_rdd\
            .map(lambda x: (x[0], 1))\
            .reduceByKey(lambda a, b: a + b)\
            .filter(lambda x: x[1] <= max_item_cnt)
        # the inner join already restricts log_rdd to surviving users; the
        # None check is redundant for an inner join but kept as written
        filtered_rdd = log_rdd\
            .join(user_item_cnt)\
            .filter(lambda x: x[1][1] is not None)\
            .map(lambda x: (x[0], x[1][0]))
        return filtered_rdd

    short_video_play_log_rdd = filterTooActiveUser(
        short_video_play_log_rdd,
        MAX_USER_SHORT_VIDEO_PER_DAY * log_day_cnt).cache()
    RddUtils.debugRdd(short_video_play_log_rdd, 'short_video_play_log_rdd',
            logger)
    # (url, uv): distinct-viewer count per video, with a lower bound
    short_video_uv_rdd = short_video_play_log_rdd\
        .map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda a, b: a + b)\
        .filter(lambda x: x[1] >= MIN_SHORT_VIDEO_UV_PER_DAY * log_day_cnt)
    # (url, meta dict) for videos shorter than 20 minutes
    short_video_meta_rdd = RddUtils.loadGBKFile(sc,
            EnvConfig.HDFS_SHORT_VIDEO_META_PATH)\
        .map(VideoInfoUtil.parseShortVideoMeta)\
        .filter(lambda x: x is not None)\
        .filter(lambda x: x['duration'] < 20 * 60)\
        .map(lambda x: (x.pop('url', None), x))
    # FIX: the original called debugRdd on the undefined name
    # `short_video_title_rdd` (NameError); debug the rdd just built instead
    RddUtils.debugRdd(short_video_meta_rdd, 'short_video_meta_rdd', logger)

    def output_mapper(x):
        """
        # @Synopsis output_mapper
        # @Args x (url, (uv, meta dict))
        # @Returns output string in utf8
        """
        return u'\t'.join([
            str(x[0]),
            x[1][1]['title'],
            x[1][1]['image_link'],
            # FIX: duration may be numeric (it is compared with 20 * 60
            # above); unicode join requires string elements, and str() is
            # a no-op if it is already a string
            str(x[1][1]['duration']),
            str(x[1][0])
            ]).encode('utf8')

    candidate_video_meta_rdd = short_video_uv_rdd\
        .join(short_video_meta_rdd)\
        .map(output_mapper)
    # RddUtils.debugRdd(candidate_video_meta_rdd, 'candidate_video_meta_rdd',
    #         logger)
    hdfs_path = os.path.join(EnvConfig.HDFS_DERIVANT_PATH, 'candidates')
    local_path = os.path.join(EnvConfig.LOCAL_DATA_BASE_PATH, 'short',
            'source', 'candidates')
    RddUtils.saveRdd(candidate_video_meta_rdd, hdfs_path, local_path)
def main():
    """
    # @Synopsis main program
    #
    # Builds candidate tag channels for TV series: groups videos sharing a
    # single feature, ranks the groups by play UV, attaches generated tag
    # names, saves the channel file, filters near-duplicate groups and
    # publishes the result to HDFS.
    #
    # @Returns succeeded or not
    """
    InitLogger()
    logger = logging.getLogger(EnvConfig.LOG_NAME)
    sc = ps.SparkContext()
    works_type = 'tv'
    logger.debug('works_type is {0}'.format(works_type))
    LongVideoInfo.genVideoInfo(works_type)
    # a feature must cover at least this many videos to become a group
    VIDEO_CNT_LOWER_BOUND = 10
    name_gen = TagNameGen()
    # (group_id, [vids]): one candidate group per single feature
    one_feature_group_rdd = \
        sc.textFile(EnvConfig.HDFS_LONG_VIDEO_INFO_PATH_DICT[works_type])\
        .flatMap(lambda x: RddFuncs.parseVideoFeature(x, works_type))\
        .distinct()\
        .filter(lambda x: x is not None)\
        .filter(lambda x: featureFilter(x, works_type))\
        .map(lambda x: (x[1], x[0]))\
        .groupByKey()\
        .filter(lambda x: len(x[1]) >= VIDEO_CNT_LOWER_BOUND)
    Util.debugRdd(one_feature_group_rdd, 'one_feature_group_rdd', logger)
    # (group_id, vid) pairs
    group_item_rdd = one_feature_group_rdd\
        .flatMap(lambda x: [(x[0], i) for i in x[1]])
    Util.debugRdd(group_item_rdd, 'group_item_rdd', logger)
    # CONSISTENCY FIX: pass the works_type variable instead of repeating the
    # hard-coded 'tv' literal (same value, single point of change)
    play_log = Util.genLogList(7, 1, works_type=works_type, platform='PC')
    # distinct (uid, vid) play pairs
    user_item_rdd = sc.textFile(play_log)\
        .map(RddFuncs.parsePlayLog)\
        .map(lambda x: (x[0], x[1]))\
        .distinct()
    Util.debugRdd(user_item_rdd, 'user_item_rdd', logger)
    # (vid, uv), keeping only videos watched by more than one user
    item_uv_rdd = user_item_rdd\
        .map(lambda x: (x[1], 1))\
        .reduceByKey(lambda a, b: a + b)\
        .filter(lambda x: x[1] > 1)
    Util.debugRdd(item_uv_rdd, 'item_uv_rdd', logger)
    #(item, (group, uv)) => (group, summed uv)
    group_uv_rdd = group_item_rdd\
        .map(lambda x: (x[1], x[0]))\
        .join(item_uv_rdd)\
        .map(lambda x: (x[1][0], x[1][1]))\
        .reduceByKey(lambda a, b: a + b)
    Util.debugRdd(group_uv_rdd, 'group_uv_rdd', logger)
    # rank groups by uv (zero-filled for unplayed groups), attach generated
    # tag names, drop groups genTagName cannot name, and format each line as
    # group_id \t name(s) \t vid$$vid...
    output_rdd = one_feature_group_rdd\
        .leftOuterJoin(group_uv_rdd)\
        .map(RddFuncs.fillZeroUV)\
        .repartition(1)\
        .sortBy(lambda x: x[1][1], ascending=False)\
        .map(lambda x: (x[0], '$$'.join(x[1][0])))\
        .map(lambda x: (x[0], name_gen.genTagName(x[0], works_type),
            x[1]))\
        .filter(lambda x: x[1] is not None)\
        .map(lambda x: '{0}\t{1}\t{2}'.format(x[0], '\t'.join(x[1]), x[2]))
    hdfs_path = EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type]
    local_path = os.path.join(EnvConfig.LOCAL_DATA_PATH_DICT[works_type],
            'channel_tmp')
    Util.save_rdd(output_rdd, hdfs_path, local_path)
    # drop near-duplicate groups, then publish the final channel file
    GroupFilter.similar_filter(works_type)
    succeeded = HDFS.overwrite(EnvConfig.LOCAL_CHANNEL_PATH_DICT[works_type],
            EnvConfig.HDFS_CHANNEL_PATH_DICT[works_type])
    return succeeded