# Dimensions of `ori`: M is its number of rows, V the length of its first row.
M = len(ori)
V = len(ori[0])

# Both dimensions must split evenly into `d` parts.
assert M % d == 0
assert V % d == 0

# Per-part sizes. Floor division keeps these integral: with true division
# (Python 3, or `from __future__ import division`) `/` would produce floats,
# which numpy.random.rand() rejects as dimensions. For ints under Python 2
# `//` is identical to `/`, and the asserts above guarantee exactness.
m = M // d
v = V // d

# NOTE(review): presumably SGD hyper-parameters (learning rate,
# regularisation weight, step decay) -- their use is outside this view.
GAMMA = 0.02
LAMBDA = 0.1
STEP = 0.9

# Shared stores for the factor blocks, looked up by block index in sgd().
W = MutableDict(d)
H = MutableDict(d)

# Ship `ori` to the workers once instead of with every task closure.
ori_b = dpark.broadcast(ori)


def sgd(i_j):
    # NOTE(review): the remainder of this function lies outside the visible
    # source; only the block-initialisation prologue is shown here.
    (i, j) = i_j

    # Lazily create the (m x k) block for index i on first use.
    Wi = W.get(i)
    if Wi is None:
        Wi = numpy.random.rand(m, k)
        W.put(i, Wi)

    # Lazily create the (v x k) block for index j on first use.
    Hj = H.get(j)
    if Hj is None:
        Hj = numpy.random.rand(v, k)
        H.put(j, Hj)

    # Local alias for the broadcast data (shadows the module-level `ori`).
    ori = ori_b.value
IBIAS_PATH = '/nfs/wuhong/offline_use/ibias_0/'
RATING_PATH = '/nfs/wuhong/fm_data/user_music_factor_model/user_track_rating_for_training/'
ITEM_FACTOR_PATH = '/nfs/wuhong/offline_use/H_0/'
NEW_RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
NEW_ITEM_FACTOR_PATH = '/nfs/wuhong/offline_use/H_new/'

dpark = DparkContext()

# Read the global value from MU_PATH: it is the second tab-separated field of
# the LAST line of the file. The `with` block guarantees the handle is closed
# even if reading fails (the handle previously leaked on any exception raised
# before the explicit close()).
line = ''
with open(MU_PATH) as f_global:
    for l in f_global:
        line = l
# Raises IndexError if the file is empty or the last line has no tab.
mu = float(line.strip().split('\t')[1])
mu = dpark.broadcast(mu)


def local_mapper(line):
    """Parse one 'iid<TAB>value<TAB>extra' line into ``(iid, float(value))``.

    The third field is ignored. Raises ValueError if the line does not have
    exactly three tab-separated fields or the value is not a valid float.
    """
    iid, v, _ = line.strip().split('\t')
    return (iid, float(v))


# Map of iid -> value collected from IBIAS_PATH, broadcast to the workers.
ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
    local_mapper
).collectAsMap()
ibias = dpark.broadcast(ibias)


def local_mapper2(line):
    """Turn 'uid<TAB>iid<TAB>aid<TAB>value' into a 'uid,iid,residual\\n' row.

    The residual is ``float(value) - mu - ibias[iid]``. Raises ValueError on a
    malformed line and KeyError when ``iid`` is absent from ``ibias``.
    """
    uid, iid, aid, v = line.strip().split('\t')
    # Unwrap the broadcast handles explicitly rather than relying on the
    # handle object forwarding arithmetic / indexing to the wrapped value.
    residual = float(v) - mu.value - ibias.value[iid]
    return '%s,%s,%s\n' % (uid, iid, residual)
.filter(lambda x:x)\ .map(lambda l: general_map.value.parse(l, spec))\ .filter(lambda x:x)\ .filter(lambda line: (not is_spider(line) and (line['uid'] or line['bid'])))\ .filter(lambda l: l['bid'] not in fraud.value and l['uid'] not in fraud.value) spec = set(['url', 'uid', 'bid', 'unit_id', 'ad_id', 'status_code', 'user_agent', 'region', 'page_tags', 'hour', 'group']) features = common_gen(spec) features = features.map(feature_extract)\ .filter(lambda x:x)\ .cache() user_list = set(features.map(lambda x: x[0]).filter(lambda x: x<>'None').collect()) user_list_b = dp.broadcast(user_list) user_feature = dp.makeRDD([]) def _parse_list(line): uid, features = line.split('\t') features = [x.split(':') for x in features.split('|')] features = [(x[0], float(x[1])) for x in features] features = sorted(features, key=lambda x: x[1], reverse=True) return (uid, features) for name in ['book_cluster', 'movie_cluster', 'group_cluster', 'text_cluster']: fn = '/home2/alg/user_profile/%s/%s' % (current_date, name) if not os.path.exists(fn): continue rdd = dp.textFile(fn, splitSize=16<<20)\