import logging
import os


def main():
    global user_matrix, user_bias, item_matrix, item_bias, \
        user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')
    test_file = os.path.join(base_dir, '../comment.keyword.test')
    #test_file = os.path.join(base_dir, 'train/comment.mongo.train')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # load the word2vec vectors and the logistic-regression model trained on them
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(
        vector_directory, "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(
        vector_directory, "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector, vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    fout = open('./predict_res.all', 'w')
    user_count = 0
    for user_id in user_ids:
        if user_count % 1000 == 0:
            logging.info('user count:%d' % user_count)
        user_count += 1
        if user_id not in user_trained_shops:
            continue
        # keep only the 10 best-scoring candidate shops for this user
        heap = MinSizeHeap(10)
        for shop_id in shop_ids:
            # if the shop is in the training data, ignore it
            if shop_id in user_trained_shops[user_id]:
                continue
            heap.push((vector_score_function(user_id, shop_id), shop_id))
        heap.sort()
        for score, shop_id in heap.arr:
            fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))
    fout.close()
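# MinSizeHeap is defined elsewhere in this repo; the following is a minimal
# sketch consistent with its usage above (a `push` method, a `sort` method,
# and an `arr` attribute), assuming it keeps the `size` highest-scoring items
# by evicting the smallest element of a bounded min-heap.
import heapq


class MinSizeHeap(object):
    def __init__(self, size):
        self.size = size
        self.arr = []

    def push(self, item):
        # grow until full, then replace the smallest element if beaten
        if len(self.arr) < self.size:
            heapq.heappush(self.arr, item)
        elif item > self.arr[0]:
            heapq.heapreplace(self.arr, item)

    def sort(self):
        # order the kept items best-first for output
        self.arr.sort(reverse=True)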
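# mf_score_function is referenced (commented out) in the evaluation main()
# below but not defined in this file. The sketch below is a hypothetical
# reconstruction of a standard biased matrix-factorization predictor,
# assuming load_nmf_matrix returns dicts keyed by id, that user_score /
# item_score hold per-user / per-item mean stars used as fallbacks, and that
# user_miss / item_miss / all_miss are module-level counters.
import numpy as np

user_miss, item_miss, all_miss = 0, 0, 0


def mf_score_function(user_id, item_id):
    # predict global_bias + b_u + b_i + p_u . q_i when both ids are known;
    # otherwise fall back to the available mean score or the global bias
    global user_miss, item_miss, all_miss
    has_user = user_id in user_matrix
    has_item = item_id in item_matrix
    if has_user and has_item:
        return (global_bias + user_bias[user_id] + item_bias[item_id]
                + np.dot(user_matrix[user_id], item_matrix[item_id]))
    if not has_user and not has_item:
        all_miss += 1
        return global_bias
    if not has_user:
        user_miss += 1
        return item_score.get(item_id, global_bias)
    item_miss += 1
    return user_score.get(user_id, global_bias)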
# RMSE-evaluation variant of main(). Note that defining it here shadows the
# top-N prediction variant above, so only one should be active per run.
def main():
    global user_matrix, user_bias, item_matrix, item_bias, \
        user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')
    test_file = os.path.join(base_dir, '../comment.keyword.test')
    #test_file = os.path.join(base_dir, 'train/comment.mongo.train')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # load the tf-idf / word2vec vectors and logistic-regression models
    tfidf_directory = "../../paper/data/dianping/tfidf/vector"
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    tfidf_user_vector = os.path.join(
        tfidf_directory, "comment.keyword.train.user.vector.1000")
    tfidf_shop_vector = os.path.join(
        tfidf_directory, "comment.keyword.train.shop.vector.1000")
    user_vector = os.path.join(
        vector_directory, "comment.keyword.train.user.vector")
    shop_vector = os.path.join(
        vector_directory, "comment.keyword.train.shop.vector")
    tfidf_model_file = os.path.join(model_directory, "tfidf_top10K")
    vector_model_file = os.path.join(model_directory, "w2v_500")
    #tfidf_predictor = tfidf_lr_predictor(tfidf_user_vector, tfidf_shop_vector, tfidf_model_file)
    vec_predictor = vec_lr_predictor(user_vector, shop_vector, vector_model_file)

    logging.info('calculating rmse...')
    #rmse = cal_rmse(test_file, mf_score_function)
    rmse = cal_rmse(test_file, vector_score_function)
    print 'rmse:%f' % rmse
    logging.info('user_miss:%d, item_miss:%d, all_miss:%d'
                 % (user_miss, item_miss, all_miss))
    #logging.info('tfidf_predictor.hit:%d, miss:%d' % (tfidf_predictor.hit, tfidf_predictor.miss))
    logging.info('vec_predictor.hit:%d, miss:%d'
                 % (vec_predictor.hit, vec_predictor.miss))
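# cal_rmse is defined elsewhere in this repo; a minimal sketch, assuming each
# test-file line is "user_id \t item_id \t star" (matching the
# *_user_item_star naming used for the training file).
import math


def cal_rmse(test_file, score_function):
    squared_error, n = 0.0, 0
    with open(test_file) as fin:
        for line in fin:
            user_id, item_id, star = line.rstrip('\n').split('\t')[:3]
            diff = score_function(user_id, item_id) - float(star)
            squared_error += diff * diff
            n += 1
    # root mean squared error over all test pairs
    return math.sqrt(squared_error / n) if n else 0.0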
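# load_user_trained_shops and load_ids are also defined elsewhere; minimal
# sketches, assuming the score file is "user \t shop \t star" per line and
# the vector files are word2vec text format (an id followed by floats, with
# a "count dim" header line that load_ids must skip).


def load_user_trained_shops(user_item_score_file):
    # map each user to the set of shops already seen in training
    user_shops = {}
    with open(user_item_score_file) as fin:
        for line in fin:
            user_id, shop_id = line.split('\t')[:2]
            user_shops.setdefault(user_id, set()).add(shop_id)
    return user_shops


def load_ids(vector_file):
    # collect the id in the first column of every vector line
    ids = []
    with open(vector_file) as fin:
        for line in fin:
            fields = line.split()
            if len(fields) > 2:  # skip the "count dim" header
                ids.append(fields[0])
    return ids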