def bipartite(user_k, item_k, num_rounds=5):
    """Alternately cluster the columns and rows of the rating matrix.

    Every round runs ``k_means`` on the current column representation,
    averages the original matrix over each column cluster, runs ``k_means``
    on the rows of that condensed matrix, averages the original matrix over
    each row cluster, and uses the transposed result as the column
    representation for the next round.

    Following the variable names, columns are treated as users and rows as
    items. ``k_means`` is assumed to return a mapping from cluster index to
    the indices of its members -- TODO confirm against its definition.

    Args:
        user_k: number of column (user) clusters requested from ``k_means``.
        item_k: number of row (item) clusters requested from ``k_means``.
        num_rounds: number of alternating rounds to run. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        Tuple ``(user_dict, train_mtx_p, item_dict, train_mtx_pp)`` from the
        last round: the column clustering, the ``(rows, user_k)`` matrix of
        per-cluster column means, the row clustering, and the
        ``(item_k, cols)`` matrix of per-cluster row means.
    """
    train_mtx_ori = rating_matrix.matrix_transfer(2)
    row, col = train_mtx_ori.shape
    # The first round clusters the columns of the untouched matrix.
    train_mtx = np.transpose(train_mtx_ori)

    # Placeholders that are returned as-is when no round is executed.
    user_dict = {}
    item_dict = {}
    train_mtx_p = []
    train_mtx_pp = []

    for _ in range(num_rounds):
        # step 1: cluster the columns (users)
        user_dict = k_means(train_mtx, user_k)

        # step 2: one column per user cluster, holding the mean of the
        # original columns that belong to it
        train_mtx_p = np.zeros((row, user_k))
        for cluster in user_dict:
            members = user_dict.get(cluster)
            cluster_mean = train_mtx_ori[:, members].mean(axis=1)
            train_mtx_p[:, cluster] = np.asarray(cluster_mean).reshape(row)

        # step 3: cluster the rows (items) of the condensed matrix
        item_dict = k_means(train_mtx_p, item_k)

        # step 4: one row per item cluster, holding the mean of the
        # original rows that belong to it
        train_mtx_pp = np.zeros((item_k, col))
        for cluster in item_dict:
            members = item_dict.get(cluster)
            cluster_mean = train_mtx_ori[members, :].mean(axis=0)
            train_mtx_pp[cluster, :] = np.asarray(cluster_mean).reshape(col)

        # step 5: the next round clusters users in item-cluster space
        train_mtx = np.transpose(train_mtx_pp)

    # Single-argument call form prints the same text on Python 2 and 3.
    print('bipartite finished.')
    user_item_dict = (user_dict, train_mtx_p, item_dict, train_mtx_pp)
    return user_item_dict
def bipartite(user_k, item_k, num_rounds=5):
    """Alternately cluster the columns and rows of the rating matrix.

    Every round runs ``k_means`` on the current column representation,
    averages the original matrix over each column cluster, runs ``k_means``
    on the rows of that condensed matrix, averages the original matrix over
    each row cluster, and uses the transposed result as the column
    representation for the next round.

    Following the variable names, columns are treated as users and rows as
    items. ``k_means`` is assumed to return a mapping from cluster index to
    the indices of its members -- TODO confirm against its definition.

    Args:
        user_k: number of column (user) clusters requested from ``k_means``.
        item_k: number of row (item) clusters requested from ``k_means``.
        num_rounds: number of alternating rounds to run. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        Tuple ``(user_dict, train_mtx_p, item_dict, train_mtx_pp)`` from the
        last round: the column clustering, the ``(rows, user_k)`` matrix of
        per-cluster column means, the row clustering, and the
        ``(item_k, cols)`` matrix of per-cluster row means.
    """
    train_mtx_ori = rating_matrix.matrix_transfer(2)
    row, col = train_mtx_ori.shape
    # The first round clusters the columns of the untouched matrix.
    train_mtx = np.transpose(train_mtx_ori)

    # Placeholders that are returned as-is when no round is executed.
    user_dict = {}
    item_dict = {}
    train_mtx_p = []
    train_mtx_pp = []

    for _ in range(num_rounds):
        # step 1: cluster the columns (users)
        user_dict = k_means(train_mtx, user_k)

        # step 2: one column per user cluster, holding the mean of the
        # original columns that belong to it
        train_mtx_p = np.zeros((row, user_k))
        for cluster in user_dict:
            members = user_dict.get(cluster)
            cluster_mean = train_mtx_ori[:, members].mean(axis=1)
            train_mtx_p[:, cluster] = np.asarray(cluster_mean).reshape(row)

        # step 3: cluster the rows (items) of the condensed matrix
        item_dict = k_means(train_mtx_p, item_k)

        # step 4: one row per item cluster, holding the mean of the
        # original rows that belong to it
        train_mtx_pp = np.zeros((item_k, col))
        for cluster in item_dict:
            members = item_dict.get(cluster)
            cluster_mean = train_mtx_ori[members, :].mean(axis=0)
            train_mtx_pp[cluster, :] = np.asarray(cluster_mean).reshape(col)

        # step 5: the next round clusters users in item-cluster space
        train_mtx = np.transpose(train_mtx_pp)

    # Single-argument call form prints the same text on Python 2 and 3.
    print('bipartite finished.')
    user_item_dict = (user_dict, train_mtx_p, item_dict, train_mtx_pp)
    return user_item_dict
def pcc_item_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar movies using standardized rows.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``. Each
    movie row is mean-centered and scaled to unit norm before the movie
    similarity matrix is computed, so the similarity is Pearson-style.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring movies used for every prediction.
        option: 1 = dot-product similarity, plain mean;
            2 = dot-product similarity, similarity-weighted mean;
            3 = cosine similarity, plain mean;
            4 = cosine similarity, similarity-weighted mean.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)

    # Add a small bias to the all-zero column vectors.
    # NOTE(review): these are columns, which are indexed by user_id below,
    # even though the original variable was named after items -- confirm.
    zero_cols = np.where(~train_mtx.any(axis=0))[0]
    train_mtx[:, [zero_cols]] = 0.001

    # Standardize every movie vector: after the transpose each column is one
    # movie, so axis=0 statistics are per movie.
    pcc_mtx = np.transpose(train_mtx)
    pcc_mtx = pcc_mtx - np.sum(pcc_mtx, axis=0) / len(pcc_mtx)
    col_norm = np.linalg.norm(pcc_mtx, axis=0)
    # A constant movie vector is all zeros after centering; dividing by its
    # zero norm would turn it into NaN and corrupt the neighbour ranking.
    # Dividing by 1 keeps it a harmless zero vector instead.
    col_norm[col_norm == 0] = 1.0
    pcc_mtx /= col_norm
    pcc_mtx = np.transpose(pcc_mtx)

    item_sim_mtx = []
    if option == 1 or option == 2:
        item_sim_mtx = movie_sim.item_dot_sim(pcc_mtx)
    if option == 3 or option == 4:
        item_sim_mtx = movie_sim.item_cos_sim(pcc_mtx)

    pred_list = []
    for row in pair:
        pred_rating = 0
        movie_id = row[0]
        user_id = row[1]
        item_sim_list = item_sim_mtx[movie_id]

        # Take the k + 1 most similar movies, then drop the movie itself if
        # it made the cut, otherwise drop the least similar candidate, so
        # that exactly k neighbours remain.
        item_knn_list = np.argsort(item_sim_list)[::-1][0:k + 1]
        if movie_id in item_knn_list:
            position = np.where(item_knn_list == movie_id)
            item_knn_list = np.delete(item_knn_list, position)
        else:
            item_knn_list = np.delete(item_knn_list, len(item_knn_list) - 1)

        # Ratings this user gave to the neighbouring movies.
        neighbour_ratings = np.take(train_mtx[:, user_id],
                                    item_knn_list.tolist())

        # The constant 3 added below presumably undoes an offset applied by
        # matrix_transfer(2) -- TODO confirm.
        if option == 1 or option == 3:
            pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        if option == 2 or option == 4:
            item_knn_sim = item_sim_list[item_knn_list]
            sim_total = np.sum(item_knn_sim)
            if sim_total != 0:
                weight = item_knn_sim / sim_total
                pred_rating = np.sum(
                    np.multiply(neighbour_ratings, weight)) + 3
            else:
                # No usable similarity mass: fall back to the neutral rating.
                pred_rating = 3.0
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def user_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar users.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring users used for every prediction.
        option: 1 = dot-product similarity, plain mean;
            2 = dot-product similarity, similarity-weighted mean;
            3 = cosine similarity, plain mean;
            4 = cosine similarity, similarity-weighted mean.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)
    user_sim_mtx = []
    pred_list = []
    user_zero_vec = np.where(~train_mtx.any(axis=0))[0]

    if option == 1 or option == 2:
        user_sim_mtx = user_sim.user_dot_sim(train_mtx)
    if option == 3 or option == 4:
        # Add a bias to the all-zero column vectors; only the first row is
        # touched, which is enough to make the column non-zero.
        train_mtx[0, [user_zero_vec]] = 0.001
        user_sim_mtx = user_sim.user_cos_sim(train_mtx)

    # TODO: weighted mean need refine
    for row in pair:
        pred_rating = 0
        movie_id = row[0]
        user_id = row[1]
        user_sim_list = user_sim_mtx[user_id]

        # Take the k + 1 most similar users, then drop the user itself if it
        # made the cut, otherwise drop the least similar candidate, so that
        # exactly k neighbours remain.
        user_knn_list = np.argsort(user_sim_list)[::-1][0:k + 1]
        # TODO: if two sim equals, small user_id comes first
        if user_id in user_knn_list:
            position = np.where(user_knn_list == user_id)
            user_knn_list = np.delete(user_knn_list, position)
        else:
            user_knn_list = np.delete(user_knn_list, len(user_knn_list) - 1)

        movie_row = train_mtx[movie_id, :]
        # Ratings the neighbouring users gave to this movie.
        neighbour_ratings = np.take(movie_row, user_knn_list.tolist())

        # The constant 3 added below presumably undoes an offset applied by
        # matrix_transfer(2) -- TODO confirm.
        if option == 1 or option == 3:
            pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        # TODO: problem exists, what if weighted sum is zero
        if option == 2 or option == 4:
            user_knn_sim = user_sim_list[user_knn_list]
            sim_total = np.sum(user_knn_sim)
            if sim_total != 0:
                weight = user_knn_sim / sim_total
                pred_rating = np.sum(
                    np.multiply(neighbour_ratings, weight)) + 3
            else:
                # Fall back to the mean of the movie's non-zero entries. A
                # movie without any non-zero entry used to divide by zero
                # and produce NaN; it now gets the neutral rating.
                rated = np.count_nonzero(movie_row)
                if rated:
                    pred_rating = np.sum(movie_row) / float(rated) + 3
                else:
                    pred_rating = 3.0
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def pcc_item_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar movies using standardized rows.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``. Each
    movie row is mean-centered and scaled to unit norm before the movie
    similarity matrix is computed, so the similarity is Pearson-style.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring movies used for every prediction.
        option: 1 = dot-product similarity, plain mean;
            2 = dot-product similarity, similarity-weighted mean;
            3 = cosine similarity, plain mean;
            4 = cosine similarity, similarity-weighted mean.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)

    # Add a small bias to the all-zero column vectors.
    # NOTE(review): these are columns, which are indexed by user_id below,
    # even though the original variable was named after items -- confirm.
    zero_cols = np.where(~train_mtx.any(axis=0))[0]
    train_mtx[:, [zero_cols]] = 0.001

    # Standardize every movie vector: after the transpose each column is one
    # movie, so axis=0 statistics are per movie.
    pcc_mtx = np.transpose(train_mtx)
    pcc_mtx = pcc_mtx - np.sum(pcc_mtx, axis=0) / len(pcc_mtx)
    col_norm = np.linalg.norm(pcc_mtx, axis=0)
    # A constant movie vector is all zeros after centering; dividing by its
    # zero norm would turn it into NaN and corrupt the neighbour ranking.
    # Dividing by 1 keeps it a harmless zero vector instead.
    col_norm[col_norm == 0] = 1.0
    pcc_mtx /= col_norm
    pcc_mtx = np.transpose(pcc_mtx)

    item_sim_mtx = []
    if option == 1 or option == 2:
        item_sim_mtx = movie_sim.item_dot_sim(pcc_mtx)
    if option == 3 or option == 4:
        item_sim_mtx = movie_sim.item_cos_sim(pcc_mtx)

    pred_list = []
    for row in pair:
        pred_rating = 0
        movie_id = row[0]
        user_id = row[1]
        item_sim_list = item_sim_mtx[movie_id]

        # Take the k + 1 most similar movies, then drop the movie itself if
        # it made the cut, otherwise drop the least similar candidate, so
        # that exactly k neighbours remain.
        item_knn_list = np.argsort(item_sim_list)[::-1][0:k + 1]
        if movie_id in item_knn_list:
            position = np.where(item_knn_list == movie_id)
            item_knn_list = np.delete(item_knn_list, position)
        else:
            item_knn_list = np.delete(item_knn_list, len(item_knn_list) - 1)

        # Ratings this user gave to the neighbouring movies.
        neighbour_ratings = np.take(train_mtx[:, user_id],
                                    item_knn_list.tolist())

        # The constant 3 added below presumably undoes an offset applied by
        # matrix_transfer(2) -- TODO confirm.
        if option == 1 or option == 3:
            pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        if option == 2 or option == 4:
            item_knn_sim = item_sim_list[item_knn_list]
            sim_total = np.sum(item_knn_sim)
            if sim_total != 0:
                weight = item_knn_sim / sim_total
                pred_rating = np.sum(
                    np.multiply(neighbour_ratings, weight)) + 3
            else:
                # No usable similarity mass: fall back to the neutral rating.
                pred_rating = 3.0
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def item_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar movies.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring movies used for every prediction.
        option: 1 = dot-product similarity, plain mean;
            2 = dot-product similarity, similarity-weighted mean;
            3 = cosine similarity, plain mean;
            4 = cosine similarity, similarity-weighted mean.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)
    # Indices of the all-zero columns of the training matrix.
    item_zero_vec = np.where(~train_mtx.any(axis=0))[0]
    item_sim_mtx = []
    pred_list = []

    if option == 1 or option == 2:
        item_sim_mtx = movie_sim.item_dot_sim(train_mtx)
    if option == 3 or option == 4:
        # Bias the all-zero columns before the cosine similarity is taken.
        train_mtx[:, [item_zero_vec]] = 0.001
        item_sim_mtx = movie_sim.item_cos_sim(train_mtx)

    for row in pair:
        pred_rating = 0
        movie_id = row[0]
        user_id = row[1]
        item_sim_list = item_sim_mtx[movie_id]

        # Take the k + 1 most similar movies, then drop the movie itself if
        # it made the cut, otherwise drop the least similar candidate, so
        # that exactly k neighbours remain.
        item_knn_list = np.argsort(item_sim_list)[::-1][0:k + 1]
        if movie_id in item_knn_list:
            position = np.where(item_knn_list == movie_id)
            item_knn_list = np.delete(item_knn_list, position)
        else:
            item_knn_list = np.delete(item_knn_list, len(item_knn_list) - 1)

        # Ratings this user gave to the neighbouring movies.
        neighbour_ratings = np.take(train_mtx[:, user_id],
                                    item_knn_list.tolist())

        # The constant 3 added below presumably undoes an offset applied by
        # matrix_transfer(2) -- TODO confirm.
        if option == 1 or option == 3:
            pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        if option == 2 or option == 4:
            item_knn_sim = item_sim_list[item_knn_list]
            sim_total = np.sum(item_knn_sim)
            if sim_total != 0:
                weight = item_knn_sim / sim_total
                pred_rating = np.sum(
                    np.multiply(neighbour_ratings, weight)) + 3
            else:
                # Fall back to the mean of the movie's non-zero entries. A
                # movie without any non-zero entry used to divide by zero
                # and produce NaN; it now gets the neutral rating.
                movie_row = train_mtx[movie_id, :]
                rated = np.count_nonzero(movie_row)
                if rated:
                    pred_rating = np.sum(movie_row) / float(rated) + 3
                else:
                    pred_rating = 3.0
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def user_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar users.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring users used for every prediction.
        option: 1 = dot-product similarity, plain mean;
            2 = dot-product similarity, similarity-weighted mean;
            3 = cosine similarity, plain mean;
            4 = cosine similarity, similarity-weighted mean.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)
    user_sim_mtx = []
    pred_list = []
    user_zero_vec = np.where(~train_mtx.any(axis=0))[0]

    if option == 1 or option == 2:
        user_sim_mtx = user_sim.user_dot_sim(train_mtx)
    if option == 3 or option == 4:
        # Add a bias to the all-zero column vectors; only the first row is
        # touched, which is enough to make the column non-zero.
        train_mtx[0, [user_zero_vec]] = 0.001
        user_sim_mtx = user_sim.user_cos_sim(train_mtx)

    # TODO: weighted mean need refine
    for row in pair:
        pred_rating = 0
        movie_id = row[0]
        user_id = row[1]
        user_sim_list = user_sim_mtx[user_id]

        # Take the k + 1 most similar users, then drop the user itself if it
        # made the cut, otherwise drop the least similar candidate, so that
        # exactly k neighbours remain.
        user_knn_list = np.argsort(user_sim_list)[::-1][0:k + 1]
        # TODO: if two sim equals, small user_id comes first
        if user_id in user_knn_list:
            position = np.where(user_knn_list == user_id)
            user_knn_list = np.delete(user_knn_list, position)
        else:
            user_knn_list = np.delete(user_knn_list, len(user_knn_list) - 1)

        movie_row = train_mtx[movie_id, :]
        # Ratings the neighbouring users gave to this movie.
        neighbour_ratings = np.take(movie_row, user_knn_list.tolist())

        # The constant 3 added below presumably undoes an offset applied by
        # matrix_transfer(2) -- TODO confirm.
        if option == 1 or option == 3:
            pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        # TODO: problem exists, what if weighted sum is zero
        if option == 2 or option == 4:
            user_knn_sim = user_sim_list[user_knn_list]
            sim_total = np.sum(user_knn_sim)
            if sim_total != 0:
                weight = user_knn_sim / sim_total
                pred_rating = np.sum(
                    np.multiply(neighbour_ratings, weight)) + 3
            else:
                # Fall back to the mean of the movie's non-zero entries. A
                # movie without any non-zero entry used to divide by zero
                # and produce NaN; it now gets the neutral rating.
                rated = np.count_nonzero(movie_row)
                if rated:
                    pred_rating = np.sum(movie_row) / float(rated) + 3
                else:
                    pred_rating = 3.0
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def item_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar movies.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring movies used for every prediction.
        option: 1 = dot-product similarity, plain mean;
            2 = dot-product similarity, similarity-weighted mean;
            3 = cosine similarity, plain mean;
            4 = cosine similarity, similarity-weighted mean.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)
    # Indices of the all-zero columns of the training matrix.
    item_zero_vec = np.where(~train_mtx.any(axis=0))[0]
    item_sim_mtx = []
    pred_list = []

    if option == 1 or option == 2:
        item_sim_mtx = movie_sim.item_dot_sim(train_mtx)
    if option == 3 or option == 4:
        # Bias the all-zero columns before the cosine similarity is taken.
        train_mtx[:, [item_zero_vec]] = 0.001
        item_sim_mtx = movie_sim.item_cos_sim(train_mtx)

    for row in pair:
        pred_rating = 0
        movie_id = row[0]
        user_id = row[1]
        item_sim_list = item_sim_mtx[movie_id]

        # Take the k + 1 most similar movies, then drop the movie itself if
        # it made the cut, otherwise drop the least similar candidate, so
        # that exactly k neighbours remain.
        item_knn_list = np.argsort(item_sim_list)[::-1][0:k + 1]
        if movie_id in item_knn_list:
            position = np.where(item_knn_list == movie_id)
            item_knn_list = np.delete(item_knn_list, position)
        else:
            item_knn_list = np.delete(item_knn_list, len(item_knn_list) - 1)

        # Ratings this user gave to the neighbouring movies.
        neighbour_ratings = np.take(train_mtx[:, user_id],
                                    item_knn_list.tolist())

        # The constant 3 added below presumably undoes an offset applied by
        # matrix_transfer(2) -- TODO confirm.
        if option == 1 or option == 3:
            pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        if option == 2 or option == 4:
            item_knn_sim = item_sim_list[item_knn_list]
            sim_total = np.sum(item_knn_sim)
            if sim_total != 0:
                weight = item_knn_sim / sim_total
                pred_rating = np.sum(
                    np.multiply(neighbour_ratings, weight)) + 3
            else:
                # Fall back to the mean of the movie's non-zero entries. A
                # movie without any non-zero entry used to divide by zero
                # and produce NaN; it now gets the neutral rating.
                movie_row = train_mtx[movie_id, :]
                rated = np.count_nonzero(movie_row)
                if rated:
                    pred_rating = np.sum(movie_row) / float(rated) + 3
                else:
                    pred_rating = 3.0
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def pcc_user_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar users using standardized columns.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``. Each
    user column is mean-centered and scaled to unit norm before the user
    similarity matrix is computed, so the similarity is Pearson-style.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring users used for every prediction.
        option: 1 or 2 selects dot-product similarity, 3 or 4 selects cosine
            similarity. The prediction itself is always the plain mean of
            the neighbours' ratings; a weighted variant is not implemented.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)

    # Add a bias to the all-zero column vectors.
    user_zero_vec = np.where(~train_mtx.any(axis=0))[0]
    train_mtx[:, [user_zero_vec]] = 0.001

    # User rating standardization: center each column, then scale it by the
    # norm of the *centered* column. The norm of the uncentered matrix was
    # used before, which does not give unit-length vectors and disagrees
    # with the item-based counterpart of this function.
    pcc_mtx = train_mtx - np.sum(train_mtx, axis=0) / len(train_mtx)
    col_norm = np.linalg.norm(pcc_mtx, axis=0)
    # Constant columns (for example the biased ones) are all zeros after
    # centering; divide them by 1 so they stay zero instead of becoming NaN.
    col_norm[col_norm == 0] = 1.0
    pcc_mtx /= col_norm

    user_sim_mtx = []
    pred_list = []
    if option == 1 or option == 2:
        user_sim_mtx = user_sim.user_dot_sim(pcc_mtx)
    if option == 3 or option == 4:
        user_sim_mtx = user_sim.user_cos_sim(pcc_mtx)

    # TODO: weighted mean need refine
    for row in pair:
        movie_id = row[0]
        user_id = row[1]
        user_sim_list = user_sim_mtx[user_id]

        # Take the k + 1 most similar users, then drop the user itself if it
        # made the cut, otherwise drop the least similar candidate, so that
        # exactly k neighbours remain.
        user_knn_list = np.argsort(user_sim_list)[::-1][0:k + 1]
        # TODO: if two sim equals, small user_id comes first
        if user_id in user_knn_list:
            position = np.where(user_knn_list == user_id)
            user_knn_list = np.delete(user_knn_list, position)
        else:
            user_knn_list = np.delete(user_knn_list, len(user_knn_list) - 1)

        # Plain mean of the neighbours' ratings for this movie. The constant
        # 3 presumably undoes an offset applied by matrix_transfer(2) --
        # TODO confirm.
        neighbour_ratings = np.take(train_mtx[movie_id, :],
                                    user_knn_list.tolist())
        pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def _print_stat(label, value):
    """Print one labelled statistic followed by a blank line."""
    # Joining with single spaces and ending on '\n' reproduces exactly what
    # the Python 2 statement ``print label, value, '\n'`` emits, while also
    # being valid on Python 3.
    print(' '.join((label, str(value), '\n')))


def _print_rating_stats(ratings, count_label=None):
    """Print rating counts (1, 3, 5) and the average of the non-zero entries.

    Args:
        ratings: array of ratings; zero entries are excluded from the
            average's denominator.
        count_label: when given, the number of non-zero entries is printed
            first under this label.
    """
    rated = np.count_nonzero(ratings)
    if count_label is not None:
        _print_stat(count_label, rated)

    labelled_scores = (
        (1, 'movie with rating 1: '),
        (3, 'movie with rating 3: '),
        (5, 'movie with rating 5: '),
    )
    for score, label in labelled_scores:
        _print_stat(label, np.count_nonzero(ratings == score))

    # float() forces true division; with an integer-typed matrix the average
    # would otherwise be truncated on Python 2.
    _print_stat('movie rating average: ', np.sum(ratings) / float(rated))


def explorer(user_id=4321, movie_id=3):
    """Print descriptive statistics of the rating matrix.

    Three sections are printed in order: statistics over the whole matrix,
    over one user's column, and over one movie's row. The matrix is indexed
    as ``train_mtx[movie_id, user_id]``.

    Args:
        user_id: column whose statistics are printed. Defaults to 4321, the
            value that was previously hard-coded.
        movie_id: row whose statistics are printed. Defaults to 3, the value
            that was previously hard-coded.
    """
    train_mtx = rating_matrix.matrix_transfer(0)

    # ***** part 1.1.1: statistics *****
    _print_rating_stats(train_mtx)

    # ***** part 1.1.2: one user (default user_id 4321) *****
    _print_rating_stats(train_mtx[:, user_id], 'number of movie rated: ')

    # ***** part 1.1: one movie (default movie_id 3) *****
    _print_rating_stats(train_mtx[movie_id, :], 'number of user rated: ')
def pcc_user_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar users using standardized columns.

    The training matrix is indexed as ``train_mtx[movie_id, user_id]``. Each
    user column is mean-centered and scaled to unit norm before the user
    similarity matrix is computed, so the similarity is Pearson-style.

    Args:
        pair_path: path handed to ``pred_set.pred_pair``; each returned row
            is read as ``(movie_id, user_id, ...)``.
        k: number of neighbouring users used for every prediction.
        option: 1 or 2 selects dot-product similarity, 3 or 4 selects cosine
            similarity. The prediction itself is always the plain mean of
            the neighbours' ratings; a weighted variant is not implemented.

    Returns:
        List with one predicted rating per requested pair. The same list is
        also passed to ``pred_result.file_writer``.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)

    # Add a bias to the all-zero column vectors.
    user_zero_vec = np.where(~train_mtx.any(axis=0))[0]
    train_mtx[:, [user_zero_vec]] = 0.001

    # User rating standardization: center each column, then scale it by the
    # norm of the *centered* column. The norm of the uncentered matrix was
    # used before, which does not give unit-length vectors and disagrees
    # with the item-based counterpart of this function.
    pcc_mtx = train_mtx - np.sum(train_mtx, axis=0) / len(train_mtx)
    col_norm = np.linalg.norm(pcc_mtx, axis=0)
    # Constant columns (for example the biased ones) are all zeros after
    # centering; divide them by 1 so they stay zero instead of becoming NaN.
    col_norm[col_norm == 0] = 1.0
    pcc_mtx /= col_norm

    user_sim_mtx = []
    pred_list = []
    if option == 1 or option == 2:
        user_sim_mtx = user_sim.user_dot_sim(pcc_mtx)
    if option == 3 or option == 4:
        user_sim_mtx = user_sim.user_cos_sim(pcc_mtx)

    # TODO: weighted mean need refine
    for row in pair:
        movie_id = row[0]
        user_id = row[1]
        user_sim_list = user_sim_mtx[user_id]

        # Take the k + 1 most similar users, then drop the user itself if it
        # made the cut, otherwise drop the least similar candidate, so that
        # exactly k neighbours remain.
        user_knn_list = np.argsort(user_sim_list)[::-1][0:k + 1]
        # TODO: if two sim equals, small user_id comes first
        if user_id in user_knn_list:
            position = np.where(user_knn_list == user_id)
            user_knn_list = np.delete(user_knn_list, position)
        else:
            user_knn_list = np.delete(user_knn_list, len(user_knn_list) - 1)

        # Plain mean of the neighbours' ratings for this movie. The constant
        # 3 presumably undoes an offset applied by matrix_transfer(2) --
        # TODO confirm.
        neighbour_ratings = np.take(train_mtx[movie_id, :],
                                    user_knn_list.tolist())
        pred_rating = np.sum(neighbour_ratings) / float(k) + 3
        pred_list.append(pred_rating)

    # output the result
    pred_result.file_writer(pred_list)
    return pred_list