def pcc_item_rating_pred(path, rating, method, k):
    """Predict ratings with item-based k-NN over a normalised rating matrix.

    For every query row read from ``path`` the ``k`` most similar items are
    looked up in an item-item similarity matrix, the querying user's stored
    values for those items are combined into a score, and the scores are
    passed to ``write``. Elapsed time and the RMSE against ``golden()`` are
    printed.

    Args:
        path: Location handed to ``extract_data``; each returned row is read
            as ``row[0]`` = item id and ``row[1]`` = user id.
        rating: ``'mean'`` (sum of the neighbours' values divided by ``k``) or
            ``'weighted'`` (similarity-weighted sum of the neighbours' values).
        method: ``'dot'`` or ``'cos'``; selects ``dot_sim`` or ``cos_sim``.
        k: Number of neighbours to use.

    Raises:
        ValueError: If ``method`` or ``rating`` is not one of the supported
            values. Previously a bad ``method`` surfaced as an ``IndexError``
            on the first query and a bad ``rating`` silently produced
            all-zero predictions.
    """
    # Fail fast, before any data is loaded or any file is written.
    if method not in ('dot', 'cos'):
        raise ValueError(
            "method must be 'dot' or 'cos', got {!r}".format(method))
    if rating not in ('mean', 'weighted'):
        raise ValueError(
            "rating must be 'mean' or 'weighted', got {!r}".format(rating))

    start = time.time()
    name = 'pcc_item'
    data = extract_data(path)
    # The indexing below treats rows as items and columns as users.
    mtx = get_matrix(3).toarray()

    # Columns that are entirely zero get a tiny constant so the norms
    # computed below are not zero (guards against division by zero).
    zero = np.where(~mtx.any(axis=0))[0]
    mtx[:, [zero]] = 0.00001

    # Per-row normalisation.
    # NOTE(review): this subtracts each row's *sum* and divides by the number
    # of rows, which is not a plain mean-centring. Left untouched because
    # changing it would change every prediction - confirm the intent.
    pcc = (mtx.T - np.sum(mtx, axis=1)) / len(mtx)
    pcc /= np.linalg.norm(mtx, axis=1)
    mtx = pcc.T

    if method == 'dot':
        item_mtx = dot_sim(mtx, name)
    else:  # method == 'cos'
        # NOTE(review): rows are multiplied (not divided) by their norm before
        # cos_sim - presumably cos_sim expects this; verify against cos_sim.
        inputs = (mtx.T * np.linalg.norm(mtx, axis=1)).T
        item_mtx = cos_sim(inputs, name)

    # k-NN scoring, one prediction per query row.
    result = []
    for row in data:
        item_id = row[0]
        user_id = row[1]
        item = item_mtx[item_id]       # similarities of the query item
        user_ratings = mtx[:, user_id]  # this user's value for every item

        # Take the k + 1 highest-similarity candidates, then drop the query
        # item itself if present, otherwise the weakest candidate, leaving k.
        knn = np.argsort(item, kind='heapsort')[::-1][:k + 1]
        if item_id in knn:
            knn = knn[knn != item_id]
        else:
            knn = knn[:-1]

        neighbour_ratings = user_ratings[knn]

        # The "+ 3" presumably undoes the offset applied by get_matrix(3) -
        # TODO confirm against get_matrix.
        if rating == 'mean':
            score = np.sum(neighbour_ratings) / float(k) + 3
        else:  # rating == 'weighted'
            knn_sim = item[knn]
            sim_total = np.sum(knn_sim)
            if sim_total != 0:
                weight = knn_sim / sim_total
                score = np.sum(neighbour_ratings * weight) + 3
            else:
                # Weights are undefined when the similarities sum to zero:
                # fall back to the average of the user's non-zero entries.
                score = (np.sum(user_ratings)
                         / np.count_nonzero(user_ratings) + 3)
        result.append(score)

    write(result, name, rating, method, k)
    print('item_rating_pred {} {} {} time : {}'.format(method, rating, k,
                                                       time.time() - start))
    gold = golden()
    print("RMSE :", np.sqrt(np.mean(np.square(np.asarray(result) - gold))))
def user_rating_pred(path, rating, method, k):
    """Predict ratings with user-based k-NN over the rating matrix.

    For every query row read from ``path`` the ``k`` most similar users are
    looked up in a user-user similarity matrix, those users' stored values
    for the queried item are combined into a score, and the scores are passed
    to ``write``. Elapsed time and the RMSE against ``golden()`` are printed.

    Args:
        path: Location handed to ``extract_data``; each returned row is read
            as ``row[0]`` = item id and ``row[1]`` = user id.
        rating: ``'mean'`` (sum of the neighbours' values divided by ``k``) or
            ``'weighted'`` (similarity-weighted sum of the neighbours' values).
        method: ``'dot'`` or ``'cos'``; selects ``dot_sim`` or ``cos_sim``.
        k: Number of neighbours to use.

    Raises:
        ValueError: If ``method`` or ``rating`` is not one of the supported
            values. Previously a bad ``method`` surfaced as an ``IndexError``
            on the first query and a bad ``rating`` silently produced
            all-zero predictions.
    """
    # Fail fast, before any data is loaded or any file is written.
    if method not in ('dot', 'cos'):
        raise ValueError(
            "method must be 'dot' or 'cos', got {!r}".format(method))
    if rating not in ('mean', 'weighted'):
        raise ValueError(
            "rating must be 'mean' or 'weighted', got {!r}".format(rating))

    start = time.time()
    name = 'user'
    data = extract_data(path)
    # The indexing below treats rows as items and columns as users.
    mtx = get_matrix(3).toarray()

    # Columns that are entirely zero get a tiny constant so later norms and
    # similarity sums are not zero (guards against division by zero).
    zero = np.where(~mtx.any(axis=0))[0]
    mtx[:, [zero]] = 0.00001

    if method == 'dot':
        user_mtx = dot_sim(mtx, name)
    else:  # method == 'cos'
        # NOTE(review): columns are multiplied (not divided) by their norm
        # before cos_sim - presumably cos_sim expects this; verify.
        inputs = np.linalg.norm(mtx, axis=0) * mtx
        user_mtx = cos_sim(inputs, name)

    # k-NN scoring, one prediction per query row.
    result = []
    for row in data:
        mv_id = row[0]
        user_id = row[1]
        user = user_mtx[user_id]   # similarities of the query user
        item_ratings = mtx[mv_id, :]  # every user's value for this item

        # Take the k + 1 highest-similarity candidates, then drop the query
        # user itself if present, otherwise the weakest candidate, leaving k.
        knn = np.argsort(user, kind='heapsort')[::-1][:k + 1]
        if user_id in knn:
            knn = knn[knn != user_id]
        else:
            knn = knn[:-1]

        neighbour_ratings = item_ratings[knn]

        # The "+ 3" presumably undoes the offset applied by get_matrix(3) -
        # TODO confirm against get_matrix.
        if rating == 'mean':
            score = np.sum(neighbour_ratings) / float(k) + 3
        else:  # rating == 'weighted'
            knn_sim = user[knn]
            sim_total = np.sum(knn_sim)
            if sim_total != 0:
                weight = knn_sim / sim_total
                score = np.sum(neighbour_ratings * weight) + 3
            else:
                # Weights are undefined when the similarities sum to zero:
                # fall back to the average of the item's non-zero entries.
                score = (np.sum(item_ratings)
                         / np.count_nonzero(item_ratings) + 3)
        result.append(score)

    write(result, name, rating, method, k)
    print('user_rating_pred {} {} {} time : {}'.format(method, rating, k,
                                                       time.time() - start))
    gold = golden()
    print("RMSE :", np.sqrt(np.mean(np.square(np.asarray(result) - gold))))
#!/usr/bin/env python # coding: utf-8 # In[2]: from myutils import get_matrix, extract_data import numpy as np import torch from copy import deepcopy import time # In[3]: data = extract_data('data/dev.csv') # In[4]: import csv f = open('data/dev.golden', 'r', encoding='utf-8') reader = csv.reader(f) golden = [] for i in reader: golden += i golden = np.array(golden, dtype=float) # In[5]: def get_score(U, V, data): u = U.numpy().take(data.take(1, axis=1), axis=0)