def train_trim_nmf(data, R):
    """10-fold CV of NMF over k = 2, 4, ..., 50, evaluated on the three trimmed test sets."""
    kfold = KFold(n_splits=10)
    # rmse_list rows: popular, unpopular, and high-variance trimming
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        nmf = NMF(n_factors=k)
        for trainset, testset in kfold.split(data):
            nmf.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)
            p_pred = nmf.test(p_testset)
            u_pred = nmf.test(u_testset)
            hv_pred = nmf.test(hv_testset)
            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("NMF with trim is finished!!")
    return rmse_list
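
# A minimal usage sketch for train_trim_nmf, assuming `data` is a surprise Dataset,
# `R` and `trim(testset, R)` are defined as above, and the k grid matches the one
# inside the function; the plotting and the printed summary are illustrative only.
import numpy as np
import matplotlib.pyplot as plt

ks = list(range(2, 52, 2))
rmse_curves = train_trim_nmf(data, R)
labels = ['Popular trimming', 'Unpopular trimming', 'High-variance trimming']
plt.figure()
for curve, label in zip(rmse_curves, labels):
    plt.plot(ks, curve, label=label)
    # report the best k per trimming scheme
    print('%s: min RMSE %.4f at k = %d' % (label, np.min(curve), ks[int(np.argmin(curve))]))
plt.xlabel('Number of latent factors k')
plt.ylabel('Average RMSE over 10 folds')
plt.legend()
plt.show()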
def nmf_compute_high_var_trim_rmse(k):
    nmf = NMF(n_factors=k, random_state=42)
    rmse = []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        nmf.fit(trainset)
        testset_trimmed = high_variance_trimming(testset, frequency, variance)
        pred = nmf.test(testset_trimmed)
        rmse.append(accuracy.rmse(pred, verbose=False))
    print('k: %s | RMSE: %f' % (k, np.mean(rmse)))
    return np.mean(rmse)
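
# Hypothetical sweep over the same k grid used elsewhere in this section, assuming
# nmf_compute_high_var_trim_rmse and the globals it reads (R, frequency, variance)
# are defined as above; it only illustrates how a best k could be picked.
import numpy as np

ks = list(range(2, 52, 2))
rmses = [nmf_compute_high_var_trim_rmse(k) for k in ks]
best_k = ks[int(np.argmin(rmses))]
print('Best k (high-variance trimming): %d, RMSE %.4f' % (best_k, min(rmses)))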
def nmf_compute_prec_rec(t):
    precision, recall = [], []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        nmf = NMF(n_factors=nmf_best_k, random_state=42)
        nmf.fit(trainset)
        trimmed_testset = trim_unpopular_user(testset, t, threshold)
        pred = nmf.test(trimmed_testset)
        precision_dict, recall_dict = calculate_precision_recall(pred, t, threshold)
        precision.append(np.mean(list(precision_dict.values())))
        recall.append(np.mean(list(recall_dict.values())))
    return np.mean(precision), np.mean(recall)
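
# Illustrative driver for nmf_compute_prec_rec, assuming the globals it uses
# (R, nmf_best_k, threshold, trim_unpopular_user, calculate_precision_recall)
# exist as above; the range of t and the plot are a sketch, not the report itself.
import matplotlib.pyplot as plt

t_values = range(1, 26)
results = [nmf_compute_prec_rec(t) for t in t_values]
precisions = [p for p, _ in results]
recalls = [r for _, r in results]

plt.figure()
plt.plot(recalls, precisions)
plt.xlabel('Average recall')
plt.ylabel('Average precision')
plt.title('NMF: precision vs recall over t (10-fold CV)')
plt.show()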
def NMF_trim_filter(ratings, dims, func, mv_dict):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)
    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            func(mv_dict, testset)  # trim the test set in place with the selected filter
            pred = nmf.test(testset)
            test_rmse = np.append(test_rmse, accuracy.rmse(pred, verbose=False))
            test_mae = np.append(test_mae, accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(test_mae)
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])
    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finished plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum average MAE is ', min_MAE)
def Q19to21(qNum):
    data = load_data()
    kf = KFold(n_splits=10)
    # map question number to the corresponding trimming function
    trimFun = {19: popularTrim, 20: unpopularTrim, 21: highVarTrim}
    RMSE = []
    for k in range(2, 20, 2):
        nmf = NMF(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            subsubRMSE = 0
            nmf.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d" % nTest)
            predictions = nmf.test(testSet)
            for p in predictions:
                subsubRMSE += pow(p.est - p.r_ui, 2)
            subRMSE.append(np.sqrt(subsubRMSE / nTest))
        # average RMSE over all train-test splits for this k
        RMSE.append(np.mean(subRMSE))
    return RMSE
def use_nmf():
    start = time.time()
    performance = []
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using NMF')
    algo_NMF = NMF()
    algo_NMF.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions_NMF = algo_NMF.test(testset)
    accuracy_rmse = accuracy.rmse(predictions_NMF)
    accuracy_mae = accuracy.mae(predictions_NMF)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)
    end = time.time()
    performance.append(end - start)
    return performance
def NMF_bin_pre(ratings, ts, nmf_fac, thrd):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=ts)
    algo = NMF(n_factors=nmf_fac, random_state=42)
    algo.fit(trainset)
    pre = algo.test(testset)
    true_rating = np.empty(len(pre))
    pred_rating = np.empty(len(pre))
    for i in range(len(pre)):
        true_rating[i] = pre[i][2]  # r_ui: the observed rating
        pred_rating[i] = pre[i][3]  # est: the predicted rating
    # binarize the ground truth around the threshold thrd
    bi_rating = np.empty(len(pre))
    one_idx = true_rating >= thrd
    zero_idx = true_rating < thrd
    bi_rating[one_idx] = 1.0
    bi_rating[zero_idx] = 0.0
    return bi_rating, pred_rating
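
# Sketch of how NMF_bin_pre's output could feed an ROC/AUC computation with
# scikit-learn, assuming the `ratings` DataFrame from above is loaded; the
# factor count, test size, and rating threshold here are assumptions.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

bi_rating, pred_rating = NMF_bin_pre(ratings, ts=0.1, nmf_fac=20, thrd=3.0)
fpr, tpr, _ = roc_curve(bi_rating, pred_rating)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='NMF (AUC = %.4f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()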
    plt.savefig('plot/q22_nmf_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    thresholds = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)
    sim_options = {'name': 'pearson', 'user_based': True}  # unused by NMF
    trainset, testset = train_test_split(data, test_size=0.1)
    # the NMF model does not depend on the threshold, so train and predict once
    algo = NMF(n_factors=16)
    algo.fit(trainset)
    predictions = algo.test(testset)
    for th in thresholds:
        y_true = []
        y_estimate = []
        for row in predictions:
            if row[2] >= th:
                y_true.append(1)
            else:
                y_true.append(0)
            y_estimate.append(row[3])
        plot_roc(y_true, y_estimate, th)
print("=================================Optimal Number of Latent Factors=============================================================") all_genres = set('|'.join(movies.genres).split('|')) print('#of Genres - ', len(all_genres)) print("===================================NNMF collaborative filtering on popular movie trimmed set=================================================") avg_rmse = [] k_range = range(2, 51, 2) kf = KFold(n_splits=10) for k in k_range: algo = NMF(n_factors=k) k_rmse = [] for trainset, testset in kf.split(data): algo.fit(trainset) predictions = algo.test(popular_trim(testset)) k_rmse.append(accuracy.rmse(predictions, verbose=False)) avg_rmse.append(np.mean(k_rmse)) print('Minimum average RMSE is ', min(avg_rmse), ' for k = ', k_range[np.argmin(avg_rmse)]) plt.plot(k_range, avg_rmse) plt.xlabel('Number of latent factors', fontsize=15) plt.ylabel('Average RMSE', fontsize=15) plt.title('#latent factors vs Average RMSE for NMF Popular trimming') plt.show() print("========================NNMF collaborative filtering on unpopular movie trimmed set==================================") avg_rmse = [] k_range = range(2, 51, 2) kf = KFold(n_splits=10)
fig3.savefig(path + 'fig/Part_8_knn_preVSrec.png')

# define model for training
k_min_rmse = 18
nnmf = NMF(n_factors=k_min_rmse, random_state=1)

# train, test and rank
top_t_list = range(1, 26)
pre_list_nnmf = []
rec_list_nnmf = []
for top_t in top_t_list:
    pre = 0
    rec = 0
    for trainset, testset in kf.split(data_raw):
        nnmf.fit(trainset)
        prediction = nnmf.test(testset)
        G = create_dict(testset)
        G_s = create_dict(prediction, if_pred=1)
        R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t)
        # precision and recall for each fold
        pre_fold = 0
        rec_fold = 0
        for key in R.keys():
            pre_temp, rec_temp = precision_recall(R[key], R_s[key])
            pre_fold += pre_temp
            rec_fold += rec_temp
        pre += pre_fold / len(R)
        rec += rec_fold / len(R)
    pre_list_nnmf.append(pre / num_fold)
    rec_list_nnmf.append(rec / num_fold)
plt.figure()
plt.title("kNN: Avg Precision vs Avg Recall with 10 fold CV")
plt.xlabel("Avg Recall")
plt.ylabel("Avg Precision")
plt.plot(knn_recall, knn_prec)
plt.show()

"""
Question 37
"""
nmf_prec, nmf_recall = [], []
for t in ts:
    precision_sum, recall_sum = 0.0, 0.0
    nmf = NMF(n_factors=20, biased=False)
    for trainset, testset in kf.split(data):
        nmf.fit(trainset)
        pred = nmf.test(testset)
        precision, recall = calc_precision_recall(pred, int(t), threshold)
        precision_sum += np.mean(list(precision.values()))
        recall_sum += np.mean(list(recall.values()))
    precision_avg = precision_sum / kf.n_splits
    recall_avg = recall_sum / kf.n_splits
    print(f"NMF t: {t}, precision_avg: {precision_avg}, recall_avg: {recall_avg}")
    nmf_prec.append(precision_avg)
    nmf_recall.append(recall_avg)

plt.figure()
plt.subplot(2, 1, 1)
plt.title("NMF: Avg Precision vs t with 10 fold CV")
plt.ylabel("Avg Precision")
# In[42]:

print("Minimum average RMSE after high variance movie trimming: %.4f" % np.min(nmf_rmse_high_var_trim))

# <font size=4>**Question 22:** Plot the ROC curves for the NNMF-based collaborative filter designed in Question 17 for threshold values [2.5, 3, 3.5, 4]. For the ROC plotting use the optimal number of latent factors found in Question 18. For each of the plots, also report the area under the curve (AUC) value.</font>

# In[43]:

nmf_best_k = ks[np.argmin(nmf_rmse)]
trainset, testset = train_test_split(R, test_size=0.1, random_state=42)
nmf_best = NMF(n_factors=nmf_best_k, random_state=42)
nmf_best.fit(trainset)
nmf_best_pred = nmf_best.test(testset)
plot_roc_curves(testset, nmf_best_pred, 'NNMF')

# <font size=4>**Question 23:** Perform Non-negative matrix factorization on the ratings matrix R to obtain the factor matrices U and V, where U represents the user-latent factors interaction and V represents the movie-latent factors interaction (use k = 20). For each column of V, sort the movies in descending order and report the genres of the top 10 movies. Do the top 10 movies belong to a particular or a small collection of genres? Is there a connection between the latent factors and the movie genres?</font>

# In[44]:

nmf_k20 = NMF(n_factors=20, random_state=42)
nmf_k20.fit(R.build_full_trainset())

# In[45]:

item_factors = nmf_k20.qi

# In[46]:
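
# A hedged sketch of the Question 23 step (not the notebook's original cell): it sorts one
# column of V in descending order and prints the genres of the top 10 movies. It assumes the
# `movies` DataFrame (movieId, genres columns) is loaded earlier in this notebook, and relies
# on the fact that rebuilding the full trainset reproduces the same inner-to-raw id mapping.
trainset_full = R.build_full_trainset()
genre_lookup = dict(zip(movies.movieId.astype(str), movies.genres))

col = 0  # index of the latent factor (column of V) to inspect
top10_inner = np.argsort(item_factors[:, col])[::-1][:10]
for inner_id in top10_inner:
    raw_id = trainset_full.to_raw_iid(inner_id)
    print(raw_id, genre_lookup.get(str(raw_id), 'unknown'))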
def problems_19_20_21_rmse_pop_unpop_hv():
    x_axis = range(2, 52, 2)
    # collect all ratings per movie id
    ratings = {}
    for r in data.raw_ratings:
        if r[1] not in ratings:
            ratings[r[1]] = []
        ratings[r[1]].append(r[2])

    popular_movies = [x for x in ratings if len(ratings[x]) > 2]
    unpopular_movies = [x for x in ratings if len(ratings[x]) <= 2]

    # Question 19: popular-movie trimming
    kf = KFold(n_splits=10)
    rmse_popular_store = []
    for i in x_axis:
        algo = NMF(n_factors=i, verbose=False)
        accu = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            test_trim = [x for x in testset if x[1] in popular_movies]
            predictions = algo.test(test_trim)
            accu.append(accuracy.rmse(predictions, verbose=True))
        rmse_popular_store.append(np.mean(accu))
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies', 'q19_rmse_popular_movies.png')
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies')

    # Question 20: unpopular-movie trimming
    kf = KFold(n_splits=10)
    rmse_unpopular_store = []
    for i in x_axis:
        algo = NMF(n_factors=i, verbose=False)
        accu = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            test_trim = [x for x in testset if x[1] in unpopular_movies]
            predictions = algo.test(test_trim)
            accu.append(accuracy.rmse(predictions, verbose=True))
        rmse_unpopular_store.append(np.mean(accu))
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies', 'q20_rmse_unpopular_movies.png')
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies')

    # rating variance per movie id
    movie_var = {}
    for k in ratings:
        movie_var[k] = np.var(ratings[k])

    highvar_movies = [
        x for x in ratings if len(ratings[x]) >= 5 and movie_var[x] >= 2
    ]

    # Question 21: high-variance-movie trimming
    kf = KFold(n_splits=10)
    rmse_highvar_store = []
    for i in x_axis:
        algo = NMF(n_factors=i, verbose=False)
        accu = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            test_trim = [x for x in testset if x[1] in highvar_movies]
            predictions = algo.test(test_trim)
            accu.append(accuracy.rmse(predictions, verbose=True))
        rmse_highvar_store.append(np.mean(accu))
    pd.DataFrame(rmse_highvar_store).to_csv("rmse_highvar_store_21.csv")
    plotgraphs(x_axis, rmse_highvar_store, 'K', 'Mean RMSE scores',
               'Plot of high variance movies', 'q21_rmse_high_var_movies.png')