def train_trim_nmf(data, R):
    """10-fold cross-validate NMF for k = 2..50 (step 2) on three trimmed test sets.

    `trim(testset, R)` is expected to return the (popular, unpopular,
    high-variance) sub-testsets. Returns a list of three RMSE curves,
    one mean-RMSE value per k for each trimming.
    """
    splitter = KFold(n_splits=10)
    curves = [[], [], []]
    for factors in range(2, 52, 2):
        print("using k = %d" % factors)
        fold_scores = ([], [], [])
        model = NMF(n_factors=factors)
        for train_fold, test_fold in splitter.split(data):
            model.fit(train_fold)
            # score each trimmed subset (popular / unpopular / high-variance)
            for scores, subset in zip(fold_scores, trim(test_fold, R)):
                scores.append(accuracy.rmse(model.test(subset)))
        for curve, scores in zip(curves, fold_scores):
            curve.append(np.mean(scores))
    print("NMF with trim is finished!!")
    return curves
def nmf_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """Fit an NMF model on the observed ratings and write per-(user, movie)
    predicted ratings for every index entry of `truth_ratings_df`.

    Both frames are expected to be indexed by (userId, movieId).
    """
    print("NMF predicates")
    model = NMF()
    scale_reader = Reader(rating_scale=(0.2, 1))
    observed = observed_ratings_df.reset_index().loc[:, ['userId', 'movieId', 'rating']]
    dataset = Dataset.load_from_df(df=observed, reader=scale_reader)
    model.fit(dataset.build_full_trainset())
    # make predictions, one row per (userId, movieId) pair in the truth frame
    predictions = pd.DataFrame(index=truth_ratings_df.index, columns=['rating'])
    for index_key, _ in truth_ratings_df.loc[:, ['rating']].iterrows():
        user_id = index_key[0]
        item_id = index_key[1]
        predictions.loc[(user_id, item_id), 'rating'] = model.predict(user_id, item_id).est
    write(predictions, 'nmf_rating_obs', fold, phase)
def nmf_compute_high_var_trim_rmse(k):
    """Mean 10-fold RMSE of NMF(n_factors=k) on the high-variance-trimmed
    test set. Relies on module-level `R`, `frequency` and `variance`.
    """
    model = NMF(n_factors=k, random_state=42)
    fold_errors = []
    splitter = KFold(n_splits=10, random_state=42)
    for train_part, test_part in splitter.split(R):
        model.fit(train_part)
        trimmed = high_variance_trimming(test_part, frequency, variance)
        fold_errors.append(accuracy.rmse(model.test(trimmed), verbose=False))
    mean_rmse = np.mean(fold_errors)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def nmf_compute_prec_rec(t):
    """Mean 10-fold precision/recall at list size `t` for the best-k NMF.

    Relies on module-level `R`, `nmf_best_k`, `threshold` and the helpers
    `trim_unpopular_user` / `calculate_precision_recall`.
    Returns (mean precision, mean recall) across folds.
    """
    prec_folds, rec_folds = [], []
    splitter = KFold(n_splits=10, random_state=42)
    for train_part, test_part in splitter.split(R):
        model = NMF(n_factors=nmf_best_k, random_state=42)
        model.fit(train_part)
        kept = trim_unpopular_user(test_part, t, threshold)
        predictions = model.test(kept)
        prec_by_user, rec_by_user = calculate_precision_recall(predictions, t, threshold)
        prec_folds.append(np.mean(list(prec_by_user.values())))
        rec_folds.append(np.mean(list(rec_by_user.values())))
    return np.mean(prec_folds), np.mean(rec_folds)
def Q23(col=0):
    """Print the genres of the top-10 movies for column `col` of the NMF
    item-factor matrix V (k = 20 latent factors).

    Reads ml-latest-small/ratings.csv and ml-latest-small/movies.csv from disk.
    """
    print('Chosen column is ' + str(col))
    raw = np.loadtxt('ml-latest-small/ratings.csv', delimiter=',',
                     skiprows=1, usecols=(0, 1, 2))
    movie_ids = raw[:, 1].astype(int)
    # Map each distinct movieId (ascending) to a dense column index.
    m = {}
    for movie_id in np.sort(movie_ids).tolist():
        if movie_id not in m:
            m[movie_id] = len(m)
    data = load_data()
    model = NMF(n_factors=20)
    trainset, testset = train_test_split(data, test_size=0.0001)
    model.fit(trainset)
    V = model.qi  # item-latent factor matrix
    import csv
    dict_ID_to_genre = {}
    with open('ml-latest-small/movies.csv', 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for cnt, row in enumerate(reader):
            if cnt != 0:  # skip header row
                dict_ID_to_genre[row[0]] = row[1:]
    dict_col_to_ID = {idx: movie_id for movie_id, idx in m.items()}
    V_col = V[:, col]
    # BUG FIX: the original located each top value with list.index(), which
    # returns the FIRST occurrence — duplicate factor values all mapped to the
    # same movie. argsort gives the ten distinct rows with the largest values.
    # NOTE(review): this assumes the dense movieId->index mapping matches the
    # inner item ids used by surprise when building V — confirm against
    # load_data()'s construction.
    for ind in np.argsort(V_col)[::-1][:10]:
        m_id = dict_col_to_ID[int(ind)]
        genre = dict_ID_to_genre[str(m_id)]
        print(genre[-1])
def NMF_trim_filter(ratings, dims, func, mv_dict):
    """Cross-validate NMF over the latent dimensions in `dims` on a trimmed test set.

    Parameters
    ----------
    ratings : DataFrame with 'userId', 'movieId', 'rating' columns.
    dims : sequence of candidate n_factors values.
    func : trimming callable invoked as func(mv_dict, testset); presumably it
        filters `testset` in place before scoring — TODO confirm at call site.
    mv_dict : movie metadata passed through to `func`.

    Prints per-k RMSE/MAE, plots both curves, and reports the best k for each metric.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # BUG FIX: None sentinel instead of False — the old `not min_RMSE` test
    # would also fire for a legitimate 0.0 metric value.
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)
    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], random_state=42)
        # plain lists: np.append re-allocated the whole array on every fold
        test_rmse = []
        test_mae = []
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            # BUG FIX: removed dead `full_data = trainset.build_testset() + testset`;
            # the result was never used and building it each fold is expensive.
            func(mv_dict, testset)
            pred = nmf.test(testset)
            test_rmse.append(accuracy.rmse(pred, verbose=False))
            test_mae.append(accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(test_mae)
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])
    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
def Q19to21(qNum):
    """Mean 10-fold RMSE of NMF for k = 2..18 (step 2) on a trimmed test set.

    qNum selects the trimming function: 12 popular, 13 unpopular,
    14 high-variance. Returns one mean RMSE per k.
    """
    data = load_data()
    kf = KFold(n_splits=10)
    trimFun = {12: popularTrim, 13: unpopularTrim, 14: highVarTrim}
    RMSE = []
    for k in range(2, 20, 2):
        # BUG FIX: the original built NMF() with default factors, ignoring k.
        nmf = NMF(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            nmf.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            # BUG FIX: use %-formatting; the comma printed the tuple verbatim.
            print("test set size after trimming: %d" % nTest)
            predictions = nmf.test(testSet)
            squaredError = sum(pow(p.est - p.r_ui, 2) for p in predictions)
            # BUG FIX: the original never appended to subRMSE, so
            # np.mean([]) returned NaN for every k. Guard against an
            # empty trimmed fold as well.
            if nTest > 0:
                subRMSE.append(np.sqrt(squaredError / nTest))
        # average of all train-test splits for this k
        RMSE.append(np.mean(subRMSE))
    return RMSE
def use_nmf():
    """Fit NMF on the full ml-100k trainset, score its anti-testset, and
    return [rmse, mae, elapsed seconds].
    """
    started_at = time.time()
    dataset = Dataset.load_builtin('ml-100k')
    full_train = dataset.build_full_trainset()
    print('Using NMF')
    model = NMF()
    model.fit(full_train)
    anti_test = full_train.build_anti_testset()
    preds = model.test(anti_test)
    rmse_score = accuracy.rmse(preds)
    mae_score = accuracy.mae(preds)
    return [rmse_score, mae_score, time.time() - started_at]
def NMF_bin_pre(ratings, ts, nmf_fac, thrd):
    """Train NMF(n_factors=nmf_fac) on a (1-ts)/ts split of `ratings` and
    return (binary true labels at threshold `thrd`, predicted ratings).
    """
    scale = Reader(rating_scale=(0.0, 5.0))
    dataset = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], scale)
    train_part, test_part = train_test_split(dataset, test_size=ts)
    model = NMF(n_factors=nmf_fac, random_state=42)
    model.fit(train_part)
    predictions = model.test(test_part)
    # prediction tuples: index 2 is the true rating, index 3 the estimate
    true_rating = np.array([p[2] for p in predictions], dtype=float)
    pred_rating = np.array([p[3] for p in predictions], dtype=float)
    # binarize the ground truth at the threshold
    bi_rating = np.where(true_rating >= thrd, 1.0, 0.0)
    return bi_rating, pred_rating
# NOTE(review): extraction-collapsed script chunk, left byte-identical. It
# begins with a DUPLICATED tail of NMF_bin_pre (`bi_rating ... return`), then:
# an ROC sweep calling NMF_bin_pre for thresholds [2.5, 3, 3.5, 4]; a Q23
# section that fits NMF(n_factors=20, random_state=42) on the full trainset
# and, for the first 10 columns of qi, prints the genres of the 10 movies with
# the largest factor values (genres looked up in the `movies` DataFrame by raw
# item id); and finally the cut-off head of MF_bias_filter, which continues
# beyond this chunk. Reformat into real lines before editing any logic.
bi_rating[zero_idx] = 0.0 return bi_rating, pred_rating threshold = np.array([2.5, 3, 3.5, 4]) for td in threshold: tar, pre = NMF_bin_pre(ratings, 0.1, 18, td) plot_roc(pre, tar) # Q23 reader = Reader(rating_scale=(0.0, 5.0)) data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader) data = data.build_full_trainset() nmf = NMF(n_factors=20, random_state=42) nmf.fit(data) for i in range(10): col = nmf.qi[:, i] top_movie = col.argsort()[::-1][:10] print('For the %i th column, the top 10 movie genres are:' % (i + 1)) for j in range(10): raw_iid = nmf.trainset.to_raw_iid(top_movie[j]) gen = movies.loc[movies['movieId'] == raw_iid]['genres'].values print('\t--%i :' % (j + 1), gen) # MF With Bias Filter # Q24 & 25 def MF_bias_filter(ratings, dims): reader = Reader(rating_scale=(0.0, 5.0))
# NOTE(review): extraction-collapsed script chunk, left byte-identical. It
# starts with the tail of a previous plotting section (plt.show / fig3.savefig
# for a k-NN figure), then sweeps recommendation-list size top_t = 1..25 for
# NMF(n_factors=18, random_state=1): for each of the `kf` folds it fits,
# predicts, builds per-user dicts via create_dict, applies
# threshold_rank_filter(thre=3, top_t), and averages precision_recall over
# users, accumulating per-fold averages into `pre`/`rec`. The chunk is cut
# after pre_list_nnmf.append(...) — the matching rec_list_nnmf append and the
# rest of the loop body lie beyond this view. `kf`, `data_raw` and `num_fold`
# are defined elsewhere; presumably num_fold == kf.n_splits — verify.
plt.show() fig3.savefig(path + 'fig/Part_8_knn_preVSrec.png') #define model for training k_min_rmse = 18 nnmf = NMF(n_factors=k_min_rmse, random_state=1) #train, test and rank top_t_list = range(1, 26) pre_list_nnmf = [] rec_list_nnmf = [] for top_t in top_t_list: pre = 0 rec = 0 for trainset, testset in kf.split(data_raw): nnmf.fit(trainset) prediction = nnmf.test(testset) G = create_dict(testset) G_s = create_dict(prediction, if_pred=1) R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t) #precision and recall for each fold pre_fold = 0 rec_fold = 0 for key in R.keys(): pre_temp, rec_temp = precision_recall(R[key], R_s[key]) pre_fold += pre_temp rec_fold += rec_temp pre += pre_fold / len(R) rec += rec_fold / len(R) pre_list_nnmf.append(pre / num_fold)
# NOTE(review): extraction-collapsed script chunk, left byte-identical. It
# starts with the tail of an ROC section (y_score from predictions, roc_curve,
# auc, plotROC), then the head of "Question 23": fits NMF(n_factors=20,
# biased=False) on the full trainset, takes V = nmf.qi, and begins iterating
# the first 20 columns, copying each column into VcolOrig/VcolSort lists. The
# chunk is cut inside that loop — the sorting and top-10 genre lookup continue
# beyond this view. `y_true`, `threshold`, `df` and `plotROC` are defined
# elsewhere in the original file.
y_score = [prediction.est for prediction in predictions] fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_score) roc_auc = auc(fpr, tpr) plotROC(fpr, tpr, roc_auc, threshold) """ Question 23: Movie-Latent Factor Interaction """ reader = Reader(rating_scale=(0.5, 5)) data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader) data = data.build_full_trainset() movieDat = pd.read_csv('ml-latest-small/movies.csv') nmf = NMF(n_factors=20, biased=False) nmf.fit(data) movies = df['movieId'].unique( ) # identify unique movie IDs from the ratings CSV (9724, already sorted) V = nmf.qi # get top 10 movie genres for the first 20 columns of the V matrix for i in range(20): Vcol = V[:, i] # convert column of V into a list for processing VcolOrig = [] VcolSort = [] for j in range(len(Vcol)): VcolOrig.append(Vcol[j]) # original array for looking up movie index VcolSort.append(Vcol[j]) # sorted array for getting top movies
# NOTE(review): extraction-collapsed `DataLoader` class (three physical lines
# below, left byte-identical — reformat before editing). It loads Amazon
# ratings CSVs plus gzipped metadata (via an external `parse` helper), keeps
# only items ranked inside the top `self.topk` of their category, fits a
# surprise NMF model on a sample of at most `self.max_user` users, and caches
# derived features (prices, categories, predicted ratings, per-category
# rankings) to a pickle under ../feature/. On construction it either builds
# and saves these features or loads the cached pickle.
# Review observations (grounded in the visible code):
#  - load_prices uses a bare `except:` to count items missing price/salesRank;
#    this also swallows unrelated errors such as KeyboardInterrupt.
#  - create_ratings uses `sample(S, self.max_user)`, which raises ValueError
#    when there are fewer than max_user distinct users — TODO confirm intended.
#  - ratings_predict[i][j] = self.model.predict(...)[3] indexes the surprise
#    Prediction tuple positionally (3 == est).
#  - gen_new_price_dict iterates range(len(self.cate_dict)) assuming its keys
#    are dense inner item ids 0..d-1, which create_ratings produces via
#    train_set.all_items() — presumably always dense; verify.
#  - `self.top_value` is only referenced in commented-out code here.
# Line 1/3: __init__, load_ratings, load_prices, load_data, and the cut-over
# into create_user_item_matrix's signature (continued on the next line).
class DataLoader(object): def __init__(self,category,save_name): self.category=category self.max_user=10000 #maximum number of user self.price_dict={} self.price_dict_temp={} self.cate_dict={} self.cate_dict_temp={} self.top_value=15 # top x features in SVD self.model=NMF() self.topk=500 #maximum items in each category, finding the top k popular self.max_price={} self.save_path= os.path.join("..", "feature", save_name) if not os.path.isfile(self.save_path): self.load_data() #load raw data #self.create_user_item_matrix() self.create_ratings() self.gen_new_price_dict() self.save_data(self.save_path) #save the feature else: self.load(self.save_path) #load the feature def load_ratings(self, filename): with open(os.path.join("..", "data", filename), "rb") as f: ratings = pd.read_csv(f,names=("user","item","rating","timestamp")) return ratings def load_prices(self,filename): price_dict = {} num_no_price=0 for review in parse(os.path.join("..", "data", filename)): try: price=review['price'] asin=review['asin'] v=list(review['salesRank'].values())[0] if v<self.topk: price_dict[asin]=price except: num_no_price+=1 continue print("filename:",filename) print("length of price dict:", len(price_dict)) print("# of items without price", num_no_price) return price_dict def load_data(self): print("Loading data:") for i in self.category: ratings_name= "ratings_"+i+".csv" price_name="meta_"+i+".json.gz" ratings_temp=self.load_ratings(ratings_name) print(len(ratings_temp)) price_temp=self.load_prices(price_name) ratings_temp=ratings_temp[ratings_temp['item'].isin(price_temp.keys())] print(len(ratings_temp)) self.price_dict_temp.update(price_temp) self.max_price[i]=max(list(price_temp.values())) cate_temp={} for j in price_temp.keys(): cate_temp[j]=i self.cate_dict_temp.update(cate_temp) price_temp.clear() try: self.ratings=pd.merge(self.ratings,ratings_temp, how='outer') except: self.ratings=ratings_temp print(self.max_price) #old method def create_user_item_matrix(self, 
# Line 2/3: rest of create_user_item_matrix (sparse user-item matrix, marked
# "old method") and most of create_ratings (NMF fit, per-category item index
# lists, dense predicted-rating matrix); cut mid `for c in` at the end.
user_key="user",item_key="item"): n = len(set(self.ratings[user_key])) d = len(set(self.ratings[item_key])) self.user_mapper = dict(zip(np.unique(self.ratings[user_key]), list(range(n)))) self.item_mapper = dict(zip(np.unique(self.ratings[item_key]), list(range(d)))) self.user_inverse_mapper = dict(zip(list(range(n)), np.unique(self.ratings[user_key]))) self.item_inverse_mapper = dict(zip(list(range(d)), np.unique(self.ratings[item_key]))) self.user_ind = [self.user_mapper[i] for i in self.ratings[user_key]] self.item_ind = [self.item_mapper[i] for i in self.ratings[item_key]] self.ratings_matrix = sparse_matrix((self.ratings["rating"]-3, (self.user_ind, self.item_ind)), shape=(n,d)) print("user-item matrix generated.") def create_ratings(self): #C=MBRecsys(self.ratings_matrix,top_value) S=set(self.ratings['user']) S=sample(S,self.max_user) n = len(S) d = len(set(self.ratings['item'])) self.ratings=self.ratings[self.ratings['user'].isin(S)] reader=Reader(rating_scale=(1,5)) data = Dataset.load_from_df(self.ratings[['user', 'item', 'rating']], reader) train_set=data.build_full_trainset() self.model.fit(train_set) self.inv_cate_dict={} #{'categoryA':[],'categoryB':[]} for i in self.category: self.inv_cate_dict[i]=[] for j in train_set.all_items(): item_raw=train_set.to_raw_iid(j) self.inv_cate_dict[self.cate_dict_temp[item_raw]].append(j) self.price_dict[j]=self.price_dict_temp[item_raw] self.cate_dict[j]=self.cate_dict_temp[item_raw] self.cate_dict_temp.clear() self.price_dict_temp.clear() print("inv_cate_dict constructed.") d=0 for i in self.category: d+=len(self.inv_cate_dict[i]) print(i,':',len(self.inv_cate_dict[i])) self.ratings_predict=np.zeros([n,d]) for i in train_set.all_users(): user_raw=train_set.to_raw_uid(i) for j in train_set.all_items(): item_raw=train_set.to_raw_iid(j) self.ratings_predict[i][j]=self.model.predict(user_raw, item_raw)[3] print("predicted ratings generated.") self.ranking=np.zeros([n,d]) temp={} for i in range(n): for c in 
# Line 3/3: end of create_ratings (per-category rank of each predicted
# rating), save_data (pickle dump), load (pickle load), gen_new_price_dict.
self.category: temp[c]=sorted(self.ratings_predict[i][self.inv_cate_dict[c]],reverse=True) for j in range(d): c=self.cate_dict[j] self.ranking[i][j]= temp[c].index(self.ratings_predict[i][j])+1 print("user_item rankings generated.") def save_data(self,save_path): self.dict_all={'prices':self.price_dict,#'raw_ratings':self.ratings_matrix, 'new_ratings':self.ratings_predict,'cate':self.cate_dict, 'rankings': self.ranking,'max_price':self.max_price, 'new_price':self.new_price_dict} #'user_mapper':self.user_mapper, 'item_mapper':self.item_mapper, #'user_inverse_mapper':self.user_inverse_mapper, 'item_inverse_mapper':self.item_inverse_mapper} with open(save_path,'wb') as f: pickle.dump(self.dict_all, f) print("data saved in ", save_path) def load(self,save_path): with open(save_path,'rb') as f: self.dict_all=pickle.load(f) #self.ratings_matrix =self.dict_all['raw_ratings'] self.ratings_predict=self.dict_all['new_ratings'] self.price_dict=self.dict_all['prices'] self.cate_dict=self.dict_all['cate'] self.ranking=self.dict_all['rankings'] self.max_price=self.dict_all['max_price'] self.new_price_dict=self.dict_all['new_price'] #self.user_mapper=self.dict_all['user_mapper'] #self.item_mapper=self.dict_all['item_mapper'] #self.user_inverse_mapper=self.dict_all['user_inverse_mapper'] #self.item_inverse_mapper=self.dict_all['item_inverse_mapper'] self.dict_all.clear() del self.dict_all print("Saved data loaded.") def gen_new_price_dict(self): self.new_price_dict={} for i in self.category: self.new_price_dict[i]={} for i in range(len(self.cate_dict)): self.new_price_dict[self.cate_dict[i]][i]=self.price_dict[i] print("new price dictionary generated.")
# NOTE(review): extraction-collapsed script chunk, left byte-identical. It
# starts with the tail of an ROC-plotting section, then a "#23" section: a
# 90/10 split, NMF(n_factors=20, random_state=100) fit, V = nmf.qi, and for
# each of the 20 columns the 10 largest entries' row indices are used to look
# up df['genres']. Review observations (grounded in the visible code):
#  - `nmf.fit(trainset).test(testset)` computes predictions and discards them.
#  - `df['genres'][a[0]]` indexes movies.csv rows by the qi row index;
#    presumably movies.csv row order matches surprise's inner item ids —
#    that is NOT guaranteed by surprise; verify before trusting the output.
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.4f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC curve for threshold = %0.2f' % thresh) plt.legend(loc="lower right") plt.show() #23 trainset, testset = train_test_split(data, test_size=.1, random_state=100) nmf = NMF(n_factors=20, verbose=False, random_state=100) nmf.fit(trainset).test(testset) V = nmf.qi k = [item for item in range(0, 20)] df = pd.read_csv(movies_file_path, names=['movieid', 'title', 'genres'], header=0) for i in k: print(i) mov = V[:, i] mov1 = [(n, j) for n, j in enumerate(mov)] mov1.sort(key=lambda x: x[1], reverse=True) for a in mov1[:10]: print(df['genres'][a[0]])
# NOTE(review): extraction-collapsed Jupyter-notebook chunk (cell markers
# `# In[..]` and markdown-as-comment question text), left byte-identical. It
# begins mid-call (the dangling 'RMSE after High Variance Movie Trimming'
# argument belongs to a plotting call cut off before this chunk), then:
# prints the minimum high-variance-trim RMSE, runs Q22 (pick best k from
# nmf_rmse, 90/10 split of R, fit NMF, plot ROC via plot_roc_curves), and
# starts Q23 (fit NMF(n_factors=20) on the full trainset; item_factors =
# nmf_k20.qi). The Q23 analysis continues beyond this chunk.
'RMSE after High Variance Movie Trimming') # In[42]: print("Minimum average RMSE after high variance movie trimming: %.4f" % np.min(nmf_rmse_high_var_trim)) # <font size=4>**Question 22:** Plot the ROC curves for the NNMF-based collaborative filter designed in Question 17 for threshold values [2.5,3,3.5,4]. For the ROC plotting use the optimal number of latent factors found in Question 18. For each of the plots, also report the area under the curve (AUC) value.</font> # In[43]: nmf_best_k = ks[np.argmin(nmf_rmse)] trainset, testset = train_test_split(R, test_size=0.1, random_state=42) nmf_best = NMF(n_factors=nmf_best_k, random_state=42) nmf_best.fit(trainset) nmf_best_pred = nmf_best.test(testset) plot_roc_curves(testset, nmf_best_pred, 'NNMF') # <font size=4>**Question 23:** Perform Non-negative matrix factorization on the ratings matrix R to obtain the factor matrices U and V , where U represents the user-latent factors interaction and V represents the movie-latent factors interaction (use k = 20). For each column of V , sort the movies in descending order and report the genres of the top 10 movies. Do the top 10 movies belong to a particular or a small collection of genre? Is there a connection between the latent factors and the movie genres?</font> # In[44]: nmf_k20 = NMF(n_factors=20, random_state=42) nmf_k20.fit(R.build_full_trainset()) # In[45]: item_factors = nmf_k20.qi
def main():
    """Run the NMF experiments (Q17-Q29): MAE/RMSE sweeps over k, ROC curves
    per rating threshold, and the top-10-movies-per-latent-factor report.

    Expects ratings.csv / popular.csv / unpopular.csv / variance.csv /
    bin*.csv / movies.csv in the working directory.
    """
    # Load data
    reader = Reader(sep=',', rating_scale=(0.0, 5.0), skip_lines=1)
    allMoives = Dataset.load_from_file('ratings.csv', reader=reader)
    popMoives = Dataset.load_from_file('popular.csv', reader=reader)
    unpopMoives = Dataset.load_from_file('unpopular.csv', reader=reader)
    varMoives = Dataset.load_from_file('variance.csv', reader=reader)
    binary = []
    binary.append(Dataset.load_from_file('bin2.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin4.csv', reader=reader))
    with open('movies.csv', 'r', encoding='utf8') as f:
        movie_reader = csv.reader(f, delimiter=',', quotechar='"')
        next(movie_reader, None)  # skip header
        movies = {int(movie[0]): movie[2] for movie in movie_reader}

    # NMFs
    ks = range(2, 52, 2)
    mae, rmse = [0] * len(ks), [0] * len(ks)

    def nmf(dataName, data, biased=True):
        """10-fold cross-validate NMF for every k in `ks`; plot MAE/RMSE curves."""
        print('Start building NMF with ' + dataName + '!')
        for i, k in enumerate(ks):
            model = NMF(n_factors=k, biased=biased)
            scores = cross_validate(model, data, cv=10)
            mae[i] = scores['test_mae'].mean()
            rmse[i] = scores['test_rmse'].mean()
            print('k = ' + str(k) + ' finished!')
        plt.figure()
        plt.subplot(211)
        plt.plot(ks, mae)
        plt.xlabel('k')
        plt.ylabel('mean absolute error')
        plt.title('Mean absolute error vs. k of ' + dataName)
        plt.subplot(212)
        plt.plot(ks, rmse)
        plt.xlabel('k')
        plt.ylabel('root mean squared error')
        plt.title('Root mean squared error vs. k of ' + dataName)
        print('mae:')
        print(mae)
        print('rmse:')
        print(rmse)
        print('Finish building NMF with ' + dataName + '!')

    # Q17
    nmf('all movies', allMoives)
    # Q18
    optimalK = 4
    print('The optimal number of latent factors is ' + str(optimalK))
    # Q19
    nmf('popular movies', popMoives)
    # Q20
    nmf('unpopular movies', unpopMoives)
    # Q21
    nmf('high variance movies', varMoives)

    # Draw ROC Curve
    thresholds = [2.5, 3, 3.5, 4]

    def drawRoc(model, i, k):
        """Fit `model` on a 90/10 split of binary[i] and plot its ROC curve."""
        print('Start drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')
        train, test = train_test_split(binary[i], train_size=0.9, test_size=0.1)
        model.fit(train)
        labels = model.test(test)
        y_true = [label.r_ui for label in labels]
        y_pred = [label.est for label in labels]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve of NMF with optimal k = ' + str(k) +
                  ', threshold = ' + str(thresholds[i]))
        plt.legend(loc="lower right")
        print('Finish drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')

    # Q22
    # BUG FIX: the original rebound the name `nmf` to an NMF *instance* here
    # (and again in Q23), shadowing the nested nmf() helper, so Q24-Q28
    # crashed with "'NMF' object is not callable". Instances now use
    # distinct names.
    roc_model = NMF(n_factors=optimalK)
    for i in range(len(thresholds)):
        drawRoc(roc_model, i, optimalK)

    # Q23
    print("Start finding top K!")
    k, col = 20, 5
    topk_model = NMF(n_factors=k)
    trainAllMovies = allMoives.build_full_trainset()
    topk_model.fit(trainAllMovies)
    ids = [[] for _ in range(col)]
    for i in range(col):
        factors = topk_model.qi[:, i]
        s = sorted([[i, factor] for i, factor in enumerate(factors)],
                   key=lambda x: x[1], reverse=True)
        for rank in range(10):
            ids[i].append(s[rank][0])
    genres = [[] for _ in range(col)]
    for i in range(col):
        for j in range(10):
            genres[i].append(movies[int(trainAllMovies.to_raw_iid(ids[i][j]))])
    for i in range(col):
        print('Col ' + str(i + 1) + ':')
        for genre in genres[i]:
            print(genre, end=', ')
        print('')
    print("Finish finding top K!")

    # Q24
    nmf('all movies', allMoives, True)
    # Q25
    optimalKBiased = 2
    # BUG FIX: original concatenated str + int (TypeError); wrap in str().
    print('The optimal number of latent factors is ' + str(optimalKBiased))
    # Q26
    nmf('popular movies', popMoives, True)
    # Q27
    nmf('unpopular movies', unpopMoives, True)
    # Q28
    nmf('high variance movies', varMoives, True)
    # Q29
    optimalKBiased = 2
    nmfBiased = NMF(n_factors=optimalKBiased, biased=True)
    for i in range(len(thresholds)):
        drawRoc(nmfBiased, i, optimalKBiased)
    plt.show()
# NOTE(review): extraction-collapsed script chunk, left byte-identical. It
# begins with the tail of a plotting helper (savefig/clf), then a __main__
# block: loads ratings, makes one 90/10 split, and for each threshold in
# [2.5, 3, 3.5, 4] fits NMF(n_factors=16), tests, binarizes the true ratings
# (row[2]) at the threshold, and plots an ROC from the estimates (row[3]).
# Review observations (grounded in the visible code):
#  - the model is refit and re-tested on the SAME split every iteration even
#    though `th` only affects label binarization — fit/test could be hoisted
#    out of the loop for a ~4x speedup with identical output curves.
#  - `sim_options` is built but never used in this chunk.
plt.savefig('plot/q22_nmf_roc_' + str(threshold) + '.png') plt.clf() if __name__ == "__main__": threshold = [2.5, 3, 3.5, 4] file_path = os.path.expanduser("ml-latest-small/ratings_new.csv") reader = Reader(sep=',') data = Dataset.load_from_file(file_path, reader=reader) sim_options = {'name': 'pearson', 'user_based': True} trainset, testset = train_test_split(data, test_size=0.1) for th in threshold: algo = NMF(n_factors=16) algo.fit(trainset) predictions = algo.test(testset) y_true = [] y_estimate = [] for row in predictions: if row[2] >= th: y_true.append(1) else: y_true.append(0) y_estimate.append(row[3]) plot_roc(y_true, y_estimate, th)
def problems_19_20_21_rmse_pop_unpop_hv():
    """Q19-Q21: mean 10-fold RMSE of NMF for k = 2..50 (step 2) on three
    trimmed test sets — popular (>2 ratings), unpopular (<=2 ratings) and
    high-variance (>=5 ratings, variance >= 2) movies.

    Uses the module-level `data` dataset and the `plotgraphs` helper; also
    writes the high-variance curve to rmse_highvar_store_21.csv.
    """
    x_axis = range(2, 52, 2)
    # group raw ratings by movie id: {movieId: [rating, ...]}
    ratings = {}
    for r in data.raw_ratings:
        if r[1] not in ratings:
            ratings[r[1]] = []
        ratings[r[1]].append(r[2])
    popular_movies = [x for x in ratings if len(ratings[x]) > 2]
    unpopular_movies = [x for x in ratings if len(ratings[x]) <= 2]

    def _trimmed_mean_rmse(keep_movies):
        # One CV sweep over k, scoring only test ratings whose movie is kept.
        # (The three per-trim loops in the original were identical; factored
        # out here. A set gives O(1) membership instead of O(n) per rating.)
        keep = set(keep_movies)
        kf = KFold(n_splits=10)
        store = []
        for i in x_axis:
            algo = NMF(i, verbose=False)
            accu = []
            for trainset, testset in kf.split(data):
                algo.fit(trainset)
                test_trim = [x for x in testset if x[1] in keep]
                predictions = algo.test(test_trim)
                accu.append(accuracy.rmse(predictions, verbose=True))
            store.append(np.mean(accu))
        return store

    rmse_popular_store = _trimmed_mean_rmse(popular_movies)
    # NOTE(review): the original plotted each curve twice (with and without a
    # save filename); both calls are preserved.
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies', 'q19_rmse_popular_movies.png')
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies')

    rmse_unpopular_store = _trimmed_mean_rmse(unpopular_movies)
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies', 'q20_rmse_unpopular_movies.png')
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies')

    # per-movie rating variance for the high-variance trim
    movie_var = {k: np.var(ratings[k]) for k in ratings}
    highvar_movies = [
        x for x in ratings if len(ratings[x]) >= 5 and movie_var[x] >= 2
    ]
    rmse_highvar_store = _trimmed_mean_rmse(highvar_movies)
    pd.DataFrame(rmse_highvar_store).to_csv("rmse_highvar_store_21.csv")
    plotgraphs(x_axis, rmse_highvar_store, 'K', 'Mean RMSE scores',
               'Plot of high variance movies', 'q21_rmse_high_var_movies.png')