def calculate_stats_users(pct_train):
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = data.split_train_test(percent=pct_train)
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
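# Most of the examples below assume the usual python-recsys imports; a
# shared header for them would look like this (assuming the standard
# python-recsys package layout, Python 2):
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data
from recsys.evaluation.prediction import RMSE, MAE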
def evaluate(data, count=5, K=100):
    results = []
    for i in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        print len(data.get()), len(train.get()), len(test.get())
        #test_in_train(test, train)
        #print train.get()
        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                    post_normalize=True)
        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                #print "keyerror"
                continue
        try:
            rsu = {}
            rsu["RMSE"] = rmse.compute()
            rsu["MAE"] = mae.compute()
            print rsu
            results.append(rsu)
        except:
            print "one error...."
    return results
def test_SVD(svd, train, test, pct_train):
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
def eval_rmse(self):
    # Evaluation using prediction-based metrics
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in self.test.get():
        try:
            pred_rating = self.svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
def eval_reco(model, test):
    """ Compute RMSE and MAE on test set """
    # Evaluation using prediction-based metrics
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = model.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    return rmse, mae
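# eval_reco returns the evaluator objects rather than the scores, so the
# caller decides when to call compute(); a minimal usage sketch, where
# model stands for any fitted recsys model and test for a held-out split:
rmse, mae = eval_reco(model, test)
print 'RMSE=%s' % rmse.compute()
print 'MAE=%s' % mae.compute()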
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    data = Data()
    data.load(dat_file, sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    # About format parameter:
    #   'row': 1   -> Rows in matrix come from column 1 in ratings.dat file
    #   'col': 0   -> Cols in matrix come from column 0 in ratings.dat file
    #   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file
    #   'ids': int -> Ids (row and col ids) are integers (not strings)

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    # MAE is mean ABSOLUTE error; in this case it returns about 1.09, which
    # means an error of almost 1 point out of 5
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
def evaluate(_svd, _testData, verbose=False):
    global rmse, mae, rating, item_id, user_id, pred_rating
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in _testData.get():
        try:
            pred_rating = _svd.predict(item_id, user_id, MIN_VALUE=0, MAX_VALUE=10)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
            if verbose:
                print item_id, user_id, rating, pred_rating
        except Exception as e:
            print 'ERROR occurred:', e.message
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
def get_mae_rmse(step):
    data = Data()
    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    filename = 'second_train_test.dat.{step}'.format(step=step)
    data.load(filename, sep='::', format=format)
    train, test = data.split_train_test(percent=80)
    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))
    except:
        return
    mae_predicted, rmse_predicted = [], []
    for rating, item_id, user_id in test:
        try:
            predicted = svd.predict(item_id, user_id)
            mae_predicted.append((rating, predicted))
            rmse_predicted.append((rating, predicted))
        except:
            pass
    mae_value, rmse_value = np.nan, np.nan
    if len(mae_predicted) > 0:
        mae = MAE(mae_predicted)
        mae_value = mae.compute()
    if len(rmse_predicted) > 0:
        rmse = RMSE(rmse_predicted)
        rmse_value = rmse.compute()
    return mae_value, rmse_value
def test_random(data):
    mae_predicted, rmse_predicted = [], []
    for rating in data:
        random_predicted = float(random_score(review_percentages))
        mae_predicted.append((rating, random_predicted))
        rmse_predicted.append((rating, random_predicted))
    mae_value, rmse_value = np.nan, np.nan
    if len(mae_predicted) > 0:
        mae = MAE(mae_predicted)
        mae_value = mae.compute()
    if len(rmse_predicted) > 0:
        rmse = RMSE(rmse_predicted)
        rmse_value = rmse.compute()
    return mae_value, rmse_value
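# test_random above leans on two globals that are not shown; a hypothetical
# stand-in so the snippet can run, assuming review_percentages maps each
# rating to its share of all reviews:
import random

review_percentages = {1: 0.05, 2: 0.10, 3: 0.20, 4: 0.35, 5: 0.30}

def random_score(percentages):
    # sample a rating with probability proportional to its share
    r = random.random()
    cumulative = 0.0
    for rating, share in sorted(percentages.items()):
        cumulative += share
        if r <= cumulative:
            return rating
    return max(percentages.keys())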
def evaluate(clf, _testData, verbose=False):
    rmse = RMSE()
    mae = MAE()
    numErrors = 0
    for rating, item_id, user_id in _testData.get():
        try:
            pred_rating = clf.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
            if verbose:
                print item_id, user_id, rating, pred_rating
        except KeyError as e:
            if verbose:
                print 'ERROR occurred:', e.message
            numErrors += 1
    print '\n%i/%i data points raised errors.' % (numErrors, len(_testData))
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
def root_mean_square_error(train_values, predicted_values):
    if len(train_values) != len(predicted_values):
        sys.stderr.write("root_mean_square_error: Invalid list lengths\n")
        exit(1)
    rmse = RMSE()
    rmse.load_ground_truth(train_values)
    rmse.load_test(predicted_values)
    return rmse.compute()
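# A quick usage sketch for the helper above; the two lists are made-up
# sample values, not data from any of these examples:
ground_truth = [3.0, 1.0, 5.0, 2.0]
predicted = [2.3, 0.9, 4.9, 0.9]
print 'RMSE=%s' % root_mean_square_error(ground_truth, predicted)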
class TestPrediction(Test):
    def __init__(self):
        super(TestPrediction, self).__init__()
        # Prediction-based metrics: MAE, RMSE, Pearson
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)
        self.R = 3         # Real Rating (ground truth)
        self.R_PRED = 2.1  # Predicted Rating

    # test_PRED MAE
    def test_PRED_MAE_compute_one(self):
        assert_equal(self.mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_one_empty_datasets(self):
        mae = MAE()
        assert_equal(mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_all(self):
        assert_equal(self.mae.compute(), 0.7)

    def test_PRED_MAE_nan(self):
        mae = MAE()
        mae.add(2.0, nan)
        assert_equal(mae.get_test(), [])
        assert_equal(mae.get_ground_truth(), [])

    def test_PRED_MAE_load(self):
        mae = MAE()
        mae.load(self.GT_DATA, self.TEST_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_load_test(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        assert_equal(len(mae.get_test()), len(self.TEST_DATA))
        assert_equal(len(mae.get_ground_truth()), 0)
        assert_raises(ValueError, mae.compute)  # Raises: GT is empty!

    def test_PRED_MAE_load_test_and_ground_truth(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        mae.load_ground_truth(self.GT_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_add_entry(self):
        self.mae.add(1, 4)  # 1: GT rating, 4: Predicted rating
        assert_equal(len(self.mae.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.mae.compute(), 1.083333)

    def test_PRED_MAE_different_list_sizes(self):
        mae = MAE()
        GT = [3, 1, 5, 2]  # GT list has one element less than self.TEST_DATA
        mae.load(GT, self.TEST_DATA)
        assert_raises(ValueError, mae.compute)

    # test_PRED RMSE
    def test_PRED_RMSE_compute_one(self):
        # Even though rmse has data, we only compute these two param values
        assert_equal(self.rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_one_empty_datasets(self):
        rmse = RMSE()
        assert_equal(rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_all(self):
        assert_equal(self.rmse.compute(), 0.891067)

    def test_PRED_RMSE_load_test(self):
        rmse = RMSE()
        self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
        rmse.load_test(self.TEST_DATA)
        assert_equal(len(rmse.get_test()), len(self.TEST_DATA))

    def test_PRED_RMSE_add_entry(self):
        self.rmse.add(1, 4)
        assert_equal(len(self.rmse.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.rmse.compute(), 1.470261)

    def test_PRED_RMSE_different_list_sizes(self):
        rmse = RMSE()
        GT = [3, 1, 5, 2]  # GT list has one element less than self.TEST_DATA
        rmse.load(GT, self.TEST_DATA)
        assert_raises(ValueError, rmse.compute)

    def test_PRED_RMSE_numpy_array(self):
        rmse = RMSE()
        rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
        # a bare `assert (expr, value)` is always true, so compare explicitly
        assert_equal(rmse.compute(), 0.891067)
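# The assertions above look like nose-style helpers, and the base Test class
# supplies DATA_PRED, GT_DATA and TEST_DATA; a minimal header such a test
# module would need (assumed imports):
from numpy import array, nan
from nose.tools import assert_equal, assert_raises
from recsys.evaluation.prediction import RMSE, MAE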
    dist[i] = sorted(dist[i], key=itemgetter(1), reverse=True)
    #print str(i)+": "+str(item_id)
    #print dist[i]
    i += 1
#print dist
print "mean leng = " + str(statistics.mean(leng))
print "max leng = " + str(max(leng))
print "min leng = " + str(min(leng))

for k in range(3, 46):
    print str(k) + "NN: " + str(p) + " fold..."
    if k not in rmse.keys():
        rmse[k] = []
    result = RMSE()
    i = 0
    for rating, item_id, user_id in test:
        if len(dist[i]) < 5:
            if item_id in train_item.keys():
                pred_rating = statistics.mean(train_item[item_id].values())
            elif user_id in train_user.keys():
                pred_rating = statistics.mean(train_user[user_id].values())
            else:
                pred_rating = average
        else:
            ratings = []
            for j in range(0, k):
                if j == len(dist[i]):
                    break
                ratings.append(train_item[dist[i][j][0]][user_id])
#3.10
[items_full[str(x[0])].get_data() for x in films]

#3.11
get_name_item_reviewed(10, user_full, items_full)

#3.12
items_full[str(2628)].get_data()
users_for_star_wars = svd.recommend(2628, only_unknowns=True)
users_for_star_wars

#3.13
movies_reviewed_by_sw_rec = [get_name_item_reviewed(x[0], user_full, items_full)
                             for x in users_for_star_wars]
movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec
                  for movie in movie_list]
movie_aggregate = movies_by_category(movies_flatten, 3)
movies_sort = sorted(movie_aggregate, key=lambda x: x[1], reverse=True)
movies_sort

#3.14
from recsys.evaluation.prediction import RMSE
err = RMSE()
for rating, item_id, user_id in data.get():
    try:
        prediction = svd.predict(item_id, user_id)
        err.add(rating, prediction)
    except KeyError, k:
        continue
print 'RMSE is ' + str(err.compute())
# Create SVD, sweeping k from 50 to 78 in steps of 2
for j in range(50, 80, 2):
    sum_value = 0.0
    for i in range(1, 11):
        # Train & Test data
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        K = j
        svd = SVDNeighbourhood()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                    post_normalize=True)
        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                continue
        print 'RMSE=%s' % rmse.compute()
        sum_value = sum_value + rmse.compute()
    print '-------'
    print 'the k value is %s' % j
    print 'Final RMSE (mean over 10 runs)=%s' % (sum_value / 10)
    print '-------'
def evaluate_matrices_rmse(self, original_matrix, imputed_matrix):
    return self.evaluate_matrices(original_matrix, imputed_matrix,
                                  evaluator=RMSE())
import sys

from recsys.evaluation.prediction import RMSE, MAE
from recsys.datamodel.data import Data
from baseline import Baseline  # Import the Baseline class we've just created

# Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1], sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
# Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

baseline = Baseline()
baseline.set_data(train)
baseline.compute()  # In this case, it does nothing

# Evaluate
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = baseline.predict(item_id, user_id, user_is_row=False)
        rmse.add(rating, pred_rating)
        mae.add(rating, pred_rating)
    except KeyError:
        continue

print 'RMSE=%s' % rmse.compute()  # in my case (~80% train, ~20% test set) returns RMSE = 1.036374
print 'MAE=%s' % mae.compute()    # in my case (~80% train, ~20% test set) returns MAE = 0.829024
# Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1], sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
# Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

# Note: SVDLIBC reads the full ratings file here, not the train split, so
# the test ratings below are also part of the factorization
svdlibc = SVDLIBC('./ml-1m/ratings.dat')
svdlibc.to_sparse_matrix(sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
svdlibc.compute(k=100)
svd = svdlibc.export()
svd.save_model('/tmp/svd-model', options={'k': 100})
#svd.similar(ITEMID1) # results might be different than example 4, as there's no min_values=10 set here

# Evaluation using prediction-based metrics
print 'Evaluating...'
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = svd.predict(item_id, user_id, 0.0, 5.0)
        rmse.add(rating, pred_rating)
        mae.add(rating, pred_rating)
    except KeyError:
        continue

print 'RMSE=%s' % rmse.compute()
print 'MAE=%s' % mae.compute()
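# The model saved above can be reloaded in a later session instead of being
# recomputed; a minimal sketch using the same /tmp/svd-model path, mirroring
# the SVD(filename=...) load used in a later example:
from recsys.algorithm.factorize import SVD

svd2 = SVD(filename='/tmp/svd-model')  # load the previously saved model
print svd2.predict(1, 1, 0.0, 5.0)     # bounded prediction for item 1, user 1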
RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for run in RUNS:
    print "RUN(%d)" % run
    # Train & Test data
    train, test = data.split_train_test(percent=PERCENT_TRAIN)
    svd.set_data(train)
    svd_neig.set_data(train)
    # Compute SVD
    svd.compute(k=K, min_values=None, pre_normalize=None, mean_center=True,
                post_normalize=True)
    svd_neig.compute(k=K, min_values=None, pre_normalize=None, mean_center=True,
                     post_normalize=True)
    # Evaluate
    rmse_svd = RMSE()
    mae_svd = MAE()
    rmse_svd_neig = RMSE()
    mae_svd_neig = MAE()
    i = 1
    total = len(test.get())
    print "Total Test ratings: %s" % total
    for rating, item_id, user_id in test:
        try:
            pred_rating_svd = svd.predict(item_id, user_id)
            rmse_svd.add(rating, pred_rating_svd)
            mae_svd.add(rating, pred_rating_svd)
            pred_rating_svd_neig = svd_neig.predict(item_id, user_id)  # Koren & co.
            if pred_rating_svd_neig is not nan:
print ''
print 'GENERATING PREDICTION'
MIN_RATING = 0.0
MAX_RATING = 5.0
ITEMID = 1
USERID = 1

print svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)  # predicted rating value
print svd.get_matrix().value(ITEMID, USERID)  # real rating value

print ''
print 'GENERATING RECOMMENDATION'
print svd.recommend(USERID, n=5, only_unknowns=True, is_row=False)

# Evaluation using prediction-based metrics
rmse = RMSE()
mae = MAE()
spearman = SpearmanRho()
kendall = KendallTau()
#decision = PrecisionRecallF1()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = svd.predict(item_id, user_id)
        rmse.add(rating, pred_rating)
        mae.add(rating, pred_rating)
        spearman.add(rating, pred_rating)
        kendall.add(rating, pred_rating)
    except KeyError:
        continue

print ''
class RecommendSystem(object):
    def __init__(self, filename, sep, **format):
        # File info
        self.filename = filename
        self.sep = sep
        self.format = format
        # Initialize the matrix factorization
        self.svd = SVD()
        # Matrix settings
        self.k = 100          # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 users
        self.post_normalize = False
        # Flag: whether a saved model has been loaded
        self.load_model = False
        # Initialize the RMSE evaluator
        self.rmse = RMSE()

    def get_data(self):
        # If the saved model does not exist, the raw data must be loaded
        # (`filename` here is the module-level path of the saved model file)
        if not os.path.exists(filename):
            if not os.path.exists(self.filename):
                sys.exit()
            # Have SVD load the data
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            data = Data()
            data.load(self.filename, sep=self.sep, format=self.format)
            # Split the dataset
            train, test = data.split_train_test(percent=80)
            return train, test
        else:
            # Load the saved model directly
            self.svd.load_model(filename)
            # Mark the model as loaded
            self.load_model = True
            return None, None

    def train(self, train):
        """
        Train the model
        :param train: training set
        :return:
        """
        if not self.load_model:
            # Hand the training set to SVD
            self.svd.set_data(train)
            # Note: savefile takes the file name without its extension
            self.svd.compute(k=self.k, min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=filename[:-4])
        return None

    def recommend_to_user(self, userid):
        """
        Produce recommendations
        :param userid: user ID
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)
        # Print the movie titles and the predicted ratings
        # Build the list of movie titles
        movies_list = []
        for line in open("./data/ml-1m/movies.dat", "r"):
            movies_list.append(' '.join(line.split("::")[1:2]))
        # Walk through the recommended IDs
        for itemid, rating in recommend_list:
            print "Recommended movie: %s, predicted rating: %f" % (movies_list[itemid], rating)
        return None

    def rs_predict(self, userid, itemid):
        """
        Predict a rating
        :param userid: user ID
        :param itemid: item ID
        :return: rating
        """
        score = self.svd.predict(itemid, userid)
        return score

    def evaluation(self, test):
        """
        Evaluate the model with RMSE
        :param test: test data
        :return: None
        """
        if not self.load_model:
            # Each test tuple is <rating, row (itemid), col (userid)>
            for rating, itemid, userid in test.get():
                try:
                    # rating is the ground-truth value
                    score = self.rs_predict(userid, itemid)
                    # Accumulate every test pair
                    self.rmse.add(rating, score)
                except KeyError:
                    continue
            error = self.rmse.compute()
            print "RMSE: %s" % error
        return None
# Load SVD from /tmp
svd2 = SVD(filename='/tmp/movielens')  # Loading already computed SVD model

# Predict User rating for given user and movie:
USERID = 2
ITEMID = 1  # Toy Story
rating1 = svd2.predict(ITEMID, USERID, 0.0, 5.0)
print 'Predicted rating=%f' % rating1

flag = 0
# Retrieve actual rating for given user and movie
for rating, item_id, user_id in data.get():
    if user_id == USERID and item_id == ITEMID:
        rat = rating
        #print 'Actual rating=%f' % rating
        flag = 1
        break
if flag == 1:
    print 'Actual rating=%f' % rat
else:
    sys.exit("No actual rating available")

# Evaluating prediction
rmse = RMSE()
mae = MAE()
rmse.add(rat, rating1)  # (ground truth, prediction)
mae.add(rat, rating1)
print 'RMSE=%s' % rmse.compute()
print 'MAE=%s' % mae.compute()
class RecommendSystem(object):
    def __init__(self, filename, sep, **format):
        self.filename = filename
        self.sep = sep
        self.format = format
        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True
        self.svd = SVD()
        # Flag: whether a saved model was loaded
        self.is_load = False
        # Data handling
        self.data = Data()
        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """
        Fetch the data
        :return: None
        """
        # If the saved model does not exist
        if not os.path.exists(tmpfile):
            # If the data file does not exist either
            if not os.path.exists(self.filename):
                sys.exit()
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            # Use Data() to load the ratings
            self.data.load(self.filename, sep=self.sep, format=self.format)
            train, test = self.data.split_train_test(percent=80)
            return train, test
        else:
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

    def train(self, train):
        """
        Train the model
        :param train: training data
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            self.svd.compute(k=self.k, min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """
        Predict a rating
        :param itemid: movie ID
        :param userid: user ID
        :return: None
        """
        score = self.svd.predict(itemid, userid)
        print "Predicted rating: %f" % score
        return score

    def recommend_to_user(self, userid):
        """
        Recommend movies to a user
        :param userid: user ID
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)
        # Read the movie titles from the movies file
        movie_list = []
        for line in open(moviefile, "r"):
            movie_list.append(' '.join(line.split("::")[1:2]))
        # Print each recommended title with its predicted score
        for itemid, rate in recommend_list:
            print "Recommended %s, predicted rating %s" % (movie_list[itemid], rate)
        return None

    def evaluation(self, test):
        """
        Evaluate the model
        :param test: test set
        :return: None
        """
        # Only evaluate when the model was not loaded from disk
        if not self.is_load:
            # Each test tuple is <rating, movie, user>
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    continue
            # Compute and report the error (RMSE)
            error = self.rmse.compute()
            print "Model RMSE: %s" % error
        return None
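# A hypothetical driver for the class above; the tmpfile/moviefile globals
# and the ml-1m paths are assumptions (the class body references them but
# never defines them):
tmpfile = './svd_model.zip'
moviefile = './data/ml-1m/movies.dat'

rs = RecommendSystem('./data/ml-1m/ratings.dat', '::',
                     col=0, row=1, value=2, ids=int)
train, test = rs.get_data()
rs.train(train)  # no-op when a saved model was loaded
rs.recommend_to_user(1)
if test is not None:
    rs.evaluation(test)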
    s3 = 0
    for item in items:
        item_history = train_item[item].values()
        mean = statistics.mean(item_history)
        s1 += (train_item[item][user1] - mean) * (train_item[item][user2] - mean)
        s2 += math.pow((train_item[item][user1] - mean), 2)
        s3 += math.pow((train_item[item][user2] - mean), 2)
    if math.sqrt(s2 * s3) == 0:
        return -sys.float_info.max
    else:
        return s1 / (math.sqrt(s2 * s3))

# Evaluate
k = 8
rmse = RMSE()
i = 0
for rating, item_id, user_id in test:
    print "==========================================="
    try:
        print i
        i += 1
        dist = {}
        if item_id in train_item.keys():
            for user in train_item[item_id].keys():
                sim = similarity(user_id, user)
                if sim >= 0:
                    if len(dist) < k: