def calculate_stats_users(pct_train):
    """Fit an SVD on 'user_data_working.csv' and print RMSE and MAE.

    The comma-separated file is loaded with columns 0/1/2 mapped to
    col/row/value and integer ids, split with
    ``Data.split_train_test(percent=pct_train)``, and a k=100 SVD is computed
    on the training part. Both metrics are then accumulated over the test
    part and printed.

    :param pct_train: value forwarded as ``percent`` to ``split_train_test``.
    """
    ratings = Data()
    ratings.load('user_data_working.csv', sep=',',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train_part)
    model.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                  post_normalize=False)

    rmse, mae = RMSE(), MAE()
    for actual, item, user in test_part.get():
        try:
            predicted = model.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def ex1(dat_file='./ml-1m/ratings.dat', pct_train=0.5):
    """Train a K=100 SVD on a '::'-separated ratings file; print RMSE and MAE.

    :param dat_file: path handed to ``Data.load``.
    :param pct_train: value forwarded as ``percent`` to ``split_train_test``.
    """
    K = 100

    ratings = Data()
    ratings.load(dat_file, sep='::',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # Hold part of the data back for evaluation.
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    # Factorise the training part only.
    model = SVD()
    model.set_data(train_part)
    model.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                  post_normalize=True)

    # Accumulate both error metrics over the held-out entries.
    rmse, mae = RMSE(), MAE()
    for actual, item, user in test_part.get():
        try:
            predicted = model.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def evaluate(data, count=5, K=100):
    """Repeat a train/test SVD evaluation ``count`` times and collect metrics.

    Each round splits ``data`` with ``split_train_test(percent=PERCENT_TRAIN)``
    (``PERCENT_TRAIN`` is a module-level name), computes a rank-``K`` SVD on
    the training part and accumulates RMSE and MAE over the test part.

    :param data: object providing ``get()`` and ``split_train_test()``.
    :param count: number of rounds to run.
    :param K: value passed as ``k`` to ``SVD.compute``.
    :returns: list of ``{"RMSE": ..., "MAE": ...}`` dicts, one for every round
        whose metrics could be computed; failing rounds are reported on
        stdout and omitted.
    """
    results = []
    for _ in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        print('%s %s %s' % (len(data.get()), len(train.get()), len(test.get())))

        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                    post_normalize=True)

        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                # Entries that raise KeyError are left out of both metrics.
                continue

        # Only the metric computation is guarded. ``Exception`` (rather than
        # a bare ``except``) keeps the best-effort behaviour while letting
        # KeyboardInterrupt / SystemExit propagate.
        try:
            rsu = {"RMSE": rmse.compute(), "MAE": mae.compute()}
        except Exception:
            print("one error....++++++++++++++++++++++++++++++++++++++++++++++++++++")
        else:
            print(rsu)
            results.append(rsu)
    return results
def test_SVD(svd, train, test, pct_train):
    """Print RMSE and MAE of ``svd`` over the entries returned by ``test.get()``.

    ``train`` and ``pct_train`` are accepted but not used by this function.
    """
    rmse, mae = RMSE(), MAE()
    for actual, item, user in test.get():
        try:
            predicted = svd.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def test_SVD(svd, train, test, pct_train):
    """Print RMSE and MAE of ``svd`` over the entries returned by ``test.get()``.

    ``train`` and ``pct_train`` are accepted but not used by this function.
    """
    rmse, mae = RMSE(), MAE()
    for actual, item, user in test.get():
        try:
            predicted = svd.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def eval_rmse(self):
    """Print RMSE and MAE of ``self.svd`` over the entries of ``self.test``."""
    # Evaluation using prediction-based metrics
    rmse, mae = RMSE(), MAE()
    for actual, item, user in self.test.get():
        try:
            predicted = self.svd.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def eval_reco(model, test):
    """Accumulate RMSE and MAE for ``model`` over the entries of ``test``.

    Entries for which a ``KeyError`` is raised while predicting or recording
    are skipped.

    :returns: the ``(rmse, mae)`` metric objects; ``compute()`` has not been
        called on them.
    """
    # Evaluation using prediction-based metrics
    rmse, mae = RMSE(), MAE()
    for actual, item, user in test.get():
        try:
            predicted = model.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            continue
    return rmse, mae
def evaluate(_svd, _testData, verbose=False):
    """Print RMSE and MAE of ``_svd`` over ``_testData.get()``.

    Predictions are requested with ``MIN_VALUE=0`` and ``MAX_VALUE=10``. Any
    exception raised for an entry is reported on stdout and the loop moves on.

    The metric objects and the loop variables are bound at module level
    through the ``global`` declaration below, so they remain visible after
    the call.

    :param verbose: when true, print each item id, user id, rating and
        predicted rating.
    """
    global rmse, mae, rating, item_id, user_id, pred_rating

    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in _testData.get():
        try:
            pred_rating = _svd.predict(item_id, user_id,
                                       MIN_VALUE=0, MAX_VALUE=10)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
            if verbose:
                fields = (item_id, user_id, rating, pred_rating)
                print(' '.join('%s' % (field,) for field in fields))
        except Exception as e:
            print(' '.join(('ERROR occurred:', '%s' % (e.message,))))

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Train a K=100 SVD on a '::'-separated ratings file; print RMSE and MAE.

    :param dat_file: path handed to ``Data.load``.
    :param pct_train: value forwarded as ``percent`` to ``split_train_test``.
    """
    K = 100

    # Meaning of the ``format`` mapping:
    #   'row': 1    -> matrix rows come from column 1 of the ratings file
    #   'col': 0    -> matrix columns come from column 0 of the ratings file
    #   'value': 2  -> matrix values (Mij) come from column 2 of the file
    #   'ids': int  -> row and column ids are integers (not strings)
    ratings = Data()
    ratings.load(dat_file, sep='::',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # Hold part of the data back for evaluation.
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    # Factorise the training part only.
    model = SVD()
    model.set_data(train_part)
    model.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                  post_normalize=True)

    # MAE is the mean ABSOLUTE error. The original author notes a value of
    # about 1.09 here, i.e. an error of almost 1 point out of 5.
    rmse, mae = RMSE(), MAE()
    for actual, item, user in test_part.get():
        try:
            predicted = model.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def evaulte(train_set, test_set):
    """Fit an SVD on ``train_set`` and return half the MAE over ``test_set``.

    The SVD is computed with the module-level ``KKK`` and ``MIN_ITEM``
    settings. Test entries that raise ``KeyError`` are counted and skipped;
    that count and both set sizes are printed before returning.

    :returns: ``mae.compute() / 2.0``.
    """
    model = SVD()
    model.set_data(train_set)
    model.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    mae = MAE()
    skipped = 0
    for actual, item, user in test_set.get():
        try:
            predicted = model.predict(item, user)
            mae.add(actual, predicted)
        except KeyError:
            skipped += 1

    # Pieces are joined with single spaces, matching a multi-argument print.
    report = ("k_err", skipped, " -- ", "test-len: ", len(test_set.get()),
              "train-len: ", len(train_set.get()))
    print(' '.join('%s' % (piece,) for piece in report))

    return mae.compute() / 2.0
def calculate_stats_users(pct_train):
    """Fit an SVD on 'user_data_working.csv' and print RMSE and MAE.

    The comma-separated file is loaded with columns 0/1/2 mapped to
    col/row/value and integer ids, split with
    ``Data.split_train_test(percent=pct_train)``, and a k=100 SVD is computed
    on the training part. Both metrics are then accumulated over the test
    part and printed.

    :param pct_train: value forwarded as ``percent`` to ``split_train_test``.
    """
    ratings = Data()
    ratings.load('user_data_working.csv', sep=',',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train_part)
    model.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                  post_normalize=False)

    rmse, mae = RMSE(), MAE()
    for actual, item, user in test_part.get():
        try:
            predicted = model.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def evaluate(clf, _testData, verbose=False):
    """Print RMSE and MAE of ``clf`` over ``_testData.get()``.

    Entries that raise ``KeyError`` are counted; the count is printed
    together with ``len(_testData)`` before the metrics.

    :param verbose: when true, print every prediction and every KeyError.
    """
    rmse, mae = RMSE(), MAE()
    failed = 0
    for actual, item, user in _testData.get():
        try:
            predicted = clf.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
            if verbose:
                fields = (item, user, actual, predicted)
                print(' '.join('%s' % (field,) for field in fields))
        except KeyError as err:
            if verbose:
                print(' '.join(('ERROR occurred:', '%s' % (err.message,))))
            failed += 1

    print('\n%i/%i data points raised errors.' % (failed, len(_testData)))
    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def evaulte(train_set, test_set):
    """Fit an SVD on ``train_set`` and return half the MAE over ``test_set``.

    The SVD is computed with the module-level ``KKK`` and ``MIN_ITEM``
    settings. Test entries that raise ``KeyError`` are counted and skipped;
    that count and both set sizes are printed before returning.

    :returns: ``mae.compute() / 2.0``.
    """
    model = SVD()
    model.set_data(train_set)
    model.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    mae = MAE()
    skipped = 0
    for actual, item, user in test_set.get():
        try:
            predicted = model.predict(item, user)
            mae.add(actual, predicted)
        except KeyError:
            skipped += 1

    # Pieces are joined with single spaces, matching a multi-argument print.
    report = ("k_err", skipped, " -- ", "test-len: ", len(test_set.get()),
              "train-len: ", len(train_set.get()))
    print(' '.join('%s' % (piece,) for piece in report))

    return mae.compute() / 2.0
def ex1(dat_file='ml-1m/ratings.dat', pct_train=0.5):
    """Train a K=100 SVD on a '::'-separated ratings file; print RMSE and MAE.

    :param dat_file: path handed to ``Data.load``.
    :param pct_train: value forwarded as ``percent`` to ``split_train_test``.
    """
    K = 100

    # Meaning of the ``format`` mapping:
    #   'row': 1    -> matrix rows come from column 1 of the ratings file
    #   'col': 0    -> matrix columns come from column 0 of the ratings file
    #   'value': 2  -> matrix values (Mij) come from column 2 of the file
    #   'ids': int  -> row and column ids are integers (not strings)
    ratings = Data()
    ratings.load(dat_file, sep='::',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # Hold part of the data back for evaluation.
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    # Factorise the training part only.
    model = SVD()
    model.set_data(train_part)
    model.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                  post_normalize=True)

    # Accumulate both error metrics over the held-out entries.
    rmse, mae = RMSE(), MAE()
    for actual, item, user in test_part.get():
        try:
            predicted = model.predict(item, user)
            rmse.add(actual, predicted)
            mae.add(actual, predicted)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
class TestPrediction(Test):
    """Tests for the prediction-based metrics ``MAE`` and ``RMSE``.

    Fixture data (``DATA_PRED``, ``GT_DATA``, ``TEST_DATA``) is read from
    attributes of the ``Test`` base class.
    """

    def __init__(self):
        super(TestPrediction, self).__init__()
        # Prediction-based metrics under test, both pre-loaded with DATA_PRED
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)
        self.R = 3  # Real Rating (ground truth)
        self.R_PRED = 2.1  # Predicted Rating

    # test_PRED MAE
    def test_PRED_MAE_compute_one(self):
        # Even though mae has data, only the two given values are compared
        assert_equal(self.mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_one_empty_datasets(self):
        mae = MAE()
        assert_equal(mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_all(self):
        assert_equal(self.mae.compute(), 0.7)

    def test_PRED_MAE_nan(self):
        # A NaN prediction must not be stored in either list
        mae = MAE()
        mae.add(2.0, nan)
        assert_equal(mae.get_test(), [])
        assert_equal(mae.get_ground_truth(), [])

    def test_PRED_MAE_load(self):
        mae = MAE()
        mae.load(self.GT_DATA, self.TEST_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_load_test(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        assert_equal(len(mae.get_test()), len(self.TEST_DATA))
        assert_equal(len(mae.get_ground_truth()), 0)
        assert_raises(ValueError, mae.compute)  # Raise: GT is empty!

    def test_PRED_MAE_load_test_and_ground_truth(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        mae.load_ground_truth(self.GT_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_add_entry(self):
        self.mae.add(1, 4)  # 1: GT rating, 4: Predicted rating
        assert_equal(len(self.mae.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.mae.compute(), 1.083333)

    def test_PRED_MAE_different_list_sizes(self):
        mae = MAE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        mae.load(GT, self.TEST_DATA)
        assert_raises(ValueError, mae.compute)

    # test_PRED RMSE
    def test_PRED_RMSE_compute_one(self):
        # Even though rmse has data, we only compute these two param values
        assert_equal(self.rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_one_empty_datasets(self):
        rmse = RMSE()
        assert_equal(rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_all(self):
        assert_equal(self.rmse.compute(), 0.891067)

    def test_PRED_RMSE_load_test(self):
        rmse = RMSE()
        self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
        rmse.load_test(self.TEST_DATA)
        assert_equal(len(rmse.get_test()), len(self.TEST_DATA))

    def test_PRED_RMSE_add_entry(self):
        self.rmse.add(1, 4)
        assert_equal(len(self.rmse.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.rmse.compute(), 1.470261)

    def test_PRED_RMSE_different_list_sizes(self):
        rmse = RMSE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        rmse.load(GT, self.TEST_DATA)
        assert_raises(ValueError, rmse.compute)

    def test_PRED_RMSE_numpy_array(self):
        rmse = RMSE()
        rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
        # Was ``assert(rmse.compute(), 0.891067)``: that asserts a non-empty
        # tuple, which is always true, so the value was never checked.
        assert_equal(rmse.compute(), 0.891067)
# Dataset: argv[1] is the ratings file, argv[2] the value passed as the
# training percentage.
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(
    sys.argv[1],
    sep='::',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

# Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

# Factorise with SVDLIBC, export the result as an SVD object and save it.
# NOTE(review): SVDLIBC reads the fixed path './ml-1m/ratings.dat', not
# sys.argv[1] and not the ``train`` split -- confirm this is intended.
svdlibc = SVDLIBC('./ml-1m/ratings.dat')
svdlibc.to_sparse_matrix(
    sep='::',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
svdlibc.compute(k=100)
svd = svdlibc.export()
svd.save_model('/tmp/svd-model', options={'k': 100})
#svd.similar(ITEMID1) # results might be different than example 4. as there's no min_values=10 set here

# Evaluation using prediction-based metrics
print('Evaluating...')
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
        # The two trailing arguments are passed positionally as 0.0 and 5.0.
        pred_rating = svd.predict(item_id, user_id, 0.0, 5.0)
        rmse.add(rating, pred_rating)
        mae.add(rating, pred_rating)
    except KeyError:
        # Entries that raise KeyError are left out of both metrics.
        continue

print('RMSE=%s' % rmse.compute())
print('MAE=%s' % mae.compute())
post_normalize=True) # Evaluate rmse_svd = RMSE() mae_svd = MAE() rmse_svd_neig = RMSE() mae_svd_neig = MAE() i = 1 total = len(test.get()) print 'Total Test ratings: %s' % total for rating, item_id, user_id in test: try: pred_rating_svd = svd.predict(item_id, user_id) rmse_svd.add(rating, pred_rating_svd) mae_svd.add(rating, pred_rating_svd) pred_rating_svd_neig = svd_neig.predict(item_id, user_id) #Koren & co. if pred_rating_svd_neig is not nan: rmse_svd_neig.add(rating, pred_rating_svd_neig) mae_svd_neig.add(rating, pred_rating_svd_neig) print "\rProcessed test rating %d" % i, sys.stdout.flush() i += 1 except KeyError: continue rmse_svd_all.append(rmse_svd.compute())
def test_PRED_MAE_nan(self):
    # After adding a NaN prediction, both stored lists are expected empty.
    metric = MAE()
    metric.add(2.0, nan)
    for stored in (metric.get_test(), metric.get_ground_truth()):
        assert_equal(stored, [])
class TestPrediction(Test):
    """Tests for the prediction-based metrics ``MAE`` and ``RMSE``.

    Fixture data (``DATA_PRED``, ``GT_DATA``, ``TEST_DATA``) is read from
    attributes of the ``Test`` base class.
    """

    def __init__(self):
        super(TestPrediction, self).__init__()
        # Prediction-based metrics under test, both pre-loaded with DATA_PRED
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)
        self.R = 3  # Real Rating (ground truth)
        self.R_PRED = 2.1  # Predicted Rating

    # test_PRED MAE
    def test_PRED_MAE_compute_one(self):
        # Even though mae has data, only the two given values are compared
        assert_equal(self.mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_one_empty_datasets(self):
        mae = MAE()
        assert_equal(mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_all(self):
        assert_equal(self.mae.compute(), 0.7)

    def test_PRED_MAE_nan(self):
        # A NaN prediction must not be stored in either list
        mae = MAE()
        mae.add(2.0, nan)
        assert_equal(mae.get_test(), [])
        assert_equal(mae.get_ground_truth(), [])

    def test_PRED_MAE_load(self):
        mae = MAE()
        mae.load(self.GT_DATA, self.TEST_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_load_test(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        assert_equal(len(mae.get_test()), len(self.TEST_DATA))
        assert_equal(len(mae.get_ground_truth()), 0)
        assert_raises(ValueError, mae.compute)  # Raise: GT is empty!

    def test_PRED_MAE_load_test_and_ground_truth(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        mae.load_ground_truth(self.GT_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_add_entry(self):
        self.mae.add(1, 4)  # 1: GT rating, 4: Predicted rating
        assert_equal(len(self.mae.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.mae.compute(), 1.083333)

    def test_PRED_MAE_different_list_sizes(self):
        mae = MAE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        mae.load(GT, self.TEST_DATA)
        assert_raises(ValueError, mae.compute)

    # test_PRED RMSE
    def test_PRED_RMSE_compute_one(self):
        # Even though rmse has data, we only compute these two param values
        assert_equal(self.rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_one_empty_datasets(self):
        rmse = RMSE()
        assert_equal(rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_all(self):
        assert_equal(self.rmse.compute(), 0.891067)

    def test_PRED_RMSE_load_test(self):
        rmse = RMSE()
        self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
        rmse.load_test(self.TEST_DATA)
        assert_equal(len(rmse.get_test()), len(self.TEST_DATA))

    def test_PRED_RMSE_add_entry(self):
        self.rmse.add(1, 4)
        assert_equal(len(self.rmse.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.rmse.compute(), 1.470261)

    def test_PRED_RMSE_different_list_sizes(self):
        rmse = RMSE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        rmse.load(GT, self.TEST_DATA)
        assert_raises(ValueError, rmse.compute)

    def test_PRED_RMSE_numpy_array(self):
        rmse = RMSE()
        rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
        # Was ``assert (rmse.compute(), 0.891067)``: that asserts a non-empty
        # tuple, which is always true, so the value was never checked.
        assert_equal(rmse.compute(), 0.891067)
# Load SVD from /tmp
svd2 = SVD(filename='/tmp/movielens')  # Loading already computed SVD model

# Predict User rating for given user and movie:
USERID = 2
ITEMID = 1  # Toy Story
rating1 = svd2.predict(ITEMID, USERID, 0.0, 5.0)
print('Predicted rating=%f' % rating1)

# Retrieve actual rating for given user and movie
flag = 0
for rating, item_id, user_id in data.get():
    if user_id == USERID and item_id == ITEMID:
        rat = rating
        flag = 1
        break

# Without a stored rating there is nothing to compare against: stop here.
if flag != 1:
    sys.exit("No actual rating available")
print('Actual rating=%f' % rat)

# Evaluating prediction
rmse = RMSE()
mae = MAE()
# add() is used elsewhere as add(ground_truth, predicted); the original call
# passed (predicted, actual). Both metrics are symmetric, so the printed
# values are unchanged, but the stored lists are now in the expected order.
rmse.add(rat, rating1)
mae.add(rat, rating1)
print('RMSE=%s' % rmse.compute())
print('MAE=%s' % mae.compute())
svd_neig.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True) # Evaluate rmse_svd = RMSE() mae_svd = MAE() rmse_svd_neig = RMSE() mae_svd_neig = MAE() i = 1 total = len(test.get()) print "Total Test ratings: %s" % total for rating, item_id, user_id in test: try: pred_rating_svd = svd.predict(item_id, user_id) rmse_svd.add(rating, pred_rating_svd) mae_svd.add(rating, pred_rating_svd) pred_rating_svd_neig = svd_neig.predict(item_id, user_id) # Koren & co. if pred_rating_svd_neig is not nan: rmse_svd_neig.add(rating, pred_rating_svd_neig) mae_svd_neig.add(rating, pred_rating_svd_neig) print "\rProcessed test rating %d" % i, sys.stdout.flush() i += 1 except KeyError: continue rmse_svd_all.append(rmse_svd.compute()) mae_svd_all.append(mae_svd.compute())
MAX_RATING) # predicted rating value print svd.get_matrix().value(ITEMID, USERID) # real rating value print '' print 'GENERATING RECOMMENDATION' print svd.recommend(USERID, n=5, only_unknowns=True, is_row=False) #Evaluation using prediction-based metrics rmse = RMSE() mae = MAE() spearman = SpearmanRho() kendall = KendallTau() #decision = PrecisionRecallF1() for rating, item_id, user_id in test.get(): try: pred_rating = svd.predict(item_id, user_id) rmse.add(rating, pred_rating) mae.add(rating, pred_rating) spearman.add(rating, pred_rating) kendall.add(rating, pred_rating) except KeyError: continue print '' print 'EVALUATION RESULT' print 'RMSE=%s' % rmse.compute() print 'MAE=%s' % mae.compute() print 'Spearman\'s rho=%s' % spearman.compute() print 'Kendall-tau=%s' % kendall.compute() #print decision.compute() print ''