Exemplo n.º 1
0
def calculate_stats_users(pct_train):
    """Train an SVD on the user data file and print its test-set error.

    Loads ``user_data_working.csv`` (comma separated), splits it into a
    train and a test part, computes an SVD on the train part and prints
    the RMSE and MAE obtained on the test part.

    :param pct_train: value forwarded as ``percent`` to
        ``Data.split_train_test``.
    """
    dat_file = 'user_data_working.csv'
    # Which file column feeds the matrix column, row and value.
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load(dat_file, sep=',', format=layout)
    train, test = data.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    model.compute(k=100, min_values=2, pre_normalize=None,
                  mean_center=True, post_normalize=False)

    rmse = RMSE()
    mae = MAE()
    for truth, item, user in test.get():
        try:
            estimate = model.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, estimate)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Exemplo n.º 2
0
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):
    """Fit an SVD on a ``::``-separated ratings file and print RMSE/MAE.

    :param dat_file: path of the ratings file to load.
    :param pct_train: value forwarded as ``percent`` to
        ``Data.split_train_test``.
    """
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    data = Data()
    data.load(dat_file, sep='::', format=layout)

    # Train/test split
    train, test = data.split_train_test(percent=pct_train)

    # Factorisation of the training part
    n_factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=n_factors,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # Prediction-based evaluation on the test part
    rmse = RMSE()
    mae = MAE()
    for truth, item, user in test.get():
        try:
            estimate = model.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, estimate)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Exemplo n.º 3
0
def evaluate(data, count=5, K=100):
    """Repeat a random train/test evaluation of an SVD model.

    In every round the data is split again (using the module-level
    ``PERCENT_TRAIN``), an SVD is computed on the training part and RMSE
    and MAE are measured on the test part.

    :param data: dataset object providing ``split_train_test`` and ``get``.
    :param count: number of rounds to run.
    :param K: value passed as ``k`` to ``SVD.compute``.
    :return: list with one ``{"RMSE": ..., "MAE": ...}`` dict for every
        round whose metrics could be computed.
    """
    results = []

    for _ in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        print('%d %d %d' % (len(data.get()), len(train.get()), len(test.get())))

        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                # Entries that raise KeyError are left out of the metrics.
                continue

        try:
            rsu = {"RMSE": rmse.compute(), "MAE": mae.compute()}
        except Exception:
            # Best effort: a round whose metrics cannot be computed is
            # reported and skipped.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            print("one error....++++++++++++++++++++++++++++++++++++++++++++++++++++")
        else:
            print(rsu)
            results.append(rsu)

    return results
Exemplo n.º 4
0
    def __init__(self):
        """Create the MAE/RMSE fixtures and one (rating, prediction) pair."""
        super(TestPrediction, self).__init__()

        # Single pair used by the one-value tests:
        # real rating (ground truth) and predicted rating.
        self.R, self.R_PRED = 3, 2.1

        # Prediction-based metrics built from self.DATA_PRED
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)
Exemplo n.º 5
0
def test_SVD(svd, train, test, pct_train):
    """Print the RMSE and MAE of ``svd`` over the ``test`` ratings.

    ``train`` and ``pct_train`` are accepted but not used by this function.

    :param svd: model exposing ``predict(item_id, user_id)``.
    :param test: dataset whose ``get()`` yields
        ``(rating, item_id, user_id)`` tuples.
    """
    rmse, mae = RMSE(), MAE()
    for truth, item, user in test.get():
        try:
            estimate = svd.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, estimate)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Exemplo n.º 6
0
 def eval_rmse(self):
     """Print RMSE and MAE of ``self.svd`` over the ratings in ``self.test``."""
     rmse, mae = RMSE(), MAE()
     for truth, item, user in self.test.get():
         try:
             estimate = self.svd.predict(item, user)
             for metric in (rmse, mae):
                 metric.add(truth, estimate)
         except KeyError:
             # Entries that raise KeyError are left out of both metrics.
             pass
     print('RMSE=%s' % rmse.compute())
     print('MAE=%s' % mae.compute())
Exemplo n.º 7
0
def test_SVD(svd,train,test,pct_train):
    """Report prediction error of ``svd`` on ``test`` (prints RMSE and MAE).

    Only ``svd`` and ``test`` are read; ``train`` and ``pct_train`` are
    unused here.
    """
    rmse = RMSE()
    mae = MAE()
    metrics = (rmse, mae)

    for truth, item, user in test.get():
        try:
            guess = svd.predict(item, user)
            for metric in metrics:
                metric.add(truth, guess)
        except KeyError:
            # Skip entries for which a KeyError is raised.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Exemplo n.º 8
0
def eval_reco(model, test):
    """Fill RMSE and MAE accumulators with ``model`` predictions on ``test``.

    :param model: object exposing ``predict(item_id, user_id)``.
    :param test: dataset whose ``get()`` yields
        ``(rating, item_id, user_id)`` tuples.
    :return: ``(rmse, mae)`` metric objects; the caller is expected to call
        ``compute()`` on them.
    """
    rmse, mae = RMSE(), MAE()
    for truth, item, user in test.get():
        try:
            estimate = model.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, estimate)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            pass

    return rmse, mae
Exemplo n.º 9
0
def eval_reco(model, test):
    """Collect prediction-based metrics for ``model`` over ``test``.

    Every ``(rating, item_id, user_id)`` tuple from ``test.get()`` is
    predicted with ``model.predict`` and added to both accumulators.

    :return: tuple ``(rmse, mae)`` of the populated metric objects.
    """
    rmse = RMSE()
    mae = MAE()
    collectors = (rmse, mae)

    for truth, item, user in test.get():
        try:
            guess = model.predict(item, user)
            for collector in collectors:
                collector.add(truth, guess)
        except KeyError:
            # Skip entries for which a KeyError is raised.
            pass

    return rmse, mae
Exemplo n.º 10
0
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Fit an SVD on a ``::``-separated ratings file and print RMSE/MAE.

    :param dat_file: path of the ratings file to load.
    :param pct_train: value forwarded as ``percent`` to
        ``Data.split_train_test``.
    """
    # Meaning of the format keys:
    #   'row': 1   -> matrix rows come from column 1 of the ratings file
    #   'col': 0   -> matrix columns come from column 0 of the ratings file
    #   'value': 2 -> matrix values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers (not strings)
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    data = Data()
    data.load(dat_file, sep='::', format=layout)

    # Train/test split
    train, test = data.split_train_test(percent=pct_train)

    # Factorisation of the training part
    n_factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=n_factors, min_values=5, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # Prediction-based evaluation.  MAE is the mean ABSOLUTE error; the
    # original author notes it came out at 1.09 here, i.e. an error of
    # almost 1 point out of 5.
    rmse = RMSE()
    mae = MAE()
    for truth, item, user in test.get():
        try:
            estimate = model.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, estimate)
        except KeyError:
            # Entries that raise KeyError are left out of both metrics.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Exemplo n.º 11
0
def evaluate(_svd, _testData, verbose=False):
    """Print the RMSE and MAE of ``_svd`` over ``_testData``.

    :param _svd: model exposing ``predict(item_id, user_id, MIN_VALUE=...,
        MAX_VALUE=...)``.
    :param _testData: dataset whose ``get()`` yields
        ``(rating, item_id, user_id)`` tuples.
    :param verbose: when true, print every evaluated tuple together with
        its predicted rating.
    """
    # NOTE: all working variables are module-level globals, so after the
    # call they stay visible (and are overwritten) at module scope.
    global rmse, mae, rating, item_id, user_id, pred_rating
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in _testData.get():
        try:
            # MIN_VALUE/MAX_VALUE are passed as 0 and 10
            # (presumably the bounds predictions are clamped to -- confirm).
            pred_rating = _svd.predict(item_id, user_id, MIN_VALUE=0, MAX_VALUE=10)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)

            if verbose:
                print item_id, user_id, rating, pred_rating
        except Exception as e:
            # Any failure is reported and the entry is skipped.
            # NOTE(review): ``e.message`` only exists on Python 2 exceptions.
            print 'ERROR occurred:', e.message

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Exemplo n.º 12
0
    def __init__(self):
        """Set up the metric fixtures shared by the prediction tests."""
        super(TestPrediction, self).__init__()

        # One real rating (ground truth) and one predicted rating,
        # used by the single-pair tests.
        self.R, self.R_PRED = 3, 2.1

        # Prediction-based metrics built from self.DATA_PRED
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)
Exemplo n.º 13
0
def get_mae_rmse(step):
    """Evaluate the saved SVD model of ``step`` on a fresh test split.

    Loads ``second_train_test.dat.<step>``, splits it with ``percent=80``,
    loads the model ``svdn_model_<step>.zip`` and scores its predictions
    on the test part.

    :param step: value interpolated into the data and model file names.
    :return: ``(mae_value, rmse_value)``; each is ``np.nan`` when no
        prediction could be made.  Returns ``None`` when the model cannot
        be loaded (kept from the original behaviour).
    """
    data = Data()

    # Named ``layout`` so the built-in ``format`` is not shadowed.
    layout = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    filename = 'second_train_test.dat.{step}'.format(step=step)
    data.load(filename, sep='::', format=layout)

    train, test = data.split_train_test(percent=80)

    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))
    except Exception:
        # No usable model for this step.  ``Exception`` instead of a bare
        # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
        return

    # One list of (rating, prediction) pairs feeds both metrics.
    predictions = []
    for rating, item_id, user_id in test:
        try:
            predictions.append((rating, svd.predict(item_id, user_id)))
        except Exception:
            # Best effort: entries that cannot be predicted are skipped.
            pass

    mae_value, rmse_value = np.nan, np.nan

    if predictions:
        mae_value = MAE(predictions).compute()
        rmse_value = RMSE(predictions).compute()

    return mae_value, rmse_value
Exemplo n.º 14
0
Arquivo: day_07.py Projeto: lmlzk/ML
    def __init__(self, filename, sep, **format):
        """Store the data-file description and create the model objects.

        :param filename: path of the ratings file.
        :param sep: field separator used in that file.
        :param format: column layout, kept as a dict in ``self.format``.
        """
        # File information
        self.filename, self.sep, self.format = filename, sep, format

        # Matrix factorisation model
        self.svd = SVD()

        # Matrix settings: number of latent factors, minimum number of
        # ratings (movies rated by fewer than 10 people are dropped),
        # and the post-normalisation switch.
        self.k, self.min_values, self.post_normalize = 100, 10, False

        # Flag telling whether a saved model has been loaded
        self.load_model = False

        # RMSE accumulator
        self.rmse = RMSE()
Exemplo n.º 15
0
def test_random(data):
    """Score randomly drawn predictions against the given ratings.

    Each rating in ``data`` is paired with
    ``float(random_score(review_percentages))``.

    :param data: iterable of ratings.
    :return: ``(mae_value, rmse_value)``; both are ``np.nan`` when ``data``
        yields nothing.
    """
    scored = [(rating, float(random_score(review_percentages)))
              for rating in data]

    mae_value = MAE(scored).compute() if scored else np.nan
    # RMSE gets its own copy, as in the original two-list version.
    rmse_value = RMSE(list(scored)).compute() if scored else np.nan

    return mae_value, rmse_value
Exemplo n.º 16
0
    def __init__(self, filename, sep, **format):
        """Store the data-file description and create the helper objects.

        :param filename: path of the ratings file.
        :param sep: field separator used in that file.
        :param format: column layout, kept as a dict in ``self.format``.
        """
        self.filename, self.sep, self.format = filename, sep, format

        # Training parameters
        self.k, self.min_values, self.post_normalize = 100, 10, True

        self.svd = SVD()

        # Whether a saved model has been loaded
        self.is_load = False

        # Data handling
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()
Exemplo n.º 17
0
def calculate_stats_users(pct_train):
    """Print RMSE and MAE of an SVD trained on ``user_data_working.csv``.

    :param pct_train: value forwarded as ``percent`` to
        ``Data.split_train_test``.
    """
    dat_file = 'user_data_working.csv'
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load(dat_file, sep=',', format=layout)
    train, test = data.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    model.compute(k=100,
                  min_values=2,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=False)

    rmse, mae = RMSE(), MAE()
    for truth, item, user in test.get():
        try:
            guess = model.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, guess)
        except KeyError:
            # Skip entries for which a KeyError is raised.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Exemplo n.º 18
0
def evaluate(clf, _testData, verbose = False):

    rmse = RMSE()
    mae = MAE()
    numErrors = 0

    for rating, item_id, user_id in _testData.get():
        try:
            pred_rating = clf.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)

            if verbose:
                print item_id, user_id, rating, pred_rating
        except KeyError as e:
            if verbose:
                print 'ERROR occurred:', e.message
            numErrors += 1

    print '\n%i/%i data points raised errors.' % (numErrors, len(_testData))
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Exemplo n.º 19
0
def ex1(dat_file='ml-1m/ratings.dat',
        pct_train=0.5):
    """Fit an SVD on a ``::``-separated ratings file and print RMSE/MAE.

    :param dat_file: path of the ratings file to load.
    :param pct_train: value forwarded as ``percent`` to
        ``Data.split_train_test``.
    """
    # Meaning of the format keys:
    #   'row': 1   -> matrix rows come from column 1 of the ratings file
    #   'col': 0   -> matrix columns come from column 0 of the ratings file
    #   'value': 2 -> matrix values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers (not strings)
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    data = Data()
    data.load(dat_file, sep='::', format=layout)

    # Train/test split
    train, test = data.split_train_test(percent=pct_train)

    # Factorisation of the training part
    n_factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=n_factors,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # Prediction-based evaluation on the test part
    rmse, mae = RMSE(), MAE()
    for truth, item, user in test.get():
        try:
            guess = model.predict(item, user)
            for metric in (rmse, mae):
                metric.add(truth, guess)
        except KeyError:
            # Skip entries for which a KeyError is raised.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Exemplo n.º 20
0
def root_mean_square_error(train_values, predicted_values):
    """Return the RMSE between two equally long value sequences.

    :param train_values: ground-truth values.
    :param predicted_values: predicted values, one per ground-truth value.
    :return: result of ``RMSE.compute()`` over the two sequences.
    :raises SystemExit: with exit code 1 when the sequences differ in
        length (an error message is written to stderr first).
    """
    if len(train_values) != len(predicted_values):
        # The message used to name ``mean_absolute_error`` (copy/paste slip).
        sys.stderr.write("root_mean_square_error: Invalid list lengths")
        # ``sys.exit`` rather than the ``exit`` helper, which is only
        # installed by the ``site`` module.
        sys.exit(1)

    rmse = RMSE()
    rmse.load_ground_truth(train_values)
    rmse.load_test(predicted_values)
    return rmse.compute()
Exemplo n.º 21
0
 def test_PRED_RMSE_load_test(self):
     """Loading test values alone must store every one of them."""
     metric = RMSE()
     self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
     metric.load_test(self.TEST_DATA)
     stored = metric.get_test()
     assert_equal(len(stored), len(self.TEST_DATA))
Exemplo n.º 22
0
 def test_PRED_RMSE_compute_one_empty_datasets(self):
     """A single (rating, prediction) pair is scored without loaded data."""
     error = RMSE().compute(self.R, self.R_PRED)
     assert_equal(error, 0.9)
Exemplo n.º 23
0
class TestPrediction(Test):
    """Tests for the prediction-based metrics MAE and RMSE.

    Relies on fixtures that are not defined in this class (``DATA_PRED``,
    ``GT_DATA`` and ``TEST_DATA``); presumably they come from ``Test``.
    """

    def __init__(self):
        super(TestPrediction, self).__init__()
        # Prediction-based metrics: MAE, RMSE, Pearson
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)

        self.R = 3        # Real Rating (ground truth)
        self.R_PRED = 2.1 # Predicted Rating

    # test_PRED MAE
    def test_PRED_MAE_compute_one(self):
        # Even though mae has data, only the two given values are scored
        assert_equal(self.mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_one_empty_datasets(self):
        mae = MAE()
        assert_equal(mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_all(self):
        assert_equal(self.mae.compute(), 0.7)

    def test_PRED_MAE_nan(self):
        # A NaN prediction is expected to be dropped from both lists
        mae = MAE()
        mae.add(2.0, nan)
        assert_equal(mae.get_test(), [])
        assert_equal(mae.get_ground_truth(), [])

    def test_PRED_MAE_load(self):
        mae = MAE()
        mae.load(self.GT_DATA, self.TEST_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_load_test(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        assert_equal(len(mae.get_test()), len(self.TEST_DATA))
        assert_equal(len(mae.get_ground_truth()), 0)
        assert_raises(ValueError, mae.compute) #Raise: GT is empty!

    def test_PRED_MAE_load_test_and_ground_truth(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        mae.load_ground_truth(self.GT_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_add_entry(self):
        self.mae.add(1, 4) #1: GT rating, 4: Predicted rating
        assert_equal(len(self.mae.get_test()), len(self.DATA_PRED)+1)
        assert_equal(self.mae.compute(), 1.083333)

    def test_PRED_MAE_different_list_sizes(self):
        mae = MAE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        mae.load(GT, self.TEST_DATA)
        assert_raises(ValueError, mae.compute)

    # test_PRED RMSE
    def test_PRED_RMSE_compute_one(self):
        #Even though rmse has data, we only compute these two param values
        assert_equal(self.rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_one_empty_datasets(self):
        rmse = RMSE()
        assert_equal(rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_all(self):
        assert_equal(self.rmse.compute(), 0.891067)

    def test_PRED_RMSE_load_test(self):
        rmse = RMSE()
        self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
        rmse.load_test(self.TEST_DATA)
        assert_equal(len(rmse.get_test()), len(self.TEST_DATA))

    def test_PRED_RMSE_add_entry(self):
        self.rmse.add(1,4)
        assert_equal(len(self.rmse.get_test()), len(self.DATA_PRED)+1)
        assert_equal(self.rmse.compute(), 1.470261)

    def test_PRED_RMSE_different_list_sizes(self):
        rmse = RMSE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        rmse.load(GT, self.TEST_DATA)
        assert_raises(ValueError, rmse.compute)

    def test_PRED_RMSE_numpy_array(self):
        rmse = RMSE()
        rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
        # Was ``assert(value, expected)``: that asserts a non-empty tuple,
        # which is always true, so the result was never actually checked.
        assert_equal(rmse.compute(), 0.891067)
Exemplo n.º 24
0
 def test_PRED_RMSE_different_list_sizes(self):
     """compute() is expected to raise ValueError on unequal list sizes."""
     # One element fewer than self.TEST_DATA
     short_ground_truth = [3, 1, 5, 2]
     metric = RMSE()
     metric.load(short_ground_truth, self.TEST_DATA)
     assert_raises(ValueError, metric.compute)
        dist[i] = sorted(dist[i], key=itemgetter(1), reverse=True)
        #print str(i)+": "+str(item_id)
        #print dist[i]
        i += 1

    #print dist
    print "mean leng = "+str(statistics.mean(leng))
    print "max leng = "+str(max(leng))
    print "min leng = "+str(min(leng))

    for k in range(3, 46):
        print str(k)+"NN: "+str(p)+" fold..."
        if k not in rmse.keys():
            rmse[k] = []

        result = RMSE()
        i = 0
        for rating, item_id, user_id in test:
            if len(dist[i]) < 5:
                if item_id in train_item.keys():
                    pred_rating = statistics.mean(train_item[item_id].values())
                elif user_id in train_user.keys():
                    pred_rating = statistics.mean(train_user[user_id].values())
                else:
                    pred_rating = average
            else:
                ratings = []
                for j in range(0, k):
                    if j == len(dist[i]):
                        break
                    ratings.append(train_item[dist[i][j][0]][user_id])
Exemplo n.º 26
0
#3.10
# Data of every item referenced by the first element of each entry in films
[items_full[str(x[0])].get_data() for x in films]

#3.11
get_name_item_reviewed(10,user_full,items_full)

#3.12
items_full[str(2628)].get_data()
users_for_star_wars = svd.recommend(2628,only_unknowns=True)
users_for_star_wars

#3.13
# Movies reviewed by the recommended users, flattened, aggregated with
# movies_by_category and sorted by the second field, largest first
movies_reviewed_by_sw_rec  =[get_name_item_reviewed(x[0],user_full,items_full) for x in users_for_star_wars]
movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list]
movie_aggregate = movies_by_category(movies_flatten, 3)
movies_sort = sorted(movie_aggregate,key=lambda x: x[1], reverse=True)
movies_sort

#3.14
# RMSE of the model's predictions.
# NOTE(review): this iterates over all of ``data``, not a held-out split.
from recsys.evaluation.prediction import RMSE
err = RMSE()
for rating, item_id, user_id in data.get():
    try:
        prediction = svd.predict(item_id, user_id)
        err.add(rating, prediction)
    except KeyError:
        # ``except KeyError, k`` was Python-2-only syntax and bound an
        # unused name; entries raising KeyError are skipped as before.
        continue

print('RMSE is ' + str(err.compute()))
Exemplo n.º 27
0
    svd_neig.set_data(train)

    #Compute SVD
    svd.compute(k=K,
                min_values=None,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)
    svd_neig.compute(k=K,
                     min_values=None,
                     pre_normalize=None,
                     mean_center=True,
                     post_normalize=True)

    # Evaluate
    rmse_svd = RMSE()
    mae_svd = MAE()
    rmse_svd_neig = RMSE()
    mae_svd_neig = MAE()

    i = 1
    total = len(test.get())
    print 'Total Test ratings: %s' % total
    for rating, item_id, user_id in test:
        try:
            pred_rating_svd = svd.predict(item_id, user_id)
            rmse_svd.add(rating, pred_rating_svd)
            mae_svd.add(rating, pred_rating_svd)

            pred_rating_svd_neig = svd_neig.predict(item_id,
                                                    user_id)  #Koren & co.
Exemplo n.º 28
0
 def test_PRED_RMSE_compute_one_empty_datasets(self):
     """Scoring one explicit pair works on a metric with no loaded data."""
     empty_metric = RMSE()
     single_error = empty_metric.compute(self.R, self.R_PRED)
     assert_equal(single_error, 0.9)
Exemplo n.º 29
0
#Create SVD
# Sweep the number of factors k over range(50, 80, 2); for every k run ten
# random train/test splits and accumulate the RMSE of each run.
list = []  # NOTE(review): shadows the built-in ``list``; unused in this snippet
for j in range(50,80,2):
    sum_value = 0.0
    for i in range(1,11):
        #Train & Test data
        train, test = data.split_train_test(percent=PERCENT_TRAIN)

        K=j
        svd = SVDNeighbourhood()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

        #Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                continue

        # Compute once per run instead of once for printing and once more
        # for the running total.
        run_rmse = rmse.compute()
        print('RMSE=%s' % run_rmse)
        sum_value = sum_value + run_rmse
    print('-------')
    print('the k value is %s' % j)
    # NOTE(review): this is the SUM over the ten runs, not their average.
    print('Final RMSE=%s' % sum_value)
    print('-------')
Exemplo n.º 30
0
 def evaluate_matrices_rmse(self, original_matrix, imputed_matrix):
     """Delegate to ``evaluate_matrices`` with a fresh ``RMSE`` evaluator."""
     rmse_evaluator = RMSE()
     return self.evaluate_matrices(
         original_matrix, imputed_matrix, evaluator=rmse_evaluator)
Exemplo n.º 31
0
# Evaluate the Baseline recommender on a train/test split.
# Usage (from the argv reads below): <script> <ratings file> <percent train>
import sys  # needed for sys.argv below; was missing from the import header

from recsys.evaluation.prediction import RMSE, MAE
from recsys.datamodel.data import Data

from baseline import Baseline #Import the test class we've just created

#Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1], sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
#Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

baseline = Baseline()
baseline.set_data(train)
baseline.compute() # In this case, it does nothing

# Evaluate
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = baseline.predict(item_id, user_id, user_is_row=False)
        rmse.add(rating, pred_rating)
        mae.add(rating, pred_rating)
    except KeyError:
        # Entries that raise KeyError are left out of the metrics.
        continue

print('RMSE=%s' % rmse.compute()) # in my case (~80% train, ~20% test set) returns RMSE = 1.036374
print('MAE=%s' % mae.compute())   # in my case (~80% train, ~20% test set) returns  MAE = 0.829024
Exemplo n.º 32
0
# Dataset: ratings file from argv[1], train percentage from argv[2]
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(
    sys.argv[1],
    sep='::',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

# Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

# Compute the SVD through SVDLIBC, export it as an SVD object and save it.
svdlibc = SVDLIBC('./ml-1m/ratings.dat')
svdlibc.to_sparse_matrix(
    sep='::',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
svdlibc.compute(k=100)
svd = svdlibc.export()
svd.save_model('/tmp/svd-model', options={'k': 100})
# svd.similar(ITEMID1) results might differ from example 4, as no
# min_values=10 is set here.

# Evaluation using prediction-based metrics
print('Evaluating...')
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = svd.predict(item_id, user_id, 0.0, 5.0)
        for metric in (rmse, mae):
            metric.add(rating, pred_rating)
    except KeyError:
        # Skip entries for which a KeyError is raised.
        pass

print('RMSE=%s' % rmse.compute())
print('MAE=%s' % mae.compute())
Exemplo n.º 33
0
RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for run in RUNS:
    print "RUN(%d)" % run
    # Train & Test data
    train, test = data.split_train_test(percent=PERCENT_TRAIN)

    svd.set_data(train)
    svd_neig.set_data(train)

    # Compute SVD
    svd.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True)
    svd_neig.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True)

    # Evaluate
    rmse_svd = RMSE()
    mae_svd = MAE()
    rmse_svd_neig = RMSE()
    mae_svd_neig = MAE()

    i = 1
    total = len(test.get())
    print "Total Test ratings: %s" % total
    for rating, item_id, user_id in test:
        try:
            pred_rating_svd = svd.predict(item_id, user_id)
            rmse_svd.add(rating, pred_rating_svd)
            mae_svd.add(rating, pred_rating_svd)

            pred_rating_svd_neig = svd_neig.predict(item_id, user_id)  # Koren & co.
            if pred_rating_svd_neig is not nan:
Exemplo n.º 34
0
 def test_PRED_RMSE_different_list_sizes(self):
     """Loading lists of unequal size must make compute() raise ValueError."""
     metric = RMSE()
     # Ground truth with one element fewer than self.TEST_DATA
     metric.load([3, 1, 5, 2], self.TEST_DATA)
     assert_raises(ValueError, metric.compute)
Exemplo n.º 35
0
 def test_PRED_RMSE_numpy_array(self):
     """RMSE must accept numpy arrays as ground-truth and test data."""
     rmse = RMSE()
     rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
     # Was ``assert (value, expected)``: that asserts a non-empty tuple,
     # which is always true, so the result was never actually checked.
     assert_equal(rmse.compute(), 0.891067)
Exemplo n.º 36
0
print('')
print('GENERATING PREDICTION')
MIN_RATING = 0.0
MAX_RATING = 5.0
ITEMID = 1
USERID = 1
# Predicted rating value, followed by the real rating value
print(svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING))
print(svd.get_matrix().value(ITEMID, USERID))

print('')
print('GENERATING RECOMMENDATION')
print(svd.recommend(USERID, n=5, only_unknowns=True, is_row=False))

# Evaluation using prediction-based metrics
rmse = RMSE()
mae = MAE()
spearman = SpearmanRho()
kendall = KendallTau()
#decision = PrecisionRecallF1()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = svd.predict(item_id, user_id)
        for metric in (rmse, mae, spearman, kendall):
            metric.add(rating, pred_rating)
    except KeyError:
        # Skip entries for which a KeyError is raised.
        pass

print('')
Exemplo n.º 37
0
class TestPrediction(Test):
    """Tests for the prediction-based metrics MAE and RMSE.

    Relies on fixtures that are not defined in this class (``DATA_PRED``,
    ``GT_DATA`` and ``TEST_DATA``); presumably they come from ``Test``.
    """

    def __init__(self):
        super(TestPrediction, self).__init__()
        # Prediction-based metrics: MAE, RMSE, Pearson
        self.mae = MAE(self.DATA_PRED)
        self.rmse = RMSE(self.DATA_PRED)

        self.R = 3  # Real Rating (ground truth)
        self.R_PRED = 2.1  # Predicted Rating

    # test_PRED MAE
    def test_PRED_MAE_compute_one(self):
        # Even though mae has data, only the two given values are scored
        assert_equal(self.mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_one_empty_datasets(self):
        mae = MAE()
        assert_equal(mae.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_MAE_compute_all(self):
        assert_equal(self.mae.compute(), 0.7)

    def test_PRED_MAE_nan(self):
        # A NaN prediction is expected to be dropped from both lists
        mae = MAE()
        mae.add(2.0, nan)
        assert_equal(mae.get_test(), [])
        assert_equal(mae.get_ground_truth(), [])

    def test_PRED_MAE_load(self):
        mae = MAE()
        mae.load(self.GT_DATA, self.TEST_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_load_test(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        assert_equal(len(mae.get_test()), len(self.TEST_DATA))
        assert_equal(len(mae.get_ground_truth()), 0)
        assert_raises(ValueError, mae.compute)  #Raise: GT is empty!

    def test_PRED_MAE_load_test_and_ground_truth(self):
        mae = MAE()
        mae.load_test(self.TEST_DATA)
        mae.load_ground_truth(self.GT_DATA)
        assert_equal(mae.compute(), 0.7)

    def test_PRED_MAE_add_entry(self):
        self.mae.add(1, 4)  #1: GT rating, 4: Predicted rating
        assert_equal(len(self.mae.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.mae.compute(), 1.083333)

    def test_PRED_MAE_different_list_sizes(self):
        mae = MAE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        mae.load(GT, self.TEST_DATA)
        assert_raises(ValueError, mae.compute)

    # test_PRED RMSE
    def test_PRED_RMSE_compute_one(self):
        #Even though rmse has data, we only compute these two param values
        assert_equal(self.rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_one_empty_datasets(self):
        rmse = RMSE()
        assert_equal(rmse.compute(self.R, self.R_PRED), 0.9)

    def test_PRED_RMSE_compute_all(self):
        assert_equal(self.rmse.compute(), 0.891067)

    def test_PRED_RMSE_load_test(self):
        rmse = RMSE()
        self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
        rmse.load_test(self.TEST_DATA)
        assert_equal(len(rmse.get_test()), len(self.TEST_DATA))

    def test_PRED_RMSE_add_entry(self):
        self.rmse.add(1, 4)
        assert_equal(len(self.rmse.get_test()), len(self.DATA_PRED) + 1)
        assert_equal(self.rmse.compute(), 1.470261)

    def test_PRED_RMSE_different_list_sizes(self):
        rmse = RMSE()
        GT = [3, 1, 5, 2]
        # GT list has one element less than self.TEST_DATA
        rmse.load(GT, self.TEST_DATA)
        assert_raises(ValueError, rmse.compute)

    def test_PRED_RMSE_numpy_array(self):
        rmse = RMSE()
        rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
        # Was ``assert (value, expected)``: that asserts a non-empty tuple,
        # which is always true, so the result was never actually checked.
        assert_equal(rmse.compute(), 0.891067)
Exemplo n.º 38
0
 def test_PRED_RMSE_numpy_array(self):
     """RMSE must accept numpy arrays as ground-truth and test data."""
     rmse = RMSE()
     rmse.load(array(self.GT_DATA), array(self.TEST_DATA))
     # Was ``assert(value, expected)``: that asserts a non-empty tuple,
     # which is always true, so the result was never actually checked.
     assert_equal(rmse.compute(), 0.891067)
Exemplo n.º 39
0
Arquivo: day_07.py Projeto: lmlzk/ML
class RecommendSystem(object):
    """SVD-based movie recommender with optional reuse of a saved model.

    NOTE(review): several methods read a bare ``filename`` (not
    ``self.filename``).  It is not defined in this class; presumably it is
    a module-level variable holding the saved-model path -- confirm.
    """

    def __init__(self, filename, sep, **format):
        """Store the data-file description and create the model objects.

        :param filename: path of the ratings file.
        :param sep: field separator used in that file.
        :param format: column layout, kept as a dict in ``self.format``.
        """
        # File information
        self.filename = filename
        self.sep = sep
        self.format = format

        # Matrix factorisation model
        self.svd = SVD()

        # Matrix settings
        self.k = 100  # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 people
        self.post_normalize = False

        # Flag telling whether a saved model has been loaded
        self.load_model = False

        # RMSE accumulator
        self.rmse = RMSE()

    def get_data(self):
        """Load and split the ratings, or load a saved model if one exists.

        Exits the process (``sys.exit()``) when neither the saved model nor
        the ratings file exists.

        :return: ``(train, test)`` when the ratings were loaded and split,
            ``(None, None)`` when a saved model was loaded instead.
        """
        # If no saved model exists, the data has to be loaded
        if not os.path.exists(filename):
            if not os.path.exists(self.filename):
                sys.exit()

            data = Data()
            data.load(self.filename, sep=self.sep, format=self.format)

            # Split the data set
            train, test = data.split_train_test(percent=80)

            return train, test

        else:
            # Load the saved model directly
            self.svd.load_model(filename)

            # Remember that the model came from disk
            self.load_model = True

            return None, None

    def train(self, train):
        """Compute the SVD on ``train`` unless a saved model was loaded.

        :param train: training set
        :return: None
        """
        if not self.load_model:
            # Hand the training set to the SVD
            self.svd.set_data(train)
            # Note: the save-file name is passed without its extension
            # (the last four characters of ``filename`` are cut off)
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=filename[:-4])
        return None

    def recommend_to_user(self, userid):
        """Print the recommended movies and predicted ratings for a user.

        :param userid: user ID
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Build the list of movie names (second ``::`` field of each line).
        # ``with`` closes the file, which the original never did.
        movies_list = []
        with open("./data/ml-1m/movies.dat", "r") as movies_file:
            for line in movies_file:
                movies_list.append(' '.join(line.split("::")[1:2]))

        # Print the name and predicted rating of every recommended item.
        # NOTE(review): ``itemid`` is used as a position in ``movies_list``;
        # this assumes item ids match line positions in movies.dat -- confirm.
        for itemid, rating in recommend_list:
            print("给你推荐的电影叫%s, 预测你对它的评分是%f" % (movies_list[itemid], rating))

        return None

    def rs_predict(self, userid, itemid):
        """Predict a rating.

        :param userid: user ID
        :param itemid: item ID
        :return: the value returned by ``self.svd.predict(itemid, userid)``
        """
        score = self.svd.predict(itemid, userid)

        return score

    def evaluation(self, test):
        """Evaluate the model with RMSE and print the result.

        Nothing is done when the model was loaded from disk.

        :param test: test data
        :return: None
        """
        if not self.load_model:
            # Test entries are <rating, row (itemid), col (userid)>
            for rating, itemid, userid in test.get():
                try:
                    # ``rating`` is the true value, ``score`` the prediction
                    score = self.rs_predict(userid, itemid)

                    # Accumulate every test entry
                    self.rmse.add(rating, score)
                except KeyError:
                    # Entries that raise KeyError are skipped
                    continue

            error = self.rmse.compute()

            print("均方误差为:%s" % error)

        return None
Exemplo n.º 40
0
get_name_item_reviewed(10, user_full, items_full)

#3.12
items_full[str(2628)].get_data()
users_for_star_wars = svd.recommend(2628, only_unknowns=True)
users_for_star_wars

#3.13
# Movies reviewed by the recommended users, flattened, aggregated with
# movies_by_category and sorted by the second field, largest first
movies_reviewed_by_sw_rec = [
    get_name_item_reviewed(x[0], user_full, items_full)
    for x in users_for_star_wars
]
movies_flatten = [
    movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list
]
movie_aggregate = movies_by_category(movies_flatten, 3)
movies_sort = sorted(movie_aggregate, key=lambda x: x[1], reverse=True)
movies_sort

#3.14
# RMSE of the model's predictions.
# NOTE(review): this iterates over all of ``data``, not a held-out split.
from recsys.evaluation.prediction import RMSE
err = RMSE()
for rating, item_id, user_id in data.get():
    try:
        prediction = svd.predict(item_id, user_id)
        err.add(rating, prediction)
    except KeyError:
        # ``except KeyError, k`` was Python-2-only syntax and bound an
        # unused name; entries raising KeyError are skipped as before.
        continue

print('RMSE is ' + str(err.compute()))
Exemplo n.º 41
0
# Load SVD from /tmp
svd2 = SVD(filename='/tmp/movielens')  # Loading already computed SVD model

# Predict the rating of one user for one movie.
# NOTE(review): 0.0 and 5.0 are presumably the min/max bounds applied to the
# prediction -- confirm against SVD.predict.
USERID = 2
ITEMID = 1  # Toy Story
rating1 = svd2.predict(ITEMID, USERID, 0.0, 5.0)
print('Predicted rating=%f' % rating1)

# Retrieve the actual rating for the same user and movie, if there is one.
flag = 0
for rating, item_id, user_id in data.get():
    if user_id == USERID and item_id == ITEMID:
        rat = rating
        flag = 1
        break

if flag == 1:
    print('Actual rating=%f' % rat)
else:
    sys.exit("No actual rating available")

# Evaluate the single prediction.  The metrics take the ground truth first
# and the prediction second, matching every other use in this file.
rmse = RMSE()
mae = MAE()
rmse.add(rat, rating1)
mae.add(rat, rating1)
print('RMSE=%s' % rmse.compute())
print('MAE=%s' % mae.compute())
Exemplo n.º 42
0
 def test_PRED_RMSE_load_test(self):
     """RMSE.load_test must keep as many values as the list it was given."""
     evaluator = RMSE()
     self.TEST_DATA = [2.3, 0.9, 4.9, 0.9, 1.5]
     evaluator.load_test(self.TEST_DATA)
     loaded_count = len(evaluator.get_test())
     expected_count = len(self.TEST_DATA)
     assert_equal(loaded_count, expected_count)
Exemplo n.º 43
0
class RecommendSystem(object):
    """SVD-based recommender wrapping the ``Data``, ``SVD`` and ``RMSE`` objects.

    Typical flow: ``get_data()`` -> ``train()`` -> ``evaluation()``, then
    ``rs_predict()`` / ``recommend_to_user()``.  The module-level names
    ``tmpfile`` (saved model path) and ``moviefile`` (movie catalogue path)
    are read when the methods run.
    """

    def __init__(self, filename, sep, **format):
        """
        :param filename: path of the ratings file
        :param sep: field separator of the ratings file
        :param format: keyword arguments forwarded as the ``format`` mapping
            to ``Data.load``
        """
        self.filename = filename
        self.sep = sep
        self.format = format

        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # Set to True once a previously saved model has been loaded
        self.is_load = False

        # Data handling
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """
        Load a saved model if one exists, otherwise load and split the data.

        :return: ``(train, test)`` when the data was loaded from
            ``self.filename``; ``(None, None)`` when a saved model was loaded
        """
        if os.path.exists(tmpfile):
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

        if not os.path.exists(self.filename):
            # Exit with a message (non-zero status) instead of silently
            # reporting success when there is nothing to work with.
            sys.exit("data file not found: %s" % self.filename)

        # Use Data() to read the ratings
        self.data.load(self.filename, sep=self.sep, format=self.format)
        train, test = self.data.split_train_test(percent=80)
        return train, test

    def train(self, train):
        """
        Train the model unless a saved one was loaded.

        :param train: training data
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            # The save path is tmpfile without its last four characters.
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """
        Predict and print the rating of a user for a movie.

        :param itemid: movie id
        :param userid: user id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)
        print("推荐的分数为:%f" % score)
        return score

    @staticmethod
    def _load_movie_titles(path):
        """Return a ``{movie id: title}`` dict read from a ``id::title::...`` file."""
        titles = {}
        with open(path, "r") as movies:
            for line in movies:
                fields = line.rstrip("\r\n").split("::")
                if len(fields) < 2:
                    continue
                try:
                    titles[int(fields[0])] = fields[1]
                except ValueError:
                    # Not a catalogue record (no numeric id); ignore it.
                    continue
        return titles

    def recommend_to_user(self, userid):
        """
        Print the movies recommended for a user with their predicted scores.

        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Titles are looked up by movie id: the ids in the catalogue are not
        # contiguous, so a line position must not be used as the key.
        titles = self._load_movie_titles(moviefile)

        for itemid, rate in recommend_list:
            # Fall back to the raw id when the catalogue has no such movie.
            title = titles.get(itemid, str(itemid))
            print("给您推荐了%s,我们预测分数为%s" % (title, rate))
        return None

    def evaluation(self, test):
        """
        Evaluate the model on the test set and print the error.

        Skipped when the model was loaded from disk instead of trained.

        :param test: test set; ``test.get()`` yields
            ``(rating, movie id, user id)`` tuples
        :return: None
        """
        if not self.is_load:
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    # Pairs that raise KeyError are left out of the metric.
                    continue

            error = self.rmse.compute()

            print("模型误差为%s:" % error)

        return None
Exemplo n.º 44
0
    s3 = 0
    for item in items:
        item_history = train_item[item].values()
        mean = statistics.mean(item_history)
        s1 += (train_item[item][user1]-mean)*(train_item[item][user2]-mean)
        s2 += math.pow((train_item[item][user1]-mean), 2)
        s3 += math.pow((train_item[item][user2]-mean), 2)
    if math.sqrt(s2*s3) == 0:
        return -sys.float_info.max
    else:
        return s1/(math.sqrt(s2*s3))


# Evaluate
# NOTE(review): this snippet is cut off mid-statement; the comments below
# describe only what the visible lines do.
# k bounds the size of `dist` (see the `len(dist) < k` check below).
k = 8
rmse = RMSE()
# Counter of processed test tuples, printed once per tuple.
i = 0

for rating, item_id, user_id in test:
    print "==========================================="

    try:
        print i
        i += 1
        # Fresh container for each test tuple.
        # NOTE(review): presumably holds the closest users found so far --
        # the code that fills it is not visible here.
        dist = {}

        # Only items that appear as keys of train_item are processed.
        if item_id in train_item.keys():
            # Compare the test user with every user keyed under this item.
            for user in train_item[item_id].keys():
                sim = similarity(user_id, user)
                # Negative similarities are ignored.
                if sim >= 0:
                    if len(dist) < k:
Exemplo n.º 45
0
# Load the '::'-separated ratings file named on the command line.
data.load(sys.argv[1],
          sep='::',
          format=dict(col=0, row=1, value=2, ids=int))

# Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

baseline = Baseline()
baseline.set_data(train)
baseline.compute()  # In this case, it does nothing

# Evaluate: feed every test rating and its prediction to both metrics.
rmse, mae = RMSE(), MAE()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = baseline.predict(item_id, user_id, user_is_row=False)
        for metric in (rmse, mae):
            metric.add(rating, pred_rating)
    except KeyError:
        # Pairs that raise KeyError are left out of the metrics.
        continue

# in my case (~80% train, ~20% test set) returns RMSE = 1.036374
print('RMSE=%s' % rmse.compute())
# in my case (~80% train, ~20% test set) returns  MAE = 0.829024
print('MAE=%s' % mae.compute())