Example #1
def parse_data():
	filename = '../data/ml-1m/ratings.dat'
	data = Data()
	format = {'col':0, 'row':1, 'value':2, 'ids': int}
	data.load(filename, sep='::', format=format)
	train, test = data.split_train_test(percent=80) # 80% train, 20% test
	data.save(os.path.join(utils.get_add_dir(), 'ratings'), pickle=True)
Example #2
def main():
    svd = SVD()
    train = Data()
    test = Data()
    train.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    test.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    svd.set_data(train)
    svd.compute(k=100, min_values=0.5, pre_normalize=False, mean_center=True, post_normalize=True)

    # rmse = RMSE()
    # mae = MAE()
    # for rating, item_id, user_id in test.get():
    #     try:
    #         pred_rating = svd.predict(item_id, user_id)
    #         rmse.add(rating, pred_rating)
    #         mae.add(rating, pred_rating)
    #     except KeyError:
    #         continue
    # print 'RMSE=%s' % rmse.compute()
    # print 'MAE=%s' % mae.compute()

    # test = make_test()
    # print precision_and_recall(test, svd)
    # rec_list = svd.recommend(200, n=5, only_unknowns=False, is_row=False)
    print svd.recommend(1, n=5, only_unknowns=False, is_row=False)
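    # svd.recommend returns a list of (id, predicted_value) tuples; with
    # is_row=False the argument 1 is treated as a user (column) id.
    # Hypothetical output: [(1221, 4.87), (318, 4.85), (527, 4.81), ...]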
Example #3
def calculate_stats_users(pct_train):
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file,
              sep=',',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    train, test = data.split_train_test(percent=pct_train)
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100,
                min_values=2,
                pre_normalize=None,
                mean_center=True,
                post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
Example #4
def similar_users(user):
    if not type(user) is str:
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        f = open('./dc_recom.dat', 'a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {
            'user': user,
            'recommended': True
        })

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    return [i[0] for i in svd.similar(user)]
Example #5
def reCompute(user_id):
    fname = 'ratings.dat'
    dataset = Data()
    format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    dataset.load(fname, sep=':', format=format)

    svd = SVD()
    svd.set_data(dataset)

    k = 100
    svd.compute(k=k,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    # New ID of the added user
    USERID = user_id

    a = svd.recommend(USERID, is_row=False)
    for j in range(1, len(a)):
        k = a[j][0]
        print df_movies.query('movie_id==@k')
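df_movies is defined elsewhere in that project; a plausible definition (path and column names assumed), following the standard MovieLens movies.dat layout:

import pandas as pd

df_movies = pd.read_csv('movies.dat', sep='::', engine='python',
                        names=['movie_id', 'title', 'genres'])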
Example #6
def test_save_n_load(percent_train,
         modelKlass = SVD,
         dataFname ='/Users/jennyyuejin/recommender/Data/movieData/u.data',
         dataFormat = {'col':0, 'row':1, 'value':2, 'ids':int}):

    data = Data()
    data.load(dataFname, sep='\t', format=dataFormat)

    print '------ evaluating original'
    train, test = data.split_train_test(percent=percent_train, shuffle_data=False)
    print len(train), 'training data points;', len(test), 'testing data points'

    #Create SVD
    K=100
    svd = modelKlass()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)
    evaluate(svd, test)

    svd.save_model('./model/svd.obj.zip',
                   {'k': K, 'min_values': 5,
                    'pre_normalize': None, 'mean_center': True, 'post_normalize': True})


    print '------ evaluating copy'
    data2 = Data()
    data2.load(dataFname, sep='\t', format=dataFormat)
    _, test2 = data2.split_train_test(percent=percent_train, shuffle_data=False)   # reload data
    print len(test2), 'testing data points'

    svd_pred = modelKlass()
    svd_pred.load_model('./model/svd.obj.zip')

    evaluate(svd_pred, test2)
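evaluate is a project-local helper not shown here; a minimal sketch, assuming it follows the RMSE/MAE loop used throughout these examples:

from recsys.evaluation.prediction import RMSE, MAE

def evaluate(model, test_data):
    # Accumulate (real, predicted) pairs, skipping ids the model has not seen
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test_data.get():
        try:
            pred = model.predict(item_id, user_id)
            rmse.add(rating, pred)
            mae.add(rating, pred)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()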
Example #7
def train_and_save(filename):

    step = filename.split('.')[-1]

    data = Data()

    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    data.load(filename, sep='::', format=format)

    train, test = data.split_train_test(percent=80)

    try:

        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Already exists: svdn_model_{step}.zip'.format(step=step))

    except:

        svd = SVD()
        svd.set_data(train)

        svd.compute(
            k=100,
            min_values=2,
            pre_normalize=False,
            mean_center=True,
            post_normalize=True,
            savefile='svdn_model_{step}'.format(step=step)
        )

        print('Saved svdn_model_{step}.zip'.format(step=step))
Example #8
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):

    data = Data()
    data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,'ids':int})

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K=100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Example #9
def load_ratings(filename):
    """ Load ratings
    """
    
    data = Data()
    format = {'col':0, 'row':1, 'value':2, 'ids': 'int'}
    data.load(filename, sep=',', format=format)
    
    return data
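A minimal call, matching how Examples #39 and #41 below use this helper on the MovieLens-latest CSV:

ratings = load_ratings('../data/ml-latest-small/ratings.csv')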
Example #10
def recommended_files(user):
    if not type(user) is str:
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user':user})['recommended']==False:
        user_files = db.user_list.find({'user':user})
        f = open('./dc_recom.dat','a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user,n=10)]

    newdata = Data()
    for i in range(0,len(similar_users),1):
        files = db.user_list.find({'user':similar_users[i]})
        for f in files:
            newdata.add_tuple((1.0,similar_users[i],f['tth']))
    svd.set_data(newdata)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    recoms = svd.recommend(user,is_row=True,only_unknowns=True,n=100)

    res = []
    c_res = 0
    for p in recoms:
        flag=0
        for r in res:
            if similar(db.tths.find_one({'tth':p[0]})['name'],db.tths.find_one({'tth':r[0]})['name']):
                flag = 1
                break
        if flag == 0:
            res.append(p)
            c_res += 1
            if c_res > 10:
                k = []
                for i in res:
                    try:
                        j = 'magnet:?xt=urn:tree:tiger:'+i[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': i[0]})['name'])
                    except:
                        j = 'magnet:?xt=urn:tree:tiger:'+i[0]
                    k.append(j)
                return k
    k = []
    for i in res:
        try:
            j = 'magnet:?xt=urn:tree:tiger:'+i[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': i[0]})['name'])
        except:
            j = 'magnet:?xt=urn:tree:tiger:'+i[0]
        k.append(j)

    return k
Example #11
def load_ratings(filename):
    """ Load ratings
    """

    data = Data()
    format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    data.load(filename, sep=',', format=format)

    return data
Example #12
def getAverageRating(ITEMID):
    averageRating = 0
    totalUsers = 0
    data = Data()
    data.load('./data/movielens/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})
    for rating, item_id, user_id in data.get():
        if(item_id == ITEMID):
            totalUsers += 1
            averageRating += rating
    print averageRating/totalUsers
Example #13
def calculate_stats_features(pct_train):
    dat_file='feature_matrix.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int})
    train, test = data.split_train_test(percent=pct_train)               
    K=100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=False)
    return svd,train,test
Example #14
def SVDloadData2():
    dat_file = '/home/commons/RecSys/MOVIEDATA/ml-1m/ratings.dat'
    pct_train = 0.5
    data = Data()
    data.load(dat_file,
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    return data
Example #15
def setup():
    global user, items, data
    user = User(USERID)
    items = _read_items(os.path.join(MOVIELENS_DATA_PATH, 'movies.dat'))
    data = Data()
    data.load(os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat'),
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
Example #16
def read_user_data_from_ratings(data_file):
    data = Data()
    format = {'col':0, 'row':1, 'value':2, 'ids': 'int'}    
    data.load(data_file, sep='::', format=format)
    
    userdict = {}
    for d in data.get():
        if d[2] in userdict:
            user = userdict[d[2]] 
        else:
            user = User(d[2]) 
        
        user.add_item(d[1],d[0])
        userdict[d[2]] = user
    return userdict
Example #17
def read_user_data_from_ratings(data_file):
    data = Data()
    format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    data.load(data_file, sep='::', format=format)

    userdict = {}
    for d in data.get():
        if d[2] in userdict:
            user = userdict[d[2]]
        else:
            user = User(d[2])

        user.add_item(d[1], d[0])
        userdict[d[2]] = user
    return userdict
Example #18
def main():
    # Load train and test Dataset
    train = Data()
    test = Data()
    train.load('./dataset/train.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    test.load('./dataset/test.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    test_dict = file_to_dict('./dataset/test.csv')
    #data = Data()
    #data.load('./ratings.dat', force=True, sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})
    #train, test = data.split_train_test(percent=80)
    rec = Recommender(train, test)
    rec.svd.compute(k=100, min_values=0.1, pre_normalize=None, mean_center=False, post_normalize=True)
    #rec.eval_rmse()
    #rec_list = rec.recommend()
    rec_list = []
    print rec.precisionRecall(rec_list, test_dict)
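file_to_dict and Recommender are project-local helpers that are not shown. A plausible file_to_dict sketch (name and return shape assumed), reading the same user,item,rating layout the load() format above describes:

def file_to_dict(path):
    # Hypothetical helper: map each user id to the set of item ids it rated
    d = {}
    for line in open(path):
        user_id, item_id, _rating = line.strip().split(',')[:3]
        d.setdefault(user_id, set()).add(item_id)
    return d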
Example #19
def set_rating(rating, userID=45, probCode='GSS1', compute=False, SVDNeighbourhood=False):
	problems_recsys = pickle.load( open( os.path.join(utils.get_add_dir(), 'problems_recsys'), "rb" ) )
	problem_keys = list(problems_recsys)
	
	data = Data()
	data.load(os.path.join(utils.get_add_dir(), 'ratings'), pickle=True)

	data.add_tuple((rating, problem_keys.index(probCode), userID))

	data.save(os.path.join(utils.get_add_dir(), 'ratings'), pickle=True)

	if compute:
		if SVDNeighbourhood:
			compute_SVDNeighbourhood()
		else:
			compute_SVD()
Example #20
def test_utf8_data():
    data_in = Data()

    NUM_PLAYS = 69
    ITEMID = u'Bj\xf6rk' 
    data_in.add_tuple([NUM_PLAYS, ITEMID, USERID1])

    NUM_PLAYS = 34
    ITEMID = 'Björk' 
    data_in.add_tuple([NUM_PLAYS, ITEMID, USERID2])

    data_in.save(os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.saved.utf8'))

    data_saved = Data()
    data_saved.load(os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.saved.utf8'))

    assert_equal(len(data_in), len(data_saved))
Example #21
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):

    data = Data()
    data.load(dat_file,
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    # About format parameter:
    #   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
    #   'col': 0 -> Cols in matrix come from column 0 in ratings.dat file
    #   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat
    #   file
    #   'ids': int -> Ids (row and col ids) are integers (not strings)
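    # For example, a hypothetical input line '196::242::3' becomes the tuple
    # (value, row, col) = (3, 242, 196), i.e. M[242][196] = 3:
    # rows are movies and cols are users.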

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K,
                min_values=5,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    # mae is mean ABSOLUTE error
    # ... in this case it will return 1.09 which means there is an error of almost 1 point out of 5
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Example #22
def get_mae_rmse(step):

    data = Data()

    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}

    filename = 'second_train_test.dat.{step}'.format(step=step)

    data.load(filename, sep='::', format=format)

    train, test = data.split_train_test(percent=80)

    try:

        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))

    except:

        return

    mae_predicted, rmse_predicted = [], []
    for rating, item_id, user_id in test:
        try:

            predicted = svd.predict(item_id, user_id)

            mae_predicted.append((rating, predicted))
            rmse_predicted.append((rating, predicted))

        except:

            pass

    mae_value, rmse_value = np.nan, np.nan

    if len(mae_predicted) > 0:
        mae = MAE(mae_predicted)
        mae_value = mae.compute()

    if len(rmse_predicted) > 0:
        rmse = RMSE(rmse_predicted)
        rmse_value = rmse.compute()

    return mae_value, rmse_value
Example #23
def similar_users(user):
    if not type(user) is str:
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user':user})['recommended']==False:
        user_files = db.user_list.find({'user':user})
        f = open('./dc_recom.dat','a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    return [i[0] for i in svd.similar(user)]
Example #24
def test_utf8_data():
    data_in = Data()

    NUM_PLAYS = 69
    ITEMID = u'Bj\xf6rk'
    data_in.add_tuple([NUM_PLAYS, ITEMID, USERID1])

    NUM_PLAYS = 34
    ITEMID = 'Björk'
    data_in.add_tuple([NUM_PLAYS, ITEMID, USERID2])

    data_in.save(os.path.join(MOVIELENS_DATA_PATH,
                              'ratings.matrix.saved.utf8'))

    data_saved = Data()
    data_saved.load(
        os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.saved.utf8'))

    assert_equal(len(data_in), len(data_saved))
Example #25
def calculate_stats_features(pct_train):
    dat_file = 'feature_matrix.csv'
    data = Data()
    data.load(dat_file,
              sep=',',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    train, test = data.split_train_test(percent=pct_train)
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=False)
    return svd, train, test
Example #26
def calculate_stats_users(pct_train):
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int})
    train, test = data.split_train_test(percent=pct_train)               
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():      
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
Example #27
def ex1(dat_file='ml-1m/ratings.dat',
        pct_train=0.5):

    data = Data()
    data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,
                                          'ids':int})
    # About format parameter:
    #   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
    #   'col': 0 -> Cols in matrix come from column 0 in ratings.dat file
    #   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat
    #   file
    #   'ids': int -> Ids (row and col ids) are integers (not strings)

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(
        k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Example #28
File: day_07.py Project: lmlzk/ML
    def get_data(self):
        # If the model does not exist, the data needs to be loaded
        if not os.path.exists(filename):
            if not os.path.exists(self.filename):
                sys.exit()
            # Load the data for SVD
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            data = Data()

            data.load(self.filename, sep=self.sep, format=self.format)

            # Split the dataset
            train, test = data.split_train_test(percent=80)

            return train, test

        else:
            # Load the saved model directly
            self.svd.load_model(filename)

            # Mark that the model has been loaded (set load_model to True)
            self.load_model = True

            return None, None
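get_data returns either a fresh (train, test) split or (None, None) once a saved model has been loaded, so the caller must branch on self.load_model. A hypothetical companion method, using only the attributes seen above:

    def train_or_load(self):
        # Train only when get_data returned a fresh split; a loaded
        # model is already ready for prediction
        train, test = self.get_data()
        if not self.load_model:
            self.svd.set_data(train)
            self.svd.compute(k=100, min_values=10, pre_normalize=None,
                             mean_center=True, post_normalize=True)
        return test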
Example #29
import sys

#To show some messages:
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data
from recsys.utils.svdlibc import SVDLIBC
from recsys.evaluation.prediction import RMSE, MAE

#Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1], sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})
#Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

svdlibc = SVDLIBC('./ml-1m/ratings.dat')
svdlibc.to_sparse_matrix(sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
svdlibc.compute(k=100)
svd = svdlibc.export()
svd.save_model('/tmp/svd-model', options={'k': 100})
#svd.similar(ITEMID1) # results might differ from example 4, as there's no min_values=10 set here


#Evaluation using prediction-based metrics
print 'Evaluating...'
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = svd.predict(item_id, user_id)
        rmse.add(rating, pred_rating)
        mae.add(rating, pred_rating)
    except KeyError:
        continue
print 'RMSE=%s' % rmse.compute()
print 'MAE=%s' % mae.compute()
Example #30
import sqlite3
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.evaluation.prediction import RMSE, MAE

from recsys.datamodel.data import Data
from recsys.datamodel.item import Item
from recsys.datamodel.user import User

data = Data()
data.load("../data/ratings.tsv", sep='|', format={'col':0, 'row':1, 'value':2, 'ids':float})

K=100
svd = SVD()
svd.set_data(data)
svd.compute(k=K, min_values=0.1, pre_normalize=None, mean_center=True, post_normalize=True)

[(beers[b].get_data()['name'], b, val) for b, val in  svd.similar(1502, 50)\
 if beers[b].get_data()['brewery']!=232 and beers[b].get_data()['style_id']==17] #Bell's two hearted
Example #31
    #[(item,value),(item1, value1)...]
    recommendations = []
    for i in itemdict.keys():
        if (int(i) not in items_reviewed(int(user_id), userdict)):
            recommendations.append((i, predict_rating(user_id, i)))  # only items not yet reviewed
    recommendations.sort(key=lambda t: t[1], reverse=True)
    return recommendations[:top_n]

#3.3:
data = Data()
format = {'col':0, 'row':1, 'value':2, 'ids': 'int'}
# About format parameter:
#   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
data.load(dat_file, sep='::', format=format)

similarity_matrix = SimilarityMatrix()
recommend(0,10)
recommend(1,10)
recommend(2,10)

##################
#Now we do SVD
##################

#3.8
svd = SVD()
recsys.algorithm.VERBOSE = True

dat_file = './ml-1m/ratings.dat'
Example #32
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

filename = "./data/ratings.dat"
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
# About format parameter:
#   'row': 1 -> Rows in matrix come from second column in ratings.dat file
#   'col': 0 -> Cols in matrix come from first column in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from third column in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
data.load(filename, sep="::", format=format)
train, test = data.split_train_test(percent=80)  # 80% train ,20%test

svd = SVD()
svd.set_data(train)

print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))
# predicted rating of item (row) 22 by user (col) 22, clamped to [0.0, 5.0]
print(svd.recommend(1, n=10, only_unknowns=True, is_row=False))
# items recommended for user 1, restricted to items not yet rated
print(svd.recommend(1, n=10, only_unknowns=False, is_row=False))
# items recommended for user 1, including already-rated items
Example #33
            print 'UCF Content No Tag:'
            print 'Precision:\t', ucf_con_notag[0]
            print 'Recall: \t', ucf_con_notag[1]
            print '-' * 100
            remap_oneday_con = open('onedaySet/Content/remap1.csv')
            train_oneday_con = open('onedaySet/Content/rate1.csv')
            icf_con_notag = test100_ItemCF(train_oneday_con, test, remap_oneday_con, i, j, True)
            s += str(icf_con_notag[0]) + ',' + str(icf_con_notag[1]) + ','
            print 'ICF Content No Tag:'
            print 'Precision:\t', icf_con_notag[0]
            print 'Recall: \t', icf_con_notag[1]
            print '-' * 100

            remap = open('randUser/DiffRate/remap1.csv')
            train = open('randUser/DiffRate/rate1.csv')
            svd_train.load('./randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':str})
            svd = test100_SVD(svd_train, train, test, remap, i, j, False)
            # s += str(svd[0]) + ',' + str(svd[1]) + ','
            print 'SVD:'
            print 'Precision:\t', svd[0]
            print 'Recall: \t', svd[1]
            print '-' * 100

            remap_oneday = open('onedaySet/remap1.csv')
            train_oneday = open('onedaySet/rate1.csv')
            svd_train.load('./onedaySet/rate2.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':str})
            svd_notag = test100_SVD(svd_train, train_oneday, test, remap_oneday, i, j, False)
            s += str(svd_notag[0]) + ',' + str(svd_notag[1]) + ','
            print 'SVD No Tag:'
            print 'Precision:\t', svd_notag[0]
            print 'Recall: \t', svd_notag[1]
Example #34
from recsys.algorithm.factorize import SVD, SVDNeighbourhood
from recsys.datamodel.data import Data
from recsys.evaluation.prediction import RMSE, MAE

# Create SVD
K = 100
svd = SVD()
svd_neig = SVDNeighbourhood()

#Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1],
          sep='::',
          format={
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': int
          })

rmse_svd_all = []
mae_svd_all = []
rmse_svd_neig_all = []
mae_svd_neig_all = []

RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for run in RUNS:
    print 'RUN(%d)' % run
    #Train & Test data
    train, test = data.split_train_test(percent=PERCENT_TRAIN)
Example #35
        if (int(i) not in items_reviewed(int(user_id), userdict)):
            recommendations.append(
                (i, predict_rating(user_id, i)))  # only items not yet reviewed
    recommendations.sort(key=lambda t: t[1], reverse=True)
    return recommendations[:top_n]


#3.3:
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
# About format parameter:
#   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
data.load(dat_file, sep='::', format=format)

similarity_matrix = SimilarityMatrix()
recommend(0, 10)
recommend(1, 10)
recommend(2, 10)

##################
#Now we do SVD
##################

#3.8
svd = SVD()
recsys.algorithm.VERBOSE = True

dat_file = './ml-1m/ratings.dat'
Example #36
import sqlite3
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.evaluation.prediction import RMSE, MAE

from recsys.datamodel.data import Data
from recsys.datamodel.item import Item
from recsys.datamodel.user import User

data = Data()
data.load("../data/ratings.tsv",
          sep='|',
          format={
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': float
          })

K = 100
svd = SVD()
svd.set_data(data)
svd.compute(k=K,
            min_values=0.1,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

[(beers[b].get_data()['name'], b, val) for b, val in  svd.similar(1502, 50)\
 if beers[b].get_data()['brewery']!=232 and beers[b].get_data()['style_id']==17] #Bell's two hearted
Example #37
    def to_sparse_matrix(self, sep='\t', format=None):
        # http://tedlab.mit.edu/~dr/SVDLIBC/SVD_F_ST.html
        data = Data()
        data.load(self._data_file, sep=sep, format=format)

        f = open(self._matrix_file, 'w')
        f_row_ids = codecs.open('%s.ids.rows' % self._svd_prefix, 'w', 'utf8')
        f_col_ids = codecs.open('%s.ids.cols' % self._svd_prefix, 'w', 'utf8')

        num_rows = len(set(map(itemgetter(1), data)))
        num_cols = len(set(map(itemgetter(2), data)))
        non_zero = len(data)
        f.write('%s %s %s\n' % (num_rows, num_cols, non_zero))

        #print 'sorting data by col'
        l = data.get()
        #l.sort(key=itemgetter(2, 1)) #by col, and then row
        l.sort(key=itemgetter(2))

        rows = dict()
        cols = dict()
        prev_col_id = None
        col_values = []
        row, col = (0, 0)
        for value, row_id, col_id in l:
            #if not row_id or not col_id or not value:
            #    if VERBOSE:
            #        sys.stdout.write('Skipping: %s, %s, %s\n' % (value, row_id, col_id))
            #    continue
            if col_id != prev_col_id:
                if col_values:
                    f.write('%s\n' % len(col_values))
                    for col_row_id, col_value in col_values:
                        _row = rows[col_row_id]
                        f.write('%s %s\n' % (_row, col_value))
                col_values = []
                cols[col_id] = col
                col += 1
            if not rows.has_key(row_id):
                rows[row_id] = row
                row += 1
            col_values.append((row_id, value))
            prev_col_id = col_id
        if col_values:
            f.write('%s\n' % len(col_values))
            for col_row_id, col_value in col_values:
                row = rows[col_row_id]
                f.write('%s %s\n' % (row, col_value))
            cols[col_id] = col
        f.close()

        # Now write f_row_ids and f_col_ids
        rows = rows.items()
        rows.sort(key=itemgetter(1))
        for row_id, _ in rows:
            if row_id == '':
                continue
            if isinstance(row_id, int):
                row_id = str(row_id)
            f_row_ids.write(row_id + '\n')
        f_row_ids.close()

        cols = cols.items()
        cols.sort(key=itemgetter(1))
        for col_id, _ in cols:
            if col_id == '':
                continue
            if isinstance(col_id, int):
                col_id = str(col_id)
            f_col_ids.write(col_id + '\n')
        f_col_ids.close()
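The matrix file written above follows the SVDLIBC sparse-text format linked in the comment: a header line 'num_rows num_cols non_zero', then, for each column, its entry count followed by one 'row value' pair per line. A hypothetical 3x2 matrix with three nonzero values would be serialized as (annotations after <- are not part of the file):

3 2 3      <- num_rows num_cols non_zero
2          <- column 0 holds two entries
0 4.0      <- row 0, value 4.0
2 1.5      <- row 2, value 1.5
1          <- column 1 holds one entry
1 3.0      <- row 1, value 3.0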
Example #38
svd = SVD()
svd.load_data(filename='./data/ratings.dat',
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })

# Splitting the dataset
filename = './data/ratings.dat'
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data.load(filename, sep='::', format=format)
train_80, test_20 = data.split_train_test(percent=80)  # 80% train, 20% test
svd = SVD()
svd.set_data(train_80)

# Setting the parameters to build the matrix
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

k = 100
svd.compute(k=k,
            min_values=10,
Example #39
4. See my ratings
5. Change user
6. Save & quit
    """
    
    choice = input("Enter your choice: ")
    
    return choice


if __name__ == "__main__":
    
    #Load data
    try:
        ratings = Data()
        ratings.load('../data/myratings.data')
    except:
        ratings = load_ratings('../data/ml-latest-small/ratings.csv')
    movies = load_movies('../data/ml-latest-small/movies.csv')
    tags = load_tags('../data/ml-latest-small/tags.csv')
    
    os.system('clear')
    print """
#####################################################
####           COMMAND LINE RECOMMENDER          ####
#####################################################

A minimalistic command line recommender system using
SVD decomposition.
"""
    
Example #40
import recsys.algorithm
#recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data
from recsys.evaluation.prediction import RMSE, MAE
from recsys.evaluation.decision import PrecisionRecallF1
from recsys.evaluation.ranking import SpearmanRho, KendallTau

#Dataset
PERCENT_TRAIN = 70
data = Data()
data.load('./data/dataset-recsys.csv',
          sep=',',
          format={
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': int
          })

#Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

#Create SVD
K = 100
svd = SVD()
svd.set_data(train)

svd.compute(k=K,
            min_values=1,
            pre_normalize=None,
Example #41
    return choice


if __name__ == "__main__":
    
    # Load data from custom path
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = '/data'
    
    
    #Load data
    ratings = Data()
    if os.path.isfile(data_path + '/myratings.data'):
        ratings.load(data_path + '/myratings.data')
    else:
        try:
            ratings = load_ratings(data_path + '/ratings.csv')
        except IOError:
            raise Exception('Data not found in %s. Please specify it.'
                            % data_path)
    movies = load_movies(data_path + '/movies.csv')
    tags = load_tags(data_path + '/tags.csv')
            
    os.system('clear')
    print """
#####################################################
####           COMMAND LINE RECOMMENDER          ####
#####################################################
Example #42
class Collaborative_filtering(object):
    def __init__(self, ratings_file,
                 movies):  # no need to pass this here; it is provided in views.py
        #self.users = users
        self.movies = movies
        self.K = 100
        self.PERCENT_TRAIN = 85
        # Provide a default file location for ratings.csv instead of loading it
        # every time: run the commented-out lines below only once,
        # or just provide this file instead.
        #self.users.to_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index= False)
        self.ratings_file = ratings_file  # path to the ratings.csv created by the lines above
        self.data = None
        self.svd = None
        self.recommend_movies_list = None
        self.recommend_movies_ids = None
        self.similar_movies_list = None
        self.similar_movies_ids = None

        self.movie_id = None
        self.train = None
        self.test = None

    def compute_svd(self):
        '''    
        ratings = pd.read_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index_col= False)
        ratings = ratings.ix[1:]
        ratings.to_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index = False)
        self.data = Data()      
        self.data.load(self.ratings_file, sep=',', format={'col':0, 'row':1 ,'value':2, 'ids':float})
        self.train , self.test = self.data.split_train_test(percent=self.PERCENT_TRAIN)    
        self.svd = SVD()
        self.svd.set_data(self.train)    
        self.svd.compute(k=self.K, min_values=1, pre_normalize=None, mean_center=True, post_normalize=True)'''
        self.data = Data()
        self.data.load(self.ratings_file,
                       sep=',',
                       format={
                           'col': 0,
                           'row': 1,
                           'value': 2,
                           'ids': float
                       })
        self.train, self.test = self.data.split_train_test(percent=85)
        self.svd = SVDNeighbourhood()
        self.svd.set_data(self.train)
        self.svd.compute(k=100,
                         min_values=1,
                         pre_normalize=None,
                         mean_center=False,
                         post_normalize=True)

    def similarity_measure(
            self, movie1,
            movie2):  # returns a similarity value between -1 and 1
        return round(self.svd.similarity(movie1, movie2), 4)

    def recommend_movies(self, user_id):
        l = self.svd.recommend(user_id, n=10, only_unknowns=True, is_row=False)
        self.recommend_movies_list = []
        self.recommend_movies_ids = []
        for p in l:
            #movie names
            bb = str(movies.ix[movies['movie_id'] == p[0]]['title']).split()
            q = bb.index('Name:')
            bb = ' '.join(bb[1:q])
            self.recommend_movies_list.append(bb)
            #movie ids
            gg = movies.ix[movies['movie_id'] == p[0]]
            gg = gg.reset_index()
            del gg['index']
            gg = gg.ix[:, 0:2].as_matrix(columns=None).tolist()
            self.recommend_movies_ids.append(gg[0][0])
        return self.recommend_movies_list, self.recommend_movies_ids

    def get_similar_movies(self,
                           movie1):  #Returns a PYTHON list for similar movies.
        movie1 = int(movie1)
        l = self.svd.similar(movie1)
        self.similar_movies_list = []
        self.similar_movies_ids = []
        l = l[1:]

        for p in l:
            #getting movie names
            bb = str(movies.ix[movies['movie_id'] == p[0]]['title']).split()
            q = bb.index('Name:')
            bb = ' '.join(bb[1:q])
            self.similar_movies_list.append(bb)
            #getting movie id's
            self.similar_movies_ids.append(p[0])

        return self.similar_movies_list, self.similar_movies_ids
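A hypothetical driver for this class, assuming the module-level movies DataFrame the methods reference and a ratings.csv laid out as user,movie,rating:

cf = Collaborative_filtering('ratings.csv', movies)
cf.compute_svd()
names, ids = cf.recommend_movies(user_id=1)
titles, similar_ids = cf.get_similar_movies(1)
print cf.similarity_measure(1, 2)  # rounded similarity in [-1, 1]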
Example #43
recsys.algorithm.VERBOSE = True

from recsys.evaluation.prediction import RMSE, MAE
from recsys.datamodel.data import Data

from baseline import Baseline #Import the test class we've just created
import time
start_time = time.time()
#rmsem = []
#for k in range(1, 11):
#    print str(k)+" fold..."
#Dataset
dat_file='ratings_user.csv'

data = Data()
data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2})
train, test = data.split_train_test(percent=80)

print train
print test

################ kNN ################
train_item = {}
train_user = {}
for rating, item_id, user_id in train:
    if item_id in train_item:
        train_item[item_id][user_id] = rating
    else:
        train_item[item_id] = {user_id: rating}
    if user_id in train_user:
        train_user[user_id][item_id] = rating
Example #44
def recommended_files(user):
    if not type(user) is str:
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        f = open('./dc_recom.dat', 'a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {
            'user': user,
            'recommended': True
        })

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user, n=10)]

    newdata = Data()
    for i in range(0, len(similar_users), 1):
        files = db.user_list.find({'user': similar_users[i]})
        for f in files:
            newdata.add_tuple((1.0, similar_users[i], f['tth']))
    svd.set_data(newdata)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    recoms = svd.recommend(user, is_row=True, only_unknowns=True, n=100)

    res = []
    c_res = 0
    for p in recoms:
        flag = 0
        for r in res:
            if similar(
                    db.tths.find_one({'tth': p[0]})['name'],
                    db.tths.find_one({'tth': r[0]})['name']):
                flag = 1
                break
        if flag == 0:
            res.append(p)
            c_res += 1
            if c_res > 10:
                k = []
                for i in res:
                    try:
                        j = 'magnet:?xt=urn:tree:tiger:' + i[
                            0] + "&dn=" + unidecode.unidecode(
                                db.tths.find_one({'tth': i[0]})['name'])
                    except:
                        j = 'magnet:?xt=urn:tree:tiger:' + i[0]
                    k.append(j)
                return k
    k = []
    for i in res:
        try:
            j = 'magnet:?xt=urn:tree:tiger:' + i[
                0] + "&dn=" + unidecode.unidecode(
                    db.tths.find_one({'tth': i[0]})['name'])
        except:
            j = 'magnet:?xt=urn:tree:tiger:' + i[0]
        k.append(j)

    return k
Example #45
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD, SVDNeighbourhood
from recsys.datamodel.data import Data
from recsys.evaluation.prediction import RMSE, MAE

# Create SVD
K = 100
svd = SVD()
svd_neig = SVDNeighbourhood()

# Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1], sep="::", format={"col": 0, "row": 1, "value": 2, "ids": int})

rmse_svd_all = []
mae_svd_all = []
rmse_svd_neig_all = []
mae_svd_neig_all = []

RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for run in RUNS:
    print "RUN(%d)" % run
    # Train & Test data
    train, test = data.split_train_test(percent=PERCENT_TRAIN)

    svd.set_data(train)
    svd_neig.set_data(train)
Example #46
class Algorithm(object):
    """
    Base class Algorithm

    It has the basic methods to load a dataset, get the matrix and the raw input
    data, add more data (tuples), etc.

    Any other Algorithm derives from this base class
    """
    def __init__(self):
        self._data = Data()
        self._matrix = SparseMatrix()
        self._matrix_similarity = None  #self-similarity matrix (only for the input Matrix rows)
        self._matrix_and_data_aligned = False  #both Matrix and Data contain the same info?

    def __len__(self):
        return len(self.get_data())

    def __repr__(self):
        s = '%d rows.' % len(self.get_data())
        if len(self.get_data()):
            s += '\nE.g: %s' % str(self.get_data()[0])
        return s

    def get_matrix(self):
        """
        :returns: matrix *M*
        """
        if not self._matrix.get():
            self.create_matrix()
        return self._matrix

    def get_matrix_similarity(self):
        """
        :returns: the self-similarity matrix
        """
        return self._matrix_similarity

    def set_data(self, data):
        """
        Sets the raw dataset (input for matrix *M*)

        :param data: a Dataset class (list of tuples <value, row, col>)
        :type data: Data
        """
        #self._data = Data()
        #self._data.set(data)
        self._data = data
        self._matrix_and_data_aligned = False

    def get_data(self):
        """
        :returns: An instance of Data class. The raw dataset (input for matrix *M*). 
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Add a tuple in the dataset

        :param tuple: a tuple containing <rating, user, item> information. Or, more general: <value, row, col>
        """
        self.get_data().add_tuple(tuple)
        self._matrix_and_data_aligned = False

    def load_data(self,
                  filename,
                  force=True,
                  sep='\t',
                  format={
                      'value': 0,
                      'row': 1,
                      'col': 2
                  },
                  pickle=False):
        """
        Loads a dataset file

        See params definition in *datamodel.Data.load()*
        """
        if force:
            self._data = Data()
            self._matrix_similarity = None

        self._data.load(filename, force, sep, format, pickle)

    def save_data(self, filename, pickle=False):
        """
        Saves the dataset in divisi2 matrix format (i.e: value <tab> row <tab> col)

        :param filename: file to store the data
        :type filename: string
        :param pickle: save in pickle format?
        :type pickle: boolean
        """
        self._data.save(filename, pickle)

    def create_matrix(self):
        if VERBOSE:
            sys.stdout.write('Creating matrix (%s tuples)\n' % len(self._data))
        try:
            self._matrix.create(self._data.get())
        except AttributeError:
            self._matrix.create(self._data)

        if VERBOSE:
            sys.stdout.write("Matrix density is: %s%%\n" %
                             self._matrix.density())
        self._matrix_and_data_aligned = True

    def compute(self, min_values=None):
        if self._matrix.empty() and (not isinstance(self._data, list)
                                     and not self._data.get()):
            raise ValueError('No data set. Matrix is empty!')
        if self._matrix.empty() and (isinstance(self._data, list)
                                     and not self._data):
            raise ValueError('No data set. Matrix is empty!')
        if not self._matrix.empty() or not self._matrix_and_data_aligned:
            self.create_matrix()

        if min_values:
            if VERBOSE:
                sys.stdout.write(
                    'Updating matrix: squish to at least %s values\n' %
                    min_values)
            self._matrix.set(self._matrix.get().squish(min_values))

    def _get_row_similarity(self, i):
        if not self.get_matrix_similarity():
            self.compute()
        try:
            return self.get_matrix_similarity().get_row(i)
        except KeyError:
            raise KeyError("%s not found!" % i)

    def similar(self, i, n=10):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param n: number of similar elements
        :type n: int
        :returns: the most similar elements of *i*
        """
        if not self.get_matrix_similarity():
            self.compute()
        return self._get_row_similarity(i).top_items(n)

    def similarity(self, i, j):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param j: a row in *M*
        :type j: user or item id
        :returns: the similarity between the two elements *i* and *j*
        """
        if not self.get_matrix_similarity():
            self.compute()
        return self.get_matrix_similarity().value(i, j)

    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def recommend(self, i, n=10):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    ### OTHER METHODS ###
    def _cosine(self, v1, v2):
        return float(divisi2.dot(v1, v2) / (norm(v1) * norm(v2)))

    def centroid(self, ids, are_rows=True):
        if VERBOSE:
            sys.stdout.write('Computing centroid for ids=%s\n' % str(ids))
        points = []
        for id in ids:
            if are_rows:
                point = self.get_matrix().get_row(id)
            else:
                point = self.get_matrix().get_col(id)
            points.append(point)
        M = divisi2.SparseMatrix(points)
        return M.col_op(sum) / len(points)  #TODO numpy.sum seems slower?

    def _kinit(self, X, k):
        #Init k seeds according to kmeans++
        n = X.shape[0]
        #Choose the 1st seed randomly, and store D(x)^2 in D[]
        centers = [X[randint(0, n - 1)]]
        D = [norm(x - centers[0])**2 for x in X]

        for _ in range(k - 1):
            bestDsum = bestIdx = -1
            for i in range(n):
                #Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)
                Dsum = reduce(lambda x, y: x + y, (min(D[j],
                                                       norm(X[j] - X[i])**2)
                                                   for j in xrange(n)))
                if bestDsum < 0 or Dsum < bestDsum:
                    bestDsum, bestIdx = Dsum, i
            centers.append(X[bestIdx])
            D = [min(D[i], norm(X[i] - X[bestIdx])**2) for i in xrange(n)]
        return array(centers)

    def kmeans(self, id, k=5, is_row=True):
        """
        K-means clustering. http://en.wikipedia.org/wiki/K-means_clustering

        Clusters the (col) values of a given row, or vice versa

        :param id: row (or col) id to cluster its values
        :param k: number of clusters
        :param is_row: is param *id* a row (or a col)?
        :type is_row: Boolean
        """
        # TODO: switch to Pycluster?
        # http://pypi.python.org/pypi/Pycluster
        if VERBOSE:
            sys.stdout.write('Computing k-means, k=%s, for id %s\n' % (k, id))
        point = None
        if is_row:
            point = self.get_matrix().get_row(id)
        else:
            point = self.get_matrix().get_col(id)
        points = []
        points_id = []
        for i in point.nonzero_entries():
            label = point.label(i)
            points_id.append(label)
            if not is_row:
                points.append(self.get_matrix().get_row(label))
            else:
                points.append(self.get_matrix().get_col(label))
        #return kmeans(array(points), k)
        if VERBOSE:
            sys.stdout.write('id %s has %s points\n' % (id, len(points)))
        M = array(points)

        MAX_POINTS = 150
        # Only apply Matrix initialization if num. points is not that big!
        if len(points) <= MAX_POINTS:
            centers = self._kinit(array(points), k)
            centroids, labels = kmeans2(M, centers, minit='matrix')
        else:
            centroids, labels = kmeans2(M, k, minit='random')
        i = 0
        clusters = dict()
        for cluster in labels:
            if not clusters.has_key(cluster):
                clusters[cluster] = dict()
                clusters[cluster]['centroid'] = centroids[cluster]
                clusters[cluster]['points'] = []
            clusters[cluster]['points'].append(points_id[i])
            i += 1
        return clusters
Example #47
class Algorithm(object):
    """
    Base class Algorithm

    It has the basic methods to load a dataset, get the matrix and the raw input
    data, add more data (tuples), etc.

    Any other Algorithm derives from this base class
    """
    def __init__(self):
        self._data = Data()
        self._matrix = SparseMatrix()
        self._matrix_similarity = None #self-similarity matrix (only for the input Matrix rows)
        self._matrix_and_data_aligned = False #both Matrix and Data contain the same info?

    def __len__(self):
        return len(self.get_data())

    def __repr__(self):
        s = '%d rows.' % len(self.get_data())
        if len(self.get_data()):
            s += '\nE.g: %s' % str(self.get_data()[0])
        return s

    def get_matrix(self):
        """
        :returns: matrix *M*
        """
        if not self._matrix.get():
            self.create_matrix()
        return self._matrix

    def get_matrix_similarity(self):
        """
        :returns: the self-similarity matrix
        """
        return self._matrix_similarity

    def set_data(self, data):
        """
        Sets the raw dataset (input for matrix *M*)

        :param data: a Dataset class (list of tuples <value, row, col>)
        :type data: Data
        """
        #self._data = Data()
        #self._data.set(data)
        self._data = data
        self._matrix_and_data_aligned = False

    def get_data(self):
        """
        :returns: An instance of Data class. The raw dataset (input for matrix *M*). 
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Add a tuple in the dataset

        :param tuple: a tuple containing <rating, user, item> information. Or, more general: <value, row, col>
        """
        self.get_data().add_tuple(tuple)
        self._matrix_and_data_aligned = False

    def load_data(self, filename, force=True, sep='\t', format={'value':0, 'row':1, 'col':2}, pickle=False):
        """
        Loads a dataset file

        See params definition in *datamodel.Data.load()*
        """
        if force:
            self._data = Data()
            self._matrix_similarity = None

        self._data.load(filename, force, sep, format, pickle)
    
    def save_data(self, filename, pickle=False):
        """
        Saves the dataset in divisi2 matrix format (i.e: value <tab> row <tab> col)

        :param filename: file to store the data
        :type filename: string
        :param pickle: save in pickle format?
        :type pickle: boolean
        """
        self._data.save(filename, pickle)

    def create_matrix(self):
        if VERBOSE:
            sys.stdout.write('Creating matrix (%s tuples)\n' % len(self._data))
        try:
            self._matrix.create(self._data.get())
        except AttributeError:
            self._matrix.create(self._data)

        if VERBOSE:
            sys.stdout.write("Matrix density is: %s%%\n" % self._matrix.density())
        self._matrix_and_data_aligned = True

    def compute(self, min_values=None):
        if self._matrix.empty() and (not isinstance(self._data, list) and not self._data.get()):
            raise ValueError('No data set. Matrix is empty!')
        if self._matrix.empty() and (isinstance(self._data, list) and not self._data):
            raise ValueError('No data set. Matrix is empty!')
        if not self._matrix.empty() or not self._matrix_and_data_aligned:
            self.create_matrix()

        if min_values:
            if VERBOSE:
                sys.stdout.write('Updating matrix: squish to at least %s values\n' % min_values)
            self._matrix.set(self._matrix.get().squish(min_values))

    def _get_row_similarity(self, i):
        if not self.get_matrix_similarity():
            self.compute()
        try:
            return self.get_matrix_similarity().get_row(i)
        except KeyError:
            raise KeyError("%s not found!" % i)

    def similar(self, i, n=10):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param n: number of similar elements
        :type n: int
        :returns: the most similar elements of *i*
        """
        if not self.get_matrix_similarity():
            self.compute()
        return self._get_row_similarity(i).top_items(n)

    def similarity(self, i, j):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param j: a row in *M*
        :type j: user or item id
        :returns: the similarity between the two elements *i* and *j*
        """
        if not self.get_matrix_similarity():
            self.compute()
        return self.get_matrix_similarity().value(i, j)

    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def recommend(self, i, n=10):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    ### OTHER METHODS ###
    def _cosine(self, v1, v2):
        return float(divisi2.dot(v1,v2) / (norm(v1) * norm(v2)))

    def centroid(self, ids, are_rows=True):
        if VERBOSE:
            sys.stdout.write('Computing centroid for ids=%s\n' % str(ids))
        points = []
        for id in ids:
            if are_rows:
                point = self.get_matrix().get_row(id)
            else:
                point = self.get_matrix().get_col(id)
            points.append(point)
        M = divisi2.SparseMatrix(points)
        return M.col_op(sum)/len(points) #TODO numpy.sum seems slower?

    def _kinit(self, X, k):
        #Init k seeds according to kmeans++
        n = X.shape[0]
        #Choose the 1st seed randomly, and store D(x)^2 in D[]
        centers = [X[randint(0, n-1)]]
        D = [norm(x-centers[0])**2 for x in X]

        for _ in range(k-1):
            bestDsum = bestIdx = -1
            for i in range(n):
                #Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)
                Dsum = reduce(lambda x,y:x+y,
                              (min(D[j], norm(X[j]-X[i])**2) for j in xrange(n)))
                if bestDsum < 0 or Dsum < bestDsum:
                    bestDsum, bestIdx = Dsum, i
            centers.append(X[bestIdx])
            D = [min(D[i], norm(X[i]-X[bestIdx])**2) for i in xrange(n)]
        return array(centers)

    def kmeans(self, id, k=5, is_row=True):
        """
        K-means clustering. http://en.wikipedia.org/wiki/K-means_clustering

        Clusters the (col) values of a given row, or vice versa

        :param id: row (or col) id to cluster its values
        :param k: number of clusters
        :param is_row: is param *id* a row (or a col)?
        :type is_row: Boolean
        """
        # TODO: switch to Pycluster?
        # http://pypi.python.org/pypi/Pycluster
        if VERBOSE:
            sys.stdout.write('Computing k-means, k=%s, for id %s\n' % (k, id))
        point = None
        if is_row:
            point = self.get_matrix().get_row(id)
        else:
            point = self.get_matrix().get_col(id)
        points = []
        points_id = []
        for i in point.nonzero_entries():
            label = point.label(i)
            points_id.append(label)
            if not is_row:
                points.append(self.get_matrix().get_row(label))
            else:
                points.append(self.get_matrix().get_col(label))
        #return kmeans(array(points), k)
        if VERBOSE:
            sys.stdout.write('id %s has %s points\n' % (id, len(points)))
        M = array(points)

        MAX_POINTS = 150
        # Only apply Matrix initialization if num. points is not that big!
        if len(points) <= MAX_POINTS:
            centers = self._kinit(array(points), k)
            centroids, labels = kmeans2(M, centers, minit='matrix')
        else:
            centroids, labels = kmeans2(M, k, minit='random')
        i = 0
        clusters = dict()
        for cluster in labels:
            if not clusters.has_key(cluster): 
                clusters[cluster] = dict()
                clusters[cluster]['centroid'] = centroids[cluster]
                clusters[cluster]['points'] = []
            clusters[cluster]['points'].append(points_id[i])
            i += 1
        return clusters
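
A hedged usage sketch for the clustering helper above (the instance name svd and the id are illustrative; it assumes the model has already been computed on loaded data):

# cluster the column values of row id 1 into 3 groups
clusters = svd.kmeans(1, k=3, is_row=True)
for label, cluster in clusters.items():
    print label, len(cluster['points'])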
Example #48
0
__author__ = 'admin'
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

svd = SVD()
data = Data()
data.load(path='../data/userchlfav',
          force=True,
          sep=',',
          format={'col': 0, 'row': 1, 'ids': int},  # no 'value':2 column: pairs only
          pickle=False)

print len(data._data)

# keep only the tuples whose row id is below 1000
data.set([rate for rate in data._data if rate[1] < 1000])

print len(data._data)

svd.set_data(data)

k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)


#ITEMID1 = 1    # Toy Story (1995)
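
With the factorization computed, a quick sanity check is to ask for the neighbours of a known row; a hedged sketch (the id 1 is arbitrary and must exist in the loaded data):

print svd.similar(1, n=5)   # top-5 most similar rows with their similarity scores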
Example #49
0
class SVD(Algorithm):
    """
    Inherits from base class Algorithm.
    It computes SVD (Singular Value Decomposition) on a matrix *M*

    It also provides recommendations and predictions using the reconstructed matrix *M'*

    :param filename: Path to a Zip file, containing an already computed SVD (U, Sigma, and V) for a matrix *M*
    :type filename: string
    """
    def __init__(self, filename=None):
        #Call parent constructor
        super(SVD, self).__init__()

        # self._U: Eigen vector. Relates the concepts of the input matrix to the principal axes
        # self._S (or \Sigma): Singular -or eigen- values. It represents the strength of each eigenvector.
        # self._V: Eigen vector. Relates features to the principal axes
        self._U, self._S, self._V = (None, None, None)
        # Mean centered Matrix: row and col shifts
        self._shifts = None
        # self._matrix_reconstructed: M' = U S V^t
        self._matrix_reconstructed = None

        # Similarity matrix: (U \Sigma)(U \Sigma)^T = U \Sigma^2 U^T
        # U \Sigma is concept_axes weighted by axis_weights.
        self._matrix_similarity = SimilarityMatrix()

        if filename:
            self.load_model(filename)

        # Row and Col ids. Only when importing from SVDLIBC
        self._file_row_ids = None
        self._file_col_ids = None

        #Update feature
        self._foldinZeroes = {}
        self.inv_S = None  # S never changes after compute(), so its inverse is cached rather than recalculated on every fold-in

    def __repr__(self):
        try:
            s = '\n'.join(('M\':' + str(self._reconstruct_matrix()), \
                'A row (U):' + str(self._reconstruct_matrix().right[1]), \
                'A col (V):' + str(self._reconstruct_matrix().left[1])))
        except TypeError:
            s = self._data.__repr__()
        return s

    def load_model(self, filename):
        """
        Loads SVD transformation (U, Sigma and V matrices) from a ZIP file

        :param filename: path to the SVD matrix transformation (a ZIP file)
        :type filename: string
        """
        try:
            zip = zipfile.ZipFile(filename, allowZip64=True)
        except (IOError, zipfile.BadZipfile):
            zip = zipfile.ZipFile(filename + '.zip', allowZip64=True)
        # Options file
        options = dict()
        for line in zip.open('README'):
            data = line.strip().split('\t')
            options[data[0]] = data[1]
        try:
            k = int(options['k'])
        except (KeyError, ValueError):
            k = 100  #TODO: nasty!!!

        # Load U, S, and V
        """
        #Python 2.6 only:
        #self._U = loads(zip.open('.U').read())
        #self._S = loads(zip.open('.S').read())
        #self._V = loads(zip.open('.V').read())
        """
        try:
            self._U = loads(zip.read('.U'))
        except:
            matrix = fromfile(zip.extract('.U', TMPDIR))
            vectors = []
            i = 0
            while i < len(matrix) / k:
                v = DenseVector(matrix[k * i:k * (i + 1)])
                vectors.append(v)
                i += 1
            try:
                idx = [
                    int(idx.strip())
                    for idx in zip.read('.row_ids').split('\n') if idx
                ]
            except:
                idx = [
                    idx.strip() for idx in zip.read('.row_ids').split('\n')
                    if idx
                ]
            #self._U = DenseMatrix(vectors)
            self._U = DenseMatrix(vectors, OrderedSet(idx), None)
        try:
            self._V = loads(zip.read('.V'))
        except:
            matrix = fromfile(zip.extract('.V', TMPDIR))
            vectors = []
            i = 0
            while i < len(matrix) / k:
                v = DenseVector(matrix[k * i:k * (i + 1)])
                vectors.append(v)
                i += 1
            try:
                idx = [
                    int(idx.strip())
                    for idx in zip.read('.col_ids').split('\n') if idx
                ]
            except:
                idx = [
                    idx.strip() for idx in zip.read('.col_ids').split('\n')
                    if idx
                ]
            #self._V = DenseMatrix(vectors)
            self._V = DenseMatrix(vectors, OrderedSet(idx), None)

        self._S = loads(zip.read('.S'))

        # Shifts for mean-centered matrix
        self._shifts = None
        if '.shifts.row' in zip.namelist():
            self._shifts = [
                loads(zip.read('.shifts.row')),
                loads(zip.read('.shifts.col')),
                loads(zip.read('.shifts.total'))
            ]
        self._reconstruct_matrix(shifts=self._shifts, force=True)
        self._reconstruct_similarity(force=True)

    def save_model(self, filename, options={}):
        """
        Saves SVD transformation (U, Sigma and V matrices) to a ZIP file

        :param filename: path to save the SVD matrix transformation (U, Sigma and V matrices)
        :type filename: string
        :param options: a dict() containing the info about the SVD transformation. E.g. {'k': 100, 'min_values': 5, 'pre_normalize': None, 'mean_center': True, 'post_normalize': True}
        :type options: dict
        """
        if VERBOSE:
            sys.stdout.write('Saving svd model to %s\n' % filename)

        f_opt = open(filename + '.config', 'w')
        for option, value in options.items():
            f_opt.write('\t'.join((option, str(value))) + '\n')
        f_opt.close()
        # U, S, and V
        MAX_VECTORS = 2**21
        if len(self._U) < MAX_VECTORS:
            self._U.dump(filename + '.U')
        else:
            self._U.tofile(filename + '.U')
        if len(self._V) < MAX_VECTORS:
            self._V.dump(filename + '.V')
        else:
            self._V.tofile(filename + '.V')
        self._S.dump(filename + '.S')

        # Shifts for Mean Centered Matrix
        if self._shifts:
            #(row_shift, col_shift, total_shift)
            self._shifts[0].dump(filename + '.shifts.row')
            self._shifts[1].dump(filename + '.shifts.col')
            self._shifts[2].dump(filename + '.shifts.total')

        zip = filename
        if not filename.endswith('.zip') and not filename.endswith('.ZIP'):
            zip += '.zip'
        fp = zipfile.ZipFile(zip, 'w', allowZip64=True)

        # Store Options in the ZIP file
        fp.write(filename=filename + '.config', arcname='README')
        os.remove(filename + '.config')

        # Store matrices in the ZIP file
        for extension in ['.U', '.S', '.V']:
            fp.write(filename=filename + extension, arcname=extension)
            os.remove(filename + extension)

        # Store mean center shifts in the ZIP file
        if self._shifts:
            for extension in ['.shifts.row', '.shifts.col', '.shifts.total']:
                fp.write(filename=filename + extension, arcname=extension)
                os.remove(filename + extension)

        # Store row and col ids file, if importing from SVDLIBC
        if self._file_row_ids:
            fp.write(filename=self._file_row_ids, arcname='.row_ids')
        if self._file_col_ids:
            fp.write(filename=self._file_col_ids, arcname='.col_ids')

    def _reconstruct_similarity(self, post_normalize=True, force=True):
        if not self.get_matrix_similarity() or force:
            self._matrix_similarity = SimilarityMatrix()
            self._matrix_similarity.create(self._U,
                                           self._S,
                                           post_normalize=post_normalize)
        return self._matrix_similarity

    def _reconstruct_matrix(self, shifts=None, force=True):
        if not self._matrix_reconstructed or force:
            if shifts:
                self._matrix_reconstructed = divisi2.reconstruct(self._U,
                                                                 self._S,
                                                                 self._V,
                                                                 shifts=shifts)
            else:
                self._matrix_reconstructed = divisi2.reconstruct(
                    self._U, self._S, self._V)
        return self._matrix_reconstructed

    def compute(self,
                k=100,
                min_values=None,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True,
                savefile=None):
        """
        Computes SVD on matrix *M*, :math:`M = U \Sigma V^T`

        :param k: number of dimensions
        :type k: int
        :param min_values: min. number of non-zeros (or non-empty values) any row or col must have
        :type min_values: int
        :param pre_normalize: normalize input matrix. Possible values are tfidf, rows, cols, all.
        :type pre_normalize: string
        :param mean_center: center the input matrix (a.k.a. mean subtraction)
        :type mean_center: Boolean
        :param post_normalize: Normalize every row of :math:`U \Sigma` to be a unit vector. Thus, row similarity (using cosine distance) returns :math:`[-1.0 .. 1.0]`
        :type post_normalize: Boolean
        :param savefile: path to save the SVD factorization (U, Sigma and V matrices)
        :type savefile: string
        """
        super(SVD, self).compute(
            min_values
        )  # creates the matrix and squishes away rows/cols with too few values

        if VERBOSE:
            sys.stdout.write(
                'Computing svd k=%s, min_values=%s, pre_normalize=%s, mean_center=%s, post_normalize=%s\n'
                % (k, min_values, pre_normalize, mean_center, post_normalize))
            if not min_values:
                sys.stdout.write(
                    '[WARNING] min_values is set to None, meaning that some funky recommendations might appear!\n'
                )

        # Get SparseMatrix
        matrix = self._matrix.get()

        # Mean center?
        shifts, row_shift, col_shift, total_shift = (None, None, None, None)
        if mean_center:
            if VERBOSE:
                sys.stdout.write(
                    "[WARNING] mean_center is True. svd.similar(...) might return nan's. If so, then do svd.compute(..., mean_center=False)\n"
                )
            matrix, row_shift, col_shift, total_shift = matrix.mean_center()
            self._shifts = (row_shift, col_shift, total_shift)

        # Pre-normalize input matrix?
        if pre_normalize:
            """
            Divisi2 divides each entry by the geometric mean of its row norm and its column norm.
            The rows and columns don't actually become unit vectors, but they all become closer to unit vectors.
            """
            if pre_normalize == 'tfidf':
                matrix = matrix.normalize_tfidf(
                )  #TODO By default, treats the matrix as terms-by-documents;
                # pass cols_are_terms=True if the matrix is instead documents-by-terms.
            elif pre_normalize == 'rows':
                matrix = matrix.normalize_rows()
            elif pre_normalize == 'cols':
                matrix = matrix.normalize_cols()
            elif pre_normalize == 'all':
                matrix = matrix.normalize_all()
            else:
                raise ValueError("Pre-normalize option (%s) is not correct.\n \
                                  Possible values are: 'tfidf', 'rows', 'cols' or 'all'"
                                 % pre_normalize)
        #Compute SVD(M, k)
        self._U, self._S, self._V = matrix.svd(k)
        # Sim. matrix = U \Sigma^2 U^T
        self._reconstruct_similarity(post_normalize=post_normalize, force=True)
        # M' = U S V^t
        self._reconstruct_matrix(shifts=self._shifts, force=True)

        if savefile:
            options = {
                'k': k,
                'min_values': min_values,
                'pre_normalize': pre_normalize,
                'mean_center': mean_center,
                'post_normalize': post_normalize
            }
            self.save_model(savefile, options)

    def _get_row_reconstructed(self, i, zeros=None):
        # zeros: indices of the unknown (zero) entries to keep, as computed by the caller
        if zeros:
            return self._matrix_reconstructed.row_named(i)[zeros]
        return self._matrix_reconstructed.row_named(i)

    def _get_col_reconstructed(self, j, zeros=None):
        if zeros:
            return self._matrix_reconstructed.col_named(j)[zeros]
        return self._matrix_reconstructed.col_named(j)

    def _get_row_unrated(self, i, rated):
        # Used after a fold-in: the user's rated items are known, so remove
        # them from the reconstructed row and rank only the unrated ones
        sparse_vector = self._matrix_reconstructed.row_named(i).to_sparse()
        # values: np array with the (predicted) ratings
        # named_cols: plain list with the column labels (e.g. movie names)
        values, named_cols = sparse_vector.named_lists()

        # Collect all removal indices first, then delete. Deleting labels one
        # by one while collecting indices would shift the later indices and
        # misalign the ratings with their labels.
        removal_indices = [named_cols.index(item) for item in rated]
        removal_set = set(removal_indices)

        values = np.delete(values, removal_indices)  #numpy arrays are deleted by index
        named_cols = [label for idx, label in enumerate(named_cols)
                      if idx not in removal_set]

        return divisiSparseVector.from_named_lists(values,
                                                   named_cols).to_dense()

    def _get_col_unrated(self, j, rated):
        # Column-wise counterpart of _get_row_unrated (see the note there
        # about collecting removal indices before deleting)
        sparse_vector = self._matrix_reconstructed.col_named(j).to_sparse()
        values, named_rows = sparse_vector.named_lists()

        removal_indices = [named_rows.index(item) for item in rated]
        removal_set = set(removal_indices)

        values = np.delete(values, removal_indices)
        named_rows = [label for idx, label in enumerate(named_rows)
                      if idx not in removal_set]

        return divisiSparseVector.from_named_lists(values,
                                                   named_rows).to_dense()

    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        """
        Predicts the value of :math:`M_{i,j}`, using reconstructed matrix :math:`M^\prime = U \Sigma_k V^T`

        :param i: row in M, :math:`M_{i \cdot}`
        :type i: user or item id
        :param j: col in M, :math:`M_{\cdot j}`
        :type j: item or user id
        :param MIN_VALUE: min. value in M (e.g. in ratings[1..5] => 1)
        :type MIN_VALUE: float
        :param MAX_VALUE: max. value in M (e.g. in ratings[1..5] => 5)
        :type MAX_VALUE: float
        """
        if not self._matrix_reconstructed:
            self.compute()  #will use default values!
        predicted_value = self._matrix_reconstructed.entry_named(
            i, j)  #M' = U S V^t
        if MIN_VALUE:
            predicted_value = max(predicted_value, MIN_VALUE)
        if MAX_VALUE:
            predicted_value = min(predicted_value, MAX_VALUE)
        return float(predicted_value)

    def recommend(self, i, n=10, only_unknowns=False, is_row=True):
        """
        Recommends items to a user (or users to an item) using reconstructed matrix :math:`M^\prime = U \Sigma_k V^T`

        E.g. if *i* is a row and *only_unknowns* is True, it returns the higher values of :math:`M^\prime_{i,\cdot}` :math:`\\forall_j{M_{i,j}=\emptyset}`

        :param i: row or col in M
        :type i: user or item id
        :param n: number of recommendations to return
        :type n: int
        :param only_unknowns: only return unknown values in *M*? (e.g. items not rated by the user)
        :type only_unknowns: Boolean
        :param is_row: is param *i* a row (or a col)?
        :type is_row: Boolean
        """
        if not self._matrix_reconstructed:
            self.compute()  #will use default values!
        item = None
        zeros = []
        seeDict = False
        if only_unknowns and not self._matrix.get() and len(
                self._foldinZeroes) == 0:
            raise ValueError(
                "Matrix is empty! If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called"
            )
        if not self._matrix.get():
            seeDict = True
        if is_row:
            if only_unknowns:
                if seeDict:
                    zeros = self._foldinZeroes[
                        i]  #zeros in this instance contains the rated items
                    if len(zeros) == 0:
                        raise ValueError(
                            "Matrix is empty! If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called or youve just folded them in"
                        )
                    else:
                        item = self._get_row_unrated(
                            i, zeros
                        )  #removing the rated items from utility row for recommendations
                else:
                    zeros = self._matrix.get().row_named(i).zero_entries()
                    item = self._get_row_reconstructed(i, zeros)
            else:
                item = self._get_row_reconstructed(i, zeros)
        else:
            if only_unknowns:
                if seeDict:
                    zeros = self._foldinZeroes[
                        i]  #zeros in this instance contains the rated items
                    if len(zeros) == 0:
                        raise ValueError(
                            "Matrix is empty! If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called or you just folded them in"
                        )
                    else:
                        item = self._get_col_unrated(
                            i, zeros
                        )  #removing the rated items from utility columns for recommendations
                else:
                    zeros = self._matrix.get().col_named(i).zero_entries()
                    item = self._get_col_reconstructed(i, zeros)
            else:
                item = self._get_col_reconstructed(i, zeros)

        return item.top_items(n)

    def _calc_mean_center(self, matrix, is_row=True):
        # Reuses the loaded shifts; only the shift of the folded-in row
        # (or column) is computed from scratch
        row_shift, col_shift, total_shift = self._shifts

        total_mean = total_shift  # use the global shift one
        if is_row:
            row_means = matrix.row_op(
                np.mean) - total_mean  # calculate row shift
            col_means = col_shift  # use already given col shifts
        else:
            row_means = row_shift  # use already given row shifts
            col_means = matrix.col_op(
                np.mean) - total_mean  # calculate col shifts

        row_lengths = matrix.row_op(len)
        col_lengths = matrix.col_op(len)

        shifted = matrix.copy()
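        # Each entry's shift is the length-weighted average of its row and
        # column means, plus the global mean; presumably this mirrors the
        # convention of divisi2's mean_center(), so folded-in values stay
        # comparable with the originally centered matrix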
        for row, col in shifted.keys():
            shifted[row, col] -= (
                (row_means[row] * row_lengths[row] +
                 col_means[col] * col_lengths[col]) /
                (row_lengths[row] + col_lengths[col])) + total_mean

        return (shifted, row_means, col_means, total_mean)
        # return shifted

    def load_updateDataTuple_foldin(self,
                                    filename,
                                    force=True,
                                    sep='\t',
                                    format={
                                        'value': 0,
                                        'row': 1,
                                        'col': 2
                                    },
                                    pickle=False,
                                    is_row=True,
                                    truncate=True,
                                    post_normalize=False):
        """
        Folds in a SINGLE user OR item: loads a dataset file containing the tuples of a single user (one row) or a single item (one column), depending on *is_row*.

        For params: filename,force,sep,format,pickle then see params definition in *datamodel.Data.load()*

        :param is_row: fold in a row (True) or a column (False)?
        :type is_row: boolean
        :param truncate: new users sometimes rate items that are not in the original SVD matrix. Should those new items be truncated (True, the default) or folded in as well (False)?
        :type truncate: boolean
        :param post_normalize: Normalize every row of :math:`U \Sigma` to be a unit vector. Thus, row similarity (using cosine distance) returns :math:`[-1.0 .. 1.0]`
        :type post_normalize: Boolean

        """
        if force:
            self._updateData = Data()

        self._updateData.load(filename, force, sep, format, pickle)

        if VERBOSE:
            print "reading the new tuple"
        if (is_row):
            nDimensionLabels = self._V.all_labels()[
                0]  #get labels from V matrix to complete the sparse matrix
            if VERBOSE:
                print "V labels:", len(nDimensionLabels)
            self._singleUpdateMatrix.create(self._updateData.get(),
                                            col_labels=nDimensionLabels,
                                            foldin=True,
                                            truncate=truncate)
            self._foldinZeroes[self._singleUpdateMatrix.get_rows()
                               [0]] = self._singleUpdateMatrix.get_cols()

        else:
            nDimensionLabels = self._U.all_labels()[
                0]  #get labels from U matrix to complete the sparse matrix
            if VERBOSE:
                print "U labels:", len(nDimensionLabels)
            self._singleUpdateMatrix.create(self._updateData.get(),
                                            row_labels=nDimensionLabels,
                                            foldin=True,
                                            truncate=truncate)
            self._foldinZeroes[self._singleUpdateMatrix.get_cols()
                               [0]] = self._singleUpdateMatrix.get_rows()

        if not truncate:
            additionalElements = self._singleUpdateMatrix.get_additional_elements(
            )
            #If a new user rated a brand-new item not in the matrix, fold the item in first, then fold in the user
            if VERBOSE:
                print "dimension", len(nDimensionLabels)
                print "additional elements:", additionalElements
            if len(additionalElements) != 0:
                for item in additionalElements:
                    if is_row:  # when folding in a row, the extra elements are new columns, which must be folded in first
                        self._singleAdditionalFoldin.create(
                            [(0, nDimensionLabels[0], item)],
                            row_labels=self._U.all_labels()[0])
                    else:
                        self._singleAdditionalFoldin.create(
                            [(0, item, nDimensionLabels[0])],
                            col_labels=self._V.all_labels()[0])
                    self._update(update_matrix=self._singleAdditionalFoldin,
                                 is_row=not is_row)

        # update the data matrix
        if VERBOSE:
            print "updating the sparse matrix"
        if self._matrix.get():  #skip if the matrix is absent because a model was loaded
            self._matrix.update(
                self._singleUpdateMatrix
            )  # keeps the data matrix consistent, both for the zero entries and for saving it later

        # Mean centering
        if self._shifts:  #not None means compute() was run with mean_center=True
            row_shift, col_shift, total_shift = self._shifts

            meanedMatrix, rowShift, colShift, totalShift = self._calc_mean_center(
                self._singleUpdateMatrix.get(), is_row=is_row)

            self._singleUpdateMatrix.set(meanedMatrix)

            if is_row:
                values, named_rows = row_shift.to_sparse().named_lists(
                )  #values numpy array, named_rows normal array
                valuesFold, named_rowsFold = rowShift.to_sparse().named_lists()

            else:
                values, named_rows = col_shift.to_sparse().named_lists(
                )  # values numpy array, named_rows normal array
                valuesFold, named_rowsFold = colShift.to_sparse().named_lists()

            values = np.concatenate((values, valuesFold))
            named_rows.extend(named_rowsFold)

            if is_row:
                row_shift = divisiSparseVector.from_named_lists(
                    values, named_rows).to_dense()
            else:
                col_shift = divisiSparseVector.from_named_lists(
                    values, named_rows).to_dense()

            self._shifts = (row_shift, col_shift, total_shift)

        self._update(is_row=is_row, post_normalize=post_normalize)

    def _construct_batch_dictionary(self, data, is_row=True):
        """

        :param data: Data()
        :param is_row: Boolean
        :return: constructs a dictionary with the row or col as the keys (depending on which is being added) with values as the tuples
        in self._batchDict
        """

        key_idx = 1  #key index default is the row
        if not is_row:
            key_idx = 2

        #collect the tuples of each significant col (or row) in one place, to fold them in at once

        for item in data:  #data is a list of tuples, so item is one tuple
            self._batchDict.setdefault(item[key_idx], []).append(item)

        #batch loaded; the entries now get folded in one by one
        print "Batch loaded successfully"

    def load_updateDataBatch_foldin(self,
                                    filename=None,
                                    data=None,
                                    force=True,
                                    sep='\t',
                                    format={
                                        'value': 0,
                                        'row': 1,
                                        'col': 2
                                    },
                                    pickle=False,
                                    is_row=True,
                                    truncate=True,
                                    post_normalize=False):
        """
            Folds in a batch of users or items: loads a dataset file containing multiple tuples (or uses data preloaded with the datamodel/data.py Data() object), then folds the entries in with their ratings.

            :param data: Contains the dataset that was loaded using the Data() class
            :type data: Data()

            For params: filename,force,sep,format,pickle then see params definition in *datamodel.Data.load()*

            :param is_row: fold in a row (True) or a column (False)?
            :type is_row: boolean
            :param truncate: new users sometimes rate items that are not in the original SVD matrix. Should those new items be truncated (True, the default) or folded in as well (False)?
            :type truncate: boolean
            :param post_normalize: Normalize every row of :math:`U \Sigma` to be a unit vector. Thus, row similarity (using cosine distance) returns :math:`[-1.0 .. 1.0]`
            :type post_normalize: Boolean
            """

        if force:
            self._updateData = Data()
        if filename:  #not null
            self._updateData.load(filename, force, sep, format,
                                  pickle)  #load array of tuples
        else:
            if data:
                self._updateData = data
            else:
                raise ValueError('No data or filename set!')
        print "Reading the new batch"

        self._construct_batch_dictionary(self._updateData.get(), is_row)

        print "Folding in batch entries"
        nDimensionLabels = None
        if (is_row):
            nDimensionLabels = self._V.all_labels()[
                0]  # get labels from V matrix to complete the sparse matrix
        else:
            nDimensionLabels = self._U.all_labels()[
                0]  # get labels from U matrix to complete the sparse matrix
        length_of_dict = len(self._batchDict)
        i = 0
        meanDenseVector = []
        isbatch = True
        for key_idx in self._batchDict:  #data in batchDict in form {key:[(tuple)]}
            i += 1
            if VERBOSE:
                if i % 100 == 0:
                    sys.stdout.write('.')
                if i % 1000 == 0:
                    sys.stdout.write('|')
                if i % 10000 == 0:
                    sys.stdout.write(' (%d K users)\n' % int(i / 1000))

            if (is_row):
                self._singleUpdateMatrix.create(self._batchDict[key_idx],
                                                col_labels=nDimensionLabels,
                                                foldin=True,
                                                truncate=truncate)

            else:
                self._singleUpdateMatrix.create(self._batchDict[key_idx],
                                                row_labels=nDimensionLabels,
                                                foldin=True,
                                                truncate=truncate)

            # If a new user rated a brand-new item not in the matrix, fold the item in first, then fold in the user
            if not truncate:
                additionalElements = self._singleUpdateMatrix.get_additional_elements(
                )

                if len(additionalElements) != 0:
                    for item in additionalElements:
                        if is_row:  # when folding in a row, the extra elements are new columns, which must be folded in first
                            self._singleAdditionalFoldin.create(
                                [(0, nDimensionLabels[0], item)],
                                row_labels=self._U.all_labels()[0])
                        else:
                            self._singleAdditionalFoldin.create(
                                [(0, item, nDimensionLabels[0])],
                                col_labels=self._V.all_labels()[0])

                        self._update(
                            update_matrix=self._singleAdditionalFoldin,
                            is_row=not is_row)

            if self._shifts:  # not None means compute() was run with mean_center=True
                row_shift, col_shift, total_shift = self._shifts

                meanedMatrix, rowShift, colShift, totalShift = self._calc_mean_center(
                    self._singleUpdateMatrix.get(), is_row=is_row)

                self._singleUpdateMatrix.set(meanedMatrix)
                # collect the shift of the folded-in entry (row shift when folding rows, col shift otherwise)
                if is_row:
                    meanDenseVector.append(rowShift)

                else:
                    meanDenseVector.append(colShift)

            if self._matrix.get():  #skip if the matrix is absent because a model was loaded
                self._matrix.update(
                    self._singleUpdateMatrix, is_batch=isbatch
                )  # keeps the data matrix consistent, both for the zero entries and for saving it later

            self._update(
                is_row=is_row,
                is_batch=isbatch)  #Do foldin on the singleUpdateMatrix tuple
        if VERBOSE:
            sys.stdout.write('\n')
        # Update the mean-center shifts with the vectors collected during the batch
        if self._shifts:
            sys.stdout.write("updating shifts\n")
            if is_row:
                values, named_rows = row_shift.to_sparse().named_lists(
                )  # values numpy array, named_rows normal array
            else:
                values, named_rows = col_shift.to_sparse().named_lists(
                )  # values numpy array, named_rows normal array
            for vector in meanDenseVector:
                valuesFold, named_rowsFold = vector.to_sparse().named_lists(
                )  # rowShift contains new calculated row shift
                values = np.concatenate((values, valuesFold))
                named_rows.extend(named_rowsFold)
            if is_row:
                row_shift = divisiSparseVector.from_named_lists(
                    values, named_rows).to_dense()
            else:
                col_shift = divisiSparseVector.from_named_lists(
                    values, named_rows).to_dense()

            self._shifts = (row_shift, col_shift, total_shift)

        self.update_sparse_matrix_data(is_batch=True,
                                       squish=False,
                                       post_normalize=post_normalize)

    def update_sparse_matrix_data(self,
                                  squishFactor=10,
                                  is_batch=False,
                                  squish=True,
                                  post_normalize=False):
        #update the data matrix
        if is_batch:
            if self._matrix.get():
                if VERBOSE:
                    print "updating sparse index"
                self._matrix.index_sparseMatrix()
            if VERBOSE:
                print "before updating, M=", self._matrix_reconstructed.shape
            # Sim. matrix = U \Sigma^2 U^T
            self._reconstruct_similarity(post_normalize=post_normalize,
                                         force=True)
            # M' = U S V^t
            self._reconstruct_matrix(shifts=self._shifts, force=True)
            if VERBOSE:
                print "done updating, M=", self._matrix_reconstructed.shape
        if squish:
            if self._matrix.get():  #with a loaded model there is no matrix
                if VERBOSE:
                    print "committing the sparse data matrix by removing empty rows and columns divisi created"
                self._matrix.squish(
                    squishFactor
                )  #NOTE: intensive, so done once at the end

    def _update(self,
                update_matrix=None,
                is_row=True,
                is_batch=False,
                post_normalize=False):
        #The function which does the actual folding-in process
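        # Fold-in background: with M ~ U S V^t, a new row vector a (its known
        # ratings over the existing columns) is projected into the latent
        # space as a_hat = a . V . S^-1 and appended to U; symmetrically, a
        # new column is projected with U . S^-1 and appended to V. S itself
        # is never updated by a fold-in.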
        if self.inv_S is None:
            # S is diagonal, so S^-1 is just the reciprocal of each diagonal entry
            self.inv_S = np.zeros((self._S.shape[0], self._S.shape[0]))
            for i in range(self._S.shape[0]):
                self.inv_S[i, i] = self._S[i]**-1

        #if new is row -> V*S^-1
        if is_row:
            prodM = self._V.dot(self.inv_S)
            # if VERBOSE:
            #     print "dimension of VxS^-1=", prodM.shape
        else:  #if new is col -> U*S^-1
            prodM = self._U.dot(self.inv_S)
            # if VERBOSE:
            #     print "dimension of UxS^-1=", prodM.shape

        if update_matrix:
            updateTupleMatrix = update_matrix.get()
        else:
            updateTupleMatrix = self._singleUpdateMatrix.get()

        if not is_row:
            updateTupleMatrix = updateTupleMatrix.transpose()  #transpose

        res = updateTupleMatrix.dot(prodM)

        if is_row:
            #new row can now be concatenated with U
            self._U = self._U.concatenate(res)
        else:
            #new col can now be concatenated with V
            self._V = self._V.concatenate(res)

        if not is_batch:  #will reconstruct all at end with batch using another function
            if VERBOSE:
                print "before updating, M=", self._matrix_reconstructed.shape
            # Sim. matrix = U \Sigma^2 U^T
            self._reconstruct_similarity(post_normalize=post_normalize,
                                         force=True)
            # M' = U S V^t
            self._reconstruct_matrix(shifts=self._shifts, force=True)
            if VERBOSE:
                print "done updating, M=", self._matrix_reconstructed.shape

    def centroid(self, ids, is_row=True):
        points = []
        for id in ids:
            if is_row:
                point = self._U.row_named(id)
            else:
                point = self._V.row_named(id)
            points.append(point)
        M = divisi2.SparseMatrix(points)
        return M.col_op(sum) / len(points)  #TODO Numpy.sum?

    def kmeans(self, ids, k=5, components=3, are_rows=True):
        """
        K-means clustering. It uses k-means++ (http://en.wikipedia.org/wiki/K-means%2B%2B) to choose the initial centroids of the clusters

        Clusterizes a list of IDs (either row or cols)

        :param ids: list of row (or col) ids to cluster
        :param k: number of clusters
        :param components: how many eigen values use (from SVD)
        :param are_rows: is param *ids* a list of rows (or cols)?
        :type are_rows: Boolean
        """
        if not isinstance(ids, list):
            # Cluster the whole row(or col) values. It's slow!
            return super(SVD, self).kmeans(ids, k=k, is_row=are_rows)
        if VERBOSE:
            sys.stdout.write('Computing k-means, k=%s for ids %s\n' % (k, ids))
        MAX_POINTS = 150
        points = []
        for id in ids:
            if are_rows:
                points.append(self._U.row_named(id)[:components])
            else:
                points.append(self._V.row_named(id)[:components])
        M = array(points)
        # Only apply Matrix initialization if num. points is not that big!
        if len(points) <= MAX_POINTS:
            centers = self._kinit(array(points), k)
            centroids, labels = kmeans2(M, centers, minit='matrix')
        else:
            centroids, labels = kmeans2(M, k, minit='random')
        i = 0
        clusters = dict()
        for cluster in labels:
            if not clusters.has_key(cluster):
                clusters[cluster] = dict()
                clusters[cluster]['centroid'] = centroids[cluster]
                clusters[cluster]['points'] = []
            if are_rows:
                point = self._U.row_named(ids[i])[:components]
            else:
                point = self._V.row_named(ids[i])[:components]
            centroid = clusters[cluster]['centroid']
            to_centroid = self._cosine(centroid, point)
            clusters[cluster]['points'].append((ids[i], to_centroid))
            clusters[cluster]['points'].sort(key=itemgetter(1), reverse=True)
            i += 1
        return clusters

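A hedged sketch of the fold-in workflow this SVD variant adds (the file names, the format dict and NEW_USER_ID are illustrative placeholders; load_model() and load_updateDataTuple_foldin() are the methods defined above):

svd = SVD()
svd.load_model('svd_model')  # previously saved via save_model() or compute(savefile=...)
# fold one new user's ratings into U without recomputing the factorization
svd.load_updateDataTuple_foldin('new_user.dat',
                                sep='\t',
                                format={'value': 0, 'row': 1, 'col': 2},
                                is_row=True,
                                truncate=False)
# recommend only items the folded-in user has not rated yet
# (NEW_USER_ID must match the row id used in new_user.dat)
print svd.recommend(NEW_USER_ID, n=5, only_unknowns=True, is_row=True)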
Example #50
0
def setup():
    global user, items, data
    user = User(USERID)
    items = _read_items(os.path.join(MOVIELENS_DATA_PATH, 'movies.dat'))
    data = Data()
    data.load(os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat'), sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})

def SVDloadData2():
    dat_file='/home/commons/RecSys/MOVIEDATA/ml-1m/ratings.dat'
    pct_train=0.5
    data = Data()
    data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,'ids':int})
    return data
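
The unused pct_train variable suggests a train/test split was intended; a hedged sketch of the presumably intended follow-up:

data = SVDloadData2()
train, test = data.split_train_test(percent=50)  # pct_train = 0.5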
Example #52
0
		for line in file:
			if cnt % 10000 == 9999: 
				print "%d / 1524458 loaded\r"%(cnt+1),
			cnt += 1
			# if cnt == 100000: break
			(user, item, week, time, feat1, feat2)=line.split('\t')
			test.append(
				{"1_user_id": int(user),
				 "2_item_id": int(item)
				})		
	return test

recsys.algorithm.VERBOSE = True
print "loading data"
data = Data()
data.load('../item_recom/train_info.tsv',sep='\t', format={'col':0, 'row':1, 'value':6, 'ids': int})

topic = 48
print "compute svd"
svd = SVD()
svd.set_data(data)
svd.compute(k=topic, min_values=0.0, pre_normalize=None, mean_center=True, post_normalize=True)

print "loading test data"
test = loadTest('../item_recom/test_info.tsv')

print svd.predict(0,0)

print "creating submission"
with open('../submissions/recsys_3.csv', 'w') as csvfile:
	fieldnames = ['uid#iid', 'pred']
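	# (The source snippet is truncated here. A hedged reconstruction of the
	# likely remainder, assuming `import csv` earlier in the file and the
	# "uid#iid" string format; svd.predict(item, user) follows the row=item,
	# col=user layout used in data.load above.)
	writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	writer.writeheader()
	for entry in test:
		user, item = entry["1_user_id"], entry["2_item_id"]
		try:
			pred = svd.predict(item, user)
		except KeyError:
			pred = 0.0
		writer.writerow({'uid#iid': "%d#%d" % (user, item), 'pred': pred})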
Example #53
0
def SVDloadData2():
    dat_file='ratings.dat'
    pct_train=0.5
    data = Data()
    data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,'ids':int})
    return data
Example #54
0
class Algorithm(object):
    """
    Base class Algorithm

    It has the basic methods to load a dataset, get the matrix and the raw input
    data, add more data (tuples), etc.

    Any other Algorithm derives from this base class
    """
    def __init__(self):
        self._data = Data()
        self._matrix = SparseMatrix()
        self._matrix_similarity = None #self-similarity matrix (only for the input Matrix rows)
        self._matrix_and_data_aligned = False #both Matrix and Data contain the same info?

    def __len__(self):
        return len(self.get_data())

    def __repr__(self):
        s = '%d rows.' % len(self.get_data())
        if len(self.get_data()):
            s += '\nE.g: %s' % str(self.get_data()[0])
        return s

    def get_matrix(self):
        """
        :returns: matrix *M*
        """
        if not self._matrix.get():
            self.create_matrix()
        return self._matrix

    def get_matrix_similarity(self):
        """
        :returns: the self-similarity matrix
        """
        return self._matrix_similarity

    def set_data(self, data):
        """
        Sets the raw dataset (input for matrix *M*)

        :param data: a Dataset class (list of tuples <value, row, col>)
        :type data: Data
        """
        #self._data = Data()
        #self._data.set(data)
        self._data = data
        self._matrix_and_data_aligned = False

    def get_data(self):
        """
        :returns: An instance of Data class. The raw dataset (input for matrix *M*). 
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Add a tuple in the dataset

        :param tuple: a tuple containing <rating, user, item> information. Or, more general: <value, row, col>
        """
        self.get_data().add_tuple(tuple)
        self._matrix_and_data_aligned = False

    def load_data(self, filename, force=True, sep='\t', format={'value':0, 'row':1, 'col':2}, pickle=False):
        """
        Loads a dataset file

        See params definition in *datamodel.Data.load()*
        """
        if force:
            self._data = Data()
            self._matrix_similarity = None

        self._data.load(filename, force, sep, format, pickle)
    
    def save_data(self, filename, pickle=False):
        """
        Saves the dataset in divisi2 matrix format (i.e: value <tab> row <tab> col)

        :param filename: file to store the data
        :type filename: string
        :param pickle: save in pickle format?
        :type pickle: boolean
        """
        self._data.save(filename, pickle)

    def create_matrix(self):
        if VERBOSE:
            sys.stdout.write('Creating matrix (%s tuples)\n' % len(self._data))
        try:
            self._matrix.create(self._data.get())
        except AttributeError:
            self._matrix.create(self._data)

        if VERBOSE:
            sys.stdout.write("Matrix density is: %s%%\n" % self._matrix.density())
        self._matrix_and_data_aligned = True

    def compute(self, min_values=None):
        if self._matrix.empty() and (not isinstance(self._data, list) and not self._data.get()):
            raise ValueError('No data set. Matrix is empty!')
        if self._matrix.empty() and (isinstance(self._data, list) and not self._data):
            raise ValueError('No data set. Matrix is empty!')
        if self._matrix.empty() or not self._matrix_and_data_aligned:
            self.create_matrix()

        if min_values:
            if VERBOSE:
                sys.stdout.write('Updating matrix: squish to at least %s values\n' % min_values)
            self._matrix.set(self._matrix.get().squish(min_values))

    def _get_row_similarity(self, i):
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        try:
            return self.get_matrix_similarity().get_row(i)
        except KeyError:
            raise KeyError("%s not found!" % i)

    def similar(self, i, n=10):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param n: number of similar elements
        :type n: int
        :returns: the most similar elements of *i*
        """
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        return self._get_row_similarity(i).top_items(n)

    def similarity(self, i, j):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param j: a row in *M*
        :type j: user or item id
        :returns: the similarity between the two elements *i* and *j*
        """
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        return self.get_matrix_similarity().value(i, j)

    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def recommend(self, i, n=10, only_unknowns=False, is_row=True):
        """
        Recommends items to a user (or users to an item) using reconstructed matrix :math:`M^\prime = U \Sigma_k V^T`

        E.g. if *i* is a row and *only_unknowns* is True, it returns the higher values of :math:`M^\prime_{i,\cdot}` :math:`\\forall_j{M_{i,j}=\emptyset}`

        :param i: row or col in M
        :type i: user or item id
        :param n: number of recommendations to return
        :type n: int
        :param only_unknowns: only return unknown values in *M*? (e.g. items not rated by the user)
        :type only_unknowns: Boolean
        :param is_row: is param *i* a row (or a col)?
        :type is_row: Boolean
        """
        if not self._matrix_reconstructed:
            self.compute() #will use default values!
        item = None
        zeros = []
        if only_unknowns and not self._matrix.get():
            raise ValueError("Matrix is empty! If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called")
        if is_row:
            if only_unknowns:
                zeros = self._matrix.get().row_named(i).zero_entries()
            item = self._get_row_reconstructed(i, zeros)
        else:
            if only_unknowns:
                zeros = self._matrix.get().col_named(i).zero_entries()
            item = self._get_col_reconstructed(i, zeros)
        return item.top_items(n)

    ### OTHER METHODS ###
    def _cosine(self, v1, v2):
        return float(divisi2.dot(v1,v2) / (norm(v1) * norm(v2)))

    def centroid(self, ids, are_rows=True):
        if VERBOSE:
            sys.stdout.write('Computing centroid for ids=%s\n' % str(ids))
        points = []
        for id in ids:
            if are_rows:
                point = self.get_matrix().get_row(id)
            else:
                point = self.get_matrix().get_col(id)
            points.append(point)
        M = divisi2.SparseMatrix(points)
        return M.col_op(sum)/len(points) #TODO numpy.sum seems slower?
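
Since predict() and recommend() raise NotImplementedError, a concrete algorithm subclasses Algorithm and overrides them. A minimal hedged sketch (MeanPredictor and its naive logic are illustrative, not part of the library):

class MeanPredictor(Algorithm):
    """Toy baseline: always predicts the global mean of the loaded values"""
    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        # Data() stores <value, row, col> tuples; average all values
        values = [value for value, row, col in self.get_data().get()]
        predicted = float(sum(values)) / len(values)
        if MIN_VALUE:
            predicted = max(predicted, MIN_VALUE)
        if MAX_VALUE:
            predicted = min(predicted, MAX_VALUE)
        return predicted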
Example #55
0
class RecommendSystem(object):
    def __init__(self, filename, sep, **format):
        self.filename = filename
        self.sep = sep
        self.format = format

        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # Was a saved model loaded from disk?
        self.is_load = False

        # Data handling
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """
        Load the data (or a previously saved model)
        :return: (train, test) datasets, or (None, None) if a saved model was loaded
        """
        # If no saved model exists
        if not os.path.exists(tmpfile):
            # If the data file does not exist either, give up
            if not os.path.exists(self.filename):
                sys.exit()
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            # Load the dataset with Data()
            self.data.load(self.filename, sep=self.sep, format=self.format)
            train, test = self.data.split_train_test(percent=80)
            return train, test
        else:
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

    def train(self, train):
        """
        Train the model
        :param train: training dataset
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """
        Predict a rating
        :param itemid: movie id
        :param userid: user id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)
        print "Predicted score: %f" % score
        return score

    def recommend_to_user(self, userid):
        """
        Recommend movies to a user
        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Read the movie titles from the movies file
        movie_list = []

        for line in open(moviefile, "r"):
            movie_list.append(' '.join(line.split("::")[1:2]))

        # Print the recommended movie titles with their predicted scores
        # (note: indexing movie_list by itemid assumes contiguous movie ids)
        for itemid, rate in recommend_list:
            print "Recommended %s, predicted score %s" % (movie_list[itemid], rate)
        return None

    def evaluation(self, test):
        """
        Evaluate the model
        :param test: test dataset
        :return: None
        """
        # Only evaluate when the model was trained here (not loaded from disk)
        if not self.is_load:

            # Iterate over the test tuples <rating, movie, user>
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    continue
            # Compute and report the error (RMSE)
            error = self.rmse.compute()

            print "Model RMSE: %s" % error

        return None
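
A hedged driver sketch for the class above (tmpfile and moviefile are assumed to be module-level path variables, since the class references them; the MovieLens-style arguments are illustrative):

if __name__ == "__main__":
    rs = RecommendSystem("ratings.dat", "::", col=0, row=1, value=2, ids=int)
    train, test = rs.get_data()  # (None, None) when a saved model was loaded
    rs.train(train)              # no-op if the model was loaded
    rs.evaluation(test)          # no-op if the model was loaded
    rs.recommend_to_user(1)      # user id 1 is arbitrary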