def test_save_n_load(percent_train,
                     modelKlass=SVD,
                     dataFname='/Users/jennyyuejin/recommender/Data/movieData/u.data',
                     dataFormat=None):
    """Train a model, save it, reload it, and evaluate both copies.

    The data file is loaded twice with the same unshuffled split so that the
    freshly trained model and the model restored from disk are evaluated on
    test sets produced by identical calls.

    :param percent_train: value passed as ``percent`` to ``split_train_test``
    :param modelKlass: model class; instantiated with no arguments
    :param dataFname: path of the tab-separated data file
    :param dataFormat: ``format`` dict for ``Data.load``; when None the
        default ``{'col': 0, 'row': 1, 'value': 2, 'ids': int}`` is used
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if dataFormat is None:
        dataFormat = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    model_path = './model/svd.obj.zip'
    # Single source of truth: the options used to compute the model are the
    # same ones stored next to it by save_model.
    svd_options = {
        'k': 100,
        'min_values': 5,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }

    data = Data()
    data.load(dataFname, sep='\t', format=dataFormat)

    print('------ evaluating original')
    train, test = data.split_train_test(percent=percent_train, shuffle_data=False)
    print('%s training data points; %s testing data points' % (len(train), len(test)))

    # Create, evaluate and persist the model.
    svd = modelKlass()
    svd.set_data(train)
    svd.compute(**svd_options)
    evaluate(svd, test)
    svd.save_model(model_path, svd_options)

    print('------ evaluating copy')
    # Reload the data from scratch so the copy is evaluated independently.
    data2 = Data()
    data2.load(dataFname, sep='\t', format=dataFormat)
    _, test2 = data2.split_train_test(percent=percent_train, shuffle_data=False)
    print('%s testing data points' % len(test2))

    svd_pred = modelKlass()
    svd_pred.load_model(model_path)
    evaluate(svd_pred, test2)
def ex1(dat_file='./ml-1m/ratings.dat', pct_train=0.5):
    """Fit an SVD model on a ratings file and print its RMSE and MAE.

    :param dat_file: path of the '::'-separated ratings file
    :param pct_train: value passed as ``percent`` to ``split_train_test``
    """
    # NOTE(review): confirm whether percent=0.5 means 0.5% or 50%.
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings = Data()
    ratings.load(dat_file, sep='::', format=layout)

    # Train/test split
    train, test = ratings.split_train_test(percent=pct_train)

    # Build the model from the training part only
    factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=factors,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # Accumulate prediction errors over the test part
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(rating, estimate)
            mae.add(rating, estimate)
        except KeyError:
            # Pairs that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def parse_data():
    """Load the ratings file, split it, and save the dataset as a pickle.

    The saved object is the full ``Data`` instance, not the train or test
    part returned by the split.
    """
    ratings = Data()
    ratings.load('../data/ml-1m/ratings.dat',
                 sep='::',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # 80% train, 20% test. The two parts are not used afterwards; the call is
    # kept because it may affect ``ratings`` itself (NOTE(review): confirm).
    _train, _test = ratings.split_train_test(percent=80)

    target = os.path.join(utils.get_add_dir(), 'ratings')
    ratings.save(target, pickle=True)
def train_and_save(filename):
    """Train and save an SVD model for ``filename`` unless one already exists.

    The model file name is derived from the text after the last '.' in
    ``filename`` (the "step"): ``svdn_model_<step>.zip``.

    :param filename: path of the '::'-separated data file
    """
    step = filename.split('.')[-1]
    model_name = 'svdn_model_{step}'.format(step=step)
    model_zip = model_name + '.zip'

    data = Data()
    data.load(filename, sep='::',
              format={'col': 1, 'row': 0, 'value': 2, 'ids': 'str'})
    train, _test = data.split_train_test(percent=80)

    try:
        # Constructing SVD with a file name is used here as an existence
        # probe for a previously saved model.
        SVD(model_zip)
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt and
        # SystemExit are no longer mistaken for "model not found".
        svd = SVD()
        svd.set_data(train)
        svd.compute(
            k=100,
            min_values=2,
            pre_normalize=False,
            mean_center=True,
            post_normalize=True,
            savefile=model_name,
        )
        print('Saved {name}'.format(name=model_zip))
    else:
        print('Already exists: {name}'.format(name=model_zip))
def calculate_stats_users(pct_train):
    """Fit an SVD model on 'user_data_working.csv' and print RMSE and MAE.

    :param pct_train: value passed as ``percent`` to ``split_train_test``
    """
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    dataset = Data()
    dataset.load('user_data_working.csv', sep=',', format=layout)
    train, test = dataset.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    model.compute(k=100,
                  min_values=2,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=False)

    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(rating, estimate)
            mae.add(rating, estimate)
        except KeyError:
            # Pairs that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def calculate_stats_features(pct_train):
    """Fit an SVD model on 'feature_matrix.csv' and return it with the split.

    :param pct_train: value passed as ``percent`` to ``split_train_test``
    :return: tuple ``(svd, train, test)``
    """
    matrix = Data()
    matrix.load('feature_matrix.csv',
                sep=',',
                format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = matrix.split_train_test(percent=pct_train)

    factors = 100
    model = SVD()
    model.set_data(train)
    # No minimum-value filter, centering or normalisation for this matrix.
    model.compute(k=factors,
                  min_values=0,
                  pre_normalize=None,
                  mean_center=False,
                  post_normalize=False)
    return model, train, test
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Fit an SVD model on a ratings file and print its RMSE and MAE.

    :param dat_file: path of the '::'-separated ratings file
    :param pct_train: value passed as ``percent`` to ``split_train_test``
    """
    # Meaning of the format entries:
    #   'row': 1   -> matrix rows come from column 1 of the ratings file
    #   'col': 0   -> matrix columns come from column 0 of the ratings file
    #   'value': 2 -> matrix values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers, not strings
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings = Data()
    ratings.load(dat_file, sep='::', format=layout)

    # Train/test split
    train, test = ratings.split_train_test(percent=pct_train)

    # Build the model
    factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=factors,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # Evaluate. MAE is the mean ABSOLUTE error; the original author noted a
    # value of about 1.09 here, i.e. an error of almost 1 point out of 5.
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(rating, estimate)
            mae.add(rating, estimate)
        except KeyError:
            # Pairs that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def get_mae_rmse(step):
    """Evaluate the saved model for ``step`` on its data file's test split.

    Loads ``second_train_test.dat.<step>``, splits it 80/20, loads the model
    from ``svdn_model_<step>.zip`` and predicts every test entry.

    :param step: suffix identifying both the data file and the model file
    :return: ``(mae_value, rmse_value)``; both are ``np.nan`` when no
        prediction succeeded. Returns ``None`` when the model cannot be
        loaded (kept for backward compatibility with existing callers).
    """
    data = Data()
    filename = 'second_train_test.dat.{step}'.format(step=step)
    data.load(filename, sep='::',
              format={'col': 1, 'row': 0, 'value': 2, 'ids': 'str'})
    _train, test = data.split_train_test(percent=80)

    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt and
        # SystemExit propagate instead of being read as "no model".
        return None
    print('Loading model... {step}'.format(step=step))

    # One list of (actual, predicted) pairs feeds both metrics; the original
    # kept two lists with identical contents.
    pairs = []
    for rating, item_id, user_id in test:
        try:
            pairs.append((rating, svd.predict(item_id, user_id)))
        except Exception:
            # Best effort: entries that cannot be predicted are skipped.
            continue

    if not pairs:
        return np.nan, np.nan

    mae_value = MAE(pairs).compute()
    rmse_value = RMSE(pairs).compute()
    return mae_value, rmse_value
def calculate_stats_features(pct_train):
    """Build an SVD model from 'feature_matrix.csv'.

    :param pct_train: value passed as ``percent`` to ``split_train_test``
    :return: tuple ``(svd, train, test)``
    """
    source = 'feature_matrix.csv'
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    features = Data()
    features.load(source, sep=',', format=layout)
    train, test = features.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    # Plain decomposition: no minimum-value filter, centering or normalisation.
    model.compute(k=100,
                  min_values=0,
                  pre_normalize=None,
                  mean_center=False,
                  post_normalize=False)

    return model, train, test
def calculate_stats_users(pct_train):
    """Print RMSE and MAE of an SVD model fitted on 'user_data_working.csv'.

    :param pct_train: value passed as ``percent`` to ``split_train_test``
    """
    source = 'user_data_working.csv'
    users = Data()
    users.load(source, sep=',',
               format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = users.split_train_test(percent=pct_train)

    compute_options = {
        'k': 100,
        'min_values': 2,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': False,
    }
    svd = SVD()
    svd.set_data(train)
    svd.compute(**compute_options)

    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            guess = svd.predict(item_id, user_id)
            rmse.add(rating, guess)
            mae.add(rating, guess)
        except KeyError:
            # Pairs that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def ex1(dat_file='ml-1m/ratings.dat', pct_train=0.5):
    """Print RMSE and MAE of an SVD model fitted on a ratings file.

    :param dat_file: path of the '::'-separated ratings file
    :param pct_train: value passed as ``percent`` to ``split_train_test``
    """
    data = Data()
    data.load(
        dat_file,
        sep='::',
        format={
            'col': 0,     # matrix columns come from column 0 of the file
            'row': 1,     # matrix rows come from column 1 of the file
            'value': 2,   # matrix values (Mij) come from column 2
            'ids': int,   # row and column ids are integers, not strings
        })

    # Train/test split
    train, test = data.split_train_test(percent=pct_train)

    # Build the model
    compute_options = {
        'k': 100,
        'min_values': 5,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }
    svd = SVD()
    svd.set_data(train)
    svd.compute(**compute_options)

    # Evaluate on the test part
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            guess = svd.predict(item_id, user_id)
            rmse.add(rating, guess)
            mae.add(rating, guess)
        except KeyError:
            # Pairs that raise KeyError are left out of both metrics.
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def get_data(self):
    """Return a train/test split, or load a saved model when one exists.

    ``filename`` (a name from the enclosing scope, distinct from
    ``self.filename``) is the saved-model path.

    :return: ``(train, test)`` when the data file was loaded and split, or
        ``(None, None)`` when the saved model was loaded instead
    """
    # A saved model exists: load it directly and record that it was loaded.
    if os.path.exists(filename):
        self.svd.load_model(filename)
        self.load_model = True
        return None, None

    # No model and no data file: nothing can be done, so exit.
    if not os.path.exists(self.filename):
        sys.exit()

    # Load the raw data and split it 80/20.
    dataset = Data()
    dataset.load(self.filename, sep=self.sep, format=self.format)
    train, test = dataset.split_train_test(percent=80)
    return train, test
from recsys.evaluation.prediction import RMSE, MAE from recsys.datamodel.data import Data from baseline import Baseline #Import the test class we've just created import time start_time = time.time() #rmsem = [] #for k in range(1, 11): # print str(k)+" fold..." #Dataset dat_file='ratings_user.csv' data = Data() data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2}) train, test = data.split_train_test(percent=80) print train print test ################ kNN ################ train_item = {} train_user = {} for rating, item_id, user_id in train: if item_id in train_item: train_item[item_id][user_id] = rating else: train_item[item_id] = {user_id: rating} if user_id in train_user: train_user[user_id][item_id] = rating else:
class Collaborative_filtering(object):
    """Movie recommender built on an SVD neighbourhood model.

    ``movies`` is a pandas DataFrame with at least the columns ``movie_id``
    and ``title``; its first column is what ``recommend_movies`` reports as
    the movie id.
    """

    def __init__(self, ratings_file, movies):
        """Store the inputs and initialise all result attributes to None.

        :param ratings_file: path of the comma-separated ratings file
        :param movies: DataFrame used to look up movie titles and ids
        """
        self.movies = movies
        self.K = 100               # number of factors used by compute_svd
        self.PERCENT_TRAIN = 85    # training share used by compute_svd
        self.ratings_file = ratings_file
        self.data = None
        self.svd = None
        self.recommend_movies_list = None
        self.recommend_movies_ids = None
        self.similar_movies_list = None
        self.similar_movies_ids = None
        self.movie_id = None
        self.train = None
        self.test = None

    def compute_svd(self):
        """Load the ratings, split them, and compute the model.

        Sets ``self.data``, ``self.train``, ``self.test`` and ``self.svd``.
        """
        self.data = Data()
        self.data.load(self.ratings_file,
                       sep=',',
                       format={'col': 0, 'row': 1, 'value': 2, 'ids': float})
        # Use the instance settings instead of repeating 85 and 100 here.
        self.train, self.test = self.data.split_train_test(
            percent=self.PERCENT_TRAIN)
        self.svd = SVDNeighbourhood()
        self.svd.set_data(self.train)
        self.svd.compute(k=self.K,
                         min_values=1,
                         pre_normalize=None,
                         mean_center=False,
                         post_normalize=True)

    def similarity_measure(self, movie1, movie2):
        """Return the model's similarity of two movies, rounded to 4 places."""
        return round(self.svd.similarity(movie1, movie2), 4)

    def _matching_movies(self, movie_id):
        """Return the rows of ``self.movies`` whose movie_id equals movie_id."""
        return self.movies.loc[self.movies['movie_id'] == movie_id]

    def _title_for(self, movie_id):
        """Return the title of the first matching movie, or '' if none match.

        The stored value is read directly; parsing the printed form of the
        column truncates long titles.
        """
        titles = self._matching_movies(movie_id)['title'].tolist()
        return titles[0] if titles else ''

    def recommend_movies(self, user_id):
        """Recommend up to 10 movies for ``user_id``.

        :return: ``(titles, ids)`` as two parallel lists; both are also
            stored on the instance
        :raises IndexError: if a recommended id has no row in ``self.movies``
        """
        recommendations = self.svd.recommend(user_id, n=10,
                                             only_unknowns=True, is_row=False)
        self.recommend_movies_list = []
        self.recommend_movies_ids = []
        for entry in recommendations:
            self.recommend_movies_list.append(self._title_for(entry[0]))
            # values.tolist() is kept so the id comes from the first column
            # of the matching row, exactly as before.
            rows = self._matching_movies(entry[0]).iloc[:, 0:2].values.tolist()
            self.recommend_movies_ids.append(rows[0][0])
        return self.recommend_movies_list, self.recommend_movies_ids

    def get_similar_movies(self, movie1):
        """Return movies similar to ``movie1`` as Python lists.

        The first entry returned by the model is dropped.

        :param movie1: movie id; converted with ``int``
        :return: ``(titles, ids)`` as two parallel lists; both are also
            stored on the instance
        """
        movie1 = int(movie1)
        neighbours = self.svd.similar(movie1)[1:]
        self.similar_movies_list = []
        self.similar_movies_ids = []
        for entry in neighbours:
            self.similar_movies_list.append(self._title_for(entry[0]))
            self.similar_movies_ids.append(entry[0])
        return self.similar_movies_list, self.similar_movies_ids
#To show some messages: import recsys.algorithm recsys.algorithm.VERBOSE = True from recsys.algorithm.factorize import SVD from recsys.datamodel.data import Data from recsys.utils.svdlibc import SVDLIBC from recsys.evaluation.prediction import RMSE, MAE #Dataset PERCENT_TRAIN = int(sys.argv[2]) data = Data() data.load(sys.argv[1], sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int}) #Train & Test data train, test = data.split_train_test(percent=PERCENT_TRAIN) svdlibc = SVDLIBC('./ml-1m/ratings.dat') svdlibc.to_sparse_matrix(sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int}) svdlibc.compute(k=100) svd = svdlibc.export() svd.save_model('/tmp/svd-model', options={'k': 100}) #svd.similar(ITEMID1) # results might be different than example 4. as there's no min_values=10 set here #Evaluation using prediction-based metrics print 'Evaluating...' rmse = RMSE() mae = MAE() for rating, item_id, user_id in test.get(): try:
svd = SVD() svd.load_data(filename='./data/ratings.dat', sep='::', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int }) #Haciendo el split al dataset filename = './data/ratings.dat' data = Data() format = {'col': 0, 'row': 1, 'value': 2, 'ids': int} data.load(filename, sep='::', format=format) train_80, test_20 = data.split_train_test(percent=80) # 80% train, 20% test svd = SVD() svd.set_data(train_80) #Ingresando variables para crear la matrizx k = 100 svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True) k = 100 svd.compute(k=k, min_values=10, pre_normalize=None,
class RecommendSystem(object):
    """SVD recommender that trains on a data file or reuses a saved model.

    The methods read two names from the enclosing module: ``tmpfile`` (path
    of the saved model) and ``moviefile`` (path of the movie listing).
    """

    def __init__(self, filename, sep, **format):
        """Store the data-file settings and create the helper objects.

        :param filename: path of the ratings data file
        :param sep: field separator used in the data file
        :param format: keyword options passed as ``format`` to ``Data.load``
        """
        self.filename = filename
        self.sep = sep
        self.format = format
        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True
        self.svd = SVD()
        # Becomes True once a saved model has been loaded
        self.is_load = False
        # Data handling
        self.data = Data()
        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """Return a train/test split, or load the saved model if it exists.

        :return: ``(train, test)`` when the data file was loaded and split,
            or ``(None, None)`` when the saved model was loaded instead
        """
        # A saved model exists: load it and skip the data entirely.
        if os.path.exists(tmpfile):
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

        # Neither a model nor the data file exists: exit.
        if not os.path.exists(self.filename):
            sys.exit()

        self.data.load(self.filename, sep=self.sep, format=self.format)
        train, test = self.data.split_train_test(percent=80)
        return train, test

    def train(self, train):
        """Compute and save the model unless a saved model was loaded.

        :param train: training data
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            # tmpfile without its last four characters is the save name
            # (presumably dropping a ".zip" extension; confirm).
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """Predict, print and return the score of ``itemid`` for ``userid``.

        :param itemid: movie id
        :param userid: user id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)
        print("推荐的分数为:%f" % score)
        return score

    def recommend_to_user(self, userid):
        """Print the recommended movies and predicted scores for a user.

        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Read the movie names; the file is closed as soon as it is read
        # (the original left the handle open).
        with open(moviefile, "r") as listing:
            movie_list = [' '.join(line.split("::")[1:2]) for line in listing]

        # NOTE(review): titles are looked up by list position (line number),
        # not by the id in the file's first field; confirm that item ids
        # match line positions.
        for itemid, rate in recommend_list:
            print("给您推荐了%s,我们预测分数为%s" % (movie_list[itemid], rate))
        return None

    def evaluation(self, test):
        """Print the RMSE of the model over the test set.

        Nothing is evaluated when the model was loaded from disk rather than
        trained in this run.

        :param test: test set yielding ``(value, itemid, userid)`` tuples
            from its ``get()`` method
        :return: None
        """
        if not self.is_load:
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    # Pairs that raise KeyError are left out of the metric.
                    continue
            # Root-mean-square error over the accumulated pairs
            error = self.rmse.compute()
            print("模型误差为%s:" % error)
        return None
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

filename = "./data/ratings.dat"
data = Data()

# Meaning of the format entries:
#   'row': 1   -> matrix rows come from the second column of ratings.dat
#   'col': 0   -> matrix columns come from the first column of ratings.dat
#   'value': 2 -> matrix values (Mij) come from the third column
#   'ids': int -> row and column ids are integers, not strings
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data.load(filename, sep="::", format=format)

# 80% train, 20% test
train, test = data.split_train_test(percent=80)

svd = SVD()
svd.set_data(train)

# Predicted value for ids (22, 22); MIN_VALUE/MAX_VALUE presumably bound the
# result (confirm against the library documentation).
print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))

# Top-10 recommendations for id 1: first with only_unknowns=True, then with
# only_unknowns=False.
for only_unknowns in (True, False):
    print(svd.recommend(1, n=10, only_unknowns=only_unknowns, is_row=False))
print 'RMSE=%s' % rmse.compute() print 'MAE=%s' % mae.compute() if __name__ == '__main__': #Dataset PERCENT_TRAIN = 100 data = Data() data.load('/Users/jennyyuejin/recommender/Data/test_0/userProd.data', sep='\t', format={'col':0, 'row':1, 'value':2, 'ids':int}) #Train & Test data train, test = data.split_train_test(percent=PERCENT_TRAIN, shuffle_data=True) print len(train), 'training data points;', len(test), 'testing data points' itemId = 0 item = Item(itemId) item.add_data({'name': 'project0', 'popularity': 0.5, 'tags': [0, 0, 1] }) itemId = 1 item2 = Item(itemId) item2.add_data({'name': 'project1', 'popularity': 0.9, 'tags': [0, 0, 1] })
'col': 0, 'row': 1, 'value': 2, 'ids': int }) rmse_svd_all = [] mae_svd_all = [] rmse_svd_neig_all = [] mae_svd_neig_all = [] RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] for run in RUNS: print 'RUN(%d)' % run #Train & Test data train, test = data.split_train_test(percent=PERCENT_TRAIN) svd.set_data(train) svd_neig.set_data(train) #Compute SVD svd.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True) svd_neig.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True)