def SVDloadData():
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = '/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat'
    svd.load_data(filename=dat_file, sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    print svd.get_matrix()
    return svd

def SVDtrain2(data, pct_train):
    train, test = data.split_train_test(percent=pct_train)
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                post_normalize=True)
    return svd, train, test

def getSimilarityMatrix(svd_model_file):
    """Returns the similarity matrix from svd_model_file."""
    # Import SVD from file
    svd = SVD()
    svd.load_model(svd_model_file)
    return svd.get_matrix_similarity()

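# A minimal companion sketch (not from the original sources): building and
# saving a model file that getSimilarityMatrix() above can consume. The
# function name and the paths 'ratings.dat' / 'svd_model' are hypothetical.
def buildSimilarityModel(dat_file='ratings.dat', model_file='svd_model'):
    svd = SVD()
    svd.load_data(filename=dat_file, sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=5, pre_normalize=None, mean_center=True,
                post_normalize=True)
    # save_model() writes a zip archive that SVD.load_model() reads back
    svd.save_model(model_file)
    return model_file
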
def build_model(self, uids, kn):
    data = Data()
    for uid, songs in uids.items():
        for song in songs:
            data.add_tuple((1, song, uid))
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=kn, min_values=1)
    self.model = svd

def setup():
    global users, items, svd
    print 'Reading items...'
    items = _read_items(os.path.join(MOVIELENS_DATA_PATH, 'movies.dat'))
    users = []
    svd = SVD()
    svd.load_data(filename=os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat'),
                  sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

def calculate_SVD_features():
    print "Thanks for input, calculating..."
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'feature_matrix.csv'
    svd.load_data(filename=dat_file, sep=',',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=True)
    return svd

def getSVD():
    # the existence check and the load path must point at the same model;
    # save_model()/savefile= writes the model as <name>.zip
    model_file = './model/movielens'
    if os.path.exists(model_file + '.zip'):
        return SVD(filename=model_file)
    else:
        svd = SVD()
        svd.load_data(filename='./data/movielens/ratings.dat', sep='::',
                      format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        k = 100
        svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
                    post_normalize=True, savefile=model_file)
        return svd

def train_svd(data):
    """
    Load the processed data and model it using Singular Value Decomposition.

    :return: SVD model
    """
    svd = SVD()
    svd.set_data(get_data_model_matrix(data))
    k = 30
    svd.compute(k=k, min_values=0, pre_normalize=None, mean_center=True,
                post_normalize=True)
    return svd

def get_model(model_name, datasource_name, start, end, model_params):
    if model_name not in model_data:
        model_data[model_name] = (datasource_name, start, end, model_params)
    if not os.path.exists(model_dir + model_name):
        # initialize model with new data
        svd = SVD()
        svd.load_data(filename=data_dir + datasource_name + '.csv', sep=',',
                      format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        models[model_name] = svd
    else:
        if model_name not in models:
            models[model_name] = SVD(filename=model_dir + model_name)

def calculate_stats_features(pct_train):
    dat_file = 'feature_matrix.csv'
    data = Data()
    data.load(dat_file, sep=',',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = data.split_train_test(percent=pct_train)
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=False)
    return svd, train, test

def calculate_SVD_users():
    print "Thanks for input, calculating..."
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'user_data_working.csv'
    svd.load_data(filename=dat_file, sep=',',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                post_normalize=True)
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return svd

def create_svd_model(train):
    """Build SVD model."""
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=0, pre_normalize=None, mean_center=True,
                post_normalize=True)
    return svd

def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
    # create a data file in MovieLens format with the tastings data
    self.save_tastings_to_movielens_format_file(tastings)
    # for logging/testing purposes we may like this verbose
    if verbose:
        recsys.algorithm.VERBOSE = True
    svd = SVD()
    # load source data, perform SVD, save to zip file
    source_file = self.file_location(self.tastings_movielens_format)
    svd.load_data(filename=source_file, sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    outfile = self.file_location(self.tastings_recsys_svd)
    svd.compute(k=k, min_values=min_values, pre_normalize=None,
                mean_center=True, post_normalize=True, savefile=outfile)
    return svd

def __init__(self):
    # Dataset
    data = Data()
    self.filename = "emag"
    # loading a previously saved model is disabled for now ("if False")
    if False and os.path.isfile(self.filename + ".zip"):
        svd = SVD(filename=self.filename)
    else:
        svd = SVD()
        svd.set_data(data)
        # svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
        #             post_normalize=True, savefile="svd")
    self.svd = svd
    self.iterations = 0

def export(self):
    # http://tedlab.mit.edu/~dr/SVDLIBC/SVD_F_DT.html
    # only importing default 'dt' S, Ut and Vt (dense text output matrices)
    PREFIX = self._svd_prefix
    file_Ut = PREFIX + '-Ut'
    file_Vt = PREFIX + '-Vt'
    file_S = PREFIX + '-S'
    # Not really used:
    file_U = PREFIX + '-U'
    file_V = PREFIX + '-V'
    # Read matrices files (U, S, Vt) using CSV (it's much faster than
    # numpy.loadtxt()!)
    try:
        Ut = array(list(csv.reader(open(file_Ut), delimiter=' '))[1:]).astype('float')
        U = Ut.transpose()
    except:
        U = array(list(csv.reader(open(file_U), delimiter=' '))[1:]).astype('float')
    try:
        Vt = array(list(csv.reader(open(file_Vt), delimiter=' '))[1:]).astype('float')
        V = Vt.transpose()
    except:
        V = array(list(csv.reader(open(file_V), delimiter=' '))[1:]).astype('float')
        # Vt = V.transpose()
    _S = array(list(csv.reader(open(file_S), delimiter=' '))[1:]).astype('float')
    S = _S.reshape(_S.shape[0], )
    PREFIX_INDEXES = PREFIX + '.ids.'
    file_U_idx = PREFIX_INDEXES + 'rows'
    file_V_idx = PREFIX_INDEXES + 'cols'
    try:
        U_idx = [int(idx.strip()) for idx in open(file_U_idx)]
    except:
        U_idx = [idx.strip() for idx in open(file_U_idx)]
    try:
        V_idx = [int(idx.strip()) for idx in open(file_V_idx)]
    except:
        V_idx = [idx.strip() for idx in open(file_V_idx)]
    # Check no duplicated IDs!!!
    assert len(U_idx) == len(OrderedSet(U_idx))
    assert len(V_idx) == len(OrderedSet(V_idx))
    # Create SVD
    svd = SVD()
    svd._U = DenseMatrix(U, OrderedSet(U_idx), None)
    svd._S = S
    svd._V = DenseMatrix(V, OrderedSet(V_idx), None)
    svd._matrix_similarity = svd._reconstruct_similarity()
    svd._matrix_reconstructed = svd._reconstruct_matrix()
    return svd

def process_svd(preload):
    if preload:
        # Load an already computed SVD model
        svd = SVD(filename='./data/svd-all')
    else:
        print "Reading data..."
        svdlibc = SVDLIBC('./data/behavior-ml-score.csv')
        svdlibc.to_sparse_matrix(sep=',',
                                 format={'col': 0, 'row': 1, 'value': 2,
                                         'ids': str})
        k = 100
        print "Computing SVD..."
        svdlibc.compute(k)
        svd = svdlibc.export()
        svd.save_model('./data/svd-all', options={'k': k})
    # svd.predict('TV268', 9, 1, 3)
    return svd

def get_movie(movie_id):
    movie = {}
    rating = 0
    with sqlite3.connect('data/data100.db') as con:
        cur = con.cursor()
        cur.execute("SELECT * FROM movies WHERE movie_id = ?", (movie_id,))
        movie_result = cur.fetchone()
        cur.execute("SELECT director FROM movie_directors WHERE movie_id = ?",
                    (movie_id,))
        directors = cur.fetchall()
        cur.execute("SELECT actor FROM movie_actors WHERE movie_id = ?",
                    (movie_id,))
        actors = cur.fetchall()
        cur.execute("SELECT writer FROM movie_writers WHERE movie_id = ?",
                    (movie_id,))
        writers = cur.fetchall()
        cur.execute("SELECT genre FROM movie_genres WHERE movie_id = ?",
                    (movie_id,))
        genres = cur.fetchall()
        if 'session_user' in request.cookies:
            cur.execute("SELECT * FROM ratings WHERE user_id = ? AND movie_id = ?",
                        (request.get_cookie('session_user', secret='recsys')[0],
                         movie_id,))
            rating = cur.fetchone()
        cur.execute("SELECT * FROM ratings")
        rating_results = cur.fetchall()
        rating_data = Data()
        rating_data.set(rating_results)
        # with open('data/tmp.dat', 'a') as f:
        #     for l in rating_results:
        #         f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
        svd = SVD()
        # svd.load_data(filename='data/tmp.dat', sep=',',
        #               format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        svd.set_data(rating_data)
        # the factorization must be computed before similar() can be called
        svd.compute(k=100, min_values=0)
        similar_list = [str(s[0]) for s in svd.similar(int(movie_id))]
        cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)"
                    % (', '.join(similar_list)))
        similar_movies = cur.fetchall()
        movie = {
            'mid': movie_result[0],
            'title': movie_result[1],
            'description': movie_result[2],
            'image': movie_result[3],
            'year': movie_result[4],
            'directors': [d[0] for d in directors],
            'writers': [w[0] for w in writers],
            'actors': [a[0] for a in actors],
            'genres': [g[0] for g in genres],
            'rating': rating,
            'similar_movies': similar_movies,
        }
    session_user = (request.get_cookie('session_user', secret='recsys')
                    if 'session_user' in request.cookies else None)
    return template('static/movie.html', movie=movie, session_user=session_user)

def build_svd_item_based(user_op_item_cnt, item_op_users, user_idx, item_idx,
                         min_nonzero):
    svd = SVD()
    data = Data()
    item_lst = []
    for ui in user_op_item_cnt:
        if len(user_op_item_cnt[ui]) < min_nonzero:
            continue
        for ti in user_op_item_cnt[ui]:
            if item_op_users[ti] < min_nonzero:
                continue
            if 1.0 * user_op_item_cnt[ui][ti] < 1:
                continue
            item_lst.append(ti)
            data.add_tuple((1.0 * user_op_item_cnt[ui][ti],
                            item_idx[ti], user_idx[ui]))
    item_lst = list(set(item_lst))
    svd.set_data(data)
    return svd, item_lst

def test_classifier(model, filename=None, itemkey="track",
                    selector="SELECT * FROM train"):
    conn = sqlite3.connect("db.sqlite")
    conn.row_factory = dict_factory
    cur = conn.cursor()
    s = 0
    c = 0
    t_p = 0
    for i in range(0, 10):
        svd = SVD()
        if filename is not None:
            svd.load_model(filename)
        l = list(cur.execute(selector))
        random.shuffle(l)
        count = len(l)
        # use itemkey consistently for both the train and the test split
        svd.set_data([(x["rating"], x[itemkey], x["user"])
                      for x in l[0:int(count * 0.7)]])
        K = 1000
        svd.compute(k=K, min_values=0.0, pre_normalize=None, mean_center=True,
                    post_normalize=True)
        pairs = []
        for idx, item in enumerate(l[int(count * 0.7):]):
            user = item["user"]
            track = item[itemkey]
            pairs.append((predict_item(svd, track, user), item["rating"]))
        t_p += len(pairs)
        s += RMSE(pairs).compute()
        c += 1.0
        print "iteration"
    print s / c, t_p

def recommended_files(data, user):
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user)]
    # recoms = svd.recommend(user, is_row=True, only_unknowns=True, n=50)
    predict_arr = []
    user_tths = db.user_list.find({'user': user})
    tths = [i['tth'] for i in user_tths]
    movie_names = []
    for i in similar_users[1:]:
        for j in db.user_list.find({'user': i}):
            if j['tth'] not in tths:
                movie_name = db.tths.find_one({'tth': j['tth']})['name']
                movie_names.append(movie_name)
                tths.append(j['tth'])
                predict_arr.append((movie_name, j['tth'],
                                    svd.predict(user, j['tth'])))
    predict_arr = sorted(predict_arr, key=lambda x: x[2], reverse=True)
    # keep (name, tth) pairs so near-duplicate titles can be filtered out
    res = []
    for p in predict_arr:
        if not any(similar(p[0], r[0]) for r in res):
            res.append(p)
        if len(res) > 10:
            return [r[1] for r in res]

def get_mae_rmse(step):
    data = Data()
    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    filename = 'second_train_test.dat.{step}'.format(step=step)
    data.load(filename, sep='::', format=format)
    train, test = data.split_train_test(percent=80)
    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))
    except:
        return
    mae_predicted, rmse_predicted = [], []
    for rating, item_id, user_id in test:
        try:
            predicted = svd.predict(item_id, user_id)
            mae_predicted.append((rating, predicted))
            rmse_predicted.append((rating, predicted))
        except:
            pass
    mae_value, rmse_value = np.nan, np.nan
    if len(mae_predicted) > 0:
        mae = MAE(mae_predicted)
        mae_value = mae.compute()
    if len(rmse_predicted) > 0:
        rmse = RMSE(rmse_predicted)
        rmse_value = rmse.compute()
    return mae_value, rmse_value

def evaluate(data, count=5, K=100):
    results = []
    for i in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        print len(data.get()), len(train.get()), len(test.get())
        # test_in_train(test, train)
        # print train.get()
        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                    post_normalize=True)
        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                continue
        try:
            rsu = {}
            rsu["RMSE"] = rmse.compute()
            rsu["MAE"] = mae.compute()
            print rsu
            results.append(rsu)
        except:
            print "error computing evaluation metrics"
    return results

def reCompute(user_id):
    fname = 'ratings.dat'
    dataset = Data()
    format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    dataset.load(fname, sep=':', format=format)
    svd = SVD()
    svd.set_data(dataset)
    k = 100
    svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
                post_normalize=True)
    # New ID of the added user
    USERID = user_id
    a = svd.recommend(USERID, is_row=False)
    for j in range(1, len(a)):
        k = a[j][0]  # pandas query below references this local via @k
        print df_movies.query('movie_id==@k')

def calculate_stats_users(pct_train):
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file, sep=',',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = data.split_train_test(percent=pct_train)
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()

def similar_users(user):
    if not isinstance(user, str):
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        f = open('./dc_recom.dat', 'a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'] + '\n')
        f.close()
        db.done_users.update({'user': user},
                             {'user': user, 'recommended': True})
    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=True)
    return [i[0] for i in svd.similar(user)]

def compute_SVD():
    svd = SVD()
    svd.set_data(load_data())
    K = 100
    svd.compute(k=K, min_values=10, pre_normalize=None, mean_center=True,
                post_normalize=True, savefile=None)
    svd.save_model(os.path.join(utils.get_add_dir(), 'ratings'))

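# A minimal read-side sketch for compute_SVD() above (not from the original
# sources), assuming the same 'ratings' path; item_id and user_id are
# placeholder ids. SVD(filename=...) restores the zip written by save_model().
def load_SVD(item_id, user_id):
    svd = SVD(filename=os.path.join(utils.get_add_dir(), 'ratings'))
    print svd.predict(item_id, user_id)  # predicted rating
    print svd.similar(item_id, n=10)     # ten nearest items by similarity
    return svd
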
def main():
    svd = SVD()
    train = Data()
    test = Data()
    train.load('randUser/rate1.csv', force=True, sep=',',
               format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    test.load('randUser/rate1.csv', force=True, sep=',',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.set_data(train)
    svd.compute(k=100, min_values=0.5, pre_normalize=False, mean_center=True,
                post_normalize=True)
    # rmse = RMSE()
    # mae = MAE()
    # for rating, item_id, user_id in test.get():
    #     try:
    #         pred_rating = svd.predict(item_id, user_id)
    #         rmse.add(rating, pred_rating)
    #         mae.add(rating, pred_rating)
    #     except KeyError:
    #         continue
    # print 'RMSE=%s' % rmse.compute()
    # print 'MAE=%s' % mae.compute()
    # test = make_test()
    # print precision_and_recall(test, svd)
    # rec_list = svd.recommend(200, n=5, only_unknowns=False, is_row=False)
    print svd.recommend(1, n=5, only_unknowns=False, is_row=False)

def ex1(dat_file='./ml-1m/ratings.dat', pct_train=0.5):
    data = Data()
    data.load(dat_file, sep='::',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    # create train/test split
    train, test = data.split_train_test(percent=pct_train)
    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                post_normalize=True)
    # evaluate performance
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()

def __init__(self, rating_file='ratings_small.csv', movie_file='movies.csv',
             detail_file='modified.csv', model='movielens_small'):
    self.start = True
    self.rating_file = rating_file
    self.movie_file = movie_file
    self.detail_file = detail_file
    self.svd = SVD(filename=model)
    self.svd.load_data(filename=rating_file, sep=',',
                       format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    self.svd.create_matrix()
    self.ia = imdb.IMDb(accessSystem='http')

def __init__(self, train, test, remap, svd_train):
    self.train_set = train
    self.test_set = test
    self.remap = self._get_remap(remap)
    self.W = self._reverse_user_item()
    self.svd = SVD()
    self.svd.set_data(svd_train)
    self.svd.compute(k=10, min_values=0, pre_normalize=None,
                     mean_center=False, post_normalize=True)

def build_svd_cat_based(user_op_cat_cnt, cat_op_users, user_idx, cat_idx,
                        min_nonzero):
    svd = SVD()
    data = Data()
    cat_lst = []
    for ui in user_op_cat_cnt:
        if len(user_op_cat_cnt[ui]) < min_nonzero:
            continue
        for ci in user_op_cat_cnt[ui]:
            if cat_op_users[ci] < min_nonzero:
                continue
            if 1.0 * user_op_cat_cnt[ui][ci] < 1:
                continue
            cat_lst.append(ci)
            data.add_tuple((1.0 * user_op_cat_cnt[ui][ci],
                            cat_idx[ci], user_idx[ui]))
    cat_lst = list(set(cat_lst))
    print 'cat =', len(cat_lst)
    svd.set_data(data)
    return svd, cat_lst

def __init__(self, filename, sep, **format):
    # Source file info
    self.filename = filename
    self.sep = sep
    self.format = format
    # Initialize the matrix factorization
    self.svd = SVD()
    # Matrix parameters
    self.k = 100            # number of latent factors
    self.min_values = 10    # drop movies rated by fewer than 10 people
    self.post_normalize = False
    # Flag marking whether a saved model has been loaded
    self.load_model = False
    # Initialize the root-mean-square error metric
    self.rmse = RMSE()

def load_recsys_svd(self):
    from recsys.algorithm.factorize import SVD
    svd = []
    # if there's an svd file, load it - otherwise we're out of luck, as
    # we don't want to build these matrices at runtime!
    tastings_svd_file = self.file_location(self.tastings_recsys_svd)
    if os.path.isfile(tastings_svd_file):
        svd = SVD(tastings_svd_file)
    # return the recsys SVD object, ready to make some recommendations...
    return svd

def get_feeds():
    movielist = {}
    with sqlite3.connect('data/data100.db') as con:
        cur = con.cursor()
        cur.execute("SELECT * FROM ratings WHERE user_id = ?",
                    (request.get_cookie('session_user', secret='recsys')[0],))
        if cur.fetchone():
            cur.execute("SELECT ratings, movie_id, user_id FROM ratings")
            rating_results = cur.fetchall()
            rating_data = Data()
            rating_data.set(rating_results)
            # with open('data/tmp.dat', 'a') as f:
            #     for l in rating_results:
            #         f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
            svd = SVD()
            # svd.load_data(filename='data/tmp.dat', sep=',',
            #               format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
            svd.set_data(rating_data)
            # the factorization must be computed before recommend() can be called
            svd.compute(k=100, min_values=0)
            recommendations = [str(s[0]) for s in svd.recommend(
                request.get_cookie('session_user', secret='recsys')[0],
                is_row=False)]
            cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)"
                        % (', '.join(recommendations)))
            similar_movies = cur.fetchall()
            for m in similar_movies:
                movielist[m] = {'mid': m[0], 'title': m[1],
                                'description': m[2], 'image': m[3],
                                'year': m[4]}
        else:
            cur.execute("SELECT * FROM movies")
            movies = cur.fetchall()
            for m in movies:
                cur.execute("SELECT AVG(ratings) FROM ratings WHERE movie_id = ?",
                            (m[0],))
                avg = cur.fetchone()[0]
                movielist[avg] = {'mid': m[0], 'title': m[1],
                                  'description': m[2], 'image': m[3],
                                  'year': m[4]}
    session_user = (request.get_cookie('session_user', secret='recsys')
                    if 'session_user' in request.cookies else None)
    return template('static/feeds.html', movielist=movielist,
                    session_user=session_user)

def compute(aws_region, s3_bucket, filename, sep, col_index, row_index,
            value_index, ids_type):
    download_from_s3(aws_region, s3_bucket, filename)
    svd = SVD()
    print 'Loading data to SVD module'
    svd.load_data(filename='./data/' + filename, sep=sep,
                  format={'col': int(col_index), 'row': int(row_index),
                          'value': int(value_index), 'ids': ids_type})
    k = derive_latent_dimensions(svd, energy_level=0.6)
    print 'Starting to compute SVD at', strftime("%Y-%m-%d %H:%M:%S", gmtime())
    svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
                post_normalize=True, savefile='./models/recommender')
    print "SVD model saved at", strftime("%Y-%m-%d %H:%M:%S", gmtime())
    sys.exit()  # to make sure that the process finishes at the end

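# derive_latent_dimensions() is not shown in these sources. A plausible
# reading, sketched here as an assumption: pick the smallest k whose squared
# singular values cover the requested share of the total energy. The
# singular-value array would have to come from a prior factorization
# (e.g. svd._S, the same internal attribute the export() snippet above reads).
import numpy as np

def choose_k_from_singular_values(singular_values, energy_level=0.6):
    # hypothetical helper, not part of python-recsys
    energies = np.asarray(singular_values, dtype=float) ** 2
    cumulative = np.cumsum(energies) / energies.sum()
    # first index whose cumulative energy share reaches the requested level
    return int(np.searchsorted(cumulative, energy_level) + 1)
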
def __init__(self, filename, sep, **format):
    self.filename = filename
    self.sep = sep
    self.format = format
    # Training parameters
    self.k = 100
    self.min_values = 10
    self.post_normalize = True
    self.svd = SVD()
    # Flag marking whether a saved model has been loaded
    self.is_load = False
    # Data handling
    self.data = Data()
    # Model evaluation
    self.rmse = RMSE()

def recommend(dimension=100):
    svd = SVD()
    svd.load_data(filename='rating.dat', sep='\t',
                  format={'col': 2, 'row': 1, 'value': 0, 'ids': int})
    k = dimension
    svd.compute(k=k, min_values=1, pre_normalize=None, mean_center=True,
                post_normalize=True)
    game_recdict = {}
    for item in svd.recommend(1, is_row=False):
        appid = item[0]
        game = Game(appid)
        if game.success == 1:
            game_recdict[game.rec] = [game.appid, game.genre, game.name,
                                      game.img]
    sorted_list = sorted(game_recdict.keys(), reverse=True)
    print "Games Recommended:"
    for i in sorted_list:
        # image
        urllib.urlretrieve(game_recdict[i][3], "local-filename.jpg")
        image = plt.imread("local-filename.jpg")
        plt.imshow(image)
        plt.show()
        # name
        print game_recdict[i][2]

def __init__(self, sc,
             datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
             rating_file='ratings_small.csv',
             complete_rating_file='ratings.csv',
             movie_file='movies.csv',
             detail_file='modified.csv',
             model='movielens_small'):
    self.sc = sc
    self.start = True
    self.rating_file = datapath + rating_file
    self.complete_rating_file = datapath + complete_rating_file
    self.movie_file = datapath + movie_file
    self.detail_file = datapath + detail_file
    self.integration_folder = datapath
    self.svd = SVD(filename=datapath + model)
    self.svd.load_data(filename=self.rating_file, sep=',',
                       format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    self.svd.create_matrix()
    self.ia = imdb.IMDb(accessSystem='http')
    # ALS stuff
    self.sqlContext = SQLContext(self.sc)
    self.movie_data = self.sc.textFile(self.movie_file)
    self.ratings_data = (self.sc.textFile(self.complete_rating_file)
                         .map(lambda line: line.split(","))
                         .map(lambda x: (int(x[0]), int(x[1]), float(x[2]))))
    self.als_model_path = datapath + 'Model_Collaborative_Filtering'
    self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
    self.movie_df = self.sqlContext.read.load(datapath + 'tables/movies')
    self.detail_df = self.sqlContext.read.load(datapath + 'tables/detail')
    self.rating_df = self.sqlContext.read.load(datapath + 'tables/ratings')

def setup_svd(self, vote_list):
    if self.svd is None:
        self.cache['svd'] = SVD()
        data = Data()
        for vote in vote_list:
            user_id = vote[0].id
            item_id = vote[1]
            value = float(vote[2])
            # Tuple format is: <value, row, column>
            data.add_tuple((value, item_id, user_id))
        self.cache['svd'].set_data(data)
        self.cache['svd'].compute(k=self.k, min_values=1)
    return self.svd

def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    data = Data()
    data.load(dat_file, sep='::',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    # About the format parameter:
    #   'row': 1   -> rows in the matrix come from column 1 of ratings.dat
    #   'col': 0   -> cols in the matrix come from column 0 of ratings.dat
    #   'value': 2 -> values (Mij) in the matrix come from column 2
    #   'ids': int -> row and col ids are integers (not strings)
    # create train/test split
    train, test = data.split_train_test(percent=pct_train)
    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                post_normalize=True)
    # evaluate performance
    rmse = RMSE()
    # MAE is mean ABSOLUTE error; in this case it returns about 1.09, which
    # means an error of almost 1 point out of 5
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()

def Compute():
    svd = SVD()
    svd.load_data(filename='./ml-1m/ratings.dat', sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=10, pre_normalize=None, mean_center=True,
                post_normalize=True, savefile='./mvsvd')

def color_user(input_file, output_file, data_file):
    data = Data()
    # VALUE = 1.0
    # for username in likes:
    #     for user_likes in likes[username]:
    #         data.add_tuple((VALUE, username, user_likes))
    # Tuple format is: <value, row, column>
    # Read every user's history and build a matrix that SVD can factorize
    f_r = open(data_file, 'r')
    for line in f_r:
        info = line.strip().split(',')
        data.add_tuple((1.0, info[0], info[1]))
    svd = SVD()
    svd.set_data(data)
    k = 5  # Usually, in a real dataset, you should set a higher number, e.g. 100
    svd.compute(k=k, min_values=3, pre_normalize=None, mean_center=False,
                post_normalize=True)
    # Read the user ids that need recommendations from the question file
    fr = open(input_file, 'r')
    # Save every user id with similarity above 50% to the answer file
    fw = open(output_file, 'w')
    for line in fr:
        userid = line.strip()
        user_list = svd.similar(userid)
        # print(user_list)
        del user_list[0]  # drop the queried user's own id
        for user in user_list:
            if user[1] > 0.5:
                fw.write(user[0] + '\n')
    fw.close()

def evaulte(train_set, test_set):
    svd = SVD()
    svd.set_data(train_set)
    svd.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None,
                mean_center=True, post_normalize=True)
    mae = MAE()
    k_err = 0
    for rating, item_id, user_id in test_set.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            mae.add(rating, pred_rating)
        except KeyError:
            k_err += 1
            continue
    print "k_err", k_err, "--", "test-len:", len(test_set.get()), \
        "train-len:", len(train_set.get())
    result = mae.compute() / 2.0
    return result

# This is the recommendation algorithm based on the SVD.
# This code can be run in real time, but the model has to be pre-computed.
import recsys.algorithm
from recsys.algorithm.factorize import SVD

'''
SVD recommendation for best-fit movies. Includes known and unknown movies.
'''

# Let's make things verbose
recsys.algorithm.VERBOSE = True

# Load the computed model
svd = SVD(filename='movielens_small')

# Loop over the MovieLens files, which map movies to movie ids
loop = True
while loop:
    ratings_file = open('ratings_small.csv', 'r+')
    movie_lens = open('movies.csv', 'r+')
    user_found = False
    movie_found = False
    USERID = int(input("Enter user id: "))
    # Check if the user_id exists. Since we are currently using the small
    # database, we need to check each and every field. If using the complete
    # database, just check that the number lies in the range.
    for rating_row in ratings_file:
        rating_item = rating_row.split(',')
        if int(rating_item[0]) == USERID:
            user_found = True
            break
    if movie_found:
        for movie_row in movie_lens:

        # if cnt == 100000: break
        (user, item, week, time, feat1, feat2) = line.split('\t')
        test.append({"1_user_id": int(user), "2_item_id": int(item)})
    return test

recsys.algorithm.VERBOSE = True

print "loading data"
data = Data()
data.load('../item_recom/train_info.tsv', sep='\t',
          format={'col': 0, 'row': 1, 'value': 6, 'ids': int})

topic = 48
print "compute svd"
svd = SVD()
svd.set_data(data)
svd.compute(k=topic, min_values=0.0, pre_normalize=None, mean_center=True,
            post_normalize=True)

print "loading test data"
test = loadTest('../item_recom/test_info.tsv')
print svd.predict(0, 0)

print "creating submission"
with open('../submissions/recsys_3.csv', 'w') as csvfile:
    fieldnames = ['uid#iid', 'pred']
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    for ind in xrange(len(test)):
        writer.writerow(