def SVDloadData():
    """Load the ratings file from a hard-coded path into a new SVD object.

    Enables recsys verbose output, prints the loaded matrix and returns the
    SVD instance. No decomposition is computed here.
    """
    ratings_path = '/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat'
    layout = {'col':0, 'row':1, 'value':2, 'ids': int}

    model = SVD()
    recsys.algorithm.VERBOSE = True
    model.load_data(filename=ratings_path, sep='::', format=layout)
    print(model.get_matrix())
    return model
Пример #2
0
def SVDtrain2(data,pct_train):
    """Split ``data`` and compute an SVD (k=100) on the training part.

    :param data: object exposing ``split_train_test(percent=...)``
    :param pct_train: value forwarded as ``percent`` to the split
    :return: tuple ``(svd, train, test)``
    """
    train_part, test_part = data.split_train_test(percent=pct_train)
    factors = 100

    model = SVD()
    model.set_data(train_part)
    model.compute(
        k=factors,
        min_values=5,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
    )
    return model, train_part, test_part
Пример #3
0
def getSimilarityMatrix(svd_model_file):
	"""Load a saved SVD model and return its similarity matrix.

	:param svd_model_file: path handed to ``SVD.load_model``
	:return: result of ``get_matrix_similarity()`` on the loaded model
	"""
	model = SVD()
	model.load_model(svd_model_file)
	similarity = model.get_matrix_similarity()
	return similarity
Пример #4
0
	def build_model(self,uids,kn):
		"""Compute an SVD from a user -> songs mapping and store it on ``self.model``.

		:param uids: mapping whose ``items()`` yields ``(uid, songs)`` pairs
		:param kn: value passed as ``k`` to ``SVD.compute``
		"""
		ratings = Data()
		# Every (song, user) pair gets the constant value 1
		entries = ((1, song, uid) for uid, songs in uids.items() for song in songs)
		for entry in entries:
			ratings.add_tuple(entry)

		model = SVD()
		model.set_data(ratings)
		model.compute(k=kn, min_values=1)
		self.model = model
Пример #5
0
def setup():
    """Fill the module-level ``users``, ``items`` and ``svd`` globals.

    Items are read from ``movies.dat`` and ratings are loaded from
    ``ratings.dat``, both under ``MOVIELENS_DATA_PATH``.
    """
    global users, items, svd

    print('Reading items...')
    movies_path = os.path.join(MOVIELENS_DATA_PATH, 'movies.dat')
    items = _read_items(movies_path)
    users = []

    svd = SVD()
    ratings_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat')
    layout = {'col':0, 'row':1, 'value':2, 'ids':int}
    svd.load_data(filename=ratings_path, sep='::', format=layout)
Пример #6
0
def calculate_SVD_features():
    """Compute an SVD (k=100) from ``feature_matrix.csv`` and return it.

    Verbose recsys output is switched on before loading.
    """
    print("Thanks for input, calculating...")
    model = SVD()
    recsys.algorithm.VERBOSE = True

    layout = {'col':0, 'row':1, 'value': 2, 'ids': int}
    model.load_data(filename='feature_matrix.csv', sep=',', format=layout)
    model.compute(
        k=100,
        min_values=0,
        pre_normalize=None,
        mean_center=False,
        post_normalize=True,
    )
    return model
Пример #7
0
def getSVD():
    """Return the MovieLens SVD model, computing and saving it when missing.

    If a saved model exists it is loaded; otherwise the ratings file is
    loaded, the SVD is computed (k=100) and saved via ``savefile``.

    :return: an ``SVD`` instance
    """
    model_path = './model/movielens'
    # The existence test used to look at an unrelated absolute home-directory
    # path while loading and saving used this relative one. Test the file that
    # is actually loaded (recsys keeps the saved model as "<path>.zip").
    if os.path.exists(model_path + '.zip'):
        return SVD(model_path)

    svd = SVD()
    svd.load_data(filename='./data/movielens/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
    k = 100
    svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile=model_path)
    return svd
def train_svd(data):
    """
    Compute an SVD (k=30) on the matrix produced by
    ``get_data_model_matrix(data)``.

    :param data: input forwarded to ``get_data_model_matrix``
    :return: SVD model
    """
    factors = 30
    model = SVD()
    model.set_data(get_data_model_matrix(data))
    model.compute(
        k=factors,
        min_values=0,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
    )
    return model
Пример #9
0
def get_model(model_name,datasource_name,start,end,model_params):
    """Register and return the SVD model known as ``model_name``.

    The model's settings are recorded in ``model_data`` on first sight. When
    no saved model exists under ``model_dir`` a new SVD is loaded from the
    data source CSV; otherwise the saved model is loaded once and cached in
    ``models``.

    :param model_name: key in ``models`` / file name under ``model_dir``
    :param datasource_name: CSV base name under ``data_dir``
    :param start: stored in ``model_data`` (not used here)
    :param end: stored in ``model_data`` (not used here)
    :param model_params: stored in ``model_data`` (not used here)
    :return: the SVD instance held in ``models[model_name]``
    """
    if model_name not in model_data:
        model_data[model_name] = (datasource_name,start,end,model_params)
    if not os.path.exists(model_dir+model_name):
        #initialize model with new data
        svd = SVD()
        svd.load_data(filename=data_dir+datasource_name+'.csv', sep=',', format={'col':0, 'row':1, 'value':2, 'ids': int})
        models[model_name] = svd
    elif model_name not in models:
        models[model_name] = SVD(filename=model_dir+model_name)
    # Previously nothing was returned, so callers had to reach into ``models``
    return models[model_name]
Пример #10
0
def calculate_stats_features(pct_train):
    """Split ``feature_matrix.csv`` and compute an SVD on the training part.

    :param pct_train: value forwarded as ``percent`` to ``split_train_test``
    :return: tuple ``(svd, train, test)``
    """
    layout = {'col':0, 'row':1, 'value':2,'ids':int}
    ratings = Data()
    ratings.load('feature_matrix.csv', sep=',', format=layout)
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train_part)
    model.compute(
        k=100,
        min_values=0,
        pre_normalize=None,
        mean_center=False,
        post_normalize=False,
    )
    return model, train_part, test_part
Пример #11
0
def calculate_SVD_users():
    """Compute an SVD (k=100) from ``user_data_working.csv`` and return it.

    After the computation the working file is overwritten with a copy of
    ``user_data_original.csv``.
    """
    print("Thanks for input, calculating...")
    model = SVD()
    recsys.algorithm.VERBOSE = True

    layout = {'col':0, 'row':1, 'value': 2, 'ids': int}
    model.load_data(filename='user_data_working.csv', sep=',', format=layout)
    model.compute(
        k=100,
        min_values=2,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
    )
    # Restore the working file from the original one
    shutil.copy('user_data_original.csv','user_data_working.csv')
    return model
Пример #12
0
def create_svd_model(train):
    """Compute an SVD (k=100, mean-centred, post-normalised) on ``train``.

    :param train: data object handed to ``SVD.set_data``
    :return: the computed SVD model
    """
    settings = {
        'k': 100,
        'min_values': 0,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }
    model = SVD()
    model.set_data(train)
    model.compute(**settings)
    return model
Пример #13
0
 def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
     """Write the tastings to disk, compute an SVD on them and save the model.

     :param tastings: data passed to ``save_tastings_to_movielens_format_file``
     :param k: forwarded as ``k`` to ``SVD.compute``
     :param min_values: forwarded as ``min_values`` to ``SVD.compute``
     :param verbose: when true, turns on recsys verbose output
     :return: the computed SVD instance
     """
     # Dump the tastings into a Movielens-format data file first
     self.save_tastings_to_movielens_format_file(tastings)
     if verbose:
         # Handy for logging / testing
         recsys.algorithm.VERBOSE = True

     model = SVD()
     ratings_path = self.file_location(self.tastings_movielens_format)
     layout = {'col':0, 'row':1, 'value':2, 'ids': int}
     model.load_data(filename=ratings_path, sep='::', format=layout)

     # compute() writes the model to ``savefile``
     target = self.file_location(self.tastings_recsys_svd)
     model.compute(
         k=k,
         min_values=min_values,
         pre_normalize=None,
         mean_center=True,
         post_normalize=True,
         savefile=target,
     )
     return model
Пример #14
0
    def __init__(self):
        """Create an SVD bound to an empty ``Data`` set and reset the counter.

        Sets ``self.filename``, ``self.svd`` and ``self.iterations``.
        """
        # Empty dataset; tuples are presumably added later by other methods
        # (not visible here -- verify)

        data = Data()
        self.filename = "emag"
        # NOTE(review): the leading ``False and`` makes this branch
        # unreachable, so a saved "<filename>.zip" model is never loaded.
        # It looks like a deliberately disabled toggle -- confirm.
        if False and os.path.isfile(self.filename + ".zip"):
            svd = SVD(filename=self.filename)
        else:
            svd = SVD()
        svd.set_data(data)
        # No decomposition is computed at construction time (call kept disabled):
        #svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True, savefile="svd")
        self.svd = svd
        self.iterations = 0
Пример #15
0
    def export(self):
        """Build an ``SVD`` object from the SVDLIBC output files of this run.

        Reads the dense-text matrices stored under ``self._svd_prefix``
        (``-Ut`` or ``-U``, ``-Vt`` or ``-V``, and ``-S``) together with the
        ``.ids.rows`` / ``.ids.cols`` index files, then fills in the
        similarity and reconstructed matrices.

        :return: the populated ``SVD`` instance
        :raises AssertionError: if an index file contains duplicated IDs
        """
        # http://tedlab.mit.edu/~dr/SVDLIBC/SVD_F_DT.html
        # only importing default 'dt' S, Ut and Vt (dense text output matrices)

        def _read_dense(path):
            # CSV parsing is much faster than numpy.loadtxt(). The first line
            # is skipped (presumably the dimensions header of the 'dt' format).
            # ``with`` closes the handle, which the inline open() never did.
            with open(path) as handle:
                rows = list(csv.reader(handle, delimiter=' '))[1:]
            return array(rows).astype('float')

        def _read_ids(path):
            # IDs are integers when every line parses as one; otherwise all
            # of them are kept as stripped strings.
            with open(path) as handle:
                raw = [line.strip() for line in handle]
            try:
                return [int(value) for value in raw]
            except ValueError:
                return raw

        PREFIX = self._svd_prefix

        # Prefer the transposed files and fall back to the plain U / V files.
        # ``except Exception`` keeps the best-effort fallback but no longer
        # swallows KeyboardInterrupt / SystemExit as the bare except did.
        try:
            U = _read_dense(PREFIX + '-Ut').transpose()
        except Exception:
            U = _read_dense(PREFIX + '-U')
        try:
            V = _read_dense(PREFIX + '-Vt').transpose()
        except Exception:
            V = _read_dense(PREFIX + '-V')
        _S = _read_dense(PREFIX + '-S')
        S = _S.reshape(_S.shape[0], )

        PREFIX_INDEXES = PREFIX + '.ids.'
        U_idx = _read_ids(PREFIX_INDEXES + 'rows')
        V_idx = _read_ids(PREFIX_INDEXES + 'cols')

        #Check no duplicated IDs!!!
        assert (len(U_idx) == len(OrderedSet(U_idx)))
        assert (len(V_idx) == len(OrderedSet(V_idx)))

        # Create SVD
        svd = SVD()
        svd._U = DenseMatrix(U, OrderedSet(U_idx), None)
        svd._S = S
        svd._V = DenseMatrix(V, OrderedSet(V_idx), None)
        svd._matrix_similarity = svd._reconstruct_similarity()
        svd._matrix_reconstructed = svd._reconstruct_matrix()

        return svd
Пример #16
0
def process_svd(preload):
    """Return the SVD model, either loaded from disk or computed via SVDLIBC.

    :param preload: when true, load the already computed model from
        ``./data/svd-all``; otherwise compute it (k=100) and save it there
    :return: the SVD instance
    """
    if preload:
        # Loading already computed SVD model
        return SVD(filename='./data/svd-all')

    print("Reading data...")
    source = SVDLIBC('./data/behavior-ml-score.csv')
    source.to_sparse_matrix(sep=',', format={'col':0, 'row':1, 'value':2, 'ids': str})
    factors = 100
    print("Computing SVD...")
    source.compute(factors)
    model = source.export()
    model.save_model('./data/svd-all', options={'k': factors})
    return model
Пример #17
0
def process_svd(preload):
    """Load the saved SVD model, or build and save it when not preloading.

    :param preload: truthy to load ``./data/svd-all``; falsy to compute the
        model with SVDLIBC (k=100) and save it to that path
    :return: the SVD instance
    """
    if not preload:
        print("Reading data...")
        svdlibc = SVDLIBC('./data/behavior-ml-score.csv')
        layout = {'col':0, 'row':1, 'value':2, 'ids': str}
        svdlibc.to_sparse_matrix(sep=',', format=layout)
        rank = 100
        print("Computing SVD...")
        svdlibc.compute(rank)
        computed = svdlibc.export()
        computed.save_model('./data/svd-all', options={'k': rank})
        return computed

    # Loading already computed SVD model
    return SVD(filename='./data/svd-all')
Пример #18
0
def setup():
    """Initialise the ``users``, ``items`` and ``svd`` module globals.

    Reads the items from ``movies.dat`` and loads ``ratings.dat`` into a new
    SVD object; both files live under ``MOVIELENS_DATA_PATH``.
    """
    global users, items, svd

    print('Reading items...')
    items = _read_items(os.path.join(MOVIELENS_DATA_PATH, 'movies.dat'))
    users = []

    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd = SVD()
    svd.load_data(
        filename=os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat'),
        sep='::',
        format=ratings_layout,
    )
Пример #19
0
def get_movie(movie_id):
	"""Render the detail page for one movie.

	Collects the movie row, its directors / actors / writers / genres, the
	current user's rating (when a session cookie is present) and a list of
	similar movies obtained from an SVD over all ratings, then renders
	``static/movie.html``.

	:param movie_id: id used in the SQL lookups; converted with ``int()``
		for the similarity query
	:return: result of ``template('static/movie.html', ...)``
	"""
	movie = {}
	rating = 0
	with sqlite3.connect('data/data100.db') as con:
		cur = con.cursor()
		# NOTE(review): movie_result is None for an unknown movie_id, which
		# would fail at movie_result[0] below -- confirm callers validate ids.
		cur.execute("SELECT * FROM movies WHERE movie_id = ?", (movie_id,))
		movie_result = cur.fetchone()
		cur.execute("SELECT director FROM movie_directors WHERE movie_id = ?", (movie_id,))
		directors = cur.fetchall()
		cur.execute("SELECT actor FROM movie_actors WHERE movie_id = ?", (movie_id,))
		actors = cur.fetchall()
		cur.execute("SELECT writer FROM movie_writers WHERE movie_id = ?", (movie_id,))
		writers = cur.fetchall()
		cur.execute("SELECT genre FROM movie_genres WHERE movie_id = ?", (movie_id,))
		genres = cur.fetchall()
		# Only look up the user's own rating when a session cookie was sent;
		# otherwise ``rating`` keeps its default of 0.
		if 'session_user' in request.cookies:
			cur.execute("SELECT * FROM ratings WHERE user_id = ? AND movie_id = ?", (request.get_cookie('session_user', secret='recsys')[0], movie_id,))
			rating = cur.fetchone()
		# All ratings feed the similarity model.
		# NOTE(review): rows come from SELECT * here, while get_feeds selects
		# "ratings, movie_id, user_id" explicitly -- verify the table's column
		# order matches what Data.set() expects.
		cur.execute("SELECT * FROM ratings")
		rating_results = cur.fetchall()
		d = Data()
		d.set(rating_results)
			# with open('data/tmp.dat', 'a') as f:
			# 	for l in rating_results:
			# 		f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
		svd = SVD()
			# svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
		svd.set_data(d)
		# First element of each similar() result is used as a movie id
		similar_list = [str(s[0]) for s in svd.similar(int(movie_id))]
		# NOTE(review): the IN (...) list is built by string formatting; the
		# values originate from the model, not directly from the request.
		cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(similar_list)))
		similar_movies = cur.fetchall()
		# Column positions 0-4 of the movies row are mapped to named fields
		movie = {
			'mid': movie_result[0],
			'title': movie_result[1],
			'description': movie_result[2],
			'image': movie_result[3],
			'year': movie_result[4],
			'directors': [d[0] for d in directors],
			'writers': [w[0] for w in writers],
			'actors': [a[0] for a in actors],
			'genres': [g[0] for g in genres],
			'rating': rating,
			'similar_movies': similar_movies,
		}
	session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
	return template('static/movie.html', movie=movie, session_user=session_user)
def build_svd_item_based(user_op_item_cnt, item_op_users, user_idx, item_idx, min_nonzero):
    """Assemble an SVD data set from per-user item counts.

    Users with fewer than ``min_nonzero`` items, items whose
    ``item_op_users`` value is below ``min_nonzero`` and counts below 1 are
    left out. No decomposition is computed here.

    :return: tuple ``(svd, item_lst)`` where ``item_lst`` holds the distinct
        items that were kept
    """
    model = SVD()
    ratings = Data()
    kept = []
    for user in user_op_item_cnt:
        per_item = user_op_item_cnt[user]
        if len(per_item) < min_nonzero:
            continue
        for item in per_item:
            if item_op_users[item] < min_nonzero:
                continue
            weight = 1.0*per_item[item]
            if weight < 1:
                continue
            kept.append(item)
            ratings.add_tuple((weight, item_idx[item], user_idx[user]))
    item_lst = list(set(kept))
    model.set_data(ratings)
    return model, item_lst
Пример #21
0
def test_classifier(model, filename=None, itemkey="track", selector="SELECT * FROM train"):
    """Print the mean RMSE of SVD predictions over 10 random 70/30 splits.

    Each round shuffles the selected rows, computes an SVD on the first 70%
    and scores ``predict_item`` on the rest. The last line printed is the
    mean RMSE followed by the total number of scored pairs.

    :param model: unused
    :param filename: optional saved model loaded into each round's SVD
    :param itemkey: row key used as the item when predicting
    :param selector: SQL query producing the rows (run against ``db.sqlite``)
    """
    # NOTE(review): training tuples always use x["track"] while predictions
    # use ``itemkey`` -- confirm this asymmetry is intended.
    conn = sqlite3.connect("db.sqlite")
    try:
        conn.row_factory = dict_factory
        # The query result is the same for every round: fetch it once and
        # close the connection, which previously was never closed.
        rows = list(conn.cursor().execute(selector))
    finally:
        conn.close()

    split = int(len(rows) * 0.7)
    K = 1000
    rmse_sum = 0
    rounds = 0
    total_pairs = 0
    for _ in range(10):
        svd = SVD()
        if filename is not None:
            svd.load_model(filename)
        # Shuffle a fresh copy so every round starts from the query order
        l = list(rows)
        random.shuffle(l)
        svd.set_data([(x["rating"], x["track"], x["user"]) for x in l[:split]])
        svd.compute(k=K, min_values=0.0, pre_normalize=None, mean_center=True, post_normalize=True)

        pairs = [(predict_item(svd, item[itemkey], item["user"]), item["rating"])
                 for item in l[split:]]
        total_pairs += len(pairs)
        rmse_sum += RMSE(pairs).compute()
        rounds += 1.0
        print("iteration")
    print("%s %s" % (rmse_sum / rounds, total_pairs))
Пример #22
0
def recommended_files(data,user):
    """Recommend files (tth ids) for ``user`` taken from similar users' lists.

    Candidates are files owned by similar users but not by ``user``, ranked
    by the SVD prediction. A candidate is dropped when its name is
    ``similar()`` to one already accepted.

    :param data: data handed to ``SVD.set_data``
    :param user: user id known to the model and to ``db.user_list``
    :return: list of at most 11 tth values (possibly empty)
    """
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user)]

    # Files the user already owns; extended below so each candidate is
    # scored only once.
    seen_tths = set(i['tth'] for i in db.user_list.find({'user':user}))
    predict_arr = []
    # The first entry is skipped -- presumably the user itself (verify)
    for other in similar_users[1:]:
        for entry in db.user_list.find({'user':other}):
            tth = entry['tth']
            if tth in seen_tths:
                continue
            movie_name = db.tths.find_one({'tth':tth})['name']
            seen_tths.add(tth)
            predict_arr.append((movie_name, tth, svd.predict(user, tth)))

    predict_arr.sort(key=lambda x: x[2], reverse=True)

    res = []
    # Names of accepted results. The old code compared against ``r[0]`` of the
    # stored tth strings, i.e. their first character, so near-duplicate names
    # were never detected.
    accepted_names = []
    for name, tth, _score in predict_arr:
        if any(similar(name, kept) for kept in accepted_names):
            continue
        accepted_names.append(name)
        res.append(tth)
        # Same cut-off as before: stop once more than 10 were collected
        if len(res) > 10:
            break
    # Previously None was returned when fewer than 11 candidates survived
    return res
Пример #23
0
def get_mae_rmse(step):
    """Compute MAE and RMSE of a saved model on a split of one data file.

    Loads ``second_train_test.dat.<step>``, takes the test part of an 80%
    split and scores the predictions of ``svdn_model_<step>.zip``.

    :param step: suffix used in both file names
    :return: ``(mae, rmse)``; each is NaN when nothing could be predicted.
        Returns ``None`` when loading the model fails.
    """
    layout = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    dataset = Data()
    dataset.load('second_train_test.dat.{step}'.format(step=step), sep='::', format=layout)
    _, held_out = dataset.split_train_test(percent=80)

    try:
        model = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))
    except:
        # Best effort: any failure while loading means "no result"
        return

    pairs_for_mae = []
    pairs_for_rmse = []
    for actual, item_id, user_id in held_out:
        try:
            estimate = model.predict(item_id, user_id)
        except:
            # Pairs that cannot be predicted are skipped
            continue
        pairs_for_mae.append((actual, estimate))
        pairs_for_rmse.append((actual, estimate))

    mae_value = MAE(pairs_for_mae).compute() if pairs_for_mae else np.nan
    rmse_value = RMSE(pairs_for_rmse).compute() if pairs_for_rmse else np.nan
    return mae_value, rmse_value
Пример #24
0
def evaluate(data, count=5, K=100):
    """Repeat a train/test SVD evaluation and collect RMSE / MAE per run.

    :param data: data set exposing ``split_train_test`` and ``get``
    :param count: number of repetitions
    :param K: forwarded as ``k`` to ``SVD.compute``
    :return: list of ``{"RMSE": ..., "MAE": ...}`` dicts, one per run whose
        metrics could be computed
    """
    results = []

    for _ in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        sizes = (len(data.get()), len(train.get()), len(test.get()))
        print(' '.join(str(size) for size in sizes))

        model = SVD()
        model.set_data(train)
        model.compute(
            k=K,
            min_values=5,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
        )

        # Prediction-based metrics over the test tuples
        rmse, mae = RMSE(), MAE()
        for actual, item_id, user_id in test.get():
            try:
                estimate = model.predict(item_id, user_id)
                rmse.add(actual, estimate)
                mae.add(actual, estimate)
            except KeyError:
                # Tuples that raise KeyError are skipped
                continue

        try:
            rsu = {}
            rsu["RMSE"] = rmse.compute()
            rsu["MAE"] = mae.compute()
        except:
            print("one error....++++++++++++++++++++++++++++++++++++++++++++++++++++")
        else:
            try:
                print(rsu)
                results.append(rsu)
            except:
                print("one error....++++++++++++++++++++++++++++++++++++++++++++++++++++")

    return results
Пример #25
0
def reCompute(user_id):
    """Recompute the SVD from ``ratings.dat`` and print recommended movies.

    For every recommendation except the first, the matching rows of
    ``df_movies`` are printed.

    :param user_id: id passed to ``svd.recommend(..., is_row=False)``
    """
    # Declared up front: the original placed ``global a`` inside the loop,
    # after ``a`` was assigned, which is a SyntaxError on Python 3 and a
    # SyntaxWarning on Python 2. ``a`` is still published as a module global.
    global a

    dataset = Data()
    # NOTE(review): sep=':' differs from the '::' used for ratings.dat by
    # other loaders of this format -- confirm against the actual file.
    format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    dataset.load('ratings.dat', sep=':', format=format)

    svd = SVD()
    svd.set_data(dataset)
    svd.compute(k=100,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    a = svd.recommend(user_id, is_row=False)
    # NOTE(review): index 0 is skipped, exactly as before -- confirm intended.
    for j in range(1, len(a)):
        # ``k`` must keep this name: the query string refers to it as @k
        k = a[j][0]
        print(df_movies.query('movie_id==@k'))
Пример #26
0
def calculate_stats_users(pct_train):
    """Print RMSE and MAE of an SVD computed on a split of the working data.

    :param pct_train: value forwarded as ``percent`` to ``split_train_test``
    """
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings = Data()
    ratings.load('user_data_working.csv', sep=',', format=layout)
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train_part)
    model.compute(k=100, min_values=2, pre_normalize=None,
                  mean_center=True, post_normalize=False)

    rmse, mae = RMSE(), MAE()
    for actual, item_id, user_id in test_part.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(actual, estimate)
            mae.add(actual, estimate)
        except KeyError:
            # Tuples that raise KeyError are left out of the metrics
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Пример #27
0
def similar_users(user):
    """Return the ids of users similar to ``user`` according to an SVD.

    If the user is not yet flagged as recommended, their ``user::tth``
    lines are appended to ``./dc_recom.dat`` and the flag is set. The SVD is
    then computed from that file.

    :param user: user id; non-``str`` values are passed through ``unidecode``
    :return: list with the first element of every ``svd.similar(user)`` entry
    """
    if not type(user) is str:
        user = unidecode.unidecode(user)
    # NOTE(review): find_one() yields None for an unknown user, which fails
    # on the subscript below -- confirm callers only pass known users.
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        # ``with`` closes the file even when a write fails; the manual
        # open()/close() pair leaked the handle in that case.
        with open('./dc_recom.dat', 'a') as f:
            for u in user_files:
                f.write(u['user'] + '::' + u['tth'])
                f.write('\n')
        db.done_users.update({'user': user}, {
            'user': user,
            'recommended': True
        })

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    return [i[0] for i in svd.similar(user)]
Пример #28
0
def compute_SVD():
	"""Compute an SVD (k=100) on ``load_data()`` and save it as 'ratings'.

	The model is written under the directory given by ``utils.get_add_dir()``.
	"""
	factors = 100
	model = SVD()
	model.set_data(load_data())
	model.compute(
		k=factors,
		min_values=10,
		pre_normalize=None,
		mean_center=True,
		post_normalize=True,
		savefile=None,
	)
	target = os.path.join(utils.get_add_dir(), 'ratings')
	model.save_model(target)
Пример #29
0
def main():
    """Compute an SVD on ``randUser/rate1.csv`` and print 5 recommendations.

    The same file is loaded into both the train and the test data set; only
    the train set is used by the code that is currently active.
    """
    source = 'randUser/rate1.csv'
    model = SVD()
    train, test = Data(), Data()
    for dataset in (train, test):
        dataset.load(source, force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})

    model.set_data(train)
    model.compute(
        k=100,
        min_values=0.5,
        pre_normalize=False,
        mean_center=True,
        post_normalize=True,
    )

    # An RMSE/MAE pass over ``test`` and a precision/recall check used to
    # follow here; both were disabled in the original.
    print(model.recommend(1, n=5, only_unknowns=False, is_row=False))
Пример #30
0
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):
    """Compute an SVD on a train split and print RMSE / MAE on the test split.

    :param dat_file: '::'-separated ratings file
    :param pct_train: forwarded as ``percent`` to ``split_train_test``
        (NOTE(review): other callers pass values such as 80 -- confirm the
        unit expected for this default)
    """
    layout = {'col':0, 'row':1, 'value':2,'ids':int}
    ratings = Data()
    ratings.load(dat_file, sep='::', format=layout)

    # Train / test partition
    train_part, test_part = ratings.split_train_test(percent=pct_train)

    # Factorisation
    factors = 100
    model = SVD()
    model.set_data(train_part)
    model.compute(k=factors, min_values=5, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # Prediction-based evaluation
    rmse, mae = RMSE(), MAE()
    for actual, item_id, user_id in test_part.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(actual, estimate)
            mae.add(actual, estimate)
        except KeyError:
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
 def __init__(self,
              rating_file='ratings_small.csv',
              movie_file='movies.csv',
              detail_file='modified.csv',
              model='movielens_small'):
     """Remember the data file names, load the SVD model and open IMDb access.

     :param rating_file: ratings CSV loaded into the SVD
     :param movie_file: stored on ``self.movie_file``
     :param detail_file: stored on ``self.detail_file``
     :param model: filename handed to ``SVD(filename=...)``
     """
     self.start = True
     self.rating_file, self.movie_file, self.detail_file = (
         rating_file, movie_file, detail_file)

     layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
     self.svd = SVD(filename=model)
     self.svd.load_data(filename=rating_file, sep=',', format=layout)
     self.svd.create_matrix()

     self.ia = imdb.IMDb(accessSystem='http')
Пример #32
0
    def __init__(self, train, test, remap, svd_train):
        """Store the data sets, derive the helper tables and compute the SVD.

        :param train: kept as ``self.train_set``
        :param test: kept as ``self.test_set``
        :param remap: passed through ``self._get_remap``
        :param svd_train: data handed to ``SVD.set_data``
        """
        self.train_set, self.test_set = train, test
        self.remap = self._get_remap(remap)
        self.W = self._reverse_user_item()

        settings = dict(k=10, min_values=0, pre_normalize=None,
                        mean_center=False, post_normalize=True)
        self.svd = SVD()
        self.svd.set_data(svd_train)
        self.svd.compute(**settings)
def build_svd_cat_based(user_op_cat_cnt, cat_op_users, user_idx, cat_idx, min_nonzero):
    """Assemble an SVD data set from per-user category counts.

    Users with fewer than ``min_nonzero`` categories, categories whose
    ``cat_op_users`` value is below ``min_nonzero`` and counts below 1 are
    left out. Prints the number of distinct categories kept. No
    decomposition is computed here.

    :return: tuple ``(svd, cat_lst)``
    """
    model = SVD()
    ratings = Data()
    kept = []
    for user in user_op_cat_cnt:
        per_cat = user_op_cat_cnt[user]
        if len(per_cat) < min_nonzero:
            continue
        for cat in per_cat:
            if cat_op_users[cat] < min_nonzero:
                continue
            weight = 1.0*per_cat[cat]
            if weight < 1:
                continue
            kept.append(cat)
            ratings.add_tuple((weight, cat_idx[cat], user_idx[user]))
    cat_lst = list(set(kept))
    print(' '.join(['cat =', str(len(cat_lst))]))
    model.set_data(ratings)
    return model, cat_lst
Пример #34
0
def get_model(model_name, datasource_name, start, end, model_params):
    """Register and return the SVD model known as ``model_name``.

    The model's settings are recorded in ``model_data`` on first sight. When
    no saved model exists under ``model_dir`` a new SVD is loaded from the
    data source CSV; otherwise the saved model is loaded once and cached in
    ``models``.

    :param model_name: key in ``models`` / file name under ``model_dir``
    :param datasource_name: CSV base name under ``data_dir``
    :param start: stored in ``model_data`` (not used here)
    :param end: stored in ``model_data`` (not used here)
    :param model_params: stored in ``model_data`` (not used here)
    :return: the SVD instance held in ``models[model_name]``
    """
    if model_name not in model_data:
        model_data[model_name] = (datasource_name, start, end, model_params)
    if not os.path.exists(model_dir + model_name):
        #initialize model with new data
        svd = SVD()
        svd.load_data(filename=data_dir + datasource_name + '.csv',
                      sep=',',
                      format={
                          'col': 0,
                          'row': 1,
                          'value': 2,
                          'ids': int
                      })
        models[model_name] = svd
    elif model_name not in models:
        models[model_name] = SVD(filename=model_dir + model_name)
    # Previously nothing was returned, so callers had to reach into ``models``
    return models[model_name]
Пример #35
0
Файл: day_07.py Проект: lmlzk/ML
    def __init__(self, filename, sep, **format):
        """Keep the file settings and create the SVD and RMSE helpers.

        :param filename: stored on ``self.filename``
        :param sep: stored on ``self.sep``
        :param format: extra keyword arguments, kept as a dict on ``self.format``
        """
        # File information
        self.filename, self.sep, self.format = filename, sep, format

        # Matrix factorisation object
        self.svd = SVD()

        # Matrix settings
        self.k = 100  # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 people
        self.post_normalize = False

        # Flag telling whether a saved model has been loaded
        self.load_model = False

        # Root-mean-square-error evaluator
        self.rmse = RMSE()
Пример #36
0
 def load_recsys_svd(self):
     """Return the saved recsys SVD model, or an empty list when none exists.

     The model is never built here: computing the matrices at runtime is
     deliberately avoided.
     """
     from recsys.algorithm.factorize import SVD

     model_path = self.file_location(self.tastings_recsys_svd)
     if not os.path.isfile(model_path):
         # No saved model on disk -- out of luck
         return []
     # The recsys SVD object, ready to make recommendations
     return SVD(model_path)
Пример #37
0
def get_feeds():
	"""Render the feeds page.

	A signed-in user who has at least one rating gets SVD-based
	recommendations; everyone else gets the full movie list keyed by average
	rating.

	:return: result of ``template('static/feeds.html', ...)``
	"""
	movielist = {}
	# Resolve the signed cookie once. The old code subscripted
	# get_cookie(...) directly, which raised TypeError for requests without
	# a (valid) session cookie.
	session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
	user_id = session_user[0] if session_user else None
	with sqlite3.connect('data/data100.db') as con:
		cur = con.cursor()
		has_ratings = False
		if user_id is not None:
			cur.execute("SELECT * FROM ratings WHERE user_id = ?", (user_id,))
			has_ratings = cur.fetchone() is not None
		if has_ratings:
			cur.execute("SELECT ratings, movie_id, user_id FROM ratings")
			rating_results = cur.fetchall()
			d = Data()
			d.set(rating_results)
			svd = SVD()
			svd.set_data(d)
			recommendations = [str(s[0]) for s in svd.recommend(user_id, is_row=False)]
			# NOTE(review): the IN (...) list is built by string formatting;
			# the values originate from the model, not from the request.
			cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(recommendations)))
			similar_movies = cur.fetchall()
			for m in similar_movies:
				# Keyed by the whole movie row, as before
				movielist[m] = {
					'mid': m[0],
					'title': m[1],
					'description': m[2],
					'image': m[3],
					'year': m[4]
				}
		else:
			cur.execute("SELECT * FROM movies")
			movies = cur.fetchall()
			for m in movies:
				cur.execute("SELECT AVG(ratings) FROM ratings WHERE movie_id = ?", (m[0],))
				avg = cur.fetchone()[0]
				# NOTE(review): keyed by average rating, so movies sharing an
				# average overwrite each other -- kept because the template
				# depends on this shape; confirm whether that is intended.
				movielist[avg] = {
					'mid': m[0],
					'title': m[1],
					'description': m[2],
					'image': m[3],
					'year': m[4]
				}
	return template('static/feeds.html', movielist=movielist, session_user=session_user)
Пример #38
0
def compute(aws_region, s3_bucket, filename, sep, col_index, row_index, value_index, ids_type):
    """Download a data file, compute an SVD on it, save the model and exit.

    :param aws_region: forwarded to ``download_from_s3``
    :param s3_bucket: forwarded to ``download_from_s3``
    :param filename: file name, read from ``./data/`` after the download
    :param sep: field separator of the file
    :param col_index: column position (converted with ``int``)
    :param row_index: row position (converted with ``int``)
    :param value_index: value position (converted with ``int``)
    :param ids_type: passed as ``'ids'`` in the format dict
    """
    download_from_s3(aws_region, s3_bucket, filename)
    model = SVD()

    print('Loading data to SVD module')
    layout = {'col':int(col_index), 'row':int(row_index), 'value':int(value_index), 'ids': ids_type}
    model.load_data(filename='./data/' + filename, sep=sep, format=layout)

    factors = derive_latent_dimensions(model, energy_level=0.6)

    print(' '.join(['Stating to compute SVD at ', strftime("%Y-%m-%d %H:%M:%S", gmtime())]))
    model.compute(
        k=factors,
        min_values=10,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile='./models/recommender',
    )
    print(' '.join(["SVD model saved at ", strftime("%Y-%m-%d %H:%M:%S", gmtime())]))
    # Make sure the process finishes at the end
    sys.exit()
 def __init__(self,
              rating_file='ratings_small.csv',
              movie_file='movies.csv',
              detail_file='modified.csv',
              model='movielens_small'):
     """Record the data file names, load the SVD model and open IMDb access.

     :param rating_file: ratings CSV loaded into the SVD
     :param movie_file: stored on ``self.movie_file``
     :param detail_file: stored on ``self.detail_file``
     :param model: filename handed to ``SVD(filename=...)``
     """
     self.start = True
     self.rating_file = rating_file
     self.movie_file = movie_file
     self.detail_file = detail_file

     ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
     self.svd = SVD(filename=model)
     self.svd.load_data(filename=rating_file, sep=',', format=ratings_layout)
     self.svd.create_matrix()

     self.ia = imdb.IMDb(accessSystem='http')
Пример #40
0
    def __init__(self, filename, sep, **format):
        """Keep the file settings and create the SVD, Data and RMSE helpers.

        :param filename: stored on ``self.filename``
        :param sep: stored on ``self.sep``
        :param format: extra keyword arguments, kept as a dict on ``self.format``
        """
        self.filename, self.sep, self.format = filename, sep, format

        # Training parameters
        self.k, self.min_values = 100, 10
        self.post_normalize = True

        self.svd = SVD()

        # Whether a saved model has been loaded
        self.is_load = False

        # Data handling
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()
Пример #41
0
 def export(self):
     """Build an ``SVD`` object from the SVDLIBC output files of this run.

     Reads the dense-text matrices stored under ``self._svd_prefix``
     (``-Ut`` or ``-U``, ``-Vt`` or ``-V``, and ``-S``) together with the
     ``.ids.rows`` / ``.ids.cols`` index files, then fills in the
     similarity and reconstructed matrices.

     :return: the populated ``SVD`` instance
     :raises AssertionError: if an index file contains duplicated IDs
     """
     # http://tedlab.mit.edu/~dr/SVDLIBC/SVD_F_DT.html
     # only importing default 'dt' S, Ut and Vt (dense text output matrices)

     def _read_dense(path):
         # CSV parsing is much faster than numpy.loadtxt(). The first line
         # is skipped (presumably the dimensions header of the 'dt' format).
         # ``with`` closes the handle, which the inline open() never did.
         with open(path) as handle:
             rows = list(csv.reader(handle, delimiter=' '))[1:]
         return array(rows).astype('float')

     def _read_ids(path):
         # IDs are integers when every line parses as one; otherwise all
         # of them are kept as stripped strings.
         with open(path) as handle:
             raw = [line.strip() for line in handle]
         try:
             return [int(value) for value in raw]
         except ValueError:
             return raw

     PREFIX = self._svd_prefix

     # Prefer the transposed files and fall back to the plain U / V files.
     # ``except Exception`` keeps the best-effort fallback but no longer
     # swallows KeyboardInterrupt / SystemExit as the bare except did.
     try:
         U = _read_dense(PREFIX + '-Ut').transpose()
     except Exception:
         U = _read_dense(PREFIX + '-U')
     try:
         V = _read_dense(PREFIX + '-Vt').transpose()
     except Exception:
         V = _read_dense(PREFIX + '-V')
     _S = _read_dense(PREFIX + '-S')
     S = _S.reshape(_S.shape[0], )

     PREFIX_INDEXES = PREFIX + '.ids.'
     U_idx = _read_ids(PREFIX_INDEXES + 'rows')
     V_idx = _read_ids(PREFIX_INDEXES + 'cols')

     #Check no duplicated IDs!!!
     assert(len(U_idx) == len(OrderedSet(U_idx)))
     assert(len(V_idx) == len(OrderedSet(V_idx)))

     # Create SVD
     svd = SVD()
     svd._U = DenseMatrix(U, OrderedSet(U_idx), None)
     svd._S = S
     svd._V = DenseMatrix(V, OrderedSet(V_idx), None)
     svd._matrix_similarity = svd._reconstruct_similarity()
     svd._matrix_reconstructed = svd._reconstruct_matrix()

     return svd
Пример #42
0
 def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
     """Save the tastings, compute an SVD on them and write the model file.

     :param tastings: data passed to ``save_tastings_to_movielens_format_file``
     :param k: forwarded as ``k`` to ``SVD.compute``
     :param min_values: forwarded as ``min_values`` to ``SVD.compute``
     :param verbose: when true, turns on recsys verbose output
     :return: the computed SVD instance
     """
     # The tastings go to a Movielens-format data file first
     self.save_tastings_to_movielens_format_file(tastings)
     # Verbose output is useful for logging / testing
     if verbose:
         recsys.algorithm.VERBOSE = True

     engine = SVD()
     # Load the source data, run the SVD and save it through ``savefile``
     movielens_file = self.file_location(self.tastings_movielens_format)
     engine.load_data(
         filename=movielens_file,
         sep='::',
         format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
     )
     model_file = self.file_location(self.tastings_recsys_svd)
     compute_options = dict(k=k, min_values=min_values, pre_normalize=None,
                            mean_center=True, post_normalize=True,
                            savefile=model_file)
     engine.compute(**compute_options)
     return engine
def recommend(dimension=100):
    """Display and print the games recommended for column id 1.

    Loads the tab-separated ``rating.dat`` file, computes an SVD and asks it
    for recommendations (``svd.recommend(1, is_row=False)``).  Every
    recommended item is looked up through ``Game``; those whose lookup
    succeeded are shown in descending order of ``Game.rec``: the cover image
    is downloaded to ``local-filename.jpg`` and displayed, then the game name
    is printed.

    :param dimension: forwarded to ``SVD.compute`` as ``k``.
    """
    svd = SVD()
    svd.load_data(filename='rating.dat',
                  sep='\t',
                  format={'col': 2, 'row': 1, 'value': 0, 'ids': int})
    svd.compute(k=dimension, min_values=1, pre_normalize=None,
                mean_center=True, post_normalize=True)

    # Keep the candidates in a list instead of a dict keyed by ``rec``:
    # with a dict, two games that share the same ``rec`` value overwrite
    # each other and all but the last one silently disappear.
    games = []
    for item in svd.recommend(1, is_row=False):
        game = Game(item[0])
        if game.success == 1:
            games.append((game.rec, game.name, game.img))

    # list.sort is stable (also with reverse=True), so games with equal
    # ``rec`` keep the order in which they were recommended.
    games.sort(key=lambda entry: entry[0], reverse=True)

    print("Games Recommended:")
    for _rec, name, image_url in games:
        # image (the same local file is reused for every game)
        urllib.urlretrieve(image_url, "local-filename.jpg")
        image = plt.imread("local-filename.jpg")
        plt.imshow(image)
        plt.show()

        # name
        print(name)
    def __init__(
            self,
            sc,
            datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
            rating_file='ratings_small.csv',
            complete_rating_file='ratings.csv',
            movie_file='movies.csv',
            detail_file='modified.csv',
            model='movielens_small'):
        """Wire up the SVD model, the IMDb client and the Spark/ALS resources.

        Every file name is appended directly to ``datapath`` (plain string
        concatenation, so ``datapath`` is expected to end with a separator).

        :param sc: Spark context used for ``textFile``, ``SQLContext`` and
            loading the ALS model.
        :param datapath: prefix for every file and folder used here.
        :param rating_file: ratings file loaded into the SVD.
        :param complete_rating_file: ratings file read as a Spark RDD.
        :param movie_file: movies file read as a Spark RDD.
        :param detail_file: details file; only its path is stored here.
        :param model: file name handed to ``SVD(filename=...)``.
        """
        self.sc = sc
        self.start = True

        # Resolved locations.
        self.integration_folder = datapath
        self.rating_file = datapath + rating_file
        self.complete_rating_file = datapath + complete_rating_file
        self.movie_file = datapath + movie_file
        self.detail_file = datapath + detail_file

        # SVD side: open the stored model, then load the ratings into it.
        ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
        self.svd = SVD(filename=datapath + model)
        self.svd.load_data(filename=self.rating_file,
                           sep=',',
                           format=ratings_layout)
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # ALS side.
        self.sqlContext = SQLContext(self.sc)
        self.movie_data = self.sc.textFile(self.movie_file)

        # Each ratings line becomes an (int, int, float) triple built from
        # its first three comma-separated fields.
        raw_ratings = self.sc.textFile(self.complete_rating_file)
        split_ratings = raw_ratings.map(lambda line: line.split(","))
        self.ratings_data = split_ratings.map(
            lambda x: (int(x[0]), int(x[1]), float(x[2])))

        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)

        def load_table(relative_path):
            # A fresh reader is requested for every table, as before.
            return self.sqlContext.read.load(datapath + relative_path)

        self.movie_df = load_table('tables/movies')
        self.detail_df = load_table('tables/detail')
        self.rating_df = load_table('tables/ratings')
Пример #45
0
 def build_model(self, uids, kn):
     """Fit an SVD on the user/song pairs and store it in ``self.model``.

     :param uids: mapping of user id to an iterable of songs; each pair is
         added to the data set as the tuple ``(1, song, uid)``.
     :param kn: forwarded to ``SVD.compute`` as ``k``.
     """
     ratings = Data()
     pairs = ((track, user)
              for user, playlist in uids.items()
              for track in playlist)
     for track, user in pairs:
         ratings.add_tuple((1, track, user))

     model = SVD()
     model.set_data(ratings)
     model.compute(k=kn, min_values=1)
     self.model = model
Пример #46
0
    def setup_svd(self, vote_list):
        """Return ``self.svd``, building and caching it first when it is None.

        :param vote_list: iterable of votes; for each vote, ``vote[0].id`` is
            used as the user id, ``vote[1]`` as the item id and ``vote[2]``
            (converted to float) as the value.
        :return: whatever ``self.svd`` evaluates to after the optional build.
        """
        if self.svd is not None:
            return self.svd

        self.cache['svd'] = SVD()
        votes = Data()
        for vote in vote_list:
            voter_id = vote[0].id
            voted_item = vote[1]
            score = float(vote[2])
            # Tuple format is: <value, row, column>
            votes.add_tuple((score, voted_item, voter_id))

        self.cache['svd'].set_data(votes)
        self.cache['svd'].compute(k=self.k, min_values=1)
        return self.svd
def SVDtrain2(data, pct_train):
    """Split ``data`` and compute an SVD (k=100) on the training part.

    :param data: object exposing ``split_train_test(percent=...)``.
    :param pct_train: forwarded to ``split_train_test`` as ``percent``.
    :return: tuple ``(svd, train, test)``.
    """
    train, test = data.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    compute_options = dict(k=100,
                           min_values=5,
                           pre_normalize=None,
                           mean_center=True,
                           post_normalize=True)
    model.compute(**compute_options)
    return model, train, test
Пример #48
0
def create_svd_model(train):
    """Compute an SVD model (k=100, mean-centered) from ``train``.

    :param train: data set handed to ``SVD.set_data``.
    :return: the computed ``SVD`` instance.
    """
    model = SVD()
    model.set_data(train)
    compute_options = {
        'k': 100,
        'min_values': 0,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }
    model.compute(**compute_options)
    return model
Пример #49
0
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Train an SVD on part of a ratings file and print its RMSE and MAE.

    :param dat_file: ``::``-separated ratings file to load.
    :param pct_train: forwarded to ``split_train_test`` as ``percent``.
    """
    # How the columns of the ratings file map onto the matrix:
    #   'row': 1   -> rows of the matrix come from column 1 of the file
    #   'col': 0   -> columns of the matrix come from column 0 of the file
    #   'value': 2 -> values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers (not strings)
    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings = Data()
    ratings.load(dat_file, sep='::', format=ratings_layout)

    # Train/test split.
    train_set, test_set = ratings.split_train_test(percent=pct_train)

    # Fit the model on the training part only.
    model = SVD()
    model.set_data(train_set)
    model.compute(k=100,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # Evaluate on the held-out ratings.  MAE is the mean ABSOLUTE error;
    # the original author reported about 1.09 here, i.e. an error of almost
    # 1 point out of 5.
    rmse_metric = RMSE()
    mae_metric = MAE()
    for actual, item_id, user_id in test_set.get():
        try:
            predicted = model.predict(item_id, user_id)
            rmse_metric.add(actual, predicted)
            mae_metric.add(actual, predicted)
        except KeyError:
            # Rating skipped: a KeyError was raised while scoring it.
            pass

    print('RMSE=%s' % rmse_metric.compute())
    print('MAE=%s' % mae_metric.compute())
def train_svd(data):
    """
    Build the data-model matrix from ``data`` and compute an SVD (k=30) on it.

    :param data: input passed to ``get_data_model_matrix``.
    :return: the computed SVD model
    """
    model = SVD()
    matrix = get_data_model_matrix(data)
    model.set_data(matrix)
    compute_options = dict(k=30,
                           min_values=0,
                           pre_normalize=None,
                           mean_center=True,
                           post_normalize=True)
    model.compute(**compute_options)
    return model
def SVDloadData():
    """Load the ml-1m ratings into a new SVD, print its matrix, return it.

    Also switches ``recsys.algorithm.VERBOSE`` on.

    :return: the ``SVD`` instance with the data loaded (not yet computed).
    """
    model = SVD()
    recsys.algorithm.VERBOSE = True

    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(
        filename='/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat',
        sep='::',
        format=ratings_layout)

    print(model.get_matrix())
    return model
Пример #52
0
def Compute():
    """Compute an SVD (k=100) over ``./ml-1m/ratings.dat`` and save it.

    The result is saved through ``savefile='./mvsvd'``; nothing is returned.
    """
    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model = SVD()
    model.load_data(filename='./ml-1m/ratings.dat',
                    sep='::',
                    format=ratings_layout)

    compute_options = dict(k=100,
                           min_values=10,
                           pre_normalize=None,
                           mean_center=True,
                           post_normalize=True,
                           savefile='./mvsvd')
    model.compute(**compute_options)
Пример #53
0
def color_user(input_file, output_file, data_file):
    """Write the ids of the users most similar to a target user.

    :param input_file: text file with the target user id, one per line.
        When several ids are present, only the last one determines the
        output (unchanged from the previous behaviour).
    :param output_file: file that receives one similar user id per line
        (only entries whose similarity is greater than 0.5).
    :param data_file: comma-separated file; the first field of each line is
        used as the matrix row id and the second field as the column id.
    """
    data = Data()

    # Read every user's history and turn it into the matrix SVD works on.
    with open(data_file, 'r') as history:
        for line in history:
            # Drop the line terminator so the second field does not carry a
            # trailing newline into the column id.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            info = line.split(',')
            # Tuple format is: <value, row, column>
            data.add_tuple((1.0, info[0], info[1]))

    svd = SVD()
    svd.set_data(data)
    k = 5  # Usually, in a real dataset, you should set a higher number, e.g. 100
    svd.compute(k=k, min_values=3, pre_normalize=None, mean_center=False, post_normalize=True)

    # Read from the question file the user id that needs recommendations.
    # An empty file now yields an empty result instead of a NameError.
    user_list = []
    with open(input_file, 'r') as questions:
        for line in questions:
            # Without stripping the terminator the id would not match the
            # row ids loaded above.
            userid = line.rstrip('\r\n')
            if not userid.strip():
                continue
            user_list = svd.similar(userid)

    # Save the ids of all users with similarity above 50% to the answer
    # file.  The first entry is skipped: it is the target user's own id.
    with open(output_file, 'w') as answers:
        for user in user_list[1:]:
            if user[1] > 0.5:
                answers.write(user[0] + '\n')
Пример #54
0
def calculate_SVD_features():
    """Compute an SVD (k=100, no mean centering) over ``feature_matrix.csv``.

    Prints a short notice first and switches ``recsys.algorithm.VERBOSE`` on.

    :return: the computed ``SVD`` instance.
    """
    print("Thanks for input, calculating...")
    model = SVD()
    recsys.algorithm.VERBOSE = True

    csv_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(filename='feature_matrix.csv', sep=',', format=csv_layout)

    compute_options = dict(k=100,
                           min_values=0,
                           pre_normalize=None,
                           mean_center=False,
                           post_normalize=True)
    model.compute(**compute_options)
    return model
Пример #55
0
def calculate_SVD_users():
    """Compute an SVD (k=100) over ``user_data_working.csv``.

    Prints a short notice, switches ``recsys.algorithm.VERBOSE`` on and,
    once the SVD is computed, copies ``user_data_original.csv`` over
    ``user_data_working.csv``.

    :return: the computed ``SVD`` instance.
    """
    print("Thanks for input, calculating...")
    model = SVD()
    recsys.algorithm.VERBOSE = True

    csv_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(filename='user_data_working.csv',
                    sep=',',
                    format=csv_layout)

    compute_options = dict(k=100,
                           min_values=2,
                           pre_normalize=None,
                           mean_center=True,
                           post_normalize=True)
    model.compute(**compute_options)

    # Overwrite the working copy with the original file.
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return model
Пример #56
0
def evaulte(train_set, test_set):
    """Fit an SVD on ``train_set`` and return half the MAE over ``test_set``.

    Ratings for which a ``KeyError`` is raised while scoring are counted and
    skipped; the count and both set sizes are printed before returning.

    :param train_set: data handed to ``SVD.set_data``.
    :param test_set: data whose ``get()`` yields
        ``(rating, item_id, user_id)`` triples.
    :return: ``mae.compute() / 2.0``.
    """
    model = SVD()
    model.set_data(train_set)
    model.compute(k=KKK,
                  min_values=MIN_ITEM,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    error_metric = MAE()
    skipped = 0
    for actual, item_id, user_id in test_set.get():
        try:
            error_metric.add(actual, model.predict(item_id, user_id))
        except KeyError:
            skipped += 1

    # Joined with single spaces to match the original multi-item print.
    report_parts = ("k_err", skipped, " -- ", "test-len: ",
                    len(test_set.get()), "train-len: ", len(train_set.get()))
    print(" ".join(str(part) for part in report_parts))

    return error_metric.compute() / 2.0
Пример #57
0
def calculate_stats_features(pct_train):
    """Split ``feature_matrix.csv`` and compute an SVD on the training part.

    The SVD is computed with k=100, without mean centering and without
    post-normalization.

    :param pct_train: forwarded to ``split_train_test`` as ``percent``.
    :return: tuple ``(svd, train, test)``.
    """
    csv_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    features = Data()
    features.load('feature_matrix.csv', sep=',', format=csv_layout)

    train, test = features.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    compute_options = dict(k=100,
                           min_values=0,
                           pre_normalize=None,
                           mean_center=False,
                           post_normalize=False)
    model.compute(**compute_options)
    return model, train, test
#This is the recommendation algorithm based on the SVD
#This code can be run in real time but the model has to be pre-computed

import recsys.algorithm
from recsys.algorithm.factorize import SVD
'''
SVD recommendation for best fit movies. Includes known and unknown movies
'''

#Let's make things verbose
recsys.algorithm.VERBOSE = True
#Loading the computed model
svd = SVD(filename='movielens_small')
#Loading the movielens file of movies which has a mapping of movies to movie-id
loop = True

while (loop):
    ratings_file = open('ratings_small.csv', 'r+')
    movie_lens = open('movies.csv', 'r+')
    user_found = False
    movie_found = False
    USERID = int(input("Enter user id: "))
    #Check if the user_id exists. Since currently we are using the small database, we need to check each and every field.
    #If using the complete database, just check if the number lies in the range.
    for rating_row in ratings_file:
        rating_item = rating_row.split(',')
        if (int(rating_item[0]) == USERID):
            user_found = True
            break
    if (movie_found):
        for movie_row in movie_lens:
Пример #59
0
			# if cnt == 100000: break
			(user, item, week, time, feat1, feat2)=line.split('\t')
			test.append(
				{"1_user_id": int(user),
				 "2_item_id": int(item)
				})		
	return test

recsys.algorithm.VERBOSE = True
print "loading data"
data = Data()
data.load('../item_recom/train_info.tsv',sep='\t', format={'col':0, 'row':1, 'value':6, 'ids': int})

topic = 48
print "compute svd"
svd = SVD()
svd.set_data(data)
svd.compute(k=topic, min_values=0.0, pre_normalize=None, mean_center=True, post_normalize=True)

print "loading test data"
test = loadTest('../item_recom/test_info.tsv')

print svd.predict(0,0)

print "creating submission"
with open('../submissions/recsys_3.csv', 'w') as csvfile:
	fieldnames = ['uid#iid', 'pred']
	writer = csv.DictWriter(csvfile, fieldnames)
	writer.writeheader()
	for ind in xrange(len(test)):
		writer.writerow(