Example #1
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):

    data = Data()
    data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,'ids':int})
       

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K=100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Example #2
 def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
     # create a data file in Movielens format with the tastings data
     self.save_tastings_to_movielens_format_file(tastings)
     # for logging/testing purposes we may like this verbose
     if verbose:
         recsys.algorithm.VERBOSE = True
     svd = SVD()
     # load source data, perform SVD, save to zip file
     source_file = self.file_location(self.tastings_movielens_format)
     svd.load_data(filename=source_file,
                   sep='::',
                   format={
                       'col': 0,
                       'row': 1,
                       'value': 2,
                       'ids': int
                   })
     outfile = self.file_location(self.tastings_recsys_svd)
     svd.compute(k=k,
                 min_values=min_values,
                 pre_normalize=None,
                 mean_center=True,
                 post_normalize=True,
                 savefile=outfile)
     return svd
Example #3
def calculate_stats_users(pct_train):
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file,
              sep=',',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    train, test = data.split_train_test(percent=pct_train)
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100,
                min_values=2,
                pre_normalize=None,
                mean_center=True,
                post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
Example #4
def test_classifier(model, filename=None, itemkey="track", selector="SELECT * FROM train"):
    conn = sqlite3.connect("db.sqlite")
    conn.row_factory = dict_factory
    cur = conn.cursor()
    s = 0
    c = 0
    t_p = 0
    for i in range(0,10):
        svd = SVD()
        if filename is not None:
            svd.load_model(filename)
        l = list(cur.execute(selector))
        random.shuffle(l)
        count = len(l)
        svd.set_data([(x["rating"],x["track"],x["user"]) for x in l[0:int(count*0.7)]])
        K = 1000
        svd.compute(k=K, min_values=0.0, pre_normalize=None, mean_center=True, post_normalize=True)

        pairs = []
        for idx,item in enumerate(l[int(count*0.7):]): 
            user = item["user"]
            track = item[itemkey]
            pairs.append((predict_item(svd, track,user), item["rating"]))
        t_p += len(pairs)
        s += RMSE(pairs).compute()
        c += 1.0
        print "iteration"
    print s/c, t_p
Example #5
def evaluate(data, count=5, K=100):
    results = []

    for i in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        print len(data.get()), len(train.get()), len(test.get())
        #test_in_train(test, train)
        #print train.get()
        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

        #Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                #print "keyerror: ===========================================================>"
                continue
        try:
            rsu = {}
            rsu["RMSE"] = rmse.compute()
            rsu["MAE"] = mae.compute()
            print rsu
            results.append(rsu)
        except:
            print "one error....++++++++++++++++++++++++++++++++++++++++++++++++++++"
        

    return results
Example #6
def main():
    svd = SVD()
    train = Data()
    test = Data()
    train.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    test.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    svd.set_data(train)
    svd.compute(k=100, min_values=0.5, pre_normalize=False, mean_center=True, post_normalize=True)

    # rmse = RMSE()
    # mae = MAE()
    # for rating, item_id, user_id in test.get():
    #     try:
    #         pred_rating = svd.predict(item_id, user_id)
    #         rmse.add(rating, pred_rating)
    #         mae.add(rating, pred_rating)
    #     except KeyError:
    #         continue
    # print 'RMSE=%s' % rmse.compute()
    # print 'MAE=%s' % mae.compute()

    # test = make_test()
    # print precision_and_recall(test, svd)
    # rec_list = svd.recommend(200, n=5, only_unknowns=False, is_row=False)
    print svd.recommend(1, n=5, only_unknowns=False, is_row=False)
Example #7
def recommend(dimension=100):
    svd = SVD()
    svd.load_data(filename='rating.dat',
                sep='\t',
                format={'col':2, 'row':1, 'value':0, 'ids': int})

    k = dimension
    svd.compute(k=k, min_values=1, pre_normalize=None, mean_center=True, post_normalize=True)
    
    game_recdict={}
    for item in svd.recommend(1, is_row=False):
        appid=item[0]
        game=Game(appid)
        if (game.success==1):
            game_recdict[game.rec]=[game.appid, game.genre, game.name, game.img]
        
    sorted_list=sorted(game_recdict.keys(), reverse=True)
    print ("Games Recommended:")
    for i in sorted_list:
        # image
        urllib.urlretrieve(game_recdict[i][3], "local-filename.jpg")
        image = plt.imread("local-filename.jpg")
        plt.imshow(image)
        plt.show()
    
        #name
        print game_recdict[i][2]
Example #8
def train_and_save(filename):

    step = filename.split('.')[-1]

    data = Data()

    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    data.load(filename, sep='::', format=format)

    train, test = data.split_train_test(percent=80)

    try:

        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Already exists: svdn_model_{step}.zip'.format(step=step))

    except:

        svd = SVD()
        svd.set_data(train)

        svd.compute(
            k=100,
            min_values=2,
            pre_normalize=False,
            mean_center=True,
            post_normalize=True,
            savefile='svdn_model_{step}'.format(step=step)
        )

        print('Saved svdn_model_{step}.zip'.format(step=step))
Example #9
def similar_users(user):
    if not isinstance(user, str):
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        f = open('./dc_recom.dat', 'a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {
            'user': user,
            'recommended': True
        })

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    return [i[0] for i in svd.similar(user)]
Example #10
def reCompute(user_id):
    fname = 'ratings.dat'
    dataset = Data()
    format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    dataset.load(fname, sep='::', format=format)

    svd = SVD()
    svd.set_data(dataset)

    k = 100
    svd.compute(k=k,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    #New ID of Added User
    USERID = user_id

    a = svd.recommend(USERID, is_row=False)
    for j in range(1, len(a)):
        k = a[j][0]
        print df_movies.query('movie_id==@k')
Example #11
def compute_SVD():
	svd = SVD()
	svd.set_data(load_data())

	K=100
	svd.compute(k=K, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile=None)
	svd.save_model(os.path.join(utils.get_add_dir(), 'ratings'))
Example #12
def recommended_files(data,user):
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user)]
    
    #recoms = svd.recommend(user,is_row=True,only_unknowns=True,n=50)
    predict_arr = []

    user_tths = db.user_list.find({'user':user})
    tths = [i['tth'] for i in user_tths]
    movie_names = []
    
    for i in similar_users[1:]:
        for j in db.user_list.find({'user':i}):
            if j['tth'] not in tths:
                movie_name = db.tths.find_one({'tth':j['tth']})['name']
                movie_names.append(movie_name)               
                tths.append(j['tth'])   
                predict_arr.append((movie_name,j['tth'],svd.predict(user,j['tth'])))
    
    predict_arr = sorted(predict_arr,key=lambda x:x[2],reverse=True)
    res = []
    c_res = 0
    for p in predict_arr:
        flag = 0
        for r in res:
            if similar(p[0], r[0]):
                flag = 1
                break
        if flag == 0:
            res.append(p[1])
            c_res += 1
            if c_res > 10:
                return res
    return res
Example #13
def SVDtrain2(data,pct_train):
    train, test = data.split_train_test(percent=pct_train)
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True,
                post_normalize=True)
    return svd,train,test
Example #14
def recommended_files(user):
    if not isinstance(user, str):
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user':user})['recommended']==False:
        user_files = db.user_list.find({'user':user})
        f = open('./dc_recom.dat','a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user,n=10)]

    newdata = Data()
    for i in range(0,len(similar_users),1):
        files = db.user_list.find({'user':similar_users[i]})
        for f in files:
            newdata.add_tuple((1.0,similar_users[i],f['tth']))
    svd.set_data(newdata)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    recoms = svd.recommend(user,is_row=True,only_unknowns=True,n=100)

    res = []
    c_res = 0
    for p in recoms:
        flag=0
        for r in res:
            if similar(db.tths.find_one({'tth':p[0]})['name'],db.tths.find_one({'tth':r[0]})['name']):
                flag = 1
                break
        if flag == 0:
            res.append(p)
            c_res += 1
            if c_res > 10:
                k = []
                for i in res:
                    try:
                        j = 'magnet:?xt=urn:tree:tiger:'+i[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': i[0]})['name'])
                    except:
                        j = 'magnet:?xt=urn:tree:tiger:'+i[0]
                    k.append(j)
                return k
    k = []
    for i in res:
        try:
            j = 'magnet:?xt=urn:tree:tiger:'+i[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': i[0]})['name'])
        except:
            j = 'magnet:?xt=urn:tree:tiger:'+i[0]
        k.append(j)

    return k
Example #15
 def build_model(self, uids, kn):
     data = Data()
     for uid, songs in uids.items():
         for song in songs:
             data.add_tuple((1, song, uid))
     svd = SVD()
     svd.set_data(data)
     svd.compute(k=kn, min_values=1)
     self.model = svd
Example #16
	def build_model(self,uids,kn):
		data = Data()
		for uid,songs in uids.items():
			for song in songs:
				data.add_tuple((1,song,uid))
		svd = SVD()
		svd.set_data(data)
		svd.compute(k=kn,min_values=1)
		self.model = svd
Example #17
def calculate_SVD_features():
    print "Thanks for input, calculating..."
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'feature_matrix.csv'
    svd.load_data(filename=dat_file, sep=',', 
                format = {'col':0, 'row':1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=0, pre_normalize=None, 
                mean_center=False, post_normalize=True)
    return svd
Example #18
def train_svd(data):
    """
    This method load processed data and modelling data using Singular Value Decomposition
    :return: SVD model
    """
    svd = SVD()
    svd.set_data(get_data_model_matrix(data))
    k = 30
    svd.compute(k=k, min_values=0, pre_normalize=None, mean_center=True, post_normalize=True)
    return svd
Example #19
def getSVD():
    filename = "/home/udaysagar/Documents/Classes/239/recsys/model/movielens.zip"
    if os.path.exists(filename):
        return SVD("./model/movielens")
    else:
        svd = SVD()
        svd.load_data(filename='./data/movielens/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
        k = 100
        svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='./model/movielens')
        return svd
Example #20
def SVDtrain2(data, pct_train):
    train, test = data.split_train_test(percent=pct_train)
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K,
                min_values=5,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)
    return svd, train, test
Example #21
def calculate_SVD_users():
    print "Thanks for input, calculating..."
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'user_data_working.csv'
    svd.load_data(filename=dat_file, sep=',', 
                format = {'col':0, 'row':1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=2, pre_normalize=None, 
                mean_center=True, post_normalize=True)
    shutil.copy('user_data_original.csv','user_data_working.csv')
    return svd
Example #22
def calculate_stats_features(pct_train):
    dat_file='feature_matrix.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int})
    train, test = data.split_train_test(percent=pct_train)               
    K=100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=False)
    return svd,train,test
Example #23
def create_svd_model(train):
    """ Build SVD model
    """
    
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100,
                min_values=0,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)
    
    return svd
Example #24
 def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
     # create a data file in Movielens format with the tastings data
     self.save_tastings_to_movielens_format_file(tastings)
     # for logging/testing purposes we may like this verbose
     if verbose:
         recsys.algorithm.VERBOSE = True
     svd = SVD()
     # load source data, perform SVD, save to zip file
     source_file = self.file_location(self.tastings_movielens_format)
     svd.load_data(filename=source_file, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
     outfile = self.file_location(self.tastings_recsys_svd)
     svd.compute(k=k, min_values=min_values, pre_normalize=None, mean_center=True, post_normalize=True, savefile=outfile)
     return svd
Example #25
def create_svd_model(train):
    """ Build SVD model
    """

    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100,
                min_values=0,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    return svd
Example #26
def train_svd(data):
    """
    This method load processed data and modelling data using Singular Value Decomposition
    :return: SVD model
    """
    svd = SVD()
    svd.set_data(get_data_model_matrix(data))
    k = 30
    svd.compute(k=k,
                min_values=0,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)
    return svd
Example #27
def Compute():
    svd = SVD()
    svd.load_data(filename='./ml-1m/ratings.dat',
                  sep='::',
                  format={
                      'col': 0,
                      'row': 1,
                      'value': 2,
                      'ids': int
                  })
    svd.compute(k=100,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True,
                savefile='./mvsvd')
Example #28
def quickstart():
    svd = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    dat_file = DATA_DIR + 'ml-1m-ratings.dat'
    svd.load_data(filename=dat_file,
                  sep='::',
                  format={
                      'col': 0,
                      'row': 1,
                      'value': 2,
                      'ids': int
                  })

    # compute svd
    k = 100
    svd.compute(k=k,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    pdb.set_trace()

    # movie id's
    ITEMID1 = 1  # toy story
    ITEMID2 = 1221  # godfather II

    # get movies similar to toy story
    print svd.similar(ITEMID1)

    # get predicted rating for given user & movie
    MIN_RATING = 0.0
    MAX_RATING = 5.0
    USERID = 1
    ITEMID = 1

    # get predicted rating for user1 and item1, mapped onto min max
    pred = svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    actual = svd.get_matrix().value(ITEMID, USERID)
    print 'predicted rating = {0}'.format(pred)
    print 'actual rating = {0}'.format(actual)

    print 'which users should see Toy Story?:'
    print svd.recommend(ITEMID)
Example #29
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):

    data = Data()
    data.load(dat_file,
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    # About format parameter:
    #   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
    #   'col': 0 -> Cols in matrix come from column 0 in ratings.dat file
    #   'value': 2 -> Values (Mij) in matrix come from column 2 in the ratings.dat file
    #   'ids': int -> Ids (row and col ids) are integers (not strings)
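    #   For instance, a typical ml-1m line "1::1193::5::978300760" loads as
    #   value 5 at row (movie) 1193, column (user) 1; the fourth column
    #   (the timestamp) is ignored, since the format spec never maps it.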

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K,
                min_values=5,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    # MAE is the mean ABSOLUTE error
    # ... in this case it returns about 1.09, i.e. predictions are off by almost 1 point out of 5
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
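The RMSE/MAE objects can also be fed whole lists instead of one pair at a time. A minimal standalone check of the MAE metric, reusing the ground-truth/test vectors that appear in Example #48's comments:

from recsys.evaluation.prediction import MAE

mae = MAE()
mae.load_ground_truth([3.0, 1.0, 5.0, 2.0, 3.0])
mae.load_test([2.3, 0.9, 4.9, 0.9, 1.5])
print mae.compute()  # 0.7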
Example #30
def similar_users(user):
    if not isinstance(user, str):
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user':user})['recommended']==False:
        user_files = db.user_list.find({'user':user})
        f = open('./dc_recom.dat','a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    return [i[0] for i in svd.similar(user)]
Example #31
def calculate_SVD_features():
    print "Thanks for input, calculating..."
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'feature_matrix.csv'
    svd.load_data(filename=dat_file,
                  sep=',',
                  format={
                      'col': 0,
                      'row': 1,
                      'value': 2,
                      'ids': int
                  })
    svd.compute(k=100,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    return svd
Example #32
def evaulte(train_set, test_set):
    svd = SVD()
    svd.set_data(train_set)
    svd.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None, mean_center=True, post_normalize=True)

    mae = MAE()
    k_err = 0
    for rating, item_id, user_id in test_set.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            mae.add(rating, pred_rating)
        except KeyError:
            #print "keyerror: ===========================================================>"
            k_err += 1
            continue
    
    print "k_err", k_err, " -- ", "test-len: ", len(test_set.get()), "train-len: ", len(train_set.get())
    result = mae.compute()/2.0
    return result
Example #33
def calculate_SVD_users():
    print "Thanks for input, calculating..."
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'user_data_working.csv'
    svd.load_data(filename=dat_file,
                  sep=',',
                  format={
                      'col': 0,
                      'row': 1,
                      'value': 2,
                      'ids': int
                  })
    svd.compute(k=100,
                min_values=2,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return svd
Example #34
def compute(aws_region, s3_bucket, filename, sep, col_index, row_index, value_index, ids_type):
    download_from_s3(aws_region, s3_bucket, filename)
    svd = SVD()

    print 'Loading data to SVD module'
    svd.load_data(filename='./data/' + filename,
                  sep=sep,
                  format={'col':int(col_index), 'row':int(row_index), 'value':int(value_index), 'ids': ids_type})

    k = derive_latent_dimensions(svd, energy_level=0.6)

    print 'Starting to compute SVD at ', strftime("%Y-%m-%d %H:%M:%S", gmtime())
    svd.compute(k=k,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True,
                savefile='./models/recommender')
    print "SVD model saved at ", strftime("%Y-%m-%d %H:%M:%S", gmtime())
    sys.exit() # to make sure that process finishes at the end
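derive_latent_dimensions is project-specific and not shown here. A generic energy-based chooser over a vector of singular values might look like the sketch below (a hypothetical helper, not part of the python-recsys API):

import numpy as np

def choose_k(singular_values, energy_level=0.6):
    # smallest k whose leading squared singular values retain the
    # requested fraction of the total spectral energy
    energy = np.cumsum(np.square(np.asarray(singular_values, dtype=float)))
    return int(np.searchsorted(energy / energy[-1], energy_level) + 1)

# e.g. choose_k([5.0, 3.0, 1.0], 0.6) -> 1, since 25/35 ~ 0.71 >= 0.6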
Example #35
def calculate_stats_users(pct_train):
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int})
    train, test = data.split_train_test(percent=pct_train)               
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
                post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():      
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
Example #36
def calculate_stats_features(pct_train):
    dat_file = 'feature_matrix.csv'
    data = Data()
    data.load(dat_file,
              sep=',',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })
    train, test = data.split_train_test(percent=pct_train)
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=False)
    return svd, train, test
Example #37
def color_user(input_file, output_file, data_file):

    data = Data()

    # VALUE = 1.0
    # for username in likes:
    #     for user_likes in likes[username]:
    #         data.add_tuple((VALUE, username, user_likes)) # Tuple format is: <value, row, column>

    # read every user's history and build the matrix that SVD will consume
    f_r = open(data_file, 'r')
    for line in f_r:
        info = line.strip().split(',')
        data.add_tuple((1.0, info[0], info[1]))

    svd = SVD()
    svd.set_data(data)
    k = 5 # Usually, in a real dataset, you should set a higher number, e.g. 100
    svd.compute(k=k, min_values=3, pre_normalize=None, mean_center=False, post_normalize=True)

    # read the userid that needs recommendations from the question file
    fr = open(input_file, 'r')
    for line in fr:
        userid = line.strip()
        user_list = svd.similar(userid)

    #print('=============================================')
    #print(user_list)
    #print(len(user_list))

    # save to the answer file every user id whose similarity exceeds 50%
    fw = open(output_file, 'w')

    del user_list[0]  # drop the queried user's own id

    for user in user_list:
        if user[1] > 0.5: 
            fw.write(user[0] + '\n')
    fw.close()
Example #38
def loadSVD():        
    
    filename = 'favRate.dat'
    svd = SVD()
    svd.load_data(filename=filename, sep='::', format={'col':0, 'row':1, 'value':2})
    
    svd.save_data("svd.dat", False)
    
    K=20
    svd.compute(k=K, min_values=1, pre_normalize="rows", mean_center=False, post_normalize=True, savefile='.')
    
    
    #svd.recommend(USERID, n=10, only_unknowns=True, is_row=False)
    
    sparse_matrix = svd.get_matrix()
    
    sim_matrix = svd.get_matrix_similarity()
    
    
    
    print sparse_matrix
    
    #print sim_matrix
    
    #1173893,1396943
    sim = svd.similar(897346, 10)
    
    filename = 'swoffering.yaml'
    titleStream = file(filename, 'r')
    titleList = yaml.load(titleStream)
    
    #print sim
    
    for row in sim:
        
        (offid, similar) = row
        
        print offid, titleList[str(offid)], similar        
Example #39
def evaulte(train_set, test_set):
    svd = SVD()
    svd.set_data(train_set)
    svd.compute(k=KKK,
                min_values=MIN_ITEM,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    mae = MAE()
    k_err = 0
    for rating, item_id, user_id in test_set.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            mae.add(rating, pred_rating)
        except KeyError:
            #print "keyerror: ===========================================================>"
            k_err += 1
            continue

    print "k_err", k_err, " -- ", "test-len: ", len(
        test_set.get()), "train-len: ", len(train_set.get())
    result = mae.compute() / 2.0
    return result
Example #40
def ex1(dat_file='ml-1m/ratings.dat',
        pct_train=0.5):

    data = Data()
    data.load(dat_file, sep='::',
              format={'col':0, 'row':1, 'value':2, 'ids':int})
    # About format parameter:
    #   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
    #   'col': 0 -> Cols in matrix come from column 0 in ratings.dat file
    #   'value': 2 -> Values (Mij) in matrix come from column 2 in the ratings.dat file
    #   'ids': int -> Ids (row and col ids) are integers (not strings)

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K = 100
    svd = SVD()
    svd.set_data(train)
    svd.compute(
        k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Example #41
def quickstart():
    svd = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    dat_file = 'ml-1m/ratings.dat'
    svd.load_data(filename=dat_file, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})

    # compute svd
    k = 100
    svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
        post_normalize=True)

    pdb.set_trace()

    # movie id's
    ITEMID1 = 1      # toy story
    ITEMID2 = 1221   # godfather II

    # get movies similar to toy story
    svd.similar(ITEMID1)

    # get predicted rating for given user & movie
    MIN_RATING = 0.0
    MAX_RATING = 5.0
    USERID = 1
    ITEMID = 1

    # get predicted rating
    pred = svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    actual = svd.get_matrix().value(ITEMID, USERID)
    print 'predicted rating = {0}'.format(pred)
    print 'actual rating = {0}'.format(actual)

    # which users should see Toy Story?
    svd.recommend(ITEMID)
Example #42
def Compute():
	svd = SVD()
	svd.load_data(filename='./ml-1m/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
	svd.compute(k=100, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='./mvsvd')
Example #43
svd = SVD()
filename = './data4'
filename = './data3.csv'
#filename = './data2.csv'
filename = './data.csv'
filename = './data_l2.csv'
filename = './2016.6.29.for_svd.csv'
svd.load_data(filename=filename,
        sep=',',
        format={'col':0, 'row':1, 'value':2, 'ids': str})
# col -> user, row -> item, value -> label; 'ids': str means ids are read as strings

k = 100
r = svd.compute(k=k,
            min_values=2,
            pre_normalize=None,
            mean_center=False,
            post_normalize=True,
            savefile='/tmp/movielens')

#ITEMID1 = 109    # Toy Story (1995)
#ITEMID2 = 106 # A bug's life (1998)

#print(svd.similarity(ITEMID1, ITEMID2))
# 0.67706936677315799


item_set = set()
import csv
with open(filename, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        # assumed body of the truncated loop: collect item ids (column 1, per the format above)
        item_set.add(row[1])
Example #44
path = "datasets/ml-latest-small/ratings_train_1.csv"

svd = SVD()
svd.load_data(filename=path,
              sep=',',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': float
              })

k = 30
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
            savefile='/tmp/movielens')

# ITEMID1 = 1    # Toy Story (1995)
# ITEMID2 = 2355 # A bug's life (1998)

# print svd.similarity(ITEMID1, ITEMID2)

MIN_RATING = 1.0
MAX_RATING = 5.0

USERID = 1
ITEMID = 1129

print svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
Example #45
def recommended_files(user):
    if not isinstance(user, str):
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        f = open('./dc_recom.dat', 'a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {
            'user': user,
            'recommended': True
        })

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user, n=10)]

    newdata = Data()
    for i in range(0, len(similar_users), 1):
        files = db.user_list.find({'user': similar_users[i]})
        for f in files:
            newdata.add_tuple((1.0, similar_users[i], f['tth']))
    svd.set_data(newdata)
    svd.compute(k=1000,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=True)
    recoms = svd.recommend(user, is_row=True, only_unknowns=True, n=100)

    res = []
    c_res = 0
    for p in recoms:
        flag = 0
        for r in res:
            if similar(
                    db.tths.find_one({'tth': p[0]})['name'],
                    db.tths.find_one({'tth': r[0]})['name']):
                flag = 1
                break
        if flag == 0:
            res.append(p)
            c_res += 1
            if c_res > 10:
                k = []
                for i in res:
                    try:
                        j = 'magnet:?xt=urn:tree:tiger:' + i[
                            0] + "&dn=" + unidecode.unidecode(
                                db.tths.find_one({'tth': i[0]})['name'])
                    except:
                        j = 'magnet:?xt=urn:tree:tiger:' + i[0]
                    k.append(j)
                return k
    k = []
    for i in res:
        try:
            j = 'magnet:?xt=urn:tree:tiger:' + i[
                0] + "&dn=" + unidecode.unidecode(
                    db.tths.find_one({'tth': i[0]})['name'])
        except:
            j = 'magnet:?xt=urn:tree:tiger:' + i[0]
        k.append(j)

    return k
Example #46

# Splitting the dataset
filename = './data/ratings.dat'
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data.load(filename, sep='::', format=format)
train_80, test_20 = data.split_train_test(percent=80)  # 80% train, 20% test
svd = SVD()
svd.set_data(train_80)

# Setting the variables used to build the matrix
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
            savefile='./temporal/')

# Finding the similarity between 2 items
from recsys.algorithm.factorize import SVD

svd2 = SVD(filename='./temporal/')  # Loading already computed SVD model
        "country music", "office", "birds"
    }
}

data = Data()
VALUE = 1.0
for username in likes:
    for user_likes in likes[username]:
        data.add_tuple((VALUE, username,
                        user_likes))  # Tuple format is: <value, row, column>

svd = SVD()
svd.set_data(data)
k = 5  # Usually, in a real dataset, you should set a higher number, e.g. 100
svd.compute(k=k,
            min_values=3,
            pre_normalize=None,
            mean_center=False,
            post_normalize=True)

print(svd.similar('sheila'))
print("######################")
import difflib
for key in likes:
    rajat = likes['rajat']
    key1 = likes[key]
    rajat_list = list(rajat)
    key_list = list(key1)
    print 'rajat', key, difflib.SequenceMatcher(None, rajat_list, key_list).ratio()
Example #48
def svd(filepath):

    src_folder = parseOutputFolderPath(filepath)
    base_file_name = parseFileName(filepath)

    avg_rmse = 0.0
    avg_mae = 0.0

    out_file_base = base_file_name + "_pred_svd"
    out_file = open(src_folder + "output/" + out_file_base + EXT, "w")

    # for each fold
    for fold_index in xrange(1, NUM_FOLDS + 1):

        print "*** \t FOLD {0} \t ***".format(fold_index)

        M_test = lil_matrix((_N, _M))
        rmse = 0.0
        mae = 0.0

        train_path = src_folder + base_file_name + TRAIN_PREFIX + str(
            fold_index) + EXT
        test_path = src_folder + base_file_name + TEST_PREFIX + str(
            fold_index) + EXT

        print train_path
        print test_path

        svd = SVD()
        svd.load_data(filename=train_path,
                      sep=',',
                      format={
                          'col': 0,
                          'row': 1,
                          'value': 2,
                          'ids': float
                      })

        svd.compute(k=_K,
                    min_values=1,
                    pre_normalize=None,
                    mean_center=True,
                    post_normalize=True)

        with open(test_path, "r") as infile:
            reader = csv.reader(infile, delimiter=",")
            for line in reader:
                userid = int(line[0], 10)
                movieid = int(line[1], 10)
                score = float(line[2])
                M_test[userid, movieid] = score

        # GROUND_TRUTH = [3.0, 1.0, 5.0, 2.0, 3.0]
        # TEST = [2.3, 0.9, 4.9, 0.9, 1.5]
        # mae = MAE()
        # mae.load_ground_truth(GROUND_TRUTH)
        # mae.load_test(TEST)
        # mae.compute() #returns 0.7

        # write predictions only for first test (fold)
        if (fold_index == 1):
            rows, cols = M_test.nonzero()
            for row, col in zip(rows, cols):
                try:
                    r_xi = svd.predict(col, row, MIN_RATING, MAX_RATING)
                except:
                    print row, col
                    continue
                out_file.write(
                    str(row) + '\t' + str(col) + '\t' + str(r_xi) + '\n')

        print "..done"
        print ""

        exit()

    out_file.close()

    # average rmse and mae on validation folds
    eval_out_path = src_folder + "output/" + out_file_base + "_eval" + EXT

    with open(eval_out_path, "w") as file:
        file.write("RMSE" + "\t" + "MAE" + "\n")
        avg_rmse /= float(NUM_FOLDS)
        avg_mae /= float(NUM_FOLDS)
        file.write(str(avg_rmse) + "\t" + str(avg_mae))
Example #49
class NewsRec():
    def __init__(self):
        self.svd = SVD()
        self.test_set = []

    def load_data(self, filename='train_set_for_svd'):
        self.svd.load_data(filename,
                           sep='\t',
                           format={
                               'value': 0,
                               'row': 2,
                               'col': 1,
                               'ids': int
                           })

    def load_test(self, filename='test_set_for_svd'):
        with open(filename, 'r') as f:
            for line in f:
                strs = line.split('\t')
                self.test_set.append((int(strs[1]), int(strs[2])))

    def recom(self, user_id, recom_num=3, only_unknown=True):
        try:
            #index = self.svd._matrix._matrix.col_index(user_id)
            index = user_id
            return self.svd.recommend(index,
                                      recom_num,
                                      only_unknowns=only_unknown,
                                      is_row=False)
        except IndexError as e:
            return -1

    def compute(self, k=100):
        self.svd.compute(k=k,
                         min_values=None,
                         pre_normalize=None,
                         mean_center=False,
                         post_normalize=True)

    def test(self, recom_num=3):
        hit_cnt = 0
        self.ret = []
        for user, item in self.test_set:
            re = self.recom(user, recom_num)
            #print re
            if type(re) != type([]):
                continue
            try:
                #item_index = self.svd._matrix._matrix.row_index(item)
                item_index = item
            except KeyError as e:
                continue
            for rec_index, rec_rate in re:
                self.ret.append((user, rec_index))
                if item_index == rec_index:
                    hit_cnt += 1
        if hit_cnt == 0:
            return
        user_sum = len(self.test_set)
        recom_sum = recom_num * user_sum
        precise = float(hit_cnt) / recom_sum
        recall = float(hit_cnt) / user_sum
        f = 2.0 / ((1.0 / precise) + (1.0 / recall))
        print 'hit:', hit_cnt
        print 'precise:', precise
        print 'recall:', recall
        print 'F:', f

    def print_ret(self, filename):
        string = ["userid,newsid\n"]
        for user, item in self.ret:
            string.append(str(user))
            string.append(',')
            string.append(str(item))
            string.append('\n')
        with open(filename, 'w') as f:
            f.write("".join(string))
Example #50
class RecommendSystem(object):
    def __init__(self, filename, sep, **format):
        self.filename = filename
        self.sep = sep
        self.format = format

        # training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # flag: was a saved model loaded?
        self.is_load = False

        # data handling
        self.data = Data()

        # model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """
        获取数据
        :return: None
        """
        # if no saved model exists yet
        if not os.path.exists(tmpfile):
            # bail out if the data file does not exist either
            if not os.path.exists(self.filename):
                sys.exit()
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            # load the data with Data()
            self.data.load(self.filename, sep=self.sep, format=self.format)
            train, test = self.data.split_train_test(percent=80)
            return train, test
        else:
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

    def train(self, train):
        """
        训练模型
        :param train: 训练数据
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """
        评分预测
        :param itemid: 电影id
        :param userid: 用户id
        :return: None
        """
        score = self.svd.predict(itemid, userid)
        print "推荐的分数为:%f" % score
        return score

    def recommend_to_user(self, userid):
        """
        推荐给用户
        :param userid: 用户id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # read the movie titles from the movies file
        movie_list = []

        for line in open(moviefile, "r"):
            movie_list.append(' '.join(line.split("::")[1:2]))

        # print each recommended movie title with its predicted score
        for itemid, rate in recommend_list:
            print "给您推荐了%s,我们预测分数为%s" % (movie_list[itemid], rate)
        return None

    def evaluation(self, test):
        """
        模型的评估
        :param test: 测试集
        :return: None
        """
        # skip evaluation when the model was loaded from disk
        if not self.is_load:

            # iterate over the test tuples <rating, movie, user>
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    continue
            # compute and report the error (RMSE)
            error = self.rmse.compute()

            print "模型误差为%s:" % error

        return None
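A minimal sketch of how this class might be driven end to end, assuming the module-level tmpfile and moviefile paths it references plus a MovieLens-style ratings file (illustrative values throughout):

if __name__ == "__main__":
    rs = RecommendSystem("./data/ml-1m/ratings.dat", "::",
                         col=0, row=1, value=2, ids=int)
    train, test = rs.get_data()  # (None, None) when a saved model was loaded
    rs.train(train)
    rs.evaluation(test)
    rs.recommend_to_user(1)      # illustrative user id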
Example #51
# This script uses singular value decomposition (SVD) to compute a model from the ratings file.
# It needs to be run only once; the computed model is saved as a zip file.
# Mathematically, SVD factorizes the ratings matrix as M = U * Sigma * V^T; the pyrecsys library implements the algorithm.
# Refer to the docs for more details on SVD.

import recsys.algorithm
from recsys.algorithm.factorize import SVD


# Make the script verbose.
recsys.algorithm.VERBOSE = True

#computing the SVD model
svd = SVD()
#loading the ratings file. Format is used to create the matrix for SVD
svd.load_data(filename='ratings_complete.csv', sep=',' , format={'col':0, 'row':1,  'value':2, 'ids':int})
# Now, let's compute the SVD: M = U * Sigma * V^T
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='movielens_complete')

print("Model Computed and Created")
Example #52
class NewsRec():
	def __init__(self):
		self.svd = SVD()
		self.test_set = []

	def load_data(self,filename = 'train_set_for_svd'):
		self.svd.load_data(filename,sep='\t',format={'value':0,'row':2,'col':1,'ids':int})
	
	def load_test(self,filename = 'test_set_for_svd'):
		with open(filename,'r') as f:
			for line in f:
				strs = line.split('\t')
				self.test_set.append((int(strs[1]),int(strs[2])))

	def recom(self,user_id,recom_num=3,only_unknown=True):
		try:
			#index = self.svd._matrix._matrix.col_index(user_id)
			index = user_id
			return self.svd.recommend(index,recom_num,only_unknowns=only_unknown,is_row=False)
		except IndexError as e:
			return -1

	def compute(self,k = 100):
		self.svd.compute(k=k, min_values=None, pre_normalize=None, mean_center=False, post_normalize=True)

	def test(self,recom_num=3):
		hit_cnt = 0
		self.ret = []
		for user,item in self.test_set:
			re = self.recom(user,recom_num)
			#print re
			if type(re) !=	type([]):
				continue
			try:
				#item_index = self.svd._matrix._matrix.row_index(item)
				item_index = item
			except KeyError as e:
				continue
			for rec_index,rec_rate in re:
				self.ret.append((user,rec_index))
				if item_index == rec_index:
					hit_cnt += 1
		if hit_cnt == 0:
			return
		user_sum = len(self.test_set)
		recom_sum = recom_num * user_sum
		precise = float(hit_cnt) / recom_sum
		recall = float(hit_cnt) / user_sum
		f = 2.0 / (( 1.0 / precise) + (1.0 / recall))
		print 'hit:',hit_cnt
		print 'precise:',precise
		print 'recall:',recall
		print 'F:',f

	def print_ret(self,filename):
		string = ["userid,newsid\n"]
		for user,item in self.ret:
			string.append(str(user))
			string.append(',')
			string.append(str(item))
			string.append('\n')
		with open(filename,'w') as f:
			f.write("".join(string))
Example #53
class Recommender:
    def __init__(self, datafile_path=None):
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self,
                      filename,
                      film_names_with_rate_list,
                      K,
                      min_values,
                      MAX_COUNT_USER_FILMS=None,
                      MAX_COUNT_FILM_USERS=None):
        self.matrix = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS).\
            create_matrix_by_film_titles(film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self,
                                      min_rate=1,
                                      max_rate=10,
                                      top=None,
                                      K=None,
                                      min_values=0):
        if K:
            self.__compute_matrix(K)

        self.predict_matrix = np.zeros((len(self.matrix.users_indexes_map),
                                        len(self.matrix.films_indexes_map)))
        for user in self.matrix.users_indexes_map.keys():
            for film in self.matrix.films_indexes_map.keys():
                user_index = self.matrix.users_indexes_map[user]
                film_index = self.matrix.films_indexes_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index,
                    film_index,
                    MIN_VALUE=min_rate,
                    MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self,
                         user_index,
                         min_rate=1,
                         max_rate=10,
                         top=None,
                         repeat=False,
                         K=None,
                         min_values=None):
        """
        :param K: to change the number of properties
        :return: {Film : int(rate), ...} or
                [(Film, int(rate)), ...] if top is not None
        """
        if K:
            self.__compute_matrix(K)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in xrange(np_matrix.shape[1]):
            rate = self.svd.predict(user_index,
                                    index,
                                    MIN_VALUE=min_rate,
                                    MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            fake_user_index = self.matrix.indexes_with_fake_user_ids.keys()[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            films = user.get_preferences().keys()

            prediction = [(x, prediction[x]) for x in prediction
                          if x not in films]

        if top:
            prediction = sorted(prediction.items(), key=operator.itemgetter(1))
            prediction = list(reversed(prediction[-top:]))

        return prediction

    def predict_for_all_fake_users(self,
                                   min_rate=1,
                                   max_rate=10,
                                   top=None,
                                   K=None,
                                   min_values=0):
        """
        :param K: to change the number of properties
        :return: [{Film : int(rate), ...}, ...]
        """
        if K:
            self.__compute_matrix(K)

        predictions = []

        for user_index in self.matrix.indexes_with_fake_user_ids.keys():
            prediction = self.predict_for_user(user_index, min_rate, max_rate,
                                               top)
            predictions.append(prediction)

        return predictions

    def predicted_rating_submatrix(self, user_indexes):
        self.__compute_matrix(100)
        predicted = np.empty((1, self.matrix.rating_matrix.shape[1]), int)
        for index in user_indexes:
            row = []
            for film_index in xrange(self.matrix.rating_matrix.shape[1]):
                row.append(
                    self.svd.predict(index,
                                     film_index,
                                     MIN_VALUE=1,
                                     MAX_VALUE=10))

            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self,
                         K,
                         min_values=0,
                         pre_normalize=None,
                         mean_center=True,
                         post_normalize=True):
        self.svd.load_data(self.datafile_path,
                           sep=' ',
                           format={
                               'col': 1,
                               'row': 0,
                               'value': 2,
                               'ids': int
                           })
        self.svd.compute(K,
                         min_values,
                         pre_normalize,
                         mean_center,
                         post_normalize,
                         savefile=None)

    def filter_films_data(self, min_user_votes):
        film_indexes = []
        counter = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            r = csv.reader(my_file)
            for row in r:
                user_index, film_index, rate = row[0].split(' ')
                counter[int(film_index)] += 1

            for k, v in counter.iteritems():
                if v < min_user_votes:
                    film_indexes.append(k)

        copyfile(self.datafile_path + '_user_map',
                 self.datafile_path + '_' + str(min_user_votes) + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            r = csv.reader(read_file)
            with open(
                    self.datafile_path + '_' + str(min_user_votes) +
                    '_film_map', 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in r:
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + '_' + str(min_user_votes),
                      'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in r:
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
Example #54
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
svd = SVD()
svd.load_data(filename='train.csv', sep=',', format={'col':0, 'row':1, 'value':2})

k = 100
svd.compute(k=k, pre_normalize=None, mean_center=True, post_normalize=True)

MIN_RATING = 0.0
MAX_RATING = 5000.0

import csv
test_file = 'test.csv'
soln_file = 'recsys.csv'

with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]
            res    = svd.predict(artist, user, MIN_RATING, MAX_RATING)
            soln_csv.writerow([id, res])
Example #55
print len(data._data)

for rate in data._data:
    rate[0]

data.set([rate for rate in data._data if rate[1]<1000])

print len(data._data)

svd.set_data(data)

k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)


#ITEMID1 = 1    # Toy Story (1995)
#ITEMID2 = 2355 # A bug's life (1998)

#print svd.similarity(ITEMID1, ITEMID2)
#print svd.similar(ITEMID1)

MIN_RATING = 0.0
MAX_RATING = 5.0
ITEMID = 1
USERID = 1
Example #56
#!/usr/bin/env python
# coding=utf-8

from recsys.algorithm.factorize import SVD
svd = SVD()
svd.load_data(filename='../invited_info_train_question_sort.txt',
              sep='\t',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': str
              })
k = 200
svd.compute(k=k, savefile='../tmp/weight')

svd2 = SVD(filename='../tmp/weight')  # Loading already computed SVD model

output_path = "./output.txt"
output_file = open(output_path, 'w')
validate_file = file("../validate_nolabel.txt")
line = validate_file.readline()
line = validate_file.readline().strip("\r\n")

while line:
    question_id = line.split(',')[0]
    user_id = line.split(',')[1]
    try:
        predict = svd2.predict(user_id, question_id, 0.0, 1.0)
    except:
        predict = 0
    # assumed tail of the truncated loop: record the prediction and advance
    output_file.write(question_id + ',' + user_id + ',' + str(predict) + '\n')
    line = validate_file.readline().strip("\r\n")
Example #57
			test.append(
				{"1_user_id": int(user),
				 "2_item_id": int(item)
				})		
	return test

recsys.algorithm.VERBOSE = True
print "loading data"
data = Data()
data.load('../item_recom/train_info.tsv',sep='\t', format={'col':0, 'row':1, 'value':6, 'ids': int})

topic = 48
print "compute svd"
svd = SVD()
svd.set_data(data)
svd.compute(k=topic, min_values=0.0, pre_normalize=None, mean_center=True, post_normalize=True)

print "loading test data"
test = loadTest('../item_recom/test_info.tsv')

print svd.predict(0,0)

print "creating submission"
with open('../submissions/recsys_3.csv', 'w') as csvfile:
	fieldnames = ['uid#iid', 'pred']
	writer = csv.DictWriter(csvfile, fieldnames)
	writer.writeheader()
	for ind in xrange(len(test)):
		writer.writerow(
			{
				'uid#iid': "%d#%d"%(test[ind]["1_user_id"], test[ind]["2_item_id"]),
				# assumed tail of the truncated snippet: predict(row=item, col=user)
				'pred': svd.predict(test[ind]["2_item_id"], test[ind]["1_user_id"])
			})
Example #58
File: day_07.py Project: lmlzk/ML
class RecommendSystem(object):
    def __init__(self, filename, sep, **format):
        # file info
        self.filename = filename
        self.sep = sep
        self.format = format

        # initialize the SVD factorization
        self.svd = SVD()

        # matrix settings
        self.k = 100  # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 users
        self.post_normalize = False

        # flag: was a saved model loaded?
        self.load_model = False

        # initialize the RMSE metric
        self.rmse = RMSE()

    def get_data(self):
        # if the saved model does not exist, load the raw data
        if not os.path.exists(filename):
            if not os.path.exists(self.filename):
                sys.exit()
            # load the data (SVD could also load it directly:)
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            data = Data()

            data.load(self.filename, sep=self.sep, format=self.format)

            # split the dataset
            train, test = data.split_train_test(percent=80)

            return train, test

        else:
            # load the saved model directly
            self.svd.load_model(filename)

            # mark the model as loaded
            self.load_model = True

            return None, None

    def train(self, train):
        """
        训练数据
        :param train: 训练集
        :return:
        """
        if not self.load_model:
            # feed the training set to the SVD
            self.svd.set_data(train)
            # note: savefile takes the file name without its extension
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=filename[:-4])
        return None

    def recommend_to_user(self, userid):
        """
        推荐结果
        :param usrid: 用于ID
        :return: None
        """

        recommend_list = self.svd.recommend(userid, is_row=False)

        # print each movie title together with its predicted rating

        # build the list of movie titles
        movies_list = []

        for line in open("./data/ml-1m/movies.dat", "r"):
            movies_list.append(' '.join(line.split("::")[1:2]))

        # walk the recommended ids one by one
        for itemid, rating in recommend_list:

            print "给你推荐的电影叫%s, 预测你对它的评分是%f" % (movies_list[itemid], rating)

        return None

    def rs_predict(self, userid, itemid):
        """
        得出评分
        :param userid: 用户ID
        :param itemid: 物品ID
        :return: 评分
        """
        score = self.svd.predict(itemid, userid)

        return score

    def evaluation(self, test):
        """
        均方误差评估模型
        :param test: 测试数据
        :return: None
        """
        if not self.load_model:
            # pull <rating, row(itemid), col(userid)> tuples from the test set
            for rating, itemid, userid in test.get():
                try:
                    # rating is the ground-truth value
                    score = self.rs_predict(userid, itemid)

                    # accumulate every test pair
                    self.rmse.add(rating, score)
                except KeyError:
                    continue

            error = self.rmse.compute()

            print "均方误差为:%s" % error

        return None