예제 #1
0
def svd_protocol_evaluation(data_path, params):
    """Train FunkSVD on the evaluation split and append ranking metrics to a report.

    Trains ``pyreclab.SVD`` on ``eval_train_N20.data``, requests the top 100
    recommendations for ``test/test_N20.data``, post-filters every user's list
    with ``remove_consumed`` and ``recs_cleaner``, and scores it with MRR,
    nDCG, AP and R-precision at N = 5, 10, 15 and 20. One line per N, holding
    the mean of each metric over all users, is appended to
    ``TwitterRatings/funkSVD/clean/protocol.txt``.

    Args:
        data_path: Path prefix of the dataset files. It is joined by plain
            string concatenation, so it must end with a path separator.
        params: Mapping with the keys ``'f'``, ``'mi'``, ``'lr'`` and
            ``'lamb'``, forwarded to ``svd.train`` as ``factors``,
            ``maxiter``, ``lr`` and ``lamb``.

    Returns:
        None. The results are only written to the report file.
    """
    cutoffs = (5, 10, 15, 20)
    solr = "http://localhost:8983/solr/grrecsys"
    train_file = data_path + 'eval_train_N20.data'
    test_file = data_path + 'test/test_N20.data'

    # Original author's note on this lookup: "should be test_c, but since
    # includeRated=False it makes no difference".
    test_c = consumption(ratings_path=test_file,
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=train_file,
                          rel_thresh=0,
                          with_ratings=False)

    svd = pyreclab.SVD(dataset=train_file,
                       dlmchar=b',',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)
    svd.train(factors=params['f'],
              maxiter=params['mi'],
              lr=params['lr'],
              lamb=params['lamb'])
    # testrec also returns its own MAP and nDCG; they are not used here, and
    # the underscore names avoid shadowing the builtin ``map``.
    recommendationList, _map, _ndcg = svd.testrec(input_file=test_file,
                                                  dlmchar=b',',
                                                  header=False,
                                                  usercol=0,
                                                  itemcol=1,
                                                  ratingcol=2,
                                                  topn=100,
                                                  relevance_threshold=0,
                                                  includeRated=False)

    MRRs = dict((N, []) for N in cutoffs)
    nDCGs = dict((N, []) for N in cutoffs)
    APs = dict((N, []) for N in cutoffs)
    Rprecs = dict((N, []) for N in cutoffs)

    for userId, recList in recommendationList.items():
        user_train = train_c[userId]
        # Redundant according to Gabriel (original note): testrec reportedly
        # does not return already-consumed items.
        book_recs = remove_consumed(user_consumption=user_train,
                                    rec_list=recList)
        book_recs = recs_cleaner(solr=solr,
                                 consumpt=user_train,
                                 recs=book_recs[:100])
        recs = user_ranked_recs(user_recs=book_recs,
                                user_consumpt=test_c[userId])

        # Materialise the keys once: dict views cannot be sliced on Python 3.
        ranked_keys = list(recs)
        for N in cutoffs:
            mini_recs = dict((k, recs[k]) for k in ranked_keys[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            # AP_at_N receives the full list and applies the cut-off itself.
            APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

    # Open the report once instead of re-opening it for every cut-off.
    with open('TwitterRatings/funkSVD/clean/protocol.txt', 'a') as report:
        for N in cutoffs:
            report.write(
                "N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" %
                (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]),
                 mean(Rprecs[N])))
예제 #2
0
def nDCGMAP_calculator(data_path, params, topN, output_filename):
    """Train FunkSVD and append nDCG / MAP / MRR figures for several cut-offs.

    Trains ``pyreclab.SVD`` on ``ratings.train``, requests the top 100
    recommendations for the first file listed in ``test/``, and scores every
    user's ranked list against the consumption loaded from ``ratings.total``.
    For each ``n`` in ``topN`` one line with the mean metrics over all users
    is appended to ``TwitterRatings/funkSVD/<output_filename>``.

    Args:
        data_path: Path prefix of the dataset files. It is joined by plain
            string concatenation, so it must end with a path separator.
        params: Mapping with the keys ``'f'``, ``'mi'``, ``'lr'`` and
            ``'lamb'``, forwarded to ``svd.train``.
        topN: Cut-off values; iterated several times, so pass a list or
            tuple rather than a one-shot iterator.
        output_filename: Name of the report file inside
            ``TwitterRatings/funkSVD/``.

    Returns:
        None. The results are only written to the report file.
    """
    user_consumption = consumption(ratings_path=data_path + 'ratings.total',
                                   rel_thresh=0,
                                   with_ratings=True)
    svd = pyreclab.SVD(
        dataset=data_path + 'ratings.train',
        dlmchar=b',',
        header=False,
        usercol=0,
        itemcol=1,
        ratingcol=2)
    svd.train(factors=params['f'],
              maxiter=params['mi'],
              lr=params['lr'],
              lamb=params['lamb'])
    # NOTE(review): os.listdir() returns entries in arbitrary order, so this
    # is only well defined when test/ holds exactly one file - confirm.
    recommendationList = svd.testrec(
        input_file=data_path + 'test/' +
        os.listdir(data_path + 'test/')[0],
        dlmchar=b',',
        header=False,
        usercol=0,
        itemcol=1,
        ratingcol=2,
        topn=100,
        includeRated=False)
    MRR_thresh4 = []
    MRR_thresh3 = []
    nDCGs_bin_thresh4 = dict((n, []) for n in topN)
    nDCGs_bin_thresh3 = dict((n, []) for n in topN)
    nDCGs_normal = dict((n, []) for n in topN)
    nDCGs_altform = dict((n, []) for n in topN)
    APs_thresh4 = dict((n, []) for n in topN)
    APs_thresh3 = dict((n, []) for n in topN)
    APs_thresh2 = dict((n, []) for n in topN)

    # Element 0 of the testrec result is indexed by user id below.
    for userId in recommendationList[0]:
        recs = user_ranked_recs(user_recs=recommendationList[0][userId],
                                user_consumpt=user_consumption[userId])

        # MRR is computed on the full list, once per user (not per cut-off).
        MRR_thresh4.append(MRR(recs=recs, rel_thresh=4))
        MRR_thresh3.append(MRR(recs=recs, rel_thresh=3))

        # dict.keys() is a non-sliceable view on Python 3; materialise the
        # keys once so the top-n slice works on both Python 2 and 3.
        ranked_keys = list(recs)
        for n in topN:
            mini_recs = dict((k, recs[k]) for k in ranked_keys[:n])
            nDCGs_bin_thresh4[n].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=4))
            nDCGs_bin_thresh3[n].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=3))
            nDCGs_normal[n].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            nDCGs_altform[n].append(
                nDCG(recs=mini_recs, alt_form=True, rel_thresh=False))
            # AP_at_N receives the full list and applies the cut-off itself.
            APs_thresh4[n].append(AP_at_N(n=n, recs=recs, rel_thresh=4))
            APs_thresh3[n].append(AP_at_N(n=n, recs=recs, rel_thresh=3))
            APs_thresh2[n].append(AP_at_N(n=n, recs=recs, rel_thresh=2))

    with open('TwitterRatings/funkSVD/' + output_filename, 'a') as report:
        for n in topN:
            report.write( "N=%s, normal nDCG=%s, alternative nDCG=%s, bin nDCG(rel_thresh=4)=%s, bin nDCG(rel_thresh=3)=%s, MAP(rel_thresh=4)=%s, MAP(rel_thresh=3)=%s, MAP(rel_thresh=2)=%s, MRR(rel_thresh=4)=%s, MRR(rel_thresh=3)=%s\n" % \
             (n, mean(nDCGs_normal[n]), mean(nDCGs_altform[n]), mean(nDCGs_bin_thresh4[n]), mean(nDCGs_bin_thresh3[n]), mean(APs_thresh4[n]), mean(APs_thresh3[n]), mean(APs_thresh2[n]), mean(MRR_thresh4), mean(MRR_thresh3)) )
예제 #3
0
def SVDJob(data_path, params, n_folds=4):
    """Cross-validate one FunkSVD configuration and return its mean errors.

    For every fold ``i`` in ``1..n_folds`` a fresh ``pyreclab.SVD`` model is
    trained on ``train/train_N20.<i>`` and evaluated with ``svd.test`` on
    ``val/val_N20.<i>``.

    Args:
        data_path: Path prefix of the fold files. It is joined by plain
            string concatenation, so it must end with a path separator.
        params: Mapping with the keys ``'f'``, ``'mi'``, ``'lr'`` and
            ``'lamb'``, forwarded to ``svd.train``.
        n_folds: Number of train/validation fold pairs to evaluate. Defaults
            to 4, the value that used to be hard-coded.

    Returns:
        Tuple ``(mean MAE, mean RMSE)`` over the evaluated folds.
    """
    # Rationale from the original author: the N20 folds are used because they
    # are smaller, so none of them overlaps with a testing fold.
    maes, rmses = [], []
    for i in range(1, n_folds + 1):
        fold = str(i)
        svd = pyreclab.SVD(dataset=data_path + 'train/train_N20.' + fold,
                           dlmchar=b',',
                           header=False,
                           usercol=0,
                           itemcol=1,
                           ratingcol=2)
        svd.train(factors=params['f'],
                  maxiter=params['mi'],
                  lr=params['lr'],
                  lamb=params['lamb'])
        # The per-rating prediction list is not needed, only the errors.
        _, mae, rmse = svd.test(input_file=data_path + 'val/val_N20.' + fold,
                                dlmchar=b',',
                                header=False,
                                usercol=0,
                                itemcol=1,
                                ratingcol=2)
        maes.append(mae)
        rmses.append(rmse)
    return mean(maes), mean(rmses)
예제 #4
0
def test_funksvd():
    """Check FunkSVD on the u1 split against the stored reference values.

    Verifies a single prediction, a top-5 recommendation and the MAE/RMSE
    reported by ``test`` against the module-level ``expected_*`` values,
    using the matching ``*_epsilon`` tolerances for the numeric ones.
    """
    # Column layout shared by the training and the test file.
    reader_opts = dict(dlmchar=b'\t',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)

    model = pyreclab.SVD(dataset='dataset/u1.base', **reader_opts)
    model.train(factors=1000, maxiter=100, lr=0.01, lamb=0.1)

    prediction_error = abs(model.predict('457', '443') - expected_prediction)
    assert prediction_error < prediction_epsilon

    top5 = model.recommend('457', 5, includeRated=False)
    assert top5 == expected_ranking

    # The per-rating prediction list is not inspected, only the error figures.
    _, mae, rmse = model.test(input_file='dataset/u1.test',
                              output_file='predictions.csv',
                              **reader_opts)

    mae_error = abs(mae - expected_mae)
    rmse_error = abs(rmse - expected_rmse)
    assert mae_error < mae_epsilon
    assert rmse_error < rmse_epsilon
예제 #5
0
def main(factors=100):
    """Run the FunkSVD demo on the u1 split and return its prediction errors.

    Trains ``pyreclab.SVD`` on ``dataset/u1.base``, prints one prediction and
    one top-5 recommendation for user 457, evaluates rating prediction on
    ``dataset/u1.test`` (predictions go to ``predictions.csv``) and generates
    top-10 rankings (written to ``ranking.json``), printing the elapsed time
    of each stage.

    Args:
        factors: Number of latent factors passed to ``svd.train``.

    Returns:
        Tuple ``(mae, rmse)`` reported by ``svd.test``.
    """
    # time.clock() was removed in Python 3.8. Prefer perf_counter() and fall
    # back to clock() only on interpreters that predate it (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    svd = pyreclab.SVD(dataset='dataset/u1.base',
                       dlmchar=b'\t',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)

    print('-> training model')
    start = timer()
    svd.train(factors=factors, maxiter=100, lr=0.01, lamb=0.1)
    end = timer()
    print('training time: ' + str(end - start))

    print('-> individual test')
    pred = svd.predict('457', '443')
    print('user 457, item 443, prediction ' + str(pred))

    ranking = svd.recommend('457', 5, includeRated=False)
    print('recommendation for user 457: ' + str(ranking))

    print('-> prediction test')
    start = timer()
    # The per-rating prediction list is not used, only the error figures.
    _, mae, rmse = svd.test(input_file='dataset/u1.test',
                            dlmchar=b'\t',
                            header=False,
                            usercol=0,
                            itemcol=1,
                            ratingcol=2,
                            output_file='predictions.csv')
    end = timer()
    print('prediction time: ' + str(end - start))

    print('MAE: ' + str(mae))
    print('RMSE: ' + str(rmse))

    print('-> recommendation test')
    start = timer()
    # The rankings are written to ranking.json; the return value is not needed.
    svd.testrec(input_file='dataset/u1.test',
                dlmchar=b'\t',
                header=False,
                usercol=0,
                itemcol=1,
                ratingcol=2,
                topn=10,
                output_file='ranking.json',
                includeRated=False)
    end = timer()
    print('recommendation time: ' + str(end - start))

    # NOTE(review): the explicit del plus the 10 s pause presumably lets the
    # model's memory be released before the caller continues - confirm.
    del svd

    time.sleep(10)
    return mae, rmse
예제 #6
0
def generate_recommends(params):
    """Train FunkSVD on the Twitter ratings and write top-10 rankings to disk.

    Trains ``pyreclab.SVD`` on ``TwitterRatings/funkSVD/ratings.train`` and
    runs ``testrec`` on ``TwitterRatings/funkSVD/ratings.test`` with
    ``topn=10`` and ``TwitterRatings/funkSVD/ranking.json`` as output file.
    Hyper-parameters and stage timings are logged through ``logging.info``.

    Args:
        params: Mapping with the keys ``'f'``, ``'mi'``, ``'lr'`` and
            ``'lamb'``, forwarded to ``svd.train`` as ``factors``,
            ``maxiter``, ``lr`` and ``lamb``.

    Returns:
        None.
    """
    # time.clock() was removed in Python 3.8. Prefer perf_counter() and fall
    # back to clock() only on interpreters that predate it (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    svd = pyreclab.SVD(dataset='TwitterRatings/funkSVD/ratings.train',
                       dlmchar=b',',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)

    logging.info("-> Entrenando modelo..")
    # The values come from ``params``; the former bare names f, mi, lr and
    # lamb were never defined in this function and raised NameError.
    logging.info(
        "N° Factores: {0}; maxiter: {1}; learning rate: {2}; lambda: {3} ".
        format(params['f'], params['mi'], params['lr'], params['lamb']))

    start = timer()
    svd.train(factors=params['f'],
              maxiter=params['mi'],
              lr=params['lr'],
              lamb=params['lamb'])
    end = timer()

    logging.info("training time: " + str(end - start))

    logging.info("-> Test de Recomendación..")
    start = timer()
    # The rankings are written to ranking.json; the return value is not needed.
    svd.testrec(input_file='TwitterRatings/funkSVD/ratings.test',
                dlmchar=b',',
                header=False,
                usercol=0,
                itemcol=1,
                ratingcol=2,
                topn=10,
                output_file='TwitterRatings/funkSVD/ranking.json')
    end = timer()
    logging.info('recommendation time: ' + str(end - start))
예제 #7
0
# Base names of the per-experiment output files; the loop below appends the
# experiment number and ".csv" to each of them.
# NOTE(review): 'predictionsSVD ' ends with a space while 'ordenadasSVD' does
# not, so the generated prediction file names contain a space - looks like a
# typo, confirm before changing.
prediction_filename = data_url + 'predictionsSVD '  # leftover of an earlier hard-coded "5.csv'" suffix
ordenada_filename = data_url + 'ordenadasSVD'  # leftover of an earlier hard-coded "5.csv'" suffix

print 'SVD'
for i in [1, 5]:  #range(1, data_chunks + 1):
    f_t = training_filename + str(i) + ".txt"
    f_p = probe_filename + str(i) + ".txt"
    f_pred = prediction_filename + str(i) + ".csv"
    f_ord = ordenada_filename + str(i) + ".csv"

    print "Corriendo experimento ", i, "..."
    print 'Entrenando...'
    obj = pyreclab.SVD(dataset=f_t,
                       dlmchar=b'\t',
                       header=True,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)
    obj.train(factors=1000, maxiter=100, lr=0.01, lamb=0.1)
    print 'Prediciendo...'
    #prediction = obj.predict( "630685", "1")
    #ranking = obj.recommend( "630685", 10, True)
    #print prediction
    #print ranking

    predictionList, mae, rmse = obj.test(input_file=f_p,
                                         dlmchar=b'\t',
                                         header=False,
                                         usercol=0,
                                         itemcol=1,
                                         ratingcol=2,
예제 #8
0
#! /usr/bin/env python

import time
import pyreclab

if __name__ == '__main__':

    svd = pyreclab.SVD(dataset='dataset/u1.base',
                       dlmchar=b'\t',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)

    print('-> training model')
    start = time.clock()
    svd.train(factors=1000, maxiter=100, lr=0.01, lamb=0.1)
    end = time.clock()
    print('training time: ' + str(end - start))

    print('-> individual test')
    pred = svd.predict('457', '443')
    print('user 457, item 443, prediction ' + str(pred))

    ranking = svd.recommend('457', 5)
    print('recommendation for user 457: ' + str(ranking))

    print('-> prediction test')
    start = time.clock()
    predlist, mae, rmse = svd.test(input_file='dataset/u1.test',
                                   dlmchar=b'\t',
예제 #9
0
def svd_tuning(data_path):
    """Tune the FunkSVD hyper-parameters one at a time and report the result.

    The parameters are searched in the order ``f``, ``lamb``, ``lr``, ``mi``.
    For each one, every candidate value is scored with ``SVDJob`` while the
    other parameters keep their current value; the parameter is then fixed to
    whatever ``opt_value`` selects from the collected RMSEs. Finally a model
    with the chosen values is trained on ``eval_train_N20.data``, tested on
    ``test/test_N20.data``, and the parameters plus RMSE/MAE are written to
    ``TwitterRatings/funkSVD/opt_params_ptcdssplit.txt``.

    Args:
        data_path: Path prefix of the dataset files. It is joined by plain
            string concatenation, so it must end with a path separator.

    Returns:
        Dict with the selected values under the keys ``'f'``, ``'mi'``,
        ``'lr'`` and ``'lamb'``.
    """
    defaults = {'f': 1000, 'mi': 100, 'lr': 0.01, 'lamb': 0.1}
    results = {'f': {}, 'mi': {}, 'lr': {}, 'lamb': {}}

    # (parameter, candidate values) pairs, listed in tuning order.
    search_plan = [
        ('f', range(100, 1525, 25)),
        ('lamb', [
            0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35,
            0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0
        ]),
        ('lr', [
            0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
            0.009, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
            0.09, 0.1
        ]),
        ('mi', range(10, 520, 20)),
    ]

    for name, candidates in search_plan:
        scores = results[name]
        for candidate in candidates:
            defaults[name] = candidate
            # defaults holds exactly the four keys the template refers to.
            logging.info(
                "Entrenando con f={f}, lamb={lamb}, lr={lr}, mi={mi}".format(
                    **defaults))
            _, fold_rmse = SVDJob(data_path=data_path, params=defaults)
            scores[candidate] = fold_rmse
        # Fix this parameter to the selected value before tuning the next one.
        defaults[name] = opt_value(results=scores, metric='rmse')

    # Real testing
    reader_opts = dict(dlmchar=b',',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)
    model = pyreclab.SVD(dataset=data_path + 'eval_train_N20.data',
                         **reader_opts)
    model.train(factors=defaults['f'],
                maxiter=defaults['mi'],
                lr=defaults['lr'],
                lamb=defaults['lamb'])
    _, mae, rmse = model.test(input_file=data_path + 'test/test_N20.data',
                              **reader_opts)

    with open('TwitterRatings/funkSVD/opt_params_ptcdssplit.txt',
              'w') as report:
        for key, chosen in defaults.items():
            report.write("{param}:{value}\n".format(param=key, value=chosen))
        report.write("RMSE:{rmse}, MAE:{mae}".format(rmse=rmse, mae=mae))

    return defaults
예제 #10
0
def PRF_calculator(params, folds, topN):
    """Append precision, recall and F1 of FunkSVD rankings for several cut-offs.

    Trains ``pyreclab.SVD`` once on ``TwitterRatings/funkSVD/ratings.train``.
    For each ``n`` in ``topN`` it requests the top-``n`` recommendations for
    ``TwitterRatings/funkSVD/ratings.test`` and compares every user's list
    with the consumption loaded from the same test file with ``rel_thresh=4``.
    The mean precision, mean recall and the F1 computed from those two means
    are appended as one line per ``n`` to
    ``TwitterRatings/funkSVD/recall.txt``.

    Args:
        params: Mapping with the keys ``'f'``, ``'mi'``, ``'lr'`` and
            ``'lamb'``, forwarded to ``svd.train``.
        folds: Unused. Kept so existing callers keep working; the resampling
            it once controlled is disabled.
        topN: Iterable of cut-off values.

    Returns:
        None. The results are only written to the report file.
    """
    preferred_consumption = consumption(
        ratings_path='TwitterRatings/funkSVD/ratings.test',
        rel_thresh=4,
        with_ratings=False)

    # Data and hyper-parameters are the same for every cut-off, so the model
    # is trained once instead of once per n. This also means all cut-offs are
    # scored on the same model.
    svd = pyreclab.SVD(dataset='TwitterRatings/funkSVD/ratings.train',
                       dlmchar=b',',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)
    svd.train(factors=params['f'],
              maxiter=params['mi'],
              lr=params['lr'],
              lamb=params['lamb'])

    for n in topN:
        recommendationList = svd.testrec(
            input_file='TwitterRatings/funkSVD/ratings.test',
            dlmchar=b',',
            header=False,
            usercol=0,
            itemcol=1,
            ratingcol=2,
            topn=n,
            includeRated=False)

        users_precisions, users_recalls = [], []
        # Element 0 of the testrec result maps user ids to recommendations.
        for userId, user_recs in recommendationList[0].items():
            recs = set(user_recs)
            cons = set(preferred_consumption[userId])
            tp = len(recs & cons)
            # tp + fp == len(recs) and tp + fn == len(cons). A user with an
            # empty set on one side is left out of that metric instead of
            # raising ZeroDivisionError.
            if recs:
                users_precisions.append(float(tp) / len(recs))
            if cons:
                users_recalls.append(float(tp) / len(cons))

        p = mean(users_precisions)
        r = mean(users_recalls)
        # F1 is defined as 0 when precision and recall are both 0.
        f = 2 * p * r / (p + r) if (p + r) else 0.0

        # Appended per cut-off so finished results survive a later failure.
        with open('TwitterRatings/funkSVD/recall.txt', 'a') as report:
            report.write("N=%s, P=%s, R=%s, F=%s\n" % (n, p, r, f))
예제 #11
0
import time
import pyreclab

if __name__ == '__main__':

    model = pyreclab.SVD(factors=1000,
                         dataset='dataset/u1.base',
                         dlmchar=b'\t',
                         header=False,
                         usercol=0,
                         itemcol=1,
                         ratingcol=2)

    print('-> training model')
    start = time.clock()
    model.train(maxiter=100, lr=0.01, lamb=0.1, progress=True)
    end = time.clock()
    print('training time: ' + str(end - start))

    print('-> individual test')
    pred = model.predict('457', '443')
    print('user 457, item 443, prediction ' + str(pred))

    ranking = model.recommend('457', 5, includeRated=False)
    print('recommendation for user 457: ' + str(ranking))

    print('-> prediction test')
    start = time.clock()
    predlist, mae, rmse = model.test(input_file='dataset/u1.test',
                                     dlmchar=b'\t',
                                     header=False,