Example #1
def svd_protocol_evaluation(data_path, params):
    solr = "http://localhost:8983/solr/grrecsys"
    test_c = consumption(
        ratings_path=data_path + 'test/test_N20.data',
        rel_thresh=0,
        with_ratings=True
    )  # it should be test_c, but since includeRated=False it makes no difference
    train_c = consumption(ratings_path=data_path + 'eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    svd = pyreclab.SVD(dataset=data_path + 'eval_train_N20.data',
                       dlmchar=b',',
                       header=False,
                       usercol=0,
                       itemcol=1,
                       ratingcol=2)
    svd.train(factors=params['f'],
              maxiter=params['mi'],
              lr=params['lr'],
              lamb=params['lamb'])
    recommendationList, map_score, ndcg_score = svd.testrec(  # renamed to avoid shadowing built-in map
        input_file=data_path + 'test/test_N20.data',
        dlmchar=b',',
        header=False,
        usercol=0,
        itemcol=1,
        ratingcol=2,
        topn=100,
        relevance_threshold=0,
        includeRated=False)

    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])

    for userId, recList in recommendationList.items():
        book_recs = remove_consumed(
            user_consumption=train_c[userId],
            rec_list=recList
        )  # this step makes no difference; according to Gabriel, testrec does not return consumed items
        book_recs = recs_cleaner(solr=solr,
                                 consumpt=train_c[userId],
                                 recs=book_recs[:100])
        recs = user_ranked_recs(user_recs=book_recs,
                                user_consumpt=test_c[userId])

        for N in [5, 10, 15, 20]:
            mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])  # Python 3.x: list() is required
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

    for N in [5, 10, 15, 20]:
        with open('TwitterRatings/funkSVD/clean/protocol.txt', 'a') as file:
            file.write( "N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
             (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])) )
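
A minimal usage sketch for the function above; the data_path value and the hyperparameter values are hypothetical placeholders (the keys 'f', 'mi', 'lr' and 'lamb' are the ones svd.train() reads):

    # Hypothetical call; adjust data_path and the hyperparameters to your setup
    svd_protocol_evaluation(data_path='TwitterRatings/funkSVD/data/',
                            params={'f': 100, 'mi': 30, 'lr': 0.01, 'lamb': 0.05})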
Example #2
def nDCGMAP_calculator(data_path, params, topN, output_filename):

    user_consumption = consumption(ratings_path=data_path + 'ratings.total',
                                   rel_thresh=0,
                                   with_ratings=True)
    svd = pyreclab.SVD(
        dataset=data_path + 'ratings.train',  #data_path+'train/train.'+str(i),
        dlmchar=b',',
        header=False,
        usercol=0,
        itemcol=1,
        ratingcol=2)
    svd.train(factors=params['f'],
              maxiter=params['mi'],
              lr=params['lr'],
              lamb=params['lamb'])
    recommendationList = svd.testrec(
        input_file=data_path + 'test/' +
        os.listdir(data_path + 'test/')[0],  #data_path+'val/val.'+str(i),
        dlmchar=b',',
        header=False,
        usercol=0,
        itemcol=1,
        ratingcol=2,
        topn=100,
        includeRated=False)
    MRR_thresh4 = []
    MRR_thresh3 = []
    nDCGs_bin_thresh4 = dict((n, []) for n in topN)
    nDCGs_bin_thresh3 = dict((n, []) for n in topN)
    nDCGs_normal = dict((n, []) for n in topN)
    nDCGs_altform = dict((n, []) for n in topN)
    APs_thresh4 = dict((n, []) for n in topN)
    APs_thresh3 = dict((n, []) for n in topN)
    APs_thresh2 = dict((n, []) for n in topN)

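    # testrec() returns a tuple here; index 0 holds the per-user dict of
    # recommendations (Example #1 unpacks it as recommendationList, map_score, ndcg_score)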
    for userId in recommendationList[0]:
        recs = user_ranked_recs(user_recs=recommendationList[0][userId],
                                user_consumpt=user_consumption[userId])

        MRR_thresh4.append(MRR(recs=recs, rel_thresh=4))
        MRR_thresh3.append(MRR(recs=recs, rel_thresh=3))
        for n in topN:
            mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:n])  # list() required in Python 3.x
            nDCGs_bin_thresh4[n].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=4))
            nDCGs_bin_thresh3[n].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=3))
            nDCGs_normal[n].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            nDCGs_altform[n].append(
                nDCG(recs=mini_recs, alt_form=True, rel_thresh=False))
            APs_thresh4[n].append(AP_at_N(n=n, recs=recs, rel_thresh=4))
            APs_thresh3[n].append(AP_at_N(n=n, recs=recs, rel_thresh=3))
            APs_thresh2[n].append(AP_at_N(n=n, recs=recs, rel_thresh=2))

    with open('TwitterRatings/funkSVD/' + output_filename, 'a') as file:
        for n in topN:
            file.write( "N=%s, normal nDCG=%s, alternative nDCG=%s, bin nDCG(rel_thresh=4)=%s, bin nDCG(rel_thresh=3)=%s, MAP(rel_thresh=4)=%s, MAP(rel_thresh=3)=%s, MAP(rel_thresh=2)=%s, MRR(rel_thresh=4)=%s, MRR(rel_thresh=3)=%s\n" % \
             (n, mean(nDCGs_normal[n]), mean(nDCGs_altform[n]), mean(nDCGs_bin_thresh4[n]), mean(nDCGs_bin_thresh3[n]), mean(APs_thresh4[n]), mean(APs_thresh3[n]), mean(APs_thresh2[n]), mean(MRR_thresh4), mean(MRR_thresh3)) )
Example #3
def save_testing_recommendations(data_path, which_model, metric,
                                 representation):
    solr = 'http://localhost:8983/solr/grrecsys'
    test_c = consumption(ratings_path=data_path + 'test/test_N20.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    docs2vec = np.load('./w2v-tmp/' + which_model + '/docs2vec_' +
                       which_model + '.npy').item()
    users2vec = np.load('./w2v-tmp/' + which_model + '/users2vec_' +
                        representation + '_' + which_model + '.npy').item()
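    # These .npy files hold pickled dicts, hence the .item() calls; note that
    # NumPy >= 1.16.3 additionally requires np.load(..., allow_pickle=True)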
    recommendations = {}

    i = 1
    for userId in test_c:
        logging.info("MODO 2. {0} de {1}. User ID: {2}".format(
            i, len(test_c), userId))
        i += 1

        distances = dict((bookId, 0.0) for bookId in docs2vec)
        for bookId in docs2vec:
            if metric == 'angular':
                distances[bookId] = spatial.distance.cosine(
                    users2vec[userId], docs2vec[bookId])
            elif metric == 'euclidean':
                distances[bookId] = spatial.distance.euclidean(
                    users2vec[userId], docs2vec[bookId])

        sorted_sims = sorted(
            distances.items(), key=operator.itemgetter(1),
            reverse=False)  # [(<grId>, smallest dist), ..., (<grId>, largest dist)]
        book_recs = [bookId for bookId, sim in sorted_sims]
        book_recs = remove_consumed(user_consumption=train_c[userId],
                                    rec_list=book_recs)
        book_recs = recs_cleaner(solr=solr,
                                 consumpt=train_c[userId],
                                 recs=book_recs[:50])
        recommendations[userId] = book_recs

    np.save('TwitterRatings/recommended_items/w2v_op2gbangular.npy',
            recommendations)
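    # np.save wraps the dict in a 0-d object array; read it back with
    # np.load(...).item(), as done for docs2vec/users2vec above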
Example #4
File: scrapper.py  Project: fdjlss/twitrec
def statistics_protocol(data_path, N, folds):
	logging.info( "N={N}".format(N=N) )

	all_c = consumption(ratings_path= data_path+'eval_all_N'+str(N)+'.data', rel_thresh= 0, with_ratings= True)

	# #users, #items, #ratings, avg. rating
	items = set()
	users = set()
	ratings = []
	with open(data_path+'eval_all_N'+str(N)+'.data', 'r') as f:
		for line in f:
			(userId,itemId,rating) = line.split(',')
			items.add(itemId)
			users.add(userId)
			ratings.append(int(rating))
	logging.info( "#users={users}".format(users= len(users)) )
	logging.info( "#users={users}".format(users= len(items)) )
	logging.info( "#ratings={ratings}".format(ratings= len(ratings)) )
	logging.info( "avg. rating={mean}±{stdev}".format(mean= mean(ratings), stdev= stdev(ratings)) )

	# Ratings per item
	item_ratings = dict((itemId, []) for itemId in items)
	for user in all_c:
		for item in all_c[user]:
			item_ratings[item].append( all_c[user][item] )
	ratings_per_item = []
	for item in item_ratings:
		ratings_per_item.append( len(item_ratings[item]) )
	logging.info( "promedio de ratings por item: {mean}±{stdev}".format(mean= mean(ratings_per_item), stdev= stdev(ratings_per_item)) )

	# Ratings per user
	ratings_per_user = []
	for user in all_c:
		ratings_per_user.append( len(all_c[user]) )
	logging.info( "promedio de ratings por usuario: {mean}±{stdev}".format(mean= mean(ratings_per_user), stdev= stdev(ratings_per_user)) )

	# Density (1 - sparsity)
	count = 0
	for freq in ratings_per_item:
		count += freq / float( len(users) )
	count = count / float( len(items) )
	count = count*100
	logging.info( "density: {}".format(count) )
Example #5
def option1_protocol_evaluation(data_path, which_model, metric):
    solr = 'http://localhost:8983/solr/grrecsys'
    # userId='113447232' 285597345
    test_c = consumption(ratings_path=data_path + 'test/test_N20.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])

    docs2vec = np.load('./w2v-tmp/' + which_model + '/docs2vec_' +
                       which_model + '.npy').item()
    if which_model == 'twit':
        vector_size = 200
    else:
        vector_size = 300
    t = AnnoyIndex(vector_size, metric=metric)
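    # 'angular' and 'euclidean' are both metrics supported by AnnoyIndex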
    t.load('./w2v-tmp/' + which_model + '/doc_vecs_t100_' + metric + '_' +
           which_model + '.tree')
    num_to_grId = np.load('./w2v-tmp/' + which_model + '/num_to_grId_' +
                          metric + '_' + which_model + '.npy').item()
    grId_to_num = np.load('./w2v-tmp/' + which_model + '/grId_to_num_' +
                          metric + '_' + which_model + '.npy').item()

    i = 1
    for userId in test_c:
        logging.info("MODO 1. {0} de {1}. User ID: {2}".format(
            i, len(test_c), userId))
        i += 1

        book_recs = []
        for bookId in train_c[userId]:

            try:
                docs = t.get_nns_by_item(grId_to_num[bookId], 500)
                book_recs.append(
                    [str(num_to_grId[doc_num]) for doc_num in docs])
            except KeyError as e:
                logging.info(
                    "{} ES UNO DE LOS LIBROS CUYO HTML NO PUDO SER DESCARGADO. PROSIGUIENDO CON EL SIGUIENTE LIBRO.."
                    .format(bookId))
                continue

        book_recs = flatten_list(list_of_lists=book_recs,
                                 rows=len(
                                     book_recs[0]))  #rows=len(sorted_sims))
        book_recs = remove_consumed(user_consumption=train_c[userId],
                                    rec_list=book_recs)
        book_recs = recs_cleaner(solr=solr,
                                 consumpt=train_c[userId],
                                 recs=book_recs[:50])
        try:
            recs = user_ranked_recs(user_recs=book_recs,
                                    user_consumpt=test_c[userId])
        except KeyError as e:
            logging.info(
                "Usuario {0} del fold de train (total) no encontrado en fold de 'test'"
                .format(userId))
            continue

        for N in [5, 10, 15, 20]:
            mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

    with open(
            'TwitterRatings/word2vec/clean/option1_protocol_' + which_model +
            '.txt', 'a') as file:
        file.write("METRIC: %s\n" % (metric))

    for N in [5, 10, 15, 20]:
        with open(
                'TwitterRatings/word2vec/clean/option1_protocol_' +
                which_model + '.txt', 'a') as file:
            file.write( "N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
             (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])) )
Example #6
def option2_protocol_evaluation(data_path, which_model, metric,
                                representation):
    solr = 'http://localhost:8983/solr/grrecsys'
    test_c = consumption(ratings_path=data_path + 'test/test_N20.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])
    docs2vec = np.load('./w2v-tmp/' + which_model + '/docs2vec_' +
                       which_model + '.npy').item()
    users2vec = np.load('./w2v-tmp/' + which_model + '/users2vec_' +
                        representation + '_' + which_model + '.npy').item()

    i = 1
    for userId in test_c:
        logging.info("MODO 2. {0} de {1}. User ID: {2}".format(
            i, len(test_c), userId))
        i += 1

        distances = dict((bookId, 0.0) for bookId in docs2vec)
        for bookId in docs2vec:
            if metric == 'angular':
                distances[bookId] = spatial.distance.cosine(
                    users2vec[userId], docs2vec[bookId])
            elif metric == 'euclidean':
                distances[bookId] = spatial.distance.euclidean(
                    users2vec[userId], docs2vec[bookId])

        sorted_sims = sorted(
            distances.items(), key=operator.itemgetter(1),
            reverse=False)  # [(<grId>, smallest dist), ..., (<grId>, largest dist)]
        book_recs = [bookId for bookId, sim in sorted_sims]
        book_recs = remove_consumed(user_consumption=train_c[userId],
                                    rec_list=book_recs)
        book_recs = recs_cleaner(solr=solr,
                                 consumpt=train_c[userId],
                                 recs=book_recs[:50])
        try:
            recs = user_ranked_recs(user_recs=book_recs,
                                    user_consumpt=test_c[userId])
        except KeyError as e:
            logging.info(
                "Usuario {0} del fold de train (total) no encontrado en fold de 'test'"
                .format(userId))
            continue

        for N in [5, 10, 15, 20]:
            mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

    with open(
            'TwitterRatings/word2vec/clean/option2_protocol_' + which_model +
            '.txt', 'a') as file:
        file.write("METRIC: %s \t REPR: %s\n" % (metric, representation))

    for N in [5, 10, 15, 20]:
        with open(
                'TwitterRatings/word2vec/clean/option2_protocol_' +
                which_model + '.txt', 'a') as file:
            file.write( "N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
             (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])) )
Example #7
File: scrapper.py  Project: fdjlss/twitrec
def evaluation_set_with_authors(db_conn, N, folds, out_path):
	"""
	ONLY IF evaluation_set() HAS ALREADY BEEN RUN.
	Saves the train and test sets with the authors of the books
	consumed by the users.
	"""
	data_path = 'TwitterRatings/funkSVD/data/'
	c = db_conn.cursor()
	c.execute("SELECT DISTINCT user_reviews.bookId, authors.id, authors.name\
						 FROM user_reviews\
						 INNER JOIN authors\
						 ON user_reviews.bookId=authors.bookId;")
	all_rows = c.fetchall()
	books = {}
	for tupl in all_rows:
		bookId, authorId, author_name = tupl
		if str(bookId) not in books:
			books[str(bookId)] = []
		if str(authorId) not in books[str(bookId)]:
			books[str(bookId)].append( str(authorId) )

	logging.info("Guardando test..")
	test = consumption(ratings_path=data_path+'test/test_N'+str(N)+'.data', rel_thresh=0, with_ratings=True, with_timestamps=True)
	with open(out_path+'test/test_N'+str(N)+'.data', 'w') as f:
		for user, d in test.items():
			for item, tupl in d.items():
				s = '{user},{item},{rating},{timestamp}'.format(user=user, item=item, rating=tupl[0], timestamp=tupl[1])
				for i in range(3):  # for author in books[item]: keep 3 authors
					try:
						author = books[item][i] 
					except IndexError as e:
						author = 0
					if i!=2:
						s+=',{author}'.format(author=author)
					else:
						s+=',{author}\n'.format(author=author)
				f.write( s )


	logging.info("Guardando train..")
	train = consumption(ratings_path=data_path+'eval_train_N'+str(N)+'.data', rel_thresh=0, with_ratings=True, with_timestamps=True)
	with open(out_path+'eval_train_N'+str(N)+'.data', 'w') as f:
		for user, d in train.items():
			for item, tupl in d.items():
				s = '{user},{item},{rating},{timestamp}'.format(user=user, item=item, rating=tupl[0], timestamp=tupl[1])
				for i in range(3):  # for author in books[item]: keep 3 authors
					try:
						author = books[item][i] 
					except IndexError as e:
						author = 0
					if i!=2:
						s+=',{author}'.format(author=author)
					else:
						s+=',{author}\n'.format(author=author)
				f.write( s )

	for j in range(1, folds):
		logging.info("Guardando validation folds y training aggregated folds. Fold #i={}".format(j))
		
		val_f  = consumption(ratings_path=data_path+'val/val_N'+str(N)+'.'+str(j), rel_thresh=0, with_ratings=True, with_timestamps=True)
		with open(out_path+'val/val_N'+str(N)+'.'+str(j), 'w') as f:
			for user, d in val_f.items():
				for item, tupl in d.items():
					s = '{user},{item},{rating},{timestamp}'.format(user=user, item=item, rating=tupl[0], timestamp=tupl[1])
					for i in range(3):  # for author in books[item]: keep 3 authors
						try:
							author = books[item][i] 
						except IndexError as e:
							author = 0
						if i!=2:
							s+=',{author}'.format(author=author)
						else:
							s+=',{author}\n'.format(author=author)
					f.write( s )

		train_f = consumption(ratings_path=data_path+'train/train_N'+str(N)+'.'+str(j), rel_thresh=0, with_ratings=True, with_timestamps=True)
		with open(out_path+'train/train_N'+str(N)+'.'+str(j), 'w') as f:
			for user, d in train_f.items():
				for item, tupl in d.items():
					s = '{user},{item},{rating},{timestamp}'.format(user=user, item=item, rating=tupl[0], timestamp=tupl[1])
					for i in range(3):  # for author in books[item]: keep 3 authors
						try:
							author = books[item][i] 
						except IndexError as e:
							author = 0
						if i!=2:
							s+=',{author}'.format(author=author)
						else:
							s+=',{author}\n'.format(author=author)
					f.write( s )

	logging.info("Guardando total..")
	everything = consumption(ratings_path=data_path+'eval_all_N'+str(N)+'.data', rel_thresh=0, with_ratings=True, with_timestamps=True)
	with open(out_path+'eval_all_N'+str(N)+'.data', 'w') as f:
		for user, d in everything.items():
			for item, tupl in d.items():
				s = '{user},{item},{rating},{timestamp}'.format(user=user, item=item, rating=tupl[0], timestamp=tupl[1])
				for i in range(3):  # for author in books[item]: keep 3 authors
					try:
						author = books[item][i] 
					except IndexError as e:
						author = 0
					if i!=2:
						s+=',{author}'.format(author=author)
					else:
						s+=',{author}\n'.format(author=author)
				f.write( s )
Example #8
def option1_protocol_evaluation(data_path, N, model):
    # userId='113447232' user_bookId='17310690'
    test_c = consumption(ratings_path=data_path + 'test/test_N' + str(N) +
                         '.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N' + str(N) +
                          '.data',
                          rel_thresh=0,
                          with_ratings=False)
    MRRs = []
    nDCGs = []
    APs = []
    Rprecs = []
    flat_docs = np.load('./w2v-tmp/flattened_docs_fea05b2.npy').item()
    num_to_grId = np.load('./w2v-tmp/num_to_grId.npy').item()
    grId_to_num = np.load('./w2v-tmp/grId_to_num.npy').item()
    t = AnnoyIndex(300)
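    # Older Annoy releases default to the 'angular' metric when none is given;
    # newer releases expect it to be passed explicitly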
    t.load('./w2v-tmp/doc_vecs_t100.tree')
    num_best = 20
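    # num_best: how many top matches the WMD index returns per query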

    i = 1
    for userId in test_c:
        logging.info("MODO 1. {0} de {1}. User ID: {2}".format(
            i, len(test_c), userId))
        i += 1
        # stream_url = solr + '/query?rows=1000&q=goodreadsId:{ids}'
        # ids_string = encoded_itemIds(item_list=train_c[userId])
        # url        = stream_url.format(ids=ids_string)
        # response   = json.loads( urlopen(url).read().decode('utf8') )
        # try:
        # 	docs     = response['response']['docs']
        # except TypeError as e:
        # 	continue

        book_recs = []
        book_recs_cos = []
        for user_bookId in train_c[userId]:  #for user_doc in docs:
            try:
                docs = t.get_nns_by_item(grId_to_num[user_bookId], 4)
                book_recs_cos += [
                    str(num_to_grId[doc_num]) for doc_num in docs
                ]
            except KeyError as e:
                logging.info(
                    "{} IS ONE OF THE BOOKS WHOSE HTML COULD NOT BE DOWNLOADED. CONTINUING WITH THE NEXT BOOK.."
                    .format(user_bookId))  # was .format(bookId): bookId is undefined in this scope
                continue

        # Remove the consumed items from the first list, since get_nns_by_item() includes them
        book_recs_cos = [
            bookId for bookId in book_recs_cos if bookId not in train_c[userId]
        ]

        wmd_corpus = []
        num_to_grId_wmd = {}
        j = 0
        for grId in book_recs_cos:
            wmd_corpus.append(flat_docs[grId])
            num_to_grId_wmd[j] = grId
            j += 1
        grId_to_num_wmd = {v: k for k, v in num_to_grId_wmd.items()}

        index = WmdSimilarity(wmd_corpus,
                              model,
                              num_best=num_best,
                              normalize_w2v_and_replace=False)

        for user_bookId in train_c[userId]:
            r = index[flat_docs[user_bookId]]
            book_recs.append([num_to_grId_wmd[idx] for idx, score in r])  # idx avoids shadowing built-in id

            # wmds = dict((bookId, 0.0) for bookId in flat_docs)
            # user_bookId = str(user_doc['goodreadsId'][0])  # id of a book consumed by the user

            # for bookId in flat_docs:  # ids of the books in the DB
            # if bookId == user_bookId: continue
            # wmds[bookId] = model.wmdistance(flat_docs[bookId], flat_docs[user_bookId])  # 1 - dist = similarity

            # sorted_sims = sorted(wmds.items(), key=operator.itemgetter(1), reverse=False)  # [(<grId>, HIGHEST sim), ..., (<grId>, lowest sim)]
            # book_recs.append( [ bookId for bookId, sim in sorted_sims ] )

        book_recs = flatten_list(list_of_lists=book_recs,
                                 rows=len(book_recs[0]))
        book_recs = remove_consumed(user_consumption=train_c[userId],
                                    rec_list=book_recs)
        try:
            recs = user_ranked_recs(user_recs=book_recs,
                                    user_consumpt=test_c[userId])
        except KeyError as e:
            logging.info(
                "Usuario {0} del fold de train (total) no encontrado en fold de 'test'"
                .format(userId))
            continue

        ####################################
        mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])  # Python 3.x: .keys() returns a view, not a list
        nDCGs.append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
        APs.append(AP_at_N(n=N, recs=mini_recs, rel_thresh=1))
        MRRs.append(MRR(recs=mini_recs, rel_thresh=1))
        Rprecs.append(R_precision(n_relevants=N, recs=mini_recs))
        ####################################

    with open('TwitterRatings/word2vec/option1_protocol_wmd.txt', 'a') as file:
        file.write( "N=%s, normal nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
          (N, mean(nDCGs), mean(APs), mean(MRRs), mean(Rprecs)) )
Example #9
def option2_protocol_evaluation(data_path, N, model):
    test_c = consumption(ratings_path=data_path + 'test/test_N' + str(N) +
                         '.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N' + str(N) +
                          '.data',
                          rel_thresh=0,
                          with_ratings=False)
    MRRs = []
    nDCGs = []
    APs = []
    Rprecs = []
    flat_docs = np.load('./w2v-tmp/flattened_docs_fea05b2.npy').item()
    flat_users = np.load('./w2v-tmp/flattened_users_fea05b2.npy').item()
    docs2vec = np.load('./w2v-tmp/docs2vec.npy').item()
    users2vec = np.load('./w2v-tmp/users2vec.npy').item()
    num_best = 20

    i = 1
    for userId in test_c:
        logging.info("MODO 2. {0} de {1}. User ID: {2}".format(
            i, len(test_c), userId))
        i += 1

        # wmds = dict((bookId, 0.0) for bookId in flat_docs)
        # for bookId in flat_docs:
        # wmds[bookId] = model.wmdistance(flat_users[userId], flat_docs[bookId])

        cosines = dict((bookId, 0.0) for bookId in docs2vec)
        for bookId in docs2vec:
            cosines[bookId] = 1 - spatial.distance.cosine(
                users2vec[userId], docs2vec[bookId])  #1 - dist = similarity

        sorted_sims = sorted(
            cosines.items(), key=operator.itemgetter(1),
            reverse=True)  # [(<grId>, HIGHEST sim), ..., (<grId>, lowest sim)]
        book_recs_cos = [bookId for bookId, sim in sorted_sims]
        book_recs_cos = remove_consumed(user_consumption=train_c[userId],
                                        rec_list=book_recs_cos)  # was book_recs, which is undefined at this point

        wmd_corpus = []
        num_to_grId_wmd = {}
        j = 0
        for grId in book_recs_cos[:50]:
            wmd_corpus.append(flat_docs[grId])
            num_to_grId_wmd[j] = grId
            j += 1
        grId_to_num_wmd = {v: k for k, v in num_to_grId_wmd.items()}
        # Build a WMD index over a subset of (50) items recommended to the user by cosine similarity
        index = WmdSimilarity(wmd_corpus,
                              model,
                              num_best=num_best,
                              normalize_w2v_and_replace=False)
        r = index[flat_users[userId]]

        book_recs = [num_to_grId_wmd[idx] for idx, score in r]  # idx avoids shadowing built-in id
        # sorted_sims = sorted(wmds.items(), key=operator.itemgetter(1), reverse=False)  # [(<grId>, HIGHEST sim), ..., (<grId>, lowest sim)]
        # book_recs   = [ bookId for bookId, sim in sorted_sims ]
        book_recs = remove_consumed(user_consumption=train_c[userId],
                                    rec_list=book_recs)
        try:
            recs = user_ranked_recs(user_recs=book_recs,
                                    user_consumpt=test_c[userId])
        except KeyError as e:
            logging.info(
                "Usuario {0} del fold de train (total) no encontrado en fold de 'test'"
                .format(userId))
            continue

        ####################################
        mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])  # Python 3.x: .keys() returns a view, not a list
        nDCGs.append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
        APs.append(AP_at_N(n=N, recs=mini_recs, rel_thresh=1))
        MRRs.append(MRR(recs=mini_recs, rel_thresh=1))
        Rprecs.append(R_precision(n_relevants=N, recs=mini_recs))
        ####################################

    with open('TwitterRatings/word2vec/option2_protocol_wmd.txt', 'a') as file:
        file.write( "N=%s, normal nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
          (N, mean(nDCGs), mean(APs), mean(MRRs), mean(Rprecs)) )
Example #10
def PRF_calculator(params, folds, topN):

    ratings_train, ratings_test = [], []
    with open('TwitterRatings/funkSVD/ratings.train', 'r') as f:
        for line in f:
            ratings_train.append(line.strip())

    with open('TwitterRatings/funkSVD/ratings.test', 'r') as f:
        for line in f:
            ratings_test.append(line.strip())

    preferred_consumption = consumption(
        ratings_path='TwitterRatings/funkSVD/ratings.test',
        rel_thresh=4,
        with_ratings=False)

    for n in topN:
        precision_folds, recall_folds = [], []
        # for _ in range(0, folds):
        # ratingsSampler(ratings_train, 'TwitterRatings/funkSVD/ratings_temp.train', 0.8)
        # ratingsSampler(ratings_test, 'TwitterRatings/funkSVD/ratings_temp.test', 0.8)

        svd = pyreclab.SVD(
            dataset='TwitterRatings/funkSVD/ratings.train',  # or ratings_temp.train
            dlmchar=b',',
            header=False,
            usercol=0,
            itemcol=1,
            ratingcol=2)

        svd.train(factors=params['f'],
                  maxiter=params['mi'],
                  lr=params['lr'],
                  lamb=params['lamb'])

        recommendationList = svd.testrec(
            input_file='TwitterRatings/funkSVD/ratings.test',  # or ratings_temp.test
            dlmchar=b',',
            header=False,
            usercol=0,
            itemcol=1,
            ratingcol=2,
            topn=n,
            includeRated=False)

        users_precisions, users_recalls = [], []
        for userId in recommendationList[0]:
            recs = set(recommendationList[0][userId])
            cons = set(preferred_consumption[userId])
            tp = len(recs & cons)
            fp = len(recs - cons)
            fn = len(cons - recs)
            users_precisions.append(float(tp) / (tp + fp))
            try:
                users_recalls.append(float(tp) / (tp + fn))
            except ZeroDivisionError as e:  # tp + fn == 0: user has no preferred items
                continue

        precision_folds.append(mean(users_precisions))
        recall_folds.append(mean(users_recalls))

        p = mean(precision_folds)
        r = mean(recall_folds)
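        # F1: harmonic mean of precision and recall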
        f = 2 * p * r / (p + r)

        with open('TwitterRatings/funkSVD/recall.txt', 'a') as file:
            file.write("N=%s, P=%s, R=%s, F=%s\n" % (n, p, r, f))