def test(self, path):
    """Build content-based movie profiles from the ML-20M tag genome.

    Loads the tag-genome relevance table (1128 tags per movie), reshapes it
    into one relevance vector per movie, and defines two local helpers for
    similarity-based recommendation and user-profile construction.

    NOTE(review): `path` is unused — data locations are hard-coded below.
    NOTE(review): the author's own comment says the user-profile part is
    incomplete; this block is documented, not restructured.
    """
    ml20m = MovieLens('data/ml-20m')
    mlsmall = MovieLens('data/ml-latest-small')
    movies = mlsmall.movies  # pd.read_csv('\data\ml-100k\movies.csv', sep=',', encoding='latin-1')
    genome_scores = ml20m.tag_genome  # pd.read_csv('\data\ml-100k\genome-scores.csv', sep=',', encoding='latin-1')
    ratings = mlsmall.ratings  # pd.read_csv('\data\ml-100k\ratings.csv', sep=',', encoding='latin-1')
    genome_scores_np = genome_scores.values
    # The genome table carries exactly 1128 tag rows per movie, so
    # len(genome_scores) / 1128 is the number of movies covered.
    gen = [[0] * 1128] * int(len(genome_scores) / 1128)
    gen_rel = [0] * 1128
    movs = [0] * int(len(genome_scores) / 1128)
    for m in range(0, int(len(genome_scores) / 1128)):
        for tag in range(0, 1128):
            # column 2 of the raw genome array is the tag relevance score
            gen_rel[tag] = genome_scores_np[1128 * m + tag][2]
        # column 0 of the movie's first genome row is its movie id
        movs[m] = genome_scores_np[m * 1128][0]
        gen[m] = gen_rel[:]  # copy: gen_rel is reused on the next iteration
    # compute the cosine similarity for all movies against all movies
    #cos_sim = cosine_similarity(gen)

    def get_movie_recommendation(id_movie_to_match):
        # Print the titles of the five most similar movies (excluding the
        # movie itself, hence the [-6:-1] slice of the argsort).
        # NOTE(review): relies on `cos_sim`, which is commented out above —
        # calling this as-is raises NameError; confirm intent.
        idx_matched_top5 = np.argsort(cos_sim[id_movie_to_match])[-6:-1]
        j = 0
        movie_recommended = [id_movie_to_match] * 5
        for i in idx_matched_top5:
            # look up the title whose movieId matches the i-th genome movie
            idx = movies['title'].get(movies['movieId'] == movs[i])
            movie_recommended[j] = idx.values[0]
            #print(i, cos_sim[i][0])
            j += 1
        print(movie_recommended)

    def set_user_profiles(ratings, gen):
        # Generating a predicted rating for movies using content-based
        # filtering was not as straightforward as the other recommendation
        # methods used in this project, thus this section is incomplete,
        # but three algorithms are compared in the project.
        # user profiles
        users, rated = np.unique(
            ratings['userId'].values, return_counts=True)
        # only well-rated movies are selected
        movies_to_gen_pred = 50
        users_rated = [[0] * 1128] * len(users)
        counter = 0  # incremented by +rated by user i
        for i in range(0, len(users)):  # for each user
            user_id = users[i]
            rated_movies = rated[i]
            user_tags = [[0] * 1128] * rated_movies
            for j in range(
                    0, rated_movies):  # get averaged genome tags relevance
                movie_id = ratings['movieId'][counter]
                counter += 1
                # get the row of that movie id, if the movie has available genome tags
                if movie_id in movs:
                    m = movs.index(movie_id)
                    user_tags[j] = gen[m]
def do_prepare(opts):
    """Sample test users and precompute baseline recommendations.

    Reads the MovieLens dataset named by ``opts['-d']`` from ``data/``,
    holds out 5 ratings for each of 10000 sampled users, and writes both
    the held-out ratings and 100-item recommendation lists (Popular and
    implicit-feedback ALS) to parquet files under ``data/``.
    """
    name = opts['-d']
    dataset = MovieLens(f'data/{name}')

    # one split: 10000 users, 5 held-out ratings apiece
    train, test = next(sample_users(dataset.ratings, 1, 10000, SampleN(5)))
    test.to_parquet(f'data/{name}-test.parquet', index=False)

    test_users = test['user'].unique()

    _log.info('getting popular recs')
    pop_algo = Popular()
    pop_algo.fit(train)
    pop_recs = recommend(pop_algo, test_users, 100)

    _log.info('getting ALS recs')
    # implicit-feedback model, so the rating column is dropped before fitting
    mf_algo = Recommender.adapt(ImplicitMF(20, iterations=10))
    mf_algo.fit(train.drop(columns=['rating']))
    als_recs = recommend(mf_algo, test_users, 100)

    _log.info('merging recs')
    merged = pd.concat({'Popular': pop_recs, 'ALS': als_recs}, names=['Algorithm'])
    merged.reset_index('Algorithm', inplace=True)
    merged.to_parquet(f'data/{name}-recs.parquet', index=False)
def test_alogrithms():
    """Compare several LensKit algorithms by mean nDCG on ml-latest-small."""
    dataset = MovieLens('ml-latest-small')
    #data = ML1M('ml-1m')
    ratings = dataset.ratings
    print('Initial ratings table head:')
    print(ratings.head())

    # candidate algorithms, keyed by the display name used in the results
    algorithms = {
        'Bias': basic.Bias(damping=5),
        'Popular': basic.Popular(),
        'ItemItem': item_knn.ItemItem(20),
        'UserUser': user_knn.UserUser(20),
        'BiasedMF': als.BiasedMF(50),
        'ImplicitMF': als.ImplicitMF(50),
        'FunkSVD': funksvd.FunkSVD(50),
    }

    all_recs, test_data = eval_algos(ratings, algorithms)
    ndcg_means = eval_ndcg(all_recs, test_data)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
def test(self, path):
    """Cross-validate a 5-feature BiasedMF model on a MovieLens dataset.

    Chooses the dataset loader from the path name ('100k', '1m', '10m', or
    the generic CSV layout), runs 5-fold user-partitioned cross-validation
    (20% of each user's ratings held out), and prints the overall RMSE of
    the matrix-factorization predictions.

    Fixes vs. original: the nested helper no longer shadows the builtin
    ``eval``; the unused ``algo_pop = Bias()`` local was removed; the
    dataset variable is no longer misleadingly called ``ml100k`` for every
    loader.
    """
    algo_als5 = als.BiasedMF(5)

    def _evaluate(aname, algo, train, test, all_preds):
        # clone so each fold fits a fresh, unfitted copy of the algorithm
        fittable = util.clone(algo)
        fittable = Recommender.adapt(fittable)
        fittable.fit(train)
        # predict ratings for the held-out pairs and tag them by algorithm
        preds = batch.predict(fittable, test)
        preds['Algorithm'] = aname
        all_preds.append(preds)

    # pick the loader matching the dataset layout; '100k' is checked first
    # so it is not shadowed by the substring checks below
    if '100k' in path:
        dataset = ML100K(path)
    elif '1m' in path:
        dataset = ML1M(path)
    elif '10m' in path:
        dataset = ML10M(path)
    else:
        dataset = MovieLens(path)
    ratings = dataset.ratings
    print(ratings.head())

    all_preds = []
    test_data = []
    for train, test in xf.partition_users(
            ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)):
        test_data.append(test)
        _evaluate('MF', algo_als5, train, test, all_preds)

    preds = pd.concat(all_preds, ignore_index=True)
    preds_mf = preds[preds['Algorithm'].str.match('MF')]
    test_data = pd.concat(test_data, ignore_index=True)
    print('RMSE MF:', rmse(preds_mf['prediction'], preds_mf['rating']))
import os
import os.path
import logging
from contextlib import contextmanager

import numpy as np

from .. import matrix

import pytest

from lenskit.datasets import MovieLens, ML100K

_log = logging.getLogger(__name__)

# shared dataset handles used by the tests in this module;
# ML100K() with no argument uses its default data location
ml_test = MovieLens('ml-latest-small')
ml100k = ML100K()


def ml_sample():
    """Return the ratings for the 500 most-rated items of ml-latest-small."""
    ratings = ml_test.ratings
    # count ratings per item and keep the 500 most-rated
    icounts = ratings.groupby('item').rating.count()
    top = icounts.nlargest(500)
    ratings = ratings.set_index('item')
    top_rates = ratings.loc[top.index, :]
    _log.info('top 500 items yield %d of %d ratings',
              len(top_rates), len(ratings))
    return top_rates.reset_index()


# NOTE(review): the body of rand_csr lies outside this chunk — header only.
def rand_csr(nrows=100, ncols=50, nnz=1000, values=True):
from lenskit.algorithms import item_knn as knn try: import lenskit_tf except: lenskit_tf = None from lenskit.util import Stopwatch from lenskit.util import test as lktu import pytest from pytest import approx _log = logging.getLogger(__name__) _ml_path = Path('data/ml-20m') if _ml_path.exists(): _ml_20m = MovieLens(_ml_path) else: _ml_20m = None @pytest.fixture def ml20m(): if _ml_20m: return _ml_20m.ratings else: pytest.skip('ML-20M not available') @pytest.mark.slow @pytest.mark.realdata @pytest.mark.parametrize('n_jobs', [1, 2])
from lenskit.datasets import MovieLens
import pandas as pd

dataset = MovieLens('../datasets/ml-20m')
tag_genome = dataset.tag_genome
# every item id that appears in the ratings table
movies = set(dataset.ratings[['item']].values.flatten())

# For each rated movie that has genome data, write its strongly relevant
# (score > 0.7) tags as a space-separated list of hyphenated tokens.
# `with` fixes the original's unclosed file handle on error paths.
with open('../datasets/movies-tags.csv', 'w', newline='') as f:
    f.write('item,tags\n')
    for movie in movies:
        maybe_movie_tags = tag_genome.query('item == @movie')
        if maybe_movie_tags.empty:
            continue
        scores = maybe_movie_tags.iloc[0]
        top_tags = scores[scores > 0.7]
        # multi-word tag names become single hyphenated tokens
        names = [name.replace(" ", "-") for name, _score in top_tags.items()]
        if names:
            # leading space before the first tag and the trailing " \n"
            # are preserved byte-for-byte from the original output format
            f.write('%d,%s \n' % (movie, " " + " ".join(names)))
import os
import os.path
import logging
from contextlib import contextmanager

import numpy as np

from .. import matrix

import pytest

from lenskit.datasets import MovieLens, ML100K

_log = logging.getLogger(__name__)

# shared dataset handles used by the tests in this module,
# loaded from the local data/ directory
ml_test = MovieLens('data/ml-latest-small')
ml100k = ML100K('data/ml-100k')


def ml_sample():
    """Return the ratings for the 500 most-rated items of ml-latest-small."""
    ratings = ml_test.ratings
    # count ratings per item and keep the 500 most-rated
    icounts = ratings.groupby('item').rating.count()
    top = icounts.nlargest(500)
    ratings = ratings.set_index('item')
    top_rates = ratings.loc[top.index, :]
    _log.info('top 500 items yield %d of %d ratings',
              len(top_rates), len(ratings))
    return top_rates.reset_index()


# NOTE(review): the body of rand_csr lies outside this chunk — header only.
def rand_csr(nrows=100, ncols=50, nnz=1000, values=True):