Пример #1
0
    def test(self, path):
        """Build per-movie genome-tag relevance vectors and start building
        per-user tag profiles for content-based filtering.

        NOTE(review): this block appears truncated — ``set_user_profiles``
        is cut off mid-loop — and the ``path`` parameter is never used;
        the dataset paths are hard-coded below.
        """
        ml20m = MovieLens('data/ml-20m')
        mlsmall = MovieLens('data/ml-latest-small')
        movies = mlsmall.movies  # pd.read_csv('\data\ml-100k\movies.csv', sep=',', encoding='latin-1')
        genome_scores = ml20m.tag_genome  #pd.read_csv('\data\ml-100k\genome-scores.csv', sep=',',encoding='latin-1')
        ratings = mlsmall.ratings  #pd.read_csv('\data\ml-100k\ratings.csv', sep=',', encoding='latin-1')

        # The genome table is walked in fixed strides of 1128 rows — this
        # assumes every movie has exactly 1128 tag-relevance rows, in
        # movie-major order (TODO confirm against the ML-20M genome layout).
        genome_scores_np = genome_scores.values
        gen = [[0] * 1128] * int(len(genome_scores) / 1128)
        gen_rel = [0] * 1128
        movs = [0] * int(len(genome_scores) / 1128)
        for m in range(0, int(len(genome_scores) / 1128)):
            for tag in range(0, 1128):
                gen_rel[tag] = genome_scores_np[1128 * m + tag][2]  # col 2: relevance score
            movs[m] = genome_scores_np[m * 1128][0]  # col 0: movie id
            gen[m] = gen_rel[:]  # copy, so each row of `gen` is independent

        # compute the cosine similarity for all movies against all movies
        #cos_sim = cosine_similarity(gen)

        def get_movie_recommendation(id_movie_to_match):
            # Titles of the 5 movies most similar to `id_movie_to_match`.
            # NOTE(review): `cos_sim` is only defined by the commented-out
            # line above — calling this as-is raises NameError.
            idx_matched_top5 = np.argsort(cos_sim[id_movie_to_match])[-6:-1]

            j = 0
            movie_recommended = [id_movie_to_match] * 5
            for i in idx_matched_top5:
                # Look up the title whose movieId matches the i-th genome row.
                idx = movies['title'].get(movies['movieId'] == movs[i])
                movie_recommended[j] = idx.values[0]
                #print(i, cos_sim[i][0])
                j += 1
            print(movie_recommended)

        def set_user_profiles(ratings, gen):
            # Generating a predicted rating for movies using content-based
            # filtering was not as straightforward as the other recommendation
            # methods used in this project, thus this section is incomplete,
            # but three algorithms are compared in the project.
            #user profiles
            # `rated[i]` is the number of ratings user `users[i]` has;
            # relies on `ratings` being sorted by userId (TODO confirm).
            users, rated = np.unique(
                ratings['userId'].values,
                return_counts=True)  # only wwell rated movies are selected
            movies_to_gen_pred = 50
            # NOTE(review): [[0]*1128]*n aliases one inner list n times;
            # harmless while rows are only ever replaced, not mutated.
            users_rated = [[0] * 1128] * len(users)
            counter = 0  # incremented by +rated by user i
            for i in range(0, len(users)):  # for each user
                user_id = users[i]
                rated_movies = rated[i]
                user_tags = [[0] * 1128] * rated_movies
                for j in range(
                        0, rated_movies):  # get averaged genome tags relevance
                    movie_id = ratings['movieId'][counter]
                    counter += 1
                    # get the row of that movie id, if the movie has available genome tags
                    if movie_id in movs:
                        m = movs.index(movie_id)
                        user_tags[j] = gen[m]
Пример #2
0
def do_prepare(opts):
    """Sample a test set from the MovieLens dataset named by ``opts['-d']``
    and write Popular and implicit-ALS recommendations for its users to
    ``data/<name>-test.parquet`` and ``data/<name>-recs.parquet``."""
    name = opts['-d']
    ml = MovieLens(f'data/{name}')

    # One split: up to 10000 sampled users, 5 held-out ratings each.
    train, test = next(sample_users(ml.ratings, 1, 10000, SampleN(5)))
    test.to_parquet(f'data/{name}-test.parquet', index=False)

    test_users = test['user'].unique()

    _log.info('getting popular recs')
    pop_algo = Popular()
    pop_algo.fit(train)
    pop_recs = recommend(pop_algo, test_users, 100)

    _log.info('getting ALS recs')
    als_algo = Recommender.adapt(ImplicitMF(20, iterations=10))
    # Implicit MF ignores rating values, so drop the column before fitting.
    als_algo.fit(train.drop(columns=['rating']))
    als_recs = recommend(als_algo, test_users, 100)

    _log.info('merging recs')
    merged = pd.concat({'Popular': pop_recs, 'ALS': als_recs},
                       names=['Algorithm'])
    merged.reset_index('Algorithm', inplace=True)
    merged.to_parquet(f'data/{name}-recs.parquet', index=False)
Пример #3
0
def test_alogrithms():
    """Evaluate a suite of LensKit recommenders on ML-latest-small and
    plot their mean NDCG scores."""
    data = MovieLens('ml-latest-small')
    #data = ML1M('ml-1m')
    ratings = data.ratings
    print('Initial ratings table head:')
    print(ratings.head())
    # One instance of each algorithm under comparison, keyed by label.
    algorithms = dict(
        Bias=basic.Bias(damping=5),
        Popular=basic.Popular(),
        ItemItem=item_knn.ItemItem(20),
        UserUser=user_knn.UserUser(20),
        BiasedMF=als.BiasedMF(50),
        ImplicitMF=als.ImplicitMF(50),
        FunkSVD=funksvd.FunkSVD(50),
    )
    all_recs, test_data = eval_algos(ratings, algorithms)
    ndcg_means = eval_ndcg(all_recs, test_data)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
    def test(self, path):
        """Cross-validate a 5-factor BiasedMF model on the dataset at *path*
        and print its prediction RMSE.

        Parameters
        ----------
        path : str
            Directory of a MovieLens dataset; the flavour is inferred from
            the path name ('100k', '1m', '10m', otherwise the generic
            CSV layout).

        Fixes vs. original: the inner helper no longer shadows the builtin
        ``eval``; the loader variable is no longer misleadingly named
        ``ml100k`` for every dataset flavour; the unused ``algo_pop = Bias()``
        local was removed.
        """
        algo_als5 = als.BiasedMF(5)

        def _evaluate(aname, algo, train, test, all_preds):
            # Clone so each fold trains a fresh model, predict the held-out
            # ratings, and tag the predictions with the algorithm name.
            fittable = util.clone(algo)
            fittable = Recommender.adapt(fittable)
            fittable.fit(train)
            preds = batch.predict(fittable, test)
            preds['Algorithm'] = aname
            all_preds.append(preds)

        # Pick the loader matching the path naming convention.
        if '100k' in path:
            dataset = ML100K(path)
        elif '1m' in path:
            dataset = ML1M(path)
        elif '10m' in path:
            dataset = ML10M(path)
        else:
            dataset = MovieLens(path)
        ratings = dataset.ratings
        print(ratings.head())

        # 5-fold cross-validation over users, holding out 20% of each
        # test user's ratings.
        all_preds = []
        test_data = []
        for train, test in xf.partition_users(
                ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)):
            test_data.append(test)
            _evaluate('MF', algo_als5, train, test, all_preds)
        preds = pd.concat(all_preds, ignore_index=True)
        preds_mf = preds[preds['Algorithm'].str.match('MF')]
        test_data = pd.concat(test_data, ignore_index=True)
        print('RMSE MF:', rmse(preds_mf['prediction'], preds_mf['rating']))
Пример #5
0
import os
import os.path
import logging
from contextlib import contextmanager

import numpy as np
from .. import matrix

import pytest

from lenskit.datasets import MovieLens, ML100K

# Module-level logger and shared dataset handles for the helpers below.
_log = logging.getLogger(__name__)

# Paths are relative to the working directory — TODO confirm, since a
# near-identical module elsewhere prefixes them with 'data/'.
ml_test = MovieLens('ml-latest-small')
ml100k = ML100K()


def ml_sample():
    """Return the ml_test ratings restricted to the 500 most-rated items."""
    ratings = ml_test.ratings
    per_item = ratings.groupby('item').rating.count()
    popular = per_item.nlargest(500)
    # Index by item so the popular subset can be selected with .loc.
    by_item = ratings.set_index('item')
    sample = by_item.loc[popular.index, :]
    _log.info('top 500 items yield %d of %d ratings', len(sample),
              len(ratings))
    return sample.reset_index()


def rand_csr(nrows=100, ncols=50, nnz=1000, values=True):
Пример #6
0
from lenskit.algorithms import item_knn as knn
# lenskit_tf is an optional dependency; fall back to None when it is not
# installed so tests can detect its absence and skip TensorFlow algorithms.
# Fixed: the original bare `except:` also swallowed SystemExit and
# KeyboardInterrupt — catch only ImportError.
try:
    import lenskit_tf
except ImportError:
    lenskit_tf = None
from lenskit.util import Stopwatch
from lenskit.util import test as lktu

import pytest
from pytest import approx

# Module-level logger for this test module.
_log = logging.getLogger(__name__)

# Probe for a local copy of ML-20M; when absent, _ml_20m stays None and
# tests that need it are skipped (see the ml20m fixture).
_ml_path = Path('data/ml-20m')
if _ml_path.exists():
    _ml_20m = MovieLens(_ml_path)
else:
    _ml_20m = None


@pytest.fixture
def ml20m():
    """Ratings frame for ML-20M; skips the test when the data is missing."""
    if not _ml_20m:
        pytest.skip('ML-20M not available')
    return _ml_20m.ratings


@pytest.mark.slow
@pytest.mark.realdata
@pytest.mark.parametrize('n_jobs', [1, 2])
from lenskit.datasets import MovieLens

import pandas as pd

dataset = MovieLens('../datasets/ml-20m')
tag_genome = dataset.tag_genome
# All item ids that actually appear in the ratings table.
movies = set(dataset.ratings[['item']].values.flatten())

# Write one "item,tags" row per rated movie that has genome tags, keeping
# only tags with relevance > 0.7.  Fixed: use a context manager so the file
# is closed even on an exception (the original relied on a manual f.close()).
with open('../datasets/movies-tags.csv', 'w', newline='') as f:
    f.write('item,tags\n')
    for movie in movies:
        maybe_movie_tags = tag_genome.query('item == @movie')
        if not maybe_movie_tags.empty:
            scores = maybe_movie_tags.iloc[0]
            top_tags = scores.pipe(lambda x: x[x > 0.7])

            # Space-separated tag list; spaces inside a tag become hyphens
            # so each tag is a single token.
            tags = ""
            for tag_name, _relevance in top_tags.items():
                tags += " " + tag_name.replace(" ", "-")

            if tags != "":
                f.write('%d,%s \n' % (movie, tags))
Пример #8
0
import os
import os.path
import logging
from contextlib import contextmanager

import numpy as np
from .. import matrix

import pytest

from lenskit.datasets import MovieLens, ML100K

# Module-level logger and shared dataset handles for the helpers below.
_log = logging.getLogger(__name__)

# Local dataset copies are expected under data/.
ml_test = MovieLens('data/ml-latest-small')
ml100k = ML100K('data/ml-100k')


def ml_sample():
    """Subset of the ml_test ratings covering only the 500 most-rated items."""
    ratings = ml_test.ratings
    counts = ratings.groupby('item').rating.count()
    top_items = counts.nlargest(500).index
    # Select the popular items via an item-indexed view of the ratings.
    subset = ratings.set_index('item').loc[top_items, :]
    _log.info('top 500 items yield %d of %d ratings', len(subset),
              len(ratings))
    return subset.reset_index()


def rand_csr(nrows=100, ncols=50, nnz=1000, values=True):