Exemplo n.º 1
0
class SimpleCF:
    """User- or item-based collaborative filtering on top of Surprise's KNNBasic.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with columns ordered user / item / rating (see the
        ``Reader`` line_format below).
    user_based : bool
        If True the cosine similarity matrix is computed between users,
        otherwise between items.
    """

    # Number of recommendations kept per user.
    TOP_N = 10

    def __init__(self, df, user_based=False):
        self.df = df
        self.user_based = user_based

        reader = Reader(line_format='user item rating')
        data = Dataset.load_from_df(df=self.df, reader=reader)
        self.eval_data = EvaluationData(data)

        sim_options = {'name': 'cosine', 'user_based': self.user_based}
        self.model = KNNBasic(sim_options=sim_options)

    def _fit(self, eval):
        """Fit on the LOOCV train set (eval=True) or the full train set
        (eval=False) and return ``(trainSet, simsMatrix)``."""
        if eval:
            trainSet = self.eval_data.GetLOOCVTrainSet()
        else:
            trainSet = self.eval_data.GetFullTrainSet()
        self.model.fit(trainSet)
        return trainSet, self.model.compute_similarities()

    def _collect_top_n(self, trainSet, uiid, candidates, topN):
        """Append the TOP_N best-scoring items inner user ``uiid`` has not
        rated to ``topN``, keyed by the raw user id."""
        watched = {itemID for itemID, _ in trainSet.ur[uiid]}
        pos = 0
        for itemID, ratingSum in sorted(candidates.items(),
                                        key=itemgetter(1),
                                        reverse=True):
            if itemID not in watched:
                topN[trainSet.to_raw_uid(uiid)].append(
                    (trainSet.to_raw_iid(itemID), ratingSum))
                pos += 1
                # Original used ``pos > 10`` which let an 11th item through;
                # ``>=`` keeps exactly TOP_N recommendations.
                if pos >= self.TOP_N:
                    break

    def item_based_cf(self, k=10, eval=False):
        """Item-based CF: for each user, aggregate similarities of their
        top-k rated items.  Returns a {raw_uid: [(raw_iid, score), ...]}
        dict, or the LOOCV hit rate when ``eval`` is True.

        ``eval`` shadows the builtin; name kept for caller compatibility.
        """
        topN = defaultdict(list)
        testSet = self.eval_data.GetLOOCVTestSet()
        trainSet, simsMatrix = self._fit(eval)

        # BUGFIX: ``n_users`` is an attribute, not a callable — the original
        # ``trainSet.n_users()`` raised TypeError (user_based_cf already
        # used it without the call).
        for uiid in range(trainSet.n_users):
            # The user's k highest-rated items seed the candidate scores.
            KNeighbors = heapq.nlargest(k, trainSet.ur[uiid],
                                        key=lambda t: t[1])

            candidates = defaultdict(float)
            for itemID, rating in KNeighbors:
                # Weight each similar item by the (normalised) seed rating.
                for item_innerID, item_score in enumerate(simsMatrix[itemID]):
                    candidates[item_innerID] += item_score * (rating / 5.0)

            self._collect_top_n(trainSet, uiid, candidates, topN)

        if eval:
            return RecommenderMetrics.HitRate(topN, testSet)
        return topN

    def user_based_cf(self, k=10, eval=True):
        """User-based CF: for each user, aggregate ratings of their k most
        similar users.  Returns a {raw_uid: [(raw_iid, score), ...]} dict,
        or the LOOCV hit rate when ``eval`` is True.
        """
        topN = defaultdict(list)
        testSet = self.eval_data.GetLOOCVTestSet()
        trainSet, simsMatrix = self._fit(eval)

        for uiid in range(trainSet.n_users):
            # Every other user paired with their similarity score.
            similarUsers = [(innerID, score)
                            for innerID, score in enumerate(simsMatrix[uiid])
                            if innerID != uiid]
            KNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

            # Sum neighbours' ratings per item, weighted by similarity.
            candidates = defaultdict(float)
            for innerID, userSimilarityScore in KNeighbors:
                for itemID, rating in trainSet.ur[innerID]:
                    candidates[itemID] += userSimilarityScore * (rating / 5.0)

            self._collect_top_n(trainSet, uiid, candidates, topN)

        if eval:
            return RecommenderMetrics.HitRate(topN, testSet)
        return topN
Exemplo n.º 2
0
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter

# Raw MovieLens id of the user to recommend for (raw ids are strings).
testSubject = '85'
# Number of nearest neighbours to keep.
k = 10

# Load our data set and compute the user similarity matrix
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()
trainSet = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

# Get top N similar users to our test subject
# (Alternate approach would be to select users up to some similarity threshold - try it!)
testUserInnerID = trainSet.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]

# Every other user paired with their similarity to the test subject.
similarUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        similarUsers.append((innerID, score))

kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
# NOTE(review): the snippet appears truncated here — the loop that fills
# ``candidates`` from kNeighbors is missing.
candidates = defaultdict(float)
Exemplo n.º 3
0
def runUserColaborativeFiltering(testSubject = "85", k = 14):
    """Print and return top movie recommendations for ``testSubject``
    using user-based collaborative filtering (cosine KNN).

    Parameters
    ----------
    testSubject : str
        Raw MovieLens user id.
    k : int
        Number of similar users whose ratings are aggregated.

    Returns
    -------
    list[int]
        MovieLens movie ids, best first (up to 21 items, matching the
        original ``pos > 20`` cut-off).
    """
    # Load our data set and compute the user similarity matrix.
    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()
    # ``data`` is a surprise Dataset (raw ratings: userID movieID rating ts);
    # build_full_trainset() wraps it in a Trainset with inner-id helpers.
    trainSet = data.build_full_trainset()

    # Options for similarity calculations.
    sim_options = {'name': 'cosine', 'user_based': True}

    model = KNNBasic(sim_options=sim_options)
    # fit() must be called on a Trainset, not on the raw Dataset.
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()

    # Top-k users most similar to the test subject (an alternative would be
    # selecting users up to some similarity threshold).
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    similarityRow = simsMatrix[testUserInnerID]

    similarUsers = [(innerID, score)
                    for innerID, score in enumerate(similarityRow)
                    if innerID != testUserInnerID]

    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Sum the neighbours' ratings per item, weighted by user similarity
    # (ratings normalised by the 5-star maximum).
    candidates = defaultdict(float)
    for innerID, userSimilarityScore in kNeighbors:
        for itemID, rating in trainSet.ur[innerID]:
            candidates[itemID] += (rating / 5.0) * userSimilarityScore

    # Items the user has already seen are excluded below.
    watched = {itemID: 1 for itemID, _ in trainSet.ur[testUserInnerID]}

    # Get top-rated items from similar users.
    recommendations = []
    pos = 0
    print("\n\n-------------------<><><><>--------------------")
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in watched:
            # Raw iids may come back as strings like "1.0"; int(float(...))
            # replaces the original float -> int -> int triple conversion.
            movieID = int(float(trainSet.to_raw_iid(itemID)))
            recommendations.append(movieID)
            print(ml.getMovieName(movieID), ratingSum)
            pos += 1
            if (pos > 20):
                break
    print("-------------------<><><><>--------------------")
    # These are ids in the MovieLens dataset.
    return recommendations
    def getRecc(self, testSubject):
        """Return up to 9 raw movie ids recommended for ``testSubject``.

        NOTE(review): this snippet is corrupted/truncated — the line below
        was censored by the scraper (``******``) and ``candidates``,
        ``watched``, ``watchedList`` and ``trainSet`` are not defined in the
        visible code; presumably they were built earlier in the original
        method.  Recover the original source before relying on this.
        """
        print("Making Recommendation for user:"******"\nWatched:", sorted(watchedList))

# Get top-rated items from similar users:
        print("\nCollab Filt Recc:")
        pos = 0
        finalReccs = []
        # Walk candidates best-score-first, skipping already-watched items.
        for itemID, ratingSum in sorted(candidates.items(),
                                        key=itemgetter(1),
                                        reverse=True):
            if not itemID in watched:
                movieID = trainSet.to_raw_iid(itemID)
                finalReccs.append(movieID)
                print(movieID)
                pos += 1
                # ``pos > 8`` stops after 9 recommendations.
                if (pos > 8):
                    break

        return (finalReccs)
Exemplo n.º 5
0
def recommendations(request):
    """Django view: render user-based-CF and SVD book recommendations.

    Redirects to the profile page when the logged-in user has no Member
    profile.  On any recommender failure (e.g. the user has no ratings in
    the train set) both recommendation lists fall back to empty.
    """
    if Member.objects.filter(user=request.user).first() is None:
        messages.warning(request, 'You Need to first Update your profile.')
        return redirect('profile')
    testSubject = str(request.user.id)
    k = 10  # number of neighbours / SVD picks

    try:
        bk = BooksData('data/')
        data = bk.loadBooksData()

        trainSet = data.build_full_trainset()

        sim_options = {'name': 'cosine', 'user_based': True}

        model = KNNBasic(sim_options=sim_options)
        model.fit(trainSet)
        simsMatrix = model.compute_similarities()
        # Cosine similarity can yield NaN for users with no rating overlap.
        simsMatrix = np.nan_to_num(simsMatrix)

        # Get top N similar users to our test subject.
        testUserInnerID = trainSet.to_inner_uid(testSubject)

        if sim_options['user_based']:
            similarityRow = simsMatrix[testUserInnerID]

            similarUsers = [(innerID, score)
                            for innerID, score in enumerate(similarityRow)
                            if innerID != testUserInnerID]

            kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

            # Sum neighbours' ratings per item, weighted by similarity
            # (ratings here are on a 1-10 scale).
            candidates = defaultdict(float)
            for innerID, userSimilarityScore in kNeighbors:
                for itemID, rating in trainSet.ur[innerID]:
                    candidates[itemID] += (rating / 10.0) * userSimilarityScore
        else:
            # Item-based variant: seed with the user's top-k rated items.
            testUserRatings = trainSet.ur[testUserInnerID]
            kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

            candidates = defaultdict(float)
            for itemID, rating in kNeighbors:
                similarityRow = simsMatrix[itemID]
                for innerID, score in enumerate(similarityRow):
                    candidates[innerID] += score * (rating / 10.0)

        # Build a dictionary of stuff the user has already read.
        read = {}
        for itemID, rating in trainSet.ur[testUserInnerID]:
            read[itemID] = 1

        # Top-rated unread items from similar users.
        pos = 0
        bks2 = []
        for itemID, ratingSum in sorted(candidates.items(),
                                        key=itemgetter(1),
                                        reverse=True):
            if itemID not in read:
                bks2.append(trainSet.to_raw_iid(itemID))
                pos += 1
                if pos > 10:
                    break

        UCB = [Book.objects.get(ISBN=isbn) for isbn in bks2]

        # ---- SVD-based recommendations ----
        def GetAntiTestSetForUser(testSubject, trainSet):
            """Build (raw_uid, raw_iid, global-mean) triples for every item
            the user has NOT rated — the shape model.test() expects."""
            fill = trainSet.global_mean
            u = trainSet.to_inner_uid(str(testSubject))
            user_items = set(j for (j, _) in trainSet.ur[u])
            return [(trainSet.to_raw_uid(u), trainSet.to_raw_iid(i), fill)
                    for i in trainSet.all_items()
                    if i not in user_items]

        model = SVD()
        model.fit(trainSet)
        testSet = GetAntiTestSetForUser(testSubject, trainSet)
        predictions = model.test(testSet)
        # Each prediction unpacks as (uid, iid, r_ui, est, details).
        recommendations = [(ISBN, estimatedRating)
                           for _uid, ISBN, _actual, estimatedRating, _ in predictions]

        recommendations.sort(key=lambda x: x[1], reverse=True)

        SVDB = [Book.objects.get(ISBN=isbn)
                for isbn, _rating in recommendations[:k]]
    except Exception:
        # Best-effort fallback: any failure (unknown user, empty data,
        # missing Book rows) yields empty lists.  Was a bare ``except:``,
        # which also swallowed SystemExit/KeyboardInterrupt.
        UCB = []
        SVDB = []
    return render(request, 'LibraryMS/recommendations.html', {
        'UCB': UCB,
        'SVDB': SVDB
    })
from resources.RunDataLoader import run_data_loader
from surprise import KNNBasic

import numpy as np
import pickle
import pandas as pd

# Build an item-based cosine similarity matrix and persist it to disk.
ml = run_data_loader()
no_ratings = len(ml.ratings_df)
# NOTE(review): the computed count above is immediately discarded and a
# fixed 5,000,000 is used instead — confirm whether this cap is intentional.
no_ratings = 5000000

print('Number of ratings:' + str(no_ratings))
data = ml.loadData(no_ratings)
trainSet = data.build_full_trainset()

# Item-based (user_based=False) cosine similarity model.
sim_options = {'name': 'cosine', 'user_based': False}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
similarity_matrix = model.compute_similarities()

# mlPath = 'data/ml.pkl'
# with open(mlPath, 'wb') as file:
#     pickle.dump(ml, file)

# Persist the similarity matrix (.npy) and the trainset (pickle) for reuse.
np.save('data/similarity_matrix', similarity_matrix)

trainSetPath = 'data/trainSet.pkl'
with open(trainSetPath, 'wb') as file:
    pickle.dump(trainSet, file)
def user_based_rec_loader(data, testUser, no_recs, k=10):
    """Return a DataFrame of top book recommendations for ``testUser``
    using user-based collaborative filtering (cosine KNN).

    Parameters
    ----------
    data : surprise Dataset
        Raw ratings; the full trainset is built from it.
    testUser : str
        Raw user id.
    no_recs : int
        Number of recommendations to return.
    k : int, optional
        Number of nearest neighbours to aggregate (was hard-coded to 10;
        default preserves the old behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``book_title`` and ``rating_sum``, best first.

    Notes
    -----
    Relies on a module-level ``ml`` object for ``getItemName`` lookups.
    """
    trainSet = data.build_full_trainset()
    sim_options = {'name': 'cosine',
                   'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)

    similarity_matrix = model.compute_similarities()

    testUserInnerID = trainSet.to_inner_uid(testUser)
    similarity_row = similarity_matrix[testUserInnerID]

    # Every other user paired with their similarity to the test user.
    similarUsers = [(innerID, score)
                    for innerID, score in enumerate(similarity_row)
                    if innerID != testUserInnerID]

    # The k users with the largest similarities (an alternative would be a
    # threshold, e.g. keep every user with score > 0.75).
    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Sum what the neighbours rated, weighted by user similarity;
    # ``candidates`` maps inner item id -> combined (normalised) score.
    candidates = defaultdict(float)
    for innerID, userSimilarityScore in kNeighbors:
        for itemID, rating in trainSet.ur[innerID]:
            candidates[itemID] += (rating / 5.0) * userSimilarityScore

    # Items the user has already rated are excluded from the results.
    excluded = {itemID: 1 for itemID, _ in trainSet.ur[testUserInnerID]}

    # Results accumulator for the DataFrame below.
    results = {'book_title': [], 'rating_sum': []}

    # Top-rated unseen items from similar users.
    print('\n')
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in excluded:
            bookID = trainSet.to_raw_iid(itemID)
            results['book_title'].append(ml.getItemName(int(bookID)))
            results['rating_sum'].append(ratingSum)
            pos += 1
            # Same cut-off as the original ``pos > no_recs - 1``.
            if pos >= no_recs:
                break

    return pd.DataFrame(results)
Exemplo n.º 8
0
def simpleUserCFGive(id):
    """Compute top user-based-CF movie recommendations for user ``id`` and
    rewrite that user's line in SimpleUserCFBase.txt.

    The file holds one CSV line per user (``<id>,<movie>,<movie>,...``);
    any previous line for the same id is replaced.

    Note: the parameter shadows the builtin ``id`` — name kept so existing
    keyword callers keep working.
    """
    testSubject = str(id)
    k = 10  # number of nearest neighbours

    # Load our data set and compute the user similarity matrix.
    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()

    trainSet = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': True}

    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()

    # Top-k users most similar to the test subject (an alternative would be
    # a similarity threshold).
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    similarityRow = simsMatrix[testUserInnerID]

    similarUsers = [(innerID, score)
                    for innerID, score in enumerate(similarityRow)
                    if innerID != testUserInnerID]

    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Sum neighbours' ratings per item, weighted by user similarity.
    candidates = defaultdict(float)
    for innerID, userSimilarityScore in kNeighbors:
        for itemID, rating in trainSet.ur[innerID]:
            candidates[itemID] += (rating / 5.0) * userSimilarityScore

    # Items the user has already seen.
    watched = {itemID: 1 for itemID, _ in trainSet.ur[testUserInnerID]}

    # Build this user's CSV line from the top-rated unseen items.
    s = "\n" + str(id)
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(),
                                    key=itemgetter(1),
                                    reverse=True):
        if itemID not in watched:
            movieID = trainSet.to_raw_iid(itemID)
            s += "," + ml.getMovieName(int(movieID))
            pos += 1
            if (pos > 10):
                break

    # Rewrite the file, dropping any existing line for this user id.
    # ``with`` replaces the original manual open/close so the handles are
    # released even if an exception occurs mid-write.
    with open("E:\\Neeraj\\SimpleUserCFBase.txt", "r") as file:
        alld = file.readlines()
    with open("E:\\Neeraj\\SimpleUserCFBase.txt", "w") as file1:
        for r1 in alld:
            print(r1)
            u = r1.find(",")
            if (r1[0:u] == str(id)):
                pass
            else:
                file1.write(r1)
        file1.write(s)
    print("\nDone")
Exemplo n.º 9
0
 def __calc_sim_matrix(self):
     """Fit a KNNBasic model on the stored trainset using the configured
     similarity options and cache the resulting similarity matrix."""
     knn = KNNBasic(sim_options=self.sim_options)
     knn.fit(self.trainset)
     self.similarity_matrix = knn.compute_similarities()
Exemplo n.º 10
0
    def computeNovelCf(userid):
        """Print and return the top-10 novel recommendations for ``userid``
        via user-based collaborative filtering (cosine KNN).

        NOTE(review): defined at method indentation yet takes no ``self`` —
        presumably intended as a @staticmethod; confirm against the
        enclosing class.
        """
        testSubject = userid
        k = 10  # number of nearest neighbours

        # Load our data set and compute the user similarity matrix.
        ml = NovelLens()
        data = ml.loadNovelLensLatestSmall()

        trainSet = data.build_full_trainset()

        sim_options = {'name': 'cosine', 'user_based': True}

        model = KNNBasic(sim_options=sim_options)
        model.fit(trainSet)
        simsMatrix = model.compute_similarities()

        # Top-k users most similar to the test subject (an alternative would
        # be a similarity threshold).
        testUserInnerID = trainSet.to_inner_uid(testSubject)
        similarityRow = simsMatrix[testUserInnerID]

        similarUsers = []
        for innerID, score in enumerate(similarityRow):
            if (innerID != testUserInnerID):
                similarUsers.append((innerID, score))

        kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

        # Sum neighbours' ratings per item, weighted by user similarity.
        # BUGFIX: the inner ratings loop was dedented out of the neighbour
        # loop, so only the LAST neighbour's ratings were counted; it now
        # runs once per neighbour as in the sibling examples.
        candidates = defaultdict(float)
        for similarUser in kNeighbors:
            innerID = similarUser[0]
            userSimilarityScore = similarUser[1]
            theirRatings = trainSet.ur[innerID]
            for rating in theirRatings:
                candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

        # Build a dictionary of stuff the user has already seen.
        watched = {}
        for itemID, rating in trainSet.ur[testUserInnerID]:
            watched[itemID] = 1

        # Get top-rated items from similar users.
        pos = 0
        noveldatapro = []
        novels = []
        for itemID, ratingSum in sorted(candidates.items(),
                                        key=itemgetter(1),
                                        reverse=True):

            if not itemID in watched:
                novelID = trainSet.to_raw_iid(itemID)
                noveldatapro.append(novelID)
                print(ml.getNovelName(int(novelID)), ratingSum)
                novels.append(ml.getNovelName(int(novelID)))
                pos += 1
                if (pos > 9):
                    print("The top 10 novels for the user: " + testSubject)
                    print(noveldatapro)
                    break
        return novels
Exemplo n.º 11
0
def generate_sim_matrix(trainSet, sim_metric, is_user=True):
    """Fit a KNNBasic model on ``trainSet`` and return its similarity matrix.

    ``sim_metric`` names the similarity measure (e.g. 'cosine'); ``is_user``
    selects user-based (True) vs item-based (False) similarities.
    """
    knn = KNNBasic(sim_options={'name': sim_metric, 'user_based': is_user},
                   verbose=False)
    knn.fit(trainSet)
    return knn.compute_similarities()