from surprise import Dataset, KNNBaseline, Reader

def get_med_recs_(user_id):

    # dataset
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    # create and fit the KNN recommender
    algo = KNNBaseline()
    algo.fit(trainset)

    d = dict()

    # store key (id) and value (prediction) in dict; ml-100k has items 1..1682
    for i in range(1, 1683):
        iid = str(i)
        prediction = algo.predict(user_id, iid, verbose=False)
        if 2.3 <= prediction.est <= 2.7:
            d[int(iid)] = prediction.est

    # sort by value and return the top 5 (id, rating) pairs, highest first
    sort = sorted(d.items(), key=lambda x: x[1])
    return sort[-1:-6:-1]
def get_surprise_recs_(user_id):

    file_path = 'C:/Users/frank/MovieSurprise/MovieSurprise/u_surpriseTest.tsv'
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(file_path, reader)

    trainset = data.build_full_trainset()
    algo = KNNBaseline()
    algo.fit(trainset)

    d = dict()

    # store key (id) and value (prediction) in dict
    for i in range(1, 1683):
        iid = str(i)
        prediction = algo.predict(user_id, iid, verbose=False)
        if prediction.est >= 3:
            d[int(iid)] = prediction.est

    # sort by value and return the top 5 (id, rating) pairs, highest first
    sort = sorted(d.items(), key=lambda x: x[1])
    return sort[-1:-6:-1]
def get_best_recs_(user_id):

    # dataset
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    # create and fit the KNN recommender
    algo = KNNBaseline()
    algo.fit(trainset)

    d = dict()
    # for each movie, find the prediction; if its predicted rating is 4.5+, add it to the dict
    for i in range(1, 1683):
        iid = str(i)
        prediction = algo.predict(user_id, iid, verbose=False)
        if prediction.est >= 4.5:
            # store the item id as the key, prediction as value
            d[int(iid)] = prediction.est
    # sort into a list of tuples where sort[x][0] is the movie id and
    # sort[x][1] is the predicted rating, in ascending order of rating
    sort = sorted(d.items(), key=lambda x: x[1])
    # start at the last element, end at the 6th-to-last, step of -1;
    # if fewer than 5 items qualify, the slice safely returns fewer (no error)
    final_list = sort[-1:-6:-1]
    # return a list of (id, rating) tuples, best first
    return final_list
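# The three functions above differ only in the rating band they keep, so the
# shared pattern can be factored out. A minimal sketch: the helper name is
# ours, while the fitted `algo` and the 1682-item range come from the snippets above.
def top5_in_band(algo, user_id, low, high=5.0, n_items=1682):
    scores = {}
    for i in range(1, n_items + 1):
        pred = algo.predict(user_id, str(i), verbose=False)
        if low <= pred.est <= high:
            scores[i] = pred.est
    # sort by predicted rating, descending, and keep the top 5 (id, rating) pairs
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]

# e.g. get_best_recs_(user_id) is equivalent to top5_in_band(algo, user_id, 4.5)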
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """
    nearest neighbour approach using the movie baseline
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """

    print('kNN Baseline Movie')
    bsl_option = {'method': 'als', 'n_epochs': 100, 'reg_u': 15, 'reg_i': 0.01}

    sim_option = {
        'name': 'pearson_baseline',
        'min_support': 1,
        'user_based': False
    }

    algo = KNNBaseline(k=100,
                       bsl_options=bsl_option,
                       sim_options=sim_option,
                       verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
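# Hedged sketch of the "final blending" the docstring mentions: each base model
# appends its predictions to Xtest/Xids, and a ridge regression (an assumption
# here; the source does not show the blender) can learn blend weights against
# the true test ratings y_test (also assumed).
import numpy as np
from sklearn.linear_model import Ridge

def blend(Xtest, Xids, y_test):
    X = np.column_stack(Xtest)          # one column of predictions per base model
    blender = Ridge(alpha=1.0).fit(X, y_test)
    return blender.predict(np.column_stack(Xids))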
Example #5
def eval(user_id):

    # Step 1: Define variables
    ratings = pps.get_all_ratings_as_df() # read ratings from database
    ratings[RATING] = None
    ratings.loc[ratings[LIKED] == True, RATING] = 1
    ratings.loc[ratings[LIKED] == False, RATING] = 0

    reader = Reader(rating_scale=(0.0, 1.0))

    all_items = ratings.poi_id.unique() # find all items
    user_rmse = pd.DataFrame(columns=['est', 'true']) # resulting dataframe for storing the probabilities

    # Step 2: Iterate over all items, leaving out the current iteration's item (x) for training
    for x in np.nditer(all_items):

        # Step 2a: Define test dataset -> rating of the current user and the left-out item
        testset = ratings[(ratings.user_id == user_id)]
        testset = testset[(testset.poi_id == x)]

        # Step 2b: If the user has given no rating for this item, the prediction cannot be compared to a true value => skip
        if testset.rating.size == 0:
            continue

        # Step 2c: Define train dataset -> leave out the current item x
        trainset = ratings[~ratings.isin(testset).all(1)]
        trainset = Dataset.load_from_df(trainset[[USER_ID, POI_ID, RATING]], reader)
        trainset = trainset.build_full_trainset()

        # Step 2d: Train, then predict the left-out item x
        algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
        algo.fit(trainset)

        # np.asscalar was removed from NumPy; .item() is the modern equivalent.
        # Note: r_ui=4 lies outside the 0-1 scale; it only annotates the Prediction and does not affect the estimate.
        pred = algo.predict(user_id, x.item(), r_ui=4, verbose=False)

        # Step 2e: Store estimate and true value in the output dataframe
        user_rmse.loc[len(user_rmse)] = [pred.est, testset.rating.item()]

    # Step 3: Average the squared errors over all leave-one-out estimations
    # (despite the variable names, this is an MSE, not an RMSE)
    confidence = np.mean((user_rmse.est - user_rmse.true)**2)

    return confidence
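# Usage sketch: `pps` and the RATING/LIKED/USER_ID/POI_ID constants come from
# the surrounding project; the user id below is a placeholder whose type must
# match ratings.user_id. Despite the name "confidence", the returned value is
# a mean squared error, so lower is better.
loo_mse = eval(user_id=42)
print('leave-one-out MSE:', loo_mse)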
Example #6
def predict(rating_dic):

    df_clean = pd.read_csv("dataset_clean.csv")
    #######################
    # Fit surprise model
    #######################

    final_model = KNNBaseline(k=60, min_k=2, sim_options={'name': 'pearson_baseline', 'user_based': True})

    new_user_id = max(df_clean["userID"]) + 1
    ratings = np.array(list(rating_dic.values()))
    rated_mask = np.array([r is not None for r in ratings])  # keep only rated items
    ratings = ratings[rated_mask]
    items = np.array(list(rating_dic.keys()))[rated_mask]
    user = np.ones(len(items), dtype="int") * new_user_id
    new_user_df = pd.DataFrame({"userID": user, "itemID": items, "rating": ratings})

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    total_df = pd.concat([df_clean, new_user_df])

    # A reader is still needed, but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 10))

    # The columns must correspond to user id, item id and ratings (in that order).
    new_trainset = Dataset.load_from_df(total_df[["userID", "itemID", "rating"]], reader).build_full_trainset()

    ## Fit the best model

    final_model.fit(new_trainset)

    predicted_ratings = []
    for nootropic in nootropics_list:
        predicted_ratings.append(final_model.predict(new_user_id, nootropic).est)

    item_baselines = final_model.default_prediction() + final_model.compute_baselines()[
        1]  # mean rating + item baseline? note: compute_baselines()[1] is indexed
            # by inner item id, which may not match the order of nootropics_list

    result_df = pd.DataFrame(
        {"nootropic": nootropics_list, "predicted_rating": predicted_ratings, "baseline_rating": item_baselines})

    nootropics_without_ratings = [nootropic for nootropic in nootropics_list if (nootropic not in rating_dic.keys())]
    new_result_df = result_df[result_df["nootropic"].isin(nootropics_without_ratings)]
    return new_result_df.sort_values("predicted_rating", ascending=False, ignore_index=True)
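# Usage sketch: `nootropics_list` and dataset_clean.csv come from the
# surrounding project; the item names and scores below are made-up
# placeholders. Unrated items are passed as None, matching the mask above.
recommendations = predict({"Caffeine": 7, "L-Theanine": 8, "Creatine": None})
print(recommendations.head())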
Example #7
def KNN(data, kwargs):
    # Set algorithm (the kwarg keys are spelled as the caller provides them)
    k_neigbor  = kwargs.get('n_neigbor')
    min_neighb = kwargs.get('min_neigbor')
    similarity = kwargs.get('similarity')

    options = {'name': similarity}
    algo = KNNBaseline(k=k_neigbor,
                       min_k=min_neighb,
                       sim_options=options)

    # Train the algorithm on the data, and predict ratings for the testset
    algo.fit(data)

    # predict the full 10000-user x 1000-item rating matrix (raw ids are 1-based strings)
    prediction = np.zeros([10000, 1000])
    for row in range(10000):
        for col in range(1000):
            prediction[row, col] = algo.predict(str(row + 1), str(col + 1)).est

    return prediction
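# Usage sketch: `trainset` is assumed to be a surprise trainset built
# elsewhere; the kwarg keys mirror the (misspelled) names the function
# actually reads.
prediction_matrix = KNN(trainset, {'n_neigbor': 40,
                                   'min_neigbor': 1,
                                   'similarity': 'pearson'})
print(prediction_matrix.shape)  # (10000, 1000)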
Example #8
algo2 = KNNWithZScore(k=40, min_k=1, sim_options=sim_options1, verbose=True)
algo2.fit(data_train.build_full_trainset())

algo3 = KNNWithMeans(k=40, min_k=1, sim_options=sim_options2, verbose=True)
algo3.fit(data_train.build_full_trainset())

pred1 = []
pred_f1 = []
pred2 = []
pred_f2 = []
pred3 = []
pred_f3 = []
with open("./data/testing.dat", "r", encoding='utf-8') as f:
    for line in f.readlines():
        line_data = line.strip().split(",")
        a = algo1.predict(str(line_data[0]), str(line_data[1]), None, True,
                          True)[3]
        b = algo2.predict(str(line_data[0]), str(line_data[1]), None, True,
                          True)[3]
        c = algo3.predict(str(line_data[0]), str(line_data[1]), None, True,
                          True)[3]
        pred1.append(int(round(a)))
        pred_f1.append(a)
        pred2.append(int(round(b)))
        pred_f2.append(b)
        pred3.append(int(round(c)))
        pred_f3.append(c)

with open("./雷雨轩_PB18111791_4.txt", "w") as f:
    for ratings in pred1:
        f.write(str(ratings) + "\n")
Example #9
def Search(request):
    if request.method == 'POST':
        # Create a form instance and populate it with data from the request (binding):
        form = SearchForm(request.POST)
        city_to_search = form.data['city']
        # I won't check if the form is valid...
        # then I need to give that user two lists of hotels

        # 1st: train again the model

        # change this so I read from database
        # start off from the ratings:
        df = pd.DataFrame(list(Rating.objects.all().values()))
        df['user_id'] = df['user_id'].astype(str)
        reader = Reader(rating_scale=(0, 6))
        data = Dataset.load_from_df(
            df[['user_id', 'hotel_id', 'rating_OVERALL']], reader)

        # train kNN-Baseline on the whole collection (both, user and item-wise)
        trainset = data.build_full_trainset()

        # Build two algorithms, and train them: algo and algo_items.
        algo = KNNBaseline()
        algo.fit(trainset)
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        algo_items = KNNBaseline(sim_options=sim_options)
        algo_items.fit(trainset)

        # 2nd: top hotels for user
        # find existing hotels:

        hotels = df['hotel_id'].unique().tolist()

        user1 = request.user.username
        hot_ratings_user = {}
        # loop to find ratings
        for hot in hotels:
            pred = algo.predict(user1, hot)
            hot_ratings_user[hot] = pred.est
        # the whole dictionary should be done now...
        # we need to filter hotels within the city
        df_hotels_names = pd.DataFrame(list(Hotel.objects.all().values()))
        # df_hotels_names has:  city  hotel_id  name  price_approx  star_class state  zipcode

        # this should be the first context
        sorted_hot_ratings_user = sorted(hot_ratings_user,
                                         key=hot_ratings_user.get,
                                         reverse=True)
        context1 = pd.DataFrame()
        for key in sorted_hot_ratings_user:
            hotcurr = df_hotels_names[df_hotels_names['hotel_id'] == key]
            if hotcurr['city'].to_string(index=False) == city_to_search:
                dicttemp = pd.DataFrame({
                    'Hotel name': [hotcurr['name'].to_string(index=False)],
                    'Estimated rating':
                    round(hot_ratings_user[key], 2)
                })
                # DataFrame.append was removed in pandas 2.0; use pd.concat
                context1 = pd.concat([context1, dicttemp], ignore_index=True)

        # 3rd: item based:
        hot_ratings_i = {}
        for hot in hotels:
            pred_i = algo_items.predict(user1, hot)
            hot_ratings_i[hot] = pred_i.est
        # the whole dictionary should be done now...
        sorted_hot_ratings_i = sorted(hot_ratings_i,
                                      key=hot_ratings_i.get,
                                      reverse=True)
        context2 = pd.DataFrame()
        for key in sorted_hot_ratings_i:
            hotcurr = df_hotels_names[df_hotels_names['hotel_id'] == key]
            if hotcurr['city'].to_string(index=False) == city_to_search:
                dicttemp = pd.DataFrame({
                    'Hotel name': [hotcurr['name'].to_string(index=False)],
                    'Estimated rating':
                    round(hot_ratings_i[key], 2)
                })
                context2 = pd.concat([context2, dicttemp], ignore_index=True)

        # for sorting purposes, let's move it back to a df and sort it NOW...
        context1 = context1.sort_values(by='Estimated rating',
                                        ascending=False)[:10]
        context2 = context2.sort_values(by='Estimated rating',
                                        ascending=False)[:10]

        # turn these into lists:
        Hotel_name_user = context1['Hotel name'].values.tolist()
        Estimated_rating_user = context1['Estimated rating'].values.tolist()
        Hotel_name_item = context2['Hotel name'].values.tolist()
        Estimated_rating_item = context2['Estimated rating'].values.tolist()

        context_rendering = {
            'city': city_to_search,
            'user': zip(Hotel_name_user, Estimated_rating_user),
            'item__': zip(Hotel_name_item, Estimated_rating_item)
        }

        # redirect to a thank you URL:
        return render(request, 'search_results.html', context_rendering)

    # If this is a GET (or any other method) create the default form.
    else:
        form = SearchForm()
    context = {
        'form': form,
    }

    return render(request, 'search_form.html', context)
Example #10
0
def get_Ui(iid):  # name assumed: the snippet began mid-docstring
    """Return the number of users that have rated a given item.
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
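
# usage sketch for the helper above: count the raters of raw item id '20'
print(get_Ui('20'))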


# we can now query for specific predictions
uid = str(1)  # raw user id
iid = str(20)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
#print(pred)

#Get a list of all animes
animeid = Rating['anime_id'].unique()
#Get a list of animes that uid 50 has rated
animeid50 = Rating.loc[Rating['user_id'] == 50, 'anime_id']
#Remove the animes that uid 50 has rated
anime_to_predict = np.setdiff1d(animeid, animeid50)

testing = [[50, anime_id, 4.] for anime_id in anime_to_predict]
predictions = algo.test(testing)
print(predictions[0])  # inspect the first prediction

pred_ratings = np.array([pred.est for pred in predictions])
print(pred_ratings)
Example #11
    print()
    print('Predicting for a playlist:')
    current_playlist_name = convertor.get_name_by_index(39)
    print('Playlist name:', current_playlist_name)

    playlist_rid = convertor.get_rid_by_name(current_playlist_name)
    print('Playlist rid:', playlist_rid)

    playlist_inner_id = algo.trainset.to_inner_uid(playlist_rid)
    print('Playlist inner id:', playlist_inner_id)

    playlist_neighbors_inner_ids = algo.get_neighbors(playlist_inner_id, k=10)
    playlist_neighbors_rids = (algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors_inner_ids)
    playlist_neighbors_names = (convertor.get_name_by_rid(rid) for rid in playlist_neighbors_rids)

    print()
    print('The 10 playlists closest to "', current_playlist_name, '" are: \n')
    for playlist_name in playlist_neighbors_names:
        print(playlist_name, algo.trainset.to_inner_uid(convertor.get_rid_by_name(playlist_name)))

    print()
    print('Predicting for a user (each playlist represents one user):')
    user_inner_id = 4
    print('User inner id:', user_inner_id)
    user_rating = trainset.ur[user_inner_id]
    print('Number of songs the user has rated:', len(user_rating))
    items = map(lambda x: x[0], user_rating)
    for song in items:
        # predict() expects *raw* ids, so convert the inner ids back first
        print(algo.predict(trainset.to_raw_uid(user_inner_id), algo.trainset.to_raw_iid(song), r_ui=1),
              convertor.get_song_name_by_iid(algo.trainset.to_raw_iid(song)))
    surprise.dump.dump('./knn_baseline.model', algo=algo)
Example #12
import pandas as pd
import numpy as np
from tqdm import tqdm
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from scipy.spatial.distance import cityblock, cosine, euclidean, hamming, jaccard, rogerstanimoto
data = Dataset.load_builtin('ml-1m')

trainset, testset = train_test_split(data, test_size=.15)

algo = KNNBaseline(k=50,
                   min_k=1,
                   sim_options={
                       'name': 'pearson_baseline',
                       'user_based': True
                   })
algo.fit(trainset)

test_pred = algo.test(testset)

print('accuracy', accuracy.rmse(test_pred, verbose=False))
# raw ids are strings, and ml-1m items are numeric movie ids
# (2959 is "Fight Club (1999)" in ml-1m's movies.dat)
print('predict', algo.predict(uid='2', iid='2959').est)
Example #13
iid1 = str(306)  # raw item id (as in the ratings file). They are **strings**!
iid2 = str(514)
iid3 = str(977)
iid4 = str(370)

r_ui1 = 4
r_ui2 = 4
r_ui3 = 1
r_ui4 = 3

verboseFlag = True

# get a prediction for specific users and items.
print("KNNBaseLine:")
predBaseLine1 = algoBaseLine.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag)
predBaseLine2 = algoBaseLine.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag)
predBaseLine3 = algoBaseLine.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag)
predBaseLine4 = algoBaseLine.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag)

print("\nKNNBasic:")
predBasic1 = algoBasic.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag)
predBasic2 = algoBasic.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag)
predBasic3 = algoBasic.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag)
predBasic4 = algoBasic.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag)

print("\nKNNWithMeans:")
predWithMeans1 = algoWithMeans.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag)
predWithMeans2 = algoWithMeans.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag)
predWithMeans3 = algoWithMeans.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag)
predWithMeans4 = algoWithMeans.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag)
Example #14
import pickle  # cPickle is Python 2 only; on Python 3 the stdlib pickle replaces it
import os
# Rebuild the song-id -> song-name mapping dict
song_id_name_dic = pickle.load(open("popular_song.pkl", "rb"))
print("Loaded the song-id -> song-name mapping dict...")
# Rebuild the song-name -> song-id mapping dict
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("Loaded the song-name -> song-id mapping dict...")

user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    # predict() expects *raw* ids, so convert the inner ids back first
    print(algo.predict(trainset.to_raw_uid(user_inner_id), algo.trainset.to_raw_iid(song), r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])

### Using NMF
from surprise import NMF
from surprise import Dataset

file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Build the dataset and train the model
algo = NMF()
trainset = music_data.build_full_trainset()
# train() was removed from newer surprise versions; fit() is the replacement
algo.fit(trainset)
Example #15
class Recommender():

    def __init__(self, dataset, new_products):
        '''
        Class which returns recommendations to a new customer.
        Initializes training data based on a full dataset.
        Initializes an item-item and a user-user recommender.

        Item-Item Recommender:
        - algorithm  :  KNNBaseline
        - K          :  21
        - sim        :  pearson correlation

        User-User Recommender:
        - algorithm  :  KNNBaseline
        - K          :  99
        - sim        :  msd

        (for more information, see Surprise_CF.ipynb)
        '''
        self.new_products = new_products

        # Append new customer to data
        new_data = pd.DataFrame({'customer_id':[1]*len(self.new_products),
                                 'product_id': self.new_products,
                                 'star_rating':[5]*len(self.new_products)})
        full_data = pd.concat([new_data, dataset]).reset_index(drop=True)
        data = Dataset.load_from_df(full_data[['customer_id', 'product_id', 'star_rating']], Reader(rating_scale=(1, 5)))

        self.unique_products = dataset['product_id'].unique()
        self.trainset = data.build_full_trainset()
        self.ii_algo = KNNBaseline(k=21, sim_options={'name': 'pearson', 'user_based': False})
        # previously KNNWithMeans(k=12, sim_options={'name': 'pearson', 'user_based': True})
        self.uu_algo = KNNBaseline(k=99, sim_options={'name': 'msd', 'user_based': True})

    def new_recommendations(self):
        '''
        Returns recommendations for the new customer (id 1), based on the
        products supplied at construction time.

        Returns:
        - ii_recs  :  top-10 item ids from the item-item recommender
        - uu_recs  :  top-10 item ids from the user-user recommender
        '''

        # Train recommender systems
        self.ii_algo.fit(self.trainset)
        self.uu_algo.fit(self.trainset)

        recommendations = {'items': [], 'ii_rating': [], 'uu_rating': []}
        for item in self.unique_products:
            if item not in self.new_products:
                ii_rating = self.ii_algo.predict(1, item, verbose=False).est
                uu_rating = self.uu_algo.predict(1, item, verbose=False).est
                recommendations['items'].append(item)
                recommendations['ii_rating'].append(ii_rating)
                recommendations['uu_rating'].append(uu_rating)
        recs_df = pd.DataFrame(recommendations)
        ii_recs = recs_df.sort_values(by='ii_rating', ascending=False).head(10)['items']
        uu_recs = recs_df.sort_values(by='uu_rating', ascending=False).head(10)['items']

        return ii_recs, uu_recs
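# Usage sketch: `df` is assumed to be a ratings dataframe with customer_id,
# product_id and star_rating columns (the ones __init__ selects); the product
# ids below are placeholders.
rec = Recommender(df, new_products=['B00001', 'B00002'])
ii_recs, uu_recs = rec.new_recommendations()
print(ii_recs.tolist())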
Example #16
pred1 = algo1.predict(uid, iid, verbose=True)
#KNNWithMeans
algo2 = KNNWithMeans(k=30,
                     sim_options={
                         'name': 'cosine',
                         'user_based': False
                     },
                     verbose=True)
algo2.fit(trainset)
pred2 = algo2.predict(uid, iid, verbose=True)

# KNNWithZScore
algo3 = KNNWithZScore(k=30,
                      sim_options={
                          'name': 'MSD',
                          'user_based': True
                      },
                      verbose=True)
algo3.fit(trainset)
pred3 = algo3.predict(uid, iid, verbose=True)
#KNNBaseline
algo4 = KNNBaseline(k=30,
                    sim_options={
                        'name': 'MSD',
                        'user_based': True
                    },
                    verbose=True)
algo4.fit(trainset)
pred4 = algo4.predict(uid, iid, verbose=True)
Example #17
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)  # reconstructed line: the snippet began mid-expression
                      for inner_id in playlist_neighbors)
playlist_neighbors = [id_name_dic[playlist_id]
                      for playlist_id in playlist_neighbors]

print("From the previous step:", playlist_neighbors)
print('The 10 playlists closest to "', current_palylist, '" are:\n')
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[current_palylist]))

# Predict for a user
song_id_name_dic = pickle.load(open("popular_song.pkl", "rb"),
                               encoding='utf-8')
print("Loaded the song-id -> song-name mapping dict...")
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("Loaded the song-name -> song-id mapping dict...")
# the user with inner id 4
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    # predict() expects *raw* ids, so convert the inner ids back first
    print(algo.predict(trainset.to_raw_uid(user_inner_id), algo.trainset.to_raw_iid(song), r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])
print("Done...")

# Persist the model
surprise.dump.dump('./recommendation.model', algo=algo)
# It can be reloaded like this (dump.load returns a (predictions, algo) tuple)
_, algo = surprise.dump.load('./recommendation.model')
Example #18
bsl_options = {
    'method': 'sgd',  # solver; allowed values: als and sgd
    'n_epochs': 10,  # number of iterations
    'reg': 0.02,  # regularization coefficient used while solving for the parameters
    'learning_rate': 0.1  # learning rate for the parameter updates
}
"""
	k=40: number of neighbor samples used when making a prediction
	min_k=1: minimum number of neighboring users/items required to produce a prediction
	sim_options={} : how the similarity matrix is computed
"""
sim_options = {
    'name': 'pearson',  # similarity measure; allowed values: pearson\msd\cosine\pearson_baseline
    'user_based': True  # user-based vs. item-based collaborative filtering
}
algo = KNNBaseline(k=40, min_k=1, sim_options=sim_options)

# 4. Train the model
algo.fit(trainset)

# 5. Evaluate the model
# TODO: the surprise framework needs separate evaluation code for this (see the sketch below)

# 6. Persist the model / predict
# Predictions must go through the predict method, which calls the estimate API under the hood
# The user id and item id passed to the predict API must be strings
uid = "196"
iid = "242"
pred = algo.predict(uid, iid)
print("Predicted rating of user {} for item {}: {}".format(uid, iid, pred.est))
Example #19
iid1 = str(50)  # raw item id (as in the ratings file). They are **strings**!
iid2 = str(1223)
iid3 = str(131)
iid4 = str(395)

r_ui1 = 1
r_ui2 = 1
r_ui3 = 0
r_ui4 = 0

verboseFlag = True

# get a prediction for specific users and items.
print("KNNBaseLine:")
predBaseLine1 = algoBaseLine.predict(uid1,
                                     iid1,
                                     r_ui=r_ui1,
                                     verbose=verboseFlag)
predBaseLine2 = algoBaseLine.predict(uid2,
                                     iid2,
                                     r_ui=r_ui2,
                                     verbose=verboseFlag)
predBaseLine3 = algoBaseLine.predict(uid3,
                                     iid3,
                                     r_ui=r_ui3,
                                     verbose=verboseFlag)
predBaseLine4 = algoBaseLine.predict(uid4,
                                     iid4,
                                     r_ui=r_ui4,
                                     verbose=verboseFlag)

print("\nKNNBasic:")
Example #20
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]))

# Predict for a user
song_id_name_dic = pickle.load(open("popular_song.pkl", "rb"))
print("Loaded the song-id -> song-name mapping dict...")
# Rebuild the song-name -> song-id mapping dict
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("Loaded the song-name -> song-id mapping dict...")

user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    # predict() expects *raw* ids; `song` (not the leftover `song_id`) is the inner item id
    print(algo.predict(trainset.to_raw_uid(user_inner_id), algo.trainset.to_raw_iid(song), r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])

from collections import defaultdict
from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))  # append an (item id, rating) tuple
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n
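# Usage sketch following the standard surprise pattern: predict every unseen
# (user, item) pair and keep each user's top 10. `data` is assumed to be a
# loaded Dataset (e.g. Dataset.load_builtin('ml-100k')).
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)
predictions = algo.test(trainset.build_anti_testset())
top_n = get_top_n(predictions, n=10)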
Example #21
def run_knn_baseline(sparse_data):
    #filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"

    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    ### sim name: cosine    msd       pearson     pearson_baseline
    ### user_based : True ---- similarity will be computed based on users
    ###            : False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset

        #algo = SVD(n_factors = 5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)
        #calculate_rmse(predictions)

        ### test
        rowNum = raw_data.get_row_size()
        colNum = raw_data.get_col_size()
        cur_time = time.time()
        time_cost = 0

        for i in range(rowNum):
            user = raw_data.get_userID(i)
            predictions[user] = set()
            pq = []
            heapq.heapify(pq)
            for j in range(colNum):
                item = raw_data.get_itemID(j)
                if user not in userPurchasedSet or item in userPurchasedSet[
                        user]:
                    continue
                value = raw_data.get_val(user, item, 'rating')
                predict = algo.predict(user, item, r_ui=0, verbose=False).est
                # keep the 10 highest-scoring items: push while the heap is
                # short, otherwise replace the current minimum if beaten
                if len(pq) < 10:
                    heapq.heappush(pq, (predict, item))
                elif predict > pq[0][0]:
                    heapq.heapreplace(pq, (predict, item))
            top_n[user] = set()
            for items in pq:
                top_n[user].add(items[1])
            if user in userTrueTestSet:
                curPrecisions = calculate_precision(top_n[user],
                                                    userTrueTestSet[user])
                curRecalls = calculate_recall(top_n[user],
                                              userTrueTestSet[user])
                ffeature = calculate_f_feature(curPrecisions, curRecalls)
                curHit = isHit(top_n[user], userTrueTestSet[user])
                cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
                total_precisions += curPrecisions
                total_recalls += curRecalls
                total_hit += curHit
                total_nDCG += cur_nDCG
                total_ffeature += ffeature
                result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                               str(curRecalls) + "\t" + str(ffeature) + "\t" +
                               str(curHit) + '\t' + str(cur_nDCG) + "\n")
            if i != 0 and i % 1000 == 0:
                duration = (time.time() - cur_time) / 60
                time_cost += duration
                remaining_time = ((rowNum - i) / 1000) * duration
                cur_time = time.time()
                #print('precisions', total_precisions, ' recalls', total_recalls, ' nDCG', total_nDCG)
                print('i:', i, "/", rowNum, 'remaining time:', remaining_time, 'min')
    print('precisions', total_precisions, ' recalls', total_recalls, ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum, 'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature', str(total_ffeature / rowNum),
          'avg_hit:', total_hit / rowNum, 'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' + str(total_nDCG / rowNum) +
                   "\n")
    result_f.close()
Example #22
class Recommender:
    def __init__(self):
        self.__load_rating_data_set()
        self.__load_movies_set()
        self.train_model()

    def train_model(self):
        self.__load_training_set()

        # Using KNN
        sim_options = {'name': 'pearson_baseline', 'user_based': True}
        self.algo = KNNBaseline(k=25, sim_options=sim_options)

        # Training the model
        self.algo.fit(self.training_set)

    def __get_not_rated_movies(self, user_id):
        m_ratings = \
            self.ratings_set.loc[
                self.ratings_set.user_id == user_id]

        rated_movies = list(m_ratings.item_id)

        # keep only ids the user has not rated (the original kept None
        # placeholders for rated movies, which then reached predict())
        not_rated = \
            [mid
             for mid in self.ratings_set.item_id.unique()
             if mid not in rated_movies]

        return list(not_rated)

    def __filter_by_movies_saved_in_kitso(self, predictions):
        kitso_movies_ids = list(self.movies_set.id)
        # keep the (movie_id, rating) tuples so they can still be sorted by rating
        return list(
            filter(lambda pair: pair[0] in kitso_movies_ids, predictions))

    def __search_in_list_of_tuples(self, elem, list_tuples):
        tuples_with_elem = list(filter(lambda tup: elem in tup, list_tuples))
        return elem if len(tuples_with_elem) > 0 else False

    def __predict_rating(self, user_id, movies_ids):
        predicted_rating = []

        for mid in movies_ids:
            prediction = self.algo.predict(user_id, mid)
            prediction_tuple = (mid, float(prediction.est))

            if not self.__search_in_list_of_tuples(mid, predicted_rating):
                predicted_rating.append(prediction_tuple)

        return predicted_rating

    def get_top_n_recommended_movies(self, user_id, n=5):

        not_rated = self.__get_not_rated_movies(user_id)

        rating_predictions = self.__predict_rating(user_id, not_rated)

        rating_predictions = self.__filter_by_movies_saved_in_kitso(
            rating_predictions)

        sorted_predictions = sorted(rating_predictions,
                                    key=itemgetter(1),
                                    reverse=True)[:n]

        # isin() needs plain movie ids, not (id, rating) tuples
        top_ids = [movie_id for movie_id, _ in sorted_predictions]
        response = self.movies_set[self.movies_set.id.isin(top_ids)]

        return jsonify(response.to_dict('records'))

    def __load_training_set(self):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            self.ratings_set[['user_id', 'item_id', 'rating']], reader)
        self.training_set = data.build_full_trainset()

    def __load_movies_set(self):
        self.movies_set = pd.read_csv(FILE_PATH_MOVIES,
                                      delimiter=';',
                                      encoding='latin-1')

    def __load_rating_data_set(self):
        self.ratings_set = pd.read_csv(FILE_PATH_RATINGS, delimiter=';')
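
# Usage sketch: FILE_PATH_RATINGS and FILE_PATH_MOVIES are module-level
# constants in the surrounding project, and `app` is the Flask app;
# get_top_n_recommended_movies returns jsonify(...), so it must be called
# inside an application context (e.g. from a route).
recommender = Recommender()

@app.route('/recommendations/<user_id>')
def recommendations(user_id):
    return recommender.get_top_n_recommended_movies(user_id)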