def KNNPred(data):  # KNNWithMeans algorithm
    print("\nTraining KNN Means model..\n")
    global x_test, y_test, testlen, trainlen, y_train, model_params, X, Y, avg_rat, cold_itm
    options = model_params[0]
    knnModel = KNNWithMeans(sim_options=options)
    knnModel_1 = KNNWithMeans()
    train = data.build_full_trainset()
    knnModel.fit(train)
    print("\nTraining done..\nPrediction started..")
    knnModel_1.fit(train)
    #y_pred_w_m = [knnModel.predict(x_test[i][0], x_test[i][1]).est for i in range(testlen)]
    #y_pred_wo_m = [knnModel_1.predict(x_test[i][0], x_test[i][1]).est for i in range(testlen)]
    y_pred_w_m = [0 for i in range(testlen)]
    y_pred_wo_m = [0 for i in range(testlen)]
    kk = 0
    for i in x_test:
        if i[1] - 1 in cold_itm:
            y_pred_w_m[kk] = avg_rat[i[0] - 1]
            y_pred_wo_m[kk] = avg_rat[i[0] - 1]
        else:
            y_pred_w_m[kk] = knnModel.predict(i[0], i[1]).est
            y_pred_wo_m[kk] = knnModel_1.predict(i[0], i[1]).est
        kk += 1
    #y_pred_train = [knnModel_1.predict(x_train[i][0], x_train[i][1]).est for i in range(trainlen)]
    #y_pred_tot = [knnModel_1.predict(X[i][0], X[i][1]).est for i in range(trainlen+testlen)]
    print("\nPrediction done..\n")
    return [y_pred_w_m, y_pred_wo_m, knnModel,
            knnModel_1]  #, y_pred_train, y_pred_tot
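# A hedged usage sketch for KNNPred above: the function reads several
# module-level globals that must be prepared beforehand. The names come from
# the global statement; the shapes shown here are illustrative assumptions.
#
#   model_params = [{'name': 'pearson', 'user_based': False}]  # sim_options for the tuned model
#   x_test = [(user_id, item_id), ...]; testlen = len(x_test)
#   avg_rat = [...]        # average rating per user, indexed by user_id - 1
#   cold_itm = {...}       # 0-based ids of cold-start items (these fall back to the user average)
#   preds_with_means, preds_plain, model_tuned, model_default = KNNPred(data)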
def train_surprise_model():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    ### Test: is it possible to exchange the sim matrix?
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    a = algo.predict(93681, 100007)
    algo.sim = sim_matrix_imported
    b = algo.predict(93681, 100007)

    # We now need to save the similarity matrix somewhere:
    sim_matrix = algo.sim
    pd.DataFrame(sim_matrix).to_csv(
        '../Data/Recommender/sim_matrix-myKNNWithMeans_item_based_model')

    # Save the precomputed model:
    dump.dump('../Data/Recommender/myKNNWithMeans_item_based_model', algo)
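    # The saved model can later be restored with surprise.dump.load, which
    # returns a (predictions, algo) pair; a minimal sketch, assuming the path above:
    #   _, algo_restored = dump.load('../Data/Recommender/myKNNWithMeans_item_based_model')
    #   print(algo_restored.predict(93681, 100007).est)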
Example #3
    def CFM(self):
        u_id = []
        I_id = []
        r_ui_ = np.array([])
        _est = np.array([])

        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        for uid in (self.list):
            lids = self.data[self.data.uid == uid]
            a = self.data[self.data.uid == uid]

            for i in range(1, len(a)):
                lid = lids[i - 1:i].lid.values[0]
                r_ui = lids[i - 1:i].rate.values[0]
                pred = algo.predict(uid, lid, r_ui, verbose=True)
                u_id.append(int(pred.uid))
                I_id.append(int(pred.iid))
                r_ui_ = np.append(r_ui_, pred.r_ui)
                _est = np.append(_est, pred.est)

        self.df_est = pd.DataFrame({
            'uid': u_id,
            'Iid': I_id,
            'r_ui': r_ui_,
            'est': _est
        })
        self.arr = self.df_est['uid'].unique()

        self.CFWM_ndcg_ = self.Calculate_NDCG()
Example #4
class Rater:
    def __init__(self, ratings):
        self.classifier = KNNWithMeans(sim_options={"name": "cosine", "user_based": False})
        self.training_set = None
        self.ratings_dict = None
        self._prepare_data_(ratings)
        self._train_()

    def _prepare_data_(self, ratings):
        self.ratings_dict = {
            "user_id": [item.user_id for item in ratings],
            "movie_id": [item.movie_id for item in ratings],
            "mark": [item.mark for item in ratings]
        }
        df = pd.DataFrame(self.ratings_dict)
        data = Dataset.load_from_df(df[["user_id", "movie_id", "mark"]], Reader(rating_scale=Constants.RATING_SCALE))
        self.training_set = data.build_full_trainset()

    def _train_(self):
        self.classifier.fit(self.training_set)

    def get_ratings(self, user_id):
        predicted_ratings = {}
        for movie_id in self.ratings_dict["movie_id"]:
            prediction = self.classifier.predict(user_id, movie_id)
            predicted_ratings[movie_id] = prediction.est
        return predicted_ratings
    def run(self): #will run model
        ratings = pd.read_csv('rating_final.csv')
        ratings_dict = {"userID": list(ratings.userID), "placeID": list(ratings.placeID), "rating": list(ratings.rating)}
        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 2))
        data = Dataset.load_from_df(df[["userID", "placeID", "rating"]], reader)

        # To use item-based cosine similarity
        sim_options = {
            "name": "cosine",
            "user_based": True,  # Compute  similarities between items
            "min_support":9
        }
        # define a cross-validation iterator
        kf = KFold(n_splits=5)
        algo = KNNWithMeans(sim_options=sim_options)
        places = list(df['placeID'].unique())
        ordered = ArrayList()
        for i in places:
            total=0
            for trainset, testset in kf.split(data): #finds result for each fold
                # train algorithm.
                algo.fit(trainset)
                #test algorithm
                #predictions = algo.test(testset)
                # Compute and print Root Mean Squared Error
                #accuracy.rmse(predictions, verbose=True)

                #gets predicted rating for each place
                prediction = algo.predict(self.user, i, verbose=False)
                total+=prediction.est
            ordered.append(i, total/5) #we find average of estimate for each fold

        ordered.sort()
        highest = ordered.inArray[ordered.count - 5:ordered.count]

        place = pd.read_csv('geoplaces2.csv')

        #placedf = pd.DataFrame({"placeID": list(place.placeID), "name": list(place.name)})
        count = 0
        finalRec=ArrayList()
        for i in range(len(highest) - 1, -1, -1):
            count += 1
            name = list(place[place["placeID"].unique() == highest[i].id]['name'])
            finalRec.append(count, name[0])

        #printing accuracy score
        out = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
        mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
        print(mean_rmse)

        return finalRec.inArray
    def CFM(self):
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        for uid in (self.list):
            lids = self.data[self.data.uid == uid]
            a = self.data[self.data.uid == uid]

            for i in range(1, len(a)):
                lid = lids[i - 1:i].lid.values[0]
                r_ui = lids[i - 1:i].rate.values[0]
                pred = algo.predict(uid, lid, r_ui, verbose=True)

        return pred
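# The run() method above depends on a custom ArrayList container that is not
# shown in this snippet. A minimal sketch of what such a container might look
# like, inferred from its usage (append(key, value), sort(), .inArray, .count,
# elements exposing .id) -- an assumption, not the original class:
class _Entry:
    def __init__(self, id_, value):
        self.id = id_          # e.g. a placeID
        self.value = value     # e.g. the averaged predicted rating

class ArrayList:
    def __init__(self):
        self.inArray = []      # backing list of _Entry objects
        self.count = 0

    def append(self, id_, value):
        self.inArray.append(_Entry(id_, value))
        self.count += 1

    def sort(self):
        # ascending by value, so the best-rated entries end up at the tail
        self.inArray.sort(key=lambda e: e.value)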
Example #7
def computeKNNMeansMovie(data, test_np):
    """Compute the k-NN with mean item based method and return the predictions on the test
     The method is on all the data and got the following settings:
         - Similarity function : Pearson baseline, item based
         - Number of closest neighbors : 108
         
         data : data frame which represent the train set
         test_np : data frame on which the prediction will be returned
         
         return : test_np with a column of prediction named 'knnmeans_item_rating'"""
    
    trainset, test = dataTrainSurprise(data, test_np)
    
    sim_options = {'name':'pearson_baseline','user_based': False}
    knnmeans_algo = KNNWithMeans(k = 108, sim_options =sim_options).fit(trainset)

    test['knnmeans_item_rating'] = test[['user_id', 'movie_id']] \
    .apply(lambda row: knnmeans_algo.predict(row['user_id'], row['movie_id'])[3], axis=1)
    
    return test
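# computeKNNMeansMovie relies on a dataTrainSurprise helper that is not shown
# here. A minimal sketch of what it might do, assuming a 1-5 rating scale and
# the user_id / movie_id / rating column names used above:
def dataTrainSurprise(data, test_np):
    from surprise import Dataset, Reader
    reader = Reader(rating_scale=(1, 5))  # assumed scale
    trainset = Dataset.load_from_df(
        data[['user_id', 'movie_id', 'rating']], reader).build_full_trainset()
    return trainset, test_np.copy()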
def knn_centered_user(train, test, ids, Xtest, Xids):
    """
    kNN approach taking into account the mean ratings of each user
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Centered kNN User')
    algo = KNNWithMeans(k=200,
                        sim_options={'name': 'pearson_baseline',
                                     'min_support': 5,
                                     'user_based': True,
                                     'shrinkage': 120})

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example #9
def user_collaborative_filtering(trainset, testset):

    # Use user_based true/false to switch between user-based or item-based collaborative filtering
    algo = KNNWithMeans(k=50,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })
    algo.fit(trainset)

    # we can now query for specific predictions
    uid = str(196)  # raw user id
    iid = str(302)  # raw item id

    # get a prediction for specific users and items.
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)

    # run the trained model against the testset
    test_pred = algo.test(testset)

    # get RMSE
    print("User-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)
Example #10
startTime = time.time()
# Read the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
trainset = data.build_full_trainset()

# ItemCF: compute the scores
# When computing with the most similar items, only the k most similar are used
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

# Define a K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)
    # Compute MAE
    accuracy.mae(predictions, verbose=True)
# algo.fit(trainset)

uid = str(196)
iid = str(302)

pred = algo.predict(uid, iid)
print(pred)
endTime = time.time()
print("程序运行的时间:{}".format(endTime - startTime))
Example #11
trainset, testset = train_test_split(data, test_size=.15)


algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)


test_pred = algo.test(testset)


accuracy.rmse(test_pred, verbose=True)


algo.predict(uid=2, iid='Fight Club (1999)').est

Example #12
r_ui1 = 4
r_ui2 = 4
r_ui3 = 1
r_ui4 = 3

verboseFlag = True

# get a prediction for specific users and items.
print("KNNBaseLine:")
predBaseLine1 = algoBaseLine.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predBaseLine2 = algoBaseLine.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predBaseLine3 = algoBaseLine.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predBaseLine4 = algoBaseLine.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)

print("\nKNNBasic:")
predBasic1 = algoBasic.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predBasic2 = algoBasic.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predBasic3 = algoBasic.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predBasic4 = algoBasic.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)

print("\nKNNWithMeans:")
predWithMeans1 = algoWithMeans.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predWithMeans2 = algoWithMeans.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predWithMeans3 = algoWithMeans.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predWithMeans4 = algoWithMeans.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)

print("\nKNNWithZScore:")
predWithZScore1 = algoWithZScore.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predWithZScore2 = algoWithZScore.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predWithZScore3 = algoWithZScore.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predWithZScore4 = algoWithZScore.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)
Example #13

predBaseLine3 = algoBaseLine.predict(uid3,
                                     iid3,
                                     r_ui=r_ui3,
                                     verbose=verboseFlag)
predBaseLine4 = algoBaseLine.predict(uid4,
                                     iid4,
                                     r_ui=r_ui4,
                                     verbose=verboseFlag)

print("\nKNNBasic:")
predBasic1 = algoBasic.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag)
predBasic2 = algoBasic.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag)
predBasic3 = algoBasic.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag)
predBasic4 = algoBasic.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag)

print("\nKNNWithMeans:")
predWithMeans1 = algoWithMeans.predict(uid1,
                                       iid1,
                                       r_ui=r_ui1,
                                       verbose=verboseFlag)
predWithMeans2 = algoWithMeans.predict(uid2,
                                       iid2,
                                       r_ui=r_ui2,
                                       verbose=verboseFlag)
predWithMeans3 = algoWithMeans.predict(uid3,
                                       iid3,
                                       r_ui=r_ui3,
                                       verbose=verboseFlag)
predWithMeans4 = algoWithMeans.predict(uid4,
                                       iid4,
                                       r_ui=r_ui4,
                                       verbose=verboseFlag)

print("\nKNNWithZScore:")
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)

    top_n = get_top_n(predictions, n=5)

    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print('precision',
          sum(prec for prec in precisions.values()) / len(precisions))
    print('recall', sum(rec for rec in recalls.values()) / len(recalls))
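# get_top_n and precision_recall_at_k used in the loop above are helper
# functions in the style of the Surprise FAQ; they are not defined in this
# snippet. A minimal sketch of get_top_n under that assumption:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each raw user id to its n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n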

# Retrieve inner id of the movie Toy Story
# trainset = data.build_full_trainset()
# toy_story_raw_id = 'eSQ3z93DlzkpXK_H6MFEMw'
# toy_story_inner_id = algo.trainset.to_inner_uid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
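# A minimal sketch of that neighbour lookup, assuming the raw item id from the
# commented lines above and an item-based similarity (get_neighbors works on
# inner ids):
# inner_id = algo.trainset.to_inner_iid('eSQ3z93DlzkpXK_H6MFEMw')
# neighbor_inner_ids = algo.get_neighbors(inner_id, k=10)
# neighbor_raw_ids = [algo.trainset.to_raw_iid(i) for i in neighbor_inner_ids]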

r = restaurants_and_food.copy()
r['Estimate_Score'] = r['business_id'].apply(
    lambda x: algo.predict('gRdBkmXdRqUzDMkcMtt7rQ', x).est)

r = r.sort_values(by=['Estimate_Score'], ascending=False)
print(r[['business_id', 'name', 'categories', 'stars',
         'Estimate_Score']].head(10))
Example #15
data = Dataset.load_builtin('ml-100k')
train, test = train_test_split(data, test_size=0.25, random_state=10)

algo = SVD()
algo.n_epochs = 20
algo.random_state = 15
algo.fit(train)

predictions = algo.test(test)
accuracy.rmse(predictions)

uid = str(196)  # raw user id
iid = str(302)  # raw item id
r_ui = 4  # already know the true rating is 4, so we can make a comparison

pred = algo.predict(uid, iid, r_ui=r_ui, verbose=True)
print(pred.est)

knn = KNNWithMeans(
    sim_options={
        "name": "msd",  # cosine / msd / pearson / pearson_baseline
        "min_support": 2,
        "user_based": False
    })
knn.fit(train)

predictions = knn.test(test)
accuracy.rmse(predictions)
pred = knn.predict(uid, iid, r_ui=r_ui, verbose=True)
print(pred.est)
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for user with user_key = 93681
    """
    target_user_key = 93681

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')

    # calculate sparsity
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparsity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split in trainset and testset
    trainset, testset = train_test_split(data, test_size=0.2)

    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When surprise creates a Trainset or Testset object, it takes the raw_id’s (the ones that you used in the file
    # you imported), and converts them to so-called inner_id’s (basically a series of integers, starting from 0). You
    # might need to trace back to the original names. Using the items as an example (you can do the same approach
    # with users, just swap iid's with uid's in the code), to get the list of inner_iids, you can use the all_items
    # method. To convert from raw to inner id you can use the to_inner_iid method, and the to_raw_iid to convert back.

    # An example on how to save a list of inner and raw item id’s:
    trainset_iids = list(trainset.all_items())
    iid_converter = lambda x: trainset.to_raw_iid(x)
    trainset_raw_iids = list(map(iid_converter, trainset_iids))
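    # The same conversion for users, as mentioned in the comment above (a small sketch):
    trainset_uids = list(trainset.all_users())
    trainset_raw_uids = [trainset.to_raw_uid(u) for u in trainset_uids]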

    ## Model parameters of kNN:
    # Two hyperparameters we can tune:
    # 1. k parameter
    # 2. similarity option
    #   a) user-user vs item-item
    #   b) similarity function (cosine, pearson, msd)

    sim_option = {'name': 'pearson', 'user_based': False}

    # 3 different KNN Models: KNNBasic, KNNWithMeans, KNNWithZScore
    k = 40
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)

    accuracy.rmse(predictions)
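    # The two kNN hyperparameters noted above (k and the similarity options)
    # could be tuned with Surprise's GridSearchCV; a commented sketch, assuming
    # the same `data` object (not run here to keep the function fast):
    #   from surprise.model_selection import GridSearchCV
    #   param_grid = {'k': [10, 20, 40], 'min_k': [1, 5],
    #                 'sim_options': {'name': ['pearson', 'cosine'],
    #                                 'user_based': [False]}}
    #   gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=3)
    #   gs.fit(data)
    #   print(gs.best_params['rmse'], gs.best_score['rmse'])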

    # Own similarity matrix:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    algo.sim = sim_matrix_imported

    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Cross validation:
    skip = True
    if not skip:
        results = cross_validate(algo=algo,
                                 data=data,
                                 measures=['RMSE'],
                                 cv=5,
                                 return_train_measures=True)
        results_mean = results['test_rmse'].mean()

    ## Predictions
    # Let's assume we are happy with the method and now want to apply it to the entire data set.

    # Estimate for a specific user a specific item:
    single_item_single_user_prediction = algo.predict(uid=target_user_key,
                                                      iid=100010,
                                                      verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []

    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append(
            (single_prediction.uid, single_prediction.iid,
             single_prediction.est))

    # Then sort the predictions for each user and retrieve the k highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')
# df = pd.DataFrame(ratings_dict)
# df.to_csv('csv_example')

df = pd.read_csv('csv_example')
print(df)


reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)


# movielens = Dataset.load_builtin('ml-100k')

from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)


trainingSet = data.build_full_trainset()
algo.fit(trainingSet)

prediction = algo.predict('E', 'fifa')  # raw user id and raw item id (item id assumed to be a string)


print(prediction.est)
Example #18
from collections import defaultdict
import pprint
# Read the data
path = './movielens_sample.txt'
df = pd.read_csv(path, usecols=[0, 1, 2], skiprows=1)
df.columns = ['user', 'item', 'rating']
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(df, reader=reader)
trainset = data.build_full_trainset()

# ItemCF: compute the scores
# When computing with the most similar items, only the k most similar are used

kf = KFold(n_splits=5)
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(rmse, rmse * rmse)

predictions = []
for row in df.itertuples():
    user, item = getattr(row, 'user'), getattr(row, 'item')
    predictions.append([user, item, algo.predict(user, item).est])

print("*" * 100)
print("user\titem\tpredict\n")
pprint.pprint(predictions)
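# selfmade_approach() below builds predictions with a hand-rolled MyKnnWithMeans
# class that is not included in this snippet. A minimal sketch of such a class,
# assuming an item-based KNN-with-means formula over the exported similarity
# DataFrame (raw game_key index/columns), the target user's (game_key, rating)
# pairs and the per-item mean ratings -- an illustration, not the original code:
import heapq

class MyKnnWithMeans:
    def __init__(self, sim_matrix, target_user_ratings, item_means, k=40, min_k=1):
        self.sim_matrix = sim_matrix                      # pandas DataFrame, game_key x game_key
        self.target_user_ratings = target_user_ratings    # list of (game_key, rating)
        self.item_means = item_means                      # dict: game_key -> mean rating
        self.k = k
        self.min_k = min_k

    def predict_single_game(self, user_key, game_key):
        # similarity of the target game to every game the user has rated
        neighbors = [(self.sim_matrix.loc[game_key, j], r, j)
                     for (j, r) in self.target_user_ratings
                     if j in self.sim_matrix.index]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        sum_sim = sum_dev = actual_k = 0.0
        for sim, r, j in k_neighbors:
            if sim > 0:
                sum_sim += sim
                sum_dev += sim * (r - self.item_means[j])
                actual_k += 1
        if actual_k < self.min_k or sum_sim == 0:
            return self.item_means.get(game_key)   # fall back to the item mean
        return self.item_means[game_key] + sum_dev / sum_sim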
def selfmade_approach():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # 1st approach: Calculate for a single user contained in dataset:
    target_user_key = 286189
    target_user_info = df[df['user_key'] == target_user_key]

    # Estimate single game:
    target_game_key = 100098

    # data structures:
    # sim_matrix = ndarray(312,312)
    # xr = defaultdict: 312
    # yr = defaultdict 8787

    # later on replace these by self-written structures
    xr = algo.xr
    yr = algo.yr
    sim_matrix = algo.sim
    item_means = algo.means

    inner_target_uid = algo.trainset.to_inner_uid(target_user_key)
    inner_target_iid = algo.trainset.to_inner_iid(target_game_key)

    # switch: uid and iid:
    x = inner_target_uid
    y = inner_target_iid

    # pred2:
    inner_2_raw_item_ids = algo.trainset._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict(
        (v, k) for k, v in inner_2_raw_item_ids.items())

    # similarity matrix with raw ids instead of inner surprise ids:
    sim_matrix_df = pd.DataFrame(sim_matrix)
    sim_matrix_df = sim_matrix_df.rename(
        columns=lambda x: inner_2_raw_item_ids[x])
    sim_matrix_df = sim_matrix_df.rename(
        index=lambda x: inner_2_raw_item_ids[x])

    target_user_ratings = yr[x]

    # convert from inner to raw:
    target_user_ratings2 = []
    for (inner_iid, rating) in target_user_ratings:
        target_user_ratings2.append((inner_2_raw_item_ids[inner_iid], rating))

    # convert item means from inner to raw:
    item_means2 = {}
    for i, mean in enumerate(item_means):
        item_means2[inner_2_raw_item_ids[i]] = mean

    myKNN = MyKnnWithMeans(sim_matrix=sim_matrix_df,
                           target_user_ratings=target_user_ratings2,
                           item_means=item_means2,
                           k=k,
                           min_k=min_k)
    pred = myKNN.predict_single_game(user_key=target_user_key,
                                     game_key=target_game_key)
    pred_surprise = algo.predict(uid=target_user_key, iid=target_game_key)  # predict() expects raw ids

    estimate = pred
    print("Estimate for user %s for game %s is %s" %
          (target_user_key, target_game_key, estimate))

    # Estimate for user not contained in dataset:
    target_user_key = 123456789
    target_game_key = 100098

    user_ratings = [
        (100284, 7),
        (100311, 8),
        (105154, 2),
        (100020, 4),
        (100001, 9),
        (100277, 7),
    ]

    myKNN2 = MyKnnWithMeans(sim_matrix_df, user_ratings, item_means2, k, min_k)
    prediction = myKNN2.predict_single_game(target_user_key, target_game_key)

    # export similarity matrix:
    sim_matrix_df.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise.csv')

    # export item means:
    export_path = '../Data/Recommender/item-means.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means2, fp, sort_keys=False, indent=4)

    test = sim_matrix_df.loc[100516, 100284]

    pass
Example #20
# Sweep the SGD regularisation term of the baseline estimates and evaluate each setting.
for i in range(10):
    bsl_options = {'method': 'sgd',
                   'reg': i,
                   'learning_rate': 0.005,
                   }
    sim_options = {'name': 'msd',
                   'shrinkage': 0,
                   'user_based': True,
                   'min_support': 10,
                   }
    algo = KNNWithMeans(bsl_options=bsl_options, sim_options=sim_options)
    # algo = SVD(n_factors=20, n_epochs=30, lr_all=0.005, reg=0.04)
    algo.fit(trainset)
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print(perf)

indices = []
with open("data/sampleSubmission.csv", 'r') as sample:
    samples = sample.read().splitlines()[1:]

indices = [re.match(r'r(\d+?)_c(\d+?),.*?', line, re.DOTALL).groups() for line in samples]

with open("data/pred_svd.csv", 'w') as csvfile:
    fieldnames = ['Id', 'Prediction']
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    for item, user in indices:
        pred = algo.predict(user, item)
        writer.writerow({'Id': "r" + item + "_c" + user, 'Prediction': pred.est})
Example #21
class recommender:
    def __init__(self, algorithm):

        # Always call base method before doing anything.
        self.name = algorithm.lower()  # SVD, NMF, SAE, LSTM
        self.surprise_algorithms = ['svd', 'nmf', 'knnbasic', 'knnmeans']
        self.devooght_algorithms = ['fism']
        '''
         To implement with surprise:
             - Matrix-Factorization Based:
                 SVDpp: The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
             - Neighbourhood-based:
                 Coclustering
                 KNNWithZScore: A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.
                 KNNBaseline: A basic collaborative filtering algorithm taking into account a baseline rating.
             - Random Predictor    
                 NormalPredictor: Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
             - Baseline    
                 BaselineOnly: Algorithm predicting the baseline estimate for given user and item.
             - Slope One
                 SlopeOne: A simple yet accurate collaborative filtering algorithm.

        To implement using RNN:
            - LSTM 
            - GRU (Devooght, Bersini)
            - GRU with clustering (Devooght, Bersini)
            
        To extract latent factors:
            - Stacked Autoencoders
            - CNN
            - CNN with Stacked Autoencoders
        '''

        self.df_known_predictions = None
        self.df_unknown_predictions = None
        self.known_sequence_dict = None
        self.unknown_sequence_dict = None
        self.k = None
        self.k_min = None
        self.metrics = None

    def get_name(self, verbose=False):
        return self.name

    def fit(self,
            df_ratings=None,
            columns=['userId', 'itemId', 'rating'],
            verbose=False,
            **kwargs):

        self.columns = np.array(columns)
        # If Surprise lib is the base package to fit, then df_ratings must be used.
        # Algorithms that use Surprise Lib: NMF, SVD, KNN, SVDpp

        if (df_ratings is not None):
            self.df_ratings = df_ratings.copy()

        ###########################################
        # Convert Utility Matrix to df_ratings if utility matrix is passed
        #
        #
        ###########################################

        if self.name in self.surprise_algorithms:  # Surprise-based recommenders
            from surprise import Dataset
            from surprise import Reader

            # A reader is still needed but only the rating_scale param is required.
            # The Reader class is used to parse a file containing ratings.
            reader = Reader(rating_scale=(0.5, 5.0))

            # Separating timestamp column
            if ('timestamp' in columns):
                self.df_timestamp = self.df_ratings['timestamp'].copy()
                self.df_ratings.drop(labels='timestamp', inplace=True, axis=1)

            # The columns must correspond to user id, item id and ratings (in that order).
            data = Dataset.load_from_df(
                self.df_ratings[self.columns[np.where(
                    self.columns != 'timestamp')]], reader)

            # Creating trainset variable to be used in prediction functions of Surprise
            self.trainset = data.build_full_trainset()

            # Creating Model
            if self.name == 'svd':
                from surprise import SVD

                # Setting Number of Factors in Matrix Factorization
                if ('n_factors' in kwargs):
                    self.n_factors = kwargs['n_factors']
                else:
                    self.n_factors = 100
                    if (verbose):
                        print("Using default number of factors: {}".format(
                            self.n_factors))

                # Setting number of epochs in stochastic gradient descent
                if ('n_epochs' in kwargs):
                    self.n_epochs = kwargs['n_epochs']
                else:
                    self.n_epochs = 20
                    if (verbose):
                        print("Using default number of epochs: {}".format(
                            self.n_epochs))

                self.model = SVD(n_factors=self.n_factors,
                                 n_epochs=self.n_epochs,
                                 verbose=verbose)

            elif self.name == 'nmf':
                from surprise import NMF

                # Setting Number of Factors in Matrix Factorization
                if ('n_factors' in kwargs):
                    self.n_factors = kwargs['n_factors']
                else:
                    self.n_factors = 15
                    if (verbose):
                        print("Using default number of factors: {}".format(
                            self.n_factors))

                # Setting number of epochs in stochastic gradient descent
                if ('n_epochs' in kwargs):
                    self.n_epochs = kwargs['n_epochs']
                else:
                    self.n_epochs = 50
                    if (verbose):
                        print("Using default number of epochs: {}".format(
                            self.n_epochs))

                self.model = NMF(n_factors=self.n_factors,
                                 n_epochs=self.n_epochs,
                                 verbose=verbose)

            elif self.name == 'knnbasic':
                from surprise import KNNBasic

                # Setting number of neighbours
                if ('k' in kwargs):
                    self.k = kwargs['k']
                else:
                    self.k = 40
                    if (verbose):
                        print("Using default k: {}".format(self.k))

                # Setting minimum number of neighbours
                if ('k_min' in kwargs):
                    self.k_min = kwargs['k_min']
                else:
                    self.k_min = 1
                    if (verbose):
                        print("Using default k_min: {}".format(1))

                self.model = KNNBasic(k=self.k,
                                      min_k=self.k_min,
                                      verbose=verbose)

            elif self.name == 'knnmeans':
                from surprise import KNNWithMeans

                # Setting number of neighbours
                if ('k' in kwargs):
                    self.k = kwargs['k']
                else:
                    self.k = 40
                    if (verbose):
                        print("Using default k: {}".format(40))

                # Setting minimum number of neighbours
                if ('k_min' in kwargs):
                    self.k_min = kwargs['k_min']
                else:
                    self.k_min = 1
                    if (verbose):
                        print("Using default k_min: {}".format(1))

                self.model = KNNWithMeans(k=self.k,
                                          min_k=self.k_min,
                                          verbose=verbose)

            else:
                if (verbose):
                    print("Algorithm not configured: {}".format(self.name))
                return -1

            # Train the algorithm on the trainset
            self.model.fit(self.trainset)

            return 0

        elif (self.name in self.devooght_algorithms):

            # Arguments
            directory_path = os.path.join(
                '.', 'Sequence_based_recommendation_files', self.name)
            preprocess.create_dirs(dirname=directory_path, verbose=verbose)

            data = preprocess.remove_rare_elements(data=df_ratings,
                                                   min_user_activity=1,
                                                   min_item_popularity=1,
                                                   verbose=verbose)

            data = preprocess.save_index_mapping(data=data,
                                                 dirname=directory_path,
                                                 separator=',')

            train_set, val_set, test_set = preprocess.split_data(
                data=data,
                nb_val_users=0.1,  # val_size
                nb_test_users=0.1,  # test_size
                dirname=directory_path,
                verbose=verbose)

            preprocess.make_sequence_format(train_set=train_set,
                                            val_set=val_set,
                                            test_set=test_set,
                                            dirname=directory_path,
                                            verbose=verbose)

            preprocess.save_data_stats(data=data,
                                       train_set=train_set,
                                       val_set=val_set,
                                       test_set=test_set,
                                       dirname=directory_path,
                                       verbose=verbose)

            # Training Algorithm
            parser = parse.command_parser(parse.predictor_command_parser,
                                          train.training_command_parser,
                                          parse.early_stopping_command_parser)

            if self.name == 'fism':
                args = parser.parse_args([
                    '--dir',
                    os.path.join(directory_path, 'models'),
                    '-d',
                    directory_path,  #directory_path + '/', 
                    '-b',
                    '20',  # Batch size: the number of training examples present in a single batch
                    '--max_iter',
                    '50',  # Maximum number of iterations: the number of batches needed to complete one epoch
                    '--progress',
                    '10',  # when progress information should be printed during training
                    '-m',
                    self.name.upper(),  # Method
                    #'-i', '-1', # Number of batches - only on test parser
                    '--loss',
                    'RMSE',
                    '--save',
                    'Best'
                ])

                self.model = parse.get_predictor(args)

                dataset = handler.DataHandler(
                    dirname=args.dataset,
                    extended_training_set=args.extended_set,
                    shuffle_training=args.tshuffle)

                self.model.prepare_model(dataset)
                self.metrics = self.model.train(
                    dataset,
                    save_dir=args.dir,
                    time_based_progress=args.time_based_progress,
                    progress=float(args.progress),
                    autosave=args.save,
                    max_progress_interval=args.mpi,
                    max_iter=args.max_iter,
                    min_iterations=args.min_iter,
                    max_time=args.max_time,
                    early_stopping=parse.get_early_stopper(args),
                    load_last_model=args.load_last_model,
                    validation_metrics=args.metrics.split(','))

            else:
                if (verbose):
                    print("Algorithm not configured: {}".format(self.name))
                return -1

            return 0

        else:  # if self.name not in self.surprise_algorithms
            if (verbose):
                print("Invalid algorithm: {}".format(self.name))

    def get_model(self):
        return self.model

    def get_metrics(self):
        return self.metrics

    def calculate_known_predictions(self):
        # Calculating all predictions for known items

        if self.name in self.surprise_algorithms:
            # Calculating predictions dataframe as userId, itemId, rating, prediction
            # predictions return raw uid and iid

            known_predictions = self.model.test(self.trainset.build_testset(
            ))  # Brings all predictions of existing ratings

            for prediction in known_predictions:
                arr = np.array([
                    int(prediction.uid),
                    int(prediction.iid), prediction.r_ui, prediction.est
                ])
                if prediction == known_predictions[0]:
                    predictions = np.array([arr])
                else:
                    predictions = np.append(predictions, [arr], axis=0)

            self.df_known_predictions = pd.DataFrame({
                'userId':
                predictions[:, 0],
                'itemId':
                predictions[:, 1],
                'rating':
                predictions[:, 2],
                'prediction':
                predictions[:, 3]
            })

            if ('timestamp' in self.columns):
                self.df_known_predictions = self.df_known_predictions.set_index(
                    keys=['userId', 'itemId']).join(
                        self.df_ratings.drop('rating', axis=1).set_index(
                            keys=['userId', 'itemId'])).reset_index()

            self.df_known_predictions['userId'] = self.df_known_predictions[
                'userId'].astype(int)
            self.df_known_predictions['itemId'] = self.df_known_predictions[
                'itemId'].astype(int)

    def get_known_predictions(self, calculate_predictions=False):
        if self.df_known_predictions is None or calculate_predictions == True:
            self.calculate_known_predictions()

        return self.df_known_predictions

    def calculate_unknown_predictions(self):
        # Calculating all predictions for unknown items
        # predictions return raw uid and iid

        if self.name in self.surprise_algorithms:
            unknown_predictions = self.model.test(
                self.trainset.build_anti_testset(
                ))  # => Brings all predictions of non-existing ratings

            for prediction in unknown_predictions:
                arr = np.array([
                    int(prediction.uid),
                    int(prediction.iid), 0, prediction.est
                ])
                if prediction == unknown_predictions[0]:
                    predictions = np.array([arr])
                else:
                    predictions = np.append(predictions, [arr], axis=0)

            self.df_unknown_predictions = pd.DataFrame({
                'userId':
                predictions[:, 0],
                'itemId':
                predictions[:, 1],
                'rating':
                predictions[:, 2],
                'prediction':
                predictions[:, 3]
            })

    def get_unknown_predictions(self, calculate_predictions=False):
        if self.df_unknown_predictions is None or calculate_predictions == True:
            self.calculate_unknown_predictions()

        return self.df_unknown_predictions

    def predict(self, userId, itemId, verbose=False):

        if self.name in self.surprise_algorithms:
            prediction = self.model.predict(
                uid=int(userId),
                iid=int(itemId))  # Take as input the raw user id and item id
            #ref: http://surprise.readthedocs.io/en/stable/algobase.html#surprise.prediction_algorithms.algo_base.AlgoBase.predict

            if prediction.details['was_impossible'] == True:
                if (verbose):
                    print(
                        "Impossible to predict item {} rating for user {} (one of them may not have been in training step)"
                        .format(itemId, userId))
                return 0
            else:
                return prediction.est

    def get_top_n(self, n=10, source='unknown', calculate_sequence=False):
        '''Return the top-N recommendation for each user from a set of predictions.
        Args:        
            n(int): The number of recommendations to output for each user. Default
                is 10.
        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n. '''

        if (source.lower() == 'known'):

            # Checking if known predictions are calculated
            if (self.df_known_predictions is None):
                self.get_known_predictions(calculate_predictions=True)

            if (calculate_sequence == True
                    or self.known_sequence_dict is None):
                self.known_sequence_dict = dict()

                for userId in self.df_known_predictions['userId'].unique():
                    # Selecting single user
                    df_user = self.df_known_predictions[
                        self.df_known_predictions['userId'] == userId].copy()

                    # Sorting values by prediction
                    df_user.sort_values(by=['prediction'],
                                        ascending=False,
                                        inplace=True)

                    # Saving the first K in sequence dict
                    self.known_sequence_dict[userId] = np.array(
                        df_user['itemId'].head(n))

            return self.known_sequence_dict

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))

# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

# recommender.py

from surprise import KNNWithMeans

# To use user-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between user
}
algo = KNNWithMeans(sim_options=sim_options)

trainingSet = data.build_full_trainset()

algo.fit(trainingSet)



# Predict how much user E would rate item 2
prediction = algo.predict('E', 2)
prediction.est
if(prediction.est>3.5):
  print('The product is recommended.')
  print('Here, the prediction estimate is : '+str(prediction.est))
Example #23
playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# Convert the playlist ids back to playlist names
# to_raw_uid maps inner ids back to raw ids
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
                      for playlist_id in playlist_neighbors)

print("The 10 playlists closest to playlist <", current_playlist, "> are:\n")
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]))

# Rebuild the song-id-to-song-name mapping dict
song_id_name_dic = pickle.load(open("data/song.pkl", "rb"), encoding='utf-8')
print("Finished loading the song-id-to-song-name mapping dict...")
# Rebuild the song-name-to-song-id mapping dict
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("Finished loading the song-name-to-song-id mapping dict...")

# User with inner id 4
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    print(algo.predict(user_inner_id, song, r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])
Example #24
def make_prediction(test_data_imdb):
    train_data = pd.read_csv('../data/modeling/train/ratings_clean_std_0.csv',
                             sep=',').drop(columns={'Unnamed: 0'})
    omdb = pd.read_csv('../data/modeling/train/omdb_cleaned.csv')

    # build a reader, define the rating scale (minimum and maximum value)
    reader = Reader(rating_scale=(0.5, 5))
    # convert data to surprise format
    train_surprise = Dataset.load_from_df(train_data,
                                          reader).build_full_trainset()

    # Collaborative Filtering Models
    knn_collaborative = KNNWithMeans(k=115,
                                     min_k=5,
                                     sim_options={
                                         'name': 'msd',
                                         'user_based': False
                                     })
    knn_collaborative.fit(train_surprise)
    svd = SVD(lr_all=0.01, reg_all=0.05, n_epochs=23)
    svd.fit(train_surprise)
    preds = [[
        knn_collaborative.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ],
             [
                 svd.predict(test[1], test[3]).est
                 for test in test_data_imdb.itertuples()
             ]]

    # Content-Based Models
    # define features for content-based models
    params_features = {
        'threshold_actors': 0,
        'ts_languages': 0,
        'year': True,
        'runtime': True,
        'imdbvotes': True,
        'series': False,
        'awards': False,
        'genres': True,
        'imdb_rating': True,
        'roto_rating': True,
        'pg_rating': True,
        'threshold_newkeywords': 0,
        'threshold_plots': 0,
        'threshold_directors': 0
    }
    # load features
    features, names = preprocessing.features(**params_features)

    # add imdbID and set as index
    features = omdb[['imdbID'
                     ]].join(pd.DataFrame(features)).set_index('imdbID')

    # predict ratings
    pred_content = []
    no_of_ratings = []
    train_data = train_data[train_data['imdbID'] != 'tt0720339']
    for row in test_data_imdb.itertuples():
        # select user and movie

        imdbID = row.imdbID
        userID = row.user_id

        # compute predictions
        if imdbID == 'tt0720339':
            # exclude outlier movie without information
            pred_content.append(svd.predict(userID, imdbID).est)
        else:
            # select ratings of the user
            ratings_user = train_data.loc[train_data['user_id'] == userID]
            ratings_user.reset_index(inplace=True, drop=True)
            # select features of corresponding movies and convert to array
            features_user = np.array(features.loc[ratings_user['imdbID']])
            features_movie = np.array(features.loc[imdbID])

            pred_content.append(
                predict_movie_rating(ratings_user, features_user,
                                     features_movie))
        # store the number of predictions of a user:
        no_of_ratings.append(ratings_user.shape[0])

    # predictions of the models
    predictions = weighted_prediction(preds[0], preds[1], pred_content,
                                      no_of_ratings)
    test_data_with_rating = test_data_imdb.join(predictions)

    return test_data_with_rating[['user_id', 'movieID', 'rating']]
Example #25
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import cross_validate
import os

# Path to the data file
file_path = os.path.expanduser('mydata.csv')
# Tell the reader what format the text is in
reader = Reader(line_format='user item rating', sep=',')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'user_based': False})  # when computing with the most similar items, only the k most similar are used
algo.fit(trainset)

# we can now query for specific predictions
uid = str(5)  # raw user id
iid = str(1)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)# rating of user-5 to item-1

#----------------------------
uid = str(5)  # raw user id
iid = str(5)  # raw item id
# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)
Example #26
trainset, testset = train_test_split(data, test_size=.15)

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50,
                    sim_options={
                        'name': 'pearson_baseline',
                        'user_based': True
                    })
algo.fit(trainset)

# we can now query for specific predictions
uid = str(196)  # raw user id
iid = str(302)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
test_pred = algo.test(testset)

# get RMSE
print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

# if you wanted to evaluate on the trainset
print("User-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50,
Example #27
algo3.fit(data_train.build_full_trainset())

pred1 = []
pred_f1 = []
pred2 = []
pred_f2 = []
pred3 = []
pred_f3 = []
with open("./data/testing.dat", "r", encoding='utf-8') as f:
    for line in f.readlines():
        line_data = line.strip().split(",")
        a = algo1.predict(str(line_data[0]), str(line_data[1]), None, True,
                          True)[3]
        b = algo2.predict(str(line_data[0]), str(line_data[1]), None, True,
                          True)[3]
        c = algo3.predict(str(line_data[0]), str(line_data[1]), None, True,
                          True)[3]
        pred1.append(int(round(a)))
        pred_f1.append(a)
        pred2.append(int(round(b)))
        pred_f2.append(b)
        pred3.append(int(round(c)))
        pred_f3.append(c)

with open("./雷雨轩_PB18111791_4.txt", "w") as f:
    for ratings in pred1:
        f.write(str(ratings) + "\n")

with open("./4_float.txt", "w") as f:
    for ratings in pred_f1:
        f.write(str(ratings) + "\n")
Example #28
list_reviews = read_datafile(data_file)

df = pd.DataFrame(list_reviews, columns=['UserId', 'ItemId', 'Playtime'])
#filter_dataset(df)
#normalize_playtime(df)

reader = Reader(rating_scale=(0, max(df.Playtime)))

sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

if cross_validate:
    data = Dataset.load_from_df(df, reader)

    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
else:
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_data = Dataset.load_from_df(train_df, reader)
    training_set = train_data.build_full_trainset()
    algo.fit(training_set)

    for index, row in test_df.iterrows():
        user = row['UserId']
        item = row['ItemId']
        playtime = row['Playtime']
        prediction = algo.predict(user, item)
        print('{}:{} - {} / {}'.format(user, item, prediction, playtime))
Example #29
print(movie_vector['Toy Story (1995)'])

dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

print('dataset', dataset)

ratings.rating.min()
ratings.rating.max()

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

trainset, testset = train_test_split(data, test_size=.15)

algo = KNNWithMeans(k=50,
                    sim_options={
                        'name': 'pearson_baseline',
                        'user_based': True
                    })
algo.fit(trainset)

test_pred = algo.test(testset)

print('accuracy', accuracy.rmse(test_pred, verbose=True))
print('predict', algo.predict(uid=2, iid='Fight Club (1999)').est)
Example #30
class KNNMean:
    def __init__(self, data, rating_scale, k=50, min_k=1, sim_options=None):
        self.data = data
        self.rating_scale = rating_scale
        self.k = k
        self.min_k = min_k
        self.reader = Reader(rating_scale=self.rating_scale)
        if not sim_options:
            sim_options = {
                "name": "cosine",
                'min_support': 3,
                "user_based": False
            }  # Compute  similarities between items
        self.model_data = Dataset.load_from_df(
            data.loc[:, ["userId", "movieId", "rating"]], self.reader)
        self.trainset = self.model_data.build_full_trainset()
        self.model = KNNWithMeans(self.k, self.min_k, sim_options=sim_options)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)
        self.grid_search_ = None

    def set_model_params(self, model_params):
        print('updating model parameters...')
        self.model = KNNWithMeans(**model_params)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)

    def update_grid_search(self, gs):
        self.grid_search_ = gs

    def fit(self, data):
        self.data = data
        self.model_data = Dataset.load_from_df(
            data.loc[:, ["userId", "movieId", "rating"]], self.reader)
        self.trainset = self.model_data.build_full_trainset()
        self.model.fit(self.trainset)

    def grid_search(self):
        print('grid search...')
        sim_options = {
            "name": ["msd", "cosine"],
            "min_support": [3, 4],
            "user_based": [False]
        }
        param_grid = {
            "sim_options": sim_options,
            "k": [50, 100, 200],
            "min_k": [1]
        }
        gs = GridSearchCV(KNNWithMeans,
                          param_grid,
                          measures=["rmse", "mae"],
                          cv=3)
        gs.fit(self.model_data)
        best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"]
        print(f'Best score (RMSE): {best_score}')
        print(f'Best params (RMSE): {best_params}')

        print(f'Best score (MAE): {gs.best_score["mae"]}')
        print(f'Best params (RMSE): {gs.best_params["mae"]}')

        self.set_model_params(best_params)

        return best_params

    def predict(self, test_data):
        ratings = test_data.apply(
            lambda x: self.model.predict(x['userId'], x['movieId']).est,
            axis=1)
        return ratings
Example #31
kf = KFold(n_splits=2)
algo = KNNWithMeans(k=30, min_k=1, verbose=True)

count = 0
for trainset, testset in kf.split(data):
    count = count + 1
    print('Training round: ' + str(count))
    # Train and test the algorithm
    print('Training')
    algo.fit(trainset)
    print('Train completed!')
    #if count is 1:
    #break
    print('Validating')
    predictions = algo.test(testset)
    print('Validation completed!')

    # Compute and print RMSE (Root Mean Squared Error)
    accuracy.rmse(predictions, verbose=True)

# Save the model
dump.dump('saved_model_knnm.model', algo=algo, verbose=1)

# Generate the submission results
fo = open('submission.txt', 'w', encoding='utf-8')
with open('test.txt', 'r', encoding='utf-8') as f:
    for line in f:
        row = line.split(',')
        fo.write(str(algo.predict(int(row[0]), int(row[1])).est) + '\n')
fo.close()