Example #1
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""

    reader = Reader(line_format='user item rating', sep=' ',
                    rating_scale=(1, 5), skip_lines=3)

    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()

    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.fit(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.fit(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
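
For reference, the same comparison can be sketched in a self-contained way with an in-memory DataFrame instead of the custom_train fixture; the toy ratings below are invented for illustration and are not part of the original test.

import pandas as pd
from surprise import Dataset, KNNBasic, Reader

# Invented toy ratings so the snippet runs without the custom_train file.
ratings = pd.DataFrame({
    'user': [0, 0, 0, 1, 1, 2, 2, 3, 3, 4],
    'item': [0, 1, 2, 0, 3, 1, 3, 0, 2, 3],
    'rating': [4, 3, 5, 2, 4, 1, 5, 3, 4, 2],
})
data = Dataset.load_from_df(ratings, Reader(rating_scale=(1, 5)))
trainset = data.build_full_trainset()

algo_ub = KNNBasic(sim_options={'user_based': True})
algo_ub.fit(trainset)
algo_ib = KNNBasic(sim_options={'user_based': False})
algo_ib.fit(trainset)

print(algo_ub.get_neighbors(0, k=3))  # 3 nearest users to inner user id 0
print(algo_ib.get_neighbors(0, k=3))  # 3 nearest items to inner item id 0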
Example #2
class ItemCF():
    def __init__(self):
        file_path = os.path.expanduser('user_item_rate.csv')
        reader = Reader(line_format='user item rating', sep=',')
        surprise_data = Dataset.load_from_file(file_path, reader=reader)
        all_trainset = surprise_data.build_full_trainset()

        # Train the model: item-based similarity
        self.item_algo = KNNBasic(k=10,
                                  min_k=3,
                                  sim_options={'user_based': False})
        # sim_options={'name': 'cosine','user_based': True} cosine/msd/pearson/pearson_baseline
        self.item_algo.fit(all_trainset)

    def get_similar_items(self, top_k, item_id):
        """
        相似项目
        Args:
            top_k(int): 相似项目数量
            item_id(str): 项目id

        Returns:
            list generator
        """
        item_inner_id = self.item_algo.trainset.to_inner_iid(item_id)
        item_neighbors = self.item_algo.get_neighbors(item_inner_id, k=top_k)
        item_neighbor_ids = (self.item_algo.trainset.to_raw_iid(inner_id)
                             for inner_id in item_neighbors)
        return item_neighbor_ids
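
A minimal usage sketch for the class above, assuming user_item_rate.csv exists in the comma-separated user,item,rating format and that 'item_1' is one of its raw item ids (both are illustrative assumptions, not taken from the original):

# Hypothetical call; 'item_1' stands in for a real raw item id from the CSV.
item_cf = ItemCF()
similar_items = list(item_cf.get_similar_items(top_k=5, item_id='item_1'))
print(similar_items)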
Example #3
    def item_based_cf(self, co_pe, df_path):
        # INITIALIZE REQUIRED PARAMETERS
        path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
        prnt = "ITEM"
        sim_op = {'name': co_pe, 'user_based': False}
        algo = KNNBasic(sim_options=sim_op)

        reader = Reader(line_format="user item rating",
                        sep=',',
                        rating_scale=(1, 5))
        df = Dataset.load_from_file(df_path, reader=reader)

        # START TRAINING
        trainset = df.build_full_trainset()

        # APPLYING ALGORITHM KNN Basic
        res = algo.fit(trainset)  # fit() replaces the deprecated train() and returns the algorithm
        print("\t\t >>>TRAINED SET<<<<\n\n", res)

        # Read the mappings raw id <-> movie name
        rid_to_name, name_to_rid = self.read_item_names(path)
        print "CF Type:", prnt, "BASED"

        search_key = input(
            "Enter a Movie Name, \n ex. Toy Story (1995) or Seven (Se7en) (1995)\n Movie name:"
        )
        print("ALGORITHM USED : ", co_pe)
        raw_id = name_to_rid[search_key]

        # --------------------------------------------- MARKERS

        f = io.open("cluster/AlgoHist_ib.txt", "wb")
        f.write(repr(co_pe))
        f.close()

        # --------------------------------------------- MARKERS END

        print "\t\t RAW ID>>>>>>>", raw_id, "<<<<<<<"
        inner_id = algo.trainset.to_inner_iid(raw_id)

        print "INNER ID >>>>>", inner_id

        # Retrieve inner ids of the nearest neighbors of the chosen movie.
        k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
        neighbors = algo.get_neighbors(inner_id, k=k)

        neighbors = (algo.trainset.to_raw_iid(inner_id)
                     for inner_id in neighbors)
        neighbors = (rid_to_name[rid] for rid in neighbors)

        print "Nearest ", k, " Matching Items are:"
        for i in neighbors:
            print "\t " * 6, i
Example #4
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating",
                    sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.fit(trainset)  # fit() replaces the deprecated train()
    print("ALGORITHM USED", co_pe)

    # --------------------------------------------- MARKERS

    f = io.open("_AlgoHist_ub.txt", "wb")
    f.write(repr(co_pe))
    f.close()

    # --------------------------------------------- MARKERS END

    print "CF Type:", prnt, "BASED"

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))

    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))

    inner_id = algo.trainset.to_inner_uid(search_key)  # map the raw *user* id for user-based neighbors
    neighbors = algo.get_neighbors(inner_id, k=k)
    print "Nearest Matching users are:"
    for i in neighbors:
        print "\t " * 6, i
    return top_n, result_u
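
The get_top_n helper used above is not shown in this snippet; a common implementation (essentially the one from the Surprise FAQ) groups the predictions by user and keeps the n highest estimates, as sketched below.

from collections import defaultdict


def get_top_n(predictions, n=10):
    """Map each raw user id to its n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n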
Example #5
def FriendRecommender(user):
    df = pd.DataFrame(rating_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['users', 'colleges', 'rating']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}

    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)

    uid = trainset.to_inner_uid(user)
    pred = algo.get_neighbors(uid, 3)

    for i in pred:
        # x is presumably a list defined elsewhere in the original program
        x.insert(i, (trainset.to_raw_uid(i)))
Example #6
def FriendRecommender(user):
    df = pd.DataFrame(rating_dict)
    reader = Reader(rating_scale=(1, 5))  # the ratings range from 1 to 5
    data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}
    # Using cosine to measure similarities, user based approach

    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)

    uid = trainset.to_inner_uid(user)
    pred = algo.get_neighbors(
        uid, 3)  # returns 3 nearest neighbours of inputted user

    for i in pred:
        print(trainset.to_raw_uid(i))
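
A minimal usage sketch, assuming a module-level rating_dict shaped like the columns selected above; the data here is invented purely for illustration.

# Illustrative data only; the real rating_dict is defined elsewhere in the program.
rating_dict = {
    'user': ['ana', 'ana', 'bob', 'bob', 'cat', 'cat', 'dan'],
    'item': ['mit', 'cmu', 'mit', 'ucb', 'cmu', 'ucb', 'mit'],
    'rating': [5, 3, 4, 2, 3, 5, 4],
}
FriendRecommender('ana')  # prints the raw ids of the 3 users most similar to 'ana'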
Example #7
class Movie_KNN_recommender:
    def __init__(self, mode=0):
        self.index = pd.read_csv('../data/personal/movies.csv')
        self.reader = Reader()
        self.ratings = pd.read_csv('../data/personal/ratings.csv')
        data = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']], self.reader)
        trainset = data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        if mode == 0:
            self.algo = KNNBaseline(sim_options=sim_options)
        elif mode == 1:
            self.algo = KNNWithMeans(sim_options=sim_options)
        elif mode == 2:
            self.algo = KNNBasic(sim_options=sim_options)
        else:
            exit(0)

        self.algo.fit(trainset)

    def get_similar_movies(self, movieID, num=10):
        movie_inner_id = self.algo.trainset.to_inner_iid(movieID)
        movie_neighbors = self.algo.get_neighbors(movie_inner_id, k=num)
        movie_neighbors = [
            self.algo.trainset.to_raw_iid(inner_id)
            for inner_id in movie_neighbors
        ]
        print(movie_neighbors)
        return movie_neighbors

    def debug(self):
        similar_users = self.get_similar_movies(1, 1)
        print(self.ratings[self.ratings.userId == 1].head())
        for i in similar_users:
            print(list(self.ratings[self.ratings.userId == i]['movieId']))

    def recommend(self, movieID, num=10):
        movie_similar = self.get_similar_movies(movieID, num)
        recommending = []
        for i in movie_similar:
            recommending.append(self.index[self.index.movieId == i]['title'])
        return recommending
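
A minimal usage sketch, assuming the two CSVs exist at the hard-coded relative paths and that movieId 1 appears in ratings.csv (both are assumptions for illustration):

# Hypothetical usage of the recommender defined above.
rec = Movie_KNN_recommender(mode=2)   # mode 2 selects plain KNNBasic
rec.get_similar_movies(1, num=5)      # prints and returns the 5 movieIds most similar to movieId 1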
Example #8
def FriendRecommender(user):
    df = pd.DataFrame(rating_dic)
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(df[['user', 'game', 'favorite']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}

    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)

    uid = trainset.to_inner_uid(user)
    pred = algo.get_neighbors(uid, 3)

    list_result = []

    for i in pred:
        #print(trainset.to_raw_uid(i))
        list_result.append(trainset.to_raw_uid(i))

    return list_result
Example #9
class Personal_KNN_recommender:
    def __init__(self, mode=0):
        self.index = pd.read_csv('../data/personal/movies.csv')
        self.reader = Reader()
        self.ratings = pd.read_csv('../data/personal/train.csv')
        self.testings = pd.read_csv('../data/personal/test.csv')
        data = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']], self.reader)
        trainset = data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': True}
        if mode == 0:
            self.algo = KNNBaseline(sim_options=sim_options)
        elif mode == 1:
            self.algo = KNNWithMeans(sim_options=sim_options)
        elif mode == 2:
            self.algo = KNNBasic(sim_options=sim_options)
        else:
            exit(0)
        self.userid = []
        for i in range(len(self.testings['userId'])):
            if not self.testings['userId'][i] in self.userid:
                self.userid.append(self.testings['userId'][i])
        self.algo.fit(trainset)

    def get_similar_users(self, usrID, num=10):
        user_inner_id = self.algo.trainset.to_inner_uid(usrID)
        user_neighbors = self.algo.get_neighbors(user_inner_id, k=num)
        user_neighbors = [
            self.algo.trainset.to_raw_uid(inner_id)
            for inner_id in user_neighbors
        ]
        # print(user_neighbors)
        return user_neighbors

    def debug(self):
        similar_users = self.get_similar_users(1, 1)
        print(self.ratings[self.ratings.userId == 1].head())
        for i in similar_users:
            print(list(self.ratings[self.ratings.userId == i]['movieId']))

    def recommend(self, usrID, num=5):
        existed_movie = list(
            self.ratings[self.ratings.userId == usrID]['movieId'])
        similar_users = self.get_similar_users(usrID, num)
        movies_dict = {}
        for i in similar_users:
            movie = list(self.ratings[self.ratings.userId == i]['movieId'])
            vote = list(self.ratings[self.ratings.userId == i]['rating'])
            for j in range(len(vote)):
                if not (movie[j] in existed_movie):
                    if movie[j] in movies_dict.keys():
                        movies_dict[movie[j]] += vote[j]
                    else:
                        # from the most similar users, pick movies the target
                        # user hasn't seen and accumulate their ratings
                        movies_dict[movie[j]] = vote[j]
        result = sorted(movies_dict.items(), key=lambda x: x[1],
                        reverse=True)  # sort by accumulated rating
        result = result[:num]  # keep the num highest-rated movies
        # print(result)
        recommending = []
        recommending_id = []
        for i in result:
            recommending.append(
                self.index[self.index.movieId == i[0]]['title'])
            recommending_id.append(i[0])
        return recommending, recommending_id  # return recommended movie titles and ids

    def test(self, num=10):
        result = []
        for user in self.userid:
            _, ids = self.recommend(user, num)
            # print(ids)
            result.append(ids)

        with open("./result.csv", "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['userId', 'result'])
            for i, row in enumerate(result):
                writer.writerow([self.userid[i], row])
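
A minimal usage sketch, assuming train.csv and test.csv exist at the hard-coded paths and that userId 1 appears in the training data (illustrative assumptions):

# Hypothetical usage of the recommender defined above.
rec = Personal_KNN_recommender(mode=0)             # mode 0 selects KNNBaseline
titles, movie_ids = rec.recommend(usrID=1, num=5)  # aggregate ratings from 5 similar users
print(movie_ids)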
Example #10
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid


rid_to_name, name_to_rid = read_item_names()

# Item-based collaborative filtering
trainset = data.build_full_trainset()
algo = KNNBasic(sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

# Find the 10 movies most similar to Now and Then (1995)
toy_story_raw_id = name_to_rid['Now and Then (1995)']
toy_story_raw_id
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors

# Convert the 10 neighbor movies to their titles
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)
for movie in toy_story_neighbors:
    print(movie)
Example #11
data = Dataset.load_from_df(final[['user_pseudo_id', 'interest', 'rating']],
                            reader)

print('Using KNNBasic')
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
#algo = BaselineOnly(bsl_options=bsl_options)
#cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

trainset, testset = train_test_split(data, test_size=0.25)
algo = KNNBasic(bsl_options=bsl_options)  # note: bsl_options only take effect with a 'pearson_baseline' similarity
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

result = pd.DataFrame(predictions,
                      columns=[
                          'visitor_id', 'item_id', 'base_event',
                          'predict_event', 'details'
                      ])
result.drop(columns={'details'}, inplace=True)
result['erro'] = abs(result['base_event'] - result['predict_event'])
print(result.head())

tuzlaId = algo.trainset.to_inner_iid('Satılik_İstanbul_Tuzla_İçmeler')
print("Satılik_İstanbul_Tuzla_İçmeler : " + tuzlaId)
tuzla_neighbors = algo.get_neighbors(tuzlaId, k=5)
tuzla_neighbors = (algo.trainset.to_raw_iid(tuzlaId)
                   for tuzlaId in tuzla_neighbors)

for n in tuzla_neighbors:
    print(n)
Example #12
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['Name', 'Title', 'Value']], reader)
train_set = data.build_full_trainset()
algo = KNNBasic(k=k,
                verbose=True,
                sim_options={
                    'name': 'pearson_baseline',
                    'shrinkage': shrinkage_parameter
                })
algo.fit(train_set)

names = df.Name.sort_values().unique()

for name in names:
    user_inner_id = train_set.to_inner_uid(name)
    neighbors = algo.get_neighbors(user_inner_id, k)

    print("\n{}:".format(name))
    print("   {:<12}Similarity (%)\tBooks in Common".format("Name"))
    for i in range(len(neighbors)):
        neighbor_name = train_set.to_raw_uid(neighbors[i])
        similarity = algo.sim[user_inner_id, neighbors[i]]
        if similarity < 0.0001:
            # stop reporting neighbors with negligible similarity
            break

        user_books = set(df[df.Name == name].Title)
        neighbor_books = set(df[df.Name == neighbor_name].Title)
        books_in_common = len(user_books.intersection(neighbor_books))
        print("{}: {:<9}\t{:0.2f}\t\t\t{}".format(i + 1, neighbor_name,
                                                  similarity * 100,
Example #13
trainset.n_users
trainset.n_items

# Train the collaborative filtering model; user-based CF is used here
algo = KNNBasic()
algo.fit(trainset)

# Find the top-10 nearest-neighbor playlists for playlist number 39
current_playlist_name = list(name_id_dic.keys())[39]
print(current_playlist_name)
current_playlist_id = name_id_dic[current_playlist_name]
print(current_playlist_id)
# Map the playlist id to the internal user id
playlist_inner_id = algo.trainset.to_inner_uid(current_playlist_id)
# Get the 10 nearest neighbors
playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)
# Map the neighbors back to their original ids
playlist_neighbors = [algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors]
# Map the playlist ids back to playlist names
playlist_neighbors = [id_name_dic[id] for id in playlist_neighbors]
playlist_neighbors


# Load the song id => song name mapping file
song_id_name_dic=pickle.load(open("C:\\Users\\T\\Desktop\\python视频\\song.pkl",'rb'))
# Build the song name => song id mapping
song_name_id_dic={}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]]=song_id

# Recommend for a user; user 4 is chosen here
Example #14
    for playlist_name in playlist_neighbors_name:
        print(playlist_name, name_id_dic[playlist_name])


playlist_recommend_main()

file_path = os.path.expanduser('neteasy_playlist_recommend_data.csv')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Load the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Previously split into 5 folds here; now handled with cv=5 in cross_validate below
# music_data.split(n_folds=5)

# userCF - default
# itemCF - KNNBasic(sim_options={"user_based": False})
algo = KNNBasic(sim_options={"user_based": False})
perf = cross_validate(algo,
                      music_data,
                      measures=['RMSE', 'MAE'],
                      cv=5,
                      verbose=True)
print(perf)
# Evaluation metrics for regression: MSE (mean squared error), RMSE (root mean squared error), MAE (mean absolute error)

# get_neighbors() needs a fitted model, so fit on the full trainset first
algo.fit(music_data.build_full_trainset())

# print(algo.get_neighbors(algo.trainset.to_inner_uid('2150055953'), 10))

print(algo.get_neighbors(algo.trainset.to_inner_iid("424262401"), 3))
Example #15
def run_rec(dataset, num_rec=20):
    r_cols = ['user_id', 'item_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(
        'ml-100k/sample.txt',
        sep=' ',
        names=['user_id', 'item_id', 'rating', 'unix_timestamp'])
    # ratings = pd.read_csv('ml-100k/ua.base.txt', sep='\t', names=r_cols)
    # ratings = pd.DataFrame(dataset, columns=['user_id', 'item_id', 'rating'])

    train_data = ratings.to_numpy()
    n_rows, n_cols = train_data.shape

    # normalized_data = train_data.copy()
    normalized_data = np.ndarray((n_rows, n_cols), dtype=object)
    for r in range(n_rows):
        normalized_data[r, 0] = train_data[r, 0]
        normalized_data[r, 1] = train_data[r, 1]
        normalized_data[r, 2] = float(train_data[r, 2])
        normalized_data[r, 3] = train_data[r, 3]

    # User mean
    # users = train_data[:, 0]
    # n_users = int(np.max(train_data[:, 0]))
    # mean_rating_matrix = np.zeros((n_users + 1,))
    # for u in range(1, n_users + 1):
    #     indices = np.where(users == u)[0].astype(np.int32)
    #     temp_ratings = train_data[indices, 2]
    #     # temp_ratings = [float(temp) for temp in train_data[indices, 2]]
    #     mean_rating_matrix[u] = np.mean(temp_ratings) if indices.size > 0 else 0
    #     normalized_data[indices, 2] = temp_ratings - mean_rating_matrix[u]

    # Item mean
    items = train_data[:, 1]
    n_items = int(np.max(train_data[:, 1]))
    mean_rating_matrix = np.zeros((n_items + 1, ))
    for i in range(1, n_items + 1):
        indices = np.where(items == i)[0].astype(np.int32)
        temp_ratings = train_data[indices, 2]
        # temp_ratings = [float(temp) for temp in train_data[indices, 2]]
        mean_rating_matrix[i] = np.mean(
            temp_ratings) if indices.size > 0 else 0
        normalized_data[indices, 2] = temp_ratings - mean_rating_matrix[i]

    new_ratings = pd.DataFrame(
        normalized_data,
        columns=['user_id', 'item_id', 'rating', 'unix_timestamp'])

    reader = Reader()

    data = Dataset.load_from_df(new_ratings[['user_id', 'item_id', 'rating']],
                                reader)
    trainset = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': False}
    algo = KNNBasic(sim_options=sim_options)

    algo.fit(trainset)
    print(algo.sim)

    item_raw_id = 1
    item_inner_id = algo.trainset.to_inner_iid(item_raw_id)
    # print(item_inner_id)

    item_neighbors_inner_ids = algo.get_neighbors(item_inner_id, k=num_rec)
    # for inner_id in item_neighbors_inner_ids:
    #     print(inner_id)

    item_neighbors_raw_ids = (algo.trainset.to_raw_iid(inner_id)
                              for inner_id in item_neighbors_inner_ids)
    print('Start')
    for raw_id in item_neighbors_raw_ids:
        print(raw_id)

    print('Done')