Example #1
def knn_cosine(dataset_path, target, value):
    # Read the data file
    file_path = dataset_path
    reader = Reader(line_format='user item rating', rating_scale=(1, 7), sep='\t')
    data = Dataset.load_from_file(file_path, reader)

    # Construct a training set using the entire dataset (without splitting the dataset into folds)
    # variable trainset is an object of Trainset
    trainset = data.build_full_trainset()
    
    # Dictionaries mapping raw user ids <-> raw item ids
    user_to_item = {}
    item_to_user = {}
        
    # Train the algorithm to compute the similarities between users
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    
    # Read the user <-> item mappings from the raw data file
    with open(file_path, "r") as file:
        for line in file:
            line = line.split('\t')
            user_to_item[line[0]] = line[1]
            item_to_user[line[1]] = line[0]
    
    # Retrieve the target user's k nearest neighbors and convert their inner ids back to raw ids
    target_neighbors = algo.get_neighbors(target, k=value)
    target_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in target_neighbors)
    target_neighbors = (item_to_user[rid] for rid in target_neighbors)
                       
    return target_neighbors
Example #2
def knn_item(trainset, testset, predset):
    
    modelname = 'knnitem'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    bsl_options = { 'method': 'als',
                    'reg_i': 1.e-5,
                    'reg_u': 14.6,
                    'n_epochs': 10
                   }
    sim_options = {
                    'name': 'pearson_baseline',
                    'shrinkage': 100,
                    'user_based': False
                    }
    algo = KNNBaseline(k=60, sim_options=sim_options, bsl_options=bsl_options)
    print('KNN item based Model')
    algo.fit(trainset)
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
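is_already_predicted and save_predictions are project-specific helpers that are not shown in this snippet. A minimal sketch of what they might look like, assuming predictions are cached as .npy files in a predictions/ directory (the directory, file naming, and signatures are assumptions):

import os
import numpy as np

PRED_DIR = 'predictions'  # assumed cache location

def is_already_predicted(modelname):
    # Treat a model as already predicted once its final prediction file exists.
    return os.path.exists(os.path.join(PRED_DIR, modelname + '.npy'))

def save_predictions(modelname, rmse, preds, suffix=''):
    # Persist the predicted ratings; the RMSE could be logged alongside them.
    os.makedirs(PRED_DIR, exist_ok=True)
    name = modelname + ('_' + suffix if suffix else '')
    np.save(os.path.join(PRED_DIR, name + '.npy'), preds)
    print('   Saved %s (RMSE %.4f)' % (name, rmse))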
Example #3
def getSimModle():
    # Load the built-in MovieLens dataset by default
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    # Compute similarities with pearson_baseline; user_based=False means the
    # similarity is item-based, i.e. between movies in this example
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': False
    }
    # Use the KNNBaseline algorithm
    algo = KNNBaseline(sim_options=sim_options)
    # Train the model
    algo.fit(trainset)
    return algo
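A hedged usage sketch for the model returned above: raw MovieLens id '1' is Toy Story (1995) in ml-100k, and k=10 is an arbitrary choice; the inner/raw id round-trip mirrors the later examples.

algo = getSimModle()

# Map the raw id to Surprise's inner id, fetch the neighbors, and map back.
toy_story_inner_id = algo.trainset.to_inner_iid('1')
neighbor_inner_ids = algo.get_neighbors(toy_story_inner_id, k=10)
print([algo.trainset.to_raw_iid(inner_id) for inner_id in neighbor_inner_ids])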
Example #4
def getSimModle():
    # Load the dataset from a DataFrame
    reader = Reader(rating_scale=(1, 5))
    new_data = Dataset.load_from_df(data[['user', 'product', 'rating']], reader)
    trainset = new_data.build_full_trainset()
    # Compute similarities with pearson_baseline, item-based (per product)
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Use the KNNBaseline algorithm
    algo = KNNBaseline(sim_options=sim_options)
    # Train the model
    algo.fit(trainset)
    return algo
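Dataset.load_from_df expects a pandas DataFrame whose columns are ordered as user, item, rating; the `data` variable is defined outside this snippet, so here is an illustrative stand-in (column names taken from the call above, values invented):

import pandas as pd

# Illustrative stand-in for the `data` DataFrame used by getSimModle above.
data = pd.DataFrame({
    'user':    ['u1', 'u1', 'u2', 'u3'],
    'product': ['p1', 'p2', 'p1', 'p3'],
    'rating':  [5, 3, 4, 1],
})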
Example #5
def get_model(re_train=False):
    algopath = os.path.join("Practice 3", "Movie Len Code", "kNNBaseline.algo")
    if not re_train and os.path.exists(algopath):
        logging.debug("Retrieving existing model")
        algo = dump.load(algopath)[1]
        return algo

    filepath = os.path.join("Practice 3", "ml-latest-small", "ratings.csv")
    reader = Reader(line_format="user item rating timestamp",
                    sep=",",
                    skip_lines=1)
    data = Dataset.load_from_file(filepath, reader=reader)
    trainset = data.build_full_trainset()
    # print("train")
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    dump.dump(algopath, algo=algo, verbose=1)
    return algo
Example #6
def start(goods='951'):

    file_path = os.path.expanduser('SampleData')
    # Specify the file format
    reader = Reader(line_format='user item rating timestamp', sep=',')
    # Load the data from the file
    data = Dataset.load_from_file(file_path, reader=reader)

    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Use the KNNBaseline algorithm
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    iid_innerid = algo.trainset.to_inner_iid(goods)

    iid_neighbors = algo.get_neighbors(iid_innerid, k=10)

    iid_neighbors = (algo.trainset.to_raw_iid(inner_id)
                     for inner_id in iid_neighbors)

    print('The 10 nearest neighbors of %s:' % goods)
    for iid in iid_neighbors:
        print(iid)
Example #7
from surprise import (KNNBaseline, Reader, Dataset, dump)

# First, train the algorithm to compute the similarities between items
data = Dataset.load_from_file('ratings.csv',
                              reader=Reader(sep=',', rating_scale=(1, 10)))
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)
'''
>>> algo.predict('425', '0338564')
Prediction(uid='425', iid='0338564', r_ui=None, est=8.8268148604314725, details={u'actual_k': 40, u'was_impossible': False})
>>> algo.predict('732', '1219827')
Prediction(uid='732', iid='1219827', r_ui=None, est=1.3944813261280586, details={u'actual_k': 8, u'was_impossible': False})
'''

dump.dump('knn.algo', algo=algo)
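The dumped model can be restored later with dump.load, which returns a (predictions, algo) tuple; a short sketch reusing the ids from the session above:

# Reload the trained model from disk; predictions is None here because only
# the algorithm was dumped above.
predictions, algo = dump.load('knn.algo')
print(algo.predict('425', '0338564'))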
Example #8
class MovieRecommender:
    def __init__(self):
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None

        self.initialized = False

    def initialize(self, data_filepath):
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()

        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()

        start_new_thread(self._train)

    def get_similar_movies(self, movie_id, k=10):
        if not self.initialized:
            return []

        model = self._knn

        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)

        to_raw_iid = model.trainset.to_raw_iid
        similar_movie_ids = (to_raw_iid(inner_id)
                             for inner_id in similar_movie_inner_ids)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        if not self.initialized:
            return []

        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction[0] == user_id
        ]

        sorted_predictions = sorted(user_predictions,
                                    key=lambda x: x.est,
                                    reverse=True)
        top_n_predictions = sorted_predictions[:num_movies]

        similar_movie_ids = (prediction.iid
                             for prediction in top_n_predictions)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        if not self.initialized:
            return

        rating = float(rating)

        user_id = str(user_id)
        movie_id = str(movie_id)

        # knows_user() and the ur dict are keyed by inner ids, so convert the
        # raw ids first; unknown ids simply mean there is no previous rating.
        has_previous_rating = False
        try:
            inner_uid = self._trainset.to_inner_uid(user_id)
            inner_iid = self._trainset.to_inner_iid(movie_id)
            has_previous_rating = inner_iid in dict(self._trainset.ur[inner_uid])
        except ValueError:
            pass
        new_rating = (user_id, movie_id, rating, time())
        if has_previous_rating:
            for i, existing in enumerate(self._data.raw_ratings):
                if existing[0] == user_id and existing[1] == movie_id:
                    self._data.raw_ratings[i] = new_rating
                    break
        else:
            self._data.raw_ratings.append(new_rating)

        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        self._nmf.fit(self._trainset)
        self._knn.fit(self._trainset)

        self._predictions = self._nmf.test(self._trainset.build_anti_testset())

        self.initialized = True
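A hypothetical usage sketch for the class above; the data file path, user id, and movie ids are placeholders, and the lookups only return results once the background training has finished and initialized is True.

# Hypothetical usage; path and ids are placeholders.
recommender = MovieRecommender()
recommender.initialize('ml-100k/u.data')

# Once the background training thread has set initialized to True:
print(recommender.get_similar_movies('1', k=5))
print(recommender.get_similar_movies_for_user(196, num_movies=5))
recommender.update_user_ratings(196, '242', 4.0)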
Example #9
def getModel(train_data):

    sim_option = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_option)
    algo.fit(train_data)
    return algo
Example #10
def read_iter_names():
    # Build the two-way mapping between raw movie ids and movie names from u.item
    item_file = file_path + '/u.item'  # the standard ml-100k item file
    rid_2_name = {}
    name_2_rid = {}
    with io.open(item_file, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_2_name[line[0]] = line[1]
            name_2_rid[line[1]] = line[0]
    return rid_2_name, name_2_rid


# The u.data format is: user item rating timestamp
reader = Reader(line_format='user item rating timestamp', sep='\t')
file_path = 'your path + /ml-100k'
data = Dataset.load_from_file(file_path=file_path + '/u.data', reader=reader)
train_set = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(train_set)

# Get the id <-> movie-name mappings; since an id conversion happens along the way, both directions are needed
rid_2_name, name_2_rid = read_iter_names()
# print(rid_2_name['1'])
# print(name_2_rid['Toy Story (1995)'])

# Map the raw id to the inner id
toy_story_raw_id = name_2_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Use Toy Story's inner id to get its k nearest neighbors (the neighbors are also inner ids)
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert the neighbors' inner ids to the corresponding movie names
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)