def calculate_item_similarity(trainset, use_iuf_similarity=False):
    """
    Calculate the item-item similarity matrix from user co-occurrence.

    Only pairs of movies that appear together in at least one user's
    collection get an entry. The scores users gave are ignored; only
    co-occurrence is counted.

    :param trainset: mapping of user id -> collection of movie ids. Each
                     collection is iterated (twice, nested) for its movie ids.
    :param use_iuf_similarity: if True, use the Item-IUF variant: every
                     co-occurrence contributed by a user adds
                     ``1 / log(1 + len(movies))`` instead of ``1``, so users
                     who have seen many movies contribute less.
    :return: tuple ``(movie_sim_mat, movie_popular, movie_count)``.
             ``movie_sim_mat`` is a dict ``{movie1: {movie2: similarity}}``;
             ``movie_popular`` and ``movie_count`` are passed through
             unchanged from ``calculate_movie_popular(trainset)``.
    """
    movie_popular, movie_count = calculate_movie_popular(trainset)

    # Step 1: accumulate (optionally weighted) co-occurrence counts.
    print('generate items co-rated similarity matrix...')
    # movie_sim_mat is a two-dimensional table stored as nested dicts:
    # the key is movie1's id, the value maps movie2's id to the accumulated
    # co-occurrence weight.
    # TODO: consider a more compact structure than dict-of-dicts; note that a
    #       plain list-based matrix would be very sparse.
    movie_sim_mat = {}
    # record the time the calculation has spent.
    movie2users_time = LogTime(print_step=1000)
    for movies in trainset.values():
        # The contribution of one user is the same for every pair of their
        # movies, so compute it once per user rather than once per pair.
        # The len() > 1 guard keeps the original behaviour for users with no
        # pairs: the weight is never used, and log(1 + 0) == 0 is never
        # divided by.
        weight = 1
        if use_iuf_similarity and len(movies) > 1:
            weight = 1 / math.log(1 + len(movies))
        for movie1 in movies:
            # every movie seen by a user gets a row, even if it ends up empty;
            # missing entries of a row default to zero.
            related = movie_sim_mat.setdefault(movie1, defaultdict(int))
            for movie2 in movies:
                if movie1 != movie2:
                    related[movie2] += weight
        # log steps and times (one step per user).
        movie2users_time.count_time()
    print('generate items co-rated similarity matrix success.')
    movie2users_time.finish()

    # Step 2: normalise the accumulated counts into similarities.
    print('calculate item-item similarity matrix...')
    # record the time the calculation has spent.
    movie_sim_mat_time = LogTime(print_step=1000)
    for movie1, related_items in movie_sim_mat.items():
        len_movie1 = movie_popular[movie1]
        # Only existing keys are overwritten, so the dict does not change
        # size while it is being iterated.
        for movie2, count in related_items.items():
            len_movie2 = movie_popular[movie2]
            # similarity = count / sqrt(popularity(movie1) * popularity(movie2))
            related_items[movie2] = count / math.sqrt(len_movie1 * len_movie2)
        # log steps and times (one step per movie1 row).
        movie_sim_mat_time.count_time()

    print('calculate item-item similarity matrix success.')
    movie_sim_mat_time.finish()
    return movie_sim_mat, movie_popular, movie_count
Пример #2
0
    def test(self, testset):
        """
        Evaluate the recommender against a test dataset.

        Recommendations are generated with ``self.recommend`` for every user
        in ``self.trainset`` and compared with that user's entry in
        ``testset``.

        :param testset: test dataset; looked up with ``.get(user, {})`` per
                        user, and stored on ``self.testset``.
        :return: list ``[precision, recall, coverage, popularity]``.
                 ``recall`` is 0.0 when no evaluated user has any test data.
        :raises ValueError: if ``n_rec_movie``, ``trainset``,
                            ``movie_popular`` or ``movie_count`` is unset
                            or empty.
        """
        if not self.n_rec_movie or not self.trainset or not self.movie_popular or not self.movie_count:
            raise ValueError('UserCF has not init or fit method has not called yet.')
        self.testset = testset
        print('Test recommendation system start...')
        N = self.n_rec_movie
        # variables for precision and recall
        hit = 0
        rec_count = 0
        test_count = 0
        # variable for coverage
        all_rec_movies = set()
        # variable for popularity
        popular_sum = 0

        # record the time the calculation has spent.
        test_time = LogTime(print_step=1000)
        for user in self.trainset:
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)  # type:list
            for movie in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            # N is counted per user regardless of how many movies
            # self.recommend actually returned.
            rec_count += N
            test_count += len(test_movies)
            # log steps and times (one step per user).
            test_time.count_time()

        # rec_count and self.movie_count cannot be zero here: the guard at the
        # top rejects an empty trainset, a zero N and a zero movie_count.
        precision = hit / float(rec_count)
        # test_count is zero when testset has no data for any trainset user;
        # report a recall of 0.0 instead of raising ZeroDivisionError.
        recall = hit / float(test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / float(self.movie_count)
        popularity = popular_sum / float(rec_count)

        print('Test recommendation system success.')
        test_time.finish()

        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
              (precision, recall, coverage, popularity))

        return [precision, recall, coverage, popularity]
Пример #3
0
    def test(self, testset):
        """
        Evaluate the recommender against a test dataset.

        Recommendations are generated with ``self.recommend`` for every user
        in ``self.users_set`` and compared with that user's entry in
        ``testset``.

        :param testset: test dataset; looked up with ``.get(user, {})`` per
                        user, and stored on ``self.testset``.
        :return: list ``[precision, recall, coverage, popularity]``. A metric
                 whose denominator is zero is reported as 0.0.
        """
        self.testset = testset
        print('Test recommendation system start...')
        # variables for precision and recall
        hit = 0
        rec_count = 0
        test_count = 0
        # variable for coverage
        all_rec_movies = set()
        # variable for popularity
        popular_sum = 0

        # record the time the calculation has spent.
        test_time = LogTime(print_step=1000)
        for user in self.users_set:
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)  # type:list
            for movie in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.item_popular[movie])
            # n_rec_movie is counted per user regardless of how many movies
            # self.recommend actually returned.
            rec_count += self.n_rec_movie
            test_count += len(test_movies)
            # log steps and times (one step per user).
            test_time.count_time()

        # Nothing above guarantees non-zero denominators (empty users_set,
        # empty testset, zero items_count), so fall back to 0.0 instead of
        # raising ZeroDivisionError.
        precision = hit / float(rec_count) if rec_count else 0.0
        recall = hit / float(test_count) if test_count else 0.0
        coverage = (len(all_rec_movies) / float(self.items_count)
                    if self.items_count else 0.0)
        popularity = popular_sum / float(rec_count) if rec_count else 0.0
        print('Test recommendation system success.')
        test_time.finish()
        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f\n' %
              (precision, recall, coverage, popularity))

        # The summary used to be built and then discarded; return it so that
        # callers can use the metrics.
        return [precision, recall, coverage, popularity]
Пример #4
0
 def predict(self, testset):
     """
     Produce recommendations for every user found in testset.

     :param testset: test dataset; iterated for its users.
     :return: `dict` : maps each user to a list that holds the result of
              ``self.recommend(user)`` as its single appended element.
     """
     print('Predict scores start...')
     # record the time the calculation has spent.
     timer = LogTime(print_step=500)
     recommendations = defaultdict(list)
     for user in testset:
         recommendations[user].append(self.recommend(user))
         # log steps and times (one step per user).
         timer.count_time()
     print('Predict scores success.')
     timer.finish()
     return recommendations
Пример #5
0
        model = LFM(10, 10, 0.1, 0.01, 10)
    else:
        raise ValueError('No model named ' + model_name)
    model.fit(trainset)
    recommend_test(model, [1, 100, 233, 666, 888])
    model.test(testset)


def recommend_test(model, user_list):
    """
    Print the recommendations the model produces for each given user.

    :param model: object exposing ``recommend(user_id)``.
    :param user_list: iterable of user ids; each id is converted with ``str``
                      before it is passed to ``model.recommend``.
    :return: None
    """
    for user_id in user_list:
        recommendations = model.recommend(str(user_id))
        print("recommend for userid = %s:" % user_id)
        # the trailing blank line separates one user's output from the next.
        print(recommendations, end="\n\n")


if __name__ == '__main__':
    # Script entry point: run one model on one dataset and time the whole run.
    main_time = LogTime(words="Main Function")
    # Dataset to use; uncomment the other line to switch.
    dataset_name = 'ml-100k'
    # dataset_name = 'ml-1m'
    # Model to run; exactly one model_type line should be active.
    # model_type = 'UserCF'
    # model_type = 'UserCF-IIF'
    # model_type = 'ItemCF'
    # model_type = 'Random'
    # model_type = 'MostPopular'
    # model_type = 'ItemCF-IUF'
    model_type = 'LFM'
    # presumably the fraction of the data held out for testing -- confirm
    # against run_model.
    test_size = 0.1
    # NOTE(review): the meaning of the trailing False is defined by run_model,
    # which is not visible here.
    run_model(model_type, dataset_name, test_size, False)
    main_time.finish()
def calculate_user_similarity(trainset, use_iif_similarity=False):
    """
    Calculate the user-user similarity matrix via a movie-users inverse table.

    Only pairs of users that have at least one movie in common get an entry.
    The scores users gave are ignored; only co-occurrence is counted.

    :param trainset: mapping of user id -> collection of movie ids. Each
                     collection is iterated for its movie ids and measured
                     with ``len``.
    :param use_iif_similarity: if True, use the User-IIF variant: every
                     common movie adds ``1 / log(1 + len(users))`` instead of
                     ``1``, where ``users`` is the set of users who have that
                     movie, so very popular movies contribute less.
    :return: tuple ``(usersim_mat, movie_popular, movie_count)``.
             ``usersim_mat`` is a dict ``{user1: {user2: similarity}}``,
             ``movie_popular`` maps each movie id to the number of times it
             occurs in ``trainset``, and ``movie_count`` is the number of
             distinct movies.
    """
    # build inverse table for item-users
    # key=movieID, value=set of userIDs who have seen this movie
    print('building movie-users inverse table...')
    movie2users = collections.defaultdict(set)
    movie_popular = defaultdict(int)

    for user, movies in trainset.items():
        for movie in movies:
            movie2users[movie].add(user)
            movie_popular[movie] += 1
    print('building movie-users inverse table success.')

    # save the total movie number, which will be used in evaluation
    movie_count = len(movie2users)
    print('total movie number = %d' % movie_count)

    # Step 1: accumulate (optionally weighted) co-occurrence counts.
    print('generate user co-rated movies similarity matrix...')
    # usersim_mat is a two-dimensional table stored as nested dicts:
    # the key is user1's id, the value maps user2's id to the accumulated
    # co-occurrence weight.
    # TODO: consider a more compact structure than dict-of-dicts; note that a
    #       plain list-based matrix would be very sparse.
    usersim_mat = {}
    # record the time the calculation has spent.
    movie2users_time = LogTime(print_step=1000)
    for users in movie2users.values():
        # The contribution of one movie is the same for every pair of its
        # users, so compute it once per movie rather than once per pair.
        # Every set in movie2users holds at least one user, so
        # log(1 + len(users)) is never zero.
        weight = 1 / math.log(1 + len(users)) if use_iif_similarity else 1
        for user1 in users:
            # every user gets a row, even if it ends up empty;
            # missing entries of a row default to zero.
            related = usersim_mat.setdefault(user1, defaultdict(int))
            for user2 in users:
                if user1 != user2:
                    related[user2] += weight
        # log steps and times (one step per movie).
        movie2users_time.count_time()
    print('generate user co-rated movies similarity matrix success.')
    movie2users_time.finish()

    # Step 2: normalise the accumulated counts into similarities.
    print('calculate user-user similarity matrix...')
    # record the time the calculation has spent.
    usersim_mat_time = LogTime(print_step=1000)
    for user1, related_users in usersim_mat.items():
        len_user1 = len(trainset[user1])
        # Only existing keys are overwritten, so the dict does not change
        # size while it is being iterated.
        for user2, count in related_users.items():
            len_user2 = len(trainset[user2])
            # similarity = count / sqrt(len(user1 movies) * len(user2 movies))
            related_users[user2] = count / math.sqrt(len_user1 * len_user2)
        # log steps and times (one step per user1 row).
        usersim_mat_time.count_time()

    print('calculate user-user similarity matrix success.')
    usersim_mat_time.finish()
    return usersim_mat, movie_popular, movie_count