예제 #1
0
def fillMatrix(F_list, step_list, LearnRating_list, penalty_list, N=30):
    """Append sampled predicted ratings to a file for each parameter combination.

    For every (F, step, LearnRating, penalty) combination the stored model
    parameters are loaded through ``ReadParameter``. Then, for every user, a
    rating is predicted for each item the user has not rated in the merged
    train/test data, and at most ``N`` randomly chosen predictions are
    appended to
    ``SVD++/Matrix/<F>-<step>-<LearnRating>-<penalty>/new_ratings.txt`` as
    ``user::item::rating`` lines.

    :param F_list: values of F to iterate over (forwarded to ReadParameter)
    :param step_list: values of step to iterate over
    :param LearnRating_list: values of LearnRating to iterate over
    :param penalty_list: values of penalty to iterate over
    :param N: maximum number of predicted ratings written per user
    :return: None
    """
    trainDataSet = FO.readDataSet('trainDataSet.txt')
    testDataSet = FO.readDataSet('testDataSet.txt')
    userList = FO.readUserList('data/users.txt')
    itemList = FO.readItemList('data/movies.txt')  # full item list
    dataSet = {**trainDataSet, **testDataSet}
    mu = calMu(dataSet)

    # Group the already-rated items by user once. The result does not depend
    # on the parameter combination, so it must not be recomputed by scanning
    # the whole data set for every user inside the nested loops.
    ratedByUser = dict()
    for user_item in dataSet:
        ratedByUser.setdefault(user_item[0], set()).add(user_item[1])

    for F in F_list:
        for step in step_list:
            for LearnRating in LearnRating_list:
                for penalty in penalty_list:
                    print(F, step, LearnRating, penalty)
                    path = 'SVD++/Matrix/' + str(F) + '-' + str(step) + '-' + str(LearnRating) + '-' + str(penalty)
                    os.makedirs(path, exist_ok=True)
                    bu, bi, p, q = ReadParameter(F, step, LearnRating, penalty)
                    for user in userList:
                        print(user)
                        UnRatedList = itemList - ratedByUser.get(user, set())
                        rank = dict()
                        for item in UnRatedList:
                            rating = Predict(user, item, p, q, bu, bi, mu)
                            if math.isnan(float(rating)):
                                continue  # no usable prediction for this item
                            rank[item] = round(rating)
                        if not rank:
                            continue
                        # Sample only from items that actually have a
                        # prediction (NaN ones are absent from ``rank``), and
                        # from a list: random.sample no longer accepts sets.
                        candidates = list(rank)
                        # Use a per-user size so a user with few candidates
                        # does not shrink N for every following user.
                        sampleSize = min(N, len(candidates))
                        chooseList = random.sample(candidates, sampleSize)
                        with open(path + '/new_ratings.txt', 'a') as fileObject:
                            for choose in chooseList:
                                fileObject.write(str(user) + '::' + str(choose) + '::' + str(rank[choose]) + '\n')
예제 #2
0
def calItemSimilarity(trainDataSet):
    '''
    Compute the similarity of every item and write one result file per item.

    :param trainDataSet: training data set; iterating it yields (user, item) pairs
    :return: None
    The similarities of each item are written to
    'ItemCF/Similarity/<item>.txt'.
    '''

    # Build the user -> items table from the (user, item) pairs.
    user_items = dict()
    for uid, rated in trainDataSet:
        user_items.setdefault(uid, set()).add(rated)

    # Load the item list and the per-user rating means.
    itemList = FO.readItemList('data/movies.txt')
    UserRatingMean = FO.ReadRatingMean('ItemCF/UserMean/trainDataSet_mean.txt')

    # Compute and store the similarities one item at a time.
    for current in itemList:
        similarities = ItemSimilarity(trainDataSet, user_items, itemList,
                                      UserRatingMean, current)
        print(current)
        FO.WirteSimilarty('ItemCF/Similarity/' + str(current) + '.txt',
                          similarities)
예제 #3
0
def testSystem(trainDataSet, testDataSet, K=20, N=30, type='implicit'):
    """Evaluate the recommender on the users that appear in the test set.

    For each test user a ranking is obtained from ``Recommend``, the ``N``
    highest-scored items are kept, and recall/precision are computed against
    the items that user has in ``testDataSet``. Per-user values are averaged
    over all test users.

    :param trainDataSet: dict keyed by (user, item) pairs, training data
    :param testDataSet: dict keyed by (user, item) pairs, test data
    :param K: forwarded to ``Recommend``
    :param N: number of top-ranked items kept per user
    :param type: forwarded to ``Recommend`` (name kept for compatibility even
        though it shadows the builtin)
    :return: tuple ``(recall, precision, coverage, popularity)``
    """
    # Group items by user in a single pass over each data set instead of
    # rescanning both dicts for every test user.
    trainItemsByUser = dict()
    for user_item in trainDataSet:
        trainItemsByUser.setdefault(user_item[0], set()).add(user_item[1])

    testUserList = set()  # users present in the test data
    testItemsByUser = dict()
    for user_item in testDataSet:
        testUserList.add(user_item[0])
        testItemsByUser.setdefault(user_item[0], []).append(user_item[1])

    itemList = FO.readItemList('data/movies.txt')  # full item list

    recall = 0  # running sum of per-user recall
    precision = 0  # running sum of per-user precision
    coverage_item = set()  # every item recommended to at least one user

    for user in testUserList:
        # Items the user has in the training data (empty if the user is unseen).
        userLove = trainItemsByUser.get(user, set())
        rank = Recommend(trainDataSet, itemList, userLove, user, K, type)
        print(rank)
        # Slicing already copes with rankings shorter than N.
        topRank = sorted(rank.items(),
                         key=operator.itemgetter(1),
                         reverse=True)[:N]
        topItemRank = [item_rating[0] for item_rating in topRank]
        # Items this user has in the test data.
        User_item_InTest = testItemsByUser[user]

        oneRecall = Recall(User_item_InTest, topItemRank)
        onePrecision = Precision(User_item_InTest, topItemRank)
        recall += oneRecall
        precision += onePrecision
        print('recall=%.4f,precision=%.4f' % (oneRecall, onePrecision))
        # Record the recommended items for the coverage figure.
        Coverage(coverage_item, topItemRank)

    # Guard the ratios so an empty item list or empty test set yields 0.0
    # instead of a ZeroDivisionError.
    itemCount = len(itemList)
    userCount = len(testUserList)
    coverage = float(len(coverage_item)) / itemCount if itemCount else 0.0
    popularity = Popularity(trainDataSet, coverage_item)
    recall = recall / float(userCount) if userCount else 0.0
    precision = precision / float(userCount) if userCount else 0.0
    print('total recall=%.4f,precision=%.4f,coverage=%.4f,popularity=%.4f'%\
          (recall,precision,coverage,popularity))

    return recall, precision, coverage, popularity
예제 #4
0
def calItemRatingMean(dataSet):
    """Return the mean rating of every item in the item list.

    :param dataSet: dict mapping (user, item) pairs to a numeric rating
    :return: dict mapping each item read from 'data/movies.txt' to the mean
        of its ratings in ``dataSet``; items without any rating map to 0
    """
    itemList = FO.readItemList('data/movies.txt')  # full item list

    # Accumulate sum and count per item in one pass over the ratings rather
    # than rescanning the whole data set once per item.
    ratingSum = dict()
    ratingCount = dict()
    for user_item, value in dataSet.items():
        ratedItem = user_item[1]
        ratingSum[ratedItem] = ratingSum.get(ratedItem, 0) + value
        ratingCount[ratedItem] = ratingCount.get(ratedItem, 0) + 1

    itemMeanDict = dict()
    for item in itemList:
        print(item)
        if item in ratingCount:
            itemMeanDict[item] = ratingSum[item] / ratingCount[item]
        else:
            itemMeanDict[item] = 0
    return itemMeanDict
예제 #5
0
def calItemSimilarity(trainDataSet, type='implicit', simMeas=cosSim):
    '''
    Compute the similarity of every item and write one result file per item.

    :param trainDataSet: training data set; iterating it yields (user, item) pairs
    :param type: how similarity is computed, 'implicit' or 'explicit'
    :param simMeas: similarity function used in 'explicit' mode; its
        __name__ is also used as the output sub-directory
    :return: None
    :raises ValueError: if type is neither 'implicit' nor 'explicit'
    Results are written to 'ItemCF/ItemSimilarity/implicit/<item>.txt' or
    'ItemCF/ItemSimilarity/explicit/<simMeas name>/<item>.txt'.
    '''
    # Reject unknown modes up front; otherwise W and filename would be
    # unbound inside the loop and fail with an obscure UnboundLocalError.
    if type not in ('implicit', 'explicit'):
        raise ValueError(
            "type must be 'implicit' or 'explicit', got %r" % (type,))

    # Build the user -> items table from the (user, item) pairs.
    users_item = dict()
    for user, item in trainDataSet:
        users_item.setdefault(user, set()).add(item)
    for user, items in users_item.items():
        print(user, ':', items)

    # Load the item list.
    itemList = FO.readItemList('data/movies.txt')

    # Compute and store the similarities one item at a time.
    for item in itemList:
        print(item)
        if type == 'implicit':
            W = ItemSimilarity_implicit(users_item, item)
            filename = 'ItemCF/ItemSimilarity/implicit/' + str(item) + '.txt'
        else:  # 'explicit'
            W = ItemSimilarity_explicit(trainDataSet, users_item, itemList,
                                        item, simMeas)
            filename = 'ItemCF/ItemSimilarity/explicit/' + str(
                simMeas.__name__) + '/' + str(item) + '.txt'

        # Each line holds the elements of the key, every one followed by
        # '::', and then the similarity value.
        with open(filename, 'w') as fileObject:
            for users, values in W.items():
                prefix = ''.join(str(u) + '::' for u in users)
                fileObject.write(prefix + str(values) + '\n')
예제 #6
0
            W = FO.readItemSimilarity(item, type)  # 获取当前用户与其他用户的相似度
            topItem = dict(
                sorted(W.items(), key=operator.itemgetter(1),
                       reverse=True)[:K])
            topSimItem = set([items[1] for items, value in topItem.items()])
            topLoveItem = userLove & topSimItem
            if len(topLoveItem) == 0:
                continue
            # print(topLoveItem)
            for LoveItem in topLoveItem:
                wji = W[item, LoveItem]
                rui = dataSet[user, LoveItem]
                rank[item] = rank.get(item, 0) + wji * rui
    return dict(
        sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])


if __name__ == '__main__':
    # Script entry point: load the ItemCF train/test splits and the item
    # list, then run predictSystem with its third argument set to 1..10.
    trainDataSet = FO.readDataSet('ItemCF/trainDataSet.txt')
    testDataSet = FO.readDataSet('ItemCF/testDataSet.txt')

    # Disabled similarity pre-computation, kept for reference:
    # calItemSimilarity(trainDataSet, 'implicit')  # implicit similarity
    # calItemSimilarity(trainDataSet, 'explicit', cosSim)  # explicit, cosine similarity
    # calItemSimilarity(trainDataSet, 'explicit', ecludSim)  # explicit, Euclidean distance

    itemList = FO.readItemList('data/movies.txt')
    for i in range(1, 11):
        rank = predictSystem(trainDataSet, testDataSet, i, itemList)
        print(rank)