Example #1
def knnm_running_time(data):
    '''
        Calculates the running times for training and predictions for KNN with Means

        Args:
            data(list of Dataset): datasets with different numbers of users

        Returns:
            elapsedtime_KnnMeanstrain: running time for training
            elapsedtime_KnnMeanstest: running time for predictions on testset
    '''
    elapsedtime_KnnMeanstrain = []
    elapsedtime_KnnMeanstest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNWithMeans,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim = param['sim_options']['name']
    min_support = param['sim_options']['min_support']
    user_based = param['sim_options']['user_based']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        # similarity settings go in sim_options, not as top-level keyword arguments
        knnm = KNNWithMeans(k=k,
                            sim_options={'name': sim,
                                         'min_support': min_support,
                                         'user_based': user_based})
        knnm.fit(training)
        elapsedtime_KnnMeanstrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knnm.test(testing)
        elapsedtime_KnnMeanstest.append(time.time() - test_start)
    return elapsedtime_KnnMeanstrain, elapsedtime_KnnMeanstest
Example #2
def main():
    # Load the movielens-100k dataset
    data = Dataset.load_builtin('ml-100k')

    # Create test and train sets (15%, 85%)
    trainset, testset = train_test_split(data, test_size=.15)

    # Choose the algorithm
    algo = KNNWithMeans()

    # Train on the trainset
    algo.fit(trainset)
    # Predict on the testset
    predictions = algo.test(testset)

    # Print the RMSE
    accuracy.rmse(predictions)

    result = []
    for prediction in predictions:
        # Compute the delta between the prediction and the actual rating
        result.append(prediction.r_ui - prediction.est)

    # Plot a histogram of the deltas between predictions and actual ratings
    plt.hist(result, 100)

    plt.show()
Example #3
def DisplayGraphDelta(data):
    """
        Plot the delta between prediction and actual rating
    """
    # Create test and train sets (25%, 75%)
    trainset, testset = train_test_split(data, test_size=.25)

    algo = KNNWithMeans()

    # Train on the trainset
    algo.fit(trainset)
    # Predict on the testset
    predictions = algo.test(testset)

    # Print the RMSE
    accuracy.rmse(predictions)

    #print(predictions)

    result = []
    for prediction in predictions:
        print(prediction)
        # Compute the delta between the prediction and the actual rating
        result.append(prediction.r_ui - prediction.est)

    # Plot a histogram of the deltas between predictions and actual ratings
    print(len(result))
    plt.hist(result, 100)
    plt.show()
Example #4
def main():

    # Load the movielens-100k dataset
    movielens_ds = Dataset.load_builtin('ml-100k')

    # Create test and train sets (15%, 85%)
    trainset, testset = train_test_split(movielens_ds, test_size=.15)

    algo = KNNWithMeans()

    # Train on the trainset
    algo.fit(trainset)
    # Predict on the testset
    predictions = algo.test(testset)

    # Print the RMSE
    accuracy.rmse(predictions)

    #print(predictions)

    result = []
    for prediction in predictions:
        # Difference between prediction and actual rating
        result.append(prediction.r_ui - prediction.est)

    # Histogram of the result
    plt.hist(result, 100)

    plt.show()
Example #5
    def evaluate_on_test(self, train_set, test_set):
        """
        Evaluate the algorithm on the test set after training it on the train set
        :param train_set: the training set
        :param test_set: the held-out test set
        :return: RMSE value on the test set
        """
        if train_set is not None and test_set is not None:
            print("Evaluate RMSE on test data")
            self.LOG_HANDLE.info("Evaluate RMSE on test data")

            similarity_options = {
                'name': 'msd',
                'user_based': False,
            }

            # Use the KNN algorithm
            algo = KNNWithMeans(sim_options=similarity_options)

            # Train the algorithm on the trainset, and predict ratings for the testset
            algo.fit(train_set)
            predictions = algo.test(test_set)

            # Then compute RMSE
            return accuracy.rmse(predictions)
Example #6
def knnBasico(df, testSize, vecinos, pr, user_based):

    # df = pd.read_csv('../datasets/yelp_beautySpa_aspects.csv', header=0)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)
    trainset, testset = train_test_split(data,
                                         test_size=testSize,
                                         shuffle=False)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # True: user-user similarities; False: item-item similarities
    }
    algo = KNNWithMeans(k=vecinos, sim_options=sim_options)

    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, pr, 4)

    # Precision and recall can then be averaged over all users
    # print(sum(prec for prec in precisions.values()) / len(precisions))
    # print(sum(rec for rec in recalls.values()) / len(recalls))

    precision = round(
        sum(prec for prec in precisions.values()) / len(precisions), 3)
    recall = round(sum(rec for rec in recalls.values()) / len(recalls), 3)

    return precision, recall
Example #7
    def recommender_knn_baseline(self, train_file, test_file, output):

        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # Use user_based true/false to switch between user-based or item-based collaborative filtering
        algo_knn_means = KNNWithMeans(verbose=False)

        algo_knn_means.fit(train)

        #not_seen_elems = self.merge_train_set(train_dataset, test_dataset)

        #predictions_precision_svd = algo_svd.test(not_seen_elems, test, verbose=False, not_seen_flag=True)
        predictions_knn_means = algo_knn_means.test(test, verbose=False)

        #precisions, recalls = self.precision_recall_at_k(predictions_precision_svd, 10, threshold=0.0)
        # Precision and recall can then be averaged over all users
        #precision_avg = sum(prec for prec in precisions.values()) / len(precisions)
        #recall_avg = sum(rec for rec in recalls.values()) / len(recalls)
        #print('Precision: ' + str(precision_avg) + ' Recall: ' + str(recall_avg) + ' RMSE: ' + str(
        #    rmse(predictions_svd, verbose=False)) + ' MAE: ' + str(mae(predictions_svd, verbose=False)))
        print('KNN_WITH_MEANS: RMSE ' +
              str(rmse(predictions_knn_means, verbose=False)) + ' MAE ' +
              str(mae(predictions_knn_means, verbose=False)))

        return algo_knn_means
Example #8
def test_knn_based(data):
    """
    Parameters
    ----------
    data : dataframe
        Dataframe with columns userId, movieId, and rating in that order.

    Returns
    -------
    test_mse : float
        The mean squared error for the knn based algorithm.

    """
    reader = Reader(rating_scale=(1, 5))
    knn_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(knn_data,
                                         test_size=.10,
                                         random_state=24)
    algo = KNNWithMeans(k=5,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })
    algo.fit(trainset)
    predictions = algo.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
Example #9
def knn_m(data, training, testing):
    '''
        Tune KNN with Means parameters then calculates RMSE, coverage and running time of KNN with Means

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of KNN with Means with optimized parameters
            top_n: the top-n recommended items for each user
    '''

    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20], 'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                                        'min_support': [1, 5], 'user_based': [False]}}

    # optimize parameters
    knnm_grid_search = GridSearch(KNNWithMeans, knn_param_grid, measures=['RMSE'], verbose=False)
    knnm_grid_search.evaluate(data)
    param = knnm_grid_search.best_params['RMSE']
    print('KNNWithMeans:', param)

    # fit model using the optimized parameters
    # similarity settings go in sim_options, not as top-level keyword arguments
    knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
    knnm.fit(training)

    # evaluate the model using test data
    predictions = knnm.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
Example #10
    def __build_model(self):
        model_path = '{}{}'.format(self.file_prefix, self.model_path)
        try:
            model = joblib.load(model_path)
            print('recommender exists, load it')
            return model
        except Exception as e:
            print('recommender does not exist, build new recommender')

            # initialize KNN recommender
            algo = KNNWithMeans(k=50,
                                sim_options={
                                    'name': 'pearson_baseline',
                                    'user_based': False
                                })
            # train model
            algo.fit(self.trainset)
            # save model
            joblib.dump(algo, model_path)
            # validation
            test_pred = algo.test(self.testset)
            accuracy.rmse(test_pred)

            return algo
Example #11
def plot_ROC(qNum, k, thresh=(2.5, 3, 3.5, 4)):
    rating_range = 5.0  # upper bound of the rating scale (avoids shadowing built-in range)
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
    model.fit(trainset)
    predictions = model.test(testset)
    
    for thrs in thresh:
        y = np.array([])
        scores = np.array([])
        for u, i, t, est, d in predictions:
            if t >= thrs:
                t = 1
            else:
                t = 0
            y = np.append(y, t)
            scores = np.append(scores, est / rating_range)
        
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Threshold = '+str(thrs))
        plt.show()
        print("auc = "+str(roc_auc))
Example #12
def rank_predictions(model_name):

    k_KNN = 22 
    k_NNMF = 20
    k_MF = 26

    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors=k_NNMF)
    else:
        model = SVD(n_factors=k_MF)

    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(prec for prec in precisions.values()) / len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
            
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(list(range(1, 26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()
    
    # recall vs t
    plt.plot(list(range(1, 26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using " + model_name)
    plt.show()
    
    # precision vs recall 
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()


    return precision_arr, recall_arr 
Example #13
def knn_centered_user(train, test, ids, Xtest, Xids):
    """
    kNN approach taking into account the mean ratings of each user
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Centered kNN User')
    algo = KNNWithMeans(k=200,
                        sim_options={'name': 'pearson_baseline',
                                     'min_support': 5,
                                     'user_based': True,
                                     'shrinkage': 120})

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example #14
def binary_value(data, threshold):
    trainset, testset = train_test_split(data, test_size=.1)

    algo = KNNWithMeans(k=30)
    algo.fit(trainset)
    predictions = algo.test(testset)

    like0 = []  # actual
    like = []   # predicted
    for row in range(len(predictions)):
        # predictions[row][3] -> predicted rating (est)
        # predictions[row][2] -> actual rating (r_ui)
        like.append(1 if predictions[row][3] > threshold else 0)
        like0.append(1 if predictions[row][2] > threshold else 0)
    return like0, like
Example #15
def trim_performance(qNum, maxk=0):
    pop, unpop, highVar = trimMovies()
    
    if maxk == 0:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50

    trim_Model = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
    }
    trimSet, modelName = trim_Model[qNum]
    
    kf = KFold(n_splits=10)
    RMSE = [] 
    for k in range(2, maxk + 1, 2):
        print('-' * 20 + 'k = ' + str(k) + ' ' + '-' * 20)
        
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        elif modelName == 'NMF':
            model = NMF(n_factors=k)

        subRMSE = [] 
        temp = 1
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            testSet = list(filter(lambda x: int(x[1]) in trimSet, testSet))
            print("Split " + str(temp) + ": test set size after trimming: %d", len(testSet))
            temp += 1
            predictions = model.test(testSet)
            subRMSE.append(accuracy.rmse(predictions, verbose=True))
        RMSE.append(np.mean(subRMSE))

    plt.figure()
    plt.plot(list(range(2, maxk+1, 2)), RMSE)
    plt.xlabel("k")
    plt.ylabel("Average RMSE")
    plt.title("Q"+str(qNum)+": Average RMSE Along k")
    plt.show()
    print(min(RMSE))
    return min(RMSE)
Example #16
def chose_yahoo(file_path):
    # mae = []
    # rmse = []
    reader = Reader(line_format='timestamp user item rating', sep='\t')
    # Load the data, including the multi-criteria ratings (story, role, show,
    # image, music) and the overall rating
    story = Dataset.load_from_file(file_path + 'story.txt', reader=reader)
    role = Dataset.load_from_file(file_path + 'role.txt', reader=reader)
    show = Dataset.load_from_file(file_path + 'show.txt', reader=reader)
    image = Dataset.load_from_file(file_path + 'image.txt', reader=reader)
    music = Dataset.load_from_file(file_path + 'music.txt', reader=reader)
    total = Dataset.load_from_file(file_path + 'total.txt', reader=reader)
    # print('Data loaded!\n')
    # Split the data 80/20 (train/test)
    random_states = 180
    story_train, story_test = train_test_split(story, random_state=random_states)
    role_train, role_test = train_test_split(role, random_state=random_states)
    show_train, show_test = train_test_split(show, random_state=random_states)
    image_train, image_test = train_test_split(image, random_state=random_states)
    music_train, music_test = train_test_split(music, random_state=random_states)
    total_train, total_test = train_test_split(total, random_state=random_states)
    # print('Data split!\n')
    # Item-based collaborative filtering; similarities are computed with the
    # Pearson measure (chosen to avoid overfitting)
    sim_options = {'name': 'pearson',
                   'user_based': False}  # item-based collaborative filtering
    algo1 = KNNWithMeans(sim_options=sim_options)
    algo2 = KNNWithMeans(sim_options=sim_options)
    algo3 = KNNWithMeans(sim_options=sim_options)
    algo4 = KNNWithMeans(sim_options=sim_options)
    algo5 = KNNWithMeans(sim_options=sim_options)
    algo6 = KNNWithMeans(sim_options=sim_options)
    algo1.fit(story_train)
    algo2.fit(role_train)
    algo3.fit(show_train)
    algo4.fit(image_train)
    algo5.fit(music_train)
    algo6.fit(total_train)
    story_p = algo1.test(story_test)
    role_p = algo2.test(role_test)
    show_p = algo3.test(show_test)
    image_p =algo4.test(image_test)
    music_p = algo5.test(music_test)
    single_p = algo6.test(total_test)
    # rmse.append(accuracy.rmse(single_p))
    # Averaging approach:
    # multi_p = avg(story_p, role_p, show_p, image_p, music_p, single_p)
    # Global regression
    P = combine(story_p, role_p, show_p, image_p, music_p, single_p)
    df = pd.read_csv(file_path + 'all.txt', sep='\t',
                     names=['id', 'uid', 'mid', 'total', 'story', 'role', 'show', 'image', 'music'])
    k, b = totalRegModel(df)
    multi_p = totalReg(P, k, b, single_p)
    # Per-user regression

    mae = (accuracy.mae(single_p), accuracy.mae(multi_p))
    # rmse.append(accuracy.rmse(multi_p))
    return mae  # , rmse
Example #17
def solve_item_item(pathw):
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file(pathw, reader=reader)
    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)
    # Print the recommended items for each user
    for uid, user_ratings in top_n.items():
        if uid == '615':
            # print(uid, [iid for (iid, _) in user_ratings])
            return [iid for (iid, _) in user_ratings]
Example #18
    def CFM(self):
        kf = KFold(n_splits=5)
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)

        for trainset, testset in kf.split(self.data):
            algo.fit(trainset)
            predictions = algo.test(testset)

            precisions, recalls = self.precision_recall_at_k(predictions)

            P = sum(prec for prec in precisions.values()) / len(precisions)
            R = sum(rec for rec in recalls.values()) / len(recalls)
            F1 = 2 * P * R / (P + R)

            print("Precision : ", P)
            print("Recall    : ", R)
            print("F1        : ", F1)
Example #19
def ComputeCollaborativeFiltering_User_User(recipe_df, train_rating_df, pd, benchmark, knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_User_User ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # compute similarities between users (user-based CF)
    sim_options = {'name': 'cosine', 'user_based': True}

    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainSet)
    predictions = algo.test(testSet)

    Evaluators.RunAllEvals(predictions, benchmark)
Example #20
    def train_best_model_generate_ratings_test(self, ratings_set, test_set):
        """
        Train the best model (with minimum RMSE) on the complete ratings set and then compute the ratings for the test set
        :param ratings_set: The complete ratings data set
        :param test_set: The streams for the users for which ratings are not yet available
        :return: A data frame of the form user, stream, predicted rating
        """
        if ratings_set and test_set:
            print(
                "Training the best model and generating the ratings for the test data set"
            )
            self.LOG_HANDLE.info(
                "Training the best model and generating the ratings for the test data set"
            )

            algo = KNNWithMeans(**model_params.knn_means_best_params)
            algo.fit(ratings_set)

            predictions = algo.test(test_set)
            return predictions
Example #21
def apply_kNN_movie(threshold, similarity_metric, user_based):
    # Read the data into a pandas dataframe; the file lives in the project folder
    df = pd.read_csv("../data/ml-latest-small/ratings.csv",
                     usecols=['userId', 'movieId', 'rating'])
    print(df)
    # Convert the table into a user-item matrix: rows = users, columns = items
    df = df.pivot(index='userId', columns='movieId', values='rating')
    print(df)

    # Drop columns with fewer ratings than the threshold
    df.dropna(thresh=threshold, axis=1, inplace=True)
    print(df)

    # Convert the matrix back into table form: userId, movieId, rating, sorted by userId
    df = df.stack().reset_index().sort_values(by=['userId', 'movieId'], axis=0)
    df.columns = ['userId', 'movieId', 'rating']
    print(df)

    # Convert the pandas dataframe into a Surprise data object, keeping only the relevant columns
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)
    # create train and test sets
    x_train, x_test = train_test_split(data, train_size=0.8, test_size=0.2)

    sim_options = {
        "name": similarity_metric,
        "user_based": user_based,
    }

    # define the algorithm object
    algo1 = KNNWithMeans(k=316, sim_options=sim_options)
    algo1.fit(x_train)
    # test the algorithm
    predictions1 = algo1.test(x_test)

    # for each user, fetch the n items we predict they would rate highly
    top_n = get_top_n(predictions1, n=5)
    # compute evaluation metrics
    error_score = accuracy.mae(predictions1)

    return top_n, error_score
Example #22
def trainFinalModels(ratingsTrainDataset, ratingsTest, bestParamsNMF,
                     bestParamsKNN):
    ratingsTrainTrainset = ratingsTrainDataset.build_full_trainset()

    modelNMF = NMF(**bestParamsNMF)
    modelNMF.fit(ratingsTrainTrainset)
    saveModel(modelNMF, 'NMF')

    predictions = modelNMF.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('NMF', rmseValue, maeValue)

    modelKNN = KNNWithMeans(**bestParamsKNN)
    modelKNN.fit(ratingsTrainTrainset)
    saveModel(modelKNN, 'KNN')

    predictions = modelKNN.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('KNN', rmseValue, maeValue)
Example #23
def train(df_path, limit):
    print('Loading data...')
    df = pd.read_json(df_path)
    new_df = df[df['review_count'] >= limit]
    # Reading the dataset
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(new_df[['reviewerID', 'asin', 'overall']],
                                reader)
    # Splitting the dataset
    train_set, test_set = train_test_split(data,
                                           test_size=0.3,
                                           random_state=101)
    # Use user_based true/false to switch between user-based or item-based collaborative filtering
    print('Parameter Tuning...')
    acc_score = []
    for i in range(1, 10):
        algo = KNNWithMeans(k=i,
                            sim_options={
                                'name': 'pearson_baseline',
                                'user_based': False
                            })
        algo.fit(train_set)
        acc_score.append((i, accuracy.rmse(algo.test(test_set))))
    # pick the k with the lowest test RMSE
    acc_score.sort(key=lambda x: x[1])
    c = acc_score[0][0]
    print(f"Final k = {c}\nTest RMSE : {acc_score[0][1]}")
    final_knn = KNNWithMeans(k=c,
                             sim_options={
                                 'name': 'pearson_baseline',
                                 'user_based': False
                             })
    trainset = data.build_full_trainset()
    final_knn.fit(trainset)

    print("Storing model in pickle file ...")
    final_knn_file = './models/pickle_files/knn/final_knn.pkl'
    pickle.dump(final_knn, open(final_knn_file, 'wb'))
    print('Done.')
    return final_knn
Example #24
def recommendUser(user):
    recommendItemForUser(user, -5)
    r_cols = ['user_id', 'item_id', 'rating']
    ratings = pd.read_csv('user-id-sentiment-category_and_score', names=r_cols)
    reader = Reader(rating_scale=(-1, 1))
    data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']],
                                reader)
    iids = ratings['item_id'].unique()
    iids50 = ratings.loc[ratings['user_id'] == -5, 'item_id']
    final_dataset = np.concatenate((iids, iids50.values))
    final_dataset = final_dataset.astype('str')
    pred = np.unique(final_dataset)
    testset = [[-5, item_id, 1] for item_id in pred]  # same synthetic user id (-5) as above
    algo = KNNWithMeans()
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    predictions = algo.test(testset)
    pred_ratings = np.array([pred.est for pred in predictions])
    i_max = np.argpartition(pred_ratings, -5)[-5:]
    iid = pred[i_max]
    print(iid)
    recItems = getProductsForRecommender(iid)
    return recItems
Example #25
def user_collaborative_filtering(trainset, testset):

    # Use user_based true/false to switch between user-based or item-based collaborative filtering
    algo = KNNWithMeans(k=50,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })
    algo.fit(trainset)

    # we can now query for specific predictions
    uid = str(196)  # raw user id
    iid = str(302)  # raw item id

    # get a prediction for specific users and items.
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)

    # run the trained model against the testset
    test_pred = algo.test(testset)

    # get RMSE
    print("User-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)
Example #26
def trimming_knn_plot(data):

    trainset, testset = train_test_split(data, test_size=.1)

    mae_knn = []
    rmse_knn = []
    # sweep over k
    for kk in np.arange(2, 102, 2):
        algo = KNNWithMeans(k=kk)
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse_knn.append(accuracy.rmse(predictions))
        mae_knn.append(accuracy.mae(predictions))
    
    plt.figure(1)
    plt.plot(np.arange(2, 102, 2), rmse_knn)
    plt.title("Performance Evaluation: Average RMSE over k", fontsize=10)
    plt.ylabel("Average RMSE (testset)")
    plt.xlabel("k")
    plt.figure(2)
    plt.plot(np.arange(2, 102, 2), mae_knn)
    plt.title("Performance Evaluation: Average MAE over k", fontsize=10)
    plt.ylabel("Average MAE (testset)")
    plt.xlabel("k")
Example #27
from collections import defaultdict
import pprint
# Read the data
path = './movielens_sample.txt'
df = pd.read_csv(path, usecols=[0, 1, 2], skiprows=1)
df.columns = ['user', 'item', 'rating']
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(df, reader=reader)
trainset = data.build_full_trainset()

# ItemCF: compute the scores
# Only the k most similar neighbors are used in the computation

kf = KFold(n_splits=5)
# verbose is an algorithm argument, not a similarity option
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(rmse, rmse * rmse)

predictions = []
for row in df.itertuples():
    user, item = getattr(row, 'user'), getattr(row, 'item')
    predictions.append([user, item, algo.predict(user, item).est])

print("*" * 100)
print("user\titem\tpredict\n")
pprint.pprint(predictions)
Example #28
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for user with user_key = 93681
    """
    target_user_key = 93681

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')

    # calculate sparsity
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparsity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split in trainset and testset
    trainset, testset = train_test_split(data, test_size=0.2)

    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When Surprise creates a Trainset or Testset object, it takes the raw ids
    # (the ones used in the imported file) and converts them to so-called inner
    # ids (a series of integers, starting from 0). You might need to trace back
    # to the original names. Taking items as an example (the same approach works
    # for users; just swap iids for uids), the all_items method lists the inner
    # iids, to_inner_iid converts a raw id to an inner id, and to_raw_iid
    # converts back.

    # An example of how to save a list of inner and raw item ids:
    trainset_iids = list(trainset.all_items())
    trainset_raw_iids = [trainset.to_raw_iid(iid) for iid in trainset_iids]

    ## Model parameters: of kNN:
    # Two hyperparameters we can tune:
    # 1. k parameter
    # 2. similarity option
    #   a) user-user vs item-item
    #   b) similarity function (cosine, pearson, msd)

    sim_option = {'name': 'pearson', 'user_based': False}

    # 3 different KNN Models: KNNBasic, KNNWithMeans, KNNWithZScore
    k = 40
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Own similarity matrix:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    algo.sim = sim_matrix_imported

    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Cross validation:
    skip = True
    if not skip:
        results = cross_validate(algo=algo,
                                 data=data,
                                 measures=['RMSE'],
                                 cv=5,
                                 return_train_measures=True)
        results_mean = results['test_rmse'].mean()

    ## Predictions
    # Lets assume we are happy with the method and now want to apply it to the entire data set.

    # Estimate for a specific user a specific item:
    single_item_single_user_prediction = algo.predict(uid=target_user_key,
                                                      iid=100010,
                                                      verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []

    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append(
            (single_prediction.uid, single_prediction.iid,
             single_prediction.est))

    # Then sort the predictions for each user and retrieve the k highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')
Example #29
def make_alg_and_test(trainset, testset):
    """
    This function for: create the algorithm and run the algorithm on test dataset.
    Args: 
        trainset, testset        
    Return:     

    Try other config in sim_options:
        name : contains the similarity metric to use. Options are cosine, msd, pearson, or pearson_baseline. The default is msd.
        user_based : a boolean that tells whether the approach will be user-based or item-based. The default is True, which means the user-based approach will be used.
        min_support: the minimum number of common items needed between users to consider them for similarity. 
                        For the item-based approach, this corresponds to the minimum number of common users for two items.
    """

    cfg = []
    sim_options0 = {'name': 'pearson_baseline', 'user_based': False}
    cfg.append(sim_options0)

    # To use item-based cosine similarity
    sim_options1 = {
        "name": "cosine",
        "user_based": False,  # Compute  similarities between items
        "min_support": 3,
    }
    cfg.append(sim_options1)

    sim_options2 = {
        "name": "msd",
        "user_based": False,
    }
    cfg.append(sim_options2)

    sim_options3 = {
        "name": "cosine",
        "user_based": False,
        "min_support": 4,
    }
    cfg.append(sim_options3)

    sim_options4 = {
        "name": "msd",
        "user_based": False,
        "min_support": 5,
    }
    cfg.append(sim_options4)

    sim_options5 = {
        "name": "cosine",
        "user_based": False,
        "min_support": 5,
    }
    cfg.append(sim_options5)

    for index in range(len(cfg)):
        algo = KNNWithMeans(k=5, sim_options=cfg[index])
        algo.fit(trainset)

        # run the trained model against the testset
        test_pred = algo.test(testset)

        logging.info(test_pred[20])
        # get RMSE
        logging.info(
            f"With sim_options config {index}, RMSE on test set = {accuracy.rmse(test_pred, verbose=False)}"
        )
Example #30
# In[ ]:


trainset, testset = train_test_split(data, test_size=.15)


# In[ ]:


algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)


# In[ ]:


test_pred = algo.test(testset)


# In[ ]:


accuracy.rmse(test_pred, verbose=True)


# In[ ]:


algo.predict(uid=2, iid='Fight Club (1999)').est