def KNNPred(data):  # KNN Means algorithm
    """Train two KNNWithMeans models and predict ratings for the global test set.

    One model uses the tuned sim_options from model_params[0]; the second
    uses library defaults. Cold-start items (items in cold_itm) are scored
    with the user's average rating instead of a model estimate.

    Args:
        data: a surprise Dataset to build the full trainset from.

    Returns:
        [y_pred_w_m, y_pred_wo_m, knnModel, knnModel_1]: per-test-point
        rating estimates (with / without custom sim_options) and the two
        fitted models.
    """
    print("\nTraining KNN Means model..\n")
    global x_test, y_test, testlen, trainlen, y_train, model_params, X, Y, avg_rat, cold_itm
    options = model_params[0]
    knnModel = KNNWithMeans(sim_options=options)
    knnModel_1 = KNNWithMeans()
    train = data.build_full_trainset()
    knnModel.fit(train)
    print("\nTraining done..\nPrediction started..")
    knnModel_1.fit(train)
    y_pred_w_m = [0] * testlen
    y_pred_wo_m = [0] * testlen
    # enumerate replaces the manual `kk` counter of the original version
    for kk, pair in enumerate(x_test):
        uid, iid = pair[0], pair[1]
        if iid - 1 in cold_itm:
            # cold item: fall back to this user's average rating
            # (assumes avg_rat is indexed by user id - 1 -- TODO confirm)
            y_pred_w_m[kk] = avg_rat[uid - 1]
            y_pred_wo_m[kk] = avg_rat[uid - 1]
        else:
            y_pred_w_m[kk] = knnModel.predict(uid, iid).est
            y_pred_wo_m[kk] = knnModel_1.predict(uid, iid).est
    print("\nPrediction done..\n")
    return [y_pred_w_m, y_pred_wo_m, knnModel, knnModel_1]
Пример #2
0
 def KNN_train(self,
               k=20,
               options=None):
     """Train KNNWithMeans models for the overall rating and each criterion.

     Fits one model on the 'total' rating column and one per criterion
     column (c1..c{no_of_criteria}); all fitted models are appended to
     self.algos in that order.

     Args:
         k: maximum number of neighbours (default 20).
         options: surprise sim_options dict; defaults to Pearson similarity,
             item-based ({'name': 'pearson', 'user_based': False}).
     """
     # None default avoids the shared mutable-default-argument pitfall
     if options is None:
         options = {'name': 'pearson', 'user_based': False}
     self.algos = []
     df = self.trainDatas
     r = Reader(rating_scale=(1, 5))
     # overall ('total') rating model
     total = Dataset.load_from_df(df[['uid', 'iid', 'total']], reader=r)
     total_train = total.build_full_trainset()
     total_algo = KNNWithMeans(k, sim_options=options)
     total_algo.fit(total_train)
     self.algos.append(total_algo)
     # one model per criterion column; a plain dict replaces the original
     # locals() indirection, which obscured the data flow
     for i in range(1, self.no_of_criteria + 1):
         col = 'c' + str(i)
         criterion_data = Dataset.load_from_df(
             df[['uid', 'iid', col]], reader=r)
         criterion_train = criterion_data.build_full_trainset()
         criterion_algo = KNNWithMeans(k, sim_options=options)
         criterion_algo.fit(criterion_train)
         self.algos.append(criterion_algo)
Пример #3
0
def select_model(loaded_data, model_selection='user_user'):
    """Return a recommender algorithm for the requested strategy.

    'user_user' (default) and 'item_item' yield pearson-baseline
    KNNWithMeans collaborative filtering; any other value falls back to a
    tuned matrix-factorization model.
    """
    if model_selection in ('user_user', 'item_item'):
        return KNNWithMeans(k=50,
                            sim_options={
                                'name': 'pearson_baseline',
                                'user_based': model_selection == 'user_user'
                            })
    algo = mf.matrix_factorization_param(loaded_data)
    print(algo)
    return algo
Пример #4
0
def randomize():
    """Return one (name, algorithm) pair chosen uniformly at random from a
    fixed catalogue of surprise algorithms (KNN variants with several
    similarity measures, plus SVD/SVDpp/BaselineOnly/CoClustering/
    SlopeOne/NMF).
    """
    sim_options_cosine = {'name': 'cosine', 'user_based': False}
    sim_options_msd = {'name': 'msd', 'user_based': False}
    sim_options_pearson = {'name': 'pearson', 'user_based': False}
    sim_options_baseline = {
        'name': 'pearson_baseline',
        'user_based': False,
        'shrinkage': 0
    }

    algorithms = [
        ('kNN Basic - Cosine',
         KNNBasic(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Basic - MSD', KNNBasic(sim_options=sim_options_msd,
                                     verbose=False)),
        ('kNN Basic - Pearson',
         KNNBasic(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Basic - Pearson B',
         KNNBasic(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Means - Cosine',
         KNNWithMeans(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Means - MSD',
         KNNWithMeans(sim_options=sim_options_msd, verbose=False)),
        ('kNN Means - Pearson',
         KNNWithMeans(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Means - Pearson B',
         KNNWithMeans(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Z - Cosine',
         KNNWithZScore(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Z - MSD',
         KNNWithZScore(sim_options=sim_options_msd, verbose=False)),
        ('kNN Z - Pearson',
         KNNWithZScore(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Z - Pearson B',
         KNNWithZScore(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Baseline - Cosine',
         KNNBaseline(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Baseline - MSD',
         KNNBaseline(sim_options=sim_options_msd, verbose=False)),
        ('kNN Baseline - Pearson',
         KNNBaseline(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Baseline - Pearson B',
         KNNBaseline(sim_options=sim_options_baseline, verbose=False)),
        ('SVD', SVD(verbose=False)), ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()), ('NMF', NMF(verbose=False))
    ]

    # BUG FIX: random.randint is inclusive at both ends, so the original
    # randint(0, len(algorithms)) could produce an index one past the end
    # and raise IndexError; random.choice cannot go out of range.
    return random.choice(algorithms)
Пример #5
0
    def generate_knn(self, rating_data):
        """Build a dictionary of user-based cosine KNN variants plus a tuned
        KNNBaseline.

        Tuning can take a long time; remove the tuning section for a quick
        run if needed.

        Args:
            rating_data: the main rating data set.

        Returns:
            dict mapping algorithm name -> surprise algorithm object.
        """
        algo = {}
        algo['bcKNN'] = KNNBasic(sim_options={'name': 'cosine',
                                              'user_based': True})
        algo['wmKNN'] = KNNWithMeans(sim_options={'name': 'cosine',
                                                  'user_based': True})
        algo['wzKNN'] = KNNWithZScore(sim_options={'name': 'cosine',
                                                   'user_based': True})
        algo['blKNN'] = KNNBaseline(sim_options={'name': 'cosine',
                                                 'user_based': True})

        # KNNBaseline has the best accuracy, so tune its neighbourhood size
        param_grid_bl = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
        best_params_bl = self.tune_and_find_parameter(
            'blKNN', KNNBaseline, rating_data, param_grid_bl)
        algo['blKNN_tuned'] = KNNBaseline(k=best_params_bl['k'])

        return algo
def check_for_args():
    """Append one algorithm instance to the global alg_list for every
    recognized algorithm name found in sys.argv, then return alg_list.

    Unrecognized arguments (including the script name itself) are ignored.
    """
    # dispatch table replaces the original long if/elif chain
    factories = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for arg in sys.argv:
        factory = factories.get(arg)
        if factory is not None:
            alg_list.append(factory())

    return alg_list
def EvaluateDifferentAlgorithms():
    """Cross-validate a range of surprise algorithms on the global
    data_6months set (RMSE, 3 folds) and print the ranked benchmark table
    after each algorithm finishes.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name.
        # BUG FIX: Series.append was removed in pandas 2.0; pd.concat is
        # the supported replacement.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        algo_name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = pd.concat([tmp, pd.Series([algo_name], index=['Algorithm'])])
        benchmark.append(tmp)

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
Пример #8
0
def plot_ROC(qNum, k, thresh=(2.5, 3, 3.5, 4)):
    """Plot one ROC curve per rating threshold for a KNNWithMeans model
    trained on a 90/10 split of the global `data` set, printing each AUC.

    Args:
        qNum: question number selecting the model; only 15 is supported.
        k: neighbourhood size for KNNWithMeans.
        thresh: iterable of rating cutoffs; true ratings >= cutoff count as
            positives. (Tuple default avoids a mutable default argument.)

    Raises:
        ValueError: if qNum is not 15. The original code left `model`
            unbound in that case and crashed with UnboundLocalError.
    """
    rating_range = 5.0  # max rating; renamed so the builtin `range` is not shadowed
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
    else:
        raise ValueError('unsupported qNum: {}'.format(qNum))
    model.fit(trainset)
    predictions = model.test(testset)

    for thrs in thresh:
        y = np.array([])
        scores = np.array([])
        for u, i, t, est, d in predictions:
            # binarize the true rating against the current threshold
            y = np.append(y, 1 if t >= thrs else 0)
            scores = np.append(scores, est / rating_range)

        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Threshold = '+str(thrs))
        plt.show()
        print("auc = "+str(roc_auc))
Пример #9
0
    def generate_knn(self, rating_data):
        """Return a dict of user-based cosine KNN variants plus a tuned
        KNNBaseline stored under key 'blKNN_tuned'."""
        algo = {}
        for key, cls in (('bcKNN', KNNBasic),
                         ('wmKNN', KNNWithMeans),
                         ('wzKNN', KNNWithZScore),
                         ('blKNN', KNNBaseline)):
            algo[key] = cls(sim_options={'name': 'cosine',
                                         'user_based': True})

        # KNNBaseline is the most accurate variant, so tune its k
        param_grid_bl = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
        best_params_bl = self.tune_and_find_parameter('blKNN', KNNBaseline,
                                                      rating_data,
                                                      param_grid_bl)
        algo['blKNN_tuned'] = KNNBaseline(k=best_params_bl['k'])

        return algo
Пример #10
0
def knnBasico(df, testSize, vecinos, pr, bool):
    """Fit a cosine KNNWithMeans recommender on df and return its
    (precision, recall) at rating threshold 4, each rounded to 3 decimals.

    Args:
        df: ratings dataframe with user_id, item_id and rating columns.
        testSize: fraction of the data held out for testing.
        vecinos: neighbourhood size k.
        pr: top-N cutoff passed to precision_recall_at_k.
        bool: user_based flag (NOTE: shadows the builtin name; kept
            unchanged so existing keyword callers still work).
    """
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)
    trainset, testset = train_test_split(data,
                                         test_size=testSize,
                                         shuffle=False)

    model = KNNWithMeans(k=vecinos,
                         sim_options={
                             'name': 'cosine',
                             'user_based': bool  # item vs. user similarities
                         })
    model.fit(trainset)
    predictions = model.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, pr, 4)

    # average precision and recall over all users
    precision = round(sum(precisions.values()) / len(precisions), 3)
    recall = round(sum(recalls.values()) / len(recalls), 3)

    return precision, recall
def select_model(user_review):
    """Cross-validate several surprise algorithms (RMSE/MAE, 3 folds) on the
    prepared review data, printing the accumulated benchmark after each one.

    NOTE(review): the `user_review` argument is immediately overwritten by
    data_prep(), so the parameter is effectively ignored -- confirm intent
    with the callers before changing the signature.
    """
    user_review = data_prep()
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_review[['user_id', 'business_id', 'stars']], reader)
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            KNNBasic(),
            KNNBaseline(),
            KNNWithMeans(),
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF()
    ]:
        # Perform cross validation
        print(algorithm)
        print('start ......')
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name.
        # BUG FIX: Series.append was removed in pandas 2.0; pd.concat is
        # the supported replacement.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        algo_name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = pd.concat([tmp, pd.Series([algo_name], index=['Algorithm'])])
        benchmark.append(tmp)
        print(benchmark)
Пример #12
0
def test_knn_based(data):
    """Return the test-set MSE of a user-based pearson-baseline
    KNNWithMeans (k=5) on a fixed 90/10 split.

    Parameters
    ----------
    data : dataframe
        Dataframe with columns userId, movieId, and rating in that order.

    Returns
    -------
    test_mse : float
        The mean squared error for the knn based algorithm.
    """
    knn_data = Dataset.load_from_df(data, Reader(rating_scale=(1, 5)))
    trainset, testset = train_test_split(knn_data,
                                         test_size=.10,
                                         random_state=24)
    model = KNNWithMeans(k=5,
                         sim_options={'name': 'pearson_baseline',
                                      'user_based': True})
    model.fit(trainset)
    test_mse = accuracy.mse(model.test(testset), verbose=False)
    return test_mse
Пример #13
0
def work(data, k):
    """Run 5-fold training for a user-based cosine KNNWithMeans with the
    given neighbourhood size and return its mean metrics.

    Returns:
        {str(k): {metric name -> [mean value]}} with precision, recall,
        f1 and ndcg (each wrapped in a single-element list).
    """
    algo = KNNWithMeans(k=k, min_k=1,
                        sim_options={'name': 'cosine', 'user_based': True},
                        verbose=False)
    fold_results = train_with_Kfold(algo, data, 5, False)
    means = fold_results.mean()  # hoisted: original recomputed per metric

    return {
        str(k): {
            "precision": [means[0]],
            "recall": [means[1]],
            "f1": [means[2]],
            "ndcg": [means[3]],
        }
    }
Пример #14
0
def knn_m(data, training, testing):
    '''
        Tune KNN with Means parameters then calculates RMSE, coverage and running time of KNN with Means

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of KNN with Means with optimized parameters
            top_n: number of unique predictions for top n items
    '''

    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20], 'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                                        'min_support': [1, 5], 'user_based': [False]}}

    # optimize parameters
    knnm_grid_search = GridSearch(KNNWithMeans, knn_param_grid, measures=['RMSE'], verbose=False)
    knnm_grid_search.evaluate(data)
    param = knnm_grid_search.best_params['RMSE']
    print('KNNWithMeans:', param)

    # fit model using the optimized parameters.
    # BUG FIX: name/min_support/user_based must be passed inside the
    # sim_options dict -- as bare keyword arguments they were silently
    # ignored and the similarity defaults were used instead.
    knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
    knnm.train(training)

    # evaluate the model using test data
    predictions = knnm.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
Пример #15
0
    def __build_model(self):
        """Load a cached KNN recommender from disk, or train, persist and
        validate a new item-based pearson-baseline KNNWithMeans (k=50)."""
        model_path = '{}{}'.format(self.file_prefix, self.model_path)
        try:
            cached = joblib.load(model_path)
        except Exception:
            print('recommender does not exist, build new recommender')
        else:
            print('recommender exists, load it')
            return cached

        # initialize KNN recommender
        algo = KNNWithMeans(k=50,
                            sim_options={
                                'name': 'pearson_baseline',
                                'user_based': False
                            })
        # train model
        algo.fit(self.trainset)
        # save model for the next call
        joblib.dump(algo, model_path)
        # validation on the held-out test set
        test_pred = algo.test(self.testset)
        accuracy.rmse(test_pred)

        return algo
Пример #16
0
    def recommender_knn_baseline(self, train_file, test_file, output):
        """Train a default KNNWithMeans on the prepared train set, print its
        RMSE/MAE on the test set, and return the fitted model.

        NOTE(review): despite the method name and the 'KNN_BASELINE' label
        printed below, the algorithm used is KNNWithMeans, not KNNBaseline
        -- confirm which was intended before relying on the label.

        Args:
            train_file: path to the training ratings file.
            test_file: path to the test ratings file.
            output: unused here; kept for interface compatibility.
        """
        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # user_based True/False in sim_options would switch between
        # user-based and item-based CF; library defaults are used here
        algo_knn_means = KNNWithMeans(verbose=False)

        algo_knn_means.fit(train)

        predictions_knn_means = algo_knn_means.test(test, verbose=False)

        print('KNN_BASELINE: ' + ' RMSE ' +
              str(rmse(predictions_knn_means, verbose=False)) + ' MAE ' +
              str(mae(predictions_knn_means, verbose=False)))

        return algo_knn_means
Пример #17
0
def main():
    """Train a default KNNWithMeans on an 85/15 split of MovieLens-100k,
    print the RMSE and plot a histogram of the (actual - predicted) deltas."""
    # Load the movielens-100k dataset
    data = Dataset.load_builtin('ml-100k')

    # Hold out 15% of the ratings for testing
    trainset, testset = train_test_split(data, test_size=.15)

    algo = KNNWithMeans()
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Report the RMSE on the held-out ratings
    accuracy.rmse(predictions)

    # Delta between the true rating and the estimate, per prediction
    deltas = [p.r_ui - p.est for p in predictions]

    # Histogram of the prediction deltas
    plt.hist(deltas, 100)

    plt.show()
Пример #18
0
def get_model(model_name):
    """Decode an underscore-separated model spec into a surprise algorithm.

    KNN specs: '<KNNClass>[_<U|I>[_<sim>[_<k>]]]' with defaults user-based,
    'msd' similarity and k=20. Factor specs: '<SVD|SVDpp|NMF>[_<n_factors>]'
    with default n_factors=25. Returns None for anything unrecognized.
    """
    algo = None
    if 'KNN' in model_name:
        parts = model_name.split('_')
        base = parts[0]
        # a second token of 'I' selects item-based similarity
        user_based = not (len(parts) > 1 and parts[1] == 'I')
        dis_method = parts[2] if len(parts) >= 3 else 'msd'
        k = int(parts[3]) if len(parts) >= 4 else 20
        sim_options = {'user_based': user_based, 'name': dis_method}
        knn_classes = {
            'KNNBasic': KNNBasic,
            'KNNWithMeans': KNNWithMeans,
            'KNNWithZScore': KNNWithZScore,
        }
        if base in knn_classes:
            algo = knn_classes[base](sim_options=sim_options, k=k)
    elif 'SVDpp' in model_name or 'SVD' in model_name or 'NMF' in model_name:
        parts = model_name.split('_')
        n_factors = int(parts[1]) if len(parts) > 1 else 25
        factor_classes = {'SVDpp': SVDpp, 'SVD': SVD, 'NMF': NMF}
        if parts[0] in factor_classes:
            algo = factor_classes[parts[0]](n_factors=n_factors)
    return algo
        def cal_KNNWithMeans(trainset, df):
            """Fit a user-based cosine KNNWithMeans (k=40) on trainset, score
            every (user, store) row of df, and write the estimates plus the
            absolute errors to save_file2.
            """
            # BUG FIX: the key must be 'user_based' -- the original
            # 'user-based' key was silently ignored by surprise
            sim_options = {'name': 'cosine', 'user_based': True}
            algo_knnm = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
            algo_knnm.fit(trainset)
            users = []
            items = []
            real = []
            estimate = []
            for i in range(len(df)):
                uid = df[i:i + 1].user.values[0]
                users.append(uid)
                iid = df[i:i + 1].store.values[0]
                items.append(iid)
                r_ui = df[i:i + 1].stars.values[0]
                real.append(r_ui)
                # BUG FIX: predict with the freshly-trained algo_knnm; the
                # original used the unrelated outer-scope `algo`
                pred = algo_knnm.predict(uid, iid, r_ui, verbose=True)
                estimate.append(pred)
            print("end")
            # assemble per-row results for KNNWithMeans
            df4 = pd.DataFrame(columns=['user', 'item', 'r_ui', 'est'])
            df4['user'] = users
            df4['item'] = items
            df4['r_ui'] = real
            df4['est'] = estimate
            # a surprise Prediction is a namedtuple; index -2 is the estimate
            df4['est'] = df4['est'].apply(lambda x: x[-2])
            df4['err'] = abs(df4.est - df4.r_ui)
            df4.to_csv(save_file2)
Пример #20
0
def main():
    """Evaluate a default KNNWithMeans on MovieLens-100k (85/15 split):
    print its RMSE and show a histogram of the prediction errors."""

    # Load the movielens-100k dataset
    movielens_ds = Dataset.load_builtin('ml-100k')

    # Hold out 15% for testing
    trainset, testset = train_test_split(movielens_ds, test_size=.15)

    model = KNNWithMeans()

    model.fit(trainset)
    preds = model.test(testset)

    # Report RMSE on the held-out ratings
    accuracy.rmse(preds)

    errors = []
    for p in preds:
        # actual minus predicted rating
        errors.append(p.r_ui - p.est)

    # Histogram of the errors
    plt.hist(errors, 100)

    plt.show()
Пример #21
0
 def __init__(self,
              train_data,
              model_to_use=("baselineonly", "svd", "coClustering", "knn")):
     """Initialize the ensemble with the full dataset and base models.

     Args:
         train_data: surprise Dataset; its full trainset is built and
             stored on self.trainset.
         model_to_use: iterable of keys into available_models (default:
             all four). A tuple default replaces the original mutable
             list default argument.
     """
     AlgoBase.__init__(self)
     self.available_models = {
         "baselineonly":
         BaselineOnly(
             bsl_options={
                 "method": "sgd",
                 "n_epochs": 30,
                 "reg": 0.1,
                 "learning_rate": 0.005
             }),
         "svd":
         SVD(lr_all=0.005, n_factors=50, reg_all=0.1),
         "coClustering":
         CoClustering(n_epochs=3, n_cltr_u=3, n_cltr_i=3),
         "knn":
         KNNWithMeans(k=40,
                      sim_options={
                          "name": "cosine",
                          "user_based": False
                      }),
     }
     # [name, algo] pairs for the requested subset, in request order
     self.model_selection = [[model, self.available_models[model]]
                             for model in model_to_use]
     self.model_rmse = {}
     self.model_mae = {}
     self.model_list = {}
     self.trainset = train_data.build_full_trainset()
Пример #22
0
def benchmark(data):
    """Cross-validate a suite of surprise algorithms (RMSE/MAE/FCP, 3 folds)
    and store the results, sorted by test RMSE, in Algorithm_Benchmark.csv.
    """
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        # BUG FIX: Series.append was removed in pandas 2.0; pd.concat is
        # the supported replacement for attaching the algorithm name.
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        algo_name = str(algorithm).split(' ')[0].split('.')[-1]
        output = pd.concat([output,
                            pd.Series([algo_name], index=['Algorithm'])])
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
def compAlgos(data):  # Compare MAE, RMSE values for different algorithms
    """Cross-validate three KNN variants and two SVD variants (5 folds,
    5 parallel jobs) on `data` and print their mean RMSE and MAE."""
    print("\nLet us compare performance of KNN and SVD algorithms\n")

    def run_cv(algo):
        # shared cross-validation settings for every algorithm
        return cross_validate(algo, data, cv=5, n_jobs=5, verbose=False)

    # KNN family
    knn_Basic = run_cv(KNNBasic())
    knn_means = run_cv(KNNWithMeans())
    knn_z = run_cv(KNNWithZScore())

    # SVD family
    svd = run_cv(SVD())
    svdpp = run_cv(SVDpp())

    for label, res in (('KNN Basic', knn_Basic),
                       ('KNN Means', knn_means),
                       ('KNN Z Score', knn_z),
                       ('SVD', svd),
                       ('SVD ++', svdpp)):
        print('\n{}: RMSE: {}, MAE: {}'.format(label,
                                               res['test_rmse'].mean(),
                                               res['test_mae'].mean()))

    print('\nBoth SVDs perform better on the dataset\n')
    print(
        '\nWe will test with KNN means from KNN family and SVDPP from svd family\n'
    )
    def evaluate_on_test(self, train_set, test_set):
        """
        Fit an item-based MSD KNNWithMeans on train_set and return its RMSE
        on test_set. Does nothing (returns None) when either set is missing.
        :param train_set: surprise trainset to fit on
        :param test_set: surprise testset to score
        :return: RMSE value on test set, or None if a set is missing
        """
        # guard clause replaces the original wrapping conditional
        if train_set is None or test_set is None:
            return None

        print("Evaluate RMSE on test data")
        self.LOG_HANDLE.info("Evaluate RMSE on test data")

        # item-based MSD similarity KNN
        algo = KNNWithMeans(sim_options={'name': 'msd', 'user_based': False})

        # Train on the trainset and predict ratings for the testset
        algo.fit(train_set)
        predictions = algo.test(test_set)

        # Then compute RMSE
        return accuracy.rmse(predictions)
Пример #25
0
def DisplayGraphDelta(data):
    """
        Display the delta between predictions and reality.

        Trains a default KNNWithMeans on a 75/25 split of `data`, prints
        the RMSE and each prediction, then plots a histogram of the deltas.
    """
    # hold out 25% of the ratings for testing
    trainset, testset = train_test_split(data, test_size=.25)

    knn = KNNWithMeans()

    knn.fit(trainset)
    predictions = knn.test(testset)

    # report RMSE on the held-out ratings
    accuracy.rmse(predictions)

    deltas = []
    for pred in predictions:
        print(pred)
        # delta between the prediction and the true rating
        deltas.append(pred.r_ui - pred.est)

    # histogram of the deltas
    print(len(deltas))
    plt.hist(deltas, 100)
    plt.show()
Пример #26
0
    def CFM(self):
        # Collaborative filtering with means: fit a user-based cosine
        # KNNWithMeans on self.trainset, re-predict each known rating of
        # every user in self.list, collect the results in self.df_est and
        # compute the NDCG of the resulting ranking (stored in CFWM_ndcg_).
        u_id = []              # user ids of each prediction
        I_id = []              # item (location) ids of each prediction
        r_ui_ = np.array([])   # true ratings
        _est = np.array([])    # estimated ratings

        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
        algo.fit(self.trainset)

        for uid in (self.list):
            lids = self.data[self.data.uid == uid]
            a = self.data[self.data.uid == uid]

            # NOTE(review): range(1, len(a)) visits rows 0..len(a)-2, so the
            # user's last rating is never predicted -- confirm whether this
            # off-by-one is intentional before changing it
            for i in range(1, len(a)):
                lid = lids[i - 1:i].lid.values[0]
                r_ui = lids[i - 1:i].rate.values[0]
                pred = algo.predict(uid, lid, r_ui, verbose=True)
                u_id.append(int(pred.uid))
                I_id.append(int(pred.iid))
                r_ui_ = np.append(r_ui_, pred.r_ui)
                _est = np.append(_est, pred.est)

        self.df_est = pd.DataFrame({
            'uid': u_id,
            'Iid': I_id,
            'r_ui': r_ui_,
            'est': _est
        })
        # unique user ids that actually received predictions
        self.arr = self.df_est['uid'].unique()

        self.CFWM_ndcg_ = self.Calculate_NDCG()
Пример #27
0
def train():
    """Fit an item-based cosine KNNWithMeans on a small hard-coded 0/1
    rating table and return the fitted model.

    TODO: replace the placeholder ratings with real collected data.
    """
    ratings_dict = {
        "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
        "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
        "rating": [1, 0, 0, 0, 1, 0, 1, 1, 1],
    }

    # Load the ratings table through a Pandas dataframe
    frame = pd.DataFrame(ratings_dict)
    data = Dataset.load_from_df(frame[["user", "item", "rating"]],
                                Reader(rating_scale=(0, 1)))

    full_trainset = data.build_full_trainset()

    # item-based cosine similarity
    model = KNNWithMeans(sim_options={
        "name": "cosine",
        "user_based": False,  # Compute  similarities between items
    })

    model.fit(full_trainset)

    return model
Пример #28
0
def to_test(k, option, model):
  """Train the requested KNN variant on training_set.dat.

  Args:
      k: neighbourhood size.
      option: similarity name for sim_options ('cosine', 'msd', ...).
      model: one of 'Basic', 'WithMeans', 'WithZScore', 'Baseline'; any
          other value is a no-op, matching the original if/elif chain.
  """
  df = pd.read_csv('training_set.dat')
  test_df = pd.read_csv('test_set.dat')
  reader = Reader(rating_scale=(1, 5))
  trainingSet = Dataset.load_from_df(df, reader).build_full_trainset()
  testSet = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

  opt = {'name': option, 'user_based': False}

  # dispatch table removes the duplicated constructor/fit branches
  knn_classes = {
      'Basic': KNNBasic,
      'WithMeans': KNNWithMeans,
      'WithZScore': KNNWithZScore,
      'Baseline': KNNBaseline,
  }
  cls = knn_classes.get(model)
  if cls is not None:
    algo = cls(k=k, sim_options=opt)
    algo.fit(trainingSet)
Пример #29
0
def load_data():
    """Train an item-based MSD KNNWithMeans on the full MovieLens-100k set,
    then grid-search SVD hyper-parameters (RMSE, 3 folds) and print the
    best RMSE found."""
    data = Dataset.load_builtin('ml-100k')

    # KNN part: fit on the whole dataset with MSD item similarity
    knn = KNNWithMeans(sim_options={"name": "msd", "user_based": False})
    knn.fit(data.build_full_trainset())

    # GRID SEARCH, MATRIX FACTORIZATION
    print("Divide matrix in grids")
    param_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }
    gs = GridSearchCV(SVD, param_grid=param_grid, measures=["rmse"], cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
Пример #30
0
def crossvalidate(data):
    """5-fold cross-validate a suite of surprise algorithms on `data` and
    return a DataFrame of mean results indexed by algorithm name, sorted
    by test RMSE."""
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering()
    ]:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        # BUG FIX: Series.append was removed in pandas 2.0; pd.concat is
        # the supported replacement for attaching the algorithm name.
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        algo_name = str(algorithm).split(' ')[0].split(".")[-1]
        temp = pd.concat([temp, pd.Series([algo_name], index=['Algorithm'])])
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values