Example #1
    def run_with_diff_k(self, algo, args, range_, folds=2, test_filter=None, threshold=2, msg=None, model_name=None):
        # Map each supported model to the hyperparameter being swept
        arg_name = {
            'KNN': 'k',
            'NMF': 'n_factors',
            'SVD': 'n_factors'
        }[model_name]

        rmse_by_k = []
        mae_by_k = []
        k_values = []
        for k in range(*range_):
            k_values.append(k)
            args.update({arg_name: k})
            model = algo(**args)
            kf = KFold(n_splits=folds)
            rmse_by_fold = []
            mae_by_fold = []
            for trainset, testset in kf.split(self.data):
                model.fit(trainset)
                if test_filter:
                    testset = test_filter(testset, threshold)
                predictions = model.test(testset)
                rmse_by_fold.append(accuracy.rmse(predictions, verbose=True))
                mae_by_fold.append(accuracy.mae(predictions, verbose=True))
            rmse_by_k.append(np.mean(rmse_by_fold))
            mae_by_k.append(np.mean(mae_by_fold))
   
        plt.plot(k_values, rmse_by_k)
        plt.plot(k_values, mae_by_k)
        plt.legend(['RMSE', 'MAE'])
        plt.title(msg)
        plt.show()
Example #2
def kfold_crossvalidation(data, model, folds=5, k=5, threshold=4):
    """Preforms K fold crossvalidation on a KNN surprise model and returns average precision and recall.
    
    Arguments:
        data {surprise.dataset.DatasetAutoFolds} -- Surprise Dataset
        model {surprise.prediction_algorithms.knns.KNNBasic} -- Surprise KNNBasic model
        folds {int} -- number of folds in cross validation (default: {5})
        k {int} -- number of metrics -- (default: {10})
        threshold {int} -- ratings threshold -- (default {3})
    
    Returns:
        Tuple consisting of:
        average_precision {float} -- Average precision of the model
        average_recall {float} -- Average recall of the model
    """
    kf = KFold(n_splits=folds)
    preclist = []
    reclist = []
    for trainset, testset in kf.split(data): #cross validation splits
        model.fit(trainset) #fit model on trainset
        predictions = model.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=threshold)
        total_precision = (sum(prec for prec in precisions.values()) / len(precisions))
        total_recall = (sum(rec for rec in recalls.values()) / len(recalls))
        preclist.append(total_precision)
        reclist.append(total_recall)
    average_precision = np.mean(preclist)
    average_recall = np.mean(reclist)
    return average_precision, average_recall
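Note: this example (and several below) calls precision_recall_at_k without defining it. A minimal sketch following the recipe from the Surprise FAQ; the exact helper used by these snippets is not shown, so treat this as an assumption:

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return two dicts mapping each user id to precision@k and recall@k."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = dict(), dict()
    for uid, user_ratings in user_est_true.items():
        # sort this user's ratings by estimated value, highest first
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls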
def surpriseSVD(movieLensDataPath='data_clean.txt'):
    """Basic use of the surprise SVD algorithm.

    Params: movieLensDataPath is the path to the MovieLens data we're looking at.
    Note: replace with cleaned data.
    We want to return U and V where, for a matrix Y of movie ratings, Y is approximately U^T V.
    """

    # Load the data as a pandas data frame, as reading from text didn't quite work at first.
    df = pd.read_csv(movieLensDataPath, sep="\t", header=None)
    df.columns = ["User Id", "Movie Id", "Rating"]

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    # The columns are User Id, Movie Id, and Rating.
    data = Dataset.load_from_df(df[["User Id", "Movie Id", "Rating"]], reader)
    # To fit to the SVD algorithm, we have to convert it to a trainset.
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    # U (user factors) and V (item factors)
    algop = algo.pu
    algoq = algo.qi

    # Simple crossvalidation
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
    # Return U (pu) and V (qi)
    return algop, algoq
def train_trim_knn(data, R):
    kfold = KFold(n_splits=10)
    sim_options = {'name': 'pearson'}
    rmse_list = [[], [], []]
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        knn = KNNWithMeans(k=k, sim_options=sim_options)
        for trainset, testset in kfold.split(data):
            knn.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)

            p_pred = knn.test(p_testset)
            u_pred = knn.test(u_testset)
            hv_pred = knn.test(hv_testset)

            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("KNN with trim is finished!!")
    return rmse_list
def train_trim_nmf(data, R):
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        nmf = NMF(n_factors=k)
        for trainset, testset in kfold.split(data):
            nmf.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)

            p_pred = nmf.test(p_testset)
            u_pred = nmf.test(u_testset)
            hv_pred = nmf.test(hv_testset)

            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("NMF with trim is finished!!")
    return rmse_list
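The trim helper used by train_trim_knn and train_trim_nmf is not included above. A hedged sketch of what it plausibly does, assuming R is the raw user-by-movie ratings matrix with zeros for missing ratings and that the usual popular / unpopular / high-variance splits are intended (the thresholds here are assumptions, not taken from the original project):

import numpy as np

def trim(testset, R):
    """Split a Surprise testset into popular, unpopular and high-variance subsets."""
    num_ratings = np.count_nonzero(R, axis=0)   # ratings received per movie
    variances = np.var(R, axis=0)               # rating variance per movie (assumed definition)
    popular, unpopular, high_var = [], [], []
    for (uid, iid, rating) in testset:
        col = int(iid) - 1                      # assumes movie ids are 1-based column indices
        if num_ratings[col] > 2:
            popular.append((uid, iid, rating))
        else:
            unpopular.append((uid, iid, rating))
        if num_ratings[col] >= 5 and variances[col] >= 2.0:
            high_var.append((uid, iid, rating))
    return popular, unpopular, high_var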
Example #6
def Q30to33(qNum):
    data = load_data()
    data_full = data.build_full_trainset()
    pop, unpop, highVar = classifyMovies()
    kf = KFold(n_splits=10)
    ncf = NaiveCF()
    ncf.fit(data_full)
    subRMSE = np.array([])
    iter = 1
    for trainSet, testSet in kf.split(data):
        if qNum == 31:
            testSet = list(filter(lambda x: x[1] in pop, testSet))
        if qNum == 32:
            testSet = list(filter(lambda x: x[1] in unpop, testSet))
        if qNum == 33:
            testSet = list(filter(lambda x: x[1] in highVar, testSet))
        nTest = len(testSet)
        print("Split " + str(iter) + ": test set size after trimming: %d",
              nTest)
        iter += 1
        uid, iid, tr, est = ncf.test(testSet)
        subsubRMSE = pow(est - tr, 2)
        subsubRMSE = np.sum(subsubRMSE)
        subRMSE = np.append(subRMSE, np.sqrt(subsubRMSE / nTest))
    RMSE = np.mean(subRMSE)
    print("Q" + str(qNum) + " has RMSE " + str(RMSE))
Example #7
def context_RMSE(file_path, context, algo_id, k=10):

    # Define the algorithm
    algo = get_algo(algo_id)

    # Define the file parsing format
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)

    # Create the dataset from the parsed file
    data = Dataset.load_from_file(file_path, reader)

    # define a cross-validation iterator
    kf = KFold(k)

    if not os.path.exists('resultados'):
        os.makedirs('resultados')

    with open("resultados/RMSE_" + context + '_' + str(algo_id) + ".csv",
              "w") as result_file:

        result_file.write('RMSEs:\n')
        # Write the RMSE of each fold to the results file
        for trainset, testset in kf.split(data):

            # train and test the algorithm
            algo.fit(trainset)
            predictions = algo.test(testset)
            result_file.write(str(accuracy.rmse(predictions)) + '\n')
def test() -> List[Dict[str, object]]:
    alg_list = [SVD, KNNBaseline, BaselineOnly, CoClustering, NMF, KNNBasic, KNNWithMeans]

    seed = 0
    random.seed(seed)
    np.random.seed(seed)

    interactions: List[Interaction] = load_sorted_test_interactions()
    parsed_data: ParsedData = Parser.parse(interactions)
    kf = KFold(n_splits=5)
    entries = []

    for trainset, testset in kf.split(parsed_data.whole_data_set):
        for alg_to_test in alg_list:
            print("TESTING ALGORITHM: " + alg_to_test.__name__ + ", TIME: ")
            try:
                before = datetime.now()
                predictions: List[Prediction] = AlgoTrainer.calc_predictions(trainset,
                                                                             testset,
                                                                             alg_to_test())
                time_elapsed = (datetime.now() - before).total_seconds()

                recommender = Recommender(parsed_data.ids_offers_map, predictions)

                entry: Dict[str, object] = {'rmse': recommender.calc_rmse()}
                entry['algorithm'] = alg_to_test.__name__
                entry['time_elapsed'] = time_elapsed
                entries.append(entry)

            except Exception as e:
                print(e)
            print("")

    return entries
Example #9
def train_with_Kfold(algo, data, k=5, verbose=True):
    
    kf = KFold(n_splits=k,)
    
    history = pd.DataFrame(columns=['precision','recall', 'f1', 'NDCG'])
    
    i = 0
    for trainset, testset in kf.split(data):
        # algo.fit() takes a trainset object,
        algo.fit(trainset)
        predictions = algo.test(testset)  # test() takes a list of (user, item, rating) tuples
        precisions, recalls = precision_recall_at_k(predictions, k=15, threshold=4)

        P = sum(rec for rec in precisions.values()) / len(precisions)
        R = sum(rec for rec in recalls.values()) / len(recalls)
        F1 = (2 * P * R) / (P + R)
        # use k=5 as the top-k rank for NDCG
        NDCG = ndcg_at_k_all(predictions, k=5)
        
        history.loc[i]=[P, R, F1, NDCG]
        
        if verbose:
            print(f"FOLD: {i}")
            print("precision: ", P)
            print("recall: ",R)
            print("f1: ",F1)
            print("NDCG: ",NDCG)
            print("------")
        
        i +=1
    
    return history
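ndcg_at_k_all is not defined in this listing either. A plausible reconstruction that averages per-user NDCG@k, ranking items by their predicted rating; treat the exact definition as an assumption:

import numpy as np
from collections import defaultdict

def ndcg_at_k_all(predictions, k=5):
    """Average NDCG@k over all users."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    ndcgs = []
    for ratings in user_est_true.values():
        # true ratings ordered by predicted score vs. the ideal ordering
        pred_top = [true_r for _, true_r in sorted(ratings, key=lambda x: -x[0])][:k]
        ideal_top = sorted((true_r for _, true_r in ratings), reverse=True)[:k]
        dcg = sum(r / np.log2(i + 2) for i, r in enumerate(pred_top))
        idcg = sum(r / np.log2(i + 2) for i, r in enumerate(ideal_top))
        if idcg > 0:
            ndcgs.append(dcg / idcg)
    return float(np.mean(ndcgs)) if ndcgs else 0.0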
Example #10
def my_cross_validation(algo, data, k=5, threshold=7, n_splits=5, verbose=False):
    kf = KFold(n_splits=n_splits)
    cv_map = {'map@{}'.format(k): [], 'mar@{}'.format(k): []}
    time_map = {'Fit time': [], 'Test time': []}
    for trainset, testset in kf.split(data):
        step_one = datetime.now()
        algo.fit(trainset)
        step_two = datetime.now()
        predictions = algo.test(testset)
        step_three = datetime.now()
        precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=threshold)
        cv_map['map@{}'.format(k)].append(sum(precisions.values()) / len(precisions))
        cv_map['mar@{}'.format(k)].append(sum(recalls.values()) / len(recalls))
        time_map['Fit time'].append((step_two - step_one).total_seconds())
        time_map['Test time'].append((step_three - step_two).total_seconds())
    if verbose:
        print_summary(
            algo,
            ['map@{}'.format(k), 'mar@{}'.format(k)],
            cv_map,
            time_map['Fit time'],
            time_map['Test time'],
            n_splits
        )
    cv_map.update(time_map)
    return cv_map
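A minimal usage sketch, assuming the built-in MovieLens 100k dataset and SVD (threshold=4 fits its 1-5 rating scale; print_summary is only needed when verbose=True):

from surprise import SVD, Dataset

data = Dataset.load_builtin('ml-100k')
results = my_cross_validation(SVD(), data, k=10, threshold=4, n_splits=5)
print(results['map@10'], results['mar@10'])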
Example #11
def collaborative_filter(id, new_words):
    ratings_dict = calc_collaborative_param(new_words, id)

    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0.0, 5.0))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    # define a cross-validation iterator
    kf = KFold(n_splits=3)

    algo = KNNBasic()

    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        kf_predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(kf_predictions, verbose=True)

    trainset = data.build_full_trainset()

    new_data = trainset.build_anti_testset()
    predictions = algo.test(new_data)

    top_n = get_top_n(predictions, n=3)

    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)

    return top_n
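get_top_n is not shown in this listing; a minimal sketch following the top-N recipe from the Surprise FAQ (assumed to match the helper used here):

from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each user id to their n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n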
Example #12
def check_k_and_thresh(algo):
    global predictions
    prec_to_ave = []
    rec_to_ave = []
    kf = KFold(n_splits=30)

    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=30, threshold=2.5)

        # Precision and recall can then be averaged over all users

        prec_to_ave.append(sum(prec for prec in precisions.values()) / len(precisions))
        rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))

    # Sweep k using the predictions from the last fold only
    results = []
    for i in range(2, 30):
        precisions, recalls = precision_recall_at_k(predictions, k=i, threshold=2.5)

        # Precision and recall can then be averaged over all users
        prec = sum(prec for prec in precisions.values()) / len(precisions)
        rec = sum(rec for rec in recalls.values()) / len(recalls)
        results.append({'K': i, 'Precision': prec, 'Recall': rec})

    K = np.arange(2,30)
    precs = []
    recs = []
    for i in range(len(K)):
        precs.append(results[i].get("Precision"))
        recs.append(results[i].get("Recall"))

    plt.plot(K, precs)
    plt.plot(K, recs)
def draw_t_prec_recall(algo, kf, t_low, t_high, thre):
    kf = KFold(n_splits=kf)
    ts = [i for i in range(t_low, t_high + 1)]
    precision = []
    recall = []
    
    for t in ts:
        temp_prec = []
        temp_recall = []
        for trainset, testset in kf.split(data):

            # train and test algorithm.
            algo.fit(trainset)

            trimmed_testset = testset_trim(testset, t, threshold=thre)
            predictions = algo.test(trimmed_testset)
            precisions, recalls = precision_recall_at_t(predictions, t, threshold=thre)

            fold_mean_prec = sum(prec for prec in precisions.values()) / len(precisions)
            fold_mean_recall = sum(rec for rec in recalls.values()) / len(recalls)
            
            temp_prec.append(fold_mean_prec)
            temp_recall.append(fold_mean_recall)

        t_mean_prec = sum(prec for prec in temp_prec) / len(temp_prec)
        t_mean_recall = sum(rec for rec in temp_recall) / len(temp_recall)
        precision.append(t_mean_prec)
        recall.append(t_mean_recall)
    return ts, precision, recall
Example #14
def Kfold_validation(k, algo, data):
    # determining number of folds of splitting
    kf = KFold(n_splits=k)
    # dictionary to hold folds with their MAE values
    fold_dict = {}
    # list of folds numbers
    folds = []
    # list of errors
    error = []

    for j, (trainset, testset) in enumerate(kf.split(data)):
        start_time = time.time()
        #append fold number in folds list
        folds.append('FOLD ' + str(j))
        #fitting algorithm on training set
        algo.fit(trainset)
        #predicting on test set
        predictions = algo.test(testset)
        #appending error in errors list
        error.append(surprise.accuracy.mae(predictions, verbose=False))
        end_time = time.time()
        print('Fold {}, MAE: {:.3f}, Time Elapsed: {:.3f} seconds'.format(
            j, error[j], end_time - start_time))
    #making key value pairs in dictionary
    #FOLD as key and folds list as value
    fold_dict['FOLD'] = folds
    #MAE as key and error list as value
    fold_dict['MAE'] = error

    return fold_dict
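A minimal usage sketch, assuming the built-in MovieLens 100k dataset and the imports (surprise, time) that the function relies on:

from surprise import SVD, Dataset

data = Dataset.load_builtin('ml-100k')
fold_dict = Kfold_validation(5, SVD(), data)
print(fold_dict['MAE'])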
Example #15
def get_accuracy(df,
                 genre,
                 neighbors=30,
                 min_neighbors=5,
                 seed=12345,
                 kfolds=5,
                 k=5,
                 threshold=4):
    """ Gets the precision and accuracy of the model for each genre using cross validation
        
        Args:
            df (pandas.DataFrame): the dataset of actual ratings
            genre (str): the genre for the model
            neighbors (int): the number of neighbors to take into account when training the model
                             Default is 30.
            min_neighbors (int): the number of neighbors a user must have in order to get a prediction.
                                Default is 5.
            seed (int): setting the random state. Default is 12345.
            kfolds (int): the number of folds for cross validation. Default is 5.
            k (int): number of recommendations for each user. default is 5.
            threshold (int): the cutoff rating at which an item will be considered 'enjoyed.' Default is 4.
        Returns:
            prec (int): The average of precision across the kfolds cross validation
            rec (int): The average of recall across the kfolds cross validation
 	"""

    data = df[df['genre'] == genre]
    data = data[['user_id', 'book_id', 'rating']]
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(data[['user_id', 'book_id', 'rating']], reader)
    algo_KNNbasic = KNNBasic(k=neighbors,
                             min_k=min_neighbors,
                             random_state=seed)

    kf = KFold(n_splits=kfolds, random_state=seed)
    prec_list = []
    recalls_list = []
    for trainset, testset in kf.split(data):
        algo_KNNbasic.fit(trainset)
        predictions = algo_KNNbasic.test(testset)
        precisions, recalls = precision_recall_at_k(predictions,
                                                    k=k,
                                                    threshold=threshold)

        # Precision and recall can then be averaged over all users
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        logger.info("Precision:")
        logger.info(precision)
        logger.info("Recall:")
        logger.info(recall)
        prec_list.append(precision)
        recalls_list.append(recall)

    prec = (sum(prec_list) / len(prec_list))
    rec = (sum(recalls_list) / len(recalls_list))
    return prec, rec
Example #16
def test_old_style_algo(small_ml):
    '''Test that old algorithms (i.e. algorithms that only define train()) can
    support both calls to fit() and to train():
    - supporting algo.fit() is needed so that custom algorithms that only
      define train() can still use up-to-date tools (such as evaluate, which
      has been updated to use fit()).
    - algo.train() is the old way, and must still be supported for custom
      algorithms and tools.
    '''

    class CustomAlgoTrain(AlgoBase):

        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def train(self, trainset):

            AlgoBase.train(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()

    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is only called once
        assert algo.cnt == i

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is only called once
        assert algo.cnt == i
def eval_model(model):
    kf = KFold(n_splits=3)
    for trainset, testset in kf.split(data):
        # train and predict
        model.fit(trainset)
        predictions = model.test(testset)
        # compute RMSE
        accuracy.rmse(predictions, verbose=True)
Example #18
def rank_predictions(model_name):

    k_KNN = 22 
    k_NNMF = 20
    k_MF = 26

    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors= k_NNMF)
    else:
        model = SVD(n_factors = k_MF)

    precision_arr = []
    recall_arr = []
    for t in range (1,26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall (predictions, t)
            p.append(sum(prec for prec in precisions.values()) / len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
            
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(list(range (1,26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()
    
    # recall vs t
    plt.plot(list(range (1,26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using MF " + model_name)
    plt.show()
    
    # precision vs recall 
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()


    return precision_arr, recall_arr 
Example #19
def svdpp(dataset):
    start = time.time()
    algo = SVDpp()
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('SVDpp took (minutes):', (end - start) / 60)
    return acc
Example #20
def test_new_style_algo(small_ml):
    '''Test that new algorithms (i.e. algorithms that only define fit()) can
    support both calls to fit() and to train():
    - algo.fit() is the new way of doing things
    - supporting algo.train() is needed for the (unlikely?) case where a user
      has defined custom tools that use algo.train().
    '''

    class CustomAlgoFit(AlgoBase):

        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def fit(self, trainset):

            AlgoBase.fit(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    algo = CustomAlgoFit()
    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is only called once
        assert algo.cnt == i

    algo = CustomAlgoFit()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is only called once
        assert algo.cnt == i
    def cross_validation(self, data, algo):
        # define a cross-validation iterator
        kf = KFold(n_splits=7, random_state=2)

        for trainset, testset in kf.split(data):
            # train and test algorithm.
            algo.fit(trainset)

            predictions = algo.test(testset)

            # Compute and print Root Mean Squared Error
            accuracy.rmse(predictions, verbose=True)
def train_naive(data, R):
    # Naive baseline: predict each user's mean rating for every item
    kfold = KFold(n_splits=10)
    ur_mean = np.mean(R, axis=1)
    rmse = []
    for _, testset in kfold.split(data):
        r_pred = []
        r = []
        for item in testset:
            r_pred.append(ur_mean[int(item[0]) - 1])
            r.append(item[2])
        rmse.append((np.mean((np.array(r_pred) - np.array(r))**2))**0.5)
    return np.mean(rmse)
Example #23
def train():
    data = load_dataset()
    algo_svd = SVD()
    algo_nmf = NMF()

    print("Cross Validation procedure")
    kf = KFold(n_splits=KFOLD_NUM)
    for i, (trainset_cv, testset_cv) in enumerate(kf.split(data), start=1):
        print(f"===> Fold number {i}")
        # Save the first fold
        train_helper(algo_svd, "SVD", trainset_cv, testset_cv, i == 1)
        train_helper(algo_nmf, "NMF", trainset_cv, testset_cv, i == 1)
    def run(self): #will run model
        ratings = pd.read_csv('rating_final.csv')
        ratings_dict = {"userID": list(ratings.userID), "placeID": list(ratings.placeID), "rating": list(ratings.rating)}
        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 2))
        data = Dataset.load_from_df(df[["userID", "placeID", "rating"]], reader)

        # Use cosine similarity between users
        sim_options = {
            "name": "cosine",
            "user_based": True,  # compute similarities between users
            "min_support": 9
        }
        # define a cross-validation iterator
        kf = KFold(n_splits=5)
        algo = KNNWithMeans(sim_options=sim_options)
        places = list(df['placeID'].unique())
        ordered = ArrayList()
        for i in places:
            total=0
            for trainset, testset in kf.split(data): #finds result for each fold
                # train algorithm.
                algo.fit(trainset)
                #test algorithm
                #predictions = algo.test(testset)
                # Compute and print Root Mean Squared Error
                #accuracy.rmse(predictions, verbose=True)

                #gets predicted rating for each place
                prediction = algo.predict(self.user, i, verbose=False)
                total+=prediction.est
            ordered.append(i, total / 5)  # average the estimate across the 5 folds

        ordered.sort()
        highest = ordered.inArray[ordered.count - 5:ordered.count]

        place = pd.read_csv('geoplaces2.csv')

        #placedf = pd.DataFrame({"placeID": list(place.placeID), "name": list(place.name)})
        count = 0
        finalRec=ArrayList()
        for i in range(len(highest) - 1, -1, -1):
            count += 1
            name = list(place[place["placeID"].unique() == highest[i].id]['name'])
            finalRec.append(count, name[0])

        #printing accuracy score
        out = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
        mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
        print(mean_rmse)

        return finalRec.inArray
def printingModelPrecisionAndRecall(algo, dataSet):
    kf = KFold(n_splits=5)

    for trainset, testset in kf.split(dataSet):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=5)

        # Precision and recall can then be averaged over all users
        print("Precision value: " +
              str(sum(prec for prec in precisions.values()) / len(precisions)))
        print("Recall value: " +
              str(sum(rec for rec in recalls.values()) / len(recalls)))
Example #26
def func6():
    from surprise import SVD
    from surprise import Dataset
    from surprise import accuracy
    from surprise.model_selection import KFold

    data = Dataset.load_builtin('ml-100k')
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
Example #27
def surprise_cv_algo(data, algo, k_fold=5, verbose=True):
    # Split into folds
    kf = KFold(n_splits=k_fold)
    rmse_ = 0

    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)

        # Compute and print RMSE
        rmse_ += accuracy.rmse(predictions, verbose=verbose)

    rmse_mean = rmse_ / k_fold
    return rmse_mean
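A minimal usage sketch, assuming the built-in MovieLens 100k dataset:

from surprise import SVD, Dataset

data = Dataset.load_builtin('ml-100k')
mean_rmse = surprise_cv_algo(data, SVD(), k_fold=5, verbose=False)
print('Mean RMSE over 5 folds:', mean_rmse)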
def calculate_precision_recall(classifiers, threshold, data):
    kf = KFold(n_splits=10)

    precisions = [[], [], []]
    recalls = [[], [], []]
    for t in range(1, 26):
        precision_list = []
        recall_list = []
        for i in range(3):
            classifier = classifiers[i]
            if i == 1:
                print("doing nmf")
            elif i == 2:
                print("doing svd")
            # note: only the train/test split from the last fold is used below
            for trainset, testset in kf.split(data):
                pass
            classifier.fit(trainset)
            prediction = classifier.test(testset)

            S = dict()
            # user: 88         item: 337        r_ui = 3.50   est = 3.74   {'actual_k': 24, 'was_impossible': False}
            for (uid, mid, r, r_pred, _) in prediction:
                if uid in S:
                    S[uid].append((mid, r, r_pred))
                else:
                    S[uid] = [(mid, r, r_pred)]

            count, p_sum, r_sum = (0, 0, 0)
            for uid in S:
                if len(S[uid]) >= t:
                    pred_r = S[uid]
                    G = set([
                        x[0]
                        for x in filter(lambda x: x[1] >= threshold, pred_r)
                    ])
                    if len(G) != 0:
                        pred_r = sorted(pred_r, key=lambda x: -int(x[2]))
                        S2 = set([x[0] for x in pred_r[:t]])
                        inter = G & S2
                        precision = float(len(inter)) / len(S2)
                        recall = float(len(inter)) / len(G)
                        count += 1
                        p_sum += precision
                        r_sum += recall
            precisions[i].append(p_sum / count)
            recalls[i].append(r_sum / count)

    return precisions, recalls
Example #29
def trim_performance(qNum,maxk=0): 
    pop, unpop, highVar = trimMovies()
    
    if maxk == 0:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50

    trim_Model = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
    }
    trimSet, modelName = trim_Model[qNum]
    
    kf = KFold(n_splits=10)
    RMSE = [] 
    for k in range(2, maxk + 1, 2):
        print('-' * 20 + 'k = ' + str(k) + ' ' + '-' * 20)
        
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        elif modelName == 'NMF':
            model = NMF(n_factors=k)

        subRMSE = [] 
        temp = 1
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            testSet = list(filter(lambda x: int(x[1]) in trimSet, testSet))
            print("Split " + str(temp) + ": test set size after trimming: %d", len(testSet))
            temp += 1
            predictions = model.test(testSet)
            subRMSE.append(accuracy.rmse(predictions, verbose=True))
        RMSE.append(np.mean(subRMSE))

    plt.figure()
    plt.plot(list(range(2, maxk+1, 2)), RMSE)
    plt.xlabel("k")
    plt.ylabel("Average RMSE")
    plt.title("Q"+str(qNum)+": Average RMSE Along k")
    plt.show()
    print(min(RMSE))
    return min(RMSE)
Example #30
def NMF_trim_filter(ratings, dims, func, mv_dict):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = False
    min_MAE = False
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)

    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            full_data = trainset.build_testset() + testset
            func(mv_dict, testset)
            pred = nmf.test(testset)
            test_rmse = np.append(test_rmse, accuracy.rmse(pred,
                                                           verbose=False))
            test_mae = np.append(test_mae, accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if ((not min_RMSE) or RMSE[k] < min_RMSE):
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]

        MAE[k] = np.mean(test_mae)
        if ((not min_MAE) or MAE[k] < min_MAE):
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum average MAE is ', min_MAE)
Example #31
def test_KFold(toy_data):

    # Test n_folds parameter
    kf = KFold(n_splits=5)
    assert len(list(kf.split(toy_data))) == 5

    with pytest.raises(ValueError):
        kf = KFold(n_splits=10)
        next(kf.split(toy_data))  # Too big (greater than number of ratings)

    with pytest.raises(ValueError):
        kf = KFold(n_splits=1)
        next(kf.split(toy_data))  # Too low (must be >= 2)

    # Make sure data has not been shuffled. If not shuffled, the users in the
    # testsets are 0, 1, 2... 4 (in that order).
    kf = KFold(n_splits=5, shuffle=False)
    users = [int(testset[0][0][-1]) for (_, testset) in kf.split(toy_data)]
    assert users == list(range(5))

    # Make sure that when called two times without shuffling, folds are the
    # same.
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
    # test once again with another KFold instance
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # We'll now shuffle b and check that folds are different.
    # (this is conditioned by seed setting at the beginning of file)
    kf = KFold(n_splits=5, random_state=None, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b
    # test once again: two calls to kf.split make different splits when
    # random_state=None
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b

    # Make sure that folds are the same when the same KFold instance is used
    # with shuffle=True but random_state set to some value
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # Make sure raw ratings are not shuffled by KFold
    old_raw_ratings = copy(toy_data.raw_ratings)
    kf = KFold(n_splits=5, shuffle=True)
    next(kf.split(toy_data))
    assert old_raw_ratings == toy_data.raw_ratings

    # Make sure kf.split() and the old toy_data.split() have the same folds.
    np.random.seed(3)
    with pytest.warns(UserWarning):
        toy_data.split(2, shuffle=True)
        testsets_a = [testset for (_, testset) in toy_data.folds()]
    kf = KFold(n_splits=2, random_state=3, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
Example #32
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import KFold


data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    accuracy.rmse(predictions, verbose=True)
"""
This module describes how to use cross-validation iterators.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Load the movielens-100k dataset
data = Dataset.load_builtin('ml-100k')

# define a cross-validation iterator
kf = KFold(n_splits=3)

algo = SVD()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)