def knn_compute_prec_rec(t):
    """Return the cross-validated mean precision and recall of the k-NN filter.

    A 10-fold split (``random_state=42``) is taken over the module-level
    dataset ``R``.  For every fold a Pearson ``KNNWithMeans`` model with
    ``k=knn_best_k`` is fitted, the fold's test set is passed through
    ``trim_unpopular_user`` and the per-user precision/recall dictionaries
    returned by ``calculate_precision_recall`` are averaged.

    Args:
        t: Forwarded to ``trim_unpopular_user`` and
            ``calculate_precision_recall`` (presumably the size of the
            recommendation list -- confirm against those helpers).

    Returns:
        Tuple ``(mean_precision, mean_recall)``, each averaged over the folds.
    """
    fold_precisions = []
    fold_recalls = []
    splitter = KFold(n_splits=10, random_state=42)
    for trainset, testset in splitter.split(R):
        model = KNNWithMeans(k=knn_best_k,
                             sim_options={'name': 'pearson'},
                             verbose=False)
        model.fit(trainset)
        predictions = model.test(trim_unpopular_user(testset, t, threshold))

        per_user_precision, per_user_recall = calculate_precision_recall(
            predictions, t, threshold)
        # Average over users first, then over folds below.
        fold_precisions.append(np.mean(list(per_user_precision.values())))
        fold_recalls.append(np.mean(list(per_user_recall.values())))
    return np.mean(fold_precisions), np.mean(fold_recalls)
Exemplo n.º 2
0
def Q10():
    """Sweep k for ``KNNWithMeans`` and plot the 10-fold mean RMSE and MAE.

    For every even k from 2 to 100 the model (``pearson_baseline``
    similarity, shrinkage 0) is cross-validated with 10 folds on the data
    returned by ``load_data``.  The total wall-clock time is printed and the
    two error curves are drawn with ``make_plot``.

    Returns:
        Tuple ``(meanRMSE, meanMAE)`` of lists, one entry per k.
    """
    data = load_data()
    similarity = {'name': 'pearson_baseline', 'shrinkage': 0}  # no shrinkage

    rmse_per_k = []
    mae_per_k = []
    started = time.time()
    for n_neighbors in range(2, 102, 2):
        model = KNNWithMeans(n_neighbors, sim_options=similarity)
        scores = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=10)
        rmse_per_k.append(np.mean(scores['test_rmse']))
        mae_per_k.append(np.mean(scores['test_mae']))
    # Truncate to whole seconds so the timedelta prints as H:MM:SS.
    elapsed = datetime.timedelta(seconds=int(time.time() - started))
    print("Total time used for cross validation: " + str(elapsed))

    curves = [[rmse_per_k, 'mean RMSE'], [mae_per_k, 'mean MAE']]
    make_plot(list(range(2, 102, 2)), curves, 'Number of Neighbors', 'Error')
    return rmse_per_k, mae_per_k
Exemplo n.º 3
0
 def slot_select_algo_combobox(self):
     """Instantiate the algorithm currently selected in the combo box.

     Marks the algorithm as changed and not yet trained.  When the combo-box
     text names a known algorithm, a fresh instance is stored on
     ``self.algo`` and a status message is appended to
     ``self.display_process_label``.  An unrecognised name leaves both
     ``self.algo`` and the label untouched.
     """
     self.algo_change_flag = True
     self.algo_trained_flag = False
     chosen = self.select_algo_comboBox.currentText()

     # name -> (lazy constructor, status message).  The lambdas defer the
     # class lookup until the matching entry is actually selected.  Each
     # message reads "Loading <name> model...".
     catalogue = {
         'SVD': (lambda: SVD(), '加载SVD模型...'),
         'SVD++': (lambda: SVDpp(), '加载SVD++模型...'),
         'NMF': (lambda: NMF(), '加载NMF模型...'),
         'Slope One': (lambda: SlopeOne(), '加载Slope One模型...'),
         'k-NN': (lambda: KNNBasic(), '加载k-NN模型...'),
         'Centered k-NN': (lambda: KNNWithMeans(), '加载Centered k-NN模型...'),
         'k-NN Baseline': (lambda: KNNBaseline(), '加载k-NN Baseline模型...'),
         'Co-Clustering': (lambda: CoClustering(), '加载Co-Clustering模型...'),
         'Baseline': (lambda: BaselineOnly(), '加载Baseline模型...'),
         'Random': (lambda: NormalPredictor(), '加载Random模型...'),
     }
     entry = catalogue.get(chosen)
     if entry is None:
         return
     build, message = entry
     self.algo = build()
     self.display_process_label.append(message)
Exemplo n.º 4
0
def main(args=None):
    """Cross-validate SVD and KNNWithMeans on MovieLens-1M and save the scores.

    The output location comes from ``process_args(args)``.  If that directory
    does not exist it is created; otherwise the user is asked whether to
    overwrite it and the program exits on an ``N``/``NO`` answer
    (case-insensitive).  Both algorithms are evaluated with 10-fold
    cross-validation and their per-fold RMSE/MAE are handed to
    ``write_results_to_file``.

    Args:
        args: Forwarded unchanged to ``process_args``.
    """
    out_path = os.path.expanduser(process_args(args))
    print('Checking output directory...')
    if os.path.exists(out_path):
        reply = input("Overwrite output directory?: ").upper()
        if reply in ('N', 'NO'):
            print('Exiting...')
            exit()
    else:
        os.makedirs(out_path)

    print("Loading dataset...")
    data = Dataset.load_builtin('ml-1m')

    svd = SVD()
    print("Running SVD...")
    svd_scores = cross_validate(svd,
                                data,
                                measures=['RMSE', 'MAE'],
                                cv=10,
                                verbose=True)
    write_results_to_file(svd_scores['test_rmse'], svd_scores['test_mae'],
                          'svd_out.json')

    print("Running KNN...")
    knn = KNNWithMeans()
    knn_scores = cross_validate(knn,
                                data,
                                measures=['RMSE', 'MAE'],
                                cv=10,
                                verbose=True)
    write_results_to_file(knn_scores['test_rmse'], knn_scores['test_mae'],
                          'knn_out.json')
    print("Done.")
def knn_compute_cross_validation_error(k, random_state):
    """Cross-validate a Pearson ``KNNWithMeans`` model and report its errors.

    Runs 10-fold cross-validation on the module-level dataset ``R`` and
    prints one summary line with the fold-averaged RMSE and MAE.

    Args:
        k: Passed as ``k`` to ``KNNWithMeans``.
        random_state: Passed as ``random_state`` to ``KFold``.

    Returns:
        Tuple ``(mean_rmse, mean_mae)`` averaged over the 10 folds.
    """
    model = KNNWithMeans(k=k, sim_options={'name': 'pearson'}, verbose=False)
    folds = KFold(n_splits=10, random_state=random_state)
    scores = cross_validate(model, R, cv=folds)
    mean_rmse = np.mean(scores['test_rmse'])
    mean_mae = np.mean(scores['test_mae'])
    print('k: %s | RMSE: %f | MAE: %f' % (k, mean_rmse, mean_mae))
    return mean_rmse, mean_mae
Exemplo n.º 6
0
def Q15and22and29(qNum, bestK, thres=(2.5, 3, 3.5, 4)):
    """Plot one ROC curve per rating threshold for the chosen recommender.

    The data from ``load_data`` is split 90/10, a single model is fitted on
    the training part and evaluated on the test part.  For every threshold
    the true ratings are binarised (``>= threshold`` -> 1, else 0) and
    plotted against the predicted ratings divided by 5.0 via ``plot_ROC``.

    Args:
        qNum: 15 selects ``KNNWithMeans`` (``pearson_baseline`` similarity,
            no shrinkage), 22 selects ``NMF``; any other value selects
            ``SVD``.
        bestK: Number of neighbours (k-NN) or latent factors (NMF / SVD).
        thres: Iterable of rating thresholds.  The default is an immutable
            tuple so it cannot be shared and mutated across calls.
    """
    # Predictions are divided by this before being used as ROC scores.
    rating_scale = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(bestK, sim_options=sim_options)
    elif qNum == 22:
        model = NMF(n_factors=bestK)
    else:
        model = SVD(n_factors=bestK)

    model.fit(trainset)
    pred = model.test(testset)

    # Each prediction unpacks to five fields; the third is the true rating
    # and the fourth the estimate.  Extract both once: they do not depend
    # on the threshold, and growing arrays with np.append inside the loop
    # copies the whole array on every call.
    true_ratings = np.array([t for _, _, t, _, _ in pred], dtype=float)
    np_score = np.array([p for _, _, _, p, _ in pred],
                        dtype=float) / rating_scale

    for thrs in thres:
        np_true = (true_ratings >= thrs).astype(float)
        title = 'Threshold ' + str(thrs)
        plot_ROC(np_true, np_score, title=title)
Exemplo n.º 7
0
def user_based(data, db):
    """Build user-based top-10 recommendations and store them in ``db``.

    A grid search (4-fold, RMSE) over ``k`` selects the best
    ``KNNWithMeans`` configuration with MSD user-user similarity.  The
    winning model is refitted on the full training set, every rating in the
    anti-test set is predicted and ``get_top_n`` picks the top 10 per user.
    The ``userRecs`` collection is dropped and repopulated with one document
    per user.

    Args:
        data: Dataset object accepted by ``GridSearchCV.fit`` and providing
            ``build_full_trainset``.
        db: Database handle exposing a ``userRecs`` collection with ``drop``
            and ``insert_one``.
    """
    grid = {
        'k': [20, 25, 30, 35, 40],
        'min_k': [1],
        'sim_options': {
            'name': ['msd'],
            'user_based': [True],
            'min_support': [1]
        }
    }
    search = GridSearchCV(KNNWithMeans, grid, measures=['rmse'], cv=4)
    search.fit(data)
    best_rmse = search.best_score['rmse']
    best_params = search.best_params['rmse']
    print(best_rmse)
    print(best_params)

    # Rebuild the similarity options from the winning combination.
    best_sim = best_params['sim_options']
    sim_options = {
        'name': best_sim['name'],
        'user_based': best_sim['user_based'],
        'min_support': best_sim['min_support'],
    }

    full_trainset = data.build_full_trainset()
    model = KNNWithMeans(k=best_params['k'],
                         min_k=best_params['min_k'],
                         sim_options=sim_options)
    model.fit(full_trainset)
    predictions = model.test(full_trainset.build_anti_testset())

    # Top 10 predictions per user, as returned by get_top_n.
    top_n = get_top_n(predictions, n=10)

    # Replace the previously stored recommendations.
    db.userRecs.drop()
    for uid, user_ratings in top_n.items():
        db.userRecs.insert_one({
            'user_id': uid,
            'recs': [iid for (iid, _) in user_ratings],
            'timestamp': datetime.utcnow()
        })

    print('done')
def knn_evaluate_trim_performance(trimming, k, random_state):
    """Return the 10-fold mean RMSE of k-NN on a trimmed test set.

    A Pearson ``KNNWithMeans`` model is fitted on every training fold of the
    module-level dataset ``R`` and evaluated on that fold's test set after
    it has been reduced by the selected trimming helper.  A summary line is
    printed.

    Args:
        trimming: One of ``'popular'``, ``'unpopular'`` or
            ``'high variance'``; selects the trimming helper.
        k: Passed as ``k`` to ``KNNWithMeans``.
        random_state: Passed as ``random_state`` to ``KFold``.

    Returns:
        Mean RMSE over the 10 folds.

    Raises:
        ValueError: If ``trimming`` is not one of the supported names.
            Previously this surfaced as an unbound ``trimmed_testset``.
    """
    supported = ('popular', 'unpopular', 'high variance')
    if trimming not in supported:
        raise ValueError('unknown trimming %r; expected one of %s' %
                         (trimming, ', '.join(repr(s) for s in supported)))

    knn = KNNWithMeans(k=k,
                       min_k=1,
                       sim_options={'name': 'pearson'},
                       verbose=False)
    rmse = []
    for trainset, testset in KFold(n_splits=10,
                                   random_state=random_state).split(R):
        knn.fit(trainset)
        if trimming == 'popular':
            trimmed_testset = popular_trimming(testset, frequency)
        elif trimming == 'unpopular':
            trimmed_testset = unpopular_trimming(testset, frequency)
        else:  # 'high variance' -- guaranteed by the validation above
            trimmed_testset = high_variance_trimming(testset, frequency,
                                                     variance)
        pred = knn.test(trimmed_testset)
        rmse.append(accuracy.rmse(pred, verbose=False))
    mean_rmse = np.mean(rmse)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def train_knn(data):
    """Sweep k for a Pearson ``KNNWithMeans`` model with 10-fold CV.

    Args:
        data: Dataset passed to ``cross_validate``.

    Returns:
        Tuple ``(rmse, mae)`` of lists holding the fold-averaged error for
        every even k from 2 to 100.
    """
    similarity = {'name': 'pearson'}
    rmse_curve = []
    mae_curve = []
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        model = KNNWithMeans(k=k, sim_options=similarity)
        scores = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=10)
        rmse_curve.append(np.mean(scores['test_rmse']))
        mae_curve.append(np.mean(scores['test_mae']))
    print("k-fold validation finished!")
    return (rmse_curve, mae_curve)
def train_trim_knn(data, R):
    """Sweep k for ``KNNWithMeans`` on three trimmed variants of each test fold.

    For every even k from 2 to 100 a Pearson ``KNNWithMeans`` model is
    fitted on each of 10 training folds.  ``trim(testset, R)`` yields three
    test sets (named popular / unpopular / high-variance in the original
    code); the RMSE on each is averaged over the folds.

    Args:
        data: Dataset split by ``KFold``.
        R: Forwarded to ``trim``.

    Returns:
        List of three lists, one RMSE curve per trimmed test set, in the
        order returned by ``trim``.
    """
    splitter = KFold(n_splits=10)
    similarity = {'name': 'pearson'}
    curves = [[], [], []]
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        fold_scores = ([], [], [])
        model = KNNWithMeans(k=k, sim_options=similarity)
        for trainset, testset in splitter.split(data):
            model.fit(trainset)
            popular_set, unpopular_set, high_var_set = trim(testset, R)

            # Predict on all three sets first, then score them, so the
            # accuracy.rmse output appears in the same order as before.
            predictions = [
                model.test(subset)
                for subset in (popular_set, unpopular_set, high_var_set)
            ]
            for scores, preds in zip(fold_scores, predictions):
                scores.append(accuracy.rmse(preds))
        for curve, scores in zip(curves, fold_scores):
            curve.append(np.mean(scores))
    print("KNN with trim is finished!!")
    return curves
Exemplo n.º 11
0
def Q34():
    """Plot the ROC curves of k-NN, NMF and SVD on one shared figure.

    The data from ``load_data`` is split 90/10.  ``KNNWithMeans`` (k=22,
    ``pearson_baseline`` similarity without shrinkage), ``NMF`` (18 factors)
    and ``SVD`` (8 factors) are each fitted on the training part.  True
    ratings are binarised at 3 (``>= 3`` -> 1) and predictions are divided
    by 5.0 before ``roc_curve`` / ``auc`` are computed.  The figure is shown
    and then closed.
    """
    rating_scale = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    models = [
        ('KNN', KNNWithMeans(22, sim_options=sim_options)),
        ('NNMF', NMF(n_factors=18)),
        ('SVD', SVD(n_factors=8)),
    ]

    # One (label, fpr, tpr, auc) record per model.  Keeping them together
    # avoids relying on the key order of parallel dicts when plotting.
    curves = []
    for label, model in models:
        model.fit(trainset)
        pred = model.test(testset)
        # Build each array in one pass; np.append inside the loop copied the
        # whole array for every prediction.
        np_true = np.array([1.0 if t >= 3 else 0.0 for _, _, t, _, _ in pred])
        np_score = np.array([p for _, _, _, p, _ in pred],
                            dtype=float) / rating_scale
        fpr, tpr, _ = roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        curves.append((label, fpr, tpr, auc(fpr, tpr)))

    plt.figure()
    lw = 2
    for label, fpr, tpr, roc_auc in curves:
        plt.plot(fpr,
                 tpr,
                 lw=lw,
                 label='%s ROC curve (area = %0.2f)' % (label, roc_auc))
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()
    plt.close()
Exemplo n.º 12
0
def item_based(data, db):
    """Compute item-based top-10 recommendations and print them.

    A grid search (4-fold, RMSE) over ``k`` and ``min_k`` selects the best
    ``KNNWithMeans`` configuration with MSD item-item similarity
    (``user_based=False``).  The winning model is refitted on the full
    training set, the anti-test set is predicted and the top 10 items per
    user from ``get_top_n`` are printed.

    Args:
        data: Dataset object accepted by ``GridSearchCV.fit`` and providing
            ``build_full_trainset``.
        db: Not used by this function.
    """
    grid = {
        'k': [20, 30, 40, 50],
        'min_k': [1, 5, 10],
        'sim_options': {
            'name': ['msd'],
            'user_based': [False],
            'min_support': [1]
        }
    }
    search = GridSearchCV(KNNWithMeans, grid, measures=['rmse'], cv=4)
    search.fit(data)
    best_rmse = search.best_score['rmse']
    best_params = search.best_params['rmse']
    print(best_rmse)
    print(best_params)

    # Rebuild the similarity options from the winning combination.
    best_sim = best_params['sim_options']
    sim_options = {
        'name': best_sim['name'],
        'user_based': best_sim['user_based'],
        'min_support': best_sim['min_support'],
    }

    full_trainset = data.build_full_trainset()
    model = KNNWithMeans(k=best_params['k'],
                         min_k=best_params['min_k'],
                         sim_options=sim_options)
    model.fit(full_trainset)
    predictions = model.test(full_trainset.build_anti_testset())

    # Top 10 predictions per user, as returned by get_top_n.
    top_n = get_top_n(predictions, n=10)

    # Results are only printed here, not written to the database.
    for uid, user_ratings in top_n.items():
        recommended = [iid for (iid, _) in user_ratings]
        print(uid, recommended)
Exemplo n.º 13
0
def knn_cv(data):
    """Sweep k for a cosine ``KNNWithMeans`` model with 10-fold CV.

    Args:
        data: Dataset passed to ``cross_validate``.

    Returns:
        Tuple ``(rmse, k_list)``: ``rmse`` is a list with the fold-averaged
        test RMSE for every k, and ``k_list`` is ``range(2, 51, 2)``.
    """
    k_list = range(2, 51, 2)
    mean_rmse = []
    print('Performing knn...')
    for k in k_list:
        print('k =', k)
        model = KNNWithMeans(k=k,
                             sim_options={'name': 'cosine'},
                             verbose=False)
        scores = cross_validate(model,
                                data,
                                measures=['RMSE'],
                                cv=10,
                                verbose=False)
        mean_rmse.append(np.mean(scores['test_rmse']))
    print('Completed!')
    return mean_rmse, k_list
Exemplo n.º 14
0
def Q12To14And19To21And26To28(qNum, maxk=None):
    """Evaluate a recommender on a trimmed test set while sweeping k.

    ``qNum`` selects both the trimming set (popular / unpopular /
    high-variance, as returned by ``classifyMovies``) and the model family.
    For every even k from 2 to ``maxk`` the model is evaluated with 10-fold
    cross-validation; each test fold keeps only the entries whose second
    field is in the trimming set.  The per-fold RMSE is averaged, the curve
    is plotted with ``make_plot`` and returned.

    Args:
        qNum: 12-14 use ``KNNWithMeans``, 19-21 use ``NMF``, 26-28 use
            ``SVD``.
        maxk: Largest k (inclusive).  Defaults to 100 for questions 12-14
            and 50 for questions 19-21 and 26-28.

    Returns:
        List with the fold-averaged RMSE for every k.

    Raises:
        KeyError: If ``qNum`` is not one of the supported question numbers.
        ZeroDivisionError: If a test fold is empty after trimming.
    """
    data = load_data()
    kf = KFold(n_splits=10)
    if maxk is None:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21 or 26 <= qNum <= 28:
            maxk = 50

    pop, unpop, highVar = classifyMovies()

    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trimAndModel = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
        26: (pop, 'SVD'),
        27: (unpop, 'SVD'),
        28: (highVar, 'SVD')
    }
    # Loop-invariant lookup, hoisted out of the k sweep.
    trimSet, modelName = trimAndModel[qNum]

    RMSE = []  # RMSE for each k
    for k in range(2, maxk + 1, 2):  # inclusive
        print('-' * 20 + ' k = ' + str(k) + ' ' + '-' * 20)
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k, sim_options=sim_options)
        elif modelName == 'NMF':
            model = NMF(n_factors=k)
        else:
            model = SVD(n_factors=k)

        subRMSE = []  # RMSE of every train-test split for this k
        for fold_no, (trainSet, testSet) in enumerate(kf.split(data), start=1):
            model.fit(trainSet)
            testSet = [row for row in testSet if row[1] in trimSet]
            nTest = len(testSet)
            # The original passed nTest as a second print argument, so the
            # "%d" placeholder was printed literally.
            print("Split %d: test set size after trimming: %d" %
                  (fold_no, nTest))
            predictions = model.test(testSet)
            squared_error = sum((p.est - p.r_ui) ** 2 for p in predictions)
            # RMSE of this train-test split
            subRMSE.append(np.sqrt(squared_error / nTest))
        # average over all train-test splits for this k
        RMSE.append(np.mean(subRMSE))

    # plotting
    k = list(range(2, maxk + 1, 2))
    ys = [[RMSE, 'RMSE']]
    xTitle = 'Number of Neighbors' if qNum <= 14 else 'Number of latent factors'
    make_plot(k, ys, xTitle, 'Error')
    return RMSE
Exemplo n.º 15
0
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.knns import KNNWithMeans

# Parse ratings.csv as comma-separated "user item rating timestamp" rows,
# skipping the first line of the file.
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('ratings.csv', reader=reader)

# Calculate root mean square error using k nearest neighbor method
# with k starting from 2 to 50 in step sizes of 2.  Both the training and
# the test RMSE are averaged over the 10 cross-validation folds.
rmse_train = []
rmse_test = []
for k in range(2, 51, 2):
    print('k =', k)
    sim_options = {'name': 'cosine'}
    algo = KNNWithMeans(k=k, sim_options=sim_options, verbose=False)
    # return_train_measures=True adds the 'train_rmse' entry read below.
    result = cross_validate(algo,
                            data,
                            measures=['RMSE'],
                            cv=10,
                            return_train_measures=True,
                            verbose=False)
    rmse_train.append(np.mean(result['train_rmse']))
    rmse_test.append(np.mean(result['test_rmse']))

# Draw both curves against k on figure 1 (train first, then test).
plt.figure(1)
plt.plot(range(2, 51, 2), rmse_train)
plt.plot(range(2, 51, 2), rmse_test)
plt.xlabel('k')
plt.ylabel('Root Mean Square Error')
plt.title('kNN: The Result of Average RMSE versus k')
Exemplo n.º 16
0
from surprise import AlgoBase
from surprise.model_selection import cross_validate
from surprise.model_selection.split import train_test_split
import matplotlib.pyplot as plt
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise.prediction_algorithms.matrix_factorization import NMF
from surprise.prediction_algorithms.matrix_factorization import SVD
# Close any figures left open.
plt.close('all')

# Load the comma-separated ratings file.
reader = Reader(sep=',')
data = Dataset.load_from_file('./ml-latest-small/ratings_new.csv',
                              reader=reader)

# NOTE(review): data.split(n_folds=...) looks like the legacy Surprise fold
# API; newer releases use surprise.model_selection.KFold instead -- confirm
# against the installed version.
data.split(n_folds=10)
sim_options = {'name': 'pearson', 'user_based': True}
# The three models to compare: user-based k-NN, NMF and SVD.
algo1 = KNNWithMeans(k=48, sim_options=sim_options)
algo2 = NMF(n_factors=16)
algo3 = SVD(n_factors=14)


def RankSweep(algo, tit, num):
    t_all = range(1, 26)
    pre_all = np.zeros(25)
    rec_all = np.zeros(25)
    for trainset, testset in data.folds():
        algo.fit(trainset)
        pred = algo.test(testset)
        G_all = dict()
        S_all = dict()
        for elem in pred:
            if elem.r_ui >= 3:
# Build the Surprise dataset from the ratings dataframe, declaring a 1-5
# rating scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Neighbourhood sizes to sweep: 51 evenly spaced integers 1, 3, ..., 101.
neighbors = np.linspace(1,101,num=51,dtype=int)

# Plain k-NN: 5-fold mean test RMSE for Pearson and cosine similarity.
basic_pearson, basic_cosine = [], []
for i in neighbors:
	print(i)
	cv_pearson = cross_validate(KNNBasic(k=i,sim_options={'name':'pearson'},verbose=False), data, cv=5)
	basic_pearson.append(np.mean(cv_pearson['test_rmse']))
	cv_cosine = cross_validate(KNNBasic(k=i,sim_options={'name':'cosine'},verbose=False), data, cv=5)
	basic_cosine.append(np.mean(cv_cosine['test_rmse']))

# Mean-centred k-NN: same sweep with KNNWithMeans.
means_pearson, means_cosine = [], []
for i in neighbors:
	print(i)
	cv_pearson = cross_validate(KNNWithMeans(k=i,sim_options={'name':'pearson'},verbose=False), data, cv=5)
	means_pearson.append(np.mean(cv_pearson['test_rmse']))
	cv_cosine = cross_validate(KNNWithMeans(k=i,sim_options={'name':'cosine'},verbose=False), data, cv=5)
	means_cosine.append(np.mean(cv_cosine['test_rmse']))

# Figure 1: plain k-NN, cosine (red) versus Pearson (blue).
fig, ax = plt.subplots()
ax.plot(neighbors,basic_cosine, 'r', label='Cosine')
ax.plot(neighbors, basic_pearson, 'b', label='Pearson')
ax.legend(loc='best')
plt.xlabel("k"); plt.ylabel("5-fold average RMSE"); plt.title("k-NN with 5-fold CV")

# Figure 2: mean-centred k-NN, cosine (red) versus Pearson (blue).
fig, ax = plt.subplots()
ax.plot(neighbors,means_cosine, 'r', label='Cosine')
ax.plot(neighbors, means_pearson, 'b', label='Pearson')
ax.legend(loc='best')
plt.xlabel("k"); plt.ylabel("5-fold average RMSE"); plt.title("Mean-centered k-NN with 5-fold CV")
Exemplo n.º 18
0
# Parse the ratings file as comma-separated "user item rating timestamp"
# rows, skipping the first line of the file.
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('../dataset/ratings.csv', reader=reader)

# 10-fold cross validation: mean test RMSE for each k in k_list
rmse, k_list = knn_cv(data)

# get optimal k: the k whose cross-validated RMSE is smallest
min_idx = rmse.index(min(rmse))
k_hat = k_list[min_idx]

# Training: refit with the optimal k on a 90/10 split and report the RMSE
# on the held-out 10%.
trainset, testset = train_test_split(data, test_size=0.1)
sim_options = {'name': 'cosine'}
algo = KNNWithMeans(k=k_hat, sim_options=sim_options, verbose=False)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

# Plot the cross-validated testing RMSE against k
plt.figure(1)
plt.plot(k_list, rmse)
plt.xlabel('k')
plt.ylabel('Testing Root Mean Square Error')
plt.title('kNN: The Result of Average RMSE versus k')
plt.show()

# Plot ROC curve: accumulators for the labels and scores
# (the code that fills them is not part of this fragment)
test_target = []
test_score = []
import pandas as pd
from scipy import stats
from surprise.prediction_algorithms.knns import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import KFold
from surprise import accuracy
import matplotlib.pyplot as plt

# Load the movielens-100k dataset
data = Dataset.load_builtin('ml-100k')

# Item-based model: cosine similarity computed between items.
sim_itembase = {
    'name': 'cosine',
    'user_based': False
}  # compute  similarities between items
algo_itembase = KNNWithMeans(sim_options=sim_itembase)

# User-based model: 'user_based' is not set here, so the library default
# applies.
sim_userbase = {
    'name': 'pearson_baseline'
}  # compute  similarities between users
algo_userbase = KNNWithMeans(sim_options=sim_userbase)

# Run 5-fold cross-validation and save results.
kf = KFold(n_splits=5)
# One column per model; no rows are added within this fragment.
rmse_df = pd.DataFrame(columns=['Item-based', 'User-based'])
for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo_itembase.fit(trainset)
    pred_itembase = algo_itembase.test(testset)
Exemplo n.º 20
0
    scaled_data = convert_df_to_data(scaled_df, scaled_reader)
    scaled_data.split(n_folds=5)

    data = convert_df_to_data(df, reader)
    data.split(n_folds=5)

    # plot some EDA figures:
    plot_average_rating_hist(df)

    # Cross Valdiation Tests for different Classification Models:
    models = []
    models.append(('GM', GlobalMean()))
    models.append(('MoM', MeanofMeans()))
    models.append(('BLO', BaselineOnly()))
    models.append(('KNNb', KNNBasic()))
    models.append(('KNNwm', KNNWithMeans()))
    models.append(('KNNbl', KNNBaseline()))
    models.append(('SVD', SVD()))
    models.append(('NMF', NMF()))
    models.append(('SO', SlopeOne()))
    models.append(('CoC', CoClustering()))

    # plotting box plot of cross validation scores for array of recommendation models on scaled ratings data:
    model_names, rmses, maes = crossval_scores(scaled_data, models[:-1])

    # Now to find out which recommendation model has the lowest amount of false positives (recommending a movie that a user wounldn't like) and false negatives (failing to recommend a movie that a user would like). We'll choose a model based on the f1 score.
    model_names, fps, fns, tps, tns, precisions, recalls, f1s = get_fpfns(scaled_data, models, thresh=0.5)

    # Highest F1 score was the SVD model. We'll go with this model build a recommender system.

    '''To make a business case we'll have to make some assuptions about the costs and benefits that Movies-Legit service experiences when giving users recommendations they like (True Positive) and giving users recommendations they don't like (False Positives).
Exemplo n.º 21
0
    return pre, rec


# read data
# NOTE(review): the absolute path below is machine-specific.
path = '/users/ht/desktop/EE219/proj_3/'
reader = Reader(line_format='user item rating timestamp', sep=',')
data_raw = Dataset.load_from_file(path + 'data/ratings_1.csv', reader=reader)

# define K-fold
num_fold = 10
kf = split.KFold(n_splits=num_fold)

# define model for training
k_min = 24
sim_options = {'name': 'pearson', 'user_based': True}
knn = KNNWithMeans(k=k_min, sim_options=sim_options)

# train, test and rank: for every list size top_t from 1 to 25 the model is
# refitted on each fold and the test predictions are thresholded and ranked.
top_t_list = range(1, 26)
pre_list_knn = []
rec_list_knn = []
for top_t in top_t_list:
    # accumulators for this top_t (presumably precision / recall sums --
    # the code that updates them is not part of this fragment)
    pre = 0
    rec = 0
    for trainset, testset in kf.split(data_raw):
        knn.fit(trainset)
        prediction = knn.test(testset)
        # G is built from the test set, G_s from the predictions
        G = create_dict(testset)
        G_s = create_dict(prediction, if_pred=1)
        R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t)
        # precision and recall for each fold
            temp_prec.append(fold_mean_prec)
            temp_recall.append(fold_mean_recall)

        t_mean_prec = sum(prec for prec in temp_prec) / len(temp_prec)
        t_mean_recall = sum(rec for rec in temp_recall) / len(temp_recall)
        precision.append(t_mean_prec)
        recall.append(t_mean_recall)
    return ts, precision, recall

# read in data: comma-separated "user item rating" rows, skipping the first
# line of the file, with ratings declared on a 0.5-5 scale
file_path = os.path.expanduser('ratings.csv')
reader = Reader(line_format='user item rating', sep=',',skip_lines=1, rating_scale=(0.5, 5))
data = Dataset.load_from_file(file_path, reader=reader)

# The three models to compare.
sim_options = {'name': 'pearson'}
knn = KNNWithMeans(k=24, sim_options=sim_options)
nmf = NMF(n_factors=4)
nmfBiased = NMF(n_factors=2, biased=True)

algs = []
algs.append(knn)
algs.append(nmf)
algs.append(nmfBiased)

# Display name for each model, keyed by the model object itself.
names = {}
names[knn] = "KNN"
names[nmf] = "NNMF"
names[nmfBiased] = "NMF(biased)"

# Results per model (filled by code outside this fragment).
res_t_p_r = {}
for alg in algs:
Exemplo n.º 23
0
# Labels for the ratings-per-user plot (drawn by code above this fragment).
plt.title('Distribution of ratings among users')
plt.ylabel('Number of ratings')
plt.xlabel('Users')

# Question 6: histogram of the per-movie rating variance.  A movie whose
# variance is NaN is counted as variance 0 via fillna(0).
var = ratings.groupby('movieId')['rating'].var().fillna(0).tolist()
# Bin edges run from 0 to 10.5 in steps of 0.5.
plt.hist(var, bins=np.arange(0, 11, 0.5))
plt.xlabel('Variance of ratings')
plt.ylabel('Number of movies')
plt.title('Distribution of variance of ratings')

# Question 10: 10-fold mean RMSE / MAE of Pearson KNNWithMeans for every
# even k from 2 to 98 (range(2, 100, 2) stops before 100).
k_range = range(2, 100, 2)
avg_rmse, avg_mae = [], []
for k in k_range:
    algo = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
    cv_results = cross_validate(algo,
                                data,
                                measures=['RMSE', 'MAE'],
                                cv=10,
                                verbose=False)
    avg_rmse.append(np.mean(cv_results['test_rmse']))
    avg_mae.append(np.mean(cv_results['test_mae']))

# Plot both error curves against k.
plt.plot(k_range, avg_rmse, label="Average RMSE")
plt.plot(k_range, avg_mae, label="Average MAE")
plt.xlabel('Number of neighbors')
plt.ylabel('Error')
plt.legend()
plt.show()
Exemplo n.º 24
0
def Q36To38(qNum):
    """Plot precision and recall against the recommendation-list size t.

    ``qNum`` selects the model (36: ``KNNWithMeans``, 37: ``NMF``,
    38: ``SVD``) with its fixed hyper-parameter.  For every t from 1 to 25
    the model is evaluated with 10-fold cross-validation;
    ``precision_recall`` returns per-user precision and recall, which are
    averaged over users within a fold and then over the folds.  Three plots
    are produced: precision vs t, recall vs t and precision vs recall.

    Args:
        qNum: Question number, one of 36, 37 or 38.

    Returns:
        Tuple ``(precision_arr, recall_arr)`` of lists with one fold-averaged
        value per t.

    Raises:
        KeyError: If ``qNum`` is not 36, 37 or 38.
    """
    print("problem ", qNum)

    data = load_data()
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    model_by_question = {
        36: 'KNNWithMeans',
        37: 'NMF',
        38: 'SVD',
    }
    k_KNNWithMeans = 30  # from Q11
    k_NMF = 18  # from Q18
    k_SVD = 8  # from Q25

    modelName = model_by_question[qNum]

    if modelName == 'KNNWithMeans':
        model = KNNWithMeans(k_KNNWithMeans, sim_options=sim_options)
    elif modelName == 'NMF':
        model = NMF(n_factors=k_NMF)
    else:
        model = SVD(n_factors=k_SVD)

    # sweep t from 1 to 25
    precision_arr = []
    recall_arr = []
    kf = KFold(n_splits=10)
    for t in range(1, 26):
        # Collected across ALL folds of this t.  The original reset its
        # accumulators inside the fold loop, so only the last fold counted.
        fold_precisions = []
        fold_recalls = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            # Average over users within this fold.
            fold_precision = sum(precisions.values()) / len(precisions)
            fold_recall = sum(recalls.values()) / len(recalls)
            print(fold_precision)
            print(fold_recall)
            fold_precisions.append(fold_precision)
            fold_recalls.append(fold_recall)
        precision_arr.append(np.mean(fold_precisions))
        recall_arr.append(np.mean(fold_recalls))

    t_list = list(range(1, 26))

    print("model name: ", modelName)

    # precision vs t
    title_ = "precision vs t for: " + modelName
    make_plot(t_list, [[precision_arr, 'mean precisions']],
              'recommended item size t',
              'Precision',
              title=title_)
    # recall vs t
    title_ = "recall vs t for: " + modelName
    make_plot(t_list, [[recall_arr, 'mean recalls']],
              'recommended item size t',
              'Recall',
              title=title_)
    # precision vs recall
    title_ = "precision vs recall for: " + modelName

    plt.plot(recall_arr, precision_arr, label=modelName)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend()
    plt.grid()
    plt.title(title_)
    plt.show()

    return precision_arr, recall_arr

# Q10
# Load the ratings in the format Surprise expects: comma-separated
# "user item rating" rows, skipping the first line of the file, with
# ratings declared on a 0.5-5 scale.
file_path = os.path.expanduser('ratings.csv')
reader = Reader(line_format='user item rating',
                sep=',',
                skip_lines=1,
                rating_scale=(0.5, 5))
data = Dataset.load_from_file(file_path, reader=reader)

# Row 0 holds the mean RMSE and row 1 the mean MAE; one column for each of
# the 50 values of k swept below.
acc_cv = np.zeros((2, 50))
sim_options = {'name': 'pearson'}
i = 0  # column index into acc_cv
for k in range(2, 101, 2):
    algo = KNNWithMeans(k=k, sim_options=sim_options)
    cv1 = cross_validate(algo,
                         data,
                         measures=['RMSE', 'MAE'],
                         cv=10,
                         verbose=False)
    acc_cv[0, i] = np.mean(cv1['test_rmse'])
    acc_cv[1, i] = np.mean(cv1['test_mae'])
    print('test_rmse = %f, test_mae = %f' % (acc_cv[0, i], acc_cv[1, i]))
    i = i + 1
pass
# The k values matching the columns of acc_cv.
ks = np.arange(2, 101, 2)

plt.xlabel('k')
plt.ylabel('Error value')
plt.title('Test RMSE and MAE vs k in KNN with 10 Validation')
                 label='Threshold: %.1f, AUC: %.4f' % (threshold, auc_score),
                 linewidth=2)

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.title('ROC Curves for {}-based Collaborative Filter'.format(method),
              fontweight="bold")
    plt.show()


# In[29]:

# Fit the k-NN model with the previously selected k on a fixed 90/10 split
# (random_state=42) and plot ROC curves from its test predictions.
trainset, testset = train_test_split(R, test_size=0.1, random_state=42)
knn_best = KNNWithMeans(k=knn_best_k,
                        sim_options={'name': 'pearson'},
                        verbose=False)
knn_best.fit(trainset)
knn_best_pred = knn_best.test(testset)

plot_roc_curves(testset, knn_best_pred, 'KNN')

# # PART 2 - Model-based Collaborative Filtering

# ## Non-Negative Matrix Factorization

# <font size=4>**Question 17:** Design a NNMF-based collaborative filter to predict the ratings of the movies in the MovieLens dataset and evaluate its performance using 10-fold cross-validation. Sweep k (number of latent factors) from 2 to 50 in step sizes of 2, and for each k compute the average RMSE and average MAE obtained by averaging the RMSE and MAE across all 10 folds. Plot the average RMSE (Y-axis) against k (X-axis) and the average MAE (Y-axis) against k (X-axis). For solving this question, use the default value for the regularization parameter.</font>

# In[30]:

import numpy as np
Exemplo n.º 27
0
# Load the comma-separated ratings_unpopular.csv file.
file_path = os.path.expanduser('ml-latest-small/ratings_unpopular.csv')
reader = Reader(sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
# data = Dataset.load_builtin('ml-100k')

sim_options = {'name': 'pearson', 'user_based': True}

# Fold-averaged errors, index-aligned with all_k.
avg_rmse = []
avg_mae = []
all_k = []

# Sweep every even k from 2 to 100 with 10-fold cross-validation.
for i in range(2, 102, 2):
    print('k = ', i)
    all_k.append(i)

    algo = KNNWithMeans(k=i, sim_options=sim_options)
    output = cross_validate(algo,
                            data,
                            measures=['RMSE', 'MAE'],
                            cv=10,
                            verbose=True,
                            n_jobs=1)
    avg_rmse.append(np.mean(output['test_rmse']))
    avg_mae.append(np.mean(output['test_mae']))

# Report the k value itself.  The original printed the list position of the
# minimum, which is not the k it was labelled as.
print("min rmse k:", all_k[avg_rmse.index(min(avg_rmse))])
print("min rmse:", min(avg_rmse))
print("min mae k:", all_k[avg_mae.index(min(avg_mae))])
print("min mae:", min(avg_mae))

plt.plot(all_k, avg_rmse)
# Drop the timestamp column before building the Surprise dataset from the
# dataframe.
RS_ratings = ratings.drop(columns='timestamp')
# NOTE(review): only rating_scale looks relevant when loading from a
# dataframe; the file-parsing options are presumably ignored -- confirm.
RS_reader = Reader(name=None,
                   line_format='user item rating',
                   sep=',',
                   rating_scale=(1, 5),
                   skip_lines=0)
RS_data = Dataset.load_from_df(RS_ratings, RS_reader)

# Benchmark_Algorithm_Metric
# Each algorithm below is evaluated with default settings.
benchmark = []
for algorithm in [
        BaselineOnly(),
        CoClustering(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        NMF(),
        NormalPredictor(),
        SlopeOne(),
        SVD(),
        SVDpp()
]:
    # Perform 5-fold cross validation on four accuracy measures
    results = cross_validate(algorithm,
                             RS_data,
                             measures=['rmse', 'mae', 'mse', 'fcp'],
                             cv=5,
                             verbose=True)
    # Average every column of the results over the folds
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
Exemplo n.º 29
0
    plt.savefig('plot/q15_knn_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    # Rating thresholds used to binarise the true ratings, one ROC plot each.
    threshold = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    sim_options = {'name': 'pearson', 'user_based': True}

    trainset, testset = train_test_split(data, test_size=0.1)

    # The model and its predictions do not depend on the threshold, so fit
    # and predict once instead of once per threshold.
    algo = KNNWithMeans(k=34, sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # row[2] is compared with the threshold and row[3] is used as the score.
    y_estimate = [row[3] for row in predictions]

    for th in threshold:
        y_true = [1 if row[2] >= th else 0 for row in predictions]
        plot_roc(y_true, y_estimate, th)
def get_top_t(predictions, t=10):
    """Group predictions by user and keep each user's ``t`` best estimates.

    Args:
        predictions: Iterable of 5-tuples
            ``(uid, iid, true_rating, estimate, details)``.
        t: Maximum number of entries kept per user.

    Returns:
        ``defaultdict(list)`` mapping every user id to a list of
        ``(iid, estimate, true_rating)`` tuples sorted by estimate in
        descending order and truncated to ``t`` entries.  Entries with equal
        estimates keep their input order.
    """
    per_user = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        per_user[uid].append((iid, est, true_r))

    # Rank each user's entries by estimated rating and keep the best t.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda entry: entry[1],
                        reverse=True)
        per_user[uid] = ranked[:t]

    return per_user


# Fit a Pearson k-NN model (k=20) on a fixed 90/10 split (random_state=0)
# and collect each user's top-ranked test predictions (default t=10).
train_set, test_set = train_test_split(data, test_size=0.1, random_state=0)
algo = KNNWithMeans(k=20, sim_options={'name': 'pearson'})
algo.fit(train_set)
predictions = algo.test(test_set)
top_recos = get_top_t(predictions)


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])