def train_trim_nmf(data, R):
    """Sweep NMF factor counts and score three trimmed test sets per fold.

    For every even ``k`` from 2 to 50 an ``NMF(n_factors=k)`` model is
    evaluated with 10-fold cross-validation. Each fold's test set is passed
    to ``trim(testset, R)``, which must return three test sets; the model is
    scored (RMSE) on each of them separately.

    Args:
        data: dataset handed to ``KFold.split``.
        R: forwarded unchanged as the second argument of ``trim``.

    Returns:
        A list of three lists. Entry ``i`` holds, for each ``k`` in sweep
        order, the mean fold RMSE on the ``i``-th test set returned by
        ``trim``.
        NOTE(review): the original local names (p/u/hv) suggest
        popular / unpopular / high-variance — confirm against ``trim``.
    """
    splitter = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for n_factors in range(2, 52, 2):
        print("using k = %d" % n_factors)
        # One score list per trimmed test set, refilled for every k.
        fold_scores = ([], [], [])
        model = NMF(n_factors=n_factors)
        for train_fold, test_fold in splitter.split(data):
            model.fit(train_fold)
            first_set, second_set, third_set = trim(test_fold, R)

            # Predict on all three sets before scoring any of them.
            fold_predictions = [
                model.test(subset)
                for subset in (first_set, second_set, third_set)
            ]

            for scores, predictions in zip(fold_scores, fold_predictions):
                scores.append(accuracy.rmse(predictions))
        for averages, scores in zip(rmse_list, fold_scores):
            averages.append(np.mean(scores))
    print("NMF with trim is finished!!")
    return rmse_list
def nmf_compute_high_var_trim_rmse(k):
    """Return the mean 10-fold RMSE of NMF with ``k`` factors on trimmed folds.

    Each fold's test set is filtered through
    ``high_variance_trimming(testset, frequency, variance)`` before scoring.
    ``R``, ``frequency`` and ``variance`` are read from the enclosing scope.
    The mean is also printed together with ``k``.
    """
    model = NMF(n_factors=k, random_state=42)
    splitter = KFold(n_splits=10, random_state=42)
    fold_rmse = []
    for train_fold, test_fold in splitter.split(R):
        model.fit(train_fold)
        trimmed_fold = high_variance_trimming(test_fold, frequency, variance)
        predictions = model.test(trimmed_fold)
        fold_rmse.append(accuracy.rmse(predictions, verbose=False))
    mean_rmse = np.mean(fold_rmse)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def nmf_compute_prec_rec(t):
    """Return (mean precision, mean recall) of NMF over 10 folds for ``t``.

    A fresh ``NMF(n_factors=nmf_best_k)`` model is fitted per fold. The
    fold's test set is filtered with ``trim_unpopular_user(testset, t,
    threshold)`` and the predictions are scored by
    ``calculate_precision_recall(pred, t, threshold)``, which must return two
    dicts; the values of each dict are averaged per fold, and those fold
    averages are averaged again for the result.
    ``R``, ``nmf_best_k`` and ``threshold`` come from the enclosing scope.
    """
    fold_precisions = []
    fold_recalls = []
    splitter = KFold(n_splits=10, random_state=42)
    for train_fold, test_fold in splitter.split(R):
        model = NMF(n_factors=nmf_best_k, random_state=42)
        model.fit(train_fold)
        kept_fold = trim_unpopular_user(test_fold, t, threshold)
        predictions = model.test(kept_fold)

        precision_by_key, recall_by_key = calculate_precision_recall(
            predictions, t, threshold)
        fold_precisions.append(np.mean(list(precision_by_key.values())))
        fold_recalls.append(np.mean(list(recall_by_key.values())))
    mean_precision = np.mean(fold_precisions)
    mean_recall = np.mean(fold_recalls)
    return mean_precision, mean_recall
예제 #4
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_trim_filter(ratings, dims, func, mv_dict):
    """Cross-validate NMF over several factor counts and plot RMSE and MAE.

    For every entry of ``dims`` an ``NMF`` model is evaluated with 10-fold
    cross-validation. ``func(mv_dict, testset)`` is called on each fold's
    test set before prediction. The mean RMSE and MAE per factor count are
    printed and plotted, and the factor counts with the lowest mean RMSE and
    lowest mean MAE are reported.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.
        dims: sequence of latent-factor counts to try.
        func: callable invoked as ``func(mv_dict, testset)``; its return
            value is ignored.
        mv_dict: forwarded unchanged as the first argument of ``func``.

    Returns:
        None. Results are printed and shown in a plot.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # None means "nothing recorded yet". A False/0 sentinel tested with
    # `not` would treat a legitimate minimum of 0.0 as unset.
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)

    for idx, n_factors in enumerate(dims):
        nmf = NMF(n_factors=n_factors, random_state=42)
        fold_rmse = []
        fold_mae = []
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            # NOTE(review): the return value of func is discarded, so the
            # trimming only takes effect if func modifies testset in place —
            # confirm against the trimming helpers.
            func(mv_dict, testset)
            pred = nmf.test(testset)
            fold_rmse.append(accuracy.rmse(pred, verbose=False))
            fold_mae.append(accuracy.mae(pred, verbose=False))

        RMSE[idx] = np.mean(fold_rmse)
        if min_RMSE is None or RMSE[idx] < min_RMSE:
            min_RMSE = RMSE[idx]
            fac_num_RMSE = n_factors

        MAE[idx] = np.mean(fold_mae)
        if min_MAE is None or MAE[idx] < min_MAE:
            min_MAE = MAE[idx]
            fac_num_MAE = n_factors

        print('For k = %i :' % n_factors)
        print('RMSE: ', RMSE[idx])
        print('MAE: ', MAE[idx])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
예제 #5
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q19to21(qNum):
    """Return the mean 10-fold RMSE of NMF on a trimmed test set per k.

    For every even ``k`` from 2 to 18 an ``NMF(n_factors=k)`` model is
    cross-validated. Each fold's test set is filtered by the trimming
    function selected through ``qNum`` before it is scored.

    Args:
        qNum: key into the trimming table (12: popular, 13: unpopular,
            14: high variance).
            NOTE(review): the function name suggests callers may pass
            19-21, which would raise KeyError — confirm with callers.

    Returns:
        A list with one mean RMSE per ``k``; ``nan`` for a ``k`` whose folds
        were all empty after trimming.

    Raises:
        KeyError: if ``qNum`` is not a key of the trimming table.
    """
    data = load_data()
    kf = KFold(n_splits=10)

    trimFun = {12: popularTrim, 13: unpopularTrim, 14: highVarTrim}
    RMSE = []
    for k in range(2, 20, 2):
        # The sweep variable must reach the model, otherwise every
        # iteration evaluates the same default configuration.
        nmf = NMF(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            nmf.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d" % nTest)
            if nTest == 0:
                # Nothing left to score in this fold; skipping avoids a
                # division by zero below.
                continue
            predictions = nmf.test(testSet)
            squared_error = sum(pow(p.est - p.r_ui, 2) for p in predictions)
            subRMSE.append(np.sqrt(squared_error / len(predictions)))
        # Average over all train-test splits for this k.
        RMSE.append(np.mean(subRMSE) if subRMSE else float('nan'))
    return RMSE
def use_nmf():
    """Fit default NMF on the full ml-100k data and time the evaluation.

    The model is trained on the full trainset and then scored on the
    trainset's anti-testset.
    NOTE(review): per the Surprise docs an anti-testset contains only
    user/item pairs absent from the trainset — confirm that scoring against
    it is the intended metric.

    Returns:
        A list ``[rmse, mae, elapsed_seconds]``.
    """
    started_at = time.time()

    dataset = Dataset.load_builtin('ml-100k')
    full_trainset = dataset.build_full_trainset()

    print('Using NMF')
    model = NMF()
    model.fit(full_trainset)

    anti_testset = full_trainset.build_anti_testset()
    predictions = model.test(anti_testset)

    rmse_score = accuracy.rmse(predictions)
    mae_score = accuracy.mae(predictions)

    elapsed_seconds = time.time() - started_at
    return [rmse_score, mae_score, elapsed_seconds]
예제 #7
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_bin_pre(ratings, ts, nmf_fac, thrd):
    """Train NMF on a random split and return binarised truth plus estimates.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.
        ts: passed as ``test_size`` to ``train_test_split``.
        nmf_fac: number of latent factors for the NMF model.
        thrd: true ratings ``>= thrd`` are labelled 1.0, ratings ``< thrd``
            are labelled 0.0.

    Returns:
        ``(bi_rating, pred_rating)``: two arrays with one entry per test
        prediction — the binary labels and the values taken from index 3 of
        each prediction.
    """
    rating_reader = Reader(rating_scale=(0.0, 5.0))
    rating_columns = ratings[['userId', 'movieId', 'rating']]
    dataset = Dataset.load_from_df(rating_columns, rating_reader)
    train_part, test_part = train_test_split(dataset, test_size=ts)
    model = NMF(n_factors=nmf_fac, random_state=42)
    model.fit(train_part)
    predictions = model.test(test_part)

    # Index 2 of a prediction is read as the true rating and index 3 as the
    # estimate.
    true_rating = np.array([p[2] for p in predictions], dtype=float)
    pred_rating = np.array([p[3] for p in predictions], dtype=float)

    # Fill the label array through the two complementary masks.
    bi_rating = np.empty(len(predictions))
    bi_rating[true_rating >= thrd] = 1.0
    bi_rating[true_rating < thrd] = 0.0

    return bi_rating, pred_rating
예제 #8
0
    plt.savefig('plot/q22_nmf_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    # Plot one ROC curve per "liked" threshold for an NMF model trained on a
    # single 90/10 split of the ratings file.
    threshold = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Not used by the NMF model below; kept because it is a module-level name.
    sim_options = {'name': 'pearson', 'user_based': True}

    trainset, testset = train_test_split(data, test_size=0.1)

    # The fit and the predictions do not depend on the threshold, so they are
    # computed once. This saves three redundant fits and makes every curve
    # describe the same model (NMF is created without a fixed random_state).
    algo = NMF(n_factors=16)
    algo.fit(trainset)
    predictions = algo.test(testset)

    for th in threshold:
        # Index 2 of a prediction is read as the true rating, index 3 as the
        # estimate. Fresh lists are built for every call to plot_roc.
        y_true = [1 if row[2] >= th else 0 for row in predictions]
        y_estimate = [row[3] for row in predictions]

        plot_roc(y_true, y_estimate, th)
print("=================================Optimal Number of Latent Factors=============================================================")
# Collect the distinct genre names: every entry of movies.genres is a
# '|'-separated string, so join them all and split once.
all_genres = set()
all_genres.update('|'.join(movies.genres).split('|'))
print('#of Genres - ', len(all_genres))

print("===================================NNMF collaborative filtering on popular movie trimmed set=================================================")
# Sweep even factor counts 2..50; for each, average the RMSE of 10 folds
# whose test sets were filtered by popular_trim.
kf = KFold(n_splits=10)
k_range = range(2, 51, 2)
avg_rmse = []

for k in k_range:
    algo = NMF(n_factors=k)

    k_rmse = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimmed_testset = popular_trim(testset)
        predictions = algo.test(trimmed_testset)
        fold_rmse = accuracy.rmse(predictions, verbose=False)
        k_rmse.append(fold_rmse)

    avg_rmse.append(np.mean(k_rmse))

# Report the best factor count, then plot the whole sweep.
best_position = np.argmin(avg_rmse)
print('Minimum average RMSE is ', min(avg_rmse), ' for k = ', k_range[best_position])
plt.plot(k_range, avg_rmse)
plt.title('#latent factors vs Average RMSE for NMF Popular trimming')
plt.xlabel('Number of latent factors', fontsize=15)
plt.ylabel('Average RMSE', fontsize=15)
plt.show()

print("========================NNMF collaborative filtering on unpopular movie trimmed set==================================")
# Reset the sweep state for the next experiment: a fresh 10-fold splitter,
# the same even factor counts 2..50, and an empty result list.
kf = KFold(n_splits=10)
k_range = range(2, 51, 2)
avg_rmse = []
예제 #10
0
fig3.savefig(path + 'fig/Part_8_knn_preVSrec.png')

# Model under evaluation: NMF with a fixed number of latent factors.
k_min_rmse = 18
nnmf = NMF(n_factors=k_min_rmse, random_state=1)

# For every list length top_t (1..25): score each fold, average the
# per-key precision/recall within the fold, then average across folds.
# NOTE(review): the fit and the predictions do not depend on top_t but are
# recomputed for every value; hoisting them is only safe if
# threshold_rank_filter leaves its inputs unmodified — confirm.
top_t_list = range(1, 26)
pre_list_nnmf, rec_list_nnmf = [], []
for top_t in top_t_list:
    pre, rec = 0, 0
    for trainset, testset in kf.split(data_raw):
        nnmf.fit(trainset)
        prediction = nnmf.test(testset)
        G = create_dict(testset)
        G_s = create_dict(prediction, if_pred=1)
        R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t)
        # One (precision, recall) pair per key of R for this fold.
        key_scores = [precision_recall(R[key], R_s[key]) for key in R.keys()]
        pre_fold = sum(score[0] for score in key_scores)
        rec_fold = sum(score[1] for score in key_scores)
        pre += pre_fold / len(R)
        rec += rec_fold / len(R)

    pre_list_nnmf.append(pre / num_fold)
    rec_list_nnmf.append(rec / num_fold)
예제 #11
0
# Plot average precision against average recall for the kNN filter.
plt.figure()
plt.title("kNN: Avg Precision vs Avg Recall with 10 fold CV")
plt.xlabel("Avg Recall")
plt.ylabel("Avg Precision")
plt.plot(knn_recall, knn_prec)
# `block` is keyword-only in current Matplotlib, so the positional form
# plt.show(0) raises TypeError there. block=False keeps the same
# non-blocking behaviour.
plt.show(block=False)
"""
Question 37
"""
# For every t in ts: cross-validate an unbiased 20-factor NMF model and
# average the per-fold mean precision and recall over kf.n_splits.
nmf_prec, nmf_recall = [], []
for t in ts:
    fold_precisions, fold_recalls = [], []
    nmf = NMF(n_factors=20, biased=False)
    for trainset, testset in kf.split(data):
        nmf.fit(trainset)
        pred = nmf.test(testset)
        precision, recall = calc_precision_recall(pred, int(t), threshold)
        fold_precisions.append(np.mean(list(precision.values())))
        fold_recalls.append(np.mean(list(recall.values())))
    # Divide by the configured number of splits, as the running totals did.
    precision_avg = sum(fold_precisions, 0.0) / kf.n_splits
    recall_avg = sum(fold_recalls, 0.0) / kf.n_splits
    print(
        f"NMF t: {t}, precision_avg: {precision_avg}, recall_avg: {recall_avg}"
    )
    nmf_prec.append(precision_avg)
    nmf_recall.append(recall_avg)

# New figure; select the first cell of a 2-row, 1-column subplot grid and
# label it for the precision-vs-t curve.
plt.figure()
plt.subplot(2, 1, 1)
plt.ylabel("Avg Precision")
plt.title("NMF: Avg Precision vs t with 10 fold CV")
# In[42]:

# Smallest average RMSE seen in the high-variance trimming sweep.
lowest_high_var_rmse = np.min(nmf_rmse_high_var_trim)
print("Minimum average RMSE after high variance movie trimming: %.4f" %
      lowest_high_var_rmse)

# <font size=4>**Question 22:** Plot the ROC curves for the NNMF-based collaborative filter designed in Question 17 for threshold values [2.5,3,3.5,4]. For the ROC plotting use the optimal number of latent factors found in Question 18. For each of the plots, also report the area under the curve (AUC) value.</font>

# In[43]:

# Factor count at the position of the lowest value in nmf_rmse.
best_k_position = np.argmin(nmf_rmse)
nmf_best_k = ks[best_k_position]

# Single 90/10 split with a fixed seed; train the best model and predict.
trainset, testset = train_test_split(R, test_size=0.1, random_state=42)
nmf_best = NMF(n_factors=nmf_best_k, random_state=42)
nmf_best.fit(trainset)
nmf_best_pred = nmf_best.test(testset)

plot_roc_curves(testset, nmf_best_pred, 'NNMF')

# <font size=4>**Question 23:** Perform Non-negative matrix factorization on the ratings matrix R to obtain the factor matrices U and V , where U represents the user-latent factors interaction and V represents the movie-latent factors interaction (use k = 20). For each column of V , sort the movies in descending order and report the genres of the top 10 movies. Do the top 10 movies belong to a particular or a small collection of genre? Is there a connection between the latent factors and the movie genres?</font>

# In[44]:

# Train a 20-factor NMF model on every available rating (no hold-out).
full_trainset = R.build_full_trainset()
nmf_k20 = NMF(n_factors=20, random_state=42)
nmf_k20.fit(full_trainset)

# In[45]:

# The fitted model's `qi` attribute, kept for the latent-factor inspection.
item_factors = nmf_k20.qi

# In[46]:
def _mean_nmf_rmse_per_k(kept_movies, x_axis):
    """Return the mean 10-fold NMF RMSE for every factor count in ``x_axis``.

    Each fold's test set is restricted to the ratings whose movie id
    (index 1 of the test tuple) is in ``kept_movies`` before scoring.
    Reads ``data`` from the enclosing module scope.
    """
    kf = KFold(n_splits=10)
    mean_scores = []
    for i in x_axis:
        algo = NMF(i, verbose=False)
        accu = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            test_trim = [x for x in testset if x[1] in kept_movies]
            predictions = algo.test(test_trim)
            accu.append(accuracy.rmse(predictions, verbose=True))
        mean_scores.append(np.mean(accu))
    return mean_scores


def problems_19_20_21_rmse_pop_unpop_hv():
    """Plot mean NMF RMSE vs. factor count on three trimmed test sets.

    Movies are grouped from ``data.raw_ratings`` into:
      * popular: more than 2 ratings,
      * unpopular: at most 2 ratings,
      * high variance: at least 5 ratings and rating variance >= 2.

    For each group, NMF is cross-validated for every even factor count from
    2 to 50 with the test folds restricted to that group, and the mean RMSE
    curve is plotted via ``plotgraphs``. The high-variance curve is also
    written to ``rmse_highvar_store_21.csv``.
    """
    x_axis = range(2, 52, 2)

    # Group the raw ratings by movie: index 1 of a raw rating is used as the
    # movie id and index 2 as the rating value.
    ratings = {}
    for r in data.raw_ratings:
        ratings.setdefault(r[1], []).append(r[2])

    # Sets give O(1) membership tests. The trimming comprehension checks
    # every test rating of every fold against these collections, which was
    # a linear scan per rating when they were lists.
    popular_movies = {x for x in ratings if len(ratings[x]) > 2}
    unpopular_movies = {x for x in ratings if len(ratings[x]) <= 2}

    rmse_popular_store = _mean_nmf_rmse_per_k(popular_movies, x_axis)

    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies', 'q19_rmse_popular_movies.png')
    plotgraphs(x_axis, rmse_popular_store, 'K', 'Mean RMSE scores',
               'Plot of popular movies')

    rmse_unpopular_store = _mean_nmf_rmse_per_k(unpopular_movies, x_axis)

    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies', 'q20_rmse_unpopular_movies.png')
    plotgraphs(x_axis, rmse_unpopular_store, 'K', 'Mean RMSE scores',
               'Plot of unpopular movies')

    # Rating variance per movie id.
    movie_var = {}
    for k in ratings:
        movie_var[k] = np.var(ratings[k])

    highvar_movies = {
        x for x in ratings if len(ratings[x]) >= 5 and movie_var[x] >= 2
    }

    rmse_highvar_store = _mean_nmf_rmse_per_k(highvar_movies, x_axis)

    pd.DataFrame(rmse_highvar_store).to_csv("rmse_highvar_store_21.csv")
    plotgraphs(x_axis, rmse_highvar_store, 'K', 'Mean RMSE scores',
               'Plot of high variance movies', 'q21_rmse_high_var_movies.png')