Example #1
def get_results(setNum, reg_term):
    reader = Reader(rating_scale = (0,10))
    train = pd.read_csv('../data/train_'+str(setNum)+'.csv', sep = ';')
    # test = pd.read_csv('../data/test_update.csv', sep = ';')
    train_set = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
    # test_set = Dataset.load_from_df(test[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
    data = train_set.build_full_trainset()

    num_factors = 50
    if setNum == 10:
        num_factors_b = 200
    elif setNum == 15:
        num_factors_b = 400
    elif setNum == 30:
        num_factors_b = 600
    else:
        raise ValueError('setNum must be 10, 15 or 30')

    svd = SVD(n_factors = num_factors, reg_all=reg_term)
    svd_bias = SVD(n_factors = num_factors_b, biased=True, reg_all=reg_term)
    baseline = DumbBaseline()

    cv_svd = cross_validate(svd, train_set, n_jobs = -2, return_train_measures=True)
    cv_svd_bias = cross_validate(svd_bias, train_set, n_jobs = -2, return_train_measures=True)
    cv_baseline = cross_validate(baseline, train_set, n_jobs = -2, return_train_measures=True)

    # getting the results ready to plot
    val_res = [np.mean(cv_svd['test_rmse']), np.mean(cv_svd_bias['test_rmse']),np.mean(cv_baseline['test_rmse'])]
    train_res = [np.mean(cv_svd['train_rmse']), np.mean(cv_svd_bias['train_rmse']),np.mean(cv_baseline['train_rmse'])]
    val_err = [np.std(cv_svd['test_rmse']), np.std(cv_svd_bias['test_rmse']),np.std(cv_baseline['test_rmse'])]
    train_err = [np.std(cv_svd['train_rmse']), np.std(cv_svd_bias['train_rmse']),np.std(cv_baseline['train_rmse'])]
    algs = ['MF (k='+str(num_factors)+')', 'MF With Bias (k='+str(num_factors_b)+')', 'Baseline']

    return val_res,train_res,val_err,train_err,algs
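Example #1 also cross-validates a custom DumbBaseline that is not shown. A minimal sketch, assuming it simply predicts the training set's global mean rating:

from surprise import AlgoBase

class DumbBaseline(AlgoBase):
    """Hypothetical baseline: predict the global mean rating for every pair."""

    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        return self.trainset.global_mean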
Example #2
def svd_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          phase='eval'):
    """
    pmf_ratings Predicates
    """
    print("SVD predicates")
    svd_model = SVD()
    reader = Reader(rating_scale=(0.2, 1))
    train_dataset = Dataset.load_from_df(df=observed_ratings_df.reset_index(
    ).loc[:, ['userId', 'movieId', 'rating']],
                                         reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index,
                               columns=['rating'])

    for row in truth_ratings_df.loc[:, ['rating']].iterrows():
        uid = row[0][0]
        iid = row[0][1]
        predictions.loc[(uid, iid), 'rating'] = svd_model.predict(uid, iid).est

    write(predictions, 'svd_rating_obs', fold, phase)
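A usage sketch for the predicate above, assuming both DataFrames are indexed by (userId, movieId) with a 'rating' column on the normalized 0.2-1 scale, and that write() is an external helper that dumps the predictions for the given fold and phase:

import pandas as pd

obs_idx = pd.MultiIndex.from_tuples([(1, 10), (1, 20), (2, 10)],
                                    names=['userId', 'movieId'])
observed = pd.DataFrame({'rating': [0.8, 0.6, 1.0]}, index=obs_idx)
truth_idx = pd.MultiIndex.from_tuples([(2, 20)], names=['userId', 'movieId'])
truth = pd.DataFrame({'rating': [0.4]}, index=truth_idx)

svd_ratings_predicate(observed, truth, fold='0', phase='eval')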
def mfb_compute_high_var_trim_rmse(k):
    mfb = SVD(n_factors=k, random_state=42)
    rmse = []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        mfb.fit(trainset)
        testset_trimmed = high_variance_trimming(testset, frequency, variance)
        pred = mfb.test(testset_trimmed)
        rmse.append(accuracy.rmse(pred, verbose=False))
    print('k: %s | RMSE: %f' % (k, np.mean(rmse)))
    return np.mean(rmse)
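high_variance_trimming, frequency and variance come from the surrounding script and are not shown. A hypothetical implementation, assuming frequency and variance map each movie id to its rating count and rating variance over the full dataset:

def high_variance_trimming(testset, frequency, variance):
    # Hypothetical helper: keep only ratings of movies with at least 5 ratings
    # overall and a rating variance of at least 2.
    return [(uid, iid, r) for (uid, iid, r) in testset
            if frequency.get(iid, 0) >= 5 and variance.get(iid, 0.0) >= 2.0]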
Example #4
def svd_algorithm() -> SVD:
    user_input = input(
        'Do you want to continue with the default parameters? Y/N')
    if user_input.lower() == 'y':
        return SVD()
    else:
        n_factors = int(input('Enter total number of factors: '))
        n_epochs = int(input('Enter number of epochs: '))
        lr_all = float(
            input('Enter the learning rate for all the parameters: '))
        # Pass keyword arguments: positionally, SVD's third parameter is
        # `biased`, not `lr_all`.
        return SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all)
def mfb_compute_prec_rec(t):
    precision, recall = [], []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        mfb = SVD(n_factors=mfb_best_k, random_state=42)
        mfb.fit(trainset)
        trimmed_testset = trim_unpopular_user(testset, t, threshold)
        pred = mfb.test(trimmed_testset)

        precision_dict, recall_dict = calculate_precision_recall(
            pred, t, threshold)
        precision.append(np.mean([prec for prec in precision_dict.values()]))
        recall.append(np.mean([rec for rec in recall_dict.values()]))
    return np.mean(precision), np.mean(recall)
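calculate_precision_recall and trim_unpopular_user are project helpers not shown here. A sketch of the former, following the precision/recall-at-k recipe from the Surprise FAQ, where t is the recommendation list size and a true rating of at least threshold counts as relevant:

from collections import defaultdict

def calculate_precision_recall(predictions, k, threshold):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for est, true_r in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls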
Example #6
def MF_trim_filter(ratings, dims, func, mv_dict):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = False
    min_MAE = False
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)

    for k in range(len(dims)):
        svd = SVD(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            svd.fit(trainset)
            full_data = trainset.build_testset() + testset  # currently unused
            # func is expected to trim testset in place (see the sketch after
            # this example)
            func(mv_dict, testset)
            pred = svd.test(testset)
            test_rmse = np.append(test_rmse, accuracy.rmse(pred,
                                                           verbose=False))
            test_mae = np.append(test_mae, accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if ((not min_RMSE) or RMSE[k] < min_RMSE):
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]

        MAE[k] = np.mean(test_mae)
        if ((not min_MAE) or MAE[k] < min_MAE):
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finished plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum average MAE is ', min_MAE)
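The func argument in Example #6 is expected to trim testset in place. A hypothetical trimming function matching that calling convention, assuming mv_dict maps each movieId to the list of all its ratings:

def popularity_trim_inplace(mv_dict, testset):
    # Hypothetical in-place trim: drop test ratings of movies that received
    # no more than 2 ratings in the full dataset.
    testset[:] = [(uid, iid, r) for (uid, iid, r) in testset
                  if len(mv_dict.get(iid, [])) > 2]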
Example #7
def Q15and22and29(qNum, bestK, thres=[2.5, 3, 3.5, 4]):
    rating_range = 5.0  # maximum rating, used to normalize scores; avoids shadowing built-in range
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(bestK, sim_options=sim_options)
    elif qNum == 22:
        model = NMF(n_factors=bestK)
    else:
        model = SVD(n_factors=bestK)

    model.fit(trainset)
    pred = model.test(testset)
    for thrs in thres:
        np_true = np.array([])
        np_score = np.array([])
        for u, i, t, p, d in pred:
            if t >= thrs:
                t = 1
            else:
                t = 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p / rating_range)
        title = 'Threshold ' + str(thrs)
        plot_ROC(np_true, np_score, title=title)
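plot_ROC is another external helper. A minimal sketch using scikit-learn, consistent with the call above:

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

def plot_ROC(y_true, y_score, title='ROC'):
    # Hypothetical plotting helper: one ROC curve plus the chance diagonal.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label='AUC = %0.2f' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()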
Example #8
    def slot_select_algo_combobox(self):
        self.algo_change_flag = True
        self.algo_trained_flag = False
        algo_name = self.select_algo_comboBox.currentText()
        if algo_name == 'SVD':
            self.algo = SVD()
            self.display_process_label.append('加载SVD模型...')
        elif algo_name == 'SVD++':
            self.algo = SVDpp()
            self.display_process_label.append('加载SVD++模型...')
        elif algo_name == 'NMF':
            self.algo = NMF()
            self.display_process_label.append('加载NMF模型...')
        elif algo_name == 'Slope One':
            self.algo = SlopeOne()
            self.display_process_label.append('加载Slope One模型...')
        elif algo_name == 'k-NN':
            self.algo = KNNBasic()
            self.display_process_label.append('加载k-NN模型...')
        elif algo_name == 'Centered k-NN':
            self.algo = KNNWithMeans()
            self.display_process_label.append('加载Centered k-NN模型...')
        elif algo_name == 'k-NN Baseline':
            self.algo = KNNBaseline()
            self.display_process_label.append('加载k-NN Baseline模型...')
        elif algo_name == 'Co-Clustering':
            self.algo = CoClustering()
            self.display_process_label.append('加载Co-Clustering模型...')
        elif algo_name == 'Baseline':
            self.algo = BaselineOnly()
            self.display_process_label.append('加载Baseline模型...')
        elif algo_name == 'Random':
            self.algo = NormalPredictor()
            self.display_process_label.append('加载Random模型...')
    def recommand(self):
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0  # no shrinkage
        }
        best_model = knns.KNNWithMeans(k=20, sim_options=sim_options)
        t_values, precisions_knn, recall_knn = self.test_with_t_and_k(
            best_model, msg='KNN')

        best_model = matrix_factorization.NMF(n_factors=20, biased=False)
        t_values, precisions_nmf, recall_nmf = self.test_with_t_and_k(
            best_model, msg='NMF')

        best_model = SVD(20)
        t_values, precisions_svd, recall_svd = self.test_with_t_and_k(
            best_model, msg='SVD')

        plt.plot(t_values, precisions_knn, label='precisions_knn')
        plt.plot(t_values, precisions_nmf, label='precisions_nmf')
        plt.plot(t_values, precisions_svd, label='precisions_svd')
        plt.plot(t_values, recall_knn, label='recall_knn')
        plt.plot(t_values, recall_nmf, label='recall_nmf')
        plt.plot(t_values, recall_svd, label='recall_svd')
        plt.xlabel('t_value')
        plt.ylabel('percent')
        plt.legend(loc="best")
        plt.show()
def Question24(data):
    ks = range(2, 51, 2)
    RMSE = []
    MAE = []
    for k in ks:
        model = SVD(n_factors=k)
        pred = cross_validate(model, data, cv=10)
        RMSE.append(np.mean(pred['test_rmse']))
        MAE.append(np.mean(pred['test_mae']))

    # Plot
    plt.plot(ks, RMSE)
    plt.xlabel('k')
    plt.ylabel('Average RMSE')
    plt.savefig('Q24_RMSE.png')
    plt.figure()
    plt.plot(ks, MAE)
    plt.xlabel('k')
    plt.ylabel('Average MAE')
    plt.savefig('Q24_MAE.png')

    index = np.argmin(RMSE)
    print("Best k: %i" % ks[index])
    print("Lowest RMSE: %f" % RMSE[index])
    print("Lowest MAE: %f" % np.min(MAE))
def build_model(train, method='svd'):

    """Builds model and makes predictions for user-book rating.

    Args:
    train(surprise trainset): training set for the model to train on

    method (string): Method to use. Either 'knn' or 'svd'. Default is 'svd'.

    Returns: list of Prediction objects.

    """

    if method == 'knn':
        surprise_sim_opt = {'name': 'cosine', 'user_based': False}
        model = KNNBasic(k=100, min_k=20, sim_options=surprise_sim_opt)
    else:
        model = SVD(n_epochs=50)

    model.fit(train)

    # predict ratings for every user-book pair absent from the training set
    test = train.build_anti_testset()
    pred = model.test(test)

    return pred
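A usage sketch for build_model, assuming df is a user-book ratings DataFrame on a 0-10 scale (the column names are illustrative):

from surprise import Dataset, Reader

reader = Reader(rating_scale=(0, 10))
trainset = Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']],
                                reader).build_full_trainset()
predictions = build_model(trainset, method='svd')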
    def run_and_test_all_models(self):
        step_size = 2

        # KNN
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0  # no shrinkage
        }
        algo = knns.KNNWithMeans
        args = {'sim_options': sim_options}
        best_model = knns.KNNWithMeans(k=20, sim_options=sim_options)
        roc_auc_KNN = self.run_and_test_model(algo, args, best_model,
                                              (2, 101, step_size), 'KNN')

        # # NMF
        algo = matrix_factorization.NMF
        args = {'biased': False}
        best_model = matrix_factorization.NMF(n_factors=20, biased=False)
        roc_auc_NMF = self.run_and_test_model(algo, args, best_model,
                                              (2, 51, step_size), 'NMF')

        # SVD
        algo = matrix_factorization.SVD
        args = {}
        best_model = SVD(20)
        roc_auc_SVD = self.run_and_test_model(algo, args, best_model,
                                              (2, 51, step_size), 'SVD')

        # all
        for i in range(len(roc_auc_KNN)):
            plt.plot(roc_auc_KNN[i][0],
                     roc_auc_KNN[i][1],
                     color='blue',
                     linewidth=2.0,
                     label='KNN')
            plt.plot(roc_auc_NMF[i][0],
                     roc_auc_NMF[i][1],
                     color='green',
                     linewidth=2.0,
                     label='NMF')
            plt.plot(roc_auc_SVD[i][0],
                     roc_auc_SVD[i][1],
                     color='red',
                     linewidth=2.0,
                     label='SVD')
            plt.plot([0, 1], [0, 1], color='yellow', linewidth=2.0)
            plt.xlabel('FPR')
            plt.ylabel('TPR')
            plt.legend(loc="lower right")
            plt.show()

        # NaiveFilter
        self.run_naive_filter(msg='normal')
        self.run_naive_filter(test_filter=trimPopular, msg='trimPopular')
        self.run_naive_filter(test_filter=trimUnpopular, msg='trimUnpopular')
        self.run_naive_filter(test_filter=trimHighVariance,
                              msg='trimHighVariance')
Example #13
def rank_predictions(model_name):

    k_KNN = 22 
    k_NNMF = 20
    k_MF = 26

    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors= k_NNMF)
    else:
        model = SVD(n_factors = k_MF)

    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(prec for prec in precisions.values()) / len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
            
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(list(range(1, 26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()

    # recall vs t
    plt.plot(list(range(1, 26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using " + model_name)
    plt.show()
    
    # precision vs recall 
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()


    return precision_arr, recall_arr 
def train_svd(data):
    rmse = []
    mae = []
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        svd = SVD(n_factors=k)
        temp = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
def train_trim_svd(data, R):
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        svd = SVD(n_factors=k)
        for trainset, testset in kfold.split(data):
            svd.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)

            p_pred = svd.test(p_testset)
            u_pred = svd.test(u_testset)
            hv_pred = svd.test(hv_testset)

            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("SVD with trim is finished!!")
    return rmse_list
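trim is a project helper that splits the test set into popular, unpopular and high-variance subsets. A hypothetical version, assuming R is the full ratings DataFrame with movieId and rating columns:

def trim(testset, R):
    # Hypothetical splits: popular (> 2 ratings), unpopular (<= 2 ratings) and
    # high-variance (>= 5 ratings with rating variance >= 2).
    counts = R.groupby('movieId')['rating'].count()
    variances = R.groupby('movieId')['rating'].var()
    popular = [t for t in testset if counts.get(t[1], 0) > 2]
    unpopular = [t for t in testset if counts.get(t[1], 0) <= 2]
    high_var = [t for t in testset
                if counts.get(t[1], 0) >= 5 and variances.get(t[1], 0) >= 2]
    return popular, unpopular, high_var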
Example #16
def Q34():
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)
    fp = {}
    tp = {}
    area = np.array([])
    for model, key in zip([knn, nmf, svd], ['KNN', 'NNMF', 'SVD']):
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            if t >= 3:
                t = 1
            else:
                t = 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p / rang)
        fpr, tpr, thresholds = roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)
    plt.figure()
    lw = 2
    for mod, f, t, roc_auc in zip(['KNN', 'NNMF', 'SVD'], fp, tp, area):
        fpr = fp[f]
        tpr = tp[t]
        #     label = mod+'ROC curve (area = '+str(roc_auc)+'0.2f)'
        plt.plot(fpr,
                 tpr,
                 lw=lw,
                 label='%s ROC curve (area = %0.2f)' % (mod, roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()
    plt.close()
Example #17
def MF_bin_pre(ratings, ts, nmf_fac, thrd):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=ts)
    algo = SVD(n_factors=nmf_fac, random_state=42)
    algo.fit(trainset)
    pre = algo.test(testset)

    true_rating = np.empty(len(pre))
    pred_rating = np.empty(len(pre))

    for i in range(len(pre)):
        true_rating[i] = pre[i][2]
        pred_rating[i] = pre[i][3]

    bi_rating = np.empty(len(pre))
    one_idx = true_rating >= thrd
    zero_idx = true_rating < thrd
    bi_rating[one_idx] = 1.0
    bi_rating[zero_idx] = 0.0

    return bi_rating, pred_rating
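A usage sketch that turns the output of MF_bin_pre into an ROC curve, assuming ratings is the MovieLens ratings DataFrame:

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

y_true, y_score = MF_bin_pre(ratings, ts=0.1, nmf_fac=20, thrd=3.0)
fpr, tpr, _ = roc_curve(y_true, y_score)
plt.plot(fpr, tpr, label='SVD (AUC = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()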
def trimmed_test_MF(data, choice=0):
    ks = range(2, 51, 2)
    avg_RMSEs = []
    for k in ks:
        kf = KFold(n_splits=10)
        rmse_total = 0
        for trainset, testset in kf.split(data):
            trimmed_testset = trim(data, testset, choice)
            model = SVD(n_factors=k).fit(trainset)
            pred = model.test(trimmed_testset)
            rmse_total += rmse(pred, verbose=False)
        rmse_total = rmse_total / 10.0
        avg_RMSEs.append(rmse_total)

    # Plot
    plt.plot(ks, avg_RMSEs)
    plt.xlabel('k')
    plt.ylabel('Average RMSE')
    plt.savefig('RMSE_' + str(choice) + '.png')

    index = np.argmin(avg_RMSEs)
    print("Best k: %i" % ks[index])
    print("Lowest RMSE: %f" % avg_RMSEs[index])
Example #19
def vary_factors(setNum, n_factors):
    reader = Reader(rating_scale = (0,10))
    train = pd.read_csv('../data/train_'+str(setNum)+'.csv', sep = ';')
    train_set = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
    data = train_set.build_full_trainset()

    train_errors = []
    val_errors = []
    for f in n_factors:
        svd = SVD(n_factors = f)
        cv = cross_validate(svd, train_set, return_train_measures=True, n_jobs = -2, verbose=True)
        train_errors += [np.mean(cv['train_rmse'])]
        val_errors += [np.mean(cv['test_rmse'])]
    return train_errors, val_errors
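A usage sketch for vary_factors, assuming the same train_<setNum>.csv files as in Example #1 (the factor values are illustrative):

import matplotlib.pyplot as plt

factors = [10, 25, 50, 100, 200]
train_rmse, val_rmse = vary_factors(10, factors)
plt.plot(factors, train_rmse, label='train RMSE')
plt.plot(factors, val_rmse, label='validation RMSE')
plt.xlabel('n_factors')
plt.ylabel('RMSE')
plt.legend()
plt.show()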
Example #20
    def __init__(self):
        super(Window, self).__init__()
        self.setupUi(self)
        self.connect_slot_function()
        self.current_path = os.getcwd()
        self.dataset_path = './dataset/data.csv'
        self.result_path = './result/pre_result.txt'
        self.help_file_path = './help/help.txt'
        self.max_totalnum = 10000
        self.cut_num = 0
        self.algo = SVD()
        self.display_process_label.append('初始化加载SVD模型.')
        self.algo_change_flag = False
        self.algo_trained_flag = False
        self.init_dir()
Example #21
def Q26To28(qNum, n_splits=10):
    data = load_data()
    kf = KFold(n_splits=n_splits)

    trimFun = {26: popularTrim, 27: unpopularTrim, 28: highVarTrim}
    RMSE = []
    for k in range(2, 52, 2):
        MF_svd = SVD(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            subsubRMSE = 0
            MF_svd.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d", nTest)
            for (r, c, rating) in testSet:
                predictedRating = MF_svd.predict(str(r), str(c))
                subsubRMSE += (pow(rating - predictedRating.est, 2))
            # calculate RMSE of this train-test split
            subRMSE.append(np.sqrt(subsubRMSE / nTest))
        # average of all train-test splits of k-NN
        RMSE.append(np.mean(subRMSE))

    return RMSE
Example #22
def run_svd(data, params, svdpp=False):
    '''Returns trained SVD model based on matrix factorization'''
    if svdpp:
        alg = SVDpp(n_factors=utils.get_param(params, 'n_factors'),
                    n_epochs=utils.get_param(params, 'n_epochs'),
                    lr_all=utils.get_param(params, 'learning_rate'),
                    reg_all=utils.get_param(params, 'reg'),
                    verbose=True)
    else:
        alg = SVD(biased=utils.get_param(params, 'biased'),
                  n_factors=utils.get_param(params, 'n_factors'),
                  n_epochs=utils.get_param(params, 'n_epochs'),
                  lr_all=utils.get_param(params, 'learning_rate'),
                  reg_all=utils.get_param(params, 'reg'),
                  verbose=True)
    alg.fit(data)
    return alg
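run_svd depends on a utils.get_param helper. Assuming it is a plain dictionary lookup and that trainset is a Surprise Trainset built elsewhere, a call could look like this (the hyperparameter values are illustrative):

params = {'biased': True, 'n_factors': 100, 'n_epochs': 30,
          'learning_rate': 0.005, 'reg': 0.02}
svd_model = run_svd(trainset, params)
svdpp_model = run_svd(trainset, params, svdpp=True)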
Example #23
def plot_all_ROC():
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)
    fp = {}
    tp = {}
    area = np.array([])
    for model, key in zip([knn, nmf, svd], ['KNN','NNMF','SVD']):
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            if t >= 3:
                t = 1
            else:
                t = 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p/rang)
        fpr, tpr, thresholds = metrics.roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = metrics.auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)
    plt.figure()
    lw = 2
    for mod, f, t, roc_auc in zip(['k-NN','NNMF','MF'], fp, tp, area):
        fpr = fp[f]
        tpr = tp[t]
        plt.plot(fpr, tpr, lw=lw, label='%s'%mod)
    plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.show()
def grid_search(surprise_model):

    if surprise_model is SVDpp:
        param_grid = {'n_factors': [20], 'n_epochs': [20], 'lr_all': [0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0], 'reg_all': [0.02, 0.05, 0.2, 0.5]}
    elif surprise_model is SVD:
        param_grid = {'n_epochs': [20], 'lr_all': [0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0], 'reg_all': [0.02, 0.05, 0.2, 0.5]}
    elif surprise_model is NMF:
        param_grid = {'n_epochs': [20], 'reg_pu': [0.02, 0.04, 0.06, 0.08, 0.2], 'reg_qi': [0.02, 0.04, 0.06, 0.08, 0.2]}
    elif surprise_model is BaselineOnly:
        param_grid = {'bsl_options': {'method': ['als', 'sgd'], 'reg': [1, 2], 'learning_rate': [0.005, 0.05, 0.5, 1.0]}}
    else:
        raise ValueError('Unsupported model class: %s' % surprise_model)

    gs = GridSearchCV(surprise_model, param_grid, measures=['rmse', 'mae'],
                      cv=3, n_jobs=-1, joblib_verbose=1, refit=True)
    return gs
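A usage sketch for grid_search, assuming data is a Surprise Dataset:

gs = grid_search(SVD)
gs.fit(data)
print(gs.best_params['rmse'])
print(gs.best_score['rmse'])
best_model = gs.best_estimator['rmse']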
Example #25
def MF_bias_filter(ratings, dims):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = False
    min_MAE = False
    fac_num_RMSE = 0
    fac_num_MAE = 0

    for k in range(len(dims)):
        svd = SVD(n_factors=dims[k], random_state=42)
        cv = cross_validate(algo=svd,
                            data=data,
                            measures=['RMSE', 'MAE'],
                            cv=10,
                            verbose=True)
        RMSE[k] = np.mean(cv['test_rmse'])
        if ((not min_RMSE) or RMSE[k] < min_RMSE):
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]

        MAE[k] = np.mean(cv['test_mae'])
        if ((not min_MAE) or MAE[k] < min_MAE):
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finished plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum average MAE is ', min_MAE)
Example #26
def Q24():

    # so far using same code as Q10, Q12-14 for Q24, Q26-28, can combine code later
    # only using SVD for Q24 for now, but the RMSE and MAE don't change much with latent factor
    data = load_data()

    meanRMSE, meanMAE = [], []
    start = time.time()
    for k in range(2, 52, 2):
        MF_svd = SVD(n_factors=k)
        out = cross_validate(MF_svd, data, measures=['RMSE', 'MAE'], cv=10)
        meanRMSE.append(np.mean(out['test_rmse']))
        meanMAE.append(np.mean(out['test_mae']))
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    print("Total time used for cross validation: " + cv_time)

    k = list(range(2, 52, 2))
    ys = [[meanRMSE, 'mean RMSE'], [meanMAE, 'mean MAE']]
    # plot meanRMSE and meanMAE separately because it is hard to see the trend
    # when they are plotted in the same graph
    make_plot(k, [[meanRMSE, 'mean RMSE']], 'Number of Latent Factors', 'Error')
    make_plot(k, [[meanMAE, 'mean MAE']], 'Number of Latent Factors', 'Error')
    return meanRMSE, meanMAE
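make_plot is a local helper. A minimal sketch consistent with the calls above, where each curve is a [values, label] pair:

import matplotlib.pyplot as plt

def make_plot(x, curves, xlabel, ylabel):
    plt.figure()
    for ys, label in curves:
        plt.plot(x, ys, label=label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()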
Example #27
import os

import matplotlib.pyplot as plt
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

file_path = os.path.expanduser('ml-latest-small/ratings_new.csv')
reader = Reader(sep=',')
data = Dataset.load_from_file(file_path, reader=reader)


avg_rmse = []
avg_mae = []
all_k = []

for i in range(2, 52, 2):
    print('k = ', i)
    all_k.append(i)
    mf = SVD(n_factors=i)
    output = cross_validate(mf, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

    avg_rmse.append(np.mean(output['test_rmse']))
    avg_mae.append(np.mean(output['test_mae']))

print("min rmse k:", avg_rmse.index(min(avg_rmse)))
print("min mae k:", avg_mae.index(min(avg_mae)))

plt.plot(all_k,avg_rmse)
plt.savefig('plot/mf_rmse_k.png')
plt.clf()

plt.plot(all_k,avg_mae)
plt.savefig('plot/mf_mae_k.png')
plt.clf()
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD

from DataProcessing.dataprocessing import ratings

print("Training SVD Algorithm")
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
# print(cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True))
trainset = data.build_full_trainset()
svd.fit(trainset)


def recColl(userid, movieid, gt=None):
    return svd.predict(userid, movieid, gt)


# print(recColl(1,862,3))
Example #29
	sim_options = {'name': 'pearson',
	              'user_based': True
	              }

	trainset, testset = train_test_split(data, test_size=0.1)

	algo = KNNWithMeans(k=34, sim_options=sim_options)
	algo.fit(trainset)
	predictions1 = algo.test(testset)

	algo = NMF(n_factors=16)
	algo.fit(trainset)
	predictions2 = algo.test(testset)

	algo = SVD(n_factors=14)
	algo.fit(trainset)
	predictions3 = algo.test(testset)

	y_true = []
	y_estimate1 = []
	y_estimate2 = []
	y_estimate3 = []

	for row in predictions1:
		if row[2] >= threshold:
			y_true.append(1)
		else:
			y_true.append(0)
	
	for row in predictions1:
Example #30
        algo.fit(trainset)
        # print testset
        predictions = algo.test(testset)
        Prec, Reca = metrics(predictions, t)
        pr = pr + Prec
        re = re + Reca

    return pr / 10.0, re / 10.0


if __name__ == '__main__':
    data = retrieve_data()
    G_max = ret_mod_user_dict(data)

    algo_NMF = NMF(NMF_no_of_LF, verbose=False)
    algo_SVD = SVD(n_factors=MF_no_of_LF)
    algo_KNN = KNNWithMeans(k=KNN_no_of_LF,
                            sim_options=sim_options,
                            verbose=False)

    # Q36
    Pr1 = []
    Re1 = []
    t = list(range(1, 26))
    for l in t:
        Precision, Recall = cross_val_(data, G_max, l, algo_KNN)
        Pr1.append(Precision)
        Re1.append(Recall)

    plotgraphs(t, Pr1, "Number of Suggestions", "Precision",
               "Precision Curve for KNN")