예제 #1
0
def nmf_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          phase='eval'):
    """
    Fit an NMF model on the observed ratings and write its predictions.

    The model is trained on the 'userId', 'movieId' and 'rating' columns of
    ``observed_ratings_df`` (taken after ``reset_index``).  An estimate is
    then requested for every row of ``truth_ratings_df``, using the first two
    components of each row's index key as (user id, item id).  The resulting
    frame is passed to ``write`` under the name 'nmf_rating_obs' together
    with ``fold`` and ``phase``.
    """
    print("NMF predicates")
    model = NMF()

    # Build the Surprise training set from the observed ratings.
    rating_reader = Reader(rating_scale=(0.2, 1))
    observed = observed_ratings_df.reset_index()
    observed = observed.loc[:, ['userId', 'movieId', 'rating']]
    dataset = Dataset.load_from_df(df=observed, reader=rating_reader)
    model.fit(dataset.build_full_trainset())

    # One (initially empty) 'rating' cell per truth row, filled in below.
    predictions = pd.DataFrame(index=truth_ratings_df.index,
                               columns=['rating'])

    for key, _ in truth_ratings_df.loc[:, ['rating']].iterrows():
        user_id, item_id = key[0], key[1]
        estimate = model.predict(user_id, item_id).est
        predictions.loc[(user_id, item_id), 'rating'] = estimate

    write(predictions, 'nmf_rating_obs', fold, phase)
예제 #2
0
def nmf_algorithm() -> NMF:
    """Interactively build an NMF instance.

    Asks on stdin whether the default parameters should be used.  Any answer
    other than 'y' / 'Y' triggers two further prompts, for the number of
    factors and the number of epochs, which are converted with ``int`` and
    passed positionally to ``NMF``.
    """
    answer = input('Do you want to continue with the default parameters? Y/N')
    if answer.lower() != 'y':
        factor_count = int(input('Enter total number of factors: '))
        epoch_count = int(input('Enter number of epochs: '))
        return NMF(factor_count, epoch_count)
    return NMF()
def nmf_compute_high_var_trim_rmse(k):
    """Return the mean 10-fold RMSE of NMF with ``k`` factors on trimmed test sets.

    Each fold's test set is passed through ``high_variance_trimming`` (with
    the module-level ``frequency`` and ``variance``) before being scored.
    The mean is also printed.
    """
    model = NMF(n_factors=k, random_state=42)
    splitter = KFold(n_splits=10, random_state=42)
    fold_scores = []
    for trainset, testset in splitter.split(R):
        model.fit(trainset)
        kept = high_variance_trimming(testset, frequency, variance)
        fold_scores.append(accuracy.rmse(model.test(kept), verbose=False))
    mean_rmse = np.mean(fold_scores)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def nmf_compute_prec_rec(t):
    """Return (mean precision, mean recall) of NMF over 10 folds for list size ``t``.

    A fresh NMF with ``nmf_best_k`` factors is fitted per fold.  The fold's
    test set is reduced by ``trim_unpopular_user`` before prediction, and the
    per-key precision/recall dictionaries returned by
    ``calculate_precision_recall`` are averaged first within the fold and
    then across folds.
    """
    fold_precision = []
    fold_recall = []
    splitter = KFold(n_splits=10, random_state=42)
    for trainset, testset in splitter.split(R):
        model = NMF(n_factors=nmf_best_k, random_state=42)
        model.fit(trainset)
        kept = trim_unpopular_user(testset, t, threshold)
        predictions = model.test(kept)

        per_key_precision, per_key_recall = calculate_precision_recall(
            predictions, t, threshold)
        fold_precision.append(np.mean(list(per_key_precision.values())))
        fold_recall.append(np.mean(list(per_key_recall.values())))
    return np.mean(fold_precision), np.mean(fold_recall)
예제 #5
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q23(col=0):
    """Print the genres of the ten movies scoring highest in one NMF item-factor column.

    An NMF model with 20 factors is fitted on (almost) all of the data
    returned by ``load_data``.  For latent factor ``col`` the ten items with
    the largest value in ``model.qi[:, col]`` are located and, for each, the
    last CSV field of its row in 'ml-latest-small/movies.csv' is printed.

    Args:
        col: index of the latent factor (column of the item-factor matrix).
    """
    import csv

    print('Chosen column is ' + str(col))

    data = load_data()
    model = NMF(n_factors=20)
    # Keep practically every rating on the training side so that nearly all
    # movies receive an item-factor row.
    trainset, testset = train_test_split(data, test_size=0.0001)
    model.fit(trainset)
    V = model.qi

    # Raw movie id (string) -> remaining CSV fields of that movie's row.
    # quotechar='|' deliberately disables normal quote handling, so titles
    # containing commas spill into several fields; only the LAST field
    # (the genre list) is used below.
    dict_ID_to_genre = {}
    with open('ml-latest-small/movies.csv', 'rt', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row
        for row in reader:
            dict_ID_to_genre[row[0]] = row[1:]

    # Rows of model.qi are addressed by Surprise *inner* item ids, which are
    # assigned by the trainset and do not follow sorted raw-id order.  They
    # must be translated with trainset.to_raw_iid, not with a table built
    # from the sorted movie ids.
    V_col = V[:, col]
    top10_inner_ids = np.argsort(V_col)[::-1][:10]
    for inner_id in top10_inner_ids:
        raw_id = trainset.to_raw_iid(int(inner_id))
        genre = dict_ID_to_genre[str(raw_id)]
        print(genre[-1])
예제 #6
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_trim_filter(ratings, dims, func, mv_dict):
    """Sweep NMF latent-factor counts with 10-fold CV and report RMSE / MAE.

    For every entry of ``dims`` an NMF model is cross-validated, the mean
    RMSE and MAE over the folds are printed, both curves are plotted against
    ``dims``, and the factor counts giving the smallest mean RMSE and MAE
    are printed.

    Args:
        ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.
        dims: sequence of latent-factor counts to evaluate.
        func: callable invoked as ``func(mv_dict, testset)`` on each fold's
            test set before prediction.
        mv_dict: first argument forwarded to ``func``.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # None means "no minimum recorded yet"; a falsy sentinel would wrongly
    # treat a genuine minimum of 0.0 as unset.
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)

    for idx, n_factors in enumerate(dims):
        nmf = NMF(n_factors=n_factors, random_state=42)
        fold_rmse = []
        fold_mae = []
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            # NOTE(review): the return value of func is discarded, so any
            # trimming only takes effect if func mutates testset in place --
            # confirm against the trimming helpers.
            func(mv_dict, testset)
            pred = nmf.test(testset)
            fold_rmse.append(accuracy.rmse(pred, verbose=False))
            fold_mae.append(accuracy.mae(pred, verbose=False))

        RMSE[idx] = np.mean(fold_rmse)
        if min_RMSE is None or RMSE[idx] < min_RMSE:
            min_RMSE = RMSE[idx]
            fac_num_RMSE = n_factors

        MAE[idx] = np.mean(fold_mae)
        if min_MAE is None or MAE[idx] < min_MAE:
            min_MAE = MAE[idx]
            fac_num_MAE = n_factors
        print('For k = %i :' % n_factors)
        print('RMSE: ', RMSE[idx])
        print('MAE: ', MAE[idx])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
예제 #7
0
 def slot_select_algo_combobox(self):
     """React to a new selection in the algorithm combo box.

     Flags the algorithm as changed and not yet trained.  If the selected
     text names a known algorithm, a fresh instance is stored in
     ``self.algo`` and a status message is appended to
     ``self.display_process_label``; any other text leaves both untouched.
     """
     self.algo_change_flag = True
     self.algo_trained_flag = False

     # Combo-box text -> (algorithm class, status message to display).
     known_algos = {
         'SVD': (SVD, '加载SVD模型...'),
         'SVD++': (SVDpp, '加载SVD++模型...'),
         'NMF': (NMF, '加载NMF模型...'),
         'Slope One': (SlopeOne, '加载Slope One模型...'),
         'k-NN': (KNNBasic, '加载k-NN模型...'),
         'Centered k-NN': (KNNWithMeans, '加载Centered k-NN模型...'),
         'k-NN Baseline': (KNNBaseline, '加载k-NN Baseline模型...'),
         'Co-Clustering': (CoClustering, '加载Co-Clustering模型...'),
         'Baseline': (BaselineOnly, '加载Baseline模型...'),
         'Random': (NormalPredictor, '加载Random模型...'),
     }

     selection = known_algos.get(self.select_algo_comboBox.currentText())
     if selection is not None:
         algo_class, status_message = selection
         self.algo = algo_class()
         self.display_process_label.append(status_message)
예제 #8
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q15and22and29(qNum, bestK, thres=(2.5, 3, 3.5, 4)):
    """Plot one ROC curve per like/dislike threshold for a single model.

    The model is chosen by ``qNum`` (15: KNNWithMeans, 22: NMF, anything
    else: SVD), fitted on a 90/10 train/test split of ``load_data()`` and
    evaluated once.  For every threshold the true ratings are binarized
    (1 when rating >= threshold) and passed with the scaled predictions to
    ``plot_ROC``.

    Args:
        qNum: question number selecting the model.
        bestK: neighbourhood size (KNN) or number of latent factors.
        thres: iterable of rating thresholds.
    """
    rating_max = 5.0  # predictions are divided by this before plotting
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(bestK, sim_options=sim_options)
    elif qNum == 22:
        model = NMF(n_factors=bestK)
    else:
        model = SVD(n_factors=bestK)

    model.fit(trainset)
    pred = model.test(testset)

    # Scores and raw true ratings do not depend on the threshold, so they
    # are extracted once instead of being rebuilt with np.append per item.
    true_ratings = np.array([t for _, _, t, _, _ in pred], dtype=float)
    np_score = np.array([p / rating_max for _, _, _, p, _ in pred],
                        dtype=float)

    for thrs in thres:
        np_true = (true_ratings >= thrs).astype(float)
        title = 'Threshold ' + str(thrs)
        plot_ROC(np_true, np_score, title=title)
def problem_17_rmse_mae_full_dataset():
    """Cross-validate NMF for k = 2, 4, ..., 50 and store/plot the mean errors.

    For each factor count the mean 10-fold RMSE and MAE on the module-level
    ``data`` are recorded, written to 'rmse_test_store_10.csv' and
    'mae_test_store_10.csv', and plotted against k via ``plotgraphs``.
    """
    factor_counts = range(2, 52, 2)
    n_settings = len(factor_counts)
    rmse_test_store = np.zeros(n_settings)
    mae_test_store = np.zeros(n_settings)

    # slot is the position of the factor count in factor_counts (k / 2 - 1).
    for slot, n_factors in enumerate(factor_counts):
        algo = NMF(n_factors, verbose=False)
        cv_scores = cross_validate(algo,
                                   data,
                                   measures=['rmse', 'mae'],
                                   cv=10,
                                   verbose=True)
        rmse_test_store[slot] = np.mean(cv_scores['test_rmse'])
        mae_test_store[slot] = np.mean(cv_scores['test_mae'])

    pd.DataFrame(rmse_test_store).to_csv("rmse_test_store_10.csv")
    pd.DataFrame(mae_test_store).to_csv("mae_test_store_10.csv")

    plotgraphs(factor_counts, rmse_test_store, 'K', 'Mean rmse scores',
               'Plot', 'q17_rmse.png')
    plotgraphs(factor_counts, mae_test_store, 'K', 'Mean Mae scores', 'Plot',
               'q17_Mae.png')
예제 #10
0
def rank_predictions(model_name):
    """Plot mean precision and recall against recommendation-list size t.

    The model is chosen by ``model_name`` ('KNN', 'NNMF', anything else:
    SVD).  For t = 1..25 it is evaluated with 10-fold cross-validation on
    the module-level ``data``; the per-fold mean precision and recall from
    ``precision_recall`` are averaged over the folds.  Three plots are
    shown: precision vs t, recall vs t, and precision vs recall.

    Returns:
        (precision_arr, recall_arr): lists with one mean value per t.
    """
    k_KNN = 22
    k_NNMF = 20
    k_MF = 26

    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors=k_NNMF)
    else:
        model = SVD(n_factors=k_MF)

    t_values = list(range(1, 26))
    precision_arr = []
    recall_arr = []
    for t in t_values:
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(precisions.values()) / len(precisions))
            r.append(sum(recalls.values()) / len(recalls))

        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(t_values, precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()

    # recall vs t
    plt.plot(t_values, recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    # The title previously hard-coded "MF " in front of every model name.
    plt.title("The average recall plot using " + model_name)
    plt.show()

    # precision vs recall
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()

    return precision_arr, recall_arr
 def __init__(self,category,save_name):
     """Set up default state, then build or reload the saved feature file.

     The feature file lives at ``../feature/<save_name>``.  If it already
     exists it is loaded; otherwise the raw data is loaded, ratings and the
     new price dictionary are generated, and the result is saved there.
     """
     self.category = category
     self.max_user = 10000  # maximum number of users
     self.price_dict = {}
     self.price_dict_temp = {}
     self.cate_dict = {}
     self.cate_dict_temp = {}
     self.top_value = 15  # top x features in SVD
     self.model = NMF()
     self.topk = 500  # maximum items per category (the top-k most popular)
     self.max_price = {}
     self.save_path = os.path.join("..", "feature", save_name)

     if os.path.isfile(self.save_path):
         # A previously saved feature file exists: just load it.
         self.load(self.save_path)
         return

     # No saved features yet: build them from the raw data and persist.
     self.load_data()
     self.create_ratings()
     self.gen_new_price_dict()
     self.save_data(self.save_path)
def train_nmf(data):
    """Cross-validate NMF for k = 2, 4, ..., 50 latent factors.

    Args:
        data: dataset accepted by ``cross_validate``.

    Returns:
        (rmse, mae): two lists holding, for each k in order, the mean
        10-fold test RMSE and MAE.
    """
    rmse = []
    mae = []
    # The former unused `sim_options` dict was removed: it was never passed
    # to NMF or cross_validate.
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        nmf = NMF(n_factors=k)
        temp = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
예제 #13
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q19to21(qNum):
    """Return the mean 10-fold RMSE of NMF on a trimmed test set, per k.

    For k = 2, 4, ..., 18 latent factors an NMF model is cross-validated on
    ``load_data()``.  Each fold's test set is reduced by the trimming
    function selected by ``qNum`` before prediction.

    Args:
        qNum: question number.  19 / 20 / 21 select the popular / unpopular /
            high-variance trimming; 12 / 13 / 14 are accepted as aliases for
            backward compatibility.

    Returns:
        List with one mean RMSE per k (NaN if every fold was empty after
        trimming).

    Raises:
        KeyError: if ``qNum`` is not one of the supported numbers.
    """
    data = load_data()
    kf = KFold(n_splits=10)

    trimFun = {
        12: popularTrim, 13: unpopularTrim, 14: highVarTrim,
        19: popularTrim, 20: unpopularTrim, 21: highVarTrim,
    }
    trim = trimFun[qNum]

    RMSE = []
    for k in range(2, 20, 2):
        # k is the number of latent factors under evaluation.
        nmf = NMF(n_factors=k)
        subRMSE = []  # RMSE of each train-test split for this k
        for trainSet, testSet in kf.split(data):
            nmf.fit(trainSet)
            testSet = trim(testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d" % nTest)
            if nTest == 0:
                # Nothing left to score in this fold.
                continue
            predictions = nmf.test(testSet)
            squared_error = sum(pow(p.est - p.r_ui, 2) for p in predictions)
            subRMSE.append(np.sqrt(squared_error / nTest))
        # average over all train-test splits for this k
        RMSE.append(np.mean(subRMSE) if subRMSE else float('nan'))
    return RMSE
def train_trim_nmf(data, R):
    """Cross-validate NMF on three trimmed test sets for k = 2, 4, ..., 50.

    Each fold's test set is split by ``trim(testset, R)`` into three subsets
    which are scored separately with RMSE.

    Returns:
        A list of three lists (in the order the subsets are returned by
        ``trim``), each holding the mean fold RMSE per k.
    """
    splitter = KFold(n_splits=10)
    p_curve, u_curve, hv_curve = [], [], []
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        model = NMF(n_factors=k)
        p_scores, u_scores, hv_scores = [], [], []
        for trainset, testset in splitter.split(data):
            model.fit(trainset)
            p_testset, u_testset, hv_testset = trim(testset, R)

            # Score the three subsets in the same order every fold.
            subsets = ((p_testset, p_scores),
                       (u_testset, u_scores),
                       (hv_testset, hv_scores))
            fold_predictions = [model.test(subset) for subset, _ in subsets]
            for predictions, (_, scores) in zip(fold_predictions, subsets):
                scores.append(accuracy.rmse(predictions))
        p_curve.append(np.mean(p_scores))
        u_curve.append(np.mean(u_scores))
        hv_curve.append(np.mean(hv_scores))
    print("NMF with trim is finished!!")
    return [p_curve, u_curve, hv_curve]
예제 #15
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q34():
    """Plot the ROC curves of KNNWithMeans, NMF and SVD on one figure.

    All three models are fitted on the same 90/10 train/test split of
    ``load_data()``.  True ratings are binarized at 3 (1 when rating >= 3)
    and predictions are divided by 5.0 before computing each curve; the
    AUC is shown in the legend.
    """
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)

    # One (label, fpr, tpr, auc) record per model, kept together so the
    # plotting loop cannot mismatch curves and labels.
    curves = []
    for model, key in zip([knn, nmf, svd], ['KNN', 'NNMF', 'SVD']):
        model.fit(trainset)
        pred = model.test(testset)
        # Built in one pass; np.append inside a loop copies the whole array
        # on every call.
        np_true = np.array([1.0 if t >= 3 else 0.0 for _, _, t, _, _ in pred])
        np_score = np.array([p / rang for _, _, _, p, _ in pred])
        fpr, tpr, thresholds = roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        curves.append((key, fpr, tpr, auc(fpr, tpr)))

    plt.figure()
    lw = 2
    for mod, fpr, tpr, roc_auc in curves:
        plt.plot(fpr,
                 tpr,
                 lw=lw,
                 label='%s ROC curve (area = %0.2f)' % (mod, roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()
    plt.close()
def use_nmf():
    """Fit NMF on the full built-in 'ml-100k' data and time the run.

    The model is trained on the full trainset and evaluated on that
    trainset's anti-testset.

    Returns:
        ``[rmse, mae, elapsed_seconds]``.
    """
    started_at = time.time()

    dataset = Dataset.load_builtin('ml-100k')
    full_trainset = dataset.build_full_trainset()

    print('Using NMF')
    model = NMF()
    model.fit(full_trainset)

    # Predict every (user, item) pair that is absent from the trainset.
    anti_testset = full_trainset.build_anti_testset()
    anti_predictions = model.test(anti_testset)

    rmse_value = accuracy.rmse(anti_predictions)
    mae_value = accuracy.mae(anti_predictions)
    elapsed = time.time() - started_at

    return [rmse_value, mae_value, elapsed]
예제 #17
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_bin_pre(ratings, ts, nmf_fac, thrd):
    """Return binarized true ratings and NMF predictions for one split.

    Args:
        ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.
        ts: test-set size passed to ``train_test_split``.
        nmf_fac: number of NMF latent factors.
        thrd: threshold; true ratings >= thrd become 1.0, ratings < thrd
            become 0.0.

    Returns:
        (bi_rating, pred_rating): two float arrays with one entry per test
        prediction.
    """
    rating_reader = Reader(rating_scale=(0.0, 5.0))
    dataset = Dataset.load_from_df(
        ratings[['userId', 'movieId', 'rating']], rating_reader)
    trainset, testset = train_test_split(dataset, test_size=ts)

    model = NMF(n_factors=nmf_fac, random_state=42)
    model.fit(trainset)
    predictions = model.test(testset)
    n_predictions = len(predictions)

    # Fields 2 and 3 of a prediction are the true and the estimated rating.
    actual = np.fromiter((pred[2] for pred in predictions),
                         dtype=float, count=n_predictions)
    estimated = np.fromiter((pred[3] for pred in predictions),
                            dtype=float, count=n_predictions)

    labels = np.empty(n_predictions)
    labels[actual >= thrd] = 1.0
    labels[actual < thrd] = 0.0

    return labels, estimated
예제 #18
0
def trim_performance(qNum,maxk=0):
    """Plot the mean 10-fold RMSE against k on a trimmed test set.

    ``qNum`` selects both the movie subset the test sets are restricted to
    and the model (12-14: KNNWithMeans, 19-21: NMF).  k runs over
    2, 4, ..., maxk on the module-level ``data``.

    Args:
        qNum: question number (12, 13, 14, 19, 20 or 21).
        maxk: largest k to try; 0 selects the default for the question
            (100 for 12-14, 50 for 19-21).

    Returns:
        The smallest mean RMSE over all k (also printed).

    Raises:
        KeyError: if ``qNum`` is not a supported question number.
    """
    pop, unpop, highVar = trimMovies()

    if maxk == 0:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50

    trim_Model = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
    }
    trimSet, modelName = trim_Model[qNum]

    k_values = list(range(2, maxk + 1, 2))
    kf = KFold(n_splits=10)
    RMSE = []
    for k in k_values:
        print('-' * 20 + 'k = ' + str(k) + ' ' + '-' * 20)

        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        elif modelName == 'NMF':
            model = NMF(n_factors=k)

        subRMSE = []
        for fold, (trainSet, testSet) in enumerate(kf.split(data), start=1):
            model.fit(trainSet)
            # Keep only test ratings whose movie id belongs to the trim set.
            testSet = [x for x in testSet if int(x[1]) in trimSet]
            # The size is now actually interpolated; previously "%d" was
            # printed literally followed by the number.
            print("Split %d: test set size after trimming: %d"
                  % (fold, len(testSet)))
            predictions = model.test(testSet)
            subRMSE.append(accuracy.rmse(predictions, verbose=True))
        RMSE.append(np.mean(subRMSE))

    plt.figure()
    plt.plot(k_values, RMSE)
    plt.xlabel("k")
    plt.ylabel("Average RMSE")
    plt.title("Q"+str(qNum)+": Average RMSE Along k")
    plt.show()
    best = min(RMSE)
    print(best)
    return best
예제 #19
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q17():
    """Cross-validate NMF for k = 16, 18, 20, 22 and plot the mean errors.

    Prints the total wall-clock time spent in cross-validation and plots
    mean RMSE and mean MAE against the number of latent factors.

    Returns:
        (meanRMSE, meanMAE): lists with one value per k.
    """
    data = load_data()
    factor_counts = list(range(16, 24, 2))

    mean_rmse = []
    mean_mae = []
    started_at = time.time()
    for n_factors in factor_counts:
        cv_scores = cross_validate(NMF(n_factors=n_factors),
                                   data,
                                   measures=['RMSE', 'MAE'],
                                   cv=10)
        mean_rmse.append(np.mean(cv_scores['test_rmse']))
        mean_mae.append(np.mean(cv_scores['test_mae']))
    elapsed = datetime.timedelta(seconds=int(time.time() - started_at))
    print("Total time used for cross validation: " + str(elapsed))

    curves = [[mean_rmse, 'mean RMSE'], [mean_mae, 'mean MAE']]
    make_plot(factor_counts, curves, 'Number of Latent Factors', 'ratings')
    return mean_rmse, mean_mae
예제 #20
0
def plot_all_ROC():
    """Plot the ROC curves of KNNWithMeans, NMF and SVD on one figure.

    All three models are fitted on the same 90/10 train/test split of the
    module-level ``data``.  True ratings are binarized at 3 (1 when
    rating >= 3) and predictions are divided by 5.0 before computing each
    curve.  The curves are labelled 'k-NN', 'NNMF' and 'MF'.
    """
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)

    # (legend label, fpr, tpr) per model, kept together so labels and
    # curves cannot get out of step.
    curves = []
    for model, label in zip([knn, nmf, svd], ['k-NN', 'NNMF', 'MF']):
        model.fit(trainset)
        pred = model.test(testset)
        # Built in one pass; np.append inside a loop copies the whole array
        # on every call.
        np_true = np.array([1.0 if t >= 3 else 0.0 for _, _, t, _, _ in pred])
        np_score = np.array([p / rang for _, _, _, p, _ in pred])
        fpr, tpr, thresholds = metrics.roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        curves.append((label, fpr, tpr))

    plt.figure()
    lw = 2
    for label, fpr, tpr in curves:
        plt.plot(fpr, tpr, lw=lw, label='%s' % label)
    plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.show()
def grid_search(surprise_model):
    """Build a GridSearchCV with a parameter grid suited to the given model.

    Args:
        surprise_model: algorithm class (called with no arguments to
            determine its type).  Supported: SVDpp, SVD, NMF, BaselineOnly.

    Returns:
        An unfitted ``GridSearchCV`` (3-fold, RMSE and MAE, refit enabled).

    Raises:
        ValueError: if the model type has no parameter grid defined here.
            Previously this case failed with an UnboundLocalError.
    """
    # Instantiate once to learn the concrete type, instead of building a
    # fresh model (and a fresh reference model) for every comparison.
    model_type = type(surprise_model())

    if model_type is SVDpp:
        param_grid = {'n_factors':[20] , 'n_epochs':[20], 'lr_all':[0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0], 'reg_all':[0.02, 0.05, 0.2, 0.5]}
    elif model_type is SVD:
        param_grid = {'n_epochs':[20], 'lr_all':[0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0], 'reg_all':[0.02, 0.05, 0.2, 0.5]}
    elif model_type is NMF:
        param_grid = {'n_epochs':[20], 'reg_pu':[0.02, 0.04, 0.06, 0.08, 0.2], 'reg_qi':[0.02, 0.04, 0.06, 0.08, 0.2]}
    elif model_type is BaselineOnly:
        param_grid = {'bsl_options': {'method': ['als', 'sgd'], 'reg': [1, 2], 'learning_rate': [0.005, 0.05, 0.5, 1.0]}}
    else:
        raise ValueError(
            'No parameter grid defined for model type %s' % model_type.__name__)

    return GridSearchCV(surprise_model, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=1, refit=True)
예제 #22
0
파일: P5.py 프로젝트: EVANMON/BigDataMining
def NMF_filter(ratings, dims):
    """Sweep NMF latent-factor counts with 10-fold CV and report RMSE / MAE.

    For every entry of ``dims`` an unbiased NMF model is cross-validated.
    Mean RMSE and MAE are plotted against ``dims`` and the factor counts
    giving the smallest mean RMSE and MAE are printed.

    Args:
        ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.
        dims: sequence of latent-factor counts to evaluate.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # None means "no minimum recorded yet"; a falsy sentinel would wrongly
    # treat a genuine minimum of 0.0 as unset.
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0

    for idx, n_factors in enumerate(dims):
        nmf = NMF(n_factors=n_factors, biased=False)
        cv = cross_validate(algo=nmf,
                            data=data,
                            measures=['RMSE', 'MAE'],
                            cv=10,
                            verbose=True)
        RMSE[idx] = np.mean(cv['test_rmse'])
        if min_RMSE is None or RMSE[idx] < min_RMSE:
            min_RMSE = RMSE[idx]
            fac_num_RMSE = n_factors

        MAE[idx] = np.mean(cv['test_mae'])
        if min_MAE is None or MAE[idx] < min_MAE:
            min_MAE = MAE[idx]
            fac_num_MAE = n_factors

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
 def nmf(dataName, data, biased=True):
     """Cross-validate NMF for every k in ``ks`` and plot MAE / RMSE vs k.

     The mean 10-fold test MAE and RMSE are written into the outer ``mae``
     and ``rmse`` containers at the position of each k, drawn as two
     stacked subplots, and printed.
     """
     print('Start building NMF with ' + dataName + '!')
     for slot, n_factors in enumerate(ks):
         model = NMF(n_factors=n_factors, biased=biased)
         cv_scores = cross_validate(model, data, cv=10)
         mae[slot] = cv_scores['test_mae'].mean()
         rmse[slot] = cv_scores['test_rmse'].mean()
         print('k = ' + str(n_factors) + ' finished!')

     plt.figure()

     # Upper panel: mean absolute error.
     plt.subplot(211)
     plt.plot(ks, mae)
     plt.xlabel('k')
     plt.ylabel('mean absolute error')
     plt.title('Mean absolute error vs. k of ' + dataName)

     # Lower panel: root mean squared error.
     plt.subplot(212)
     plt.plot(ks, rmse)
     plt.xlabel('k')
     plt.ylabel('root mean squared error')
     plt.title('Root mean squared error vs. k of ' + dataName)

     for heading, values in (('mae:', mae), ('rmse:', rmse)):
         print(heading)
         print(values)
     print('Finish building NMF with ' + dataName + '!')
예제 #24
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q12To14And19To21And26To28(qNum, maxk=None):
    """Plot and return the mean 10-fold RMSE against k on a trimmed test set.

    ``qNum`` selects both the movie subset the test sets are restricted to
    (popular / unpopular / high-variance) and the model (12-14:
    KNNWithMeans, 19-21: NMF, 26-28: SVD).  k runs over 2, 4, ..., maxk.

    Args:
        qNum: question number (12-14, 19-21 or 26-28).
        maxk: largest k to try; ``None`` selects the default for the
            question (100 for 12-14, 50 otherwise).

    Returns:
        List with one mean RMSE per k (NaN if every fold was empty after
        trimming).

    Raises:
        KeyError: if ``qNum`` is not a supported question number.
    """
    data = load_data()
    kf = KFold(n_splits=10)
    if maxk is None:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50
        elif 26 <= qNum <= 28:
            maxk = 50

    pop, unpop, highVar = classifyMovies()

    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trimAndModel = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
        26: (pop, 'SVD'),
        27: (unpop, 'SVD'),
        28: (highVar, 'SVD')
    }
    # The selection depends only on qNum, so look it up once.
    trimSet, modelName = trimAndModel[qNum]

    k_values = list(range(2, maxk + 1, 2))  # inclusive
    RMSE = []  # RMSE for each k
    for k in k_values:
        print('-' * 20 + ' k = ' + str(k) + ' ' + '-' * 20)
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k, sim_options=sim_options)
        elif modelName == 'NMF':
            model = NMF(n_factors=k)
        else:
            model = SVD(n_factors=k)
        subRMSE = []  # RMSE for each train-test split of this k
        for fold, (trainSet, testSet) in enumerate(kf.split(data), start=1):
            model.fit(trainSet)
            testSet = [x for x in testSet if x[1] in trimSet]
            nTest = len(testSet)
            # The size is now actually interpolated; previously "%d" was
            # printed literally followed by the number.
            print("Split %d: test set size after trimming: %d"
                  % (fold, nTest))
            if nTest == 0:
                # Nothing left to score; dividing by nTest would fail.
                continue
            predictions = model.test(testSet)
            squared_error = sum(pow(p.est - p.r_ui, 2) for p in predictions)
            # RMSE of this train-test split
            subRMSE.append(np.sqrt(squared_error / nTest))
        # average over all train-test splits for this k
        RMSE.append(np.mean(subRMSE) if subRMSE else float('nan'))

    # plotting
    ys = [[RMSE, 'RMSE']]
    xTitle = 'Number of Neighbors' if qNum <= 14 else 'Number of latent factors'
    make_plot(k_values, ys, xTitle, 'Error')
    return RMSE
예제 #25
0
    plt.savefig('plot/q22_nmf_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    # Plot one NMF ROC curve per like/dislike threshold.
    threshold = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    sim_options = {'name': 'pearson', 'user_based': True}

    trainset, testset = train_test_split(data, test_size=0.1)

    # Training and prediction do not depend on the threshold, so they run
    # once.  Besides saving three fits, every ROC curve now comes from the
    # same fitted model instead of four independently initialized ones.
    algo = NMF(n_factors=16)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Field 3 of a prediction is the estimated rating, field 2 the true one.
    y_estimate = [row[3] for row in predictions]

    for th in threshold:
        y_true = [1 if row[2] >= th else 0 for row in predictions]
        plot_roc(y_true, y_estimate, th)
예제 #26
0
	threshold = 3
	file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
	reader = Reader(sep=',')
	data = Dataset.load_from_file(file_path, reader=reader)

	sim_options = {'name': 'pearson',
	              'user_based': True
	              }

	trainset, testset = train_test_split(data, test_size=0.1)

	algo = KNNWithMeans(k=34, sim_options=sim_options)
	algo.fit(trainset)
	predictions1 = algo.test(testset)

	algo = NMF(n_factors=16)
	algo.fit(trainset)
	predictions2 = algo.test(testset)

	algo = SVD(n_factors=14)
	algo.fit(trainset)
	predictions3 = algo.test(testset)

	y_true = []
	y_estimate1 = []
	y_estimate2 = []
	y_estimate3 = []

	for row in predictions1:
		if row[2] >= threshold:
			y_true.append(1)
예제 #27
0
        print "Fold for" + str(t)
        algo.fit(trainset)
        # print testset
        predictions = algo.test(testset)
        Prec, Reca = metrics(predictions, t)
        pr = pr + Prec
        re = re + Reca

    return pr / 10.0, re / 10.0


if __name__ == '__main__':
    data = retrieve_data()
    G_max = ret_mod_user_dict(data)

    algo_NMF = NMF(NMF_no_of_LF, verbose=False)
    algo_SVD = SVD(n_factors=MF_no_of_LF)
    algo_KNN = KNNWithMeans(k=KNN_no_of_LF,
                            sim_options=sim_options,
                            verbose=False)

    # Q36
    Pr1 = []
    Re1 = []
    t = list(range(1, 26))
    for l in t:
        Precision, Recall = cross_val_(data, G_max, l, algo_KNN)
        Pr1.append(Precision)
        Re1.append(Recall)

    plotgraphs(t, Pr1, "Number of Suggestions", "Precision",
예제 #28
0
파일: Project3.py 프로젝트: shacocn/EE219
def Q36To38(qNum):
    """Plot mean precision and recall against recommendation-list size t.

    ``qNum`` selects the model (36: KNNWithMeans, 37: NMF, 38: SVD).  For
    t = 1..25 the model is evaluated with 10-fold cross-validation on
    ``load_data()``; the per-fold mean precision and recall returned by
    ``precision_recall`` are averaged over all folds.  Three plots are
    produced: precision vs t, recall vs t, and precision vs recall.

    Args:
        qNum: question number (36, 37 or 38).

    Returns:
        (precision_arr, recall_arr): lists with one mean value per t.

    Raises:
        KeyError: if ``qNum`` is not 36, 37 or 38.
    """
    print("problem ", qNum)

    data = load_data()
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    model_names = {
        36: 'KNNWithMeans',
        37: 'NMF',
        38: 'SVD',
    }
    k_KNNWithMeans = 30  # from Q11
    k_NMF = 18  # from Q18
    k_SVD = 8  # from Q25

    modelName = model_names[qNum]

    if modelName == 'KNNWithMeans':
        model = KNNWithMeans(k_KNNWithMeans, sim_options=sim_options)
    elif modelName == 'NMF':
        model = NMF(n_factors=k_NMF)
    else:
        model = SVD(n_factors=k_SVD)

    # sweep t from 1 to 25
    t_list = list(range(1, 26))
    precision_arr = []
    recall_arr = []
    for t in t_list:
        kf = KFold(n_splits=10)
        # Collected across ALL folds of this t.  The accumulators used to be
        # reset inside the fold loop, so only the last fold was reported.
        fold_precisions = []
        fold_recalls = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            fold_precision = sum(precisions.values()) / len(precisions)
            fold_recall = sum(recalls.values()) / len(recalls)
            print(fold_precision)
            print(fold_recall)
            fold_precisions.append(fold_precision)
            fold_recalls.append(fold_recall)
        precision_arr.append(np.mean(fold_precisions))
        recall_arr.append(np.mean(fold_recalls))

    print("model name: ", modelName)

    # precision vs t
    title_ = "precision vs t for: " + modelName
    make_plot(t_list, [[precision_arr, 'mean precisions']],
              'recommended item size t',
              'Precision',
              title=title_)
    # recall vs t
    title_ = "recall vs t for: " + modelName
    make_plot(t_list, [[recall_arr, 'mean recalls']],
              'recommended item size t',
              'Recall',
              title=title_)
    # precision vs recall
    title_ = "precision vs recall for: " + modelName
    plt.plot(recall_arr, precision_arr, label=modelName)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend()
    plt.grid()
    plt.title(title_)
    plt.show()

    return precision_arr, recall_arr
    movie_rating_map = defaultdict(list)
    for val in data:
        movie_rating_map[val[1]].append(val[2])

    high_var_data = [val for val in data if
                     len(movie_rating_map[val[1]]) >= 5 and np.var(movie_rating_map[val[1]]) >= 2.0]
    return high_var_data

# Script section: sweep the number of NMF latent factors and plot the mean
# 10-fold cross-validation RMSE and MAE for each setting.
print("=====================Non-negative Matrix Factorization based filtering=============================================================")
print("Evaluating NNMF collaborative filtering based on #of latent factors vs RMSE and MAE errors on 10folds cross-validation")

# Latent-factor counts to try: 2, 4, ..., 50.
k_range = range(2, 51, 2)
# Mean test RMSE / MAE, one entry per value in k_range (same order).
avg_rmse, avg_mae = [], []

for k in k_range:
    algo = NMF(n_factors=k)
    # `data` is defined outside this section.
    cv_result = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=False)
    avg_rmse.append(np.mean(cv_result['test_rmse']))
    avg_mae.append(np.mean(cv_result['test_mae']))

# Both error curves share one set of axes.
plt.plot(k_range, avg_rmse, label="Average RMSE")
plt.plot(k_range, avg_mae, label="Average MAE")
plt.xlabel('Number of latent factors', fontsize=15)
plt.ylabel('Error', fontsize=15)
plt.legend()
plt.show()

print("=================================Optimal Number of Latent Factors=============================================================")
# Join every entry of movies.genres with '|' and split again on '|', leaving
# the set of distinct genre names.
# NOTE(review): presumably movies.genres holds '|'-separated genre strings and
# the genre count is meant as a reference point for the factor count -- confirm.
all_genres = set('|'.join(movies.genres).split('|'))
print('#of Genres - ', len(all_genres))
                   line_format='user item rating',
                   sep=',',
                   rating_scale=(1, 5),
                   skip_lines=0)
RS_data = Dataset.load_from_df(RS_ratings, RS_reader)

# Benchmark_Algorithm_Metric
benchmark = []
for algorithm in [
        BaselineOnly(),
        CoClustering(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        NMF(),
        NormalPredictor(),
        SlopeOne(),
        SVD(),
        SVDpp()
]:
    # Perform cross validation
    results = cross_validate(algorithm,
                             RS_data,
                             measures=['rmse', 'mae', 'mse', 'fcp'],
                             cv=5,
                             verbose=True)
    # Results To Serie List
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(
        pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],