예제 #1
def knn_item(trainset, testset, predset):
    """Fit an item-based KNN-with-baseline model and save its predictions.

    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist. Otherwise the model is fitted on
    ``trainset``, its RMSE on ``testset`` is printed, and the estimated
    ratings for ``testset`` and ``predset`` are handed to
    ``save_predictions``.

    Args:
        trainset: training data the algorithm is fitted on (presumably a
            Surprise ``Trainset`` -- confirm against the caller).
        testset: ratings passed to ``algo.test`` to measure the RMSE.
        predset: ratings passed to ``algo.test`` to produce the final
            predictions.
    """
    modelname = 'knnitem'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    bsl_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10,
    }
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 100,
        'user_based': False,  # item-item similarities
    }
    algo = KNNBaseline(k=60, sim_options=sim_options, bsl_options=bsl_options)
    print('KNN item based Model')
    # The algorithm must be fitted before test() can estimate ratings.
    algo.fit(trainset)

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
예제 #2
def svdpp(trainset, testset, predset):
    """Fit an SVD++ model and save its predictions.

    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist. Otherwise the model is fitted on
    ``trainset``, the RMSE on the training ratings and on ``testset`` is
    printed, and the estimated ratings for ``testset`` and ``predset`` are
    handed to ``save_predictions``.

    Args:
        trainset: training data the algorithm is fitted on; must provide
            ``build_testset()``.
        testset: ratings passed to ``algo.test`` to measure the RMSE.
        predset: ratings passed to ``algo.test`` to produce the final
            predictions.
    """
    modelname = 'svdpp'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    # NOTE(review): SVDpp may not accept ``bsl_options`` in every Surprise
    # release -- confirm against the installed version.
    bsl_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10,
    }
    algo = SVDpp(n_epochs=40, n_factors=100, bsl_options=bsl_options,
                 lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, lr_yj=0.01,
                 reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1,
                 reg_yj=0.01)
    print('SVDpp Model')
    # The algorithm must be fitted before test() can estimate ratings.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
예제 #3
def svd(trainset, testset, predset):
    """Fit an SVD model and save its predictions.

    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist. Otherwise the model is fitted on
    ``trainset``, the RMSE on the training ratings and on ``testset`` is
    printed, and the estimated ratings for ``testset`` and ``predset`` are
    handed to ``save_predictions``.

    Args:
        trainset: training data the algorithm is fitted on; must provide
            ``build_testset()``.
        testset: ratings passed to ``algo.test`` to measure the RMSE.
        predset: ratings passed to ``algo.test`` to produce the final
            predictions.
    """
    modelname = 'svd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SVD(n_factors=100, n_epochs=40,
               lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1,
               reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1)
    print('SVD Model')
    # The algorithm must be fitted before test() can estimate ratings.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
예제 #4
def baseline(trainset, testset, predset):
    """Fit a baseline-only model and save its predictions.

    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist. Otherwise the model is fitted on
    ``trainset``, the RMSE on the training ratings and on ``testset`` is
    printed, and the estimated ratings for ``testset`` and ``predset`` are
    handed to ``save_predictions``.

    Args:
        trainset: training data the algorithm is fitted on; must provide
            ``build_testset()``.
        testset: ratings passed to ``algo.test`` to measure the RMSE.
        predset: ratings passed to ``algo.test`` to produce the final
            predictions.
    """
    modelname = 'baseline'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    bsl_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10,
    }
    algo = BaselineOnly(bsl_options=bsl_options)
    print('Baseline Model')
    # The algorithm must be fitted before test() can estimate ratings.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
예제 #5
def slope_one(trainset, testset, predset):
    """Fit a SlopeOne model and save its predictions.

    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist. Otherwise the model is fitted on
    ``trainset``, the RMSE on the training ratings and on ``testset`` is
    printed, and the estimated ratings for ``testset`` and ``predset`` are
    handed to ``save_predictions``.

    Args:
        trainset: training data the algorithm is fitted on; must provide
            ``build_testset()``.
        testset: ratings passed to ``algo.test`` to measure the RMSE.
        predset: ratings passed to ``algo.test`` to produce the final
            predictions.
    """
    modelname = 'slopeone'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SlopeOne()
    print('SlopeOne Model')
    # The algorithm must be fitted before test() can estimate ratings.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print('  Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
예제 #6
def global_mean(trainset, testset, predset):
    """Save predictions based on the global mean.

    Every rating is predicted as ``trainset.global_mean``. Nothing is done
    when ``is_already_predicted`` reports that predictions for this model
    already exist.

    Args:
        trainset: training data; provides ``global_mean``, ``n_ratings`` and
            ``all_ratings()``.
        testset: sequence of ``(user, item, rating)`` triples used to
            measure the RMSE.
        predset: sequence whose length sets the number of final predictions.
    """
    modelname = 'globalmean'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    print('Global Mean model')
    # Named differently from the function so the function is not shadowed.
    mean_rating = trainset.global_mean
    # Find predictions
    train_pred = np.tile(mean_rating, trainset.n_ratings)
    test_pred = np.tile(mean_rating, len(testset))
    final_pred = np.tile(mean_rating, len(predset))
    # Extract true labels
    train_labels = [rat for (_, _, rat) in trainset.all_ratings()]
    test_labels = [rat for (_, _, rat) in testset]

    # Evaluate performances
    print('   RMSE on Train: ', calculate_rmse(train_labels, train_pred))
    rmse = calculate_rmse(test_labels, test_pred)
    print('   RMSE on Test: ', rmse)

    # Save predictions
    save_predictions(modelname, rmse, test_pred, 'test')
    save_predictions(modelname, rmse, final_pred)
예제 #7
def user_mean(trainset, testset, predset):
    """Save predictions based on the user means.

    Each rating is predicted as the mean training rating of its user. Users
    without any training rating (including users that only appear in
    ``testset`` or ``predset``) fall back to ``trainset.global_mean``.
    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist.

    Args:
        trainset: training data; provides ``all_users()``, ``ur``,
            ``to_raw_uid()``, ``all_ratings()`` and ``global_mean``.
        testset: sequence of ``(raw_user, raw_item, rating)`` triples used
            to measure the RMSE.
        predset: sequence of ``(raw_user, raw_item, rating)`` triples to
            predict.
    """
    modelname = 'usermean'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    print('User Mean model')

    fallback = trainset.global_mean

    # Mean rating of each user, keyed by the integer form of the raw user id.
    # A mapping (rather than an array sized n_users + 1) stays valid when raw
    # ids are not exactly 1..n_users.
    mean_by_user = {}
    for inner_uid in trainset.all_users():
        ratings = [rat for (_, rat) in trainset.ur[inner_uid]]
        raw_uid = int(trainset.to_raw_uid(inner_uid))
        mean_by_user[raw_uid] = np.mean(ratings) if ratings else fallback

    # Extract info from datasets. all_ratings() yields inner ids, whereas
    # testset and predset carry raw ids (strings that hold integers).
    train_users, _, train_labels = zip(*trainset.all_ratings())
    test_users, _, test_labels = zip(*testset)
    pred_users, _, _ = zip(*predset)

    # Calculate predictions
    train_pred = np.array(
        [mean_by_user[int(trainset.to_raw_uid(user))] for user in train_users],
        dtype=float)
    test_pred = np.array(
        [mean_by_user.get(int(user), fallback) for user in test_users],
        dtype=float)
    final_pred = np.array(
        [mean_by_user.get(int(user), fallback) for user in pred_users],
        dtype=float)

    # Evaluate performances
    print('   RMSE on Train: ', calculate_rmse(train_labels, train_pred))
    rmse = calculate_rmse(test_labels, test_pred)
    print('   RMSE on Test: ', rmse)
    # Save predictions
    save_predictions(modelname, rmse, test_pred, 'test')
    save_predictions(modelname, rmse, final_pred)
예제 #8
def item_median(trainset, testset, predset):
    """Save predictions based on the item medians.

    Each rating is predicted as the median training rating of its item.
    Items without any training rating (including items that only appear in
    ``testset`` or ``predset``) fall back to ``trainset.global_mean``.
    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist.

    Args:
        trainset: training data; provides ``all_items()``, ``ir``,
            ``to_raw_iid()``, ``all_ratings()`` and ``global_mean``.
        testset: sequence of ``(raw_user, raw_item, rating)`` triples used
            to measure the RMSE.
        predset: sequence of ``(raw_user, raw_item, rating)`` triples to
            predict.
    """
    modelname = 'itemmedian'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    print('Item Median model')

    fallback = trainset.global_mean

    # Median rating of each item, keyed by the integer form of the raw item
    # id. A mapping (rather than an array sized n_items + 1) stays valid when
    # raw ids are not exactly 1..n_items.
    median_by_item = {}
    for inner_iid in trainset.all_items():
        ratings = [rat for (_, rat) in trainset.ir[inner_iid]]
        raw_iid = int(trainset.to_raw_iid(inner_iid))
        median_by_item[raw_iid] = np.median(ratings) if ratings else fallback

    # Extract info from datasets. all_ratings() yields inner ids, whereas
    # testset and predset carry raw ids (strings that hold integers).
    _, train_items, train_labels = zip(*trainset.all_ratings())
    _, test_items, test_labels = zip(*testset)
    _, pred_items, _ = zip(*predset)

    # Calculate predictions
    train_pred = np.array(
        [median_by_item[int(trainset.to_raw_iid(item))] for item in train_items],
        dtype=float)
    test_pred = np.array(
        [median_by_item.get(int(item), fallback) for item in test_items],
        dtype=float)
    final_pred = np.array(
        [median_by_item.get(int(item), fallback) for item in pred_items],
        dtype=float)

    # Evaluate performances
    print('   RMSE on Train: ', calculate_rmse(train_labels, train_pred))
    rmse = calculate_rmse(test_labels, test_pred)
    print('   RMSE on Test: ', rmse)
    # Save predictions
    save_predictions(modelname, rmse, test_pred, 'test')
    save_predictions(modelname, rmse, final_pred)
예제 #9
def matrix_factorization_ALS(trainset, testset, predset, verbose=False):
    """Save predictions based on matrix factorization with ALS.

    The ratings are copied into sparse ``users x items`` matrices, user and
    item features are updated alternately until the train error changes by
    no more than ``stop_criterion`` between two sweeps, and the resulting
    predictions for ``predset`` and ``testset`` are handed to
    ``save_predictions``. Nothing is done when ``is_already_predicted``
    reports that predictions for this model already exist.

    Args:
        trainset: training data; provides ``all_ratings()``,
            ``to_raw_uid()`` and ``to_raw_iid()``.
        testset: sequence of ``(raw_user, raw_item, rating)`` triples used
            to measure the RMSE.
        predset: sequence of ``(raw_user, raw_item, rating)`` triples to
            predict.
        verbose: when true, print the train error after every ALS sweep.
    """
    modelname = 'mfals'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    num_features = 20   # K in the lecture notes
    lambda_user = 0.08
    lambda_item = 0.1
    stop_criterion = 1e-4
    change = 1
    error_list = [0, 0]

    # Build matrix of train and test for compatibility with this algorithm.
    # Rows are users and columns are items; the shape is hard-coded, so raw
    # ids must lie in 1..10000 (users) and 1..1000 (items).
    train = sp.lil_matrix((10000, 1000))
    test = sp.lil_matrix((10000, 1000))

    # Extract raw ids and labels from trainset in a single pass
    train_users, train_items, train_labels = [], [], []
    for (inner_uid, inner_iid, rating) in trainset.all_ratings():
        train_users.append(trainset.to_raw_uid(inner_uid))
        train_items.append(trainset.to_raw_iid(inner_iid))
        train_labels.append(rating)
    # Extract info from testset and predset
    test_users, test_items, test_labels = zip(*testset)
    pred_users, pred_items, _ = zip(*predset)

    # Raw ids are strings, so convert them into integer.
    # Decrease by 1 because raw ids start from 1 and matrix indices from 0
    train_users = np.array(train_users, dtype='int') - 1
    test_users = np.array(test_users, dtype='int') - 1
    pred_users = np.array(pred_users, dtype='int') - 1
    train_items = np.array(train_items, dtype='int') - 1
    test_items = np.array(test_items, dtype='int') - 1
    pred_items = np.array(pred_items, dtype='int') - 1

    # Fill train and test matrices
    train[train_users, train_items] = train_labels
    test[test_users, test_items] = test_labels

    # set seed so that runs are reproducible
    # NOTE(review): the seed value is arbitrary -- confirm the intended one.
    np.random.seed(988)

    # init ALS
    user_features, item_features = init_MF(train, num_features)
    # get the number of non-zero ratings for each user and item
    nnz_items_per_user = train.getnnz(axis=1)
    nnz_users_per_item = train.getnnz(axis=0)
    # group the indices by row or column index
    nz_train, nz_user_itemindices, nz_item_userindices = build_index_groups(train)

    # run ALS
    print("Matrix Factorization ALS Model")
    while change > stop_criterion:
        # update user feature & item feature
        user_features = update_user_feature(
            train, item_features, lambda_user,
            nnz_items_per_user, nz_user_itemindices)
        item_features = update_item_feature(
            train, user_features, lambda_item,
            nnz_users_per_item, nz_item_userindices)

        error = compute_error_MF(train, user_features, item_features, nz_train)
        if verbose:
            print("RMSE on Train: {}.".format(error))
        # Record the error; otherwise ``change`` would always be
        # |0 - 0| = 0 and the loop would stop after a single sweep.
        error_list.append(error)
        change = np.fabs(error_list[-1] - error_list[-2])

    # Evaluate train error
    print("RMSE on Train: ", compute_error_MF(train, user_features, item_features, nz_train))

    # evaluate the test error
    nnz_row, nnz_col = test.nonzero()
    nnz_test = list(zip(nnz_row, nnz_col))
    rmse = compute_error_MF(test, user_features, item_features, nnz_test)
    print("RMSE on Test: {v}.".format(v=rmse))

    # Save predictions
    predictions = user_features.T @ item_features
    # np.asscalar was removed from NumPy; ndarray.item() is its replacement.
    rmse_value = np.asarray(rmse).item()
    save_predictions(modelname, rmse_value, predictions[pred_users, pred_items])
    # NOTE(review): test predictions follow the order of test.nonzero(), not
    # the order of ``testset`` -- confirm this is what the consumer expects.
    save_predictions(modelname, rmse_value, predictions[test.nonzero()], 'test')
예제 #10
def matrix_factorization_SGD(trainset, testset, predset):
    """Save predictions based on the matrix factorization with SGD.

    The ratings are copied into sparse ``users x items`` matrices, user and
    item features are trained with stochastic gradient descent for
    ``num_epochs`` passes over the training ratings, and the resulting
    predictions for ``predset`` and ``testset`` are handed to
    ``save_predictions``. Nothing is done when ``is_already_predicted``
    reports that predictions for this model already exist.

    Args:
        trainset: training data; provides ``all_ratings()``,
            ``to_raw_uid()`` and ``to_raw_iid()``.
        testset: sequence of ``(raw_user, raw_item, rating)`` triples used
            to measure the RMSE.
        predset: sequence of ``(raw_user, raw_item, rating)`` triples to
            predict.
    """
    modelname = 'mfsgd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    gamma = 0.06
    num_features = 20   # K in the lecture notes
    lambda_user = 0.08
    lambda_item = 0.1
    num_epochs = 30     # number of full passes through the train set

    # Build matrix of train and test for compatibility with this algorithm.
    # Rows are users and columns are items; the shape is hard-coded, so raw
    # ids must lie in 1..10000 (users) and 1..1000 (items).
    train = sp.lil_matrix((10000, 1000))
    test = sp.lil_matrix((10000, 1000))

    # Extract raw ids and labels from trainset in a single pass
    train_users, train_items, train_labels = [], [], []
    for (inner_uid, inner_iid, rating) in trainset.all_ratings():
        train_users.append(trainset.to_raw_uid(inner_uid))
        train_items.append(trainset.to_raw_iid(inner_iid))
        train_labels.append(rating)
    # Extract info from testset and predset
    test_users, test_items, test_labels = zip(*testset)
    pred_users, pred_items, _ = zip(*predset)

    # Raw ids are strings, so convert them into integer.
    # Decrease by 1 because raw ids start from 1 and matrix indices from 0
    train_users = np.array(train_users, dtype='int') - 1
    test_users = np.array(test_users, dtype='int') - 1
    pred_users = np.array(pred_users, dtype='int') - 1
    train_items = np.array(train_items, dtype='int') - 1
    test_items = np.array(test_items, dtype='int') - 1
    pred_items = np.array(pred_items, dtype='int') - 1

    # Fill train and test matrices
    train[train_users, train_items] = train_labels
    test[test_users, test_items] = test_labels

    # set seed so that runs are reproducible
    # NOTE(review): the seed value is arbitrary -- confirm the intended one.
    np.random.seed(988)

    # init matrix
    user_features, item_features = init_MF(train, num_features)

    # find the non-zero ratings indices
    nz_row, nz_col = train.nonzero()
    nz_train = list(zip(nz_row, nz_col))
    nz_row, nz_col = test.nonzero()
    nz_test = list(zip(nz_row, nz_col))

    print("Matrix Factorization SGD Model")
    for _ in range(num_epochs):
        # shuffle the training rating indices so every epoch visits the
        # ratings in a different order
        np.random.shuffle(nz_train)

        # decrease step size
        gamma /= 1.2

        for d, n in nz_train:
            # d indexes the user (row) and n the item (column):
            # update item_features[:, n] and user_features[:, d]
            item_info = item_features[:, n]
            user_info = user_features[:, d]
            err = train[d, n] - user_info.T.dot(item_info)
            # calculate the gradient and update
            item_features[:, n] += gamma * (err * user_info - lambda_item * item_info)
            user_features[:, d] += gamma * (err * item_info - lambda_user * user_info)

    # evaluate the train and test error
    print('   RMSE on train: ', compute_error_MF(train, user_features, item_features, nz_train))
    rmse = compute_error_MF(test, user_features, item_features, nz_test)
    print("   RMSE on Test: ", rmse)

    # Save predictions
    predictions = user_features.T @ item_features
    # np.asscalar was removed from NumPy; ndarray.item() is its replacement.
    rmse_value = np.asarray(rmse).item()
    save_predictions(modelname, rmse_value, predictions[pred_users, pred_items])
    # NOTE(review): test predictions follow the order of test.nonzero(), not
    # the order of ``testset`` -- confirm this is what the consumer expects.
    save_predictions(modelname, rmse_value, predictions[test.nonzero()], 'test')