예제 #1
0
def svd(data, svd_data, lr=0.01, reg=0.1, k=10, iters=500):
    """Fit an SVD model and score it on the training and validation splits.

    Args:
        data: Mapping with 'train_data' and 'val_data' entries; each is
            handed to ``evaluate`` together with the matching predictions.
        svd_data: Mapping with 'train_svd' and 'val_svd' entries; each is
            wrapped in a DataFrame for fitting and passed unchanged to
            ``predict``.
        lr: Forwarded to ``SVD`` as ``learning_rate``.
        reg: Forwarded to ``SVD`` as ``regularization``.
        k: Forwarded to ``SVD`` as ``n_factors``.
        iters: Forwarded to ``SVD`` as ``n_epochs``.

    Returns:
        A ``(train_acc, val_acc)`` tuple holding whatever ``evaluate``
        returns for each split.
    """
    # Resolve all four inputs up front so a missing key fails before training.
    train_truth, val_truth = data['train_data'], data['val_data']
    train_inputs, val_inputs = svd_data['train_svd'], svd_data['val_svd']

    # Ratings are bounded to [0, 1]; early stopping and shuffling are off.
    model = SVD(
        learning_rate=lr,
        regularization=reg,
        n_epochs=iters,
        n_factors=k,
        min_rating=0,
        max_rating=1,
    )
    model.fit(
        X=pd.DataFrame(train_inputs),
        X_val=pd.DataFrame(val_inputs),
        early_stopping=False,
        shuffle=False,
    )

    # Tuple elements are evaluated left to right: train first, then validation.
    return (
        evaluate(train_truth, model.predict(train_inputs)),
        evaluate(val_truth, model.predict(val_inputs)),
    )
예제 #2
0
def main():
    """Train a bagged IRT + SVD ensemble and save private-test predictions.

    For each of ``n_resamples`` resamples of the training data, one 2-PL IRT
    model and one Funk SVD model are trained. Their test predictions are
    averaged over all ``2 * n_resamples`` models, thresholded at 0.5, written
    into ``test_data['is_correct']`` and saved with ``save_private_test_csv``.
    """
    # Hyperparameters
    n_resamples = 5  # Number of bootstrap resamples

    irt_epochs, irt_step = 30, 0.009

    # Shared constructor arguments for every SVD model in the ensemble.
    svd_params = dict(
        learning_rate=0.01,
        regularization=0.10,
        n_epochs=500,
        n_factors=50,
        min_rating=0,
        max_rating=1,
    )

    def to_svd_columns(records):
        # Re-key the dataset columns to the u_id / i_id / rating names used
        # for SVD input in this file.
        return {
            'u_id': records['user_id'],
            'i_id': records['question_id'],
            'rating': records['is_correct'],
        }

    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_private_test_csv('../data')

    val_svd = to_svd_columns(val_data)
    test_svd = to_svd_columns(test_data)

    # Resamples are drawn separately for each model family (SVD first).
    svd_resamples = generate_resamples(train_data, n_resamples)
    irt_resamples = generate_resamples(train_data, n_resamples)

    irt_votes = []
    svd_votes = []
    for idx in range(n_resamples):
        irt_sample = irt_resamples[idx]
        svd_sample = svd_resamples[idx]

        # Train 2-PL IRT; only the fitted parameters are needed here, the
        # remaining five returned values are discarded.
        theta, a, beta, _, _, _, _, _ = irt(
            irt_sample, val_data, irt_step, irt_epochs)
        irt_votes.append(irt_predict(test_data, theta, a, beta)[0])

        # Train Funk SVD on this resample.
        svd_frame = pd.DataFrame(to_svd_columns(svd_sample))
        model = SVD(**svd_params)
        model.fit(X=svd_frame, X_val=pd.DataFrame(val_svd),
                  early_stopping=False, shuffle=False)
        svd_votes.append(model.predict(test_svd))

    # List concatenation stacks all model outputs; summing over axis 0 and
    # dividing by the model count gives the per-row mean prediction.
    all_votes = irt_votes + svd_votes
    test_avg = np.sum(all_votes, axis=0) / (2 * n_resamples)

    binary_pred = [0 if score < 0.5 else 1 for score in test_avg]
    test_data['is_correct'] = binary_pred
    save_private_test_csv(test_data)
예제 #3
0
def main():
    """Sweep SVD hyperparameters one at a time, then fit and score a final model.

    Each hyperparameter is varied on its own while the others keep the
    defaults of ``svd``. The candidate with the highest validation result is
    printed and the train/validation curves are plotted. A final ``SVD`` is
    then fitted with the four winners and its train, validation and test
    results are printed.
    """
    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_public_test_csv("../data")

    def to_svd_columns(records):
        # Re-key the dataset columns to the u_id / i_id / rating names used
        # for SVD input in this file.
        return {
            'u_id': records['user_id'],
            'i_id': records['question_id'],
            'rating': records['is_correct'],
        }

    train_svd = to_svd_columns(train_data)
    val_svd = to_svd_columns(val_data)
    test_svd = to_svd_columns(test_data)

    data = {"train_data": train_data, "val_data": val_data}
    svd_data = {"train_svd": train_svd, "val_svd": val_svd}

    # One row per hyperparameter:
    # (svd() keyword, candidate values, message for the winner, plot title)
    sweeps = [
        ("lr", [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
         "Best learning rate: ", "Learning Rates"),
        ("reg", [0.001, 0.01, 0.05, 0.1, 0.5, 1],
         "Best regularization value: ", "Regularized Rates"),
        ("k", [1, 5, 10, 20, 50, 100],
         "Best k: ", "K-Values"),
        ("iters", [10, 50, 100, 500, 1000, 2000],
         "Best iterations: ", "Iterations"),
    ]

    # Run every sweep before reporting anything.
    curves = {}
    for keyword, candidates, _, _ in sweeps:
        train_scores = []
        val_scores = []
        for candidate in candidates:
            train_result, val_result = svd(
                data, svd_data, **{keyword: candidate})
            train_scores.append(train_result)
            val_scores.append(val_result)
        curves[keyword] = (train_scores, val_scores)

    # Pick the first candidate that reaches the maximum validation result.
    best = {}
    for keyword, candidates, message, _ in sweeps:
        val_scores = curves[keyword][1]
        best[keyword] = candidates[val_scores.index(max(val_scores))]
        print(message, best[keyword])

    for keyword, candidates, _, title in sweeps:
        train_scores, val_scores = curves[keyword]
        plot(candidates, train_scores, val_scores, title)

    final_svd = SVD(
        learning_rate=best["lr"],
        regularization=best["reg"],
        n_epochs=best["iters"],
        n_factors=best["k"],
        min_rating=0,
        max_rating=1,
    )
    final_svd.fit(
        X=pd.DataFrame(train_svd),
        X_val=pd.DataFrame(val_svd),
        early_stopping=False,
        shuffle=False,
    )

    # Train, validation and test results, reported in that order.
    reports = [
        ("Final Train Accuracy: ", train_data, train_svd),
        ("Final Validation Accuracy: ", val_data, val_svd),
        ("Final Test Accuracy: ", test_data, test_svd),
    ]
    for message, truth, inputs in reports:
        print(message, evaluate(truth, final_svd.predict(inputs)))
예제 #4
0
import pandas as pd
import numpy as np

from funk_svd.dataset import fetch_ml20m_ratings
from funk_svd import SVD

from sklearn.metrics import mean_absolute_error

# Load the ratings table returned by fetch_ml20m_ratings.
df = fetch_ml20m_ratings()

# 80% of the rows go to training; the remaining rows are split evenly into
# validation and test. Fixed seeds keep the split reproducible.
train = df.sample(frac=0.8, random_state=7)
holdout = df.drop(train.index.tolist())
val = holdout.sample(frac=0.5, random_state=8)
test = holdout.drop(val.index.tolist())

# Ratings are bounded to the 1-5 range.
svd = SVD(
    learning_rate=0.001,
    regularization=0.005,
    n_epochs=100,
    n_factors=15,
    min_rating=1,
    max_rating=5,
)

# Fit with early stopping, passing the validation split as X_val.
svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

# Score the held-out test rows with mean absolute error.
pred = svd.predict(test)
mae = mean_absolute_error(test["rating"], pred)

print("Test MAE: {:.2f}".format(mae))
예제 #5
0
# 80% of the rows go to training; the remaining rows are split evenly into
# validation and test. `df` is expected to be defined before this snippet.
train = df.sample(frac=0.8, random_state=7)
holdout = df.drop(train.index.tolist())
val = holdout.sample(frac=0.5, random_state=8)
test = holdout.drop(val.index.tolist())

# Ratings are bounded to the 1-5 range.
svd = SVD(
    learning_rate=0.001,
    regularization=0.005,
    n_epochs=100,
    n_factors=15,
    min_rating=1,
    max_rating=5,
)

# Utility matrix built from the full ratings table, before any fitting.
# NOTE(review): presumably a u_id-by-item matrix; confirm against the SVD class.
df_matrix_original = svd.get_utility_matrix(df)
print("Original Utility Matrix: \n", df_matrix_original.values)

# Getting all u_id and i_id combinations: bring the index back as a column,
# then unpivot every remaining column against 'u_id'.
matrix_with_ids = df_matrix_original.reset_index(drop=False)
df_user_item = pd.melt(matrix_with_ids, id_vars='u_id')

# Fit with early stopping, passing the validation split as X_val.
svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

# Predict the held-out test rows, then every row of the melted table.
pred_test = svd.predict(test)
df_user_item["rating"] = svd.predict(df_user_item)

predicted_matrix = svd.get_utility_matrix(df_user_item)
print("Predicted Utility Matrix: \n",
      predicted_matrix.values)

# Score the test-split predictions with mean absolute error.
mae = mean_absolute_error(test["rating"], pred_test)

print(f'Test MAE: {mae:.2f}')