def svd(data, svd_data, lr=0.01, reg=0.1, k=10, iters=500):
    """Train a Funk SVD model and return (train accuracy, validation accuracy)."""
    train_data, val_data = data['train_data'], data['val_data']
    train_svd, val_svd = svd_data['train_svd'], svd_data['val_svd']

    svd = SVD(learning_rate=lr, regularization=reg, n_epochs=iters,
              n_factors=k, min_rating=0, max_rating=1)
    svd.fit(X=pd.DataFrame(train_svd), X_val=pd.DataFrame(val_svd),
            early_stopping=False, shuffle=False)

    # Train accuracy
    pred = svd.predict(train_svd)
    train_acc = evaluate(train_data, pred)

    # Validation accuracy
    pred = svd.predict(val_svd)
    val_acc = evaluate(val_data, pred)

    return train_acc, val_acc
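# `evaluate` above comes from the project's utility module and is not reproduced
# here. A minimal sketch, assuming it thresholds the predicted probabilities at
# 0.5 and reports the fraction that matches the observed `is_correct` labels
# (the `data` dict format follows load_train_csv / load_valid_csv):
import numpy as np

def evaluate(data, predictions, threshold=0.5):
    # Binarize the predictions and compare them to the correctness labels.
    binarized = np.array(predictions) >= threshold
    return np.mean(binarized == np.array(data['is_correct']))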
def main():
    # Hyperparameters
    m = 5  # Number of bootstrap resamples
    irt_iters = 30
    irt_lr = 0.009
    svd_lr = 0.01
    svd_reg = 0.10
    svd_k = 50
    svd_iters = 500

    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_private_test_csv("../data")

    val_svd = {'u_id': val_data['user_id'],
               'i_id': val_data['question_id'],
               'rating': val_data['is_correct']}
    test_svd = {'u_id': test_data['user_id'],
                'i_id': test_data['question_id'],
                'rating': test_data['is_correct']}

    # Draw m bootstrap resamples of the training set for each base model.
    svd_train_resamples = generate_resamples(train_data, m)
    irt_train_resamples = generate_resamples(train_data, m)

    svd_test_pred, irt_test_pred = [], []
    for i in range(m):
        curr_irt, curr_svd = irt_train_resamples[i], svd_train_resamples[i]

        # Train 2-PL IRT
        theta, a, beta, train_acc, val_acc, train_log_likes, val_log_likes, final = \
            irt(curr_irt, val_data, irt_lr, irt_iters)
        irt_test_pred.append(irt_predict(test_data, theta, a, beta)[0])

        # Train Funk SVD
        curr_svd = {'u_id': curr_svd['user_id'],
                    'i_id': curr_svd['question_id'],
                    'rating': curr_svd['is_correct']}
        svd = SVD(learning_rate=svd_lr, regularization=svd_reg,
                  n_epochs=svd_iters, n_factors=svd_k,
                  min_rating=0, max_rating=1)
        svd.fit(X=pd.DataFrame(curr_svd), X_val=pd.DataFrame(val_svd),
                early_stopping=False, shuffle=False)
        svd_test_pred.append(svd.predict(test_svd))

    # Average the 2m sets of test predictions and threshold at 0.5.
    test_avg = np.sum(irt_test_pred + svd_test_pred, axis=0) / (2 * m)
    binary_pred = [0 if x < 0.5 else 1 for x in test_avg]
    test_data['is_correct'] = binary_pred
    save_private_test_csv(test_data)
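# `generate_resamples` is a project helper that is not shown here. A minimal
# sketch, assuming it draws m bootstrap resamples by sampling the
# (user_id, question_id, is_correct) triples with replacement from the
# dictionary returned by load_train_csv:
import numpy as np

def generate_resamples(data, m):
    n = len(data['is_correct'])
    resamples = []
    for _ in range(m):
        idx = np.random.randint(0, n, size=n)  # indices sampled with replacement
        resamples.append({
            'user_id': [data['user_id'][j] for j in idx],
            'question_id': [data['question_id'][j] for j in idx],
            'is_correct': [data['is_correct'][j] for j in idx],
        })
    return resamples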
def main():
    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_public_test_csv("../data")

    train_svd = {'u_id': train_data['user_id'],
                 'i_id': train_data['question_id'],
                 'rating': train_data['is_correct']}
    val_svd = {'u_id': val_data['user_id'],
               'i_id': val_data['question_id'],
               'rating': val_data['is_correct']}
    test_svd = {'u_id': test_data['user_id'],
                'i_id': test_data['question_id'],
                'rating': test_data['is_correct']}

    data = {"train_data": train_data, "val_data": val_data}
    svd_data = {"train_svd": train_svd, "val_svd": val_svd}

    # Candidate hyperparameter values, tuned one at a time while the others
    # stay at the defaults of svd().
    lrs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    regs = [0.001, 0.01, 0.05, 0.1, 0.5, 1]
    ks = [1, 5, 10, 20, 50, 100]
    iters = [10, 50, 100, 500, 1000, 2000]

    lr_train_results, lr_val_results = [], []
    reg_train_results, reg_val_results = [], []
    ks_train_results, ks_val_results = [], []
    iters_train_results, iters_val_results = [], []

    for lr in lrs:
        train_result, val_result = svd(data, svd_data, lr=lr)
        lr_train_results.append(train_result)
        lr_val_results.append(val_result)

    for reg in regs:
        train_result, val_result = svd(data, svd_data, reg=reg)
        reg_train_results.append(train_result)
        reg_val_results.append(val_result)

    for k in ks:
        train_result, val_result = svd(data, svd_data, k=k)
        ks_train_results.append(train_result)
        ks_val_results.append(val_result)

    for n_iters in iters:
        train_result, val_result = svd(data, svd_data, iters=n_iters)
        iters_train_results.append(train_result)
        iters_val_results.append(val_result)

    # Pick the value with the highest validation accuracy for each hyperparameter.
    best_lr = lrs[lr_val_results.index(max(lr_val_results))]
    print("Best learning rate: ", best_lr)
    best_reg = regs[reg_val_results.index(max(reg_val_results))]
    print("Best regularization value: ", best_reg)
    best_k = ks[ks_val_results.index(max(ks_val_results))]
    print("Best k: ", best_k)
    best_iter = iters[iters_val_results.index(max(iters_val_results))]
    print("Best iterations: ", best_iter)

    plot(lrs, lr_train_results, lr_val_results, "Learning Rates")
    plot(regs, reg_train_results, reg_val_results, "Regularization Values")
    plot(ks, ks_train_results, ks_val_results, "K-Values")
    plot(iters, iters_train_results, iters_val_results, "Iterations")

    # Retrain with the best hyperparameters and report the final accuracies.
    final_svd = SVD(learning_rate=best_lr, regularization=best_reg,
                    n_epochs=best_iter, n_factors=best_k,
                    min_rating=0, max_rating=1)
    final_svd.fit(X=pd.DataFrame(train_svd), X_val=pd.DataFrame(val_svd),
                  early_stopping=False, shuffle=False)

    # Train accuracy
    pred = final_svd.predict(train_svd)
    train_acc = evaluate(train_data, pred)
    print("Final Train Accuracy: ", train_acc)

    # Validation accuracy
    pred = final_svd.predict(val_svd)
    val_acc = evaluate(val_data, pred)
    print("Final Validation Accuracy: ", val_acc)

    # Test accuracy
    pred = final_svd.predict(test_svd)
    test_acc = evaluate(test_data, pred)
    print("Final Test Accuracy: ", test_acc)
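# `plot` is a small plotting helper from the project and is not reproduced
# here. A minimal sketch, assuming it plots train and validation accuracy
# against the candidate hyperparameter values with matplotlib:
import matplotlib.pyplot as plt

def plot(values, train_results, val_results, name):
    plt.figure()
    plt.plot(values, train_results, marker='o', label='Train accuracy')
    plt.plot(values, val_results, marker='o', label='Validation accuracy')
    plt.xlabel(name)
    plt.ylabel('Accuracy')
    plt.title(f'Accuracy vs. {name}')
    plt.legend()
    plt.show()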
import pandas as pd
import numpy as np

from funk_svd.dataset import fetch_ml20m_ratings
from funk_svd import SVD

from sklearn.metrics import mean_absolute_error

df = fetch_ml20m_ratings()
train = df.sample(frac=0.8, random_state=7)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
          n_factors=15, min_rating=1, max_rating=5)
svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
mae = mean_absolute_error(test["rating"], pred)
print("Test MAE: {:.2f}".format(mae))
# Continues the previous example: the imports and `df = fetch_ml20m_ratings()`
# from above are assumed to already be in scope.
train = df.sample(frac=0.8, random_state=7)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
          n_factors=15, min_rating=1, max_rating=5)

df_matrix_original = svd.get_utility_matrix(df)
print("Original Utility Matrix: \n", df_matrix_original.values)

# Getting all u_id and i_id combinations
df_user_item = pd.melt(df_matrix_original.reset_index(drop=False), id_vars='u_id')

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred_test = svd.predict(test)
df_user_item["rating"] = svd.predict(df_user_item)
print("Predicted Utility Matrix: \n", svd.get_utility_matrix(df_user_item).values)

mae = mean_absolute_error(test["rating"], pred_test)
print(f'Test MAE: {mae:.2f}')