def construct_full_features(predicted_user_features, predicted_item_features, valid_users_idx, valid_items_idx, min_num_ratings, train_full, lambda_user, lambda_item):
    """
    Expand the predicted feature matrices back to the full prediction size.

    Users and items that were filtered out (fewer than ``min_num_ratings``
    ratings) are re-inserted, and their feature columns are filled so that the
    factorization covers the whole ``train_full`` matrix.

    :param predicted_user_features: user features learned on the filtered data
    :param predicted_item_features: item features learned on the filtered data
    :param valid_users_idx: indices of the users kept during filtering
    :param valid_items_idx: indices of the items kept during filtering
    :param min_num_ratings: minimum number of ratings used for the filtering
    :param train_full: full training data set of shape (num_items, num_users)
    :param lambda_user: weight of the regularizer for user_features
    :param lambda_item: weight of the regularizer for item_features
    :return: the full user and item features matrices of shapes
        (num_features, num_users) and (num_features, num_items)
    """
    # Base case: nothing was filtered, the predicted matrices already have the full size.
    if min_num_ratings == 0:
        return predicted_user_features, predicted_item_features

    total_num_items, total_num_users = train_full.shape

    # Re-insert columns for the users and items that were removed by the filtering
    full_user_features = add_removed_elements(predicted_user_features, valid_users_idx, total_num_users)
    full_item_features = add_removed_elements(predicted_item_features, valid_items_idx, total_num_items)

    # Indices of the re-inserted (previously filtered out) users and items
    added_users = unvalid_indexes(total_num_users, valid_users_idx)
    added_items = unvalid_indexes(total_num_items, valid_items_idx)

    # Per-user and per-item counts of non-zero ratings
    nnz_items_per_user = train_full.getnnz(axis=0)
    nnz_users_per_item = train_full.getnnz(axis=1)

    # For every item (row) the users that rated it, and for every user (column)
    # the items that user rated.
    _, nz_row_colindices, nz_col_rowindices = build_index_groups(train_full)
    nz_item_userindices = [colindices for _, colindices in nz_row_colindices]
    nz_user_itemindices = [rowindices for _, rowindices in nz_col_rowindices]

    # Fill the re-inserted columns so the prediction converges to something
    # better than the plain average.
    full_item_features = fill_added_item_features(full_item_features, full_user_features, added_items, train_full, lambda_item, nnz_users_per_item, nz_item_userindices)
    full_user_features = fill_added_user_features(full_item_features, full_user_features, added_users, train_full, lambda_user, nnz_items_per_user, nz_user_itemindices)

    return full_user_features, full_item_features
def ALS(train, test, n_f, l_u, l_i):
    """
    Run ALS matrix factorization until the training RMSE stabilizes.

    :param train: training ratings matrix (sparse, items x users)
    :param test: test ratings matrix, or None to skip the test evaluation
    :param n_f: number of latent features (K)
    :param l_u: weight of the regularizer for the user features
    :param l_i: weight of the regularizer for the item features
    :return: (full prediction matrix item_features.dot(user_features.T),
        RMSE on the test set — 0 when no test set is given)
    """
    print("Running ALS with {} features, lambda user = {}, lambda item = {}".format(n_f, l_u, l_i))

    num_features = n_f  # K in the lecture notes
    lambda_user = l_u
    lambda_item = l_i
    stop_criterion = 1e-4

    # Fixed seed so that runs are reproducible
    np.random.seed(988)

    # Initialize the two factor matrices
    user_features, item_features = init_MF(train, num_features)

    # Group the non-zero training entries per item (row) and per user (column)
    nz_train, nz_item_userindices, nz_user_itemindices = build_index_groups(train)
    if test is not None:
        nz_test = list(zip(*test.nonzero()))

    rmse = compute_error(train, user_features, item_features, nz_train)
    delta_rmse = np.inf
    it = 0
    # Alternate the two least-squares updates until the RMSE change is below the threshold.
    # NOTE(review): train.nnz (total non-zero count) is passed where the other ALS
    # variants in this file pass per-user/per-item counts — confirm against the
    # update_* helpers' expected signature.
    while np.abs(delta_rmse - rmse) > stop_criterion:
        user_features = update_user_feature(train, item_features, lambda_user, train.nnz, nz_user_itemindices)
        item_features = update_item_feature(train, user_features, lambda_item, train.nnz, nz_item_userindices)
        delta_rmse, rmse = rmse, compute_error(train, user_features, item_features, nz_train)
        it += 1
        if test is not None:
            print("iter: {}, RMSE on training set: {}.".format(it, rmse))
        else:
            print("iter: {}, RMSE: {}.".format(it, rmse))

    # Evaluate on the test set when one is provided
    rmse_test = compute_error(test, user_features, item_features, nz_test) if test is not None else 0

    # Uncomment if logging needed for multiple runs during a long period of time
    # with open('logs/overnight_logging', 'a') as f:
    #     f.write("RMSE on testing set: {}, with k: {}, l_u: {}, l_i {}\n".format(rmse_test, num_features, lambda_user, lambda_item))
    print("RMSE on testing set: {}, with k: {}, l_u: {}, l_i {}".format(rmse_test, num_features, lambda_user, lambda_item))
    return item_features.dot(user_features.T), rmse_test
def ALS(train, test, n_features, lambda_user, lambda_item, verbose=1):
    """Alternating Least Squares (ALS) algorithm.

    Factorizes ``train`` into item and user feature matrices. The learned
    factors are cached on disk under ``ALSdump/`` keyed by the
    hyper-parameters, so a repeated call with identical settings reloads the
    saved factors and only re-reports the errors.

    :param train: sparse training ratings matrix
    :param test: sparse test ratings matrix (used for progress reporting)
    :param n_features: number of latent features
    :param lambda_user: weight of the regularizer for the user features
    :param lambda_item: weight of the regularizer for the item features
    :param verbose: 1 to print the errors at every epoch, 0 to print only the last one
    :return: (user_features, item_features) such that
        np.dot(item_features, user_features) approximates the ratings matrix
    """
    print(
        '\nStarting ALS with n_features = %d, lambda_user = %f, lambda_item = %f'
        % (n_features, lambda_user, lambda_item))
    n_epochs = 20
    user_features_file_path = 'ALSdump/user_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)
    item_features_file_path = 'ALSdump/item_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)

    # Cache hit: reload the saved factors and just report the errors.
    if (os.path.exists(user_features_file_path)
            and os.path.exists(item_features_file_path)):
        user_features = np.load(user_features_file_path)
        item_features = np.load(item_features_file_path)
        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train.nonzero()],
            train[train.nonzero()].toarray()[0])
        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test.nonzero()],
            test[test.nonzero()].toarray()[0])
        print("Train error: %f, test error: %f" % (train_rmse, test_rmse))
        return user_features, item_features

    user_features, item_features = init_MF(train, n_features)

    nz_row, nz_col = test.nonzero()
    nz_test = list(zip(nz_row, nz_col))
    nz_train, nz_row_colindices, nz_col_rowindices = helpers.build_index_groups(
        train)
    # Per-user / per-item non-zero index lists and their counts
    _, nz_user_itemindices = map(list, zip(*nz_col_rowindices))
    nnz_items_per_user = [len(i) for i in nz_user_itemindices]
    _, nz_item_userindices = map(list, zip(*nz_row_colindices))
    nnz_users_per_item = [len(i) for i in nz_item_userindices]

    prev_train_rmse = 100
    for it in range(n_epochs):
        # Alternate: solve for user factors with items fixed, then item
        # factors with users fixed.
        user_features = update_user_feature(train, item_features, lambda_user,
                                            nnz_items_per_user,
                                            nz_user_itemindices)
        item_features = update_item_feature(train, user_features, lambda_item,
                                            nnz_users_per_item,
                                            nz_item_userindices)
        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train.nonzero()],
            train[train.nonzero()].toarray()[0])
        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test.nonzero()],
            test[test.nonzero()].toarray()[0])
        if verbose == 1:
            print("[Epoch %d / %d] train error: %f, test error: %f"
                  % (it + 1, n_epochs, train_rmse, test_rmse))
        # Stop as soon as the training error increases or barely moves.
        if (train_rmse > prev_train_rmse
                or abs(train_rmse - prev_train_rmse) < 1e-5):
            if verbose == 1:
                print('Algorithm has converged!')
            break
        prev_train_rmse = train_rmse
    if verbose == 0:
        print("[Epoch %d / %d] train error: %f, test error: %f"
              % (it + 1, n_epochs, train_rmse, test_rmse))

    # BUG FIX: np.save raises FileNotFoundError if the cache directory is missing.
    os.makedirs(os.path.dirname(user_features_file_path), exist_ok=True)
    np.save(user_features_file_path, user_features)
    np.save(item_features_file_path, item_features)
    return user_features, item_features
def ALS(train, test, lambda_user, lambda_item, num_features):
    """
    Matrix factorization using Alternating Least Squares (ALS).

    :param train: train data matrix of size (num_items, num_users)
    :param test: test data matrix of size (num_items, num_users)
    :param lambda_user: weight of the regularizer for user_features
    :param lambda_item: weight of the regularizer for item_features
    :param num_features: number of features for the factorization, also called k
    :return: user_features, item_features of size (num_features, num_users) and
        (num_features, num_items) respectively. error_table containing the RMSEs
        after every iteration until it converges to the stopping criterion.
        rmse_test that is -1 if there is no test set.
    """
    # Define initial parameters
    stop_criterion = 1e-4
    # error_list holds the previous and the current RMSE; the index `change`
    # alternates between 0 and 1 so the previous value is never overwritten.
    # Seed the "previous" slot with a large value so the loop always runs once.
    change = 1
    error_list = [1000, 0]
    error_table = []

    # Set seed for reproducibility
    np.random.seed(988)

    # Initialize the factorization matrices
    user_features, item_features = init_MF(train, num_features)

    # Calculate arguments for the update of Z (user) and W (item)
    nnz_items_per_user = train.getnnz(axis=0)
    nnz_users_per_item = train.getnnz(axis=1)
    _, nz_row_colindices, nz_col_rowindices = build_index_groups(train)

    # PERF FIX: train is never modified inside the loop, so its non-zero index
    # list is loop-invariant — compute it once instead of on every iteration.
    nz_row, nz_col = train.nonzero()
    nz_train_indices = list(zip(nz_row, nz_col))

    while abs(error_list[0] - error_list[1]) > stop_criterion:
        # Fix W (item), estimate Z (user)
        for i, nz_user_itemindices in nz_col_rowindices:
            user_features[:, i] = update_user_feature(train[:, i], item_features, lambda_user,
                                                      nnz_items_per_user[i], nz_user_itemindices)
        # Fix Z, estimate W
        for j, nz_item_userindices in nz_row_colindices:
            item_features[:, j] = update_item_feature(train[j], user_features, lambda_item,
                                                      nnz_users_per_item[j], nz_item_userindices)

        # Store the RMSE
        error_list[change] = compute_error(train, user_features, item_features, nz_train_indices)
        error_table.append(error_list[change])
        print("RMSE on train data: {}".format(error_list[change]))

        # Flip the index so the next iteration does not overwrite the previous RMSE
        change = 1 - change
    print("Converged\n")

    # Create a list of non zero indices of the test set
    nz_row_te, nz_col_te = test.nonzero()
    nz_test = list(zip(nz_row_te, nz_col_te))

    # Check if the test is non null, otherwise we set its RMSE to -1
    if len(nz_test) == 0:
        rmse_test = -1
    else:
        rmse_test = compute_error(test, user_features, item_features, nz_test)
        print("RMSE on test data: {}.".format(rmse_test))

    return user_features, item_features, error_table, rmse_test