def main(generate=False):
    """Optionally train the Matrix Factorization recommenders, then report results.

    Args:
        generate: When True, train the recommender on the un-obfuscated
            training data and then one recommender per
            ``(k, test_percentage)`` combination taken from
            ``options.k_values`` and ``options.TEST_PERCENTAGES``.
            When False (the default, matching the previous hard-coded
            value) training is skipped entirely.

    ``view_change_in_results()`` is always called at the end.
    """
    # Imported locally, as in the original code.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from attribute_obfuscation import test_NN, load_NN_and_movie_lists
    from attribute_inference_NN import get_NN_model_location

    if generate:
        get_rating_predictor_using_training_data(
            load_dataset(0, 0).MF_training,
            training_enabled=True,
            skip_already_trained=True)

        # For every k value (obfuscation percentage), generate the rmse
        for k in options.k_values:
            # Each NN will recommend different movies, which will affect how
            # the Matrix Factorization is trained
            for test_percentage in options.TEST_PERCENTAGES:
                dataset = load_dataset(test_percentage, k)
                # Only the test split is needed to build the obfuscated matrix.
                (_, _), (test_ratings, test_labels), (_, test_user_ids) = \
                    dataset.get_training_testing_for_NN()
                model, categorical_movies = load_NN_and_movie_lists(
                    get_NN_model_location(test_percentage, k))

                print("Retrieving Obfuscated User Item Matrix to train Matrix Factorization Recommender...")
                # test_NN returns the obfuscated matrix as its last value.
                *_, modified_user_item_matrix = test_NN(
                    model, test_ratings, test_labels, test_user_ids,
                    categorical_movies, test_percentage, k)
                # Drop the reference to the NN before training the recommender.
                del model

                print("\nTraining MF for k: {} and test_percentage: {:.2f}".format(k, test_percentage))
                get_rating_predictor_using_obscured_data(
                    dataset.MF_training,
                    modified_user_item_matrix=modified_user_item_matrix,
                    test_percentage=test_percentage,
                    k_obfuscation=k,
                    training_enabled=True,
                    skip_already_trained=True)

    view_change_in_results()
def view_change_in_results():
    """Compare recommenders trained on obfuscated data against the baseline.

    For every value in ``options.precision_at_k_values`` this evaluates the
    recommender obtained from the un-obfuscated training data (``mf1``) and
    one recommender per ``(k, test_percentage)`` combination (``mf2``) on the
    dataset's ``MF_testing`` split. Each evaluation contributes one row:

        [test_percentage, k, mse, mae, avg_precision, avg_recall, avg_F1]

    The baseline is stored as the first row with test_percentage = k = 0.
    The rows are printed and written with ``np.savetxt`` to one file per
    precision-at-k value inside ``options.MF_results_folder``.
    """
    from attribute_obfuscation import test_NN, load_NN_and_movie_lists
    from attribute_inference_NN import get_NN_model_location

    def _average(values):
        # Plain arithmetic mean; raises ZeroDivisionError on an empty list,
        # exactly like the inline sum()/len() expressions it replaces.
        return sum(values) / len(values)

    # Compute results with differing values of k
    for precision_recall_k in options.precision_at_k_values:
        dataset = load_dataset(0, 0)
        mf1 = get_rating_predictor_using_training_data(
            training_data=dataset.MF_training, training_enabled=False)
        mf1_mse = mf1.mse(dataset.MF_testing)
        mf1_mae = mf1.mae(dataset.MF_testing)
        mf1_precisions, mf1_recalls, mf1_F1_list = mf1.precision_and_recall_at_k(
            dataset.MF_testing, k=precision_recall_k)

        # The first row is the un-obfuscated baseline.
        results = [[0, 0, mf1_mse, mf1_mae,
                    _average(mf1_precisions),
                    _average(mf1_recalls),
                    _average(mf1_F1_list)]]

        # For every k value (obfuscation percentage), generate the rmse
        for k in options.k_values:
            # Each NN will recommend different movies, which will affect how
            # the Matrix Factorization is trained
            for test_percentage in options.TEST_PERCENTAGES:
                dataset = load_dataset(test_percentage, k)
                (_, _), (test_ratings, test_labels), (_, test_user_ids) = \
                    dataset.get_training_testing_for_NN()
                model, categorical_movies = load_NN_and_movie_lists(
                    get_NN_model_location(test_percentage, k))

                print("Retrieving MF for k: {} and test_percentage: {:.2f}".format(k, test_percentage))
                # test_NN returns the obfuscated matrix as its last value.
                *_, modified_user_item_matrix = test_NN(
                    model, test_ratings, test_labels, test_user_ids,
                    categorical_movies, test_percentage, k)
                # Delete the model after you're done with it
                del model

                mf2 = get_rating_predictor_using_obscured_data(
                    dataset.MF_training,
                    modified_user_item_matrix=modified_user_item_matrix,
                    test_percentage=test_percentage,
                    k_obfuscation=k,
                    training_enabled=False)
                mf2_mse = mf2.mse(dataset.MF_testing)
                mf2_mae = mf2.mae(dataset.MF_testing)
                mf2_precisions, mf2_recalls, mf2_F1_list = mf2.precision_and_recall_at_k(
                    dataset.MF_testing, k=precision_recall_k)

                print("MF2 RMSE - MF1 RMSE = {}".format(mf2_mse - mf1_mse))
                print("MF2 MAE - MF1 MAE = {}".format(mf2_mae - mf1_mae))
                results.append([test_percentage, k, mf2_mse, mf2_mae,
                                _average(mf2_precisions),
                                _average(mf2_recalls),
                                _average(mf2_F1_list)])

        # Convert once, after all rows are collected. This also keeps
        # np_results defined when the option lists are empty.
        np_results = np.array(results)
        print(np_results)

        save_location = options.MF_results_folder + "/matrix_factorization_recommender_results_precision{}k.out".format(precision_recall_k)
        # Write first, so the confirmation below is only shown on success.
        np.savetxt(save_location, np_results)
        print("Saved at Matrix Factorization Factorization Results at " + save_location)
        print("Data is to be read as:")
        print(['test_percentage', 'k (obfuscation percent)', 'mf2_mse', 'mf2_mae', 'mf2_avg_precision', 'mf2_avg_recall', 'mf2_avg_F1'])
def test_NN_with_user(model, user_vector, user_attribute, categorical_movies,
                      chosen_k, user_id=1):
    '''
    Obfuscate a single user's rating vector and print the model's
    prediction before and after.

    model: object whose predict() is called on the original and on the
        obfuscated vector
    user_vector: the user's ratings indexed by movie; entries > 0 count as
        rated
    user_attribute: the user's known class index; movies are drawn from a
        different class
    categorical_movies: sequence indexed by class index, one movie list per
        class
    chosen_k: The percentage of movies to add (obfuscation)
    user_id: id passed to the rating predictor when rating the selected
        movies (defaults to 1, the value that used to be hard-coded)
    '''
    dataset = load_dataset(0, chosen_k)
    mf = get_rating_predictor_using_training_data(dataset.MF_training)

    #####################################################################
    # Test the select_movies implemented function above on a single user
    #####################################################################
    # One choice per class, as in test_NN (was hard-coded to 3).
    choices = set(range(options.NUM_CLASSES))
    choices.remove(user_attribute)
    # Use anything besides what the user is defined as.
    # random.choice returns the class index itself; the previous trailing
    # "[0]" indexed that int and raised TypeError.
    movies_list = categorical_movies[random.choice(list(choices))]

    # Retrieve set of movies rated by user
    user_movies = {
        movie_index for movie_index, rating in enumerate(user_vector)
        if rating > 0
    }

    new_user_movies = select_movies(user_movies=user_movies,
                                    k=chosen_k,
                                    movies_list=movies_list,
                                    strategy=options.chosen_strategy)

    print("Known User Attribute: ", user_attribute)
    print("Original User Movie Length: ", len(user_movies))
    print("New User Movie Length: ", len(new_user_movies))
    print("[Added {} movies]".format(len(new_user_movies) - len(user_movies)))
    print("Original Predicted User Attribute: ", model.predict([[user_vector]]))

    # Every selected movie gets the recommender's rating; the rest stay 0.
    user_vector_new = [0] * options.NUM_ITEMS
    for movie_index in new_user_movies:
        user_vector_new[movie_index] = mf.get_rating(user_id, movie_index)

    print("New Predicted User Attribute: ", model.predict([[user_vector_new]]))
    print("\n", "#" * 100, "\n")
import options
from data_train_test import load_dataset
import tensorflow as tf
import numpy as np

###############################################
# Compile many models, each using their own test percentage
###############################################
if __name__ == "__main__":
    non_obfuscated_baseline_accuracies = []
    obfuscated_baseline_accuracies = []

    for test_percentage in options.TEST_PERCENTAGES:
        # Dataset for this test percentage with k = 0.
        dataset = load_dataset(test_percentage, 0)
        nn_splits = dataset.get_training_testing_for_NN()
        ((train_ratings, train_labels),
         (test_ratings, test_labels),
         (train_user_ids, test_user_ids)) = nn_splits

        # Step 1: the majority class is the most frequent training label.
        print("Train Labels:", train_labels)
        counts = np.bincount(train_labels)
        majority_attribute = np.argmax(counts)
        print("Majority Attribute:", majority_attribute)

        # Label distribution over dataset.user_genders, for comparison.
        count2 = np.bincount(dataset.user_genders)
        print("Distribution in Entire Dataset: ", count2)

        # Step 2: counters used to keep track of accuracy.
        correct_prediction_count = 0
        total_users = len(test_labels)
        print("Test Label Size: ", total_users)
def main():
    """Evaluate every saved inference NN with and without obfuscation.

    For each ``k`` in ``options.k_values`` the saved model of every
    ``options.TEST_PERCENTAGES`` entry is loaded and passed to ``test_NN``.
    The metrics are stacked into a 9-row array (row meanings are printed
    alongside it) and written with ``np.savetxt`` to one file per ``k``
    inside ``options.results_folder``.
    """
    # Results across all NNs
    for k in options.k_values:
        # The unused load_dataset(0, k) that stood here was removed: its
        # result was overwritten before being read.
        non_obfuscated_losses = []
        non_obfuscated_accuracies = []
        obfuscated_losses = []
        obfuscated_accuracies = []
        non_obfuscated_auc_micros = []
        non_obfuscated_auc_macros = []
        obfuscated_auc_micros = []
        obfuscated_auc_macros = []

        for test_percentage in options.TEST_PERCENTAGES:
            # Load model for every dataset
            dataset = load_dataset(test_percentage, k)
            # Only the test split is evaluated here.
            (_, _), (test_ratings, test_labels), (_, test_user_ids) = \
                dataset.get_training_testing_for_NN()

            saved_model_location = get_NN_model_location(test_percentage, k)
            model, categorical_movies = load_NN_and_movie_lists(
                saved_model_location)

            # The ninth value (modified user-item matrix) is not needed here.
            (non_obfuscated_loss, non_obfuscated_acc,
             obfuscated_loss, obfuscated_acc,
             non_obfuscated_auc_micro, non_obfuscated_auc_macro,
             obfuscated_auc_micro, obfuscated_auc_macro,
             _) = test_NN(model, test_ratings, test_labels, test_user_ids,
                          categorical_movies, test_percentage, k)

            # Release the model and reset Keras state before the next load.
            del model, categorical_movies
            tf.keras.backend.clear_session()

            non_obfuscated_losses.append(non_obfuscated_loss)
            non_obfuscated_accuracies.append(non_obfuscated_acc)
            obfuscated_losses.append(obfuscated_loss)
            obfuscated_accuracies.append(obfuscated_acc)
            non_obfuscated_auc_micros.append(non_obfuscated_auc_micro)
            non_obfuscated_auc_macros.append(non_obfuscated_auc_macro)
            obfuscated_auc_micros.append(obfuscated_auc_micro)
            obfuscated_auc_macros.append(obfuscated_auc_macro)

        # Evaluated on test set without obfuscation and with k% obfuscation
        results = np.array([
            options.TEST_PERCENTAGES,
            non_obfuscated_losses,
            obfuscated_losses,
            non_obfuscated_accuracies,
            obfuscated_accuracies,
            non_obfuscated_auc_micros,
            non_obfuscated_auc_macros,
            obfuscated_auc_micros,
            obfuscated_auc_macros,
        ])

        print('-' * 100)
        print("[First row is test percentages]")
        print("[Second row is non_obfuscated_losses]")
        print("[Third row is obfuscated_losses]")
        print("[Fourth row is non_obfuscated_accuracies]")
        print("[Fifth row is obfuscated_accuracies]")
        print("[Sixth row is non_obfuscated_auc_micros]")
        print("[Seventh row is non_obfuscated_auc_macros]")
        print("[Eigth row is obfuscated_auc_micros]")
        print("[Ninth row is obfuscated_auc_macros]")
        print(results)

        # The file name carries a '%' when k < 1 and omits it otherwise.
        if k < 1:
            save_location = options.results_folder + "/{}_inference_NN_{:.2f}%k_obfuscation.out".format(
                options.inference_target, k)
        else:
            save_location = options.results_folder + "/{}_inference_NN_{:.2f}k_obfuscation.out".format(
                options.inference_target, k)

        # Write first, so the confirmation is only printed on success.
        np.savetxt(save_location, results)
        print("Saved at " + save_location)
def test_NN(model, test_ratings, test_labels, test_user_ids,
            categorical_movies, test_percentage, chosen_k):
    '''
    Evaluate on test set without obfuscation and with obfuscation

    model: compiled model; predict() and evaluate() are called on it
    test_ratings: one rating vector per test user (entries > 0 are rated)
    test_labels: class index of each test user, aligned with test_ratings
    test_user_ids: user id of each test user, aligned with test_ratings
    categorical_movies: sequence indexed by class index, one movie list per
        class
    test_percentage: forwarded to auc() together with chosen_k
    chosen_k: The percentage of movies to add (obfuscation)

    Returns a 9-tuple:
        (non_obfuscated_loss, non_obfuscated_acc,
         obfuscated_loss, obfuscated_acc,
         non_obfuscated_auc_micro, non_obfuscated_auc_macro,
         obfuscated_auc_micro, obfuscated_auc_macro,
         modified_user_item_matrix)
    where modified_user_item_matrix is a list of
    (user_id, obfuscated rating vector) pairs.

    Raises ValueError when options.average_or_predicted_ratings is neither
    'average' nor 'predicted'.
    '''
    rating_mode = options.average_or_predicted_ratings
    if rating_mode not in ('average', 'predicted'):
        # Previously an unknown mode silently left every obfuscated vector
        # all zeros, which invalidated the results without any warning.
        raise ValueError(
            "options.average_or_predicted_ratings must be 'average' or "
            "'predicted', got {!r}".format(rating_mode))

    ################################################################
    # Use the select_movies implemented function above on all users
    ################################################################
    success_original = 0
    success_obfuscated = 0
    total_num_users = 0
    modified_user_item_matrix = []  # Includes user ids
    dataset = load_dataset(0, chosen_k)
    test_ratings_obfuscated = []  # Does not include user ids
    mf = get_rating_predictor_using_training_data(dataset.MF_training)

    for user_index, user_vector in enumerate(test_ratings):
        user_id = test_user_ids[user_index]
        total_num_users += 1
        # 0 is male, 1 is female in the case of 'gender'
        user_attribute = test_labels[user_index]

        # Candidate classes: every class except the user's own and except
        # classes that have no movies to offer.
        choices = set(range(options.NUM_CLASSES))
        choices.remove(user_attribute)
        for i, v in enumerate(categorical_movies):
            if len(v) == 0 and i != user_attribute:
                # discard, not remove: an index outside NUM_CLASSES or an
                # already-removed one must not raise KeyError.
                choices.discard(i)
        # Use anything besides what the user is defined as
        movies_list = categorical_movies[random.choice(list(choices))]

        # Retrieve set of movies rated by user
        user_movies = {
            movie_index for movie_index, rating in enumerate(user_vector)
            if rating > 0
        }

        new_user_movies = select_movies(user_movies=user_movies,
                                        k=chosen_k,
                                        movies_list=movies_list,
                                        strategy=options.chosen_strategy)

        # NOTE(review): this per-user predict duplicates the batched
        # evaluate()/predict() calls further down and is likely the slow
        # part of this function; kept so the manual accuracy is unchanged.
        old_prediction = model.predict([[user_vector]])
        if user_attribute == np.argmax(old_prediction[0]):
            success_original += 1

        # Assign movie rating to added movie (average or predicted)
        new_user_vector = [0] * options.NUM_ITEMS
        for movie_index in new_user_movies:
            if rating_mode == 'average':
                new_user_vector[movie_index] = dataset.user_item_averages[
                    movie_index]
            else:  # 'predicted'
                new_user_vector[movie_index] = mf.get_rating(
                    user_id, movie_index)

        test_ratings_obfuscated.append(new_user_vector)
        modified_user_item_matrix.append((user_id, new_user_vector))

        new_prediction = model.predict([[new_user_vector]])
        if user_attribute == np.argmax(new_prediction[0]):
            success_obfuscated += 1

    print("\n", "#" * 100, "\n")

    non_obfuscated_result = success_original / total_num_users
    print("NN job Inference Accuracy of test set before obfuscation:",
          non_obfuscated_result)

    # evaluate() is a faster way of performing some of the above loop.
    non_obfuscated_loss, non_obfuscated_acc = model.evaluate(
        test_ratings, test_labels)
    print("Non-Obfuscated Accuracy (using evaluate function):",
          non_obfuscated_acc)
    print("Non-Obfuscated Loss (using evaluate function):",
          non_obfuscated_loss)

    obfuscated_result = success_obfuscated / total_num_users
    print("NN job Inference Accuracy of test set after obfuscation:",
          obfuscated_result)

    print("\n", "#" * 100, "\n")

    # Convert once; the array feeds both evaluate() and predict() below.
    obfuscated_ratings_array = np.array(test_ratings_obfuscated)

    obfuscated_loss, obfuscated_acc = model.evaluate(
        obfuscated_ratings_array, test_labels)
    print("Obfuscated Accuracy (using evaluate function):", obfuscated_acc)
    print("Obfuscated Loss (using evaluate function):", obfuscated_loss)

    non_obfuscated_predicted_attributes = model.predict(test_ratings)
    non_obfuscated_auc_micro, non_obfuscated_auc_macro = auc(
        test_labels, non_obfuscated_predicted_attributes, test_percentage,
        chosen_k)

    obfuscated_predicted_attributes = model.predict(obfuscated_ratings_array)
    obfuscated_auc_micro, obfuscated_auc_macro = auc(
        test_labels, obfuscated_predicted_attributes, test_percentage,
        chosen_k)

    return (non_obfuscated_loss, non_obfuscated_acc,
            obfuscated_loss, obfuscated_acc,
            non_obfuscated_auc_micro, non_obfuscated_auc_macro,
            obfuscated_auc_micro, obfuscated_auc_macro,
            modified_user_item_matrix)
def main():
    """Train, evaluate and save one inference NN per (k, test_percentage).

    For every combination of ``options.k_values`` and
    ``options.TEST_PERCENTAGES`` a model from ``build_model()`` is compiled,
    evaluated on the test split before training, fitted with early stopping
    on ``val_loss``, evaluated again, saved to the path given by
    ``get_NN_model_location`` and finally re-loaded to check the saved file.
    """
    for k in options.k_values:
        for test_percentage in options.TEST_PERCENTAGES:
            (train_ratings, train_labels), (test_ratings, test_labels), (_, _) = \
                load_dataset(test_percentage, k).get_training_testing_for_NN()

            model = build_model()

            ###############################################
            # Compile the model (optimizer, loss, and metrics)
            ###############################################
            model.compile(optimizer=tf.train.AdamOptimizer(),
                          loss='sparse_categorical_crossentropy',
                          metrics=['accuracy'])

            ###############################################
            # Train the model
            ###############################################
            # Feed the model the training data, with the associated training labels
            print("Training Shape:", train_ratings.shape)
            print("Testing Shape:", test_ratings.shape)
            print("Number of train labels:", len(train_labels))
            print("Number of test labels:", len(test_labels))
            print("First Test Rating: ", test_ratings[0])
            print("First test label:", test_labels[0])
            print("Second Test Rating: ", test_ratings[1])
            print("Second test label:", test_labels[1])

            # Observe the BASELINE
            results = model.evaluate(test_ratings, test_labels)
            print('-'*100)
            print("{} Attribute inference results prior to training (Test Data)\nLoss: {}\nAccuracy: {}".format(
                options.inference_target, results[0], results[1]))
            print('-'*100)

            # Stop once val_loss has not improved for `patience` epochs.
            # NOTE(review): validation_data below is the test split, so early
            # stopping is tuned on the data later reported -- confirm intent.
            early_stopping_callback = keras.callbacks.EarlyStopping(
                monitor='val_loss',
                min_delta=0,
                patience=10,  # Number of epochs with no improvement
                verbose=1)
            callback_list = [early_stopping_callback]

            # Then TRAIN
            model.fit(train_ratings, train_labels,
                      epochs=options.EPOCHS,
                      batch_size=options.TRAINING_BATCH_SIZE,
                      validation_data=(test_ratings, test_labels),
                      verbose=1,
                      callbacks=callback_list)

            # Then Observe if there was an improvement
            results = model.evaluate(test_ratings, test_labels)
            print('-'*100)
            print("{} Attribute inference results after training (Test Data)\nLoss: {}\nAccuracy: {}".format(
                options.inference_target, results[0], results[1]))
            print('-'*100)

            ###############################################
            # Save the model
            ###############################################
            model_save_path = get_NN_model_location(test_percentage, k)

            # Save entire model to a HDF5 file
            # exist_ok avoids the check-then-create race of the old
            # os.path.exists() guard.
            os.makedirs(options.model_folder, exist_ok=True)
            model.save(model_save_path)

            ###############################################
            # Load the model (Just to make sure it saved correctly)
            ###############################################
            new_model = keras.models.load_model(model_save_path)
            # Re-compiled after loading, as in the original code.
            new_model.compile(optimizer=tf.train.AdamOptimizer(),
                              loss='sparse_categorical_crossentropy',
                              metrics=['accuracy'])
            loss, acc = new_model.evaluate(test_ratings, test_labels)
            print("Restored model accuracy: {:5.2f}%".format(100*acc))

            # Release both models and reset Keras state so memory does not
            # pile up across the (k, test_percentage) grid; this mirrors what
            # the evaluation script does after each model.
            del model, new_model
            keras.backend.clear_session()