def Similarity(df_train, df_test, df_test_rec_items, grid_area_dict, pp):
    """Train the check-in models and build similarity-based obfuscations.

    Args:
        df_train: frame handed to ``funcs.update_grid_group`` and then to the
            random-forest and XGBoost trainers.
        df_test: frame that is obfuscated; all columns except the last four
            feed the cosine-similarity matrix.
        df_test_rec_items: frame passed through ``funcs.update_grid_group``;
            the result is not used further inside this function.
        grid_area_dict: mapping from a grid to its area (grid group).
        pp: forwarded unchanged to ``funcs.get_similarity_obf_X``.

    Returns:
        ``(X_obf_dict, X_ori, model_rf, model_xgb)`` where ``X_obf_dict`` maps
        the run index 0..49 to one obfuscated result each.
    """
    n_runs = 50

    df_train = funcs.update_grid_group(df_train, grid_area_dict)
    model_rf = funcs.train_rf_model_check_in(df_train)
    model_xgb = funcs.train_xgb_model_check_in(df_train)
    print("model training over...")

    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    print("start obfuscating...")
    # Similarity is computed row against row over the item columns only.
    item_values = df_test[df_test.columns[:-4]].values
    sim_mat = cosine_similarity(item_values)

    X_obf_dict = {
        run: funcs.get_similarity_obf_X(sim_mat, df_test, pp)[0]
        for run in range(n_runs)
    }
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_similarity_obf_X(sim_mat, df_test, pp)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori, model_rf, model_xgb
def Random(df_train, df_test, grid_area_dict, df_test_rec_items, p_rand, pp):
    """Train the check-in models and build random obfuscations of ``df_test``.

    Args:
        df_train: frame handed to ``funcs.update_grid_group`` and then to the
            random-forest and XGBoost trainers.
        df_test: frame passed to ``funcs.get_random_obf_X``.
        grid_area_dict: mapping from a grid to its area (grid group).
        df_test_rec_items: frame passed through ``funcs.update_grid_group``;
            the result is not used further inside this function.
        p_rand: forwarded unchanged to ``funcs.get_random_obf_X``.
        pp: forwarded unchanged to ``funcs.get_random_obf_X``.

    Returns:
        ``(X_obf_dict, X_ori, model_rf, model_xgb)`` where ``X_obf_dict`` maps
        the run index 0..49 to one obfuscated result each.
    """
    n_runs = 50

    df_train = funcs.update_grid_group(df_train, grid_area_dict)
    model_rf = funcs.train_rf_model_check_in(df_train)
    model_xgb = funcs.train_xgb_model_check_in(df_train)
    print("model training over...")

    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    print("start obfuscating...")
    X_obf_dict = {
        run: funcs.get_random_obf_X(df_test, p_rand, pp)[0]
        for run in range(n_runs)
    }
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_random_obf_X(df_test, p_rand, pp)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori, model_rf, model_xgb
def differential_privacy(df_train, df_test, grid_area_dict, df_test_rec_items,
                         beta):
    """Train the check-in models and build distance-based (DP) obfuscations.

    Args:
        df_train: frame handed to ``funcs.update_grid_group`` and then to the
            random-forest and XGBoost trainers.
        df_test: frame that is obfuscated; all columns except the last four
            feed the pairwise Euclidean distance matrix.
        grid_area_dict: mapping from a grid to its area (grid group).
        df_test_rec_items: frame passed through ``funcs.update_grid_group``;
            the result is not used further inside this function.
        beta: forwarded unchanged to ``funcs.get_DP_obf_X``.

    Returns:
        ``(X_obf_dict, X_ori, model_rf, model_xgb)`` where ``X_obf_dict`` maps
        the run index 0..49 to one obfuscated result each.
    """
    n_runs = 50

    df_train = funcs.update_grid_group(df_train, grid_area_dict)
    model_rf = funcs.train_rf_model_check_in(df_train)
    model_xgb = funcs.train_xgb_model_check_in(df_train)
    print("model training over...")

    # Square pairwise distance matrix, then each row scaled by its maximum.
    condensed = dist.pdist(df_test.values[:, :-4], 'euclidean')
    dist_mat = normalize(dist.squareform(condensed), axis=1, norm='max')

    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    print("start obfuscating...")
    X_obf_dict = {
        run: funcs.get_DP_obf_X(df_test, dist_mat, beta)[0]
        for run in range(n_runs)
    }
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_DP_obf_X(df_test, dist_mat, beta)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori, model_rf, model_xgb
def PrivCheck(df_train, df_test, df_test_rec_items, grid_area_dict,
              area_grid_dict, grid_list, cluster_num, grid_area_number,
              deltaX, pp):
    """Train the check-in models and obfuscate ``df_test`` with PrivCheck.

    The function writes two intermediate files under ``tmp/`` (which must
    already exist), runs the MATLAB ``PrivCheck`` routine from
    ``../../matlab/checkin_tradeoff_scenario_I`` and uses the matrix it
    returns to produce 50 obfuscated copies of ``df_test``.

    Args:
        df_train: frame handed to ``funcs.update_grid_group`` and then to the
            random-forest and XGBoost trainers.
        df_test: frame that is obfuscated.
        df_test_rec_items: frame passed through ``funcs.update_grid_group``;
            the result is not used further inside this function.
        grid_area_dict: mapping from a grid to its area (grid group).
        area_grid_dict: kept for interface compatibility; not read here.
        grid_list: forwarded to ``funcs.cal_pgy_withoutGridGroup``.
        cluster_num: number of clusters; sizes the JSD tensor.
        grid_area_number: number of grid groups iterated over.
        deltaX: forwarded to the MATLAB ``PrivCheck`` function.
        pp: forwarded unchanged to ``funcs.get_obf_X``.

    Returns:
        ``(X_obf_dict, X_ori, model_rf, model_xgb)`` where ``X_obf_dict`` maps
        the run index 0..49 to one obfuscated result each.
    """
    n_runs = 50

    df_train = funcs.update_grid_group(df_train, grid_area_dict)
    model_rf = funcs.train_rf_model_check_in(df_train)
    model_xgb = funcs.train_xgb_model_check_in(df_train)

    # pgy is computed before grid groups are attached to df_test.
    pgy = funcs.cal_pgy_withoutGridGroup(df_test, cluster_num, grid_list)
    pd.DataFrame(pgy).to_csv('tmp/pgy_check_in_privcheck.csv',
                             index=False, header=None)

    df_test = funcs.update_grid_group(df_test, grid_area_dict)

    # One cluster_num x cluster_num JSD matrix per grid group.
    JSD_Mat_dict = np.zeros((cluster_num, cluster_num, grid_area_number))
    for gg in range(grid_area_number):
        df_test_gg = df_test.loc[df_test['grid_group'] == gg]
        JSD_Mat_dict[:, :, gg] = funcs.cal_JSD_Matrix_withoutGridGroup(
            df_test_gg, cluster_num, 4)

    scipy.io.savemat('tmp/JSDM_girdGroup_privcheck.mat',
                     {"JSD_Mat_input_Yang_trueTrain": JSD_Mat_dict})

    eng = matlab.engine.start_matlab()
    try:
        eng.edit("../../matlab/checkin_tradeoff_scenario_I/PrivCheck",
                 nargout=0)
        eng.cd('../../matlab/checkin_tradeoff_scenario_I', nargout=0)
        # With nargout=2 the engine returns a 2-tuple.  Unpack it directly:
        # wrapping a (matrix, scalar) pair in np.array builds a ragged array,
        # which NumPy >= 1.24 rejects with ValueError.
        xpgg, _distortion_budget = eng.PrivCheck(deltaX, nargout=2)
        xpgg = np.array(xpgg)
    finally:
        # Always shut the MATLAB process down, even if the call failed.
        eng.quit()

    # The grid group column is reset to zero before obfuscation.
    df_test['grid_group'] = pd.Series(np.zeros(df_test.shape[0]),
                                      index=df_test.index, dtype='int32')

    X_obf_dict = {}
    for run in range(n_runs):
        X_obf_dict[run], _ = funcs.get_obf_X(df_test, xpgg, pp)
    # One further call is made only for its second output, the originals.
    _, X_ori = funcs.get_obf_X(df_test, xpgg, pp)

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)
    return X_obf_dict, X_ori, model_rf, model_xgb
def Similarity(df_train, grid_area_dict, pp):
    """Build 25 similarity-based obfuscations of ``df_train``.

    Args:
        df_train: frame that is labelled with grid groups and obfuscated; all
            columns except the last four feed the cosine-similarity matrix.
        grid_area_dict: mapping from a grid to its area (grid group).
        pp: forwarded unchanged to
            ``funcs.get_similarity_obf_X_withAgeGroup``.

    Returns:
        ``(X_obf_dict, X_ori)`` where ``X_obf_dict`` maps the run index 0..24
        to one obfuscated result each.
    """
    n_runs = 25

    print("start obfuscating...")
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    # Similarity is computed row against row over the item columns only.
    item_values = df_train[df_train.columns[:-4]].values
    sim_mat = cosine_similarity(item_values)

    X_obf_dict = {}
    for run in range(n_runs):
        print("obfuscation {}".format(run))
        obfuscated, _ = funcs.get_similarity_obf_X_withAgeGroup(
            sim_mat, df_train, pp)
        X_obf_dict[run] = obfuscated
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_similarity_obf_X_withAgeGroup(sim_mat, df_train, pp)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori
def differential_privacy(df_train, grid_area_dict, grid_area_number, beta):
    """Build 25 distance-based (DP) obfuscations of ``df_train``.

    A separate row-normalised Euclidean distance matrix is computed for the
    rows of every grid group ``0 .. grid_area_number - 1``.

    Args:
        df_train: frame that is labelled with grid groups and obfuscated; all
            columns except the last four feed the distance matrices.
        grid_area_dict: mapping from a grid to its area (grid group).
        grid_area_number: number of grid groups.
        beta: forwarded unchanged to ``funcs.get_DP_obf_X_withAgeGroup``.

    Returns:
        ``(X_obf_dict, X_ori)`` where ``X_obf_dict`` maps the run index 0..24
        to one obfuscated result each.
    """
    n_runs = 25

    print("start obfuscating...")
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    dist_mat_dict = {}
    for group in range(grid_area_number):
        group_values = df_train[df_train['grid_group'] == group].values
        pairwise = dist.squareform(
            dist.pdist(group_values[:, :-4], 'euclidean'))
        # Each row is scaled by its own maximum.
        dist_mat_dict[group] = normalize(pairwise, axis=1, norm='max')

    X_obf_dict = {
        run: funcs.get_DP_obf_X_withAgeGroup(df_train, dist_mat_dict, beta)[0]
        for run in range(n_runs)
    }
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_DP_obf_X_withAgeGroup(df_train, dist_mat_dict, beta)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori
def PrivCheck(df_train, cluster_num, grid_list, grid_area_dict,
              grid_area_number, area_grid_dict, deltaX, pp):
    """Obfuscate ``df_train`` 25 times with the PrivCheck mechanism.

    The function writes two intermediate files under ``tmp/`` (which must
    already exist), runs the MATLAB ``PrivCheck`` routine from
    ``../../matlab/checkin_clusternum_scenario_II`` and uses the matrix it
    returns to produce the obfuscated copies.

    Args:
        df_train: frame that is obfuscated.
        cluster_num: number of clusters; sizes the JSD tensor.
        grid_list: forwarded to ``funcs.cal_pgy_withoutGridGroup``.
        grid_area_dict: mapping from a grid to its area (grid group).
        grid_area_number: number of grid groups iterated over.
        area_grid_dict: kept for interface compatibility; not read here.
        deltaX: forwarded to the MATLAB ``PrivCheck`` function.
        pp: forwarded unchanged to ``funcs.get_obf_X_withAgeGroup``.

    Returns:
        ``(X_obf_dict, X_ori)`` where ``X_obf_dict`` maps the run index 0..24
        to one obfuscated result each.
    """
    n_runs = 25

    # pgy is computed before grid groups are attached to df_train.
    pgy = funcs.cal_pgy_withoutGridGroup(df_train, cluster_num, grid_list)
    pd.DataFrame(pgy).to_csv('tmp/pgy_check_in_privcheck.csv',
                             index=False, header=None)

    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    # One cluster_num x cluster_num JSD matrix per grid group.
    JSD_Mat_dict = np.zeros((cluster_num, cluster_num, grid_area_number))
    for gg in range(grid_area_number):
        df_train_gg = df_train.loc[df_train['grid_group'] == gg]
        JSD_Mat_dict[:, :, gg] = funcs.cal_JSD_Matrix_withoutGridGroup(
            df_train_gg, cluster_num, 4)

    scipy.io.savemat('tmp/JSDM_girdGroup_privcheck.mat',
                     {"JSD_Mat_input_Yang_allObf": JSD_Mat_dict})

    eng = matlab.engine.start_matlab()
    try:
        eng.edit("../../matlab/checkin_clusternum_scenario_II/PrivCheck",
                 nargout=0)
        eng.cd('../../matlab/checkin_clusternum_scenario_II', nargout=0)
        # With nargout=2 the engine returns a 2-tuple.  Unpack it directly:
        # wrapping a (matrix, scalar) pair in np.array builds a ragged array,
        # which NumPy >= 1.24 rejects with ValueError.
        xpgg, _distortion_budget = eng.PrivCheck(deltaX, nargout=2)
        xpgg = np.array(xpgg)
    finally:
        # Always shut the MATLAB process down, even if the call failed.
        eng.quit()

    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {}
    for run in range(n_runs):
        X_obf_dict[run], _ = funcs.get_obf_X_withAgeGroup(df_train, xpgg, pp)
    # One further call is made only for its second output, the originals.
    _, X_ori = funcs.get_obf_X_withAgeGroup(df_train, xpgg, pp)

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori
def Random(df_train, grid_area_dict, p_rand):
    """Build 25 random obfuscations of ``df_train``.

    Args:
        df_train: frame that is labelled with grid groups and obfuscated.
        grid_area_dict: mapping from a grid to its area (grid group).
        p_rand: forwarded unchanged to
            ``funcs.get_random_obf_X_withAgeGroup``.

    Returns:
        ``(X_obf_dict, X_ori)`` where ``X_obf_dict`` maps the run index 0..24
        to one obfuscated result each.
    """
    n_runs = 25

    print("start obfuscating...")
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {
        run: funcs.get_random_obf_X_withAgeGroup(df_train, p_rand)[0]
        for run in range(n_runs)
    }
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_random_obf_X_withAgeGroup(df_train, p_rand)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori
def Frapp(df_train, grid_area_dict, gamma):
    """Build 25 FRAPP obfuscations of ``df_train``.

    Args:
        df_train: frame that is labelled with grid groups and obfuscated.
        grid_area_dict: mapping from a grid to its area (grid group).
        gamma: forwarded unchanged to ``funcs.get_frapp_obf_X_withAgeGroup``.

    Returns:
        ``(X_obf_dict, X_ori)`` where ``X_obf_dict`` maps the run index 0..24
        to one obfuscated result each.
    """
    n_runs = 25

    print("start obfuscating...")
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {
        run: funcs.get_frapp_obf_X_withAgeGroup(df_train, gamma)[0]
        for run in range(n_runs)
    }
    # One further call is made only for its second output, the originals.
    X_ori = funcs.get_frapp_obf_X_withAgeGroup(df_train, gamma)[1]
    print("obfuscating done.")

    # Slot -2 of an original record is read as the grid; its area is written
    # to slot -3 of that record and to slot -1 of every obfuscated record.
    for uid, record in X_ori.items():
        area = grid_area_dict[record[-2]]
        record[-3] = area
        for run in range(n_runs):
            X_obf_dict[run][uid][-1] = area

    return X_obf_dict, X_ori
pass else: os.makedirs('tmp') df = read_data() grid_list = list(set(df['grid'].values)) grid_list.sort() grid_colrow, grid_rowcol = get_grid_loc(grid_list) print("initiate area...") area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate( grid_rowcol, grid_area_number) df['grid_group'] = pd.Series(np.zeros(df.shape[0]), index=df.index, dtype='int32') if method in ['HyObscure', 'YGen', 'XObf']: df_grid_group = funcs.update_grid_group(df, grid_area_dict) else: df_grid_group = df cols = list(df.columns.values) cols_change = cols[:-3] cols_change.extend(['grid_group', 'grid', 'uid']) df_item_gridGroup_uid = df_grid_group[cols_change] df_cluster = funcs.hierarchical_clustering(df_item_gridGroup_uid, cluster_num, -3, 'cosine', 'complete') drop_idx = df_cluster.loc[df_cluster['cluster']==5].index df_cluster.drop(drop_idx, inplace=True) drop_idx = [] for i in df_cluster.grid.value_counts().index: if df_cluster.grid.value_counts()[i] < 3: drop_idx.extend(df_cluster.loc[df_cluster['grid']==i].index) df_cluster.drop(drop_idx, inplace=True)
df = read_data() grid_list = list(set(df['grid'].values)) grid_list.sort() grid_colrow, grid_rowcol = get_grid_loc(grid_list) print("initiate area...") area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate( grid_rowcol, grid_area_number) df['grid_group'] = pd.Series(np.zeros(df.shape[0]), index=df.index, dtype='int32') if method in ['HyObscure', 'YGen', 'XObf']: df_grid_group = funcs.update_grid_group(df, grid_area_dict) else: df_grid_group = df cols = list(df.columns.values) cols_change = cols[:-3] cols_change.extend(['grid_group', 'grid', 'uid']) df_item_gridGroup_uid = df_grid_group[cols_change] for cluster_num in cluster_num_list: df_cluster = funcs.hierarchical_clustering(df_item_gridGroup_uid, cluster_num, -3, 'cosine', 'complete') results = {} results['ori_acc_rf'] = []
# HyObscure: iteratively regroup grids into areas and obfuscate df_train.
#
# NOTE(review): this definition has lost its line breaks and indentation, so
# nesting below is described only as far as the statements themselves fix it.
#
# Outline of what the statements do, in order:
#   1. Deep-copy df_train, zero the copy's 'grid_group' column and build X_ori,
#      a dict keyed by 'uid' whose value is the first matching row without its
#      last column. Slot -2 of each record is read as the grid and slot -3 is
#      overwritten with grid_area_dict[grid].
#      Assumes df_train's index labels are 0..n-1, because rows are fetched
#      with df_train_copy['uid'][k] -- TODO confirm against the caller.
#   2. Print funcs.k_anonymity(...) for every area in area_grid_dict.
#   3. Initialise xpgg and pgy to 1e-8 and JSD_Mat to ones, with sizes derived
#      from cluster_num, grid_area_number and len(grid_list).
#   4. For op in range(0, 6):
#        - refresh JSD_Mat / pgy through funcs.get_JSD_PGY;
#        - for each grid group gg, start a MATLAB engine, cd into
#          ../../matlab/checkin_clusternum_scenario_II, call
#          HyObscure(deltaX, gg) and copy the returned cluster_num x
#          cluster_num block into xpgg at rows/cols gg + k * grid_area_number;
#        - record funcs.Mean_JSD / funcs.Mean_KL_div as the values to beat;
#        - visit the areas in shuffled order; when l_range > 0, draw a
#          direction d (left 0, right 1, up 2, down 3) weighted by
#          area_reducibility[area_code], and try moving grids from the edge of
#          a line into the neighbouring area (area_code +/- 1, or +/-
#          int(grid_area_number / int(np.sqrt(grid_area_number)))). A move is
#          considered only if funcs.k_anonymity of the shrunken area is
#          >= k_threshold, and is adopted only if BOTH the new mean privacy
#          and the new mean utility are strictly smaller than the current
#          minima.
#   5. Produce 25 obfuscations with funcs.get_obf_X(df_train, xpgg, pp).
#
# Returns (X_obf_dict, X_ori).
#
# NOTE(review): a new MATLAB engine is started for every gg in every op and no
# quit() call is visible here -- check whether engines accumulate.
# NOTE(review): grid_linelist and min_distortion_budget are assigned but never
# read in the visible code.
def HyObscure(df_train, grid_area_dict, area_grid_dict, cluster_num, grid_area_number, grid_list, area_reducibility, area_grid_rowcol_dict, area_grid_colrow_dict, method, grid_rowcol, grid_colrow, l_threshold, k_threshold, deltaX, pp): df_train_copy = copy.deepcopy(df_train) df_train_copy['grid_group'] = pd.Series(np.zeros(df_train_copy.shape[0]), index=df_train_copy.index, dtype='int32') user_num = df_train_copy.shape[0] X_ori = {} for k in range(user_num): user_id = df_train_copy['uid'][k] X_ori[user_id] = df_train_copy[df_train_copy['uid'] == user_id].values[ 0, :-1] for k in X_ori.keys(): user_grid = X_ori[k][-2] X_ori[k][-3] = grid_area_dict[user_grid] for i in area_grid_dict: print("user number in area ", i, " is ", funcs.k_anonymity(df_train, area_grid_dict[i])) print("start solving xpgg...") xpgg = np.ones((cluster_num * grid_area_number, cluster_num * grid_area_number)) * 0.00000001 JSD_Mat = np.ones( (cluster_num * grid_area_number, cluster_num * grid_area_number)) pgy = np.ones( (len(grid_list), cluster_num * grid_area_number)) * 0.00000001 JSD_Mat_dict = {} pgy_dict = {} for op in range(0, 6): ## compute JSD and pgy JSD_Mat, pgy, JSD_Mat_dict, pgy_dict = funcs.get_JSD_PGY( df_train, area_grid_dict, JSD_Mat_dict, cluster_num, pgy_dict, JSD_Mat, pgy, method) print('op:', op) grid_xpgg_dict = {} ## compute xpgg for gg in range(0, grid_area_number): eng = matlab.engine.start_matlab() eng.edit('../../matlab/checkin_clusternum_scenario_II/HyObscure', nargout=0) eng.cd('../../matlab/checkin_clusternum_scenario_II', nargout=0) grid_xpgg_dict[gg] = np.array(eng.HyObscure(deltaX, gg)) for row in range(cluster_num): for col in range(cluster_num): xpgg[gg + row * grid_area_number, gg + col * grid_area_number] = grid_xpgg_dict[gg][row, col] mean_Utility = funcs.Mean_JSD(JSD_Mat, xpgg) mean_Privacy = funcs.Mean_KL_div(pgy, xpgg) min_mean_Utility = mean_Utility min_mean_Privacy = mean_Privacy ## area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, 
grid_area_dict, area_reducibility areas = list(area_grid_dict.keys()) random.shuffle(areas) ### change grid group (area) by stochastic privacy-utility boosting for area_code in areas: ##select one area to adjust area_grids = area_grid_dict[ area_code] ## get all the grids in the area l_cur = funcs.l_diversity(df_train, area_grids) ## check l diversity l_range = int(np.exp(l_cur) - np.exp(np.log(l_threshold))) print('start adjusting area: ', area_code) if l_range > 0: ### select one direction to adjust: left (0); right (1); up (2); down(3) d = np.random.choice([0, 1, 2, 3], p=area_reducibility[area_code] / np.sum(area_reducibility[area_code])) # the selected area can be reduced through the selected direction if d < 2: ## change left or right area_grid_line_list_dict = area_grid_rowcol_dict line_list_to_grid = funcs.rowcol_to_grid grid_linelist = grid_rowcol else: ## change up or down area_grid_line_list_dict = area_grid_colrow_dict line_list_to_grid = funcs.colrow_to_grid grid_linelist = grid_colrow area_lines = list(area_grid_line_list_dict[area_code].keys()) area_lines.sort() for line in area_lines: # recheck area l diversity area_grids = area_grid_dict[ area_code] ## get all the grids in the area l_cur = funcs.l_diversity(df_train, area_grids) ## check l diversity l_range = int(np.exp(l_cur) - np.exp(np.log(l_threshold))) change_range = l_range line_lists = area_grid_line_list_dict[area_code][line] line_lists.sort() line_lists_len = len(line_lists) if change_range > line_lists_len: change_range = line_lists_len for i in range(1, change_range + 1): if d == 0 or d == 3: moveout_grid_lists = line_lists[-i:] elif d == 1 or d == 2: moveout_grid_lists = line_lists[:i] moveout_grids = [] for mgc in moveout_grid_lists: moveout_grids.append(line_list_to_grid(line, mgc)) adjusted_area_grids = list( set(area_grids) - set(moveout_grids)) ## check k anonymity k_adjust = funcs.k_anonymity(df_train, adjusted_area_grids) ## the adjusted schema meets both k-anonymity and 
l-diversity if k_adjust >= k_threshold: if d == 0: to_area = area_code + 1 elif d == 1: to_area = area_code - 1 elif d == 2: to_area = area_code - int( grid_area_number / int(np.sqrt(grid_area_number))) elif d == 3: to_area = area_code + int( grid_area_number / int(np.sqrt(grid_area_number))) ## adjust grid groups (areas): update area_grid_dict and grid_area_dict area_grid_dict_cur = copy.deepcopy(area_grid_dict) adjusted_area_grids.sort() area_grid_dict_cur[area_code] = adjusted_area_grids area_grid_dict_cur[to_area] = list( set(area_grid_dict_cur[to_area]) | set(moveout_grids)) area_grid_dict_cur[to_area].sort() grid_area_dict_cur = copy.deepcopy(grid_area_dict) for grid in moveout_grids: grid_area_dict_cur[grid] = to_area for i in area_grid_dict_cur: print("area:", i, "grid number:", len(area_grid_dict_cur[i])) print('from area: ', area_code, 'to area: ', to_area, 'change line: ', line, 'moveout_grids: ', moveout_grids) df_train_new = funcs.update_grid_group( df_train, grid_area_dict_cur) # try: new_JSD_Mat, new_pgy, new_JSD_Mat_dict, new_pgy_dict = funcs.get_JSD_PGY( df_train_new, area_grid_dict_cur, JSD_Mat_dict, cluster_num, pgy_dict, JSD_Mat, pgy, method) new_mean_Utility = funcs.Mean_JSD( new_JSD_Mat, xpgg) new_mean_Privacy = funcs.Mean_KL_div(new_pgy, xpgg) if new_mean_Privacy < min_mean_Privacy and new_mean_Utility < min_mean_Utility: min_mean_Utility = new_mean_Utility min_mean_Privacy = new_mean_Privacy min_grid_area_dict = grid_area_dict_cur min_area_grid_dict = area_grid_dict_cur min_df_train = df_train_new grid_area_dict = min_grid_area_dict area_grid_dict = min_area_grid_dict df_train = min_df_train min_distortion_budget = min_mean_Utility area_grid_rowcol_dict, area_grid_colrow_dict = funcs.update_rowcol_colrow_dict( area_grid_dict) print("! 
Find a better area group") break print(op, area_code, to_area, line, mgc, mean_Privacy, mean_Utility, min_mean_Privacy, min_mean_Utility, new_mean_Privacy, new_mean_Utility) else: print("*** area not meet k_anonymity requirement") else: print("*** area not meet l_diversity requirement") df_train = funcs.update_grid_group(df_train, grid_area_dict) X_obf_dict = {} for i in range(25): X_obf_dict[i], _ = funcs.get_obf_X(df_train, xpgg, pp) return X_obf_dict, X_ori
cluster_col.append(6) elif i == 8: cluster_col.append(7) else: cluster_col.append(i) df_cluster.cluster = cluster_col cluster_num = len(set(df_cluster.cluster)) for grid_area_number in grid_area_number_list: print("initiate area...") area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate( grid_rowcol, grid_area_number) if method in ['HyObscure', 'YGen', 'XObf']: df_cluster = funcs.update_grid_group(df_cluster, grid_area_dict) results = {} results['ori_acc_rf'] = [] results['obf_acc_rf'] = [] results['ori_acc_xgb'] = [] results['obf_acc_xgb'] = [] results['rec_ori'] = [] results['rec_obf'] = [] for r in range(20): if method in ['HyObscure', 'YGen']: area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate( grid_rowcol, grid_area_number) df_cluster = funcs.update_grid_group(df_cluster, grid_area_dict)