def _analyze_privacy(paras):
    epsilon2 = analysis.epsilon(N=paras['n_row'],
                                batch_size=paras['minibatch_size'],
                                noise_multiplier=paras['noise_multiplier'],
                                delta=paras['delta'],
                                iterations=paras['iterations'] * (paras['n_col'] - 1))
    return epsilon2
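# A minimal usage sketch for `_analyze_privacy`. The function and values below are
# hypothetical example data, not one of the configured experiments; the required keys
# mirror the `paras` dictionaries built in the syn_* functions further down.
def _example_analyze_privacy():
    """Illustrative only: show the keys `_analyze_privacy` expects."""
    example_paras = {
        'n_row': 30000,           # rows in the true dataset
        'n_col': 10,              # columns in the true dataset
        'minibatch_size': 20,     # DP-SGD batch size
        'noise_multiplier': 1.1,  # DP-SGD noise multiplier
        'delta': 1e-5,            # target delta
        'iterations': 2000        # iterations per attribute model
    }
    return _analyze_privacy(example_paras)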
def syn_br2000():
    """
    Group binary attributes in the preprocessing step.
    Restore them back after the HoloMake process.
    """
    start = time.time()
    path_data = f'./testdata/br2000/br2000.csv'
    path_ic = f'./testdata/br2000/br2000.ic'
    # Group binary attributes as one attribute. The returned path includes _concat.
    path_data_preproc = preproc_br2000_separate(path_data)
    n_row, n_col = pd.read_csv(path_data_preproc).shape
    n_len = len(str(n_row)) + 1
    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col,  # number of columns in the true dataset
        'epsilon1': .4,  # epsilon1
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 29,  # batch size to sample for each iteration
        'microbatch_size': 1,  # micro batch size
        'delta': float(f'1e-{n_len}'),  # depends on data size. Do not change for now
        'learning_rate': 1e-4,
        'iterations': 1500  # =1500 for comparison, =1000 for testing learned weights
    }
    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        gaussian_std = []
        sensitivity_hist = 2
        std1 = np.sqrt(sensitivity_hist * 2. * np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(1):
            gaussian_std.append(std1)
        sensitivity = 100 * 3  # sample size * number of DCs
        std_learnW = np.sqrt(sensitivity * 2. * np.log(1.25 / 1e-3)) / 1.
        gaussian_std.append(std_learnW)
        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] * (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)
        paras['epsilon2'] = epsilon2
        msg = f"epsilon1= {paras['epsilon1']}\t epsilon2= {'{:.2f}'.format(epsilon2)}\t delta= {paras['delta']}\n" \
              f"epsilon= {epsilon}"
        print(msg)
    syn_data(path_data_preproc, path_ic, paras)
    path_syn = paras['path_syn']
    path_data_postproc = postproc_br2000(path_syn)
    end = time.time()
    logging.info(f'TIME_WALL= {end - start}')
    evaluate_data(path_data, path_data_postproc, path_ic)
    copy_log(paras)
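# The syn_* functions in this file repeatedly compute the same Gaussian noise scale inline:
# std = sqrt(sensitivity * 2 * ln(1.25 / delta)) / epsilon. The helper below is an
# illustrative sketch of that shared computation; it is not wired into the pipeline, and
# the name `_gaussian_noise_std` and its signature are assumptions.
def _gaussian_noise_std(sensitivity, delta, epsilon):
    """Noise std used for the noisy-histogram (epsilon1) releases, as computed inline below."""
    return np.sqrt(sensitivity * 2. * np.log(1.25 / delta)) / epsilon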
def syn_tpch():
    """
    Sample the first three attributes iid, then use tuple embedding for the remaining
    categorical attributes. The last two numerical attributes are also sampled iid.
    """
    start = time.time()
    path_data = f'./testdata/tpch/tpch_order.csv'
    path_data_cat = f'./testdata/tpch/tpch_order_cat.csv'
    path_data_num = f'./testdata/tpch/tpch_order_num.csv'
    path_ic = f'./testdata/tpch/tpch_order.ic'
    # c_custkey, c_mktsegment, c_nationkey,
    # n_name, n_regionkey, o_orderstatus, o_orderpriority,
    # c_acctbal, o_total_price
    df = pd.read_csv(path_data_cat)
    n_row, n_col = df.shape
    n_len = len(str(n_row)) + 1
    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col - 3,  # number of columns in the true dataset
        'epsilon1': .2,
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 19,
        'microbatch_size': 1,  # micro batch size
        'delta': float(f'1e-{n_len}'),  # depends on data size. Do not change for now
        'learning_rate': 1e-4,
        'iterations': 2000  # number of iterations. Should be large enough: iterations * minibatch_size > n_row
    }
    std1 = None
    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        paras['epsilon2'] = epsilon2
        gaussian_std = []
        sensitivity = 2
        std1 = np.sqrt(sensitivity * 2. * np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(5):
            gaussian_std.append(std1)
        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] * (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)
        msg = f"epsilon1= {5 * paras['epsilon1']}\t epsilon2= {'{:.2f}'.format(epsilon2)}\n" \
              f"epsilon= {epsilon}\t delta= {paras['delta']}"
        print(msg)

    # iid sampling for c_custkey, c_mktsegment, c_nationkey
    custkeys = list(df['c_custkey'].unique())
    mktsegments = list(df['c_mktsegment'].unique())
    nationkeys = list(df['c_nationkey'].unique())
    probas_custkeys = []
    probas_mktsegments = []
    probas_nationkeys = []
    temp_df_custkeys = df['c_custkey'].value_counts()
    for custkey in custkeys:
        count = temp_df_custkeys[custkey]
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_custkeys.append(noisy_count)
    probas_custkeys = np.array(probas_custkeys) / np.sum(probas_custkeys)
    temp_df_mktsegments = df['c_mktsegment'].value_counts()
    for mktsegment in mktsegments:
        count = temp_df_mktsegments[mktsegment]
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_mktsegments.append(noisy_count)
    probas_mktsegments = np.array(probas_mktsegments) / np.sum(probas_mktsegments)
    temp_df_nationkeys = df['c_nationkey'].value_counts()
    for nationkey in nationkeys:
        count = temp_df_nationkeys[nationkey]
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_nationkeys.append(noisy_count)
    probas_nationkeys = np.array(probas_nationkeys) / np.sum(probas_nationkeys)

    syn_custkeys = []
    syn_mktsegments = []
    syn_nationkeys = []
    while len(syn_custkeys) < df.shape[0]:
        custkey = np.random.choice(custkeys, p=probas_custkeys)
        if custkey in syn_custkeys:
            # reuse the earlier assignment so c_custkey -> (c_mktsegment, c_nationkey) stays consistent
            idx = syn_custkeys.index(custkey)
            nationkey = syn_nationkeys[idx]
            mktsegment = syn_mktsegments[idx]
        else:
            mktsegment = np.random.choice(mktsegments, p=probas_mktsegments)
            nationkey = np.random.choice(nationkeys, p=probas_nationkeys)
        syn_custkeys.append(custkey)
        syn_mktsegments.append(mktsegment)
        syn_nationkeys.append(nationkey)
    dicti = {
        'c_custkey': syn_custkeys,
        'c_mktsegment': syn_mktsegments,
        'c_nationkey': syn_nationkeys
    }
    syn = pd.DataFrame(dicti)
    path_tmp = '/tmp/tpch_3.csv'
    syn.to_csv(path_tmp, index=False)
    synthesize_continue(path_data=path_data_cat,
                        path_syn_old=path_tmp,
                        path_constraint=path_ic,
                        paras=paras,
                        rand_sequence=False,
                        ic_sampling=True)
    path_syn = paras['path_syn']
    # for the numerical attributes
    df_num = _syn_tpch_num(path_data_num, std1)
    df_cat = pd.read_csv(path_syn)
    df_syn = df_cat.copy()
    for num_attr in list(df_num.columns):
        df_syn[num_attr] = df_num[num_attr].copy()
    df_syn.to_csv(path_syn, index=False)
    logging.info(
        f'path_data_cat= {path_data_cat}\npath_data_num= {path_data_num}\n'
        f'path_ic= {path_ic}\npath_syn= {path_syn}')
    logging.info(f'{paras}')
    end = time.time()
    time_wall = end - start
    time_train = paras['TIME_TRAINING']
    time_seq = paras['TIME_SEQ']
    time_sampling = time_wall - time_seq * 2 - time_train
    logging.info(f'TIME_WALL= {time_wall}')
    logging.info(f'TIME_SEQ= {time_seq}')
    logging.info(f'TIME_TRAINING= {time_train}')
    logging.info(f'TIME_SAMPLING= {time_sampling}')
    evaluate_data(path_data, path_syn, path_ic)
    copy_log(paras)
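# syn_tpch above (and syn_tax below) repeat the same pattern for each iid attribute:
# take the true value counts, add Gaussian noise with std1 to each count, clip at zero,
# and normalize into sampling probabilities. The helper below is a standalone sketch of
# that pattern; its name `_noisy_category_probas` is an assumption and it is not called
# by the experiment functions.
def _noisy_category_probas(series, std):
    """Return (categories, probabilities) from noisy, zero-clipped value counts of a column."""
    categories = list(series.unique())
    counts = series.value_counts()
    noisy = [max(0, counts[c] + np.random.normal(0, std)) for c in categories]
    probas = np.array(noisy) / np.sum(noisy)
    return categories, probas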
def syn_adult100():
    """
    Quick run on the adult data with 100 rows.
    """
    start = time.time()
    path_data = f'./testdata/a100/adult100.csv'
    path_ic = f'./testdata/a100/adult100.ic'
    path_data_preproc = preproc_adult(path_data)
    n_row, n_col = pd.read_csv(path_data_preproc).shape
    n_len = len(str(n_row)) + 1
    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col,  # number of columns in the true dataset
        'epsilon1': .1,
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 23,
        'microbatch_size': 1,
        'delta': float(f'1e-{n_len}'),
        'learning_rate': 1e-4,
        'iterations': 10
    }
    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        paras['epsilon2'] = epsilon2
        gaussian_std = []
        sensitivity = 2
        std1 = np.sqrt(sensitivity * 2. * np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(1):
            gaussian_std.append(std1)
        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] * (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)
        msg = f"epsilon1= {paras['epsilon1']}\t epsilon2= {'{:.2f}'.format(epsilon2)}\t delta= {paras['delta']}\n" \
              f"epsilon= {epsilon}"
        print(msg)
    syn_data(path_data_preproc, path_ic, paras)
    path_syn = paras['path_syn']
    path_data_postproc = postproc_adult(path_syn)
    end = time.time()
    logging.info(f'TIME_WALL= {end - start}')
    evaluate_data(path_data, path_data_postproc, path_ic)
    copy_log(paras)
def syn_tax():
    """
    The full tax data with City and Zip - private setup.
    Use iid sampling for Zip and City.
    Revision note: this is actually the chosen setup.
    """
    start = time.time()
    path_data = f'./testdata/tax/tax30k.csv'
    path_ic = f'./testdata/tax/tax30k.ic'
    path_data_part = f'./testdata/tax/tax30k_part.csv'
    path_ic_part = f'./testdata/tax/tax30k_part.ic'
    df = pd.read_csv(path_data)
    n_row, n_col = df.shape
    n_len = len(str(n_row)) + 1
    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col - 2,  # number of columns in the true dataset
        'epsilon1': .4,  # epsilon1, for iid sampling
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 18,  # batch size to sample for each iteration
        'microbatch_size': 1,  # micro batch size
        'delta': float(f'1e-{n_len}'),  # depends on data size. Do not change for now
        'learning_rate': 1e-4,
        'iterations': 2000  # number of iterations. Should be large enough: iterations * minibatch_size > N
    }
    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        paras['epsilon2'] = epsilon2
        gaussian_std = []
        sensitivity = 2
        std1 = np.sqrt(sensitivity * 2. * np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(3):
            gaussian_std.append(std1)
        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] * (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)
        msg = f"epsilon1= {3 * paras['epsilon1']}\tepsilon2= {'{:.2f}'.format(epsilon2)}\t" \
              f"delta= {paras['delta']}\nepsilon= {epsilon}"
        print(msg)
    syn_data(path_data_part, path_ic_part, paras)
    path_syn_part = paras['path_syn']

    # continue with 'Zip' and 'City'
    zips = list(df['Zip'].unique())
    cities = list(df['City'].unique())
    probas_zips = []
    probas_cities = []
    temp_df_zips = df['Zip'].value_counts()
    for crnt_zip in zips:
        count = temp_df_zips[crnt_zip]
        # noise = np.random.laplace(0, 1.0 / paras['epsilon1'])
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_zips.append(noisy_count)
    probas_zips = np.array(probas_zips) / np.sum(probas_zips)
    temp_df_cities = df['City'].value_counts()
    for city in cities:
        count = temp_df_cities[city]
        # noise = np.random.laplace(0, 1.0 / paras['epsilon1'])
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_cities.append(noisy_count)
    probas_cities = np.array(probas_cities) / np.sum(probas_cities)

    df_syn = pd.read_csv(path_syn_part)
    syn_zips = []
    syn_cities = []
    syn_states = df_syn['State'].tolist()
    while len(syn_zips) < len(syn_states):
        idx = len(syn_zips)
        crnt_state = syn_states[idx]
        crnt_zip = np.random.choice(zips, p=probas_zips)
        previous_city = None
        # check whether crnt_zip violates Zip -> State
        if crnt_zip in syn_zips:
            previous_idx = syn_zips.index(crnt_zip)
            previous_state = syn_states[previous_idx]
            previous_city = syn_cities[previous_idx]
            if previous_state != crnt_state:
                continue
        # so far, Zip -> State holds
        if previous_city is None:
            crnt_city = np.random.choice(cities, p=probas_cities)
        else:
            crnt_city = previous_city
        syn_zips.append(crnt_zip)
        syn_cities.append(crnt_city)
    df_syn['Zip'] = syn_zips
    df_syn['City'] = syn_cities
    tmp_syn_path = '/tmp/tax_iid.csv'
    df_syn.to_csv(tmp_syn_path, index=False)
    synthesize_continue(path_data=path_data,
                        path_syn_old=tmp_syn_path,
                        path_constraint=path_ic,
                        paras=paras,
                        rand_sequence=False,
                        ic_sampling=True)
    path_syn = paras['path_syn']
    end = time.time()
    time_wall = end - start
    paras['TIME_WALL'] = time_wall
    time_sampling = time_wall - paras['TIME_TRAINING']
    logging.info(f"TIME_TRAINING= {paras['TIME_TRAINING']}")
    logging.info(f'TIME_SAMPLING= {time_sampling}')
    logging.info(f'TIME_ALL= {time_wall}')
    evaluate_data(path_data, path_syn, path_ic)
    copy_log(paras)
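# In syn_tax above, re-drawing a Zip that was already used must keep the functional
# dependencies Zip -> State and Zip -> City intact: a repeated Zip is accepted only if the
# previously assigned State matches, and then it reuses the previously assigned City.
# A compact sketch of that acceptance rule; the helper name `_zip_is_consistent` is an
# assumption and the function is not called by the code above.
def _zip_is_consistent(crnt_zip, crnt_state, syn_zips, syn_states):
    """Return True if crnt_zip is new, or its earlier occurrence had the same State."""
    if crnt_zip not in syn_zips:
        return True
    return syn_states[syn_zips.index(crnt_zip)] == crnt_state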