def dp_query_test(self, d1_query, d2_query, debug=False, plot=True, bound=True, exact=False, repeat_count=10000, confidence=0.95, get_exact=True):
    """Run the full DP evaluation suite for a pair of queries on neighboring datasets.

    Executes `d1_query`/`d2_query` `repeat_count` times against the two
    neighboring datasets, then applies the DP histogram test, the accuracy
    test, and the bias test to the repeated responses.

    Returns a tuple (dp_res, acc_res, utility_res, bias_res) of test verdicts.
    """
    aggregator = agg.Aggregation(t=1, repeat_count=repeat_count)

    # Build the neighboring dataset pair (differing by one record) from CSV.
    d1, d2, d1_metadata, d2_metadata = self.generate_neighbors(load_csv=True)

    # Repeatedly run each query; also capture the exact (non-private) answer
    # and the reported confidence bounds.
    fD1, fD1_actual, fD1_low, fD1_high = aggregator.run_agg_query(
        d1, d1_metadata, d1_query, confidence, get_exact)
    fD2, fD2_actual, fD2_low, fD2_high = aggregator.run_agg_query(
        d2, d2_metadata, d2_query, confidence, get_exact)

    # Histogram the two response distributions over shared bin edges and
    # apply the DP predicate test.
    d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
        fD1, fD2, binsize="auto")
    d1size, d2size = fD1.size, fD2.size
    dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = self.dp_test(
        d1hist, d2hist, bin_edges, d1size, d2size, debug)

    # Accuracy/utility against the reported bounds, and bias against the
    # exact answer.
    acc_res, utility_res, within_bounds = self.accuracy_test(
        fD1_actual, fD1_low, fD1_high, confidence)
    bias_res, msd = self.bias_test(fD1_actual, fD1)

    if plot:
        self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                      d2histupperbound, d1hist, d2hist,
                                      d1lower, d2lower, bin_edges, bound,
                                      exact)
    return dp_res, acc_res, utility_res, bias_res
def dp_powerset_test(self, query_str, debug=False, plot=True, bound=True, exact=False, repeat_count=10000, confidence=0.95, test_cases=5):
    """Run the DP test across the powerset of small Halton-sampled datasets.

    For each of `test_cases` datasets generated from a Halton sequence, the
    powerset of neighboring table pairs is built and `query_str` is evaluated
    on each pair; every pair gets the DP histogram test and the bias test.

    Returns (dp_res, acc_res, utility_res, bias_res); the accuracy and
    utility slots are currently always None (test disabled below).
    """
    aggregator = agg.Aggregation(t=1, repeat_count=repeat_count)
    explorer = exp.Exploration()
    results = {}
    halton_samples = explorer.generate_halton_samples(
        bounds=explorer.corners, dims=explorer.N, n_sample=test_cases)

    # Iterate through each sample generated by halton sequence
    for sample in halton_samples:
        df, metadata = explorer.create_small_dataset(sample)
        explorer.generate_powerset(df)
        print("Test case: ", list(sample))
        for filename in explorer.visited:
            print("Testing: ", filename)
            # Fully-qualified query targets for the d1_/d2_ table pair.
            d1_query = query_str + "d1_" + filename + "." + "d1_" + filename
            d2_query = query_str + "d2_" + filename + "." + "d2_" + filename
            d1, d2, d1_metadata, d2_metadata = explorer.neighbor_pair[filename]

            fD1, fD1_actual, fD1_low, fD1_high = aggregator.run_agg_query(
                d1, d1_metadata, d1_query, confidence)
            fD2, fD2_actual, fD2_low, fD2_high = aggregator.run_agg_query(
                d2, d2_metadata, d2_query, confidence)

            #acc_res, utility_res, within_bounds = self.accuracy_test(fD1_actual, fD1_low, fD1_high, confidence)
            acc_res, utility_res, within_bounds = None, None, None
            bias_res, msd = self.bias_test(fD1_actual, fD1)

            d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
                fD1, fD2, binsize="auto")
            d1size, d2size = fD1.size, fD2.size
            dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = self.dp_test(
                d1hist, d2hist, bin_edges, d1size, d2size, debug)
            print("DP Predicate Test Result: ", dp_res)

            if plot:
                self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                              d2histupperbound, d1hist,
                                              d2hist, d1lower, d2lower,
                                              bin_edges, bound, exact)

            # Key each outcome by the sample vector plus the table name.
            key = "[" + ','.join(str(e) for e in list(sample)) + "] - " + filename
            results[key] = [
                dp_res, acc_res, utility_res, within_bounds, bias_res, msd
            ]

    print("Halton sequence based Powerset Test Result")
    for data, res in results.items():
        print(data, "-", res)

    # Aggregate verdicts: every pair must pass for the overall test to pass.
    dp_res = np.all(np.array([res[0] for data, res in results.items()]))
    #acc_res = np.all(np.array([res[1] for res in res_list]))
    #utility_res = np.all(np.array([res[2] for res in res_list]))
    acc_res, utility_res = None, None
    bias_res = np.all(np.array([res[4] for data, res in results.items()]))
    return dp_res, acc_res, utility_res, bias_res
def whitenoise_core_test(self, dataset_path, col_names, f, *args, numbins=0, binsize="auto", debug=False, plot=True, bound=True, exact=False, repeat_count=100, epsilon=1.0, actual=1.0, **kwargs):
    """Evaluate a whitenoise-core DP aggregate `f` on neighboring CSV files.

    Repeats the aggregate `repeat_count` times on each neighbor, histograms
    the responses, and applies the DP predicate test plus the bias test
    against the supplied `actual` (true) value.

    Returns a tuple (dp_res, bias_res) of test verdicts.
    """
    aggregator = agg.Aggregation(t=1, repeat_count=repeat_count)
    self.dataset_path = dataset_path

    # Materialize the neighboring pair as d1.csv / d2.csv on disk.
    d1, d2, d1_metadata, d2_metadata = self.generate_neighbors(load_csv=True)
    d1_file_path = os.path.join(self.file_dir, self.csv_path, "d1.csv")
    d2_file_path = os.path.join(self.file_dir, self.csv_path, "d2.csv")

    # Three extra positional args selects the multi-argument aggregate API;
    # otherwise the single-aggregate path is used. Same call shape either way.
    if len(args) == 3:
        run_agg = aggregator.whitenoise_core_dp_multi_agg
    else:
        run_agg = aggregator.whitenoise_core_dp_agg
    fD1 = run_agg(f, d1_file_path, col_names, args, epsilon, kwargs)
    fD2 = run_agg(f, d2_file_path, col_names, args, epsilon, kwargs)

    d1size, d2size = fD1.size, fD2.size
    d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
        fD1, fD2, numbins, binsize, exact=exact)
    dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = self.dp_test(
        d1hist, d2hist, bin_edges, d1size, d2size, debug)
    print("DP Predicate Test:", dp_res, "\n")

    bias_res, msd = self.bias_test(actual, fD1)
    print("Bias Test:", bias_res, "\n")

    if plot:
        self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                      d2histupperbound, d1hist, d2hist,
                                      d1lower, d2lower, bin_edges, bound)
    return dp_res, bias_res
def dp_groupby_query_test(self, d1_query, d2_query, debug=False, plot=True, bound=True, exact=False, repeat_count=10000, confidence=0.95):
    """Run the DP evaluation suite on GROUP BY queries over neighboring datasets.

    Each repeated query run returns one row per dimension key; responses are
    regrouped per (numerical column, dimension key) so every key gets its own
    DP histogram test, accuracy test, and bias test.

    Returns a tuple (dp_res, acc_res, utility_res, bias_res) where each
    element is the AND of that verdict across all dimension keys and columns.
    """
    ag = agg.Aggregation(t=1, repeat_count=repeat_count)
    d1, d2, d1_metadata, d2_metadata = self.generate_neighbors(
        load_csv=True)
    d1_res, d1_exact, dim_cols, num_cols = ag.run_agg_query_df(
        d1, d1_metadata, d1_query, confidence, file_name="d1")
    d2_res, d2_exact, dim_cols, num_cols = ag.run_agg_query_df(
        d2, d2_metadata, d2_query, confidence, file_name="d2")
    res_list = []
    for col in num_cols:
        # Collapse the repeated runs to one row per dimension key, with the
        # per-run responses gathered into a list.
        d1_gp = d1_res.groupby(dim_cols)[col].apply(list).reset_index(
            name=col)
        d2_gp = d2_res.groupby(dim_cols)[col].apply(list).reset_index(
            name=col)
        exact_gp = d1_exact.groupby(dim_cols)[col].apply(list).reset_index(
            name=col)
        # Full outer join after flattening the results above to one row per dimension key
        # We cannot be sure if every dimension key has a response in every repeated query run because of tau thresholding
        # That's why we do a full outer join and flatten whatever vector of results we get for the numerical column across repeat runs
        # This is what we use for generating the histogram of results for that dimension key
        d1_d2 = d1_gp.merge(d2_gp, on=dim_cols, how='outer')
        d1_d2 = d1_d2.merge(exact_gp, on=dim_cols, how='left')
        # Last three columns are: d1 responses, d2 responses, exact values.
        n_cols = len(d1_d2.columns)
        for index, row in d1_d2.iterrows():
            print(d1_d2.iloc[index, :n_cols - 3])
            print("Column: ", col)
            # fD1 and fD2 will have the results of the K repeated query results that can be passed through histogram test
            # These results are for that particular numerical column and the specific dimension key of d1_d2
            fD1 = np.array(
                [val[0] for val in d1_d2.iloc[index, n_cols - 3]])
            fD2 = np.array(
                [val[0] for val in d1_d2.iloc[index, n_cols - 2]])
            exact_val = d1_d2.iloc[index, n_cols - 1][0]
            d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
                fD1, fD2, binsize="auto")
            d1size, d2size = fD1.size, fD2.size
            dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = self.dp_test(
                d1hist, d2hist, bin_edges, d1size, d2size, debug)
            print("DP Predicate Test Result: ", dp_res)
            # Accuracy Test
            # NOTE(review): the confidence bounds below are read from the d2
            # response column (n_cols - 2) while fD1 and exact_val come from
            # d1 (n_cols - 3) — confirm this asymmetry is intentional.
            low = np.array(
                [val[1] for val in d1_d2.iloc[index, n_cols - 2]])
            high = np.array(
                [val[2] for val in d1_d2.iloc[index, n_cols - 2]])
            acc_res, utility_res, within_bounds = self.accuracy_test(
                exact_val, low, high, confidence)
            bias_res, msd = self.bias_test(exact_val, fD1)
            res_list.append([
                dp_res, acc_res, utility_res, within_bounds, bias_res, msd
            ])
            if plot:
                self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                              d2histupperbound, d1hist,
                                              d2hist, d1lower, d2lower,
                                              bin_edges, bound, exact)
    for res in res_list:
        print(res)
    # res_list is always the plain list built above; the former
    # `res_list.values() if hasattr(res_list, "values")` guard could never
    # trigger (lists have no .values) and has been removed as dead code.
    dp_res = np.all(np.array([res[0] for res in res_list]))
    acc_res = np.all(np.array([res[1] for res in res_list]))
    utility_res = np.all(np.array([res[2] for res in res_list]))
    bias_res = np.all(np.array([res[4] for res in res_list]))
    return dp_res, acc_res, utility_res, bias_res