def _generate_histogram_neighbors(self, fD1, fD2, ep: EvaluatorParams):
    """
    Generate histograms given the vectors of repeated aggregation results
    applied on neighboring datasets
    """
    fD1 = np.asarray(fD1, dtype="float64")
    fD2 = np.asarray(fD2, dtype="float64")
    d = np.concatenate((fD1, fD2), axis=None)
    n = len(fD1)
    binlist = []
    minval = min(min(fD1), min(fD2))
    maxval = max(max(fD1), max(fD2))
    # Deciding bin width and bin list
    if ep.exact:
        binlist = np.linspace(minval, maxval, 2)
    elif ep.numbins > 0:
        binlist = np.linspace(minval, maxval, ep.numbins)
    elif ep.binsize == "auto":
        iqr = np.subtract(*np.percentile(d, [75, 25]))
        numerator = 2 * iqr if iqr > 0 else maxval - minval
        denominator = n ** (1.0 / 3)
        binwidth = numerator / denominator  # Freedman–Diaconis' choice
        ep.numbins = int(math.ceil((maxval - minval) / binwidth)) if maxval > minval else 20
        binlist = np.linspace(minval, maxval, ep.numbins)
    else:
        # Choose bin size of unity
        binlist = np.arange(np.floor(minval), np.ceil(maxval))
    # Calculating histograms of fD1 and fD2
    fD1hist, bin_edges = np.histogram(fD1, bins=binlist, density=False)
    fD2hist, bin_edges = np.histogram(fD2, bins=binlist, density=False)
    return fD1hist, fD2hist, bin_edges
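# A minimal, standalone sketch of the Freedman–Diaconis binning used in the
# method above, assuming plain numpy arrays in place of the evaluator's
# fD1/fD2 vectors. The names toy_fD1/toy_fD2 are illustrative only and not
# part of the library; this is not the evaluator's implementation.
import math

import numpy as np

toy_fD1 = np.random.laplace(loc=100.0, scale=1.0, size=500)
toy_fD2 = np.random.laplace(loc=99.0, scale=1.0, size=500)
pooled = np.concatenate((toy_fD1, toy_fD2))
lo, hi = pooled.min(), pooled.max()

# Freedman–Diaconis rule: bin width = 2 * IQR / n^(1/3), computed on the pooled sample
iqr = np.subtract(*np.percentile(pooled, [75, 25]))
binwidth = 2 * iqr / len(toy_fD1) ** (1.0 / 3)
numbins = int(math.ceil((hi - lo) / binwidth))

# Shared bin edges so the two histograms are directly comparable
edges = np.linspace(lo, hi, numbins)
hist1, _ = np.histogram(toy_fD1, bins=edges)
hist2, _ = np.histogram(toy_fD2, bins=edges)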
def test_interface_benchmark(self):
    logging.getLogger().setLevel(logging.DEBUG)
    lib = DPSampleLibrary()
    pa = DPSample()
    epsilon_list = [0.001, 0.5, 1.0, 2.0, 4.0]
    pp = PrivacyParams(epsilon=1.0)
    ev = EvaluatorParams(repeat_count=500)
    # Creating neighboring datasets
    d1 = pd.DataFrame(random.sample(range(1, 1000), 100), columns=['Usage'])
    drop_idx = np.random.choice(d1.index, 1, replace=False)
    d2 = d1.drop(drop_idx)
    benchmarking = DPBenchmarking()
    # Preparing benchmarking params
    pa_algorithms = {pa: [lib.dp_count]}
    privacy_params_list = []
    for epsilon in epsilon_list:
        pp = PrivacyParams()
        pp.epsilon = epsilon
        privacy_params_list.append(pp)
    d1_d2_list = [[d1, d2]]
    benchmark_params = BenchmarkParams(pa_algorithms, privacy_params_list, d1_d2_list, ev)
    benchmark_metrics_list = benchmarking.benchmark(benchmark_params)
    for bm in benchmark_metrics_list:
        for key, metrics in bm.key_metrics.items():
            test_logger.debug("Epsilon: " + str(bm.privacy_params.epsilon) +
                              " MSE:" + str(metrics.mse) +
                              " Privacy Test: " + str(metrics.dp_res))
            assert metrics.dp_res == True
    assert len(benchmark_metrics_list) == 5
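# Hedged sketch of why the benchmark sweeps epsilon and logs MSE: for a count
# query released with the Laplace mechanism (sensitivity 1), the noise scale is
# 1/epsilon, so the MSE of repeated releases is roughly 2/epsilon^2. Everything
# below is illustrative numpy, not the DPBenchmarking implementation.
import numpy as np

true_count = 100
for eps in [0.001, 0.5, 1.0, 2.0, 4.0]:
    releases = true_count + np.random.laplace(loc=0.0, scale=1.0 / eps, size=500)
    mse = np.mean((releases - true_count) ** 2)
    print(f"epsilon={eps}: empirical MSE={mse:.2f}, expected ~{2 / eps ** 2:.2f}")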
def test_interface_algorithm(self):
    logging.getLogger().setLevel(logging.DEBUG)
    lib = DPSampleLibrary()
    dv = DPSample()
    pp = PrivacyParams(epsilon=1.0)
    ev = EvaluatorParams(repeat_count=500)
    df = pd.DataFrame(random.sample(range(1, 1000), 100), columns=['Usage'])
    # Prepare and release from the Sample DP algorithm to send noisy results to the evaluator
    dv.prepare(lib.dp_count, pp, ev)
    report = dv.release(df)
    # Test the DP response from the interface
    assert isinstance(report.res, dict)
    assert len(report.res) > 0
    firstkey = list(report.res.keys())[0]
    test_logger.debug("First key name is:" + str(firstkey))
    test_logger.debug("Repeated noisy count responses: " + str(report.res[firstkey]))
    assert isinstance(firstkey, str)
    assert len(report.res[firstkey]) == ev.repeat_count
    # Test the non-DP (i.e. actual) response from the interface, which should be a single numeric value
    report = dv.actual_release(df)
    test_logger.debug("Actual count response: " + str(report.res[firstkey]))
    assert isinstance(report.res[firstkey], (int, float))
def test_interface_multikey(self):
    logging.getLogger().setLevel(logging.DEBUG)
    lib = DPSampleLibrary()
    pa = DPMultiKey()
    metrics = Metrics()
    # Before running the DP test, dp_res should default to False
    # and all distance metrics should default to 0
    assert metrics.dp_res == False
    assert metrics.wasserstein_distance == 0.0
    assert metrics.jensen_shannon_divergence == 0.0
    assert metrics.kl_divergence == 0.0
    assert metrics.mse == 0.0
    assert metrics.std == 0.0
    assert metrics.msd == 0.0
    pp = PrivacyParams(epsilon=1.0)
    ev = EvaluatorParams(repeat_count=500)
    # Creating neighboring datasets
    col1 = list(range(0, 1000))
    col2 = list(range(-1000, 0))
    d1 = pd.DataFrame(list(zip(col1, col2)), columns=['Col1', 'Col2'])
    drop_idx = np.random.choice(d1.index, 1, replace=False)
    d2 = d1.drop(drop_idx)
    # Call evaluate
    eval = DPEvaluator()
    key_metrics = eval.evaluate(d1, d2, pa, lib.dp_sum, pp, ev)
    # After evaluation, dp_res should be True and the distance metrics should be non-zero
    for key, metrics in key_metrics.items():
        assert metrics.dp_res == True
        test_logger.debug("Wasserstein Distance:" + str(metrics.wasserstein_distance))
        test_logger.debug("Jensen Shannon Divergence:" + str(metrics.jensen_shannon_divergence))
        test_logger.debug("KL Divergence:" + str(metrics.kl_divergence))
        test_logger.debug("MSE:" + str(metrics.mse))
        test_logger.debug("Standard Deviation:" + str(metrics.std))
        test_logger.debug("Mean Signed Deviation:" + str(metrics.msd))
        assert metrics.wasserstein_distance > 0.0
        assert metrics.jensen_shannon_divergence > 0.0
        assert metrics.kl_divergence != 0.0
        assert metrics.mse > 0.0
        assert metrics.std != 0.0
        assert metrics.msd != 0.0
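# A rough, self-contained sketch of how distance metrics like the ones asserted
# above can be computed from two vectors of repeated noisy results, assuming
# only numpy and scipy. The variable names, bin count, and smoothing constant
# are illustrative and not taken from the evaluator's implementation.
import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, wasserstein_distance

noisy_d1 = np.random.laplace(loc=500.0, scale=2.0, size=500)  # stand-in for repeated releases on d1
noisy_d2 = np.random.laplace(loc=499.0, scale=2.0, size=500)  # stand-in for repeated releases on d2

# Wasserstein distance works directly on the two samples
w_dist = wasserstein_distance(noisy_d1, noisy_d2)

# Divergences are computed on histograms over shared bin edges,
# smoothed slightly so no bin probability is exactly zero
edges = np.histogram_bin_edges(np.concatenate((noisy_d1, noisy_d2)), bins=20)
p, _ = np.histogram(noisy_d1, bins=edges)
q, _ = np.histogram(noisy_d2, bins=edges)
p = (p + 1e-9) / (p + 1e-9).sum()
q = (q + 1e-9) / (q + 1e-9).sum()

js_div = jensenshannon(p, q) ** 2  # jensenshannon returns the distance, i.e. the square root of the divergence
kl_div = entropy(p, q)             # KL divergence of p relative to q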
def test_interface_count(self):
    logging.getLogger().setLevel(logging.DEBUG)
    # Initialize params and the algorithm to benchmark
    pa = DPSingletonQuery()
    pp = PrivacyParams(epsilon=1.0)
    ev = EvaluatorParams(repeat_count=100)
    dd = DatasetParams(dataset_size=500)
    query = "SELECT COUNT(UserId) AS UserCount FROM dataset.dataset"
    # Preparing neighboring datasets
    df, metadata = self.create_simulated_dataset(dd.dataset_size, "dataset")
    d1_dataset, d2_dataset, d1_metadata, d2_metadata = self.generate_neighbors(df, metadata)
    d1 = PandasReader(d1_dataset, d1_metadata)
    d2 = PandasReader(d2_dataset, d2_metadata)
    # Call evaluate
    eval = DPEvaluator()
    key_metrics = eval.evaluate([d1_metadata, d1], [d2_metadata, d2], pa, query, pp, ev)
    # After evaluation, dp_res should be True and the distance metrics should be non-zero
    for key, metrics in key_metrics.items():
        assert metrics.dp_res == True
        test_logger.debug("Wasserstein Distance:" + str(metrics.wasserstein_distance))
        test_logger.debug("Jensen Shannon Divergence:" + str(metrics.jensen_shannon_divergence))
        test_logger.debug("KL Divergence:" + str(metrics.kl_divergence))
        test_logger.debug("MSE:" + str(metrics.mse))
        test_logger.debug("Standard Deviation:" + str(metrics.std))
        test_logger.debug("Mean Signed Deviation:" + str(metrics.msd))
        assert metrics.wasserstein_distance > 0.0
        assert metrics.jensen_shannon_divergence > 0.0
        assert metrics.kl_divergence != 0.0
        assert metrics.mse > 0.0
        assert metrics.std != 0.0
        assert metrics.msd != 0.0
def __init__(self, learner_params):
    self.lp = learner_params
    self.pp = PrivacyParams(epsilon=1.0)
    self.ev = EvaluatorParams(repeat_count=100)
    self.dd = DatasetParams(dataset_size=500)
def __init__(self):
    self.pp = PrivacyParams(epsilon=1.0)
    self.ev = EvaluatorParams(repeat_count=100)
    self.dd = DatasetParams(dataset_size=500)
    self.pa = DPSingletonQuery()