def test_interface_count(self):
    """Run a DP COUNT query through the evaluator on neighboring datasets and
    check that the DP test passes with non-zero distance metrics.

    Renames the local `eval` to `evaluator` to avoid shadowing the builtin.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    # Initialize params and algorithm to benchmark
    pa = DPSingletonQuery()
    pp = PrivacyParams(epsilon=1.0)
    ev = EvaluatorParams(repeat_count=100)
    dd = DatasetParams(dataset_size=500)
    query = "SELECT COUNT(UserId) AS UserCount FROM dataset.dataset"
    # Preparing neighboring datasets (differ by one record)
    df, metadata = self.create_simulated_dataset(dd.dataset_size, "dataset")
    d1_dataset, d2_dataset, d1_metadata, d2_metadata = self.generate_neighbors(df, metadata)
    d1 = PandasReader(d1_dataset, d1_metadata)
    d2 = PandasReader(d2_dataset, d2_metadata)
    # Call evaluate; `evaluator` instead of `eval` (builtin shadowing)
    evaluator = DPEvaluator()
    key_metrics = evaluator.evaluate([d1_metadata, d1], [d2_metadata, d2], pa, query, pp, ev)
    # After evaluation, it should return True and distance metrics should be non-zero
    for key, metrics in key_metrics.items():
        assert metrics.dp_res == True
        test_logger.debug("Wasserstein Distance:" + str(metrics.wasserstein_distance))
        test_logger.debug("Jensen Shannon Divergence:" + str(metrics.jensen_shannon_divergence))
        test_logger.debug("KL Divergence:" + str(metrics.kl_divergence))
        test_logger.debug("MSE:" + str(metrics.mse))
        test_logger.debug("Standard Deviation:" + str(metrics.std))
        test_logger.debug("Mean Signed Deviation:" + str(metrics.msd))
        assert metrics.wasserstein_distance > 0.0
        assert metrics.jensen_shannon_divergence > 0.0
        assert metrics.kl_divergence != 0.0
        assert metrics.mse > 0.0
        assert metrics.std != 0.0
        assert metrics.msd != 0.0
def test_group_by_exact_order_expr_desc(self):
    """Exact (non-private) GROUP BY with an expression alias in ORDER BY ... DESC."""
    reader = PandasReader(df, schema)
    result = reader.execute(
        "SELECT COUNT(*) * 5 AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC"
    )
    # Row 0 is the header; scaled counts must come back in descending order.
    assert result[1][0] == 549 * 5
    assert result[2][0] == 451 * 5
def learn(self, querypool, export_as_csv=False):
    """Evaluate every query in `querypool` on freshly generated neighboring
    datasets and collect DP pass/fail plus Jensen-Shannon divergence.

    :param querypool: list of SQL query strings to evaluate
    :param export_as_csv: when True, write results to 'Bandit.csv' and
        return None; otherwise return the list of result dicts
    """
    output = []
    for query in querypool:
        df, metadata = create_simulated_dataset(self.dd.dataset_size, "dataset")
        d1_dataset, d2_dataset, d1_metadata, d2_metadata = generate_neighbors(df, metadata)
        # Use the documented (data, metadata) order; the swapped legacy
        # order only works via a deprecation shim that emits a warning.
        d1 = PandasReader(d1_dataset, d1_metadata)
        d2 = PandasReader(d2_dataset, d2_metadata)
        evaluator = DPEvaluator()  # avoid shadowing builtin `eval`
        pa = DPSingletonQuery()
        key_metrics = evaluator.evaluate([d1_metadata, d1], [d2_metadata, d2], pa, query, self.pp, self.ev)
        if key_metrics['__key__'].dp_res is None:
            # Evaluation errored out; record the error message.
            dp_res = key_metrics['__key__'].dp_res
            error = key_metrics['__key__'].error
            output.append({"query": query, "dpresult": dp_res, "jensen_shannon_divergence": None, "error": error})
        else:
            # Aggregate across keys: DP passes only if every key passed;
            # report the worst (max) JS divergence.
            res_list = [[m.dp_res, m.jensen_shannon_divergence] for m in key_metrics.values()]
            dp_res = np.all(np.array([res[0] for res in res_list]))
            js_res = (np.array([res[1] for res in res_list])).max()
            output.append({"query": query, "dpresult": dp_res, "jensen_shannon_divergence": js_res, "error": None})
    if export_as_csv:
        write_to_csv('Bandit.csv', output, flag='bandit')
    else:
        return output
def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
    """
    Run the query using the private reader and input query.

    :param df: raw dataframe to query
    :param metadata: collection metadata describing the dataframe
    :param query: SQL aggregation query
    :param confidence: confidence level (currently unused — intervals are
        not available in the report)
    :param get_exact: when True, also compute the exact answer via the
        non-private reader (VAR is not supported there, so callers skip it)
    :returns: (noisy_values ndarray, exact actual, low_bounds, high_bounds)
        — the bounds lists stay empty until intervals are re-enabled.
    """
    reader = PandasReader(df, metadata)
    actual = 0.0
    # VAR not supported in Pandas Reader. So not needed to fetch actual on every aggregation
    if get_exact:
        actual = reader.execute_typed(query).rows()[1:][0][0]
    private_reader = PrivateReader(reader, metadata, self.epsilon)
    query_ast = private_reader.parse_query_string(query)
    # Execute once against the underlying reader (kept for its side effects
    # on the typed pipeline); the per-iteration TypedRowset copy the
    # original built here was dead code and has been removed.
    private_reader.reader.execute_ast_typed(query_ast)
    noisy_values = []
    low_bounds = []
    high_bounds = []
    for _ in range(self.repeat_count):
        res = private_reader._execute_ast(query_ast, True)
        # Disabled because confidence interval not available in report
        #interval = res.report[res.colnames[0]].intervals[confidence]
        #low_bounds.append(interval[0].low)
        #high_bounds.append(interval[0].high)
        noisy_values.append(res.rows()[1:][0][0])
    return np.array(noisy_values), actual, low_bounds, high_bounds
def test_legacy_params_pandas_reader(self):
    """PandasReader accepts legacy swapped (metadata, df) order with a warning,
    and rejects doubled-up arguments of the same wrong type."""
    # params swapped
    with pytest.warns(Warning):
        legacy_reader = PandasReader(schema, df)
    assert "metadata.collection.CollectionMetadata" in str(type(legacy_reader.metadata))
    # doubled up params of wrong type should fail
    for bad_pair in ((schema, schema), (df, df)):
        with pytest.raises(Exception):
            PandasReader(*bad_pair)
def test_group_by_noisy_typed_order_desc(self):
    """Private GROUP BY with ORDER BY ... DESC keeps rows sorted even after noise."""
    private_reader = PrivateReader(PandasReader(df, schema), schema, 4.0)
    rows = private_reader.execute_typed(
        "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC"
    )
    # Descending order: first noisy count strictly exceeds the second.
    assert rows['c'][0] > rows['c'][1]
def test_check_thresholds_gauss(self):
    """Check that PrivateReader's tau censoring threshold tracks the analytic
    Gaussian value across a grid of epsilon / max_contrib / delta settings."""
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(df, schema)
    parsed = QueryParser(schema).query(query)
    for eps in [0.1, 2.0]:
        for contrib in [1, 3]:
            for delta in [10E-5, 10E-15]:
                # Using slightly different formulations of the same formula
                # from different papers; make sure private_reader round-trips.
                scale = math.sqrt(contrib) * math.sqrt(2 * math.log(1.25 / delta)) / eps
                rho = 1 + scale * math.sqrt(2 * math.log(contrib / math.sqrt(2 * math.pi * delta)))
                schema_variant = copy.copy(schema)
                schema_variant["PUMS.PUMS"].max_ids = contrib
                private_reader = PrivateReader(reader, schema_variant, eps, delta)
                assert private_reader._options.max_contrib == contrib
                # Execute to populate tau; the result itself is not inspected.
                private_reader.execute_ast(parsed)
                assert math.isclose(private_reader.tau, rho, rel_tol=0.03, abs_tol=2)
def test_no_tau_noisy(self):
    """With censor_dims effectively off at tiny epsilon, no rows should ever be dropped."""
    private_reader = PrivateReader(PandasReader(df, schema), schema, 0.01)
    for _ in range(10):
        result = private_reader.execute_df(
            "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 90 AND educ = '8'"
        )
        # Exactly one aggregate row must survive on every repetition.
        assert len(result['c']) == 1
def test_ok1(self):
    """GROUP BY on a column lacking maxval is allowed — no clamping bounds needed."""
    relaxed = copy.copy(schema)
    relaxed["PUMS.PUMS"]["income"].maxval = None
    private_reader = PrivateReader(PandasReader(df, relaxed), relaxed, 4.0)
    # Should complete without raising; the result itself is not checked.
    private_reader.execute_df("SELECT income FROM PUMS.PUMS GROUP BY income")
def test_err1(self):
    """SUM over a column lacking maxval must be rejected — sensitivity is unbounded."""
    relaxed = copy.copy(schema)
    relaxed["PUMS.PUMS"]["income"].maxval = None
    private_reader = PrivateReader(PandasReader(df, relaxed), relaxed, 4.0)
    with pytest.raises(ValueError):
        private_reader.execute_df("SELECT SUM(income) FROM PUMS.PUMS")
def setup_class(cls):
    """Build a shared PrivateReader over PUMS with dimension censoring disabled."""
    meta = CollectionMetadata.from_file(meta_path)
    meta["PUMS.PUMS"].censor_dims = False
    frame = pd.read_csv(csv_path)
    base_reader = PandasReader(frame, meta)
    cls.reader = PrivateReader(base_reader, meta, 10.0, 10E-3)
def test_empty_result_count_typed_notau_prepost(self):
    """COUNT over an empty selection still yields a header row plus one count row."""
    reader = PandasReader(df, schema)
    parsed = QueryParser(schema).queries(
        "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
    )[0]
    private_reader = PrivateReader(reader, schema, 1.0)
    # Warm-up execution, then verify the shape is stable across repeats.
    private_reader._execute_ast(parsed, True)
    for _ in range(3):
        typed_rows = private_reader._execute_ast(parsed, True)
        assert len(typed_rows) == 2
def test_execute_without_dpsu(self):
    """When use_dpsu is off, the private reader uses the plain reader directly."""
    plain_schema = copy.copy(schema)
    plain_schema["PUMS.PUMS"].use_dpsu = False
    private_reader = PrivateReader(PandasReader(df, plain_schema), plain_schema, 1.0)
    assert private_reader._options.use_dpsu == False
    parsed = QueryParser(plain_schema).queries(
        "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married"
    )[0]
    # No DPSU preprocessing: _get_reader must return the wrapped reader itself.
    assert private_reader._get_reader(parsed) is private_reader.reader
def test_viz_child_nodes(self):
    """Every rewritten aggregate function node must render to a graphviz Digraph."""
    query = "SELECT AVG(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
    private_reader = PrivateReader(PandasReader(df, schema), schema, 1.0)
    inner, outer = private_reader.rewrite(query)
    for node in outer.find_nodes(AggFunction):
        rendered = node.visualize(n_trunc=30)
        assert isinstance(rendered, Digraph)
def QuerytoAST(self, query, meta, data):
    """Parse `query` into a private-reader AST.

    :param query: SQL query string (e.g. 'SELECT Usage AS Usage, SUM(Usage) + 3
        AS u FROM dataset.dataset GROUP BY Role')
    :param meta: collection metadata for the dataset
    :param data: dataframe backing the reader
    :returns: the parsed AST, or None when the query fails to parse
    """
    # Use the documented (data, metadata) / (reader, metadata) order; the
    # swapped legacy order only works via a deprecation shim that warns.
    reader = PandasReader(data, meta)
    private_reader = PrivateReader(reader, meta, self.pp.epsilon)
    try:
        ast = private_reader.parse_query_string(query)
    except Exception:
        # Parse failures are expected for generated queries; signal with
        # None rather than propagating (was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit).
        return None
    return ast
def test_yes_tau(self):
    """With censoring enabled, rare dimensions should usually be dropped,
    so repeated runs produce varying row counts."""
    private_reader = PrivateReader(PandasReader(df, schema), schema, 1.0, 1/10000)
    lengths = []
    for _ in range(10):
        result = private_reader.execute_df(
            "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 80 GROUP BY educ"
        )
        lengths.append(len(result['c']))
    # At least one repetition must differ from the first — censoring is random.
    first = lengths[0]
    assert any(length != first for length in lengths)
def setup_class(self):
    """Build a shared PrivateReader over PUMS with coerced column types
    and dimension censoring disabled."""
    meta = CollectionMetadata.from_file(meta_path)
    meta["PUMS.PUMS"].censor_dims = False
    # Coerce column types for this suite's queries.
    for column, new_type in (("sex", "int"), ("educ", "int"), ("married", "bool")):
        meta["PUMS.PUMS"][column].type = new_type
    frame = pd.read_csv(csv_path)
    base_reader = PandasReader(frame, meta)
    self.reader = PrivateReader(base_reader, meta, 10.0, 10E-3)
def _compute_reward(self, query):
    """Evaluate `query` on neighboring datasets and map the DP-evaluator
    outcome to a reward.

    :param query: SQL query string to evaluate
    :returns: (dpresult, reward, message, d1, d2) where dpresult is one of
        "DP_PASS" / "DP_FAIL" / "DP_BUG"
    """
    # observe() is kept for its side effects on the environment; its return
    # value (and the d1/d2 query-string variants the original computed) were
    # never used.
    self.observe(query)
    d1_dataset, d2_dataset, d1_metadata, d2_metadata = generate_neighbors(
        self.df, self.metadata)
    # Documented (data, metadata) order; the swapped legacy order only
    # works via a deprecation shim that warns.
    d1 = PandasReader(d1_dataset, d1_metadata)
    d2 = PandasReader(d2_dataset, d2_metadata)
    evaluator = DPEvaluator()  # avoid shadowing builtin `eval`
    pa = DPSingletonQuery()
    key_metrics = evaluator.evaluate([d1_metadata, d1], [d2_metadata, d2], pa,
                                     query, self.pp, self.ev)
    message = None
    if key_metrics["__key__"].dp_res is None:
        # Evaluator crashed: small reward, surface the error.
        dpresult = "DP_BUG"
        self.reward = 1
        message = key_metrics["__key__"].error
    elif key_metrics["__key__"].dp_res == False:
        self._game_ended = True
        dpresult = "DP_FAIL"
        self.reward = 20
        message = "dp_res_False"
    elif (key_metrics["__key__"].dp_res == True
            and key_metrics["__key__"].jensen_shannon_divergence == math.inf):
        self._game_ended = True
        dpresult = "DP_BUG"
        self.reward = 20
        message = "jsdistance_is_inf"
    else:
        # Aggregate across all keys: pass only if every key passed; use the
        # worst (max) JS divergence as the reward.
        res_list = []
        for key, metrics in key_metrics.items():
            res_list.append([metrics.dp_res, metrics.jensen_shannon_divergence])
        dp_res = np.all(np.array([res[0] for res in res_list]))
        js_res = (np.array([res[1] for res in res_list])).max()
        if dp_res == True:
            dpresult = "DP_PASS"
            self.reward = js_res
        else:
            # Bug fix: the original left `dpresult` unbound on this path,
            # raising NameError at the return below. A non-key metric
            # failing DP is a failure, consistent with the branch above.
            dpresult = "DP_FAIL"
    return dpresult, self.reward, message, d1, d2
def test_legacy_params_private_reader(self):
    """PrivateReader accepts legacy swapped (metadata, reader) order with a warning,
    and rejects doubled-up arguments of the same wrong type."""
    reader = PandasReader(df, schema)
    # params swapped
    with pytest.warns(Warning):
        legacy = PrivateReader(schema, reader, 1.0)
    assert isinstance(legacy.reader, PandasReader)
    # doubled up params of wrong type should fail
    for bad_pair in ((schema, schema), (reader, reader)):
        with pytest.raises(Exception):
            PrivateReader(*bad_pair, 1.0)
def test_viz_query_rewritten(self):
    """Both the inner and outer rewritten ASTs must render to graphviz Digraphs."""
    query = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
    parsed_query = QueryParser(schema).query(query)
    private_reader = PrivateReader(PandasReader(df, schema), schema, 1.0)
    inner, outer = private_reader.rewrite_ast(parsed_query)
    #graph.render('ast_digraph', view=True, cleanup=True)
    for ast in (outer, inner):
        rendered = ast.visualize(n_trunc=30)
        assert isinstance(rendered, Digraph)
def test_dpsu_vs_korolova(self):
    """DPSU should retain more grouped dimensions than the plain Korolova-style
    thresholding at the same epsilon."""
    query = "SELECT ngram, COUNT(*) as n FROM reddit.reddit GROUP BY ngram ORDER BY n desc"
    reader = PandasReader(df, schema)
    private_reader = PrivateReader(reader, schema, 3.0)
    private_reader.options.max_contrib = 10
    result = private_reader.execute_typed(query)
    private_reader_korolova = PrivateReader(reader, schema, 3.0)
    private_reader_korolova.options.dpsu = False
    private_reader_korolova.options.max_contrib = 10
    korolova_result = private_reader_korolova.execute_typed(query)
    assert len(result['n']) > len(korolova_result['n'])
    # Bug fix: removed `assert len(final_df) < len(df)` — `final_df` is not
    # defined anywhere in this test, so the line always raised NameError.
def test_calculate_multiplier(self):
    """AVG spends one more unit of budget than COUNT (it adds a SUM)."""
    datasets_dir = os.path.join(git_root_dir, os.path.join("service", "datasets"))
    pums_schema = CollectionMetadata.from_file(os.path.join(datasets_dir, "PUMS.yaml"))
    pums_df = pd.read_csv(os.path.join(datasets_dir, "PUMS.csv"))
    pums_reader = PandasReader(pums_df, pums_schema)
    cost_count = PrivateReader.get_budget_multiplier(
        pums_schema, pums_reader, "SELECT COUNT(*) FROM PUMS.PUMS")
    cost_avg = PrivateReader.get_budget_multiplier(
        pums_schema, pums_reader, "SELECT AVG(age) FROM PUMS.PUMS")
    assert 1 + cost_count == cost_avg
def test_sklearn_query():
    """Run a private AVG over the sklearn iris dataset and check the result shape."""
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data,
                              columns=sklearn_dataset.feature_names)
    # Declare the iris columns with their value bounds for DP clamping.
    iris_columns = [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3),
    ]
    iris = Table("dbo", "iris", iris_columns, 150)
    schema = CollectionMetadata([iris], "csv")
    reader = PandasReader(sklearn_df, schema)
    rowset = execute_private_query(
        schema, reader, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
    # First row of the rowset is the header.
    result_df = pd.DataFrame(rowset[1:], columns=rowset[0])
    assert result_df is not None
    assert len(result_df) == 1
def test_sklearn_query():
    """Run a private AVG over sklearn iris with both argument orders of
    execute_private_query, for backward-compatibility coverage."""
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data,
                              columns=sklearn_dataset.feature_names)
    # Declare the iris columns with their value bounds for DP clamping.
    iris_columns = [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3),
    ]
    iris = Table("dbo", "iris", iris_columns, 150)
    schema = CollectionMetadata([iris], "csv")
    reader = PandasReader(sklearn_df, schema)
    # Calling both times for back compat check
    for params in ([reader, schema], [schema, reader]):
        df = execute_private_query(
            *params, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
        assert df is not None
        assert len(df) == 1
def _load_reader(dataset_document):
    """Build a PandasReader for a dataset document via the local CSV adapter."""
    frame = LocalCSVAdapter.load_df(dataset_document)
    meta = LocalCSVAdapter.load_metadata(dataset_document)
    return PandasReader(frame, meta)
def test_empty_result_typed(self):
    """Selecting with an impossible predicate yields an empty typed dataframe."""
    reader = PandasReader(df, schema)
    rowset = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
    frame = reader._to_df(rowset)
    assert len(frame) == 0
def test_count_no_rows_exact_typed(self):
    """Exact COUNT over an impossible predicate is zero, not a missing row."""
    reader = PandasReader(df, schema)
    parsed = QueryParser(schema).queries(
        "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
    )[0]
    frame = reader._execute_ast_df(parsed)
    assert frame['c'][0] == 0
import pandas as pd
from opendp.smartnoise.sql import PandasReader, PrivateReader
from opendp.smartnoise.metadata import CollectionMetadata

# Demo script: compare exact vs private results for a PUMS query.
pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

# Alternate query kept for manual experimentation (the original assigned it
# and immediately overwrote it):
# query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'
query = 'SELECT COUNT(*) AS n, COUNT(pid) AS foo FROM PUMS.PUMS WHERE age > 80 GROUP BY educ'

# Use the documented (data, metadata) / (reader, metadata) order; the
# swapped legacy order only works via a deprecation shim that warns.
reader = PandasReader(pums, meta)
private_reader = PrivateReader(reader, meta, 4.0)
private_reader.options.censor_dims = True
private_reader.options.clamp_counts = True

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
def run_agg_query_df(self, df, metadata, query, confidence, file_name="d1"):
    """
    Run the query using the private reader and input query.
    Get query response back for multiple dimensions and aggregations.

    :param df: raw dataframe to query
    :param metadata: collection metadata describing the dataframe
    :param query: SQL aggregation query (may group by several dimensions)
    :param confidence: confidence level (currently unused — intervals are
        not available in the report)
    :param file_name: label for the dataset, kept for interface compatibility
    :returns: (noisy_df, exact_df, dim_cols, num_cols)
    """
    # Getting exact result (row 0 is the header)
    reader = PandasReader(df, metadata)
    exact_res = list(reader.execute_typed(query).rows()[1:])
    private_reader = PrivateReader(reader, metadata, self.epsilon)
    query_ast = private_reader.parse_query_string(query)
    # Execute once against the underlying reader (kept for its side effects
    # on the typed pipeline), then sample once to learn the column layout.
    private_reader.reader.execute_ast_typed(query_ast)
    sample_res = private_reader._execute_ast(query_ast, True)
    headers = sample_res.colnames
    # Distinguishing dimension (string) and measure (numeric) columns.
    dim_cols = [col for col in headers if sample_res.types[col] == "string"]
    num_cols = [col for col in headers if sample_res.types[col] != "string"]
    # Repeated query; store results. Dead code removed: the original rebuilt
    # an unused TypedRowset each iteration and read `singleres[col]` with a
    # stale `col` left over from the header loop before the real loops ran.
    res = []
    for idx in range(self.repeat_count):
        singleres = private_reader._execute_ast(query_ast, True)
        dim_rows = [singleres[col] for col in dim_cols]
        num_rows = []
        for col in num_cols:
            values = singleres[col]
            #low = singleres.report[col].intervals[confidence].low
            #high = singleres.report[col].intervals[confidence].high
            #num_rows.append(list(zip(values, low, high)))
            num_rows.append(list(zip(values)))
        res.extend(list(zip(*dim_rows, *num_rows)))
    exact_df = pd.DataFrame(exact_res, columns=headers)
    noisy_df = pd.DataFrame(res, columns=headers)
    # Add a dummy dimension column for cases where no dimensions available for merging D1 and D2
    if len(dim_cols) == 0:
        dim_cols.append("__dim__")
    if dim_cols[0] == "__dim__":
        exact_df[dim_cols[0]] = ["key"] * len(exact_df)
        noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)
    return noisy_df, exact_df, dim_cols, num_cols
def create(self):
    """Build a fresh PandasReader over this object's dataframe and metadata."""
    frame, meta = self.df, self.meta
    return PandasReader(frame, meta)