def test_empty_result_count_typed_notau_prepost(self):
    reader = PandasReader(df, schema)
    query = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
    private_reader = PrivateReader(reader, schema, 1.0)
    private_reader._execute_ast(query, True)
    for i in range(3):
        trs = private_reader._execute_ast(query, True)
        assert(len(trs) == 2)
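# The tests in this section reference module-level `df` and `schema` fixtures.
# A minimal sketch of how they might be built, assuming the PUMS sample data
# and a YAML metadata file; the file paths and the CollectionMetadata import
# are assumptions for illustration, not part of this module.
import pandas as pd
from opendp.smartnoise.metadata import CollectionMetadata

schema = CollectionMetadata.from_file("PUMS.yaml")
df = pd.read_csv("PUMS.csv")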
def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
    """
    Run the query using the private reader and input query.
    Get the query response back.
    """
    reader = PandasReader(df, metadata)
    actual = 0.0
    # VAR is not supported in PandasReader, so the exact value is only
    # fetched when the aggregation supports it
    if get_exact:
        actual = reader.execute_typed(query).rows()[1:][0][0]
    private_reader = PrivateReader(reader, metadata, self.epsilon)
    query_ast = private_reader.parse_query_string(query)
    srs_orig = private_reader.reader.execute_ast_typed(query_ast)
    noisy_values = []
    low_bounds = []
    high_bounds = []
    for idx in range(self.repeat_count):
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
        res = private_reader._execute_ast(query_ast, True)
        # Disabled because confidence interval not available in report
        #interval = res.report[res.colnames[0]].intervals[confidence]
        #low_bounds.append(interval[0].low)
        #high_bounds.append(interval[0].high)
        noisy_values.append(res.rows()[1:][0][0])
    return np.array(noisy_values), actual, low_bounds, high_bounds
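# A minimal, self-contained sketch of the pattern run_agg_query wraps, runnable
# given the `df`/`schema` fixtures above: compare one exact aggregate against a
# handful of noisy releases. The query, epsilon, and repeat count here are
# illustrative assumptions.
reader = PandasReader(df, schema)
private_reader = PrivateReader(reader, schema, 1.0)
q = "SELECT COUNT(*) AS n FROM PUMS.PUMS"
exact = reader.execute_typed(q).rows()[1:][0][0]
noisy = [private_reader.execute(q)[1:][0][0] for _ in range(5)]
print(exact, noisy)  # noisy counts should scatter around the exact count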
def release(self, dataset: object) -> Report:
    """
    Dataset is a collection of [Dataset Metadata, PandasReader].
    Releases the response to a SQL query based on the number of repetitions
    requested by eval_params if actual is set to False.
    """
    private_reader = PrivateReader(dataset[1], dataset[0], self.privacy_params.epsilon)
    query_ast = private_reader.parse_query_string(self.algorithm)
    srs_orig = private_reader.reader._execute_ast_df(query_ast)
    noisy_values = []
    for idx in range(self.eval_params.repeat_count):
        res = private_reader._execute_ast(query_ast, True)
        noisy_values.append(res[1:][0][0])
    return Report({"__key__": noisy_values})
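# A minimal usage sketch for release, assuming the enclosing class follows the
# evaluator's algorithm interface (privacy_params, eval_params, and the SQL
# string stored in self.algorithm). The class name DPSingletonQuery and the
# `res` attribute on Report are assumptions for illustration.
#
#   algo = DPSingletonQuery()
#   algo.algorithm = "SELECT COUNT(*) FROM PUMS.PUMS"
#   report = algo.release([schema, PandasReader(df, schema)])
#   noisy_counts = report.res["__key__"]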
def test_check_thresholds_gauss(self):
    # check tau for various privacy parameters
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(df, schema)
    qp = QueryParser(schema)
    q = qp.query(query)
    for eps in epsilons:
        for d in max_contribs:
            for delta in deltas:
                # using slightly different formulations of the same formula
                # from different papers; make sure private_reader round-trips
                gaus_scale = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                schema_c = copy.copy(schema)
                schema_c["PUMS.PUMS"].max_ids = d
                private_reader = PrivateReader(reader, schema_c, eps, delta)
                assert(private_reader._options.max_contrib == d)
                r = private_reader._execute_ast(q)
                assert(math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2))
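# A standalone numeric sketch of the threshold formula exercised above,
# assuming the Gaussian-mechanism calibration used in the test:
#   sigma = sqrt(d) * sqrt(2 * ln(1.25 / delta)) / eps
#   tau   = 1 + sigma * sqrt(2 * ln(d / sqrt(2 * pi * delta)))
import math

def gauss_threshold(eps, delta, d):
    """Recompute the tau threshold that test_check_thresholds_gauss verifies."""
    sigma = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
    return 1 + sigma * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))

# For example, eps=0.1, delta=1e-5, d=1 gives a threshold of roughly 152.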
def run_agg_query_df(self, df, metadata, query, confidence, file_name="d1"):
    """
    Run the query using the private reader and input query.
    Get the query response back for multiple dimensions and aggregations.
    """
    # Getting exact result
    reader = PandasReader(df, metadata)
    exact = reader.execute_typed(query).rows()[1:]
    exact_res = []
    for row in exact:
        exact_res.append(row)
    private_reader = PrivateReader(reader, metadata, self.epsilon)
    query_ast = private_reader.parse_query_string(query)

    # Distinguishing dimension and measure columns
    srs_orig = private_reader.reader.execute_ast_typed(query_ast)
    srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
    sample_res = private_reader._execute_ast(query_ast, True)
    headers = sample_res.colnames
    dim_cols = []
    num_cols = []
    for col in headers:
        if sample_res.types[col] == "string":
            dim_cols.append(col)
        else:
            num_cols.append(col)

    # Repeated query and store results along with intervals
    res = []
    for idx in range(self.repeat_count):
        dim_rows = []
        num_rows = []
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
        singleres = private_reader._execute_ast(query_ast, True)
        for col in dim_cols:
            dim_rows.append(singleres[col])
        for col in num_cols:
            values = singleres[col]
            #low = singleres.report[col].intervals[confidence].low
            #high = singleres.report[col].intervals[confidence].high
            #num_rows.append(list(zip(values, low, high)))
            num_rows.append(list(zip(values)))
        res.extend(list(zip(*dim_rows, *num_rows)))

    exact_df = pd.DataFrame(exact_res, columns=headers)
    noisy_df = pd.DataFrame(res, columns=headers)

    # Add a dummy dimension column for cases where no dimensions are
    # available, for merging D1 and D2
    if len(dim_cols) == 0:
        dim_cols.append("__dim__")
    if dim_cols[0] == "__dim__":
        exact_df[dim_cols[0]] = ["key"] * len(exact_df)
        noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

    return noisy_df, exact_df, dim_cols, num_cols
def run_agg_query_df(self, df, metadata, query, confidence, file_name="d1"):
    """
    Run the query using the private reader and input query.
    Get the query response back for multiple dimensions and aggregations.
    """
    # Getting exact result
    reader = PandasReader(df, metadata)
    exact_res = reader.execute(query)[1:]
    private_reader = PrivateReader(reader, metadata, self.epsilon)
    query_ast = private_reader.parse_query_string(query)

    # Distinguishing dimension and measure columns
    sample_res = private_reader._execute_ast(query_ast, True)
    headers = sample_res[0]
    dim_cols = []
    num_cols = []
    out_syms = query_ast.all_symbols()
    out_types = [s[1].type() for s in out_syms]
    out_col_names = [s[0] for s in out_syms]
    for col, ctype in zip(out_col_names, out_types):
        if ctype == "string":
            dim_cols.append(col)
        else:
            num_cols.append(col)

    # Repeated query and store results
    res = []
    for idx in range(self.repeat_count):
        dim_rows = []
        num_rows = []
        singleres = private_reader._execute_ast_df(query_ast, True)
        for col in dim_cols:
            dim_rows.append(singleres[col].tolist())
        for col in num_cols:
            values = singleres[col].tolist()
            num_rows.append(list(zip(values)))
        res.extend(list(zip(*dim_rows, *num_rows)))

    exact_df = pd.DataFrame(exact_res, columns=headers)
    noisy_df = pd.DataFrame(res, columns=headers)

    # Add a dummy dimension column for cases where no dimensions are
    # available, for merging D1 and D2
    if len(dim_cols) == 0:
        dim_cols.append("__dim__")
    if dim_cols[0] == "__dim__":
        exact_df[dim_cols[0]] = ["key"] * len(exact_df)
        noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

    return noisy_df, exact_df, dim_cols, num_cols
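# A minimal sketch of how the frames returned by either run_agg_query_df
# variant might be compared, joining noisy and exact results on the dimension
# columns (including the "__dim__" fallback). The harness name, GROUP BY
# query, and merge suffixes are illustrative assumptions.
#
#   dv = DPVerification()
#   noisy_df, exact_df, dim_cols, num_cols = dv.run_agg_query_df(
#       df, schema,
#       "SELECT married, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married",
#       confidence=0.95)
#   joined = noisy_df.merge(exact_df, on=dim_cols, suffixes=("_noisy", "_exact"))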