def test_check_thresholds_gauss(self):
    # check tau for various privacy parameters
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(df, schema)
    qp = QueryParser(schema)
    q = qp.query(query)
    for eps in epsilons:
        for d in max_contribs:
            for delta in deltas:
                privacy = Privacy(epsilon=eps, delta=delta)
                privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
                # using slightly different formulations of same formula from different papers
                # make sure private_reader round-trips
                gaus_scale = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                schema_c = copy.copy(schema)
                schema_c["PUMS.PUMS"].max_ids = d
                private_reader = PrivateReader(reader, metadata=schema_c, privacy=privacy)
                assert private_reader._options.max_contrib == d
                r = private_reader._execute_ast(q)
                assert math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2)
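# Illustrative sketch, not part of the suite: the Gaussian threshold formula
# the test above round-trips, factored into a hypothetical helper. The name
# expected_gauss_threshold is ours; the arithmetic is taken verbatim from the test.
def expected_gauss_threshold(epsilon, delta, max_contrib):
    import math
    # Gaussian noise scale for the key count, scaled by sqrt(max_contrib)
    scale = math.sqrt(max_contrib) * math.sqrt(2 * math.log(1.25 / delta)) / epsilon
    # tau: dimensions whose noisy key count falls at or below this are censored
    return 1 + scale * math.sqrt(2 * math.log(max_contrib / math.sqrt(2 * math.pi * delta)))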
def setup_class(cls):
    meta = Metadata.from_file(meta_path)
    meta["PUMS.PUMS"].censor_dims = False
    df = pd.read_csv(csv_path)
    reader = PandasReader(df, meta)
    private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=0.1))
    cls.reader = private_reader
def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
    """
    Run the query using the private reader and input query.
    Get the query response back.
    """
    reader = PandasReader(df, metadata)
    actual = 0.0
    # VAR is not supported in PandasReader, so the exact value is not fetched on every aggregation
    if get_exact:
        actual = reader.execute(query)[1:][0][0]
    private_reader = PrivateReader(reader, metadata, privacy=Privacy(epsilon=self.epsilon))
    query_ast = private_reader.parse_query_string(query)
    noisy_values = []
    low_bounds = []
    high_bounds = []
    for idx in range(self.repeat_count):
        res = private_reader._execute_ast(query_ast, True)
        # Disabled because confidence interval not available in report
        # interval = res.report[res.colnames[0]].intervals[confidence]
        # low_bounds.append(interval[0].low)
        # high_bounds.append(interval[0].high)
        noisy_values.append(res[1:][0][0])
    return np.array(noisy_values), actual, low_bounds, high_bounds
def test_odo_het_alternate(self):
    privacy = Privacy(epsilon=0.1, delta=1 / 1000)
    odo = OdometerHeterogeneous(privacy)
    for _ in range(300):
        odo.spend(privacy)
    eps, delt = odo.spent
    assert np.isclose(eps, 8.2519)
    assert np.isclose(delt, 0.2596633)
def test_odo_hom(self):
    privacy = Privacy(epsilon=0.1, delta=1 / 1000)
    odo = Odometer(privacy)
    for _ in range(300):
        odo.spend()
    eps, delt = odo.spent
    assert np.isclose(eps, 8.4917)
    assert np.isclose(delt, 0.19256)
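# Context sketch for the two odometer tests above: under basic (sequential)
# composition, 300 spends of epsilon=0.1 would cost 30.0 epsilon in total.
# The odometer reports ~8.49 instead because it applies a tighter composition
# bound across the 300 releases. The helper name below is ours.
import math

def naive_sequential_composition(epsilon, k):
    # basic composition: epsilons simply add across k releases
    return epsilon * k

assert math.isclose(naive_sequential_composition(0.1, 300), 30.0)  # vs ~8.4917 above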
def test_execute_without_dpsu(self):
    schema_no_dpsu = copy.copy(schema)
    schema_no_dpsu["PUMS.PUMS"].use_dpsu = False
    reader = PandasReader(df, schema_no_dpsu)
    private_reader = PrivateReader(reader, schema_no_dpsu, privacy=Privacy(epsilon=1.0))
    assert not private_reader._options.use_dpsu
    query = QueryParser(schema_no_dpsu).queries("SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
    assert private_reader._get_reader(query) is private_reader.reader
def test_simple_row_privacy(self, test_databases):
    alpha = 0.07
    privacy = Privacy(alphas=[alpha], epsilon=0.5, delta=1 / 1000)
    query = 'SELECT COUNT(*), COUNT(educ), SUM(age) FROM PUMS.PUMS'
    reader = test_databases.get_private_reader(database='PUMS', engine="pandas", privacy=privacy)
    if reader:
        simple_a = reader.get_simple_accuracy(query, alpha=alpha)
        res = reader.execute(query, accuracy=True)
        simple_b = res[1][1][0]
        assert all([a == b for a, b in zip(simple_a, simple_b)])
def test_yes_tau_gauss_row(self, test_databases):
    # should drop approximately half of educ bins
    privacy = Privacy(epsilon=1.0, delta=1 / 1000)
    privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
    readers = test_databases.get_private_readers(database='PUMS', privacy=privacy)
    assert len(readers) > 0
    for reader in readers:
        rs = test_databases.to_tuples(
            reader.execute("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 70 GROUP BY educ")
        )
        assert 2 <= len(rs) <= 8
def test_no_order_limit(self, test_databases):
    """
    Without an ORDER BY, LIMIT returns rows in engine order, so the
    result should not reliably match the sorted top education levels.
    """
    privacy = Privacy(epsilon=3.0, delta=1 / 1000)
    readers = test_databases.get_private_readers(database='PUMS_dup', privacy=privacy)
    for reader in readers:
        res = reader.execute('SELECT educ, COUNT(*) AS n FROM PUMS.PUMS GROUP BY educ LIMIT 4')
        res = test_databases.to_tuples(res)
        educs = [str(r[0]) for r in res]
        top_educs = ['9', '13', '11', '12']
        assert not all([a == b for a, b in zip(educs, top_educs)])
def test_yes_tau_laplace_no_group(self, test_databases):
    # This should always return empty, because it pinpoints a small cohort
    privacy = Privacy(epsilon=1.0, delta=1 / 100_000)
    privacy.mechanisms.map[Stat.threshold] = Mechanism.laplace
    readers = test_databases.get_private_readers(database='PUMS', privacy=privacy)
    assert len(readers) > 0
    for reader in readers:
        rs = test_databases.to_tuples(
            reader.execute("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 70 AND educ < 2")
        )
        assert len(rs) <= 1
def setup_class(cls):
    meta = Metadata.from_file(meta_path)
    meta["PUMS.PUMS"].censor_dims = False
    meta["PUMS.PUMS"]["sex"].type = "int"
    meta["PUMS.PUMS"]["educ"].type = "int"
    meta["PUMS.PUMS"]["married"].type = "bool"
    df = pd.read_csv(csv_path)
    reader = PandasReader(df, meta)
    private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=10e-3))
    cls.reader = private_reader
def test_order_limit(self, test_databases):
    """
    The top 3 education levels are each more than double the 4th, so a
    SELECT TOP will reliably give the top 3.
    """
    privacy = Privacy(epsilon=3.0, delta=1 / 1000)
    readers = test_databases.get_private_readers(database='PUMS_dup', privacy=privacy)
    for reader in readers:
        res = reader.execute('SELECT TOP 5 educ, COUNT(*) AS n FROM PUMS.PUMS GROUP BY educ ORDER BY n DESC')
        res = test_databases.to_tuples(res)
        educs = [str(r[0]) for r in res]
        assert '9' in educs
        assert '13' in educs
        assert '11' in educs
def test_empty_result_count_typed_notau_prepost(self):
    schema_all = copy.deepcopy(schema)
    schema_all['PUMS.PUMS'].censor_dims = False
    reader = PandasReader(df, schema)
    query = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
    private_reader = PrivateReader(reader, schema_all, privacy=Privacy(epsilon=1.0))
    private_reader._execute_ast(query, True)
    for i in range(3):
        print(private_reader._options)
        trs = private_reader._execute_ast(query, True)
        print("empty query")
        print(trs)
        assert len(trs) == 2
def test_simple_pid(self, test_databases):
    max_ids = 2
    alpha = 0.05
    privacy = Privacy(alphas=[alpha], epsilon=1.5, delta=1 / 100_000)
    privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
    query = 'SELECT COUNT(DISTINCT pid), COUNT(*), COUNT(educ), SUM(age) FROM PUMS.PUMS'
    reader = test_databases.get_private_reader(
        database='PUMS_pid',
        engine="pandas",
        privacy=privacy,
        overrides={'max_ids': max_ids}
    )
    if reader:
        simple_a = reader.get_simple_accuracy(query, alpha=alpha)
        res = reader.execute(query, accuracy=True)
        simple_b = res[1][1][0]
        assert all([a == b for a, b in zip(simple_a, simple_b)])
def test_no_tau(self, test_databases):
    # should never drop rows
    privacy = Privacy(epsilon=4.0)
    readers = test_databases.get_private_readers(
        database='PUMS_pid', privacy=privacy, overrides={'censor_dims': False}
    )
    assert len(readers) > 0
    for reader in readers:
        if reader.engine == "spark":
            continue
        for _ in range(10):
            rs = reader.execute_df("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 90 AND educ = '8'")
            assert len(rs['c']) == 1
def test_queries(self, test_databases):
    query = "SELECT age, sex, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, sex HAVING income > 100000"
    privacy = Privacy(10.0, 0.1)
    readers = test_databases.get_private_readers(
        privacy=privacy, database='PUMS', overrides={'censor_dims': False}
    )
    for reader in readers:
        res = [len(test_databases.to_tuples(reader.execute(query))) for _ in range(5)]
        assert np.mean(res) < 115 and np.mean(res) > 10  # actual is 14, but noise is huge
    query = "SELECT age, sex, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, sex HAVING sex = 1"
    res = self.reader.execute(query)
    assert len(res) == 74
    query = "SELECT age, sex, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, sex HAVING income > 100000 OR sex = 1"
    res = self.reader.execute(query)
    assert len(res) > 80 and len(res) < 150
    query = "SELECT age, COUNT(*) FROM PUMS.PUMS GROUP BY age HAVING age < 30 OR age > 60"
    res = self.reader.execute(query)
    assert len(res) == 43
    # # this one is indeterminate behavior based on engine, but works on PrivateReader
    # query = "SELECT age * 1000 as age, COUNT(*) FROM PUMS.PUMS GROUP BY age HAVING age < 30000 OR age > 60000"
    # res = self.reader.execute(query)
    # assert len(res) == 43
    query = "SELECT age as age, COUNT(*) FROM PUMS.PUMS GROUP BY age HAVING age * 1000 < 30000 OR age * 2 > 120"
    res = self.reader.execute(query)
    assert len(res) == 43
    query = "SELECT age, COUNT(*) AS n FROM PUMS.PUMS GROUP BY age HAVING (age < 30 OR age > 60) AND n > 10"
    res = self.reader.execute(query)
    assert len(res) < 25  # len is 16 for non-private
    query = "SELECT age, COUNT(*) * 1000 AS n FROM PUMS.PUMS GROUP BY age HAVING (age < 30 OR age > 60) AND n > 10000"
    res = self.reader.execute(query)
    assert len(res) < 25  # len is 16 for non-private
    query = "SELECT age, COUNT(*) AS n FROM PUMS.PUMS GROUP BY age HAVING (age < 30 OR age > 60) AND n * 100 / 2 > 500"
    res = self.reader.execute(query)
    assert len(res) < 25  # len is 16 for non-private
def test_geom_small_sum(self, test_databases):
    query = 'SELECT SUM(age) FROM PUMS.PUMS'
    sensitivity = 100
    for alpha, epsilon, delta, max_contrib in grid:
        privacy = Privacy(epsilon=epsilon, delta=delta)
        reader = test_databases.get_private_reader(
            database='PUMS_pid',
            engine="pandas",
            privacy=privacy,
            overrides={'max_contrib': max_contrib}
        )
        if reader:
            mech_class = privacy.mechanisms.get_mechanism(sensitivity, 'sum', 'int')
            mech = mech_class(epsilon, delta=delta, sensitivity=sensitivity, max_contrib=max_contrib)
            assert mech.mechanism == Mechanism.geometric
            acc = reader.get_simple_accuracy(query, alpha)
            assert np.isclose(acc[0], mech.accuracy(alpha))
def get_privacy_cost(self, query_string):
    """Estimates the epsilon and delta cost for running the given query.

    Privacy cost is returned without running the query or incrementing the odometer.

    :param query_string: The query string to analyze
    :returns: A tuple of (epsilon, delta) estimating total privacy cost for
        running this query.

    .. code-block:: python

        # metadata specifies censor_dims: False
        privacy = Privacy(epsilon=0.1, delta=1/1000)
        reader = from_df(df, metadata=metadata, privacy=privacy)

        query = 'SELECT AVG(age) FROM PUMS.PUMS GROUP BY educ'
        eps_cost, delta_cost = reader.get_privacy_cost(query)
        # will be ~0.2 epsilon, since AVG computed from SUM and COUNT
        print(f'Total epsilon spent will be {eps_cost}')

        query = 'SELECT SUM(age), COUNT(age), AVG(age) FROM PUMS.PUMS GROUP BY educ'
        eps_cost, delta_cost = reader.get_privacy_cost(query)
        # will be ~0.2 epsilon, since noisy SUM and COUNT are re-used
        print(f'Total epsilon spent will be {eps_cost}')

        query = 'SELECT COUNT(*), AVG(age) FROM PUMS.PUMS GROUP BY educ'
        eps_cost, delta_cost = reader.get_privacy_cost(query)
        # will be ~0.3 epsilon, since COUNT(*) and COUNT(age) can be different
        print(f'Total epsilon spent will be {eps_cost}')
    """
    odo = OdometerHeterogeneous(self.privacy)
    costs = self._get_mechanism_costs(query_string)
    costs = [cost for cost in costs if cost]
    for epsilon, delta in costs:
        odo.spend(Privacy(epsilon=epsilon, delta=delta))
    return odo.spent
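# Usage sketch (assumes a reader built as in the docstring above): because
# get_privacy_cost spends against a throwaway OdometerHeterogeneous, the
# reader's own odometer is unchanged by the estimate.
#
#   before = reader.odometer.spent
#   eps_cost, delta_cost = reader.get_privacy_cost('SELECT COUNT(*) FROM PUMS.PUMS')
#   assert reader.odometer.spent == before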
def test_lap_count(self, test_databases):
    query = 'SELECT COUNT(educ) FROM PUMS.PUMS'
    sensitivity = 1
    for alpha, epsilon, delta, max_contrib in grid:
        if delta == 0.0:
            delta = 1 / 100_000
        privacy = Privacy(epsilon=epsilon, delta=delta)
        privacy.mechanisms.map[Stat.count] = Mechanism.laplace
        reader = test_databases.get_private_reader(
            database='PUMS_pid',
            engine="pandas",
            privacy=privacy,
            overrides={'max_contrib': max_contrib}
        )
        if reader:
            mech_class = privacy.mechanisms.get_mechanism(sensitivity, 'count', 'int')
            mech = mech_class(epsilon, delta=delta, sensitivity=sensitivity, max_contrib=max_contrib)
            assert mech.mechanism == Mechanism.laplace
            acc = reader.get_simple_accuracy(query, alpha)
            assert np.isclose(acc[0], mech.accuracy(alpha))
def test_geom_key_count(self, test_databases):
    # reverts to laplace because we need a threshold
    query = 'SELECT COUNT(DISTINCT pid) FROM PUMS.PUMS'
    sensitivity = 1
    for alpha, epsilon, delta, max_contrib in grid:
        if delta == 0.0:  # not permitted when thresholding
            delta = 1 / 100_000
        privacy = Privacy(epsilon=epsilon, delta=delta)
        reader = test_databases.get_private_reader(
            database='PUMS_pid',
            engine="pandas",
            privacy=privacy,
            overrides={'max_contrib': max_contrib}
        )
        if reader:
            mech_class = privacy.mechanisms.get_mechanism(sensitivity, 'threshold', 'int')
            mech = mech_class(epsilon, delta=delta, sensitivity=sensitivity, max_contrib=max_contrib)
            assert mech.mechanism == Mechanism.laplace
            acc = reader.get_simple_accuracy(query, alpha)
            assert np.isclose(acc[0], mech.accuracy(alpha))
def test_geom_large_sum(self, test_databases):
    # reverts to laplace because it's large
    query = 'SELECT SUM(income) FROM PUMS.PUMS'
    sensitivity = 500_000
    for alpha, epsilon, delta, max_contrib in grid:
        if delta == 0.0:
            delta = 1 / 100_000
        privacy = Privacy(epsilon=epsilon, delta=delta)
        reader = test_databases.get_private_reader(
            database='PUMS_pid',
            engine="pandas",
            privacy=privacy,
            overrides={'max_contrib': max_contrib}
        )
        if reader:
            mech_class = privacy.mechanisms.get_mechanism(sensitivity, 'sum', 'int')
            mech = mech_class(epsilon, delta=delta, sensitivity=sensitivity, max_contrib=max_contrib)
            assert mech.mechanism == Mechanism.laplace
            acc = reader.get_simple_accuracy(query, alpha)
            assert np.isclose(acc[0], mech.accuracy(alpha))
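# Recap sketch of the mechanism-selection behavior the tests above exercise:
# small-sensitivity integer sums get geometric noise, very large sensitivity
# falls back to laplace, and key/threshold counts use laplace. Import paths
# and the specific (epsilon, delta) pair are assumptions, mirroring the tests.
from snsql.sql.privacy import Privacy, Mechanism

privacy = Privacy(epsilon=1.0, delta=1 / 100_000)
# small-sensitivity integer sum -> geometric
small_sum = privacy.mechanisms.get_mechanism(100, 'sum', 'int')(
    1.0, delta=1 / 100_000, sensitivity=100, max_contrib=1)
assert small_sum.mechanism == Mechanism.geometric
# very large sensitivity -> laplace
large_sum = privacy.mechanisms.get_mechanism(500_000, 'sum', 'int')(
    1.0, delta=1 / 100_000, sensitivity=500_000, max_contrib=1)
assert large_sum.mechanism == Mechanism.laplace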
def test_queries(self, test_databases):
    query = 'SELECT TOP 20 age, married, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, married ORDER BY married, age DESC'
    privacy = Privacy(10.0, 0.1)
    tdb = test_databases
    readers = tdb.get_private_readers(privacy=privacy, database='PUMS_pid', overrides={'censor_dims': False})
    for reader in readers:
        if reader.engine == "spark":
            continue
        res = test_databases.to_tuples(reader.execute(query))
        assert len(res) == 21
    reader = self.reader
    res = reader.execute(query)
    assert len(res) == 21
    query = 'SELECT age, married, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, married ORDER BY married, age DESC LIMIT 10'
    res = reader.execute(query)
    assert len(res) == 11
    # run the same query with the exact reader. Since ORDER BY is
    # on a non-private dimension, order will be the same
    res_e = reader.reader.execute(query)
    assert len(res_e) == 11
    ages = [r[0] for r in res[1:]]
    ages_e = [r[0] for r in res_e[1:]]
    assert all([age == age_e for (age, age_e) in zip(ages, ages_e)])
    query = 'SELECT age, married, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, married ORDER BY income DESC LIMIT 50'
    res = reader.execute(query)
    assert len(res) == 51
    # run the same query with the exact reader. Since ORDER BY is
    # on a noisy measure, order will be different
    res_e = reader.reader.execute(query)
    assert len(res_e) == 51
    ages = [r[0] for r in res[1:]]
    ages_e = [r[0] for r in res_e[1:]]
    assert not all([age == age_e for (age, age_e) in zip(ages, ages_e)])
def test_count_accuracy_small_delta(self):
    acc = Accuracy(root, subquery, privacy=Privacy(epsilon=1.0, delta=0.1))
    # a smaller alpha means higher confidence, which widens the error bound
    error = acc.count(alpha=0.01)
    error_narrow = acc.count(alpha=0.05)
    assert error_narrow < error
import os
import math
import subprocess

import pandas as pd

from snsql.metadata import Metadata
from snsql.sql import PrivateReader
from snsql.sql.privacy import Privacy
from snsql.sql.parse import QueryParser
from snsql.sql._mechanisms.accuracy import Accuracy  # import path assumed from the snsql package layout

git_root_dir = subprocess.check_output("git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))

meta = Metadata.from_file(meta_path)
pums = pd.read_csv(csv_path)
query = 'SELECT AVG(age), STD(age), VAR(age), SUM(age), COUNT(age) FROM PUMS.PUMS GROUP BY sex'
q = QueryParser(meta).query(query)
privacy = Privacy(alphas=[0.01, 0.05], delta=1 / (math.sqrt(100) * 100))
priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta)
subquery, root = priv._rewrite(query)

acc = Accuracy(root, subquery, privacy)

class TestAccuracy:
    def test_count_accuracy(self):
        error = acc.count(alpha=0.05)
        assert error < 7.53978 and error > 0.5
        error_wide = acc.count(alpha=0.01)
        assert error_wide < 9.909
        assert error_wide > error
import os
import subprocess

import pandas as pd
import numpy as np

from snsql.metadata import Metadata
from snsql.sql.odometer import Odometer, OdometerHeterogeneous
from snsql.sql.privacy import Privacy
from snsql.sql.private_reader import PrivateReader

git_root_dir = subprocess.check_output("git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))

pums = pd.read_csv(csv_path)
privacy = Privacy(epsilon=1.0)
meta_obj = Metadata.from_file(meta_path)

class TestOdometer:
    def test_count_pid_query(self):
        priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta_path)
        assert priv.odometer.spent == (0.0, 0.0)
        assert priv.odometer.k == 0
        res = priv.execute("SELECT COUNT(DISTINCT pid) FROM PUMS.PUMS GROUP BY educ")
        assert priv.odometer.k == 1
def test_group_by_noisy_typed_order_desc(self):
    reader = PandasReader(df, schema)
    private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=4.0))
    rs = private_reader.execute_df("SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC")
    assert rs['c'][0] > rs['c'][1]
def test_sum_noisy_postprocess(self):
    reader = PandasReader(df, schema)
    private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0))
    trs = private_reader.execute_df("SELECT POWER(SUM(age), 2) as age_total FROM PUMS.PUMS")
    assert trs['age_total'][0] > 1000 ** 2
from snsql.sql.privacy import Privacy

privacy = Privacy(epsilon=30.0)

class TestGroupingClamp:
    def test_clamp_on(self, test_databases):
        readers = test_databases.get_private_readers(database='PUMS_pid', privacy=privacy)
        assert len(readers) > 0
        for reader in readers:
            meta = reader.metadata
            meta["PUMS.PUMS"]["income"].upper = 100
            query = "SELECT AVG(income) AS income FROM PUMS.PUMS"
            res = test_databases.to_tuples(reader.execute(query))
            assert res[1][0] < 150.0

    def test_clamp_off(self, test_databases):
        readers = test_databases.get_private_readers(database='PUMS_pid', privacy=privacy)
        assert len(readers) > 0
        for reader in readers:
            meta = reader.metadata
            meta["PUMS.PUMS"]["income"].upper = 100
            query = "SELECT income, COUNT(pid) AS n FROM PUMS.PUMS GROUP BY income"
            res = test_databases.to_tuples(reader.execute(query))
            assert len(res) > 40
def _execute_ast(self, query, *ignore, accuracy: bool = False, pre_aggregated=None, postprocess=True):
    if isinstance(query, str):
        raise ValueError("Please pass AST to _execute_ast.")

    subquery, query = self._rewrite_ast(query)

    if pre_aggregated is not None:
        exact_aggregates = self._check_pre_aggregated_columns(pre_aggregated, subquery)
    else:
        exact_aggregates = self._get_reader(subquery)._execute_ast(subquery)

    _accuracy = None
    if accuracy:
        _accuracy = Accuracy(query, subquery, self.privacy)

    syms = subquery._select_symbols
    source_col_names = [s.name for s in syms]

    # tell which are counts, in column order
    is_count = [s.expression.is_count for s in syms]

    # get a list of mechanisms in column order
    mechs = self._get_mechanisms(subquery)
    check_sens = [m for m in mechs if m]
    if any([m.sensitivity == np.inf for m in check_sens]):
        raise ValueError("Attempting to query an unbounded column")

    kc_pos = self._get_keycount_position(subquery)

    def randomize_row_values(row_in):
        row = [v for v in row_in]
        # set null to 0 before adding noise
        for idx in range(len(row)):
            if mechs[idx] and row[idx] is None:
                row[idx] = 0.0
        # call all mechanisms to add noise
        return [
            mech.release([v])[0] if mech is not None else v
            for mech, v in zip(mechs, row)
        ]

    if hasattr(exact_aggregates, "rdd"):  # it's a dataframe
        out = exact_aggregates.rdd.map(randomize_row_values)
    elif hasattr(exact_aggregates, "map"):  # it's an RDD
        out = exact_aggregates.map(randomize_row_values)
    elif isinstance(exact_aggregates, list):
        out = map(randomize_row_values, exact_aggregates[1:])
    elif isinstance(exact_aggregates, np.ndarray):
        out = map(randomize_row_values, exact_aggregates)
    else:
        raise ValueError("Unexpected type for exact_aggregates")

    # censor infrequent dimensions
    if self._options.censor_dims:
        if kc_pos is None:
            raise ValueError("Query needs a key count column to censor dimensions")
        else:
            thresh_mech = mechs[kc_pos]
            self.tau = thresh_mech.threshold
        if hasattr(out, "filter"):  # it's an RDD
            tau = self.tau
            out = out.filter(lambda row: row[kc_pos] > tau)
        else:
            out = filter(lambda row: row[kc_pos] > self.tau, out)

    if not postprocess:
        return out

    def process_clamp_counts(row_in):
        # clamp counts to be non-negative
        row = [v for v in row_in]
        for idx in range(len(row)):
            if is_count[idx] and row[idx] < 0:
                row[idx] = 0
        return row

    clamp_counts = self._options.clamp_counts
    if clamp_counts:
        if hasattr(out, "rdd"):  # it's a dataframe
            out = out.rdd.map(process_clamp_counts)
        elif hasattr(out, "map"):  # it's an RDD
            out = out.map(process_clamp_counts)
        else:
            out = map(process_clamp_counts, out)

    # get column information for outer query
    out_syms = query._select_symbols
    out_types = [s.expression.type() for s in out_syms]
    out_col_names = [s.name for s in out_syms]

    def convert(val, type):
        if val is None:
            return None  # all columns are nullable
        if type == "string" or type == "unknown":
            return str(val)
        elif type == "int":
            return int(float(str(val).replace('"', "").replace("'", "")))
        elif type == "float":
            return float(str(val).replace('"', "").replace("'", ""))
        elif type == "boolean":
            if isinstance(val, int):
                return val != 0
            else:
                return bool(str(val).replace('"', "").replace("'", ""))
        elif type == "datetime":
            v = parse_datetime(val)
            if v is None:
                raise ValueError(f"Could not parse datetime: {val}")
            return v
        else:
            raise ValueError("Can't convert type " + type)

    alphas = [alpha for alpha in self.privacy.alphas]

    def process_out_row(row):
        bindings = dict((name.lower(), val) for name, val in zip(source_col_names, row))
        out_row = [c.expression.evaluate(bindings) for c in query.select.namedExpressions]
        try:
            out_row = [convert(val, type) for val, type in zip(out_row, out_types)]
        except Exception as e:
            raise ValueError(
                f"Error converting output row: {e}\n"
                f"Expecting types {out_types}"
            )
        # compute accuracies
        if accuracy == True and alphas:
            accuracies = [_accuracy.accuracy(row=list(row), alpha=alpha) for alpha in alphas]
            return tuple([out_row, accuracies])
        else:
            return tuple([out_row, []])

    if hasattr(out, "map"):  # it's an RDD
        out = out.map(process_out_row)
    else:
        out = map(process_out_row, out)

    def filter_aggregate(row, condition):
        bindings = dict((name.lower(), val) for name, val in zip(out_col_names, row[0]))
        keep = condition.evaluate(bindings)
        return keep

    if query.having is not None:
        condition = query.having.condition
        if hasattr(out, "filter"):  # it's an RDD
            out = out.filter(lambda row: filter_aggregate(row, condition))
        else:
            out = filter(lambda row: filter_aggregate(row, condition), out)

    # sort it if necessary
    if query.order is not None:
        sort_fields = []
        for si in query.order.sortItems:
            if type(si.expression) is not ast.Column:
                raise ValueError("We only know how to sort by column names right now")
            colname = si.expression.name.lower()
            if colname not in out_col_names:
                raise ValueError(
                    "Can't sort by {0}, because it's not in output columns: {1}".format(
                        colname, out_col_names
                    )
                )
            colidx = out_col_names.index(colname)
            desc = False
            if si.order is not None and si.order.lower() == "desc":
                desc = True
            if desc and not (out_types[colidx] in ["int", "float", "boolean", "datetime"]):
                raise ValueError("We don't know how to sort descending by " + out_types[colidx])
            sf = (desc, colidx)
            sort_fields.append(sf)

        def sort_func(row):
            # use index 0, since index 1 is accuracy
            return SortKey(row[0], sort_fields)

        if hasattr(out, "sortBy"):
            out = out.sortBy(sort_func)
        else:
            out = sorted(out, key=sort_func)

    # check for LIMIT or TOP
    limit_rows = None
    if query.limit is not None:
        if query.select.quantifier is not None:
            raise ValueError("Query cannot have both LIMIT and TOP set")
        limit_rows = query.limit.n
    elif query.select.quantifier is not None and isinstance(query.select.quantifier, Top):
        limit_rows = query.select.quantifier.n
    if limit_rows is not None:
        if hasattr(out, "rdd"):  # it's a dataframe
            out = out.limit(limit_rows)
        elif hasattr(out, "map"):  # it's an RDD
            out = out.take(limit_rows)
        else:
            out = itertools.islice(out, limit_rows)

    # drop empty accuracy if no accuracy requested
    def drop_accuracy(row):
        return row[0]

    if accuracy == False:
        if hasattr(out, "rdd"):  # it's a dataframe
            out = out.rdd.map(drop_accuracy)
        elif hasattr(out, "map"):  # it's an RDD
            out = out.map(drop_accuracy)
        else:
            out = map(drop_accuracy, out)

    # increment odometer
    for mech in mechs:
        if mech:
            self.odometer.spend(Privacy(epsilon=mech.epsilon, delta=mech.delta))

    # output it
    if accuracy == False and hasattr(out, "toDF"):  # Pipeline RDD
        if not out.isEmpty():
            return out.toDF(out_col_names)
        else:
            return out
    elif hasattr(out, "map"):  # Bare RDD
        return out
    else:
        row0 = [out_col_names]
        if accuracy == True:
            row0 = [[
                out_col_names,
                [
                    [col_name + '_' + str(1 - alpha).replace('0.', '') for col_name in out_col_names]
                    for alpha in self.privacy.alphas
                ]
            ]]
        out_rows = row0 + list(out)
        return out_rows
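# Call-path sketch for the method above, mirroring how the tests in this
# collection drive it: parse to an AST first, since _execute_ast rejects raw
# strings; row 0 of the returned list is the header.
#
#   q = QueryParser(schema).query("SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")
#   rows = private_reader._execute_ast(q)            # [[col names], row, row, ...]
#   rows_acc = private_reader._execute_ast(q, True)  # each row paired with accuracies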
def run_agg_query_df(self, df, metadata, query, confidence, file_name="d1"):
    """
    Run the query using the private reader and input query.
    Get the query response back for multiple dimensions and aggregations.
    """
    # Getting exact result
    reader = PandasReader(df, metadata)
    exact_res = reader.execute(query)[1:]

    private_reader = PrivateReader(reader, metadata, privacy=Privacy(epsilon=self.epsilon))
    query_ast = private_reader.parse_query_string(query)

    # Distinguishing dimension and measure columns
    sample_res = private_reader._execute_ast(query_ast, True)
    headers = sample_res[0]

    dim_cols = []
    num_cols = []

    out_syms = query_ast.all_symbols()
    out_types = [s[1].type() for s in out_syms]
    out_col_names = [s[0] for s in out_syms]

    for col, ctype in zip(out_col_names, out_types):
        if ctype == "string":
            dim_cols.append(col)
        else:
            num_cols.append(col)

    # Repeated query and store results
    res = []
    for idx in range(self.repeat_count):
        dim_rows = []
        num_rows = []
        singleres = private_reader._execute_ast_df(query_ast, cache_exact=True)
        # values = singleres[col]
        for col in dim_cols:
            dim_rows.append(singleres[col].tolist())
        for col in num_cols:
            values = singleres[col].tolist()
            num_rows.append(list(zip(values)))
        res.extend(list(zip(*dim_rows, *num_rows)))

    exact_df = pd.DataFrame(exact_res, columns=headers)
    noisy_df = pd.DataFrame(res, columns=headers)

    # Add a dummy dimension column for cases where no dimensions are
    # available, for merging D1 and D2
    if len(dim_cols) == 0:
        dim_cols.append("__dim__")

    if dim_cols[0] == "__dim__":
        exact_df[dim_cols[0]] = ["key"] * len(exact_df)
        noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

    return noisy_df, exact_df, dim_cols, num_cols