Example #1
 def test_check_thresholds_gauss(self):
     # check tau for various privacy parameters
     epsilons = [0.1, 2.0]
     max_contribs = [1, 3]
     deltas = [10E-5, 10E-15]
     query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
     reader = PandasReader(df, schema)
     qp = QueryParser(schema)
     q = qp.query(query)
     for eps in epsilons:
         for d in max_contribs:
             for delta in deltas:
                 privacy = Privacy(epsilon=eps, delta=delta)
                 privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
                  # these use slightly different formulations of the same formula from different papers;
                  # make sure the private reader round-trips
                 gaus_scale = math.sqrt(d) * math.sqrt(
                     2 * math.log(1.25 / delta)) / eps
                 gaus_rho = 1 + gaus_scale * math.sqrt(
                     2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                 schema_c = copy.copy(schema)
                 schema_c["PUMS.PUMS"].max_ids = d
                 private_reader = PrivateReader(reader,
                                                metadata=schema_c,
                                                privacy=privacy)
                 assert (private_reader._options.max_contrib == d)
                 r = private_reader._execute_ast(q)
                 assert (math.isclose(private_reader.tau,
                                      gaus_rho,
                                      rel_tol=0.03,
                                      abs_tol=2))
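A minimal standalone sketch (parameter values assumed, not taken from the test grid) showing how the expected Gaussian threshold gaus_rho is computed for a single grid point:

 import math

 eps, delta, d = 0.1, 10E-5, 3  # epsilon, delta, and max contributions
 gaus_scale = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
 gaus_rho = 1 + gaus_scale * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))
 print(gaus_scale, gaus_rho)  # the test asserts PrivateReader.tau is close to gaus_rho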
Example #2
 def setup_class(cls):
     meta = Metadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=0.1))
     cls.reader = private_reader
Example #3
    def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
        """
        Run the input query using the private reader
        and return the query response
        """
        reader = PandasReader(df, metadata)
        actual = 0.0
        # VAR is not supported by PandasReader, so the exact value is not fetched on every aggregation
        if get_exact:
            actual = reader.execute(query)[1:][0][0]
        private_reader = PrivateReader(reader,
                                       metadata,
                                       privacy=Privacy(epsilon=self.epsilon))
        query_ast = private_reader.parse_query_string(query)

        noisy_values = []
        low_bounds = []
        high_bounds = []
        for idx in range(self.repeat_count):
            res = private_reader._execute_ast(query_ast, True)
            # Disabled because confidence interval not available in report
            # interval = res.report[res.colnames[0]].intervals[confidence]
            # low_bounds.append(interval[0].low)
            # high_bounds.append(interval[0].high)
            noisy_values.append(res[1:][0][0])
        return np.array(noisy_values), actual, low_bounds, high_bounds
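Since the report no longer exposes confidence intervals (see the disabled lines above), a caller could approximate an interval empirically from the repeated noisy values. A minimal sketch with a hypothetical helper, not part of this harness:

 import numpy as np

 def empirical_interval(noisy_values, confidence=0.95):
     # percentile-based interval over repeated noisy releases (hypothetical helper)
     lo = np.percentile(noisy_values, 100 * (1 - confidence) / 2)
     hi = np.percentile(noisy_values, 100 * (1 + confidence) / 2)
     return lo, hi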
Example #4
 def test_odo_het_alternate(self):
      privacy = Privacy(epsilon=0.1, delta=1 / 1000)
     odo = OdometerHeterogeneous(privacy)
     for _ in range(300):
         odo.spend(privacy)
     eps, delt = odo.spent
     assert (np.isclose(eps, 8.2519))
     assert (np.isclose(delt, 0.2596633))
Example #5
 def test_odo_hom(self):
      privacy = Privacy(epsilon=0.1, delta=1 / 1000)
     odo = Odometer(privacy)
     for _ in range(300):
         odo.spend()
     eps, delt = odo.spent
     assert (np.isclose(eps, 8.4917))
     assert (np.isclose(delt, 0.19256))
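The expected totals in these odometer tests are far below naive linear composition, where 300 spends of epsilon=0.1 would sum to 30.0. A sketch making the comparison explicit, reusing only the API shown above:

 from snsql.sql.odometer import Odometer
 from snsql.sql.privacy import Privacy

 privacy = Privacy(epsilon=0.1, delta=1 / 1000)
 odo = Odometer(privacy)
 for _ in range(300):
     odo.spend()
 eps, delt = odo.spent
 print(eps, 300 * 0.1)  # composed epsilon (~8.49 per the test above) vs. the naive sum (30.0)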
Example #6
 def test_execute_without_dpsu(self):
     schema_no_dpsu = copy.copy(schema)
     schema_no_dpsu["PUMS.PUMS"].use_dpsu = False
     reader = PandasReader(df, schema_no_dpsu)
     private_reader = PrivateReader(reader,
                                    schema_no_dpsu,
                                    privacy=Privacy(epsilon=1.0))
     assert (private_reader._options.use_dpsu == False)
     query = QueryParser(schema_no_dpsu).queries(
         "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
     assert (private_reader._get_reader(query) is private_reader.reader)
Example #7
 def test_simple_row_privacy(self, test_databases):
     alpha = 0.07
     privacy = Privacy(alphas=[alpha], epsilon=0.5, delta=1 / 1000)
     query = 'SELECT COUNT(*), COUNT(educ), SUM(age) FROM PUMS.PUMS'
     reader = test_databases.get_private_reader(database='PUMS',
                                                engine="pandas",
                                                privacy=privacy)
     if reader:
         simple_a = reader.get_simple_accuracy(query, alpha=alpha)
         res = reader.execute(query, accuracy=True)
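          # with accuracy=True, each data row is ([values], [accuracy tuples per alpha]),
          # so res[1][1][0] selects the accuracies for the first (here, only) alpha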
         simple_b = res[1][1][0]
         assert (all([a == b for a, b in zip(simple_a, simple_b)]))
Example #8
 def test_yes_tau_gauss_row(self, test_databases):
     # should drop approximately half of educ bins
     privacy = Privacy(epsilon=1.0, delta=1 / 1000)
     privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
     readers = test_databases.get_private_readers(database='PUMS',
                                                  privacy=privacy)
     assert (len(readers) > 0)
     for reader in readers:
         rs = test_databases.to_tuples(
             reader.execute(
                 "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 70 GROUP BY educ"
             ))
         assert (len(rs) >= 2 and len(rs) <= 8)
Example #9
 def test_no_order_limit(self, test_databases):
     """
     The top 3 education levels are each more than double the 4th,
      but without ORDER BY, a LIMIT will not reliably return them in rank order
     """
     privacy = Privacy(epsilon=3.0, delta=1/1000)
     readers = test_databases.get_private_readers(database='PUMS_dup', privacy=privacy)
     for reader in readers:
         res = reader.execute('SELECT educ, COUNT(*) AS n FROM PUMS.PUMS GROUP BY educ LIMIT 4')
         res = test_databases.to_tuples(res)
         educs = [str(r[0]) for r in res]
         top_educs = ['9', '13', '11', '12']
         assert(not all([a == b for a, b in zip(educs, top_educs)]))
Example #10
 def test_yes_tau_laplace_no_group(self, test_databases):
     # This should always return empty, because it pinpoints a small cohort
     privacy = Privacy(epsilon=1.0, delta=1 / 100_000)
     privacy.mechanisms.map[Stat.threshold] = Mechanism.laplace
     readers = test_databases.get_private_readers(database='PUMS',
                                                  privacy=privacy)
     assert (len(readers) > 0)
     for reader in readers:
         rs = test_databases.to_tuples(
             reader.execute(
                 "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 70 AND educ < 2"
             ))
         assert (len(rs) <= 1)
Example #11
 def setup_class(self):
     meta = Metadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     meta["PUMS.PUMS"]["sex"].type = "int"
     meta["PUMS.PUMS"]["educ"].type = "int"
     meta["PUMS.PUMS"]["married"].type = "bool"
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=10.0,
                                                    delta=10e-3))
     self.reader = private_reader
Example #12
 def test_order_limit(self, test_databases):
     """
     The top 3 education levels are each more than double the 4th,
     so a SELECT TOP will reliably give the top 3
     """
     privacy = Privacy(epsilon=3.0, delta=1/1000)
     readers = test_databases.get_private_readers(database='PUMS_dup', privacy=privacy)
     for reader in readers:
         res = reader.execute('SELECT TOP 5 educ, COUNT(*) AS n FROM PUMS.PUMS GROUP BY educ ORDER BY n DESC')
         res = test_databases.to_tuples(res)
         educs = [str(r[0]) for r in res]
         assert('9' in educs)
         assert('13' in educs)
         assert('11' in educs)
Example #13
 def test_empty_result_count_typed_notau_prepost(self):
     schema_all = copy.deepcopy(schema)
     schema_all['PUMS.PUMS'].censor_dims = False
     reader = PandasReader(df, schema)
     query = QueryParser(schema).queries(
         "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
     private_reader = PrivateReader(reader,
                                    schema_all,
                                    privacy=Privacy(epsilon=1.0))
     private_reader._execute_ast(query, True)
     for i in range(3):
         print(private_reader._options)
         trs = private_reader._execute_ast(query, True)
         print("empty query")
         print(trs)
         assert (len(trs) == 2)
Example #14
 def test_simple_pid(self, test_databases):
     max_ids = 2
     alpha = 0.05
     privacy = Privacy(alphas=[alpha], epsilon=1.5, delta=1 / 100_000)
     privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
     query = 'SELECT COUNT(DISTINCT pid), COUNT(*), COUNT(educ), SUM(age) FROM PUMS.PUMS'
     reader = test_databases.get_private_reader(
         database='PUMS_pid',
         engine="pandas",
         privacy=privacy,
         overrides={'max_ids': max_ids})
     if reader:
         simple_a = reader.get_simple_accuracy(query, alpha=alpha)
         res = reader.execute(query, accuracy=True)
         simple_b = res[1][1][0]
         assert (all([a == b for a, b in zip(simple_a, simple_b)]))
Example #15
 def test_no_tau(self, test_databases):
     # should never drop rows
     privacy = Privacy(epsilon=4.0)
     readers = test_databases.get_private_readers(
         database='PUMS_pid',
         privacy=privacy,
         overrides={'censor_dims': False})
     assert (len(readers) > 0)
     for reader in readers:
         if reader.engine == "spark":
             continue
         for _ in range(10):
             rs = reader.execute_df(
                 "SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 90 AND educ = '8'"
             )
             assert (len(rs['c']) == 1)
Example #16
    def test_queries(self, test_databases):
        query = "SELECT age, sex, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, sex HAVING income > 100000"
        privacy = Privacy(10.0, 0.1)
        readers = test_databases.get_private_readers(
            privacy=privacy, database='PUMS', overrides={'censor_dims': False})
        for reader in readers:
            res = [
                len(test_databases.to_tuples(reader.execute(query)))
                for i in range(5)
            ]
            mean_res = np.mean(res)
            assert 10 < mean_res < 115  # actual is 14, but noise is huge

        query = "SELECT age, sex, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, sex HAVING sex = 1"
        res = self.reader.execute(query)
        assert len(res) == 74

        query = "SELECT age, sex, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, sex HAVING income > 100000 OR sex = 1"
        res = self.reader.execute(query)
        assert len(res) > 80 and len(res) < 150

        query = "SELECT age, COUNT(*) FROM PUMS.PUMS GROUP BY age HAVING age < 30 OR age > 60"
        res = self.reader.execute(query)
        assert len(res) == 43

        # # this one is indeterminate behavior based on engine, but works on PrivateReader
        # query = "SELECT age * 1000 as age, COUNT(*) FROM PUMS.PUMS GROUP BY age HAVING age < 30000 OR age > 60000"
        # res = self.reader.execute(query)
        # assert len(res) == 43

        query = "SELECT age as age, COUNT(*) FROM PUMS.PUMS GROUP BY age HAVING age * 1000 < 30000 OR age * 2 > 120"
        res = self.reader.execute(query)
        assert len(res) == 43

        query = "SELECT age, COUNT(*) AS n FROM PUMS.PUMS GROUP BY age HAVING (age < 30 OR age > 60) AND n > 10"
        res = self.reader.execute(query)
        assert len(res) < 25  # [len is 16 for non-private]

        query = "SELECT age, COUNT(*) * 1000 AS n FROM PUMS.PUMS GROUP BY age HAVING (age < 30 OR age > 60) AND n > 10000"
        res = self.reader.execute(query)
        assert len(res) < 25  #[len is 16 for non-private]

        query = "SELECT age, COUNT(*) AS n FROM PUMS.PUMS GROUP BY age HAVING (age < 30 OR age > 60) AND n * 100 / 2 > 500"
        res = self.reader.execute(query)
        assert len(res) < 25  #[len is 16 for non-private]
Example #17
 def test_geom_small_sum(self, test_databases):
     query = 'SELECT SUM(age) FROM PUMS.PUMS'
     sensitivity = 100
     for alpha, epsilon, delta, max_contrib in grid:
         privacy = Privacy(epsilon=epsilon, delta=delta)
         reader = test_databases.get_private_reader(
             database='PUMS_pid',
             engine="pandas",
             privacy=privacy,
             overrides={'max_contrib': max_contrib})
         if reader:
             mech_class = privacy.mechanisms.get_mechanism(
                 sensitivity, 'sum', 'int')
             mech = mech_class(epsilon,
                               delta=delta,
                               sensitivity=sensitivity,
                               max_contrib=max_contrib)
             assert (mech.mechanism == Mechanism.geometric)
             acc = reader.get_simple_accuracy(query, alpha)
             assert (np.isclose(acc[0], mech.accuracy(alpha)))
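A standalone sketch of the mechanism lookup exercised here, with single assumed values in place of the shared grid fixture (Mechanism is assumed importable from snsql.sql.privacy alongside Privacy):

 from snsql.sql.privacy import Privacy, Mechanism

 alpha, epsilon, delta, max_contrib = 0.05, 1.0, 1 / 1000, 1  # assumed values
 privacy = Privacy(epsilon=epsilon, delta=delta)
 mech_class = privacy.mechanisms.get_mechanism(100, 'sum', 'int')  # small integer sum
 mech = mech_class(epsilon, delta=delta, sensitivity=100, max_contrib=max_contrib)
 assert mech.mechanism == Mechanism.geometric  # small sensitivity selects the geometric mechanism
 print(mech.accuracy(alpha))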
Example #18
    def get_privacy_cost(self, query_string):
        """Estimates the epsilon and delta cost for running the given query.
        Privacy cost is returned without running the query or incrementing the odometer.

        :param query_string: The query string to analyze
        :returns: A tuple of (epsilon, delta) estimating total privacy cost for
            running this query.

        .. code-block:: python

            # metadata specifies censor_dims: False
            privacy = Privacy(epsilon=0.1, delta=1/1000)
            reader = from_df(df, metadata=metadata, privacy=privacy)

            query = 'SELECT AVG(age) FROM PUMS.PUMS GROUP BY educ'
            eps_cost, delta_cost = reader.get_privacy_cost(query)

            # will be ~0.2 epsilon, since AVG is computed from SUM and COUNT
            print(f'Total epsilon spent will be {eps_cost}')

            query = 'SELECT SUM(age), COUNT(age), AVG(age) FROM PUMS.PUMS GROUP BY educ'
            eps_cost, delta_cost = reader.get_privacy_cost(query)

            # will be ~0.2 epsilon, since noisy SUM and COUNT are re-used
            print(f'Total epsilon spent will be {eps_cost}')

            query = 'SELECT COUNT(*), AVG(age) FROM PUMS.PUMS GROUP BY educ'
            eps_cost, delta_cost = reader.get_privacy_cost(query)

            # will be ~0.3 epsilon, since COUNT(*) and COUNT(age) can be different
            print(f'Total epsilon spent will be {eps_cost}')

        """
        odo = OdometerHeterogeneous(self.privacy)
        costs = self._get_mechanism_costs(query_string)
        costs = [cost for cost in costs if cost]
        for epsilon, delta in costs:
            odo.spend(Privacy(epsilon=epsilon, delta=delta))
        return odo.spent
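Note that each mechanism's cost is spent against an OdometerHeterogeneous rather than summed directly, so the estimate reflects the same composition accounting the odometer tests above exercise, and repeated spends grow sublinearly in epsilon.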
Example #19
 def test_lap_count(self, test_databases):
     query = 'SELECT COUNT(educ) FROM PUMS.PUMS'
     sensitivity = 1
     for alpha, epsilon, delta, max_contrib in grid:
         if delta == 0.0:
             delta = 1 / 100_000
         privacy = Privacy(epsilon=epsilon, delta=delta)
         privacy.mechanisms.map[Stat.count] = Mechanism.laplace
         reader = test_databases.get_private_reader(
             database='PUMS_pid',
             engine="pandas",
             privacy=privacy,
             overrides={'max_contrib': max_contrib})
         if reader:
             mech_class = privacy.mechanisms.get_mechanism(
                 sensitivity, 'count', 'int')
             mech = mech_class(epsilon,
                               delta=delta,
                               sensitivity=sensitivity,
                               max_contrib=max_contrib)
             assert (mech.mechanism == Mechanism.laplace)
             acc = reader.get_simple_accuracy(query, alpha)
             assert (np.isclose(acc[0], mech.accuracy(alpha)))
Example #20
 def test_geom_key_count(self, test_databases):
     # reverts to laplace because we need a threshold
     query = 'SELECT COUNT(DISTINCT pid) FROM PUMS.PUMS'
     sensitivity = 1
     for alpha, epsilon, delta, max_contrib in grid:
         if delta == 0.0:  # not permitted when thresholding
             delta = 1 / 100_000
         privacy = Privacy(epsilon=epsilon, delta=delta)
         reader = test_databases.get_private_reader(
             database='PUMS_pid',
             engine="pandas",
             privacy=privacy,
             overrides={'max_contrib': max_contrib})
         if reader:
             mech_class = privacy.mechanisms.get_mechanism(
                 sensitivity, 'threshold', 'int')
             mech = mech_class(epsilon,
                               delta=delta,
                               sensitivity=sensitivity,
                               max_contrib=max_contrib)
             assert (mech.mechanism == Mechanism.laplace)
             acc = reader.get_simple_accuracy(query, alpha)
             assert (np.isclose(acc[0], mech.accuracy(alpha)))
Example #21
 def test_geom_large_sum(self, test_databases):
      # reverts to laplace because the sensitivity is large
     query = 'SELECT SUM(income) FROM PUMS.PUMS'
     sensitivity = 500_000
     for alpha, epsilon, delta, max_contrib in grid:
         if delta == 0.0:
             delta = 1 / 100_000
         privacy = Privacy(epsilon=epsilon, delta=delta)
         reader = test_databases.get_private_reader(
             database='PUMS_pid',
             engine="pandas",
             privacy=privacy,
             overrides={'max_contrib': max_contrib})
         if reader:
             mech_class = privacy.mechanisms.get_mechanism(
                 sensitivity, 'sum', 'int')
             mech = mech_class(epsilon,
                               delta=delta,
                               sensitivity=sensitivity,
                               max_contrib=max_contrib)
             assert (mech.mechanism == Mechanism.laplace)
             acc = reader.get_simple_accuracy(query, alpha)
             assert (np.isclose(acc[0], mech.accuracy(alpha)))
Example #22
    def test_queries(self, test_databases):
        query = 'SELECT TOP 20 age, married, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, married ORDER BY married, age DESC'
        privacy = Privacy(10.0, 0.1)
        tdb = test_databases
        readers = tdb.get_private_readers(privacy=privacy, database='PUMS_pid', overrides={'censor_dims': False})

        for reader in readers:
            if reader.engine == "spark":
                continue
            res = test_databases.to_tuples(reader.execute(query))
            assert len(res) == 21

        reader = self.reader
        res = reader.execute(query)
        assert len(res) == 21

        query = 'SELECT age, married, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, married ORDER BY married, age DESC LIMIT 10'
        res = reader.execute(query)
        assert len(res) == 11
        # run the same query with the exact reader; since ORDER BY is
        # on a non-private dimension, the order will be the same
        res_e = reader.reader.execute(query)
        assert len(res_e) == 11
        ages = [r[0] for r in res[1:]]
        ages_e = [r[0] for r in res_e[1:]]
        assert all([age == age_e for (age, age_e) in zip(ages, ages_e)])

        query = 'SELECT age, married, COUNT(*) AS n, SUM(income) AS income FROM PUMS.PUMS GROUP BY age, married ORDER BY income DESC LIMIT 50'
        res = reader.execute(query)
        assert len(res) == 51
        # run the same query with the exact reader; since ORDER BY is
        # on a noisy aggregate (income), the order will differ
        res_e = reader.reader.execute(query)
        assert len(res_e) == 51
        ages = [r[0] for r in res[1:]]
        ages_e = [r[0] for r in res_e[1:]]
        assert not all([age == age_e for (age, age_e) in zip(ages, ages_e)])
Example #23
 def test_count_accuracy_small_delta(self):
     acc = Accuracy(root, subquery, privacy=Privacy(epsilon=1.0, delta=0.1))
     error = acc.count(alpha=0.01)
     error_wide = acc.count(alpha=0.05)
     assert (error_wide < error)
Example #24
import math
import os
import subprocess

import pandas as pd

from snsql.metadata import Metadata
from snsql.sql import PrivateReader
from snsql.sql.privacy import Privacy
from snsql.sql.parse import QueryParser
from snsql.sql._mechanisms.accuracy import Accuracy  # import path assumed for the Accuracy helper

git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets",
                                                    "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))

meta = Metadata.from_file(meta_path)
pums = pd.read_csv(csv_path)
query = 'SELECT AVG(age), STD(age), VAR(age), SUM(age), COUNT(age) FROM PUMS.PUMS GROUP BY sex'
q = QueryParser(meta).query(query)

privacy = Privacy(alphas=[0.01, 0.05], delta=1 / (math.sqrt(100) * 100))
priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta)
subquery, root = priv._rewrite(query)

acc = Accuracy(root, subquery, privacy)


class TestAccuracy:
    def test_count_accuracy(self):
        error = acc.count(alpha=0.05)
        assert (error < 7.53978 and error > 0.5)
        error_wide = acc.count(alpha=0.01)
        assert (error_wide < 9.909)
        assert (error_wide > error)

    def test_count_accuracy_small_delta(self):
        acc = Accuracy(root, subquery, privacy=Privacy(epsilon=1.0, delta=0.1))
        error = acc.count(alpha=0.01)
        error_wide = acc.count(alpha=0.05)
        assert (error_wide < error)
Example #25
import os
import subprocess
from snsql.sql.odometer import Odometer, OdometerHeterogeneous
from snsql.sql.privacy import Privacy
from snsql.sql.private_reader import PrivateReader
import pandas as pd
import numpy as np

from snsql.metadata import Metadata

git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets",
                                                    "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))
pums = pd.read_csv(csv_path)
privacy = Privacy(epsilon=1.0)

meta_obj = Metadata.from_(meta_path)


class TestOdometer:
    def test_count_pid_query(self):
        priv = PrivateReader.from_connection(pums,
                                             privacy=privacy,
                                             metadata=meta_path)
        assert (priv.odometer.spent == (0.0, 0.0))
        assert (priv.odometer.k == 0)
        res = priv.execute(
            "SELECT COUNT(DISTINCT pid) FROM PUMS.PUMS GROUP BY educ")
        assert (priv.odometer.k == 1)
Example #26
 def test_group_by_noisy_typed_order_desc(self):
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=4.0))
     rs = private_reader.execute_df("SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC")
     assert(rs['c'][0] > rs['c'][1])
Example #27
 def test_sum_noisy_postprocess(self):
     reader = PandasReader(df, schema)
     private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0))
     trs = private_reader.execute_df("SELECT POWER(SUM(age), 2) as age_total FROM PUMS.PUMS")
     assert(trs['age_total'][0] > 1000 ** 2)
Example #28
from snsql.sql.privacy import Privacy

privacy = Privacy(epsilon=30.0)


class TestGroupingClamp:
    def test_clamp_on(self, test_databases):
        readers = test_databases.get_private_readers(database='PUMS_pid',
                                                     privacy=privacy)
        assert (len(readers) > 0)
        for reader in readers:
            meta = reader.metadata
            meta["PUMS.PUMS"]["income"].upper = 100
            query = "SELECT AVG(income) AS income FROM PUMS.PUMS"
            res = test_databases.to_tuples(reader.execute(query))
            assert (res[1][0] < 150.0)

    def test_clamp_off(self, test_databases):
        readers = test_databases.get_private_readers(database='PUMS_pid',
                                                     privacy=privacy)
        assert (len(readers) > 0)
        for reader in readers:
            meta = reader.metadata
            meta["PUMS.PUMS"]["income"].upper = 100
            query = "SELECT income, COUNT(pid) AS n FROM PUMS.PUMS GROUP BY income"
            res = test_databases.to_tuples(reader.execute(query))
            assert (len(res) > 40)
Example #29
    def _execute_ast(self, query, *ignore, accuracy:bool=False, pre_aggregated=None, postprocess=True):
        if isinstance(query, str):
            raise ValueError("Please pass AST to _execute_ast.")

        subquery, query = self._rewrite_ast(query)

        if pre_aggregated is not None:
            exact_aggregates = self._check_pre_aggregated_columns(pre_aggregated, subquery)
        else:
            exact_aggregates = self._get_reader(subquery)._execute_ast(subquery)

        _accuracy = None
        if accuracy:
            _accuracy = Accuracy(query, subquery, self.privacy)

        syms = subquery._select_symbols
        source_col_names = [s.name for s in syms]

        # tell which are counts, in column order
        is_count = [s.expression.is_count for s in syms]

        # get a list of mechanisms in column order
        mechs = self._get_mechanisms(subquery)
        check_sens = [m for m in mechs if m]
        if any([m.sensitivity is np.inf for m in check_sens]):
            raise ValueError("Attempting to query an unbounded column")

        kc_pos = self._get_keycount_position(subquery)

        def randomize_row_values(row_in):
            row = [v for v in row_in]
            # set null to 0 before adding noise
            for idx in range(len(row)):
                if mechs[idx] and row[idx] is None:
                    row[idx] = 0.0
            # call all mechanisms to add noise
            return [
                mech.release([v])[0] if mech is not None else v
                for mech, v in zip(mechs, row)
            ]

        if hasattr(exact_aggregates, "rdd"):
            # it's a dataframe
            out = exact_aggregates.rdd.map(randomize_row_values)
        elif hasattr(exact_aggregates, "map"):
            # it's an RDD
            out = exact_aggregates.map(randomize_row_values)
        elif isinstance(exact_aggregates, list):
            out = map(randomize_row_values, exact_aggregates[1:])
        elif isinstance(exact_aggregates, np.ndarray):
            out = map(randomize_row_values, exact_aggregates)
        else:
            raise ValueError("Unexpected type for exact_aggregates")

        # censor infrequent dimensions
        if self._options.censor_dims:
            if kc_pos is None:
                raise ValueError("Query needs a key count column to censor dimensions")
            else:
                thresh_mech = mechs[kc_pos]
                self.tau = thresh_mech.threshold
            if hasattr(out, "filter"):
                # it's an RDD
                tau = self.tau
                out = out.filter(lambda row: row[kc_pos] > tau)
            else:
                out = filter(lambda row: row[kc_pos] > self.tau, out)

        if not postprocess:
            return out

        def process_clamp_counts(row_in):
            # clamp counts to be non-negative
            row = [v for v in row_in]
            for idx in range(len(row)):
                if is_count[idx] and row[idx] < 0:
                    row[idx] = 0
            return row

        clamp_counts = self._options.clamp_counts
        if clamp_counts:
            if hasattr(out, "rdd"):
                # it's a dataframe
                out = out.rdd.map(process_clamp_counts)
            elif hasattr(out, "map"):
                # it's an RDD
                out = out.map(process_clamp_counts)
            else:
                out = map(process_clamp_counts, out)

        # get column information for outer query
        out_syms = query._select_symbols
        out_types = [s.expression.type() for s in out_syms]
        out_col_names = [s.name for s in out_syms]

        def convert(val, type):
            if val is None:
                return None # all columns are nullable
            if type == "string" or type == "unknown":
                return str(val)
            elif type == "int":
                return int(float(str(val).replace('"', "").replace("'", "")))
            elif type == "float":
                return float(str(val).replace('"', "").replace("'", ""))
            elif type == "boolean":
                if isinstance(val, int):
                    return val != 0
                else:
                    return bool(str(val).replace('"', "").replace("'", ""))
            elif type == "datetime":
                v = parse_datetime(val)
                if v is None:
                    raise ValueError(f"Could not parse datetime: {val}")
                return v
            else:
                raise ValueError("Can't convert type " + type)
        
        alphas = [alpha for alpha in self.privacy.alphas]

        def process_out_row(row):
            bindings = dict((name.lower(), val) for name, val in zip(source_col_names, row))
            out_row = [c.expression.evaluate(bindings) for c in query.select.namedExpressions]
            try:
                out_row = [convert(val, type) for val, type in zip(out_row, out_types)]
            except Exception as e:
                raise ValueError(
                    f"Error converting output row: {e}\n"
                    f"Expecting types {out_types}"
                )

            # compute accuracies
            if accuracy == True and alphas:
                accuracies = [_accuracy.accuracy(row=list(row), alpha=alpha) for alpha in alphas]
                return tuple([out_row, accuracies])
            else:
                return tuple([out_row, []])

        if hasattr(out, "map"):
            # it's an RDD
            out = out.map(process_out_row)
        else:
            out = map(process_out_row, out)

        def filter_aggregate(row, condition):
            bindings = dict((name.lower(), val) for name, val in zip(out_col_names, row[0]))
            keep = condition.evaluate(bindings)
            return keep

        if query.having is not None:
            condition = query.having.condition
            if hasattr(out, "filter"):
                # it's an RDD
                out = out.filter(lambda row: filter_aggregate(row, condition))
            else:
                out = filter(lambda row: filter_aggregate(row, condition), out)

        # sort it if necessary
        if query.order is not None:
            sort_fields = []
            for si in query.order.sortItems:
                if type(si.expression) is not ast.Column:
                    raise ValueError("We only know how to sort by column names right now")
                colname = si.expression.name.lower()
                if colname not in out_col_names:
                    raise ValueError(
                        "Can't sort by {0}, because it's not in output columns: {1}".format(
                            colname, out_col_names
                        )
                    )
                colidx = out_col_names.index(colname)
                desc = False
                if si.order is not None and si.order.lower() == "desc":
                    desc = True
                if desc and not (out_types[colidx] in ["int", "float", "boolean", "datetime"]):
                    raise ValueError("We don't know how to sort descending by " + out_types[colidx])
                sf = (desc, colidx)
                sort_fields.append(sf)

            def sort_func(row):
                # use index 0, since index 1 is accuracy
                return SortKey(row[0], sort_fields)
                
            if hasattr(out, "sortBy"):
                out = out.sortBy(sort_func)
            else:
                out = sorted(out, key=sort_func)

        # check for LIMIT or TOP
        limit_rows = None
        if query.limit is not None:
            if query.select.quantifier is not None:
                raise ValueError("Query cannot have both LIMIT and TOP set")
            limit_rows = query.limit.n
        elif query.select.quantifier is not None and isinstance(query.select.quantifier, Top):
            limit_rows = query.select.quantifier.n
        if limit_rows is not None:
            if hasattr(out, "rdd"):
                # it's a dataframe
                out = out.limit(limit_rows)
            elif hasattr(out, "map"):
                # it's an RDD
                out = out.take(limit_rows)
            else:
                out = itertools.islice(out, limit_rows)


        # drop empty accuracy if no accuracy requested
        def drop_accuracy(row):
            return row[0]
        if accuracy == False:
            if hasattr(out, "rdd"):
                # it's a dataframe
                out = out.rdd.map(drop_accuracy)
            elif hasattr(out, "map"):
                # it's an RDD
                out = out.map(drop_accuracy)
            else:
                out = map(drop_accuracy, out)

        # increment odometer
        for mech in mechs:
            if mech:
                self.odometer.spend(Privacy(epsilon=mech.epsilon, delta=mech.delta))

        # output it
        if accuracy == False and hasattr(out, "toDF"):
            # Pipeline RDD
            if not out.isEmpty():
                return out.toDF(out_col_names)
            else:
                return out
        elif hasattr(out, "map"):
            # Bare RDD
            return out
        else:
            row0 = [out_col_names]
            if accuracy == True:
                row0 = [[out_col_names, [[col_name+'_' + str(1-alpha).replace('0.', '') for col_name in out_col_names] for alpha in self.privacy.alphas ]]]
            out_rows = row0 + list(out)
            return out_rows
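A minimal sketch of the call path into this method, reusing names (df, schema, PandasReader, PrivateReader, QueryParser, Privacy) as they appear in the surrounding examples:

 reader = PandasReader(df, schema)
 private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0))
 q = QueryParser(schema).query("SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")
 rows = private_reader._execute_ast(q)  # passing a string raises ValueError; parse first
 # rows[0] is the header row; rows[1:] are the noisy result rows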
Example #30
    def run_agg_query_df(self,
                         df,
                         metadata,
                         query,
                         confidence,
                         file_name="d1"):
        """
        Run the input query using the private reader and return
        the response for multiple dimensions and aggregations
        """
        # Get the exact result
        reader = PandasReader(df, metadata)
        exact_res = reader.execute(query)[1:]

        private_reader = PrivateReader(reader,
                                       metadata,
                                       privacy=Privacy(epsilon=self.epsilon))
        query_ast = private_reader.parse_query_string(query)

        # Distinguishing dimension and measure columns

        sample_res = private_reader._execute_ast(query_ast, True)
        headers = sample_res[0]

        dim_cols = []
        num_cols = []

        out_syms = query_ast.all_symbols()
        out_types = [s[1].type() for s in out_syms]
        out_col_names = [s[0] for s in out_syms]

        for col, ctype in zip(out_col_names, out_types):
            if ctype == "string":
                dim_cols.append(col)
            else:
                num_cols.append(col)

        # Repeat the query and store the results
        res = []
        for idx in range(self.repeat_count):
            dim_rows = []
            num_rows = []
            singleres = private_reader._execute_ast_df(query_ast,
                                                       cache_exact=True)
            # values = singleres[col]
            for col in dim_cols:
                dim_rows.append(singleres[col].tolist())
            for col in num_cols:
                values = singleres[col].tolist()
                num_rows.append(list(zip(values)))

            res.extend(list(zip(*dim_rows, *num_rows)))

        exact_df = pd.DataFrame(exact_res, columns=headers)
        noisy_df = pd.DataFrame(res, columns=headers)

        # Add a dummy dimension column when no dimensions are available, for merging D1 and D2
        if len(dim_cols) == 0:
            dim_cols.append("__dim__")

        if dim_cols[0] == "__dim__":
            exact_df[dim_cols[0]] = ["key"] * len(exact_df)
            noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

        return noisy_df, exact_df, dim_cols, num_cols