Exemplo n.º 1
0
    def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
        """
        Run the query using the private reader and input query
        Get query response back
        """
        reader = PandasReader(metadata, df)
        actual = 0.0
        # VAR not supported in Pandas Reader. So not needed to fetch actual on every aggregation
        if (get_exact):
            actual = reader.execute_typed(query).rows()[1:][0][0]
        private_reader = PrivateReader(metadata, reader, self.epsilon)
        query_ast = private_reader.parse_query_string(query)

        srs_orig = private_reader.reader.execute_ast_typed(query_ast)

        noisy_values = []
        low_bounds = []
        high_bounds = []
        for idx in range(self.repeat_count):
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            res = private_reader._execute_ast(query_ast, True)
            # Disabled because confidence interval not available in report
            #interval = res.report[res.colnames[0]].intervals[confidence]
            #low_bounds.append(interval[0].low)
            #high_bounds.append(interval[0].high)
            noisy_values.append(res.rows()[1:][0][0])
        return np.array(noisy_values), actual, low_bounds, high_bounds
Exemplo n.º 2
0
 def test_no_tau_noisy(self):
     # should never drop rows
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 0.01)
     for i in range(10):
         rs = private_reader.execute_typed("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 90 AND educ = '8'")
         assert(len(rs['c']) == 1)
Exemplo n.º 3
0
 def test_group_by_noisy_typed_order_desc(self):
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 4.0)
     rs = private_reader.execute_typed(
         "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c DESC"
     )
     assert (rs['c'][0] > rs['c'][1])
Exemplo n.º 4
0
 def test_group_by_noisy_order(self):
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 4.0)
     rs = private_reader.execute(
         "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
     )
     assert (rs[1][0] < rs[2][0])
Exemplo n.º 5
0
 def test_check_thresholds_gauss(self):
     # check tau for various privacy parameters
     epsilons = [0.1, 2.0]
     max_contribs = [1, 3]
     deltas = [10E-5, 10E-15]
     query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
     reader = PandasReader(schema, df)
     qp = QueryParser(schema)
     q = qp.query(query)
     for eps in epsilons:
         for d in max_contribs:
             for delta in deltas:
                 # using slightly different formulations of same formula from different papers
                 # make sure private_reader round-trips
                 gaus_scale = math.sqrt(d) * math.sqrt(
                     2 * math.log(1.25 / delta)) / eps
                 gaus_rho = 1 + gaus_scale * math.sqrt(
                     2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                 private_reader = PrivateReader(schema, reader, eps, delta)
                 q.max_ids = d  # hijack the AST
                 r = private_reader.execute_ast(q)
                 assert (math.isclose(private_reader.tau,
                                      gaus_rho,
                                      rel_tol=0.03,
                                      abs_tol=2))
Exemplo n.º 6
0
 def test_execute_without_dpsu(self):
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 1.0)
     query = QueryParser(schema).queries(
         "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
     private_reader.options.use_dpsu = False
     assert (private_reader._get_reader(query) is private_reader.reader)
Exemplo n.º 7
0
 def test_empty_result_count_typed_notau_prepost(self):
     reader = PandasReader(schema, df)
     query = QueryParser(schema).queries("SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
     private_reader = PrivateReader(schema, reader, 1.0)
     private_reader._execute_ast(query, True)
     for i in range(3):
         trs = private_reader._execute_ast(query, True)
         assert(len(trs) == 1)
Exemplo n.º 8
0
 def test_yes_tau(self):
     # should usually drop some rows
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 1.0, 1/10000)
     lengths = []
     for i in range(10):
         rs = private_reader.execute_typed("SELECT COUNT(*) AS c FROM PUMS.PUMS WHERE age > 80 GROUP BY educ")
         lengths.append(len(rs['c']))
     l = lengths[0]
     assert(any([l != ll for ll in lengths]))
 def test_group_by_noisy_typed_order_inter_constant(self):
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 1.0)
     rs = private_reader.execute_typed(
         "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
     )
     rs2 = private_reader.execute_typed(
         "SELECT COUNT(*) * 2 AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
     )
     assert (len(rs.report['c'].intervals[0.95]) == 2)
     assert (len(rs2.report['c'].intervals[0.95]) == 2)
     assert (all(a.low < b.low for a, b in zip(
         rs.report['c'].intervals[0.95], rs2.report['c'].intervals[0.95])))
     assert (all(a.low < b.low
                 for a, b in zip(rs.report['c'].intervals[0.985],
                                 rs2.report['c'].intervals[0.985])))
Exemplo n.º 10
0
 def test_group_by_noisy_typed_order_inter(self):
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 1.0)
     rs = private_reader.execute_typed(
         "SELECT COUNT(*) AS c, married AS m FROM PUMS.PUMS GROUP BY married ORDER BY c"
     )
     assert (rs['c'][0] < rs['c'][1])
     assert (len(rs.report['c'].intervals[0.95]) == 2)
     print(rs.report['c'].intervals[0.95])
     assert (all(ival.low < ival.high
                 for ival in rs.report['c'].intervals[0.95]))
     assert (all(ival.low < ival.high
                 for ival in rs.report['c'].intervals[0.985]))
     assert (all(outer.low < inner.low for inner, outer in zip(
         rs.report['c'].intervals[0.95], rs.report['c'].intervals[0.985])))
     assert (all(outer.high > inner.high for inner, outer in zip(
         rs.report['c'].intervals[0.95], rs.report['c'].intervals[0.985])))
 def release(self, dataset: object, actual=False) -> Report:
     """
     Dataset is a collection of [Dataset Metadata, PandasReader]
     Releases response to SQL query based on the number of repetitions
     requested by eval_params if actual is set of False. 
     Actual response is only returned once
     """
     if (not actual):
         private_reader = PrivateReader(dataset[0], dataset[1],
                                        self.privacy_params.epsilon)
         query_ast = private_reader.parse_query_string(self.algorithm)
         srs_orig = private_reader.reader.execute_ast_typed(query_ast)
         noisy_values = []
         for idx in range(self.eval_params.repeat_count):
             srs = TypedRowset(srs_orig.rows(),
                               list(srs_orig.types.values()))
             res = private_reader._execute_ast(query_ast, True)
             noisy_values.append(res.rows()[1:][0][0])
         return Report({"__key__": noisy_values})
     else:
         reader = dataset[1]
         exact = reader.execute_typed(self.algorithm).rows()[1:][0][0]
         return Report({"__key__": exact})
Exemplo n.º 12
0
    def test_dpsu_vs_korolova(self):
        query = "SELECT ngram, COUNT(*) as n FROM reddit.reddit GROUP BY ngram ORDER BY n desc"
        reader = PandasReader(schema, df)
        private_reader = PrivateReader(schema, reader, 3.0)
        private_reader.options.max_contrib = 10
        result = private_reader.execute_typed(query)

        private_reader_korolova = PrivateReader(schema, reader, 3.0)
        private_reader_korolova.options.dpsu = False
        private_reader_korolova.options.max_contrib = 10
        korolova_result = private_reader_korolova.execute_typed(query)

        assert len(result['n']) > len(korolova_result['n'])
        assert len(final_df) < len(df)
Exemplo n.º 13
0
import mlflow
import json
import sys

import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata, load_dataset
from opendp.whitenoise.sql import PrivateReader
from pandasql import sqldf

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        dataset = load_dataset(dataset_document)
        reader = load_reader(dataset_document)
        schema = load_metadata(dataset_document)
        private_reader = PrivateReader(schema, reader, budget)
        rowset = private_reader.execute(query)

        result = {"query_result": rowset}
        df = pd.DataFrame(rowset[1:], columns=rowset[0])
        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
Exemplo n.º 14
0
import pandas as pd
from opendp.whitenoise.sql import PostgresReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

meta = CollectionMetadata.from_file('PUMS_large.yaml')

query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS_large GROUP BY married'
query = 'SELECT AVG(age) FROM PUMS.PUMS_large'

reader = PostgresReader('127.0.0.1', 'PUMS', 'postgres')
private_reader = PrivateReader(reader, meta, 1.0)

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
Exemplo n.º 15
0
 def test_sum_noisy_postprocess(self):
     reader = PandasReader(schema, df)
     private_reader = PrivateReader(schema, reader, 1.0)
     trs = private_reader.execute_typed("SELECT POWER(SUM(age), 2) as age_total FROM PUMS.PUMS")
     assert(trs['age_total'][0] > 1000 ** 2)
Exemplo n.º 16
0
    def run_agg_query_df(self,
                         df,
                         metadata,
                         query,
                         confidence,
                         file_name="d1"):
        # Getting exact result
        reader = PandasReader(metadata, df)
        exact = reader.execute_typed(query).rows()[1:]
        exact_res = []
        for row in exact:
            exact_res.append(row)

        private_reader = PrivateReader(metadata, reader, self.epsilon)
        query_ast = private_reader.parse_query_string(query)

        # Distinguishing dimension and measure columns
        srs_orig = private_reader.reader.execute_ast_typed(query_ast)
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))

        sample_res = private_reader._execute_ast(query_ast, True)
        headers = sample_res.colnames

        dim_cols = []
        num_cols = []

        for col in headers:
            if (sample_res.types[col] == "string"):
                dim_cols.append(col)
            else:
                num_cols.append(col)

        # Repeated query and store results along with intervals
        res = []
        for idx in range(self.repeat_count):
            dim_rows = []
            num_rows = []
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            singleres = private_reader._execute_ast(query_ast, True)
            values = singleres[col]
            for col in dim_cols:
                dim_rows.append(singleres[col])
            for col in num_cols:
                values = singleres[col]
                #low = singleres.report[col].intervals[confidence].low
                #high = singleres.report[col].intervals[confidence].high
                #num_rows.append(list(zip(values, low, high)))
                num_rows.append(list(zip(values)))

            res.extend(list(zip(*dim_rows, *num_rows)))

        exact_df = pd.DataFrame(exact_res, columns=headers)
        noisy_df = pd.DataFrame(res, columns=headers)

        # Add a dummy dimension column for cases where no dimensions available for merging D1 and D2
        if (len(dim_cols) == 0):
            dim_cols.append("__dim__")

        if (dim_cols[0] == "__dim__"):
            exact_df[dim_cols[0]] = ["key"] * len(exact_df)
            noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

        return noisy_df, exact_df, dim_cols, num_cols
Exemplo n.º 17
0
import json
import sys

import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        reader = load_reader(dataset_document)
        metadata = load_metadata(dataset_document)

        budget_per_column = budget / PrivateReader.get_budget_multiplier(
            metadata, reader, query)
        private_reader = PrivateReader(metadata, reader, budget_per_column)

        rowset = private_reader.execute(query)
        result = {"query_result": rowset}
        df = pd.DataFrame(rowset[1:], columns=rowset[0])

        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
Exemplo n.º 18
0
import pandas as pd
from opendp.whitenoise.sql import PandasReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'

reader = PandasReader(meta, pums)
private_reader = PrivateReader(meta, reader)

result = private_reader.execute_typed(query)
print(result)
Exemplo n.º 19
0
import pandas as pd
from opendp.whitenoise.sql import PandasReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'

query = 'SELECT COUNT(*) AS n, COUNT(pid) AS foo FROM PUMS.PUMS WHERE age > 80 GROUP BY educ'

reader = PandasReader(meta, pums)
private_reader = PrivateReader(meta, reader, 4.0)
private_reader.options.censor_dims = True
private_reader.options.clamp_counts = True

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)