def __init__(self):
    # Imported lazily so merely importing this module does not require snsql.
    from snsql.metadata import Metadata
    # Dataset name -> Metadata for every schema the unit tests exercise.
    # NOTE(review): 'PUMS_null' loads pums_dup_schema_path, the same file as
    # 'PUMS_dup' -- looks like a copy/paste slip; confirm whether a dedicated
    # null-schema path was intended.
    self.metadata = {
        'PUMS': Metadata.from_file(pums_schema_path),
        'PUMS_large': Metadata.from_file(pums_large_schema_path),
        'PUMS_pid': Metadata.from_file(pums_pid_schema_path),
        'PUMS_dup': Metadata.from_file(pums_dup_schema_path),
        'PUMS_null': Metadata.from_file(pums_dup_schema_path)
    }
    # Engine name -> DbEngine; populated from env flags and the optional
    # per-machine connection config below.
    self.engines = {}
    home = os.path.expanduser("~")
    # Optional per-machine YAML listing externally installed database engines.
    p = os.path.join(home, ".smartnoise", "connections-unit.yaml")
    # pandas engine is on by default; SKIP_PANDAS opts out.
    if not os.environ.get('SKIP_PANDAS'):
        self.engines['pandas'] = DbEngine('pandas')
    else:
        print("Skipping pandas database tests")
    # Spark engine is off by default; TEST_SPARK opts in.
    if os.environ.get('TEST_SPARK'):
        self.engines['spark'] = DbEngine('spark')
    else:
        print("TEST_SPARK not set, so skipping Spark tests")
    if not os.path.exists(p):
        print("No config file at ~/.smartnoise/connections-unit.yaml")
    else:
        with open(p, 'r') as stream:
            conns = yaml.safe_load(stream)
            if conns is None:
                print("List of installed test engines is empty")
            else:
                # Each entry supplies connection details plus a list of
                # databases reachable on that engine.
                for engine in conns:
                    eng = conns[engine]
                    host = conns[engine]["host"]
                    port = conns[engine]["port"]
                    user = conns[engine]["user"]
                    databases = eng['databases']
                    self.engines[engine] = DbEngine(engine, user, host, port, databases)
def setup_class(cls):
    """Build one shared PrivateReader over the PUMS CSV for the whole class."""
    schema = Metadata.from_file(meta_path)
    # Dimension censoring is disabled so the tests see every group.
    schema["PUMS.PUMS"].censor_dims = False
    frame = pd.read_csv(csv_path)
    budget = Privacy(epsilon=10.0, delta=0.1)
    cls.reader = PrivateReader(PandasReader(frame, schema), schema, privacy=budget)
def __init__(self, reader, metadata, privacy=None):
    """Create a new private reader.

    Do not use the constructor directly; use the from_connection factory method.

    :param reader: The data reader to wrap, such as a SqlServerReader,
        PandasReader, or SparkReader.  The PrivateReader intercepts queries to
        the underlying reader and ensures differential privacy.
    :param metadata: The Metadata object with information about all tables
        referenced in this query.
    :param privacy: A Privacy object carrying the privacy parameters
        (epsilon/delta).  Required despite the None default.
    :raises ValueError: if reader is not a Reader, or privacy is not supplied.
    """
    if isinstance(reader, Reader):
        self.reader = reader
    else:
        raise ValueError("Parameter reader must be of type Reader")
    self.metadata = Metadata.from_(metadata)
    # Rewriter normalizes metadata itself, so passing the raw argument is fine.
    self.rewriter = Rewriter(metadata)
    self._options = PrivateReaderOptions()
    if privacy:
        self.privacy = privacy
    else:
        raise ValueError("Must pass in a Privacy object with privacy parameters.")
    # Odometer tracks cumulative privacy spend across executed queries.
    self.odometer = OdometerHeterogeneous(self.privacy)
    self._refresh_options()
def __init__(self, query, metadata):
    """Hold a query together with optionally-normalized metadata."""
    self.query = query
    # Normalize any supported metadata form; a falsy value passes through untouched.
    self.metadata = Metadata.from_(metadata) if metadata else metadata
def load_metadata():
    """Build Metadata describing the 'features' and 'sales' tables."""

    def col(name, ctype, **extra):
        # One column descriptor; extra carries flags such as private_id.
        spec = {"name": name, "type": ctype}
        spec.update(extra)
        return spec

    table_features = {
        "Store": col("Store", "int", private_id=True),
        "Date": col("Date", "datetime"),
        "Temperature": col("Temperature", "float"),
        "Fuel_Price": col("Fuel_Price", "float"),
        "IsHoliday": col("IsHoliday", "boolean"),
        "row_privacy": False,
    }
    table_sales = {
        "Store": col("Store", "int", private_id=True),
        "Date": col("Date", "datetime"),
        "Weekly_Sales": col("Weekly_Sales", "float"),
        "IsHoliday": col("IsHoliday", "boolean"),
        "row_privacy": False,
    }
    metadata_dict = {
        "": {
            "": {
                "features": table_features,
                "sales": table_sales,
            }
        },
    }
    return Metadata.from_dict(metadata_dict)
def test_pandas(self):
    """PandasReader should expose the pandas-specific compare and serializer."""
    engine = "pandas"
    schema = Metadata.from_file(meta_path)
    frame = pd.read_csv(csv_path)
    reader = PandasReader(frame, schema)
    for expected in (SqlReader, PandasReader):
        assert isinstance(reader, expected)
    for expected in (NameCompare, PandasNameCompare):
        assert isinstance(reader.compare, expected)
    for expected in (Serializer, PandasSerializer):
        assert isinstance(reader.serializer, expected)
def test_with_censor_dims(self):
    """Only the COUNT(DISTINCT pid) expression is rewritten to keycount."""
    schema = Metadata.from_file(meta_path)
    frame = pd.read_csv(csv_path)
    priv = PrivateReader(PandasReader(frame, schema), schema, privacy=Privacy(epsilon=3.0))
    query = "SELECT COUNT (*) AS foo, COUNT(DISTINCT pid) AS bar FROM PUMS.PUMS"
    parsed = QueryParser(schema).query(query)
    inner, outer = priv._rewrite_ast(parsed)
    exprs = outer.select.namedExpressions
    assert exprs[0].expression.expression.name != 'keycount'
    assert exprs[1].expression.expression.name == 'keycount'
def setup_class(self):
    """Shared PrivateReader with typed PUMS columns and censoring disabled."""
    schema = Metadata.from_file(meta_path)
    schema["PUMS.PUMS"].censor_dims = False
    # Pin column types so the typed-aggregate tests behave predictably.
    schema["PUMS.PUMS"]["sex"].type = "int"
    schema["PUMS.PUMS"]["educ"].type = "int"
    schema["PUMS.PUMS"]["married"].type = "bool"
    frame = pd.read_csv(csv_path)
    budget = Privacy(epsilon=10.0, delta=10e-3)
    self.reader = PrivateReader(PandasReader(frame, schema), schema, privacy=budget)
def queries(self, query_string, metadata=None):
    """Parse a batch of queries, loading symbols when metadata is known.

    :param query_string: SQL text containing one or more queries
    :param metadata: optional metadata (any form Metadata.from_ accepts);
        falls back to self.metadata when omitted
    :return: list of parsed query ASTs, with symbols loaded if metadata
        was available
    """
    if metadata is None and self.metadata is not None:
        metadata = self.metadata
    elif metadata:
        # Normalize dict/path/stream input into a Metadata instance.
        metadata = Metadata.from_(metadata)
    istream = InputStream(query_string)
    parser = self.start_parser(istream)
    bv = BatchVisitor()
    # list(...) instead of a copying comprehension -- same elements, less noise.
    queries = list(bv.visit(parser.batch()).queries)
    if metadata is not None:
        for q in queries:
            q.load_symbols(metadata)
    return queries
def test_case_sensitive(self):
    """Quoted \"PiD\" must stay distinct from the key column pid."""
    columns = [Int('pid', is_key=True), Int('"PiD"')]
    table = Table("PUMS", "PUMS", columns, 150)
    schema = Metadata([table], "csv")
    backend = PostgresReader("localhost", "PUMS", "admin", "password")
    priv = PrivateReader(backend, schema, privacy=Privacy(epsilon=3.0))
    query = 'SELECT COUNT (DISTINCT pid) AS foo, COUNT(DISTINCT "PiD") AS bar FROM PUMS.PUMS'
    inner, outer = priv._rewrite(query)
    exprs = outer.select.namedExpressions
    assert exprs[0].expression.expression.name == 'keycount'
    assert exprs[1].expression.expression.name != 'keycount'
def test_reuse_expression(self):
    """AVG/SUM/COUNT over one column should share sum/count subexpressions.

    Removed a dead `QueryParser(meta).query(query)` call whose result was
    never used -- the query was being parsed twice for no reason.
    """
    schema = Metadata.from_file(meta_path)
    frame = pd.read_csv(csv_path)
    priv = PrivateReader(PandasReader(frame, schema), schema, privacy=Privacy(epsilon=3.0))
    query = 'SELECT AVG(age), SUM(age), COUNT(age) FROM PUMS.PUMS'
    inner, outer = priv._rewrite(query)
    names = unique([f.name for f in outer.select.namedExpressions.find_nodes(Column)])
    # All three aggregates are fed by exactly two distinct source expressions.
    assert len(names) == 2
    assert 'count_age' in names
    assert 'sum_age' in names
def test_meta_from_string(self):
    """Metadata.from_file should accept a file-like object of valid YAML."""
    # NOTE(review): YAML layout reconstructed from a whitespace-mangled
    # source -- verify indentation against the original fixture.
    meta_good = """col:
  ? ''
  : table:
      duration:
        type: float
      id:
        private_id: true
        type: int
      max_ids: 1
      row_privacy: false
      rows: 1000
      sample_max_ids: true
engine: pandas"""
    stream = io.StringIO(meta_good)
    # Succeeding without raising is the assertion here.
    c = Metadata.from_file(stream)
def test_meta_bad_int(self):
    """An int column without bounds should be flagged as unbounded."""
    # NOTE(review): YAML layout reconstructed from a whitespace-mangled
    # source -- verify indentation against the original fixture.
    meta_bad_float = """col:
  ? ''
  : table:
      id:
        private_id: true
        type: int
      events:
        type: int
      max_ids: 1
      row_privacy: false
      rows: 1000
      sample_max_ids: true
engine: pandas"""
    stream = io.StringIO(meta_bad_float)
    c = Metadata.from_file(stream)
    assert c["table"]["events"].unbounded
def get_private_readers(self, *ignore, metadata=None, privacy, database, engine=None, overrides=None, **kwargs):
    """Build private readers for a database across installed engines.

    :param metadata: Metadata object, path string, or None to use the
        cached metadata for `database`
    :param privacy: Privacy object passed through to each reader (required)
    :param database: name of the test database, e.g. 'PUMS'
    :param engine: restrict to one engine name; default is every installed engine
    :param overrides: table-level property overrides applied to a deep copy of
        the metadata (censor_dims, clamp_counts, clamp_columns,
        max_ids/max_contrib); only the first table is affected
    :return: list of private readers; empty when no metadata or engines match
    """
    readers = []
    # Avoid the mutable-default-argument pitfall; None means "no overrides".
    overrides = {} if overrides is None else overrides
    if metadata is None and database in self.metadata:
        metadata = self.metadata[database]
    if metadata is None:
        print(f"No metadata available for {database}")
        return []
    if isinstance(metadata, str):
        from snsql.metadata import Metadata
        metadata = Metadata.from_file(metadata)
    if overrides:
        # Deep copy so overrides never leak into the shared cached metadata.
        metadata = copy.deepcopy(metadata)
        # Overrides apply only to the first table in the metadata.
        table_name = list(metadata.m_tables)[0]
        table = metadata.m_tables[table_name]
        for propname, propval in overrides.items():
            if propname == 'censor_dims':
                table.censor_dims = propval
            elif propname == 'clamp_counts':
                table.clamp_counts = propval
            elif propname == 'clamp_columns':
                table.clamp_columns = propval
            elif propname in ('max_ids', 'max_contrib'):
                table.max_ids = propval
            else:
                print(f"Unable to set override for {propname}={propval}")
    engines = [engine] if engine is not None else list(self.engines)
    for engine in engines:
        if engine not in self.engines:
            continue
        eng = self.engines[engine]
        try:
            reader = eng.get_private_reader(metadata=metadata, privacy=privacy, database=database)
        except Exception as e:
            # Best-effort: the engine may be installed but unreachable.
            # Was a bare `except: pass`; keep going but say why we skipped.
            print(f"Unable to get private reader for {engine}: {e}")
            reader = None
        if reader:
            readers.append(reader)
    return readers
def test_load_from_path(self):
    """Metadata.from_ accepts a filesystem path and exposes PUMS.PUMS."""
    schema = Metadata.from_(meta_path)
    # Indexing must succeed; raising here fails the test.
    table = schema['PUMS.PUMS']
privacy = Privacy(epsilon=3.0, delta=0.1) metadata = Metadata.from_dict({ "": { "PUMS": { "PUMS": { "censor_dims": True, "row_privacy": False, "uuid": { "type": "int", "private_id": True }, "start_date": { "type": "datetime" }, "start_time": { "type": "string" }, "sales": { "type": "float", "upper": 10000, "lower": 0.0 } } } } }) reader = from_connection(None, engine="sqlserver",
import pytest
from snsql.metadata import Metadata
from snsql.sql.parse import QueryParser
from os import listdir
from os.path import isfile, join, dirname

# Test layout: queries/<category>/*.sql, where "parse" holds files that only
# need to parse; names containing "_fail" are expected to fail parsing.
dir_name = dirname(__file__)
testpath = join(dir_name, "queries") + "/"

metadata = Metadata.from_file(join(dir_name, "TestDB.yaml"))

# Every query category directory except "parse" itself.
other_dirs = [
    f for f in listdir(testpath)
    if not isfile(join(testpath, f)) and f != "parse"
]

parse_files = [
    join(testpath + "parse/", f)
    for f in listdir(testpath + "parse")
    if isfile(join(testpath + "parse", f))
]

# Idiom fix: `x not in y` instead of `not x in y`.
good_files = [f for f in parse_files if "_fail" not in f]
bad_files = [f for f in parse_files if "_fail" in f]

# Files from the other category directories are all expected to parse.
for d in other_dirs:
    other_files = [
        join(testpath + d + "/", f)
        for f in listdir(testpath + d)
        if isfile(join(testpath + d, f))
    ]
    good_files.extend(other_files)
"type": "boolean" }, "row_privacy": False, } metadata_dict = { "engine": "sqlserver", "": { "": { "features": table_features, "sales": table_sales, } }, } metadata = Metadata.from_dict(metadata_dict) class TestAstFromMeta: def test_meta_load(self): assert (metadata["sales"]["Date"].nullable == False) assert (metadata["features"]["Date"].nullable == True) assert (metadata["features"]["IsHoliday"].nullable == False) assert (metadata["features"]["Store"].sensitivity == 150) assert (metadata["features"]["Temperature"].sensitivity == 75) def test_ast_attach_sens(self): query = 'SELECT SUM("Temperature"), SUM(features."Store") AS store FROM features' q = QueryParser(metadata).query(query) assert (q._select_symbols[0].expression.sensitivity() == 75) assert (q._named_symbols['store'].expression.sensitivity() == 150)
from snsql.sql.privacy import Privacy from snsql.sql.private_reader import PrivateReader import pandas as pd import numpy as np from snsql.metadata import Metadata git_root_dir = subprocess.check_output( "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip() meta_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.yaml")) csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv")) pums = pd.read_csv(csv_path) privacy = Privacy(epsilon=1.0) meta_obj = Metadata.from_(meta_path) class TestOdometer: def test_count_pid_query(self): priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta_path) assert (priv.odometer.spent == (0.0, 0.0)) assert (priv.odometer.k == 0) res = priv.execute( "SELECT COUNT(DISTINCT pid) FROM PUMS.PUMS GROUP BY educ") assert (priv.odometer.k == 1) def test_count_query(self): priv = PrivateReader.from_connection(pums,
from os.path import dirname, join
from snsql.metadata import Metadata
from snsql.sql.parse import QueryParser

dir_name = dirname(__file__)
metadata = Metadata.from_file(join(dir_name, "Devices.yaml"))


def qp(query_string):
    """Parse a query without metadata; symbols are attached by the caller."""
    return QueryParser().query(query_string)


#
# Unit tests
#
class TestTypes:
    def test_s12(self):
        parsed = qp("SELECT Refurbished FROM Telemetry.Crashes;")
        parsed.load_symbols(metadata)
        print(str(parsed["Refurbished"]))
        assert parsed["Refurbished"].expression.type() == "boolean"
        assert parsed["Refurbished"].expression.sensitivity() == 1

    def test_s13(self):
        parsed = qp("SELECT * FROM Telemetry.Crashes;")
        parsed.load_symbols(metadata)
        assert parsed["Refurbished"].expression.type() == "boolean"
        assert parsed["Refurbished"].expression.sensitivity() == 1
        assert parsed["Temperature"].expression.sensitivity() == 65.0
def __init__(self, metadata=None):
    """Optionally bind metadata, normalizing any supported input form."""
    # A falsy value (including the None default) leaves metadata unset.
    self.metadata = Metadata.from_(metadata) if metadata else None
import math
import pandas as pd
from snsql.metadata import Metadata
from snsql.sql import PrivateReader
from snsql.sql.privacy import Privacy
from snsql.sql.parse import QueryParser

# Locate the PUMS_pid dataset relative to the git repository root.
git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))

meta = Metadata.from_file(meta_path)
pums = pd.read_csv(csv_path)

query = 'SELECT AVG(age), STD(age), VAR(age), SUM(age), COUNT(age) FROM PUMS.PUMS GROUP BY sex'
q = QueryParser(meta).query(query)

privacy = Privacy(alphas=[0.01, 0.05], delta=1 / (math.sqrt(100) * 100))
priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta)
subquery, root = priv._rewrite(query)

acc = Accuracy(root, subquery, privacy)


class TestAccuracy:
    def test_count_accuracy(self):
        # Accuracy at alpha=0.05 must land in a sane, non-degenerate band.
        error = acc.count(alpha=0.05)
        assert 0.5 < error < 7.53978
def __init__(self, metadata, privacy=None):
    """Bind default rewriter options, normalized metadata, and optional privacy."""
    self.options = RewriterOptions()
    # Accept dict/path/Metadata input and normalize to a Metadata instance.
    self.metadata = Metadata.from_(metadata)
    self.privacy = privacy