def setup_class(cls):
    meta = CollectionMetadata.from_file(meta_path)
    meta["PUMS.PUMS"].censor_dims = False
    df = pd.read_csv(csv_path)
    reader = PandasReader(df, meta)
    private_reader = PrivateReader(reader, meta, 10.0, 10e-3)
    cls.reader = private_reader
def setup_class(cls):
    meta = CollectionMetadata.from_file(meta_path)
    meta["PUMS.PUMS"].censor_dims = False
    meta["PUMS.PUMS"]["sex"].type = "int"
    meta["PUMS.PUMS"]["educ"].type = "int"
    meta["PUMS.PUMS"]["married"].type = "bool"
    df = pd.read_csv(csv_path)
    reader = PandasReader(df, meta)
    private_reader = PrivateReader(reader, meta, 10.0, 10e-3)
    cls.reader = private_reader
def test_calculate_multiplier(self):
    pums_meta_path = os.path.join(git_root_dir, "service", "datasets", "PUMS.yaml")
    pums_csv_path = os.path.join(git_root_dir, "service", "datasets", "PUMS.csv")
    pums_schema = CollectionMetadata.from_file(pums_meta_path)
    pums_df = pd.read_csv(pums_csv_path)
    pums_reader = PandasReader(pums_df, pums_schema)
    query = "SELECT COUNT(*) FROM PUMS.PUMS"
    cost = PrivateReader.get_budget_multiplier(pums_schema, pums_reader, query)
    query = "SELECT AVG(age) FROM PUMS.PUMS"
    cost_avg = PrivateReader.get_budget_multiplier(pums_schema, pums_reader, query)
    # AVG is computed as a noisy sum divided by a noisy count, so it
    # should cost exactly one multiplier more than COUNT alone.
    assert 1 + cost == cost_avg
def test_sklearn_query():
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    iris = Table("dbo", "iris", [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ], 150)
    schema = CollectionMetadata([iris], "csv")
    reader = PandasReader(sklearn_df, schema)
    rowset = execute_private_query(
        schema, reader, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
    df = pd.DataFrame(rowset[1:], columns=rowset[0])
    assert df is not None
    assert len(df) == 1
def test_sklearn_query():
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    iris = Table("dbo", "iris", [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ], 150)
    schema = CollectionMetadata([iris], "csv")
    reader = PandasReader(sklearn_df, schema)
    # Call with both argument orders as a backward-compatibility check
    for params in ([reader, schema], [schema, reader]):
        df = execute_private_query(
            *params, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
        assert df is not None
        assert len(df) == 1
iris_dataset_path = os.path.join(root_url, "service", "datasets", "iris.csv")
if not os.path.exists(iris_dataset_path):
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    sklearn_df.to_csv(iris_dataset_path)

iris_schema_path = os.path.join(root_url, "service", "datasets", "iris.yaml")
if not os.path.exists(iris_schema_path):
    iris = Table("iris", "iris", [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ], 150)
    schema = CollectionMetadata([iris], "csv")
    schema.to_file(iris_schema_path, "iris")


def find_ngrams(input_list, n):
    # For n == 1 each item is its own unigram; otherwise zip staggered
    # copies of the list to produce overlapping n-tuples.
    return input_list if n == 1 else list(
        zip(*[input_list[i:] for i in range(n)]))


def _download_file(url, local_file):
    try:
        from urllib import urlretrieve  # Python 2
    except ImportError:
        from urllib.request import urlretrieve  # Python 3
    urlretrieve(url, local_file)
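
# Illustration (not part of the original file): find_ngrams above returns
# overlapping n-tuples, e.g.
#   find_ngrams(["a", "b", "c", "d"], 2) == [("a", "b"), ("b", "c"), ("c", "d")]
# while for n == 1 it returns the input list unchanged.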
import pandas as pd

from opendp.smartnoise.sql import PostgresReader, PrivateReader
from opendp.smartnoise.metadata import CollectionMetadata

meta = CollectionMetadata.from_file('PUMS_large.yaml')

# Alternative grouped query (the original assigned this and then
# immediately overwrote it, so it is kept here as a comment):
# query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS_large GROUP BY married'
query = 'SELECT AVG(age) FROM PUMS.PUMS_large'

reader = PostgresReader('127.0.0.1', 'PUMS', 'postgres')
private_reader = PrivateReader(reader, meta, 1.0)

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
try:
    from opendp.smartnoise.synthesizers.pytorch.nn import DPGAN, DPCTGAN, PATECTGAN
except ImportError:
    import logging
    test_logger = logging.getLogger(__name__)
    test_logger.warning("Requires torch and torchdp")

git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()

meta_path = os.path.join(git_root_dir, "service", "datasets", "PUMS.yaml")
csv_path = os.path.join(git_root_dir, "service", "datasets", "PUMS.csv")

schema = CollectionMetadata.from_file(meta_path)
df = pd.read_csv(csv_path)


@pytest.mark.torch
class TestPytorchDPSynthesizer_DPGAN:
    def setup(self):
        self.dpgan = PytorchDPSynthesizer(DPGAN(), GeneralTransformer())

    def test_fit(self):
        self.dpgan.fit(df)
        assert self.dpgan.gan.generator

    def test_sample(self):
        self.dpgan.fit(df)
        sample_size = len(df)
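        # Assumed completion: the excerpt truncates here. Synthesizer tests
        # of this shape typically draw a synthetic dataset of the same size
        # and check its dimensions; sample() is assumed to be the
        # PytorchDPSynthesizer sampling method.
        synth_data = self.dpgan.sample(sample_size)
        assert synth_data.shape == df.shape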
from os.path import dirname, join

from opendp.smartnoise.metadata import CollectionMetadata
from opendp.smartnoise.sql.parse import QueryParser

dir_name = dirname(__file__)
metadata = CollectionMetadata.from_file(join(dir_name, "Devices.yaml"))


def qp(query_string):
    return QueryParser().query(query_string)


#
# Unit tests
#
class TestTypes:
    def test_s12(self):
        q = qp("SELECT Refurbished FROM Telemetry.Crashes;")
        q.load_symbols(metadata)
        print(str(q["Refurbished"]))
        assert q["Refurbished"].type() == "boolean"
        assert q["Refurbished"].sensitivity() == 1

    def test_s13(self):
        q = qp("SELECT * FROM Telemetry.Crashes;")
        q.load_symbols(metadata)
        assert q["Refurbished"].type() == "boolean"
        assert q["Refurbished"].sensitivity() == 1
        assert q["Temperature"].sensitivity() == 65.0
def _load_metadata(dataset_document):
    return CollectionMetadata.from_file(
        dataset_document.dataverse_details.local_metadata_path)
import pytest

from opendp.smartnoise.metadata import CollectionMetadata
from opendp.smartnoise.sql.parse import QueryParser

from os import listdir
from os.path import isfile, join, dirname

dir_name = dirname(__file__)
testpath = join(dir_name, "queries") + "/"

metadata = CollectionMetadata.from_file(join(dir_name, "TestDB.yaml"))

other_dirs = [
    f for f in listdir(testpath)
    if not isfile(join(testpath, f)) and f != "parse"
]

parse_files = [
    join(testpath + "parse/", f) for f in listdir(testpath + "parse")
    if isfile(join(testpath + "parse", f))
]
good_files = [f for f in parse_files if "_fail" not in f]
bad_files = [f for f in parse_files if "_fail" in f]

for d in other_dirs:
    other_files = [
        join(testpath + d + "/", f) for f in listdir(testpath + d)
        if isfile(join(testpath + d, f))
    ]
    good_files.extend(other_files)