# Imports assumed for this snippet (module paths as of the
# opendp.whitenoise release these examples target):
import pandas as pd
import sklearn.datasets

from opendp.whitenoise.metadata.collection import CollectionMetadata, Table, Float
from opendp.whitenoise.sql import PandasReader, execute_private_query


def test_sklearn_query():
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data,
                              columns=sklearn_dataset.feature_names)

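    # Table metadata the private query needs: schema and table names, row
    # count, and per-column value ranges used for clamping and sensitivity.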
    iris = Table("dbo", "iris", 150, [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ])
    schema = CollectionMetadata([iris], "csv")

    reader = PandasReader(schema, sklearn_df)
    rowset = execute_private_query(
        schema, reader, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris')
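    # execute_private_query returns rows as lists: row 0 is the header,
    # the remaining rows are the (noisy) data.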
    df = pd.DataFrame(rowset[1:], columns=rowset[0])
    assert df is not None
    assert len(df) == 1
Example #2
import os
import subprocess

import pandas as pd
from pandasql import sqldf
import math

from opendp.whitenoise.metadata import CollectionMetadata
from opendp.whitenoise.sql import PrivateReader, PandasReader
from opendp.whitenoise.sql.parse import QueryParser
from opendp.whitenoise.reader.rowset import TypedRowset

git_root_dir = subprocess.check_output("git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()

meta_path = os.path.join(git_root_dir, "service", "datasets", "PUMS.yaml")
csv_path = os.path.join(git_root_dir, "service", "datasets", "PUMS.csv")

schema = CollectionMetadata.from_file(meta_path)
df = pd.read_csv(csv_path)

#
#   Unit tests
#
class TestQuery:
    def test_count_exact(self):
        reader = PandasReader(schema, df)
        rs = reader.execute("SELECT COUNT(*) AS c FROM PUMS.PUMS")
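        # the PUMS test dataset ships with exactly 1,000 rows, so the
        # non-private reader returns the exact count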
        assert(rs[1][0] == 1000)

    def test_empty_result(self):
        reader = PandasReader(schema, df)
        rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
        assert(len(rs) == 1)

    def test_empty_result_typed(self):
        reader = PandasReader(schema, df)
        # Truncated in the original; a plausible continuation mirroring
        # test_empty_result, using the typed execution path:
        rs = reader.execute_typed("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
        assert(len(rs) == 0)

Example #3

from os import listdir
from os.path import isfile, join

# testpath, other_dirs, dir_name, GoodQueryTester and BadQueryTester are
# defined elsewhere in the original test module.
validate_files = [
    join(testpath + "validate/", f) for f in listdir(testpath + "validate")
    if isfile(join(testpath + "validate", f))
]

good_files = [f for f in validate_files if "_fail" not in f]
bad_files = [f for f in validate_files if "_fail" in f]
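# queries in files named "*_fail*" are expected to fail validation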

for d in other_dirs:
    other_files = [
        join(testpath + d + "/", f) for f in listdir(testpath + d)
        if isfile(join(testpath + d, f))
    ]
    good_files.extend(other_files)

metadata = CollectionMetadata.from_file(join(dir_name, "Devices.yaml"))


#
#   Unit tests
#
class TestValidate:
    def test_all_good_queries(self):
        for goodpath in good_files:
            print(goodpath)
            gqt = GoodQueryTester(goodpath)
            gqt.runValidate()

    def test_all_bad_queries(self):
        for badpath in bad_files:
            bqt = BadQueryTester(badpath)
            # Truncated in the original; presumably the tester asserts that
            # validation rejects each bad query:
            bqt.runValidate()
Example #4
import os

import pandas as pd
import sklearn.datasets

# Import paths assumed for the opendp.whitenoise release this example targets;
# root_url (the repository root) is defined elsewhere in the original module.
from opendp.whitenoise.metadata.collection import CollectionMetadata, Table, Float

iris_dataset_path = os.path.join(root_url, "service", "datasets", "iris.csv")
if not os.path.exists(iris_dataset_path):
    sklearn_dataset = sklearn.datasets.load_iris()
    sklearn_df = pd.DataFrame(data=sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    sklearn_df.to_csv(iris_dataset_path)


iris_schema_path = os.path.join(root_url, "service", "datasets", "iris.yaml")
if not os.path.exists(iris_schema_path):
    iris = Table("iris", "iris", 150, [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3)
    ])
    schema = CollectionMetadata([iris], "csv")
    schema.to_file(iris_schema_path, "iris")

def find_ngrams(input_list, n):
    return input_list if n == 1 else list(zip(*[input_list[i:] for i in range(n)]))
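# e.g. find_ngrams(["a", "b", "c"], 2) -> [("a", "b"), ("b", "c")];
# note that for n == 1 the input list is returned as-is, not as 1-tuples.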

def _download_file(url, local_file):
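    # Python 2 keeps urlretrieve in urllib; Python 3 moved it to
    # urllib.request, hence the ImportError fallback.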
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve
    urlretrieve(url, local_file)

pums_1000_dataset_path = os.path.join(root_url, "service", "datasets", "evaluation", "PUMS_1000.csv")
if not os.path.exists(pums_1000_dataset_path):
    pums_url = "https://raw.githubusercontent.com/opendifferentialprivacy/dp-test-datasets/master/data/PUMS_california_demographics_1000/data.csv"
    # Truncated in the original; presumably the download helper above is
    # invoked here:
    _download_file(pums_url, pums_1000_dataset_path)

Example #5
import pandas as pd
from opendp.whitenoise.sql import PandasReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

# Two example queries; the second assignment below is the one actually executed.
query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'

query = 'SELECT COUNT(*) AS n, COUNT(pid) AS foo FROM PUMS.PUMS WHERE age > 80 GROUP BY educ'

reader = PandasReader(meta, pums)
private_reader = PrivateReader(meta, reader, 4.0)  # epsilon = 4.0
private_reader.options.censor_dims = True   # suppress GROUP BY keys with too-small noisy counts
private_reader.options.clamp_counts = True  # clamp noisy counts so they never go negative

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
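# The noisy result should track the exact one, with noise scaled to epsilon;
# with censor_dims enabled, sparse educ groups may be suppressed entirely.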
Example #6

from opendp.whitenoise.metadata import CollectionMetadata

def _load_metadata(dataset_document):
    return CollectionMetadata.from_file(
        dataset_document.dataverse_details.local_metadata_path)