def test_subqueries_query(self):
    """Parse aggregate queries that select from an aliased subquery.

    Nothing is asserted about the parsed result; the test passes as long
    as ``QueryParser(schema).query`` returns without raising.
    """
    statements = (
        # outer aggregate qualifies one column with the subquery alias
        'SELECT SUM(subquery.Store), SUM(avg_price) FROM (SELECT Store, Temperature, AVG(table1.Fuel_Price) AS avg_price FROM features AS table1 GROUP BY Store, Temperature) AS subquery GROUP BY Temperature;',
        # same statement with unqualified outer columns
        'SELECT SUM(Store), SUM(avg_price) FROM (SELECT Store, Temperature, AVG(table1.Fuel_Price) AS avg_price FROM features AS table1 GROUP BY Store, Temperature) AS subquery GROUP BY Temperature;',
        # ungrouped outer aggregate over a grouped subquery
        'SELECT SUM(avg_price) FROM (SELECT AVG(Fuel_Price) AS avg_price FROM features GROUP BY IsHoliday) AS subquery;',
    )
    for statement in statements:
        # A fresh parser per statement, as each query is independent.
        QueryParser(schema).query(statement)
示例#2
0
 def test_cast_float(self):
     """Check that a CAST(... AS FLOAT) fragment round-trips and yields a float.

     The parsed expression must print back to the same text (ignoring
     spaces) and evaluate to the float ``2.0`` with no bindings.
     """
     source = "CAST(EXTRACT(WEEKDAY FROM CAST('2017-05-10 09:01:01' AS TIMESTAMP)) AS FLOAT)"
     parsed = QueryParser().parse_expression(source)
     # Strip spaces on both sides so formatting differences are ignored.
     rendered = str(parsed)
     assert source.replace(' ', '') == rendered.replace(' ', '')
     value = parsed.evaluate({})
     assert isinstance(value, float)
     assert value == 2.0
示例#3
0
 def test_check_thresholds_gauss(self):
     """Compare ``private_reader.tau`` against a formula-derived threshold.

     For every combination of epsilon, maximum contribution and delta, a
     PrivateReader is built with ``Mechanism.gaussian`` mapped to
     ``Stat.threshold``, the parsed query is executed, and the reader's
     ``tau`` is checked against the expected value within a loose
     tolerance.
     """
     # check tau for various privacy parameters
     epsilon_values = [0.1, 2.0]
     contribution_values = [1, 3]
     delta_values = [10E-5, 10E-15]
     sql = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
     reader = PandasReader(df, schema)
     parser = QueryParser(schema)
     parsed = parser.query(sql)
     for eps in epsilon_values:
         for d in contribution_values:
             for delta in delta_values:
                 privacy = Privacy(epsilon=eps, delta=delta)
                 privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
                 # using slightly different formulations of same formula from different papers
                 # make sure private_reader round-trips
                 scale_root = math.sqrt(2 * math.log(1.25 / delta))
                 noise_scale = math.sqrt(d) * scale_root / eps
                 tail_root = math.sqrt(
                     2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                 expected_tau = 1 + noise_scale * tail_root
                 # NOTE(review): copy.copy is presumably shallow for this
                 # metadata type, so the max_ids write below may also be
                 # visible through the module-level schema -- confirm.
                 schema_c = copy.copy(schema)
                 schema_c["PUMS.PUMS"].max_ids = d
                 private_reader = PrivateReader(
                     reader, metadata=schema_c, privacy=privacy)
                 assert private_reader._options.max_contrib == d
                 # Executed for its effect on the reader; the rows are unused.
                 private_reader._execute_ast(parsed)
                 assert math.isclose(
                     private_reader.tau,
                     expected_tau,
                     rel_tol=0.03,
                     abs_tol=2,
                 )
 def test_join_query(self):
     """Parse COUNT queries over ``sales`` with differently qualified columns.

     Only successful parsing is checked; the parsed results are discarded.
     """
     statements = (
         # unqualified column
         'SELECT COUNT(Store), COUNT(*) FROM sales',
         # column qualified by a table alias
         'SELECT COUNT(table1.Store), COUNT(*) FROM sales AS table1',
         # column qualified by the table name
         'SELECT COUNT(sales.Store), COUNT(*) FROM sales',
     )
     for statement in statements:
         QueryParser(schema).query(statement)
示例#5
0
 def test_iif(self):
     """Check IIF evaluation for a true condition, a false one, and a string fallback."""
     parser = QueryParser()
     numeric_iif = parser.parse_expression("IIF(x <= 5, y, 0)")
     env = {'x': 5, 'y': 10, 'z': 12}
     assert numeric_iif.evaluate(env) == 10
     env["x"] = 6
     assert numeric_iif.evaluate(env) == 0
     # x is still 6 here, so the third argument is the expected result.
     string_iif = parser.parse_expression("IIF(x <= 5, y, 'string')")
     assert string_iif.evaluate(env) == "string"
示例#6
0
def test_rewriting():
    """Check that each rewritten query survives a print/parse round trip.

    Every entry of the module-level ``queries`` is parsed and rewritten;
    the rewritten query is then rendered with ``str``, parsed again, and
    compared with the rewritten query for equality.

    Raises:
        ValueError: if parsing or rewriting a query fails. The original
            error is chained as the cause.
    """
    for query in queries:
        try:
            # ``query`` is rebound to the parsed form, so the error message
            # shows the raw entry if parsing fails and the parsed query if
            # rewriting fails.
            query = QueryParser(metadata).query(str(query))
            dp_query = Rewriter(metadata).query(query)
        except Exception as err:
            # Catch Exception rather than everything so KeyboardInterrupt
            # and SystemExit are not converted into ValueError.
            raise ValueError(
                f"Query parse and rewrite failed: {query}") from err
        parsed_dp_query = QueryParser(metadata).query(str(dp_query))
        assert dp_query == parsed_dp_query
示例#7
0
    def test_ast_attach_nullable_true(self):
        """Check that the parsed column expressions report ``nullable`` as true."""
        count_sql = 'SELECT COUNT("IsHoliday") FROM sales'
        count_query = QueryParser(metadata).query(count_sql)
        first_expression = count_query._select_symbols[0].expression
        table_column = first_expression.xpath_first('//TableColumn')
        assert table_column.nullable == True

        grouped_sql = 'SELECT SUM(Store), "Date" as d FROM features GROUP BY "date"'
        grouped_query = QueryParser(metadata).query(grouped_sql)
        # The aliased column is looked up by its alias.
        aliased = grouped_query._named_symbols['d']
        assert aliased.expression.nullable == True
示例#8
0
 def test_simple_case(self):
     """Check a simple CASE expression for both WHEN branches and the ELSE."""
     parser = QueryParser()
     case_expr = parser.parse_expression(
         "CASE x WHEN 5 THEN 'five' WHEN 6 THEN 'six' ELSE '' END")
     # (value bound to x, expected result); 7 matches no WHEN branch
     expectations = ((5, "five"), (6, 'six'), (7, ''))
     for x_value, expected in expectations:
         assert case_expr.evaluate({'x': x_value}) == expected
示例#9
0
 def test_variable_replace(self):
     """Check a simple CASE whose THEN branches are bound variables."""
     parser = QueryParser()
     case_expr = parser.parse_expression(
         "CASE x WHEN 5 THEN y WHEN 6 THEN z ELSE 0 END")
     env = {'x': 5, 'y': 10, 'z': 12}
     # (value bound to x, expected result); 1 matches no WHEN branch
     for x_value, expected in ((5, 10), (6, 12), (1, 0)):
         env['x'] = x_value
         assert case_expr.evaluate(env) == expected
示例#10
0
    def test_ast_attach_sens(self):
        """Check ``sensitivity()`` for SUM and COUNT expressions over ``features``."""
        sum_sql = 'SELECT SUM("Temperature"), SUM(features."Store") AS store FROM features'
        sum_query = QueryParser(metadata).query(sum_sql)
        first_sum = sum_query._select_symbols[0].expression
        store_sum = sum_query._named_symbols['store'].expression
        assert first_sum.sensitivity() == 75
        assert store_sum.sensitivity() == 150

        count_sql = 'SELECT COUNT(DISTINCT "Temperature"), COUNT(features."Store") AS store FROM features'
        count_query = QueryParser(metadata).query(count_sql)
        first_count = count_query._select_symbols[0].expression
        store_count = count_query._named_symbols['store'].expression
        # Both COUNT forms are expected to report a sensitivity of 1.
        assert first_count.sensitivity() == 1
        assert store_count.sensitivity() == 1
示例#11
0
 def test_full_case(self):
     """Check a searched CASE (conditions in each WHEN) including its ELSE."""
     parser = QueryParser()
     case_expr = parser.parse_expression(
         "CASE WHEN x <= 5 THEN y WHEN x > 6 THEN 0 ELSE z END")
     env = {'x': 5, 'y': 10, 'z': 12}
     # (value bound to x, expected result); x == 6 satisfies neither WHEN
     for x_value, expected in ((5, 10), (6, 12), (10, 0)):
         env['x'] = x_value
         assert case_expr.evaluate(env) == expected
示例#12
0
 def test_string_bound(self):
     """Check a simple CASE whose results are variables bound to strings."""
     parser = QueryParser()
     case_expr = parser.parse_expression(
         "CASE x WHEN 5 THEN y WHEN 6 THEN z ELSE q END")
     env = {'x': 5, 'y': 'ten', 'z': 'twelve', 'q': 'zero'}
     # (value bound to x, expected result); 1 matches no WHEN branch
     for x_value, expected in ((5, "ten"), (6, "twelve"), (1, "zero")):
         env['x'] = x_value
         assert case_expr.evaluate(env) == expected
示例#13
0
 def runValidate(self):
     """Validate every query in ``self.queries`` against ``metadata``.

     Parsing happens outside the ``try`` block, so a parse error
     propagates unchanged; only validation errors are wrapped.

     Raises:
         ValueError: if ``Validate().validateQuery`` fails for a query.
             The original error is chained as the cause.
     """
     for qs in self.queries:
         q = QueryParser(metadata).query(qs)
         try:
             Validate().validateQuery(q, metadata)
         except Exception as err:
             # Catch Exception rather than everything so KeyboardInterrupt
             # and SystemExit are not converted into ValueError.
             raise ValueError(
                 f"Validation failed for query: {str(q)}") from err
示例#14
0
 def runValidate(self):
     """Parse and validate every query in ``self.queries`` against ``metadata``.

     Raises:
         ValueError: if parsing or validation fails for a query. The
             message shows the parsed query when parsing succeeded and
             the raw query string otherwise; the original error is
             chained as the cause.
     """
     for qs in self.queries:
         # Reset per iteration so the handler never reports a stale or
         # unbound parsed query when parsing itself is what failed.
         q = None
         try:
             q = QueryParser(metadata).query(qs)
             Validate().validateQuery(q, metadata)
         except Exception as e:
             shown = qs if q is None else str(q)
             raise ValueError(
                 f"Parse and validate failed for query: {shown}") from e
示例#15
0
 def test_execute_with_dpsu(self):
     """With ``use_dpsu`` set, ``_get_reader`` must not return the base reader."""
     # NOTE(review): copy.copy is presumably shallow for this metadata type,
     # so the flag may also be set on the module-level schema -- confirm.
     dpsu_schema = copy.copy(schema)
     dpsu_schema["PUMS.PUMS"].use_dpsu = True
     base_reader = PandasReader(df, dpsu_schema)
     # NOTE(review): the third positional argument is a bare float; verify
     # it is what PrivateReader expects in that position.
     private_reader = PrivateReader(base_reader, dpsu_schema, 1.0)
     assert private_reader._options.use_dpsu == True
     sql = "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married"
     query = QueryParser(dpsu_schema).queries(sql)[0]
     chosen_reader = private_reader._get_reader(query)
     assert chosen_reader is not private_reader.reader
示例#16
0
 def test_choose(self):
     """Check CHOOSE for valid, out-of-range, NULL, string and computed indexes."""
     qp = QueryParser()

     letters = qp.parse_expression("CHOOSE(x, 'a', 'b', 'c')")
     bindings = {'x': 3, 'y': 10, 'z': 12}
     assert letters.evaluate(bindings) == "c"
     bindings["x"] = 1
     assert letters.evaluate(bindings) == 'a'
     # Indexes outside 1..3 are expected to yield None.
     bindings["x"] = 0
     assert letters.evaluate(bindings) is None
     bindings["x"] = 10
     assert letters.evaluate(bindings) is None

     mixed = qp.parse_expression("CHOOSE(x, 'a', 5, NULL)")
     bindings = {'x': 3, 'y': 10, 'z': 12}
     # The third choice is the NULL literal.
     assert mixed.evaluate(bindings) is None
     # The index is supplied as a string here.
     bindings["x"] = "2"
     assert mixed.evaluate(bindings) == 5

     computed = qp.parse_expression("CHOOSE(x % 2 + 1, NULL, 5)")
     # 13 % 2 + 1 == 2, which selects the second choice.
     bindings["x"] = 13
     assert computed.evaluate(bindings) == 5
示例#17
0
 def test_execute_without_dpsu(self):
     """With ``use_dpsu`` cleared, ``_get_reader`` must return the base reader."""
     # NOTE(review): copy.copy is presumably shallow for this metadata type,
     # so the flag may also be cleared on the module-level schema -- confirm.
     plain_schema = copy.copy(schema)
     plain_schema["PUMS.PUMS"].use_dpsu = False
     base_reader = PandasReader(df, plain_schema)
     budget = Privacy(epsilon=1.0)
     private_reader = PrivateReader(base_reader, plain_schema, privacy=budget)
     assert private_reader._options.use_dpsu == False
     sql = "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married"
     query = QueryParser(plain_schema).queries(sql)[0]
     chosen_reader = private_reader._get_reader(query)
     assert chosen_reader is private_reader.reader
示例#18
0
 def test_same_colname(self):
     """Check the symbols for two same-named columns from different tables.

     Each symbol is looked up by its quoted ``"<table>_<column>"`` name
     and must carry the matching table and column names.
     """
     sql = 'SELECT sales."Store", features."Store" FROM sales, features'
     parsed = QueryParser(metadata).query(sql)
     symbols = parsed._named_symbols

     sales_column = symbols['"sales_Store"'].expression
     assert sales_column.tablename == 'sales'
     assert sales_column.colname == 'Store'

     features_column = symbols['"features_Store"'].expression
     assert features_column.tablename == 'features'
     assert features_column.colname == 'Store'
示例#19
0
def preprocess_df_from_query(schema, df, query_string):
    """
    Build a two-column frame from ``df`` driven by the query's grouping keys.

    The result holds the first key column of the queried table (copied
    from ``df``) and a ``"group_cols"`` column built from ``df``'s values
    in the query's grouping-expression columns.
    """
    parser = QueryParser(schema)
    # The string is parsed twice: once via query() for the source table,
    # once via queries() for the grouping expressions.
    # NOTE(review): presumably both calls yield the same AST -- confirm
    # before merging them into one parse.
    single_query = parser.query(query_string)
    first_ast = parser.queries(query_string)[0]

    grouping_names = []
    for grouping in first_ast.agg.groupingExpressions:
        grouping_names.append(grouping.expression.name)

    source_table = single_query.source.find_node(Table).name
    id_column = schema[source_table].key_cols()[0].name

    result = pd.DataFrame()
    result[id_column] = df[id_column]
    grouped_values = df[grouping_names].values.tolist()
    result["group_cols"] = tuple(grouped_values)
    return result
示例#20
0
 def runBuild(self, exc):
     """Assert that parsing each query in ``self.queries`` raises ``exc``.

     Args:
         exc: exception class (or tuple of classes) that
             ``QueryParser().query`` is expected to raise for every query.

     Raises:
         AssertionError: if a query parses without raising ``exc``. The
             offending query is also printed.
     """
     # Name the exception actually expected instead of assuming ValueError;
     # repr() covers the tuple-of-classes case.
     expected = getattr(exc, "__name__", repr(exc))
     for query in self.queries:
         failed = False
         try:
             QueryParser().query(query)
         except exc:
             failed = True
         if failed:
             continue
         message = "{0} should have thrown {1}, but succeeded".format(
             query, expected)
         print(message)
         raise AssertionError(message)
示例#21
0
 def runBuild(self):
     """Parse each query in ``self.queries`` and check it prints back unchanged.

     The comparison ignores spaces, newlines and letter case. Any failure
     inside the loop body, including a failed assertion, is re-raised as
     ``ValueError``.
     """
     for query in self.queries:
         try:
             parsed = QueryParser().query(query)
             self.walk_children(parsed)
             expected_text = query.replace(' ', '').replace('\n', '').lower()
             actual_text = str(parsed).replace(' ', '').replace('\n', '').lower()
             assert expected_text == actual_text
             self.runParseAgain(parsed)
         except Exception as e:
             raise ValueError(f"Parse error for {str(query)}: {str(e)}")
示例#22
0
 def test_with_censor_dims(self):
     """Check which rewritten outer expression is named ``keycount``.

     After rewriting a query with COUNT(*) and COUNT(DISTINCT pid), the
     first outer expression must not be named 'keycount' and the second
     must be.
     """
     meta = Metadata.from_file(meta_path)
     frame = pd.read_csv(csv_path)
     reader = PandasReader(frame, meta)
     budget = Privacy(epsilon=3.0)
     private_reader = PrivateReader(reader, meta, privacy=budget)
     sql = "SELECT COUNT (*) AS foo, COUNT(DISTINCT pid) AS bar FROM PUMS.PUMS"
     parsed = QueryParser(meta).query(sql)
     # Only the outer query is inspected.
     _, outer = private_reader._rewrite_ast(parsed)
     outer_columns = outer.select.namedExpressions
     foo_name = outer_columns[0].expression.expression.name
     assert foo_name != 'keycount'
     bar_name = outer_columns[1].expression.expression.name
     assert bar_name == 'keycount'
示例#23
0
 def runRewrite(self):
     """Rewrite each query in ``self.queryBatch`` and compare symbol types.

     Both the original and rewritten query must have symbols, and their
     select symbols must agree pairwise on expression type. Any failure
     inside the loop body is re-raised as ``ValueError``.
     """
     batch = QueryParser(metadata).queries(self.queryBatch)
     for q in batch:
         try:
             rewritten = Rewriter(metadata).query(q)
             assert q.has_symbols()
             assert rewritten.has_symbols()
             symbol_pairs = zip(q._select_symbols, rewritten._select_symbols)
             # Built as a list so every pair is compared before all() runs.
             type_matches = [
                 before.expression.type() == after.expression.type()
                 for before, after in symbol_pairs
             ]
             assert all(type_matches)
         except Exception as e:
             raise ValueError(f"Rewrite error for query: {str(q)}")
示例#24
0
 def test_viz_query_rewritten(self):
     """Check that both halves of a rewritten query visualize to a Digraph."""
     sql = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
     parsed_query = QueryParser(schema).query(sql)
     reader = PandasReader(df, schema)
     budget = Privacy(epsilon=1.0)
     private_reader = PrivateReader(reader, schema, privacy=budget)
     inner, outer = private_reader._rewrite_ast(parsed_query)
     # Outer first, then inner.
     # To inspect a graph manually:
     #   graph.render('ast_digraph', view=True, cleanup=True)
     for tree in (outer, inner):
         graph = tree.visualize(n_trunc=30)
         assert isinstance(graph, Digraph)
示例#25
0
 def test_reuse_expression(self):
     """Check that AVG, SUM and COUNT of one column rewrite to two columns.

     The outer query's column nodes must have exactly two unique names,
     'count_age' and 'sum_age'.
     """
     meta = Metadata.from_file(meta_path)
     frame = pd.read_csv(csv_path)
     reader = PandasReader(frame, meta)
     budget = Privacy(epsilon=3.0)
     private_reader = PrivateReader(reader, meta, privacy=budget)
     sql = 'SELECT AVG(age), SUM(age), COUNT(age) FROM PUMS.PUMS'
     # The parse result is unused; the rewrite below takes the raw string.
     QueryParser(meta).query(sql)
     _, outer = private_reader._rewrite(sql)
     column_nodes = outer.select.namedExpressions.find_nodes(Column)
     names = unique([node.name for node in column_nodes])
     assert len(names) == 2
     assert 'count_age' in names
     assert 'sum_age' in names
示例#26
0
 def test_empty_result_count_typed_notau_prepost(self):
     """Run a COUNT(*) with ``censor_dims`` disabled and check the result length.

     The query is executed once, then three more times; each of the three
     checked runs must return a result of length 2.
     """
     # Deep copy, so the module-level schema keeps its own censor_dims.
     schema_all = copy.deepcopy(schema)
     schema_all['PUMS.PUMS'].censor_dims = False
     # The reader and parser use the original schema; only the
     # PrivateReader receives the modified copy.
     reader = PandasReader(df, schema)
     sql = "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
     query = QueryParser(schema).queries(sql)[0]
     budget = Privacy(epsilon=1.0)
     private_reader = PrivateReader(reader, schema_all, privacy=budget)
     # One unchecked execution before the checked runs.
     private_reader._execute_ast(query, True)
     for _ in range(3):
         print(private_reader._options)
         rows = private_reader._execute_ast(query, True)
         print("empty query")
         print(rows)
         assert len(rows) == 2
示例#27
0
from snsql.metadata import Metadata
from snsql.sql import PrivateReader
from snsql.sql.privacy import Privacy
from snsql.sql.parse import QueryParser

# Locate the repository root by asking git, then derive the dataset paths.
git_root_dir = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, "datasets", "PUMS_pid.yaml")
csv_path = os.path.join(git_root_dir, "datasets", "PUMS_pid.csv")

# Load the metadata and data once at import time; the tests share them.
meta = Metadata.from_file(meta_path)
pums = pd.read_csv(csv_path)
query = 'SELECT AVG(age), STD(age), VAR(age), SUM(age), COUNT(age) FROM PUMS.PUMS GROUP BY sex'
q = QueryParser(meta).query(query)

# Two alphas are configured here; the tests request accuracy at both.
privacy = Privacy(alphas=[0.01, 0.05], delta=1 / (math.sqrt(100) * 100))
priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta)
# _rewrite takes the raw query string and returns the (subquery, root) pair.
subquery, root = priv._rewrite(query)

acc = Accuracy(root, subquery, privacy)


class TestAccuracy:
    """Accuracy checks that use the module-level ``acc`` object."""

    def test_count_accuracy(self):
        """Check count error bounds at alpha 0.05 and 0.01.

        The error at alpha=0.01 must be larger than the one at alpha=0.05.
        """
        narrow = acc.count(alpha=0.05)
        assert 0.5 < narrow < 7.53978
        wide = acc.count(alpha=0.01)
        assert wide < 9.909
        assert wide > narrow
示例#28
0
 def test_viz_query(self):
     """Check that a query parsed without metadata visualizes to a Digraph."""
     sql = "SELECT SUM(age) AS my_sum FROM pums.pums GROUP BY age"
     tree = QueryParser().query(sql)
     # Query nodes are given the colour 'red'.
     palette = {Query: 'red'}
     rendered = tree.visualize(color_types=palette, n_trunc=30)
     assert isinstance(rendered, Digraph)
示例#29
0
 def test_viz_query_symbols(self):
     """Check that a query parsed with ``schema`` visualizes to a Digraph."""
     sql = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
     tree = QueryParser(schema).query(sql)
     # Table nodes are given the colour 'red'.
     palette = {Table: 'red'}
     rendered = tree.visualize(color_types=palette, n_trunc=5)
     assert isinstance(rendered, Digraph)
示例#30
0
def qp(query_string):
    """Parse ``query_string`` with a QueryParser built without metadata.

    Returns whatever ``QueryParser().query`` returns for the string.
    """
    parser = QueryParser()
    return parser.query(query_string)