def test_check_thresholds_gauss(self):
    """Check PrivateReader's ``tau`` against a directly computed Gaussian threshold.

    Sweeps every combination of epsilon, max contribution and delta. For each
    one a grouped COUNT query is run through a fresh ``PrivateReader`` and the
    reader's ``tau`` is compared (loosely) with the value computed here.
    """
    # check tau for various privacy parameters
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(df, schema)
    q = QueryParser(schema).query(query)
    for eps in epsilons:
        for d in max_contribs:
            for delta in deltas:
                # using slightly different formulations of same formula from different papers
                # make sure private_reader round-trips
                gaus_scale = math.sqrt(d) * math.sqrt(2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(
                    2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                # Deep copy so that overriding max_ids stays local to this
                # iteration; a shallow copy would share the table object with
                # the module-level schema and leak the override to other tests.
                schema_c = copy.deepcopy(schema)
                schema_c["PUMS.PUMS"].max_ids = d
                private_reader = PrivateReader(reader, schema_c, eps, delta)
                assert private_reader._options.max_contrib == d
                # The result is not inspected; the query is run before tau is
                # read (presumably execution is what populates tau — confirm).
                private_reader.execute_ast(q)
                assert math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2)
def runRewrite(self):
    """Rewrite each query of the batch and compare symbol types pairwise.

    Every parsed query and its rewritten form must report symbols, and the
    symbols paired up by ``zip`` must have equal ``type()`` values.
    """
    parsed_batch = QueryParser(metadata).queries(self.queryBatch)
    for q in parsed_batch:
        print(q)
        new_q = Rewriter(metadata).query(q)
        assert q.has_symbols()
        assert new_q.has_symbols()
        # Index 1 of each symbol entry holds the object whose type is compared.
        type_matches = [
            before[1].type() == after[1].type()
            for before, after in zip(q.m_symbols, new_q.m_symbols)
        ]
        assert all(type_matches)
def test_empty_result_count_typed_notau_prepost(self):
    """Run a filtered COUNT(*) repeatedly and check the result length.

    The query is executed once with the result discarded, then three more
    times; each of those results must have length 2.
    """
    count_sql = "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
    reader = PandasReader(df, schema)
    query = QueryParser(schema).queries(count_sql)[0]
    private_reader = PrivateReader(reader, schema, 1.0)
    # First run: outcome deliberately ignored.
    private_reader._execute_ast(query, True)
    for _ in range(3):
        trs = private_reader._execute_ast(query, True)
        assert len(trs) == 2
def test_execute_without_dpsu(self):
    """Check that with ``use_dpsu`` disabled the private reader uses its own reader.

    Builds a schema copy with ``use_dpsu = False`` on ``PUMS.PUMS``, verifies
    the option is reflected on the ``PrivateReader``, and asserts that
    ``_get_reader`` hands back the very reader object it was constructed with.
    """
    # Deep copy so that turning off use_dpsu does not also flip the flag on
    # the module-level schema (a shallow copy shares the table objects).
    schema_no_dpsu = copy.deepcopy(schema)
    schema_no_dpsu["PUMS.PUMS"].use_dpsu = False
    reader = PandasReader(df, schema_no_dpsu)
    private_reader = PrivateReader(reader, schema_no_dpsu, 1.0)
    assert private_reader._options.use_dpsu is False
    query = QueryParser(schema_no_dpsu).queries(
        "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
    assert private_reader._get_reader(query) is private_reader.reader
def preprocess_df_from_query(schema, df, query_string):
    """
    Returns a dataframe with user_id | tuple based on query grouping keys.

    Args:
        schema: Metadata used to parse the query and to look up the key
            column of the queried table.
        df: Source dataframe; must contain the key column and every column
            named in the query's GROUP BY clause.
        query_string: SQL text of a single query with grouping expressions.

    Returns:
        A new dataframe with two columns: the table's first key column
        (copied from ``df``) and ``"group_cols"``, whose entries are the
        per-row lists of grouping-column values.
    """
    # Parse once and reuse the AST for both the grouping keys and the table
    # lookup (the query used to be parsed twice).
    query_ast = QueryParser(schema).query(query_string)

    group_cols = [ge.expression.name for ge in query_ast.agg.groupingExpressions]
    table_name = query_ast.source.find_node(Table).name
    # Only the first key column of the table is used.
    key_col = schema[table_name].key_cols()[0].name

    preprocessed_df = pd.DataFrame()
    preprocessed_df[key_col] = df[key_col]
    preprocessed_df["group_cols"] = tuple(df[group_cols].values.tolist())
    return preprocessed_df
def runBuild(self):
    """Build each query, walk its tree and check it prints back to the same SQL.

    The comparison ignores spaces, newlines and letter case. Each built query
    is then handed to ``runParseAgain``.
    """
    def _squash(text):
        # Strip spaces and newlines, then lower-case, so only tokens are compared.
        return text.replace(' ', '').replace('\n', '').lower()

    for query in self.queries:
        q = QueryParser().query(query)
        self.walk_children(q)
        assert _squash(query) == _squash(str(q))
        self.runParseAgain(q)
def test_viz_query_rewritten(self):
    """Visualize both halves of a rewritten query and expect Digraph objects.

    ``PrivateReader.rewrite_ast`` yields an (inner, outer) pair; the outer
    query is visualized first, then the inner one.
    """
    sql = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
    parsed_query = QueryParser(schema).query(sql)
    private_reader = PrivateReader(PandasReader(df, schema), schema, 1.0)
    inner, outer = private_reader.rewrite_ast(parsed_query)

    outer_graph = outer.visualize(n_trunc=30)
    assert isinstance(outer_graph, Digraph)
    # To inspect by hand: outer_graph.render('ast_digraph', view=True, cleanup=True)

    inner_graph = inner.visualize(n_trunc=30)
    assert isinstance(inner_graph, Digraph)
def runBuild(self, exc):
    """Assert that building every query in ``self.queries`` raises ``exc``.

    Args:
        exc: Exception class (or tuple of classes) each query is expected
            to raise when built.

    Raises:
        AssertionError: At the first query that builds without raising
            ``exc``. The offending query is also printed.
    """
    # A tuple of exception classes has no __name__, so fall back to its repr.
    expected = getattr(exc, "__name__", repr(exc))
    for query in self.queries:
        try:
            QueryParser().query(query)
        except exc:
            continue
        # Name the exception that was actually expected instead of always
        # saying "ValueError".
        message = "{0} should have thrown {1}, but succeeded".format(query, expected)
        print(message)
        raise AssertionError(message)
def test_simple(self):
    """Parse a trivial SELECT, first parse-only and then as a built query."""
    sql = "SELECT * FROM FOO;"
    # try parsing without building
    QueryParser().parse_only(sql)
    # then the full build; passing means no exception was raised
    QueryParser().query(sql)
def test_viz_query_symbols(self):
    """Visualize a schema-aware parsed query with Table nodes colored red."""
    sql = "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age"
    parser = QueryParser(schema)
    parsed_query = parser.query(sql)
    coloring = {Table: 'red'}
    graph = parsed_query.visualize(color_types=coloring, n_trunc=5)
    assert isinstance(graph, Digraph)
def test_viz_query(self):
    """Visualize a query parsed without metadata, with Query nodes colored red."""
    sql = "SELECT SUM(age) AS my_sum FROM pums.pums GROUP BY age"
    parser = QueryParser()
    parsed_query = parser.query(sql)
    coloring = {Query: 'red'}
    graph = parsed_query.visualize(color_types=coloring, n_trunc=30)
    assert isinstance(graph, Digraph)
def test_unsupported(self):
    """A UNION ALL query must be rejected with ValueError."""
    union_sql = "SELECT * FROM FOO UNION ALL SELECT * FROM BAR"
    with pytest.raises(ValueError):
        QueryParser().query(union_sql, True)
def test_sum_no_rows_exact_typed(self):
    """SUM over the ``age > 100`` filter must come back as ``None``.

    Runs the query through the plain reader's typed execution path and checks
    the first value of column ``c``.
    """
    reader = PandasReader(df, schema)
    query = QueryParser(schema).queries(
        "SELECT SUM(age) as c FROM PUMS.PUMS WHERE age > 100")[0]
    trs = reader.execute_ast_typed(query)
    # Identity check: `== None` defers to the value's own __eq__, which can
    # give a misleading answer for objects that overload equality.
    assert trs['c'][0] is None
def test_tsql_escaped_error(self):
    """Bracket-escaped identifiers must fail with a message starting 'Lexer error'."""
    bracketed_sql = "SELECT [FOO.BAR] FROM HR;"
    with pytest.raises(ValueError, match="^Lexer error"):
        QueryParser().parse_only(bracketed_sql)
def test_count_no_rows_exact_typed(self):
    """COUNT(*) over the ``age > 100`` filter must be exactly 0.

    The query is executed through the plain reader's dataframe path and the
    first value of column ``c`` is checked.
    """
    count_sql = "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
    query = QueryParser(schema).queries(count_sql)[0]
    reader = PandasReader(df, schema)
    trs = reader._execute_ast_df(query)
    first_count = trs['c'][0]
    assert first_count == 0
def runValidate(self):
    """Expect every query in ``self.queries`` to fail with ValueError.

    The ValueError may come from parsing or from ``validateSingle``; both
    calls sit inside the same ``pytest.raises`` scope.
    """
    for qs in self.queries:
        print(qs)
        with pytest.raises(ValueError):
            parsed = QueryParser(metadata).query(qs)
            self.validateSingle(parsed)
def runValidate(self):
    """Parse and validate every query in ``self.queries``.

    Nothing is asserted explicitly; any exception raised by parsing or by
    ``validateQuery`` propagates to the caller.
    """
    for qs in self.queries:
        print(qs)
        parsed = QueryParser(metadata).query(qs)
        validator = Validate()
        validator.validateQuery(parsed, metadata)
def runParse(self):
    """Print and parse (without building) every query in ``self.queries``."""
    for sql in self.queries:
        print(sql)
        parser = QueryParser()
        parser.parse_only(sql)
def test_sum_noisy(self):
    """SUM(age) over the whole table must exceed 1000.

    Executed through the plain reader's dataframe path; the first value of
    column ``age_total`` is checked.
    """
    sum_sql = "SELECT SUM(age) as age_total FROM PUMS.PUMS"
    query = QueryParser(schema).queries(sum_sql)[0]
    reader = PandasReader(df, schema)
    trs = reader._execute_ast_df(query)
    age_total = trs['age_total'][0]
    assert age_total > 1000
def test_bad_token(self):
    """An unknown keyword must fail with a message starting 'Bad token'."""
    garbled_sql = "SELECT * FROM FOO WHENCE ZIP ZAG"
    with pytest.raises(ValueError, match="^Bad token"):
        QueryParser().parse_only(garbled_sql)
def test_rewriting():
    """Rewritten queries must survive a print-and-reparse round trip.

    Each entry of ``queries`` is parsed from its string form, rewritten, and
    the rewritten query is compared with the result of parsing its own text.
    """
    for source_query in queries:
        parsed = QueryParser(metadata).query(str(source_query))
        dp_query = Rewriter(metadata).query(parsed)
        reparsed = QueryParser(metadata).query(str(dp_query))
        assert dp_query == reparsed
def test_batch13(self):
    """The ``parse/test.sql`` batch under ``testpath`` must parse into 13 queries."""
    # Read inside a context manager so the file handle is closed promptly
    # rather than left for the garbage collector.
    with open(testpath + "parse/" + "test.sql") as sql_file:
        batch_text = sql_file.read()
    qb = QueryParser().queries(batch_text)
    assert len(qb) == 13
def qp(query_string):
    """Parse ``query_string`` with a fresh, metadata-free QueryParser and return the result."""
    parser = QueryParser()
    return parser.query(query_string)
def runParseAgain(self, q):
    """Render the AST as text, parse that text again, and require an equal AST."""
    rendered = str(q)
    reparsed = QueryParser().query(rendered)
    assert q == reparsed