Example #1
def test_check_thresholds_gauss(self):
    # Check tau (the censoring threshold) for various privacy parameters.
    # Assumes math, PandasReader, QueryParser, PrivateReader, and the module-level
    # schema and df fixtures are available in the surrounding test module.
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(schema, df)
    qp = QueryParser(schema)
    q = qp.query(query)
    for eps in epsilons:
        for max_contrib in max_contribs:
            for delta in deltas:
                # Slightly different formulations of the same formula, taken from
                # different papers; make sure PrivateReader round-trips.
                gaus_scale = math.sqrt(max_contrib) * math.sqrt(
                    2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(
                    2 * math.log(max_contrib / math.sqrt(2 * math.pi * delta)))
                private_reader = PrivateReader(schema, reader, eps, delta)
                q.max_ids = max_contrib  # hijack the AST
                r = private_reader.execute_ast(q)
                assert math.isclose(private_reader.tau,
                                    gaus_rho,
                                    rel_tol=0.03,
                                    abs_tol=2)
Example #2
def preprocess_df_from_query(schema, df, query_string):
    """
    Return a dataframe with user_id | tuple of grouping-key values, based on the
    query's GROUP BY columns.
    """
    # Assumes pandas (pd) plus QueryParser and the Table AST node from the
    # SQL parsing package are imported in the surrounding module.
    qp = QueryParser(schema)
    q = qp.query(query_string)
    queries = qp.queries(query_string)
    query_ast = queries[0]

    # Names of the GROUP BY columns, taken from the parsed query AST.
    group_cols = [
        ge.expression.name for ge in query_ast.agg.groupingExpressions
    ]
    # Key (user id) column from the schema of the queried table.
    table_name = q.source.find_node(Table).name
    key_col = schema[table_name].key_cols()[0].name

    preprocessed_df = pd.DataFrame()
    preprocessed_df[key_col] = df[key_col]
    # One tuple of grouping-key values per row, as described in the docstring.
    preprocessed_df["group_cols"] = [
        tuple(row) for row in df[group_cols].values.tolist()
    ]

    return preprocessed_df
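Below is a hypothetical usage sketch of preprocess_df_from_query; the input frame, the query, and the described output are illustrative assumptions, since the original snippet does not show how the function is called or what schema it is given.

# Hypothetical usage sketch: table and column names are illustrative stand-ins.
import pandas as pd

df = pd.DataFrame({
    "user_id": [1, 2, 3],
    "married": [0, 1, 1],
    "educ": [12, 16, 14],
})
query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married, educ"

# With a matching schema whose key column is user_id, each output row pairs that
# key with a tuple of the row's grouping-key values: (0, 12), (1, 16), (1, 14).
# preprocessed = preprocess_df_from_query(schema, df, query)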