Example #1
 def test_bounds1b_norm(self):
     # check that analytic and bootstrap bounds work with tiny epsilon
     g = Gaussian(0.05, 1 / 125.0)  # epsilon of 0.05, very wide bounds
     lower, upper = g.bounds(False)[0]  # analytic bounds
     lower2, upper2 = g.bounds(True)[0]  # bootstrap bounds
     assert (lower < upper)
     assert (lower2 < upper2)
Example #2
 def test_bounds1_norm(self):
     # check that analytic and bootstrap bounds work
     g = Gaussian(0.5, 1 / 125.0)  # epsilon of 0.5
     lower, upper = g.bounds(False)[0]  # analytic bounds
     lower2, upper2 = g.bounds(True)[0]  # bootstrap bounds
     assert (lower < upper)
     assert (lower2 < upper2)
Example #3
 def test_bounds1c_norm(self):
     # check that analytic and bootstrap bounds work
     # use very small bounds to make sure order doesn't swap
     g = Gaussian(1.0, interval_widths=[0.1])  # epsilon of 1.0
     lower, upper = g.bounds(False)[0]  # analytic bounds
     lower2, upper2 = g.bounds(True)[0]  # bootstrap bounds
     assert (lower <= upper)
     assert (lower2 <= upper2)
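With an interval width this small, the analytic and bootstrap bounds can coincide to within floating-point precision, which is why this test asserts <= where the neighboring tests assert <.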
Example #4
 def test_intervals_norm(self):
     g = Gaussian(4.0, interval_widths=[0.95, 0.5])
     vals = [100, 3333, 99999]
     r_n = g.release(vals, False)
     r = g.release(vals, True)
     assert (r_n.accuracy is None)
     assert (r.accuracy is not None)
     r0, r1 = r.intervals
     assert (r1.inside(r0))
     assert (not r1.contains(r0))
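The assertions above double as a usage pattern for nested intervals. A minimal sketch, reusing only the calls exercised in this test (Gaussian, release, intervals, inside) and assuming Gaussian is imported as in the surrounding tests; the variable names are illustrative:

g = Gaussian(4.0, interval_widths=[0.95, 0.5])
r = g.release([100, 3333, 99999], True)  # True also computes accuracy/intervals
outer, inner = r.intervals               # returned in interval_widths order
print(inner.inside(outer))               # True: the 0.5 interval nests inside the 0.95 one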
Example #7
 def dp_mechanism_count(self, df, colname):
     """
     Returns repeatedly applied noise adding mechanisms
     like Laplace and Gaussian available in WhiteNoise-System to count query
     """
     exact_count = df[colname].count()
     mech = Laplace(self.epsilon)
     if (self.mechanism == "Gaussian"):
         mech = Gaussian(self.epsilon)
     return np.array([
         mech.release([exact_count]).values[0]
         for i in range(self.repeat_count)
     ])
Example #8
 def dp_mechanism_sum(self, df, colname):
     """
     Returns repeatedly applied noise adding mechanisms
     like Laplace and Gaussian available in WhiteNoise-System to sum query.
     Sensitivity is set as absolute difference between max and min values
     within the column
     """
     exact_sum = df[colname].sum()
     M = float(abs(max(df[colname]) - min(df[colname])))
     mech = Laplace(self.epsilon, sensitivity=M)
     if (self.mechanism == "Gaussian"):
         # the Gaussian mechanism needs the same column sensitivity as Laplace
         mech = Gaussian(self.epsilon, sensitivity=M)
     return np.array([
         mech.release([exact_sum]).values[0]
         for i in range(self.repeat_count)
     ])
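For orientation, a sketch of how these helpers might be driven. The harness class and its defaults are hypothetical; only epsilon, repeat_count and mechanism are read by the methods above, and the import path for the mechanisms is an assumption:

import numpy as np
import pandas as pd
# assumed import path; adjust to the package layout actually in use:
# from opendp.whitenoise.mechanisms import Laplace, Gaussian

class MechanismHarness:
    """Hypothetical harness carrying the attributes the methods above read."""
    def __init__(self, epsilon=1.0, repeat_count=100, mechanism="Laplace"):
        self.epsilon = epsilon
        self.repeat_count = repeat_count
        self.mechanism = mechanism

    # same pattern as dp_mechanism_count above, condensed
    def dp_mechanism_count(self, df, colname):
        exact_count = df[colname].count()
        mech = (Gaussian(self.epsilon) if self.mechanism == "Gaussian"
                else Laplace(self.epsilon))
        return np.array([mech.release([exact_count]).values[0]
                         for _ in range(self.repeat_count)])

df = pd.DataFrame({"age": [23, 45, 31, 62, 50]})
harness = MechanismHarness(epsilon=1.0, repeat_count=100)
noisy = harness.dp_mechanism_count(df, "age")
print(noisy.mean())  # should hover near the exact count of 5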
Example #9
 def test_bounds2_norm(self):
     # check that outer bounds enclose inner bounds
     # epsilon of 4.0; the 0.97 interval encloses the 0.95 interval
     g = Gaussian(4.0, interval_widths=[0.95, 0.97])
     lower1, upper1 = g.bounds(False)[0]
     lower1b, upper1b = g.bounds(True)[0]
     lower2, upper2 = g.bounds(False)[1]
     lower2b, upper2b = g.bounds(True)[1]
     assert (lower2 < lower1)
     assert (upper2 > upper1)
     assert (lower2b < lower1b)
     assert (upper2b > upper1b)
Example #10
 def test_simple_norm(self):
     g = Gaussian(0.1)  # epsilon of 0.1
     x = range(10000)
     y = g.release(x).values
     assert (round(np.sum(x) / 10E+6) == round(np.sum(y) / 10E+6))
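As a quick check of the arithmetic: np.sum(range(10000)) is 49,995,000, which rounds to 5 at the 10E+6 scale, so the test passes as long as the total injected noise shifts the sum by less than roughly five million.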
Example #11
    def _execute_ast(self, query, cache_exact=False):
        if isinstance(query, str):
            raise ValueError("Please pass AST to _execute.")

        subquery, query = self.rewrite_ast(query)
        max_contrib = self.options.max_contrib if self.options.max_contrib is not None else 1
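        # tau is the censoring threshold applied to noisy key counts below;
        # e.g. (illustrative numbers, not from the source) max_contrib=1,
        # epsilon=1.0, delta=1e-5 gives tau = 1 - ln(2e-5) / 1.0, about 11.8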
        self.tau = max_contrib * (
            1 - (math.log(2 * self.delta / max_contrib) / self.epsilon))

        syms = subquery.all_symbols()
        source_col_names = [s[0] for s in syms]

        # list of sensitivities in column order
        sens = [s[1].sensitivity() for s in syms]

        # tell which are counts, in column order
        is_count = [s[1].is_count for s in syms]

        # set sensitivity to None if the column is a grouping key
        if subquery.agg is not None:
            group_keys = [
                ge.expression.name if hasattr(ge.expression, 'name') else None
                for ge in subquery.agg.groupingExpressions
            ]
        else:
            group_keys = []
        is_group_key = [
            colname in group_keys for colname in [s[0] for s in syms]
        ]
        for idx in range(len(sens)):
            if is_group_key[idx]:
                sens[idx] = None

        kc_pos = None
        kcc_pos = []
        for idx in range(len(syms)):
            sname, sym = syms[idx]
            if sname == 'keycount':
                kc_pos = idx
            elif sym.is_key_count:
                kcc_pos.append(idx)
        if kc_pos is None and len(kcc_pos) > 0:
            kc_pos = kcc_pos.pop()

        # make a list of mechanisms in column order
        mechs = [
            Gaussian(self.epsilon, self.delta, s, max_contrib,
                     self.interval_widths) if s is not None else None
            for s in sens
        ]

        # execute the subquery against the backend and load in tuples
        if cache_exact:
            # we only execute the exact query once
            if self._cached_exact is not None:
                if subquery == self._cached_ast:
                    db_rs = self._cached_exact
                else:
                    raise ValueError(
                        "Cannot run different query against cached result.  "
                        "Make a new PrivateReader or else clear the cache with cache = False"
                    )
            else:
                db_rs = self._get_reader(subquery).execute_ast(subquery)
                self._cached_exact = list(db_rs)
                self._cached_ast = subquery
        else:
            self._cached_exact = None
            self._cached_ast = None
            db_rs = self._get_reader(subquery).execute_ast(subquery)

        clamp_counts = self.options.clamp_counts

        def process_row(row_in):
            # pull out tuple values
            row = [v for v in row_in]
            # set null to 0 before adding noise
            for idx in range(len(row)):
                if sens[idx] is not None and row[idx] is None:
                    row[idx] = 0.0
            # call all mechanisms to add noise
            out_row = [
                noise.release([v]).values[0] if noise is not None else v
                for noise, v in zip(mechs, row)
            ]
            # ensure all key counts are the same
            for idx in kcc_pos:
                out_row[idx] = out_row[kc_pos]
            # clamp counts to be non-negative
            if clamp_counts:
                for idx in range(len(row)):
                    if is_count[idx] and out_row[idx] < 0:
                        out_row[idx] = 0
            return out_row

        if hasattr(db_rs, 'rdd'):
            # it's a dataframe
            out = db_rs.rdd.map(process_row)
        elif hasattr(db_rs, 'map'):
            # it's an RDD
            out = db_rs.map(process_row)
        else:
            # plain rowset: the first row holds column names, so skip it
            out = map(process_row, db_rs[1:])

        if subquery.agg is not None and self.options.censor_dims:
            if hasattr(out, 'filter'):
                # it's an RDD
                tau = self.tau
                out = out.filter(lambda row: row[kc_pos] > tau)
            else:
                out = filter(lambda row: row[kc_pos] > self.tau, out)

        # get column information for outer query
        out_syms = query.all_symbols()
        out_types = [s[1].type() for s in out_syms]
        out_colnames = [s[0] for s in out_syms]

        def convert(val, typ):
            # strip quote characters once; 'typ' avoids shadowing built-in type()
            sval = str(val).replace('"', '').replace("'", '')
            if typ == 'string' or typ == 'unknown':
                return sval
            elif typ == 'int':
                return int(float(sval))
            elif typ == 'float':
                return float(sval)
            elif typ == 'boolean':
                return val != 0 if isinstance(val, int) else bool(sval)
            else:
                raise ValueError("Can't convert type " + typ)

        def process_out_row(row):
            bindings = dict((name.lower(), val)
                            for name, val in zip(source_col_names, row))
            row = [
                c.expression.evaluate(bindings)
                for c in query.select.namedExpressions
            ]
            return [convert(v, t) for v, t in zip(row, out_types)]

        if hasattr(out, 'map'):
            # it's an RDD
            out = out.map(process_out_row)
        else:
            out = map(process_out_row, out)

        # sort it if necessary
        if query.order is not None:
            sort_fields = []
            for si in query.order.sortItems:
                if type(si.expression) is not ast.Column:
                    raise ValueError(
                        "We only know how to sort by column names right now")
                colname = si.expression.name.lower()
                if colname not in out_colnames:
                    raise ValueError(
                        "Can't sort by {0}, because it's not in output columns: {1}"
                        .format(colname, out_colnames))
                colidx = out_colnames.index(colname)
                desc = False
                if si.order is not None and si.order.lower() == "desc":
                    desc = True
                if desc and not (out_types[colidx]
                                 in ["int", "float", "boolean"]):
                    raise ValueError(
                        "We don't know how to sort descending by " +
                        out_types[colidx])
                sf = (desc, colidx)
                sort_fields.append(sf)

            def sort_func(row):
                return tuple([
                    row[idx] if not desc else
                    not row[idx] if out_types[idx] == "boolean" else -row[idx]
                    for desc, idx in sort_fields
                ])

            if hasattr(out, 'sortBy'):
                out = out.sortBy(sort_func)
            else:
                out = sorted(out, key=sort_func)

        # output it
        if hasattr(out, 'toDF'):
            # Pipeline RDD
            return out.toDF(out_colnames)
        elif hasattr(out, 'map'):
            # Bare RDD
            return out
        else:
            return TypedRowset([out_colnames] + list(out), out_types)