Exemplo n.º 1
0
 def dp_mechanism_count(self, df, colname):
     """Collect repeated noisy releases of a column's count.

     Args:
         df: Table indexed by column name; ``df[colname].count()`` supplies
             the exact count (presumably a pandas DataFrame -- confirm
             against callers).
         colname: Name of the column to count.

     Returns:
         A NumPy array with ``self.repeat_count`` entries, each the first
         element returned by a separate ``release`` call on the exact count.
     """
     true_count = df[colname].count()

     # Laplace is the default; it is swapped out only for the "Gaussian"
     # setting of self.mechanism.
     noise_source = Laplace(self.epsilon)
     if self.mechanism == "Gaussian":
         noise_source = Gaussian(self.epsilon)

     samples = []
     for _ in range(self.repeat_count):
         samples.append(noise_source.release([true_count])[0])
     return np.array(samples)
Exemplo n.º 2
0
 def test_bounds1b_lap(self):
     """Both analytic and bootstrap 0.95 bounds must be strictly ordered."""
     # epsilon of 0.05 gives very wide bounds
     mech = Laplace(0.05)
     analytic_lo, analytic_hi = mech.bounds(0.95, False)  # analytic
     bootstrap_lo, bootstrap_hi = mech.bounds(0.95, True)  # bootstrap
     assert analytic_lo < analytic_hi
     assert bootstrap_lo < bootstrap_hi
Exemplo n.º 3
0
 def dp_mechanism_sum(self, df, colname):
     """Collect repeated noisy releases of a column's sum.

     The Laplace mechanism is built with a sensitivity equal to the spread
     (max minus min) of the column's values.

     Args:
         df: Table indexed by column name; ``df[colname]`` must support
             ``sum()`` and iteration (presumably a pandas DataFrame --
             confirm against callers).
         colname: Name of the column to sum.

     Returns:
         A NumPy array with ``self.repeat_count`` entries, each the first
         element returned by a separate ``release`` call on the exact sum.
     """
     true_sum = df[colname].sum()
     largest = max(df[colname])
     smallest = min(df[colname])
     value_range = float(abs(largest - smallest))

     noise_source = Laplace(self.epsilon, sensitivity=value_range)
     if self.mechanism == "Gaussian":
         # NOTE(review): unlike the Laplace branch, the Gaussian mechanism is
         # built from epsilon alone and does not receive the value range --
         # confirm this is intended.
         noise_source = Gaussian(self.epsilon)

     samples = []
     for _ in range(self.repeat_count):
         samples.append(noise_source.release([true_sum])[0])
     return np.array(samples)
Exemplo n.º 4
0
 def test_bounds1c_lap(self):
     """Narrow (0.1) analytic and bootstrap bounds must not swap order."""
     # a very small interval width is used so that any lower/upper swap
     # would show up
     mech = Laplace(1.0)  # epsilon of 1.0
     analytic_lo, analytic_hi = mech.bounds(0.1, False)  # analytic
     bootstrap_lo, bootstrap_hi = mech.bounds(0.1, True)  # bootstrap
     assert analytic_lo <= analytic_hi
     assert bootstrap_lo <= bootstrap_hi
Exemplo n.º 5
0
    def _apply_noise(self, subquery, query, syms, types, sens, srs, pct=0.95):
        """Add Laplace noise to subquery results and evaluate the outer query.

        Steps, in order:
          1. If the subquery exposes a key count symbol not named "keycount",
             copy that column into ``srs["keycount"]``.
          2. Filter ``srs`` with ``keycount > self.tau ** 2``.
          3. For every numeric subquery symbol, record the mechanism bounds
             and replace the column with its noisy release; columns with
             sensitivity 1 are clamped at 0 and filtered with ``> self.tau``.
          4. Evaluate the outer select expressions into a new rowset.
          5. Sort the new rowset if the query has an ORDER BY clause.

        The incoming ``syms``, ``types`` and ``sens`` arguments are never
        read; fresh values are derived from ``subquery`` and ``query``.

        Args:
            subquery: Inner query; supplies keycount and numeric symbols.
            query: Outer query; supplies symbols, select list and ordering.
            syms: Unused.
            types: Unused.
            sens: Unused.
            srs: Rowset holding the subquery results; indexed by column name.
            pct: Value passed to the mechanism's ``bounds`` call.

        Returns:
            Tuple ``(newrs.rows(), srs.bounds)``.

        Raises:
            ValueError: If an ORDER BY item is not an ``ast.Column``.
        """
        # a key count chosen by the user for the outer query takes
        # precedence over the default "keycount" column
        user_keycounts = [
            entry for entry in subquery.keycount_symbols()
            if entry[0] != "keycount"
        ]
        if user_keycounts:
            srs["keycount"] = srs[user_keycounts[0][0].lower()]
        srs = srs.filter("keycount", ">", self.tau**2)

        # noise every numeric column of the subquery
        for raw_name, symbol in subquery.numeric_symbols():
            col = raw_name.lower()
            sensitivity = symbol.sensitivity()
            laplace = Laplace(self.epsilon, sensitivity, self.tau)
            srs.bounds[col] = laplace.bounds(pct)
            noisy = laplace.release(srs[col])
            if sensitivity == 1:
                # sensitivity-1 columns: clamp negatives to zero, then
                # filter on the noisy value
                noisy[noisy < 0] = 0
                srs[col] = noisy
                srs = srs.filter(col, ">", self.tau)
            else:
                srs[col] = noisy

        # describe the output rowset from the outer query's symbols
        outer_syms = query.all_symbols()
        out_types = [entry[1].type() for entry in outer_syms]
        out_sens = [entry[1].sensitivity() for entry in outer_syms]
        out_names = [entry[0] for entry in outer_syms]
        newrs = TypedRowset([out_names], out_types, out_sens)

        # bind lower-cased column names to their data for expression evaluation
        source_cols = srs.m_cols
        bindings = {key.lower(): source_cols[key] for key in source_cols.keys()}

        # evaluate every select expression first, then store the results
        evaluated = [
            named.expression.evaluate(bindings)
            for named in query.select.namedExpressions
        ]
        for position, values in enumerate(evaluated):
            newrs[newrs.idxcol[position]] = values

        # apply ORDER BY, if present
        if query.order is not None:
            sort_keys = []
            for item in query.order.sortItems:
                if type(item.expression) is not ast.Column:
                    raise ValueError(
                        "We only know how to sort by column names right now")
                column = item.expression.name.lower()
                descending = (item.order is not None
                              and item.order.lower() == "desc")
                # a leading "-" marks a descending sort key
                sort_keys.append(("-" if descending else "") + column)

            newrs.sort(sort_keys)

        return (newrs.rows(), srs.bounds)
Exemplo n.º 6
0
    def release(self, dataset):
        """Release a noisy row count for ``dataset``.

        Args:
            dataset: Object whose ``shape[0]`` gives the number of
                observations (rows).

        Returns:
            A ``CountResult`` built from the noisy count, ``self._column``,
            the accuracy computed from ``self._epsilon``, ``self._epsilon``
            itself, and a two-element interval ``[low, high]``.
        """
        # exact number of observations
        num_obs = dataset.shape[0]

        # obfuscate the count
        # NOTE(review): tau is passed as Laplace's second positional
        # argument -- confirm that this slot is tau for the Laplace class
        # in use here.
        tau = 5
        counts = Laplace(self._epsilon, tau).count([num_obs])
        count_release = counts[0]

        # calculate accuracy from epsilon
        accuracy = self._compute_accuracy(self._epsilon)
        accuracy_bound = accuracy * num_obs

        # NOTE(review): the interval is centred on (and scaled by) the exact
        # num_obs rather than count_release, so its midpoint equals the
        # un-noised count. Confirm whether this interval is exposed to
        # consumers of the result; if so it reveals the exact count.
        mci = [num_obs - accuracy_bound, num_obs + accuracy_bound]
        return CountResult(count_release, self._column, accuracy,
                           self._epsilon, mci)
Exemplo n.º 7
0
 def test_bounds2_lap(self):
     """The 0.97 bounds must strictly enclose the 0.95 bounds.

     Checked for both the analytic (second argument False) and the
     bootstrap (second argument True) variants.
     """
     mech = Laplace(4.0)  # epsilon of 4.0, tighter bounds

     # the four calls are kept in this order
     inner_lo, inner_hi = mech.bounds(0.95, False)
     inner_boot_lo, inner_boot_hi = mech.bounds(0.95, True)
     outer_lo, outer_hi = mech.bounds(0.97, False)
     outer_boot_lo, outer_boot_hi = mech.bounds(0.97, True)

     # analytic: the wider interval lies outside the narrower one
     assert outer_lo < inner_lo
     assert outer_hi > inner_hi
     # bootstrap: same enclosure requirement
     assert outer_boot_lo < inner_boot_lo
     assert outer_boot_hi > inner_boot_hi
Exemplo n.º 8
0
 def test_simple_lap(self):
     """Noisy counts should sum to the exact total at a 10E+6 granularity."""
     mech = Laplace(0.1)  # epsilon of 0.1
     exact_values = range(10000)
     noisy_values = mech.count(exact_values)

     # both totals are scaled down by 10E+6 and rounded before comparing,
     # so only a coarse agreement is required
     scale = 10E+6
     exact_total = round(np.sum(exact_values) / scale)
     noisy_total = round(np.sum(noisy_values) / scale)
     assert exact_total == noisy_total
Exemplo n.º 9
0
    def _postprocess(self, subquery, query, syms, types, sens, srs, pct=0.95):
        """Noise the subquery results and evaluate the outer query on them.

        Steps, in order:
          1. If the subquery exposes a key count symbol not named "keycount",
             copy that column into ``srs["keycount"]``.
          2. For every numeric subquery symbol: replace ``None`` with 0.0,
             record the mechanism bounds, replace the column with its noisy
             release, and clamp sensitivity-1 columns at 0.
          3. If the subquery has an aggregate, filter ``srs`` with
             ``keycount > self.tau ** 2``.
          4. Evaluate the outer select expressions into a new rowset.
          5. Sort the new rowset if the query has an ORDER BY clause.

        The incoming ``syms``, ``types`` and ``sens`` arguments are never
        read; fresh values are derived from ``subquery`` and ``query``.

        Args:
            subquery: Inner query; supplies keycount and numeric symbols.
            query: Outer query; supplies symbols, select list and ordering.
            syms: Unused.
            types: Unused.
            sens: Unused.
            srs: Rowset holding the subquery results; indexed by column name.
            pct: Value passed to the mechanism's ``bounds`` call.

        Returns:
            The new ``TypedRowset`` holding the evaluated outer columns.

        Raises:
            ValueError: If an ORDER BY item is not an ``ast.Column``.
        """
        # a key count chosen by the user for the outer query takes
        # precedence over the default "keycount" column
        user_keycounts = [
            entry for entry in subquery.keycount_symbols()
            if entry[0] != "keycount"
        ]
        if user_keycounts:
            srs["keycount"] = srs[user_keycounts[0][0].lower()]

        # noise every numeric column of the subquery
        for raw_name, symbol in subquery.numeric_symbols():
            col = raw_name.lower()
            sensitivity = symbol.sensitivity()
            # nulls count as 0 before noise is added
            srs[col] = np.array(
                [0.0 if value is None else value for value in srs[col]])
            laplace = Laplace(self.epsilon, sensitivity, self.tau)
            srs.bounds[col] = laplace.bounds(pct)
            srs[col] = laplace.release(srs[col])
            # NOTE: sensitivity 1 is used as a stand-in for "this is a
            # count", but other columns can have sensitivity 1 as well
            if sensitivity == 1:
                clamped = srs[col]
                clamped[clamped < 0] = 0
                srs[col] = clamped

        # the tau threshold is applied only to aggregated subqueries
        if subquery.agg is not None:
            srs = srs.filter("keycount", ">", self.tau**2)

        # describe the output rowset from the outer query's symbols
        outer_syms = query.all_symbols()
        out_types = [entry[1].type() for entry in outer_syms]
        out_sens = [entry[1].sensitivity() for entry in outer_syms]
        out_names = [entry[0] for entry in outer_syms]
        newrs = TypedRowset([out_names], out_types, out_sens)

        # bind lower-cased column names to their data for expression evaluation
        source_cols = srs.m_cols
        bindings = {key.lower(): source_cols[key] for key in source_cols.keys()}

        # evaluate every select expression first, then store the results
        evaluated = [
            named.expression.evaluate(bindings)
            for named in query.select.namedExpressions
        ]
        for position, values in enumerate(evaluated):
            newrs[newrs.idxcol[position]] = values

        # apply ORDER BY, if present
        if query.order is not None:
            sort_keys = []
            for item in query.order.sortItems:
                if type(item.expression) is not ast.Column:
                    raise ValueError(
                        "We only know how to sort by column names right now")
                column = item.expression.name.lower()
                descending = (item.order is not None
                              and item.order.lower() == "desc")
                # a leading "-" marks a descending sort key
                sort_keys.append(("-" if descending else "") + column)
            newrs.sort(sort_keys)
        return newrs