Exemplo n.º 1
0
    def _apply_noise(self, subquery, query, syms, types, sens, srs, pct=0.95):
        """Noise the subquery rowset, then evaluate and sort the outer query.

        Args:
            subquery: Inner query; supplies ``keycount_symbols()`` and
                ``numeric_symbols()``.
            query: Outer query; supplies ``all_symbols()``, the ``select``
                expressions and the optional ``order`` clause.
            syms, types, sens: Accepted for interface compatibility. They are
                never read here; the values used for the result are
                recomputed from ``query``.
            srs: Rowset holding the subquery results. It is modified in place
                and also rebound to the rowsets returned by ``filter``.
            pct: Passed to the mechanism's ``bounds`` call.

        Returns:
            A ``(rows, bounds)`` tuple: ``rows()`` of the evaluated outer
            query and the ``bounds`` attribute of the final ``srs``.

        Raises:
            ValueError: If an ORDER BY item is not a plain ``ast.Column``.
        """
        # A key count column the user named themselves takes the place of the
        # default "keycount" column.
        user_keycounts = [
            entry for entry in subquery.keycount_symbols()
            if entry[0] != "keycount"
        ]
        if user_keycounts:
            srs["keycount"] = srs[user_keycounts[0][0].lower()]
        srs = srs.filter("keycount", ">", self.tau**2)

        # Release every numeric column through the Laplace mechanism.
        # NOTE(review): srs is rebound by filter() inside this loop; confirm
        # that filter() carries over the bounds recorded on earlier passes.
        for raw_name, sym in subquery.numeric_symbols():
            col = raw_name.lower()
            sensitivity = sym.sensitivity()
            mechanism = Laplace(self.epsilon, sensitivity, self.tau)
            srs.bounds[col] = mechanism.bounds(pct)
            clamp_and_filter = sym.sensitivity() == 1
            noisy = mechanism.release(srs[col])
            if clamp_and_filter:
                # Sensitivity-1 columns are clamped at zero before storing.
                noisy[noisy < 0] = 0
            srs[col] = noisy
            if clamp_and_filter:
                srs = srs.filter(col, ">", self.tau)

        # Build an empty result rowset shaped like the outer query.
        outer_syms = query.all_symbols()
        out_types = [entry[1].type() for entry in outer_syms]
        out_sens = [entry[1].sensitivity() for entry in outer_syms]
        out_names = [entry[0] for entry in outer_syms]
        newrs = TypedRowset([out_names], out_types, out_sens)

        # Outer expressions look up columns by lower-cased name.
        source_cols = srs.m_cols
        bindings = {key.lower(): source_cols[key] for key in source_cols.keys()}

        # Evaluate everything first, then store by column position.
        evaluated = [
            named.expression.evaluate(bindings)
            for named in query.select.namedExpressions
        ]
        for position, values in enumerate(evaluated):
            newrs[newrs.idxcol[position]] = values

        # Apply ORDER BY, if present; a leading "-" marks a descending key.
        if query.order is not None:
            sort_keys = []
            for item in query.order.sortItems:
                if type(item.expression) is not ast.Column:
                    raise ValueError(
                        "We only know how to sort by column names right now")
                column = item.expression.name.lower()
                descending = (item.order is not None
                              and item.order.lower() == "desc")
                sort_keys.append(("-" if descending else "") + column)

            newrs.sort(sort_keys)

        return (newrs.rows(), srs.bounds)
Exemplo n.º 2
0
    def run_agg_query(self, df, metadata_path, query, confidence):
        """Run ``query`` privately ``self.repeat_count`` times and collect results.

        The query is parsed and preprocessed once; each repetition then
        post-processes a fresh ``TypedRowset`` built from the preprocessed
        rows.

        Args:
            df: Data handed to ``CSVReader``.
            metadata_path: Path handed to ``MetadataLoader``.
            query: Query string to parse and execute.
            confidence: Currently unused.

        Returns:
            A numpy array holding, for each repetition, the first value of
            the second row of the post-processed rowset (presumably the first
            data row after a header row -- confirm against ``TypedRowset``).
        """
        schema = MetadataLoader(metadata_path).read_schema()
        csv_reader = CSVReader(schema, df)
        private_reader = PrivateQuery(csv_reader, schema, self.epsilon)
        parsed = private_reader.parse_query_string(query)
        (subquery, query, syms, types, sens,
         srs_orig) = private_reader._preprocess(parsed)

        # NOTE: computing confidence bounds around the exact value is
        # currently disabled; only the noisy values are returned.

        def _single_release():
            # Rebuild the rowset each time so every repetition starts from
            # the same preprocessed rows.
            fresh = TypedRowset(srs_orig.rows(), types, sens)
            result = private_reader._postprocess(
                subquery, query, syms, types, sens, fresh)
            remaining_rows = result.rows()[1:]
            return remaining_rows[0][0]

        return np.array([_single_release() for _ in range(self.repeat_count)])
Exemplo n.º 3
0
 def test_make_empty(self):
     """A rowset built from only the first row of ``rows_1`` has length 0."""
     first_row_only = rows_1[:1]
     trs = TypedRowset(first_row_only, types, sens)
     assert len(trs) == 0
Exemplo n.º 4
0
 def test_make_1(self):
     """A rowset built from all of ``rows_1`` has length 1."""
     row_count = len(TypedRowset(rows_1, types, sens))
     assert row_count == 1
Exemplo n.º 5
0
 def test_empty_result_typed(self):
     """Wrapping the result of the ``age > 100`` query gives a length-0 rowset."""
     sql = "SELECT age as a FROM PUMS.PUMS WHERE age > 100"
     result = CSVReader(schema, df).execute(sql)
     trs = TypedRowset(result, ["int"], [None])
     assert len(trs) == 0
Exemplo n.º 6
0
    def _postprocess(self, subquery, query, syms, types, sens, srs, pct=0.95):
        """Noise the subquery rowset and evaluate the outer query over it.

        Steps, in order:
          1. If the user selected a key count column of their own, copy it
             into the "keycount" column.
          2. For every numeric subquery column: replace ``None`` with 0.0,
             record the mechanism bounds at ``pct``, and release the column
             through a ``Laplace`` mechanism. Columns whose sensitivity is 1
             are additionally clamped to be non-negative.
          3. If the subquery has an aggregate, keep only rows whose
             "keycount" exceeds ``tau ** 2``.
          4. Evaluate the outer SELECT expressions into a new rowset.
          5. Sort by the ORDER BY clause, if any.

        Args:
            subquery: Inner query; supplies ``keycount_symbols()``,
                ``numeric_symbols()`` and ``agg``.
            query: Outer query; supplies ``all_symbols()``, the ``select``
                expressions and the optional ``order`` clause.
            syms, types, sens: Accepted for interface compatibility. They are
                never read here; the values used for the result are
                recomputed from ``query``.
            srs: Rowset holding the subquery results; modified in place.
            pct: Passed to the mechanism's ``bounds`` call.

        Returns:
            The new ``TypedRowset`` holding the evaluated outer query.

        Raises:
            ValueError: If an ORDER BY item is not a plain ``ast.Column``.
        """
        # A key count column the user named themselves takes the place of the
        # default "keycount" column.
        user_keycounts = [
            entry for entry in subquery.keycount_symbols()
            if entry[0] != "keycount"
        ]
        if user_keycounts:
            srs["keycount"] = srs[user_keycounts[0][0].lower()]

        # Release every numeric column through the Laplace mechanism.
        for raw_name, sym in subquery.numeric_symbols():
            col = raw_name.lower()
            sensitivity = sym.sensitivity()
            # Nulls count as 0 before noise is added.
            srs[col] = np.array(
                [0.0 if value is None else value for value in srs[col]])
            mechanism = Laplace(self.epsilon, sensitivity, self.tau)
            srs.bounds[col] = mechanism.bounds(pct)
            srs[col] = mechanism.release(srs[col])
            # BUGBUG: Things other than counts can have sensitivity of 1
            if sym.sensitivity() == 1:
                noisy = srs[col]
                noisy[noisy < 0] = 0
                srs[col] = noisy

        if subquery.agg is not None:
            srs = srs.filter("keycount", ">", self.tau**2)

        # Build an empty result rowset shaped like the outer query.
        outer_syms = query.all_symbols()
        out_types = [entry[1].type() for entry in outer_syms]
        out_sens = [entry[1].sensitivity() for entry in outer_syms]
        out_names = [entry[0] for entry in outer_syms]
        newrs = TypedRowset([out_names], out_types, out_sens)

        # Outer expressions look up columns by lower-cased name.
        source_cols = srs.m_cols
        bindings = {key.lower(): source_cols[key] for key in source_cols.keys()}

        # Evaluate everything first, then store by column position.
        evaluated = [
            named.expression.evaluate(bindings)
            for named in query.select.namedExpressions
        ]
        for position, values in enumerate(evaluated):
            newrs[newrs.idxcol[position]] = values

        # Apply ORDER BY, if present; a leading "-" marks a descending key.
        if query.order is not None:
            sort_keys = []
            for item in query.order.sortItems:
                if type(item.expression) is not ast.Column:
                    raise ValueError(
                        "We only know how to sort by column names right now")
                column = item.expression.name.lower()
                descending = (item.order is not None
                              and item.order.lower() == "desc")
                sort_keys.append(("-" if descending else "") + column)
            newrs.sort(sort_keys)
        return newrs