    def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
        """
        Run the query using the private reader and input query
        Get query response back
        """
        reader = PandasReader(metadata, df)
        actual = 0.0
        # VAR is not supported by PandasReader, so the exact value does not
        # need to be fetched on every aggregation.
        if get_exact:
            # rows()[0] is the header row; take the first data cell
            actual = reader.execute_typed(query).rows()[1:][0][0]
        private_reader = PrivateReader(metadata, reader, self.epsilon)
        query_ast = private_reader.parse_query_string(query)

        srs_orig = private_reader.reader.execute_ast_typed(query_ast)

        noisy_values = []
        low_bounds = []
        high_bounds = []
        for idx in range(self.repeat_count):
            # rebuild a fresh TypedRowset from the cached exact rows each run
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            res = private_reader._execute_ast(query_ast, True)
            # Disabled because confidence intervals are not available in the report
            #interval = res.report[res.colnames[0]].intervals[confidence]
            #low_bounds.append(interval[0].low)
            #high_bounds.append(interval[0].high)
            noisy_values.append(res.rows()[1:][0][0])
        return np.array(noisy_values), actual, low_bounds, high_bounds
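A minimal usage sketch, assuming `evaluator` is an instance of the class above, and `df` and `metadata` are a pandas DataFrame with its table metadata (none of these names appear in the snippet itself):

    # hypothetical call; evaluator, df and metadata are assumptions
    noisy, actual, lows, highs = evaluator.run_agg_query(
        df, metadata, "SELECT COUNT(age) FROM PUMS.PUMS", confidence=0.95)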
Example #2
    def release(self, dataset: object, actual=False) -> Report:
        """
        Dataset is a collection of [Dataset Metadata, PandasReader].
        Releases the response to the SQL query, repeated the number of
        times requested by eval_params, when actual is False.
        The actual (exact) response is only returned once.
        """
        if not actual:
            private_reader = PrivateReader(dataset[0], dataset[1],
                                           self.privacy_params.epsilon)
            query_ast = private_reader.parse_query_string(self.algorithm)
            srs_orig = private_reader.reader.execute_ast_typed(query_ast)
            noisy_values = []
            for idx in range(self.eval_params.repeat_count):
                srs = TypedRowset(srs_orig.rows(),
                                  list(srs_orig.types.values()))
                res = private_reader._execute_ast(query_ast, True)
                noisy_values.append(res.rows()[1:][0][0])
            return Report({"__key__": noisy_values})
        else:
            reader = dataset[1]
            exact = reader.execute_typed(self.algorithm).rows()[1:][0][0]
            return Report({"__key__": exact})
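A minimal usage sketch, assuming `metadata` and `reader` hold the dataset's metadata and PandasReader, and `algo` is a configured instance of the class above (all three names are assumptions):

    # noisy release, repeated eval_params.repeat_count times
    noisy_report = algo.release([metadata, reader])
    # exact release, returned once
    exact_report = algo.release([metadata, reader], actual=True)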
Example #3
    def execute_typed(self, query):
        if not isinstance(query, str):
            raise ValueError(
                "Please pass a string to this function.  You can use execute_ast to execute ASTs"
            )

        rows = self.execute(query)
        if len(rows) < 1:
            return None
        types = ["unknown" for i in range(len(rows[0]))]
        if len(rows) > 1:
            row = rows[1]
            for idx in range(len(row)):
                val = row[idx]
                if isinstance(val, int):
                    types[idx] = "int"
                elif isinstance(val, float):
                    types[idx] = "float"
                elif isinstance(val, bool):
                    types[idx] = "boolean"
                else:
                    types[idx] = "string"

        return TypedRowset(rows, types)
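A minimal usage sketch, assuming `metadata` and `df` as in the surrounding examples; the `income` column is illustrative:

    reader = PandasReader(metadata, df)
    trs = reader.execute_typed("SELECT age, income FROM PUMS.PUMS")
    if trs is not None:        # execute_typed returns None for an empty result
        print(trs.types)       # column types inferred from the first data row
        print(trs.rows()[1:])  # data rows, header stripped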
Example #4
    def test_empty_result_typed(self):
        # schema and df are module-level test fixtures
        reader = PandasReader(schema, df)
        rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
        trs = TypedRowset(rs, ['int'])
        assert len(trs) == 0
Example #5
    def run_agg_query_df(self,
                         df,
                         metadata,
                         query,
                         confidence,
                         file_name="d1"):
        # Get the exact (non-private) result; rows()[0] is the header row
        reader = PandasReader(metadata, df)
        exact = reader.execute_typed(query).rows()[1:]
        exact_res = list(exact)

        private_reader = PrivateReader(metadata, reader, self.epsilon)
        query_ast = private_reader.parse_query_string(query)

        # Run the query once to discover the output columns, then split them
        # into dimension (string) and measure (numeric) columns
        srs_orig = private_reader.reader.execute_ast_typed(query_ast)
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))

        sample_res = private_reader._execute_ast(query_ast, True)
        headers = sample_res.colnames

        dim_cols = []
        num_cols = []

        for col in headers:
            if sample_res.types[col] == "string":
                dim_cols.append(col)
            else:
                num_cols.append(col)

        # Run the query repeatedly and store the results
        # (confidence intervals are currently disabled)
        res = []
        for idx in range(self.repeat_count):
            dim_rows = []
            num_rows = []
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            singleres = private_reader._execute_ast(query_ast, True)
            for col in dim_cols:
                dim_rows.append(singleres[col])
            for col in num_cols:
                values = singleres[col]
                #low = singleres.report[col].intervals[confidence].low
                #high = singleres.report[col].intervals[confidence].high
                #num_rows.append(list(zip(values, low, high)))
                # intervals disabled: zip(values) yields 1-tuples in place of
                # the (value, low, high) tuples above
                num_rows.append(list(zip(values)))

            res.extend(list(zip(*dim_rows, *num_rows)))

        exact_df = pd.DataFrame(exact_res, columns=headers)
        noisy_df = pd.DataFrame(res, columns=headers)

        # Add a dummy dimension column for cases where no dimensions are
        # available for merging D1 and D2
        if len(dim_cols) == 0:
            dim_cols.append("__dim__")

        if dim_cols[0] == "__dim__":
            exact_df[dim_cols[0]] = ["key"] * len(exact_df)
            noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

        return noisy_df, exact_df, dim_cols, num_cols
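A minimal usage sketch, assuming `evaluator`, `df`, and `metadata` as before; the `married` grouping column is illustrative:

    noisy_df, exact_df, dim_cols, num_cols = evaluator.run_agg_query_df(
        df, metadata,
        "SELECT married, COUNT(age) FROM PUMS.PUMS GROUP BY married",
        confidence=0.95)
    # dim_cols provides the join keys for comparing noisy_df with exact_df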
Example #6
    def execute_ast_typed(self, query):
        syms = query.all_symbols()
        types = [s[1].type() for s in syms]

        rows = self.execute_ast(query)
        return TypedRowset(rows, types)
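A minimal usage sketch, assuming `metadata` and `df` as above; the AST comes from the parser used in the other examples:

    reader = PandasReader(metadata, df)
    private_reader = PrivateReader(metadata, reader, 1.0)  # epsilon = 1.0
    query_ast = private_reader.parse_query_string(
        "SELECT COUNT(age) FROM PUMS.PUMS")
    trs = private_reader.reader.execute_ast_typed(query_ast)  # exact rowset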
Example #7
    def _execute_ast(self, query, cache_exact=False):
        if isinstance(query, str):
            raise ValueError("Please pass AST to _execute.")

        subquery, query = self.rewrite_ast(query)
        max_contrib = self.options.max_contrib if self.options.max_contrib is not None else 1
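        # tau is the threshold used to censor rare dimensions: when
        # censor_dims is enabled, groups whose noisy key count does not
        # exceed tau are filtered out further below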
        self.tau = max_contrib * (
            1 - (math.log(2 * self.delta / max_contrib) / self.epsilon))

        syms = subquery.all_symbols()
        source_col_names = [s[0] for s in syms]

        # list of sensitivities in column order
        sens = [s[1].sensitivity() for s in syms]

        # tell which are counts, in column order
        is_count = [s[1].is_count for s in syms]

        # set sensitivity to None if the column is a grouping key
        if subquery.agg is not None:
            group_keys = [
                ge.expression.name if hasattr(ge.expression, 'name') else None
                for ge in subquery.agg.groupingExpressions
            ]
        else:
            group_keys = []
        is_group_key = [
            colname in group_keys for colname in [s[0] for s in syms]
        ]
        for idx in range(len(sens)):
            if is_group_key[idx]:
                sens[idx] = None

        kc_pos = None
        kcc_pos = []
        for idx in range(len(syms)):
            sname, sym = syms[idx]
            if sname == 'keycount':
                kc_pos = idx
            elif sym.is_key_count:
                kcc_pos.append(idx)
        if kc_pos is None and len(kcc_pos) > 0:
            kc_pos = kcc_pos.pop()

        # make a list of mechanisms in column order
        mechs = [
            Gaussian(self.epsilon, self.delta, s, max_contrib,
                     self.interval_widths) if s is not None else None
            for s in sens
        ]

        # execute the subquery against the backend and load in tuples
        if cache_exact:
            # we only execute the exact query once
            if self._cached_exact is not None:
                if subquery == self._cached_ast:
                    db_rs = self._cached_exact
                else:
                    raise ValueError(
                        "Cannot run different query against cached result.  "
                        "Make a new PrivateReader or else clear the cache with cache = False"
                    )
            else:
                db_rs = self._get_reader(subquery).execute_ast(subquery)
                self._cached_exact = list(db_rs)
                self._cached_ast = subquery
        else:
            # clear any previously cached exact result
            self._cached_exact = None
            self._cached_ast = None
            db_rs = self._get_reader(subquery).execute_ast(subquery)

        clamp_counts = self.options.clamp_counts

        def process_row(row_in):
            # pull out tuple values
            row = [v for v in row_in]
            # set null to 0 before adding noise
            for idx in range(len(row)):
                if sens[idx] is not None and row[idx] is None:
                    row[idx] = 0.0
            # call all mechanisms to add noise
            out_row = [
                noise.release([v]).values[0] if noise is not None else v
                for noise, v in zip(mechs, row)
            ]
            # ensure all key counts are the same
            for idx in kcc_pos:
                out_row[idx] = out_row[kc_pos]
            # clamp counts to be non-negative
            if clamp_counts:
                for idx in range(len(row)):
                    if is_count[idx] and out_row[idx] < 0:
                        out_row[idx] = 0
            return out_row

        if hasattr(db_rs, 'rdd'):
            # it's a dataframe
            out = db_rs.rdd.map(process_row)
        elif hasattr(db_rs, 'map'):
            # it's an RDD
            out = db_rs.map(process_row)
        else:
            # plain typed rowset: skip the header row
            out = map(process_row, db_rs[1:])

        if subquery.agg is not None and self.options.censor_dims:
            if hasattr(out, 'filter'):
                # it's an RDD
                tau = self.tau
                out = out.filter(lambda row: row[kc_pos] > tau)
            else:
                out = filter(lambda row: row[kc_pos] > self.tau, out)

        # get column information for outer query
        out_syms = query.all_symbols()
        out_types = [s[1].type() for s in out_syms]
        out_colnames = [s[0] for s in out_syms]

        def convert(val, type_):
            if type_ == 'string' or type_ == 'unknown':
                return str(val).replace('"', '').replace("'", '')
            elif type_ == 'int':
                return int(float(str(val).replace('"', '').replace("'", '')))
            elif type_ == 'float':
                return float(str(val).replace('"', '').replace("'", ''))
            elif type_ == 'boolean':
                if isinstance(val, int):
                    return val != 0
                else:
                    return bool(str(val).replace('"', '').replace("'", ''))
            else:
                raise ValueError("Can't convert type " + type_)

        def process_out_row(row):
            bindings = dict((name.lower(), val)
                            for name, val in zip(source_col_names, row))
            row = [
                c.expression.evaluate(bindings)
                for c in query.select.namedExpressions
            ]
            return [convert(val, type) for val, type in zip(row, out_types)]

        if hasattr(out, 'map'):
            # it's an RDD
            out = out.map(process_out_row)
        else:
            out = map(process_out_row, out)

        # sort it if necessary
        if query.order is not None:
            sort_fields = []
            for si in query.order.sortItems:
                if type(si.expression) is not ast.Column:
                    raise ValueError(
                        "We only know how to sort by column names right now")
                colname = si.expression.name.lower()
                if colname not in out_colnames:
                    raise ValueError(
                        "Can't sort by {0}, because it's not in output columns: {1}"
                        .format(colname, out_colnames))
                colidx = out_colnames.index(colname)
                desc = False
                if si.order is not None and si.order.lower() == "desc":
                    desc = True
                if desc and not (out_types[colidx]
                                 in ["int", "float", "boolean"]):
                    raise ValueError(
                        "We don't know how to sort descending by " +
                        out_types[colidx])
                sf = (desc, colidx)
                sort_fields.append(sf)

            def sort_func(row):
                return tuple([
                    row[idx] if not desc else
                    not row[idx] if out_types[idx] == "boolean" else -row[idx]
                    for desc, idx in sort_fields
                ])

            if hasattr(out, 'sortBy'):
                out = out.sortBy(sort_func)
            else:
                out = sorted(out, key=sort_func)

        # output it
        if hasattr(out, 'toDF'):
            # Pipeline RDD
            return out.toDF(out_colnames)
        elif hasattr(out, 'map'):
            # Bare RDD
            return out
        else:
            return TypedRowset([out_colnames] + list(out), out_types)
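A minimal usage sketch, assuming `private_reader` and `query_ast` as in the earlier examples; passing True caches the exact rows, so repeated calls add fresh noise to the same exact result:

    res = private_reader._execute_ast(query_ast, True)
    print(res.rows()[1:])  # noisy output rows, header stripped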
Example #8
    def test_make_empty(self):
        # rows_1[0] is the header row, so a header-only rowset has no data
        trs = TypedRowset(rows_1[0:1], types)
        assert len(trs) == 0

    def test_make_1(self):
        trs = TypedRowset(rows_1, types)
        assert len(trs) == 1
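A minimal sketch of the rowset shape these tests rely on: the first row of a TypedRowset is the header, so `len` counts only data rows. The column names and values here are illustrative:

    rows = [["age", "income"], [42, 55000.0]]
    trs = TypedRowset(rows, ["int", "float"])
    assert len(trs) == 1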