def __create_sex_histograms(self, data):
    """Release the 'sex' column as two differentially private histograms.

    The column is binarized (label "0" -> True) and released twice: once
    through the geometric mechanism and once through the Laplace mechanism.

    :param data: wn dataset node providing a 'sex' column
    :return: tuple of (geometric-mechanism histogram, laplace-mechanism histogram)
    """
    # Integer-valued DP release via the geometric mechanism.
    geometric = wn.dp_histogram(
        wn.to_bool(data['sex'], true_label="0"),
        upper=self.__nsize,
        privacy_usage={'epsilon': .5, 'delta': 0.00001})

    # Build the plain histogram first, then noise it with Laplace.
    prepped = wn.histogram(
        wn.to_bool(data['sex'], true_label="0"), null_value=True)
    laplace = wn.laplace_mechanism(
        prepped, privacy_usage={"epsilon": 0.4, "delta": .000001})

    return geometric, laplace
def __create_state_histograms(self, data):
    """Release the 'state' column as two differentially private histograms.

    Categories come from ``self.get_states()``; unknown values map to the
    first state. The counts are released once through the geometric
    mechanism and once through the Laplace mechanism.

    :param data: wn dataset node providing a 'state' column
    :return: tuple of (geometric-mechanism histogram, laplace-mechanism histogram)
    """
    states = self.get_states()

    # Integer-valued DP release via the geometric mechanism.
    geometric = wn.dp_histogram(
        data['state'],
        categories=states,
        null_value=states[0],
        privacy_usage={'epsilon': 0.2})

    # Build the plain histogram first, then noise it with Laplace.
    prepped = wn.histogram(
        data['state'], categories=states, null_value=states[0])
    laplace = wn.laplace_mechanism(
        prepped, privacy_usage={"epsilon": 0.5, "delta": .000001})

    return geometric, laplace
def __create_age_histograms(self, data):
    """Release the 'age' column as two differentially private histograms.

    Ages are cast to int, clamped to [20, 80], and binned on decade edges
    20..70; nulls fall into the lowest bin. The counts are released once
    through the geometric mechanism and once through the Laplace mechanism.

    :param data: wn dataset node providing an 'age' column
    :return: tuple of (geometric-mechanism histogram, laplace-mechanism histogram)
    """
    decade_edges = list(range(20, 80, 10))

    # Integer-valued DP release via the geometric mechanism.
    geometric = wn.dp_histogram(
        wn.to_int(data['age'], lower=20, upper=80),
        edges=decade_edges,
        upper=self.__nsize,
        null_value=20,
        privacy_usage={'epsilon': 0.5})

    # Build the plain histogram first, then noise it with Laplace.
    prepped = wn.histogram(
        wn.to_int(data['age'], lower=20, upper=80),
        edges=decade_edges,
        null_value=20)
    laplace = wn.laplace_mechanism(
        prepped, privacy_usage={"epsilon": 0.5, "delta": .000001})

    return geometric, laplace
def test_insertion_simple():
    """ Conduct a differentially private analysis with values inserted from other systems
    :return:
    """
    with wn.Analysis() as analysis:
        # Describe a fake dataset that mirrors the real data (never executed).
        data = wn.Dataset(path="", column_names=["A", "B", "C", "D"])

        # Pull a column out and describe the preprocessing actually
        # performed on the data upstream.
        col_a = wn.to_float(data['A'])
        col_a_clamped = wn.impute(wn.clamp(col_a, lower=0., upper=10.))
        col_a_resized = wn.resize(col_a_clamped, n=1000000)

        # Fake aggregation whose value is injected from the external system.
        actual_mean = wn.mean(col_a_resized)
        actual_mean.set(10)

        # Describe the differentially private operation on the injected value.
        gaussian_mean = wn.gaussian_mechanism(
            actual_mean,
            privacy_usage={"epsilon": .4, "delta": 1e-6})

        # Validate, then compute the missing releasable nodes — here only
        # the gaussian mean — and retrieve the noised result.
        analysis.validate()
        analysis.release()
        print("gaussian mean", gaussian_mean.value)

        # Release a couple more statistics with other mechanisms in one batch.
        actual_sum = wn.sum(col_a_clamped)
        actual_sum.set(123456)
        laplace_sum = wn.laplace_mechanism(
            actual_sum, privacy_usage={"epsilon": .1})

        actual_count = wn.count(col_a)
        actual_count.set(9876)
        geo_count = wn.simple_geometric_mechanism(
            actual_count, 0, 10000, privacy_usage={"epsilon": .1})

        analysis.release()
        print("laplace sum", laplace_sum.value)
        print("geometric count", geo_count.value)

        # Injected histogram counts: categorical column B, boolean column C.
        actual_histogram_b = wn.histogram(
            wn.clamp(data['B'], categories=['X', 'Y', 'Z'], null_value="W"))
        actual_histogram_b.set([12, 1280, 2345, 12])
        geo_histogram_b = wn.simple_geometric_mechanism(
            actual_histogram_b, 0, 10000, privacy_usage={"epsilon": .1})

        col_c = wn.to_bool(data['C'], true_label="T")
        actual_histogram_c = wn.histogram(col_c)
        actual_histogram_c.set([5000, 5000])
        lap_histogram_c = wn.laplace_mechanism(
            actual_histogram_c, privacy_usage={"epsilon": .1})

        analysis.release()
        print("noised histogram b", geo_histogram_b.value)
        print("noised histogram c", lap_histogram_c.value)
        print("C dimensionality", col_c.dimensionality)
        print("C categories", col_c.categories)

        # Multicolumnar insertion: same pattern over ['C', 'D'] at once.
        col_rest = wn.to_float(data[['C', 'D']])
        col_rest_resized = wn.resize(
            wn.impute(wn.clamp(col_rest, lower=[0., 5.], upper=1000.)),
            n=10000)

        # Fake aggregation with a per-column value injected externally.
        actual_mean = wn.mean(col_rest_resized)
        actual_mean.set([[10., 12.]])

        # Describe, validate, release and retrieve the multicolumn DP mean.
        gaussian_mean = wn.gaussian_mechanism(
            actual_mean,
            privacy_usage={"epsilon": .4, "delta": 1e-6})
        analysis.validate()
        analysis.release()
        print("rest gaussian mean", gaussian_mean.value)
def test_everything(run=True):
    """Exercise the full wn component surface inside one dynamic analysis.

    Many expressions below are intentionally bare: they only add nodes to
    the analysis graph. Set ``run=False`` to build without releasing.
    """
    with wn.Analysis(dynamic=True) as analysis:
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # Typed views over the raw columns.
        age_int = wn.to_int(data['age'], 0, 150)
        sex = wn.to_bool(data['sex'], "1")
        educ = wn.to_float(data['educ'])
        race = data['race']
        income = wn.to_float(data['income'])
        married = wn.to_bool(data['married'], "1")
        numerics = wn.to_float(data[['age', 'income']])

        # intentionally busted component
        # print("invalid component id ", (sex + "a").component_id)

        # Broadcast scalar over 2d, broadcast scalar over 1d, columnar
        # broadcasting, left and right mul.
        numerics * 2. + 2. * educ
        # Add different values for each column.
        numerics + [[1., 2.]]

        # Index into the first column, then via a boolean mask.
        age = numerics[0]
        income = numerics[[False, True]]

        # Boolean ops and broadcasting.
        mask = sex & married | (~married ^ False) | (age > 50.) | (age_int == 25)

        # Numerical and categorical clamping.
        wn.clamp(numerics, 0., [150., 150_000.])
        wn.clamp(data['educ'],
                 categories=[str(i) for i in range(8, 10)],
                 null_value="-1")

        wn.count(mask)
        wn.covariance(age, income)
        wn.digitize(educ, edges=[1., 3., 10.], null_value=-1)

        # Checks for safety against division by zero.
        income / 2.
        income / wn.clamp(educ, 5., 20.)

        wn.dp_count(data, privacy_usage={"epsilon": 0.5})
        wn.dp_count(mask, privacy_usage={"epsilon": 0.5})
        wn.dp_histogram(mask, privacy_usage={"epsilon": 0.5})

        age = wn.impute(wn.clamp(age, 0., 150.))
        wn.dp_maximum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_minimum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_median(age, privacy_usage={"epsilon": 0.5})

        age_n = wn.resize(age, n=800)
        wn.dp_mean(age_n, privacy_usage={"epsilon": 0.5})
        wn.dp_moment_raw(age_n, order=3, privacy_usage={"epsilon": 0.5})
        wn.dp_sum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_variance(age_n, privacy_usage={"epsilon": 0.5})

        wn.filter(income, mask)
        race_histogram = wn.histogram(
            race, categories=["1", "2", "3"], null_value="3")
        wn.histogram(income, edges=[0., 10000., 50000.], null_value=-1)
        wn.dp_histogram(married, privacy_usage={"epsilon": 0.5})

        wn.gaussian_mechanism(
            race_histogram, privacy_usage={"epsilon": 0.5, "delta": .000001})
        wn.laplace_mechanism(
            race_histogram, privacy_usage={"epsilon": 0.5, "delta": .000001})

        wn.kth_raw_sample_moment(educ, k=3)
        wn.log(wn.clamp(educ, 0.001, 50.))
        wn.maximum(educ)
        wn.mean(educ)
        wn.minimum(educ)
        educ % 2.
        educ**2.
        wn.quantile(educ, .32)

        wn.resize(educ, 1200, 0., 50.)
        wn.resize(race, 1200, categories=["1", "2"], weights=[1, 2])
        wn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b"]],
                  weights=[1, 2])
        wn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b", "c"]],
                  weights=[[1, 2], [3, 7, 2]])

        wn.sum(educ)
        wn.variance(educ)

        if run:
            analysis.release()

    return analysis