def test_dp_covariance():
    # establish data information
    var_names = ["age", "sex", "educ", "race", "income", "married"]

    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # get scalar covariance
        age_income_cov_scalar = sn.dp_covariance(
            left=sn.to_float(wn_data['age']),
            right=sn.to_float(wn_data['income']),
            privacy_usage={'epsilon': 5000},
            left_lower=0.,
            left_upper=100.,
            left_rows=1000,
            right_lower=0.,
            right_upper=500_000.,
            right_rows=1000)

        data = sn.to_float(wn_data['age', 'income'])

        # get full covariance matrix
        age_income_cov_matrix = sn.dp_covariance(
            data=data,
            privacy_usage={'epsilon': 5000},
            data_lower=[0., 0.],
            data_upper=[100., 500_000.],
            data_rows=1000)

        # get cross-covariance matrix
        cross_covar = sn.dp_covariance(
            left=data,
            right=data,
            privacy_usage={'epsilon': 5000},
            left_lower=[0., 0.],
            left_upper=[100., 500_000.],
            left_rows=1_000,
            right_lower=[0., 0.],
            right_upper=[100., 500_000.],
            right_rows=1000)

        analysis.release()

        print('scalar covariance:\n{0}\n'.format(age_income_cov_scalar.value))
        print('covariance matrix:\n{0}\n'.format(age_income_cov_matrix.value))
        print('cross-covariance matrix:\n{0}'.format(cross_covar.value))
def dp_all(numeric, categorical, args):
    # run the full battery of DP statistics over a numeric and a categorical column,
    # sharing the same keyword arguments (e.g. privacy_usage) across all components
    return {
        "covariance": sn.dp_covariance(left=numeric, right=numeric, **args),
        "histogram": sn.dp_histogram(categorical, **args),
        "maximum": sn.dp_maximum(numeric, **args),
        "mean": sn.dp_mean(numeric, **args),
        "median": sn.dp_median(numeric, **args),
        "minimum": sn.dp_minimum(numeric, **args),
        "quantile": sn.dp_quantile(numeric, .75, **args),
        "raw_moment": sn.dp_raw_moment(numeric, 2, **args),
        "sum": sn.dp_sum(numeric, **args),
        "variance": sn.dp_variance(numeric, **args)
    }
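# A minimal usage sketch for the dp_all helper above (hypothetical, not part of
# the original test suite). It assumes the same TEST_PUMS_PATH / TEST_PUMS_NAMES
# fixtures used elsewhere in this file, and it clamps/imputes/resizes the numeric
# column so that every statistic in dp_all can rely on statically known bounds
# and row count while sharing a single set of keyword arguments.
def example_dp_all_usage():
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # preprocess the numeric column: bounds and n become static properties
        age = sn.resize(
            sn.impute(sn.clamp(sn.to_float(data['age']), lower=0., upper=100.)),
            number_rows=1000)

        # boolean column for the histogram
        sex = sn.to_bool(data['sex'], true_label="1")

        # arguments shared by every statistic returned from dp_all
        shared_args = {'privacy_usage': {'epsilon': 1.}}

        releases = dp_all(age, sex, shared_args)
        analysis.release()

    for name, component in releases.items():
        print('{0}:\n{1}\n'.format(name, component.value))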
def test_covariance():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    data = np.genfromtxt(TEST_PUMS_PATH, delimiter=',', names=True)

    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # get full covariance matrix
        cov = sn.dp_covariance(
            data=sn.to_float(wn_data['age', 'sex', 'educ', 'income', 'married']),
            privacy_usage={'epsilon': 10},
            data_lower=[0., 0., 1., 0., 0.],
            data_upper=[100., 1., 16., 500_000., 1.],
            data_rows=1000)

        analysis.release()

    # store DP covariance and correlation matrix
    dp_cov = cov.value
    print(dp_cov)
    dp_corr = dp_cov / np.outer(np.sqrt(np.diag(dp_cov)), np.sqrt(np.diag(dp_cov)))

    # get non-DP covariance/correlation matrices
    age = list(data[:]['age'])
    sex = list(data[:]['sex'])
    educ = list(data[:]['educ'])
    income = list(data[:]['income'])
    married = list(data[:]['married'])
    non_dp_cov = np.cov([age, sex, educ, income, married])
    non_dp_corr = non_dp_cov / np.outer(np.sqrt(np.diag(non_dp_cov)), np.sqrt(np.diag(non_dp_cov)))

    print('Non-DP Covariance Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_cov)))
    print('Non-DP Correlation Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_corr)))
    print('DP Correlation Matrix:\n{0}'.format(pd.DataFrame(dp_corr)))

    # skip plot step on CI builds
    if IS_CI_BUILD:
        return

    plt.imshow(non_dp_corr - dp_corr, interpolation='nearest')
    plt.colorbar()
    plt.show()
def test_dp_linear_stats(run=True):
    with sn.Analysis() as analysis:
        dataset_pums = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        age = dataset_pums['age']
        analysis.release()

        num_records = sn.dp_count(
            age,
            privacy_usage={'epsilon': .5},
            lower=0,
            upper=10000)
        analysis.release()

        print("number of records:", num_records.value)

        vars = sn.to_float(dataset_pums[["age", "income"]])

        covariance = sn.dp_covariance(
            data=vars,
            privacy_usage={'epsilon': .5},
            data_lower=[0., 0.],
            data_upper=[150., 150000.],
            data_rows=num_records)
        print("covariance released")

        num_means = sn.dp_mean(
            data=vars,
            privacy_usage={'epsilon': .5},
            data_lower=[0., 0.],
            data_upper=[150., 150000.],
            data_rows=num_records)

        analysis.release()
        print("covariance:\n", covariance.value)
        print("means:\n", num_means.value)

        age = sn.to_float(age)

        age_variance = sn.dp_variance(
            age,
            privacy_usage={'epsilon': .5},
            data_lower=0.,
            data_upper=150.,
            data_rows=num_records)

        analysis.release()
        print("age variance:", age_variance.value)

        # If I clamp, impute, resize, then I can reuse their properties for multiple statistics
        clamped_age = sn.clamp(age, lower=0., upper=100.)
        imputed_age = sn.impute(clamped_age)
        preprocessed_age = sn.resize(imputed_age, number_rows=num_records)

        # properties necessary for mean are statically known
        mean = sn.dp_mean(
            preprocessed_age,
            privacy_usage={'epsilon': .5})

        # properties necessary for variance are statically known
        variance = sn.dp_variance(
            preprocessed_age,
            privacy_usage={'epsilon': .5})

        # sum doesn't need n, so I pass the data in before resizing
        age_sum = sn.dp_sum(
            imputed_age,
            privacy_usage={'epsilon': .5})

        # mean with lower, upper properties propagated up from prior bounds
        transformed_mean = sn.dp_mean(
            -(preprocessed_age + 2.),
            privacy_usage={'epsilon': .5})

        analysis.release()
        print("age transformed mean:", transformed_mean.value)

        # releases may be pieced together from combinations of smaller components
        custom_mean = sn.laplace_mechanism(
            sn.mean(preprocessed_age),
            privacy_usage={'epsilon': .5})

        custom_maximum = sn.laplace_mechanism(
            sn.maximum(preprocessed_age),
            privacy_usage={'epsilon': .5})

        custom_quantile = sn.laplace_mechanism(
            sn.quantile(preprocessed_age, alpha=.5),
            privacy_usage={'epsilon': 500})

        income = sn.to_float(dataset_pums['income'])
        income_max = sn.laplace_mechanism(
            sn.maximum(income, data_lower=0., data_upper=1000000.),
            privacy_usage={'epsilon': 10})

        # releases may also be postprocessed and reused as arguments to more components
        age_sum + custom_maximum * 23.

        analysis.release()
        print("laplace quantile:", custom_quantile.value)

        age_histogram = sn.dp_histogram(
            sn.to_int(age, lower=0, upper=100),
            edges=list(range(0, 100, 25)),
            null_value=150,
            privacy_usage={'epsilon': 2.})

        sex_histogram = sn.dp_histogram(
            sn.to_bool(dataset_pums['sex'], true_label="1"),
            privacy_usage={'epsilon': 2.})

        education_histogram = sn.dp_histogram(
            dataset_pums['educ'],
            categories=["5", "7", "10"],
            null_value="-1",
            privacy_usage={'epsilon': 2.})

        analysis.release()

        print("age histogram: ", age_histogram.value)
        print("sex histogram: ", sex_histogram.value)
        print("education histogram: ", education_histogram.value)

    if run:
        analysis.release()

        # get the mean computed when release() was called
        print(mean.value)
        print(variance.value)

    return analysis
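# A hypothetical direct-invocation guard (not in the original file): these
# functions are normally collected by a test runner such as pytest, but the
# guard below lets the samples be run as a script, assuming TEST_PUMS_PATH,
# TEST_PUMS_NAMES and IS_CI_BUILD are defined at module level.
if __name__ == '__main__':
    test_dp_covariance()
    test_covariance()
    test_dp_linear_stats()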