def test_fail_groupby():
    """Union partitioned means where only one branch declares a privacy usage."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(sn.clamp(
            sn.to_float(data[['educ', 'income']]),
            lower=[0., 0.],
            upper=[15., 200_000.]))
        partitioned = sn.partition(educ_inc, by=is_male)

        # bounds shared by both branches of the union
        shared_bounds = {
            "data_lower": [0., 0.],
            "data_upper": [15., 200_000.],
            "data_rows": 500
        }

        union = sn.union({
            True: sn.mean(partitioned[True], privacy_usage={"epsilon": 0.1}, **shared_bounds),
            False: sn.mean(partitioned[False], **shared_bounds),
        })

        sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})
        print(analysis.privacy_usage)
def test_groupby_3():
    # now union the released output
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(sn.clamp(
            sn.to_float(data[['educ', 'income']]),
            lower=[0., 0.],
            upper=[15., 200_000.]))
        partitioned = sn.partition(educ_inc, by=is_male)

        # release a DP mean independently within each partition
        means = {
            label: sn.dp_mean(
                sn.resize(partitioned[label], number_rows=500),
                privacy_usage={"epsilon": 1.0})
            for label in is_male.categories
        }

        union = sn.union(means)

        analysis.release()
        print(analysis.privacy_usage)
        print(union.value)
def test_groupby_c_stab():
    # use the same partition multiple times in union
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(sn.clamp(
            sn.to_float(data[['educ', 'income']]),
            lower=[0., 0.],
            upper=[15., 200_000.]))
        partitioned = sn.partition(educ_inc, by=is_male)

        def resized_mean(part):
            # mean over a partition resized to a fixed row count
            return sn.mean(sn.resize(part, number_rows=500))

        # reusing partitioned[True] twice inflates the c-stability of the union
        means = {
            True: resized_mean(partitioned[True]),
            False: resized_mean(partitioned[False]),
            "duplicate_that_inflates_c_stab": resized_mean(partitioned[True]),
        }

        noised = sn.laplace_mechanism(sn.union(means), privacy_usage={"epsilon": 1.0})

        analysis.release()
        print(analysis.privacy_usage)
        print(noised.value)
def analytic_gaussian_similarity():
    """Compare repeated DP mean releases under the analytic vs. plain Gaussian mechanism."""
    analytic_gauss_estimates = []
    gauss_estimates = []
    with sn.Analysis(strict_parameter_checks=False):
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        age = sn.impute(
            sn.to_float(PUMS['age']),
            data_lower=0., data_upper=100., data_rows=1000)
        for _ in range(100):
            an_gauss_component = sn.dp_mean(
                age,
                mechanism="AnalyticGaussian",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})
            gauss_component = sn.dp_mean(
                age,
                mechanism="Gaussian",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})
            # this triggers an analysis.release (which also computes gauss_component)
            analytic_gauss_estimates.append(an_gauss_component.value)
            gauss_estimates.append(gauss_component.value)
    print(sum(analytic_gauss_estimates) / len(analytic_gauss_estimates))
    print(sum(gauss_estimates) / len(gauss_estimates))
def snapping_similarity():
    """Compare repeated DP mean releases under the snapping vs. Laplace mechanism."""
    snapping_estimates = []
    laplace_estimates = []
    with sn.Analysis(strict_parameter_checks=False):
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        age = sn.impute(
            sn.to_float(PUMS['age']),
            data_lower=0., data_upper=100., data_rows=1000)
        for _ in range(100):
            snapping_component = sn.dp_mean(
                age,
                mechanism="snapping",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})
            laplace_component = sn.dp_mean(
                age,
                mechanism="laplace",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})
            # reading .value triggers the release for both components
            snapping_estimates.append(snapping_component.value)
            laplace_estimates.append(laplace_component.value)
    print(sum(snapping_estimates) / len(snapping_estimates))
    print(sum(laplace_estimates) / len(laplace_estimates))
def test_histogram():
    """Release a DP histogram of income and compare against the exact histogram."""
    import numpy as np

    # establish data information
    data = np.genfromtxt(TEST_PUMS_PATH, delimiter=',', names=True)
    income = list(data[:]['income'])
    income_edges = list(range(0, 100_000, 10_000))

    print('actual', np.histogram(income, bins=income_edges)[0])

    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        # BUG FIX: upper was 0, which clamped every income value to zero and made
        # the DP histogram degenerate relative to the 'actual' histogram printed
        # above; use the top histogram edge as the upper bound instead.
        income = sn.to_int(data['income'], lower=0, upper=100_000)

        income_histogram = sn.dp_histogram(
            income,
            edges=income_edges,
            privacy_usage={'epsilon': 1.})

        analysis.release()
        print("Income histogram Geometric DP release: " + str(income_histogram.value))
def test_groupby_4():
    # now union private data, and apply mechanism after
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(sn.clamp(
            sn.to_float(data[['educ', 'income']]),
            lower=[0., 0.],
            upper=[15., 200_000.]))
        partitioned = sn.partition(educ_inc, by=is_male)

        # exact (non-noised) means inside each partition; noise is added post-union
        means = {
            label: sn.mean(sn.resize(partitioned[label], number_rows=500))
            for label in is_male.categories
        }

        noised = sn.laplace_mechanism(sn.union(means), privacy_usage={"epsilon": 1.0})

        analysis.release()
        print(analysis.privacy_usage)
        print(noised.value)
def test_properties():
    """Inspect static properties (nullity, bounds, releasability) of components."""
    with sn.Analysis():
        # load data
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # establish data
        age_dt = sn.cast(data['age'], 'FLOAT')

        # ensure data are non-null
        non_null_age_dt = sn.impute(age_dt, distribution='Uniform', lower=0., upper=100.)
        clamped = sn.clamp(age_dt, lower=0., upper=100.)

        # create potential for null data again
        potentially_null_age_dt = non_null_age_dt / 0.

        print('properties after imputation:\n{0}\n\n'.format(non_null_age_dt.nullity))
        print('properties after nan mult:\n{0}\n\n'.format(potentially_null_age_dt.nullity))

        print("lower", clamped.lower)
        print("upper", clamped.upper)
        print("releasable", clamped.releasable)
        print("data_type", clamped.data_type)
        print("categories", clamped.categories)
def test_dataframe_partitioning_2():
    # dataframe partition with multi-index grouping
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # group on (sex, educ); out-of-category rows fall into the '-1' bucket
        grouper = sn.clamp(
            data[['sex', 'educ']],
            categories=[['0', '1'], [str(i) for i in range(14)]],
            null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        sn.union(
            {key: sn.dp_count(partitioned[key], privacy_usage={"epsilon": 0.5})
             for key in partitioned.partition_keys},
            flatten=False)

        print(sn.union({
            key: sn.dp_mean(
                sn.to_float(partitioned[key]['income']),
                implementation="plug-in",
                data_lower=0.,
                data_upper=200_000.,
                privacy_usage={"epsilon": 0.5})
            for key in partitioned.partition_keys
        }))
def test_multilayer_partition_1():
    # multilayer partition with mechanisms applied inside partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(sn.clamp(
            sn.to_float(data[['educ', 'income']]),
            lower=[0., 0.],
            upper=[15., 200_000.]))
        partitioned = sn.partition(educ_inc, by=is_male)

        def analyze(part):
            # repartition this sex-partition again, by clamped education level
            educ = sn.clamp(
                sn.to_int(sn.index(part, indices=0), lower=0, upper=15),
                categories=list(range(15)),
                null_value=-1)
            income = sn.index(part, indices=1)
            repartitioned = sn.partition(income, by=educ)

            inner_count = {}
            inner_means = {}
            for level in [5, 8, 12]:
                educ_level_part = repartitioned[level]
                inner_count[level] = sn.dp_count(
                    educ_level_part, privacy_usage={"epsilon": 0.4})
                # row_max keeps the resize argument at least 1, even if the DP count is small
                inner_means[level] = sn.dp_mean(
                    educ_level_part,
                    privacy_usage={"epsilon": 0.6},
                    data_rows=sn.row_max(1, inner_count[level]))

            return sn.union(inner_means, flatten=False), sn.union(inner_count, flatten=False)

        means = {}
        counts = {}
        for key in partitioned.partition_keys:
            means[key], counts[key] = analyze(partitioned[key])

        means = sn.union(means, flatten=False)
        counts = sn.union(counts, flatten=False)

        print("releasing")
        print(len(analysis.components.items()))
        analysis.release()
        print(analysis.privacy_usage)

        print("Counts:")
        print(counts.value)
        print("Means:")
        print(means.value)
def test_index():
    """Indexing column 0 of the boolean fixture yields the first column's values."""
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()
        first_column = sn.index(data, indices=0)
        analysis.release()

        expected = [True, True, False, False]
        assert all(got == want for got, want in zip(first_column.value, expected))
def test_equal():
    """Element-wise equality of columns 0 and 1 of the boolean fixture."""
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()
        left = sn.index(data, indices=0)
        right = sn.index(data, indices=1)
        equality = left == right
        analysis.release()

        assert np.array_equal(equality.value, np.array([True, False, False, True]))
def test_partition():
    """Evenly partition the boolean fixture into three chunks and check contents."""
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()
        partitioned = sn.partition(data, num_partitions=3)
        analysis.release()

        expected_parts = [
            np.array([[True, True], [True, False]]),
            np.array([[False, True]]),
            np.array([[False, False]]),
        ]
        for index, expected in enumerate(expected_parts):
            assert np.array_equal(partitioned.value[index], expected)
def test_map_1():
    # map a count over all dataframe partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        by_sex = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data, by=by_sex)

        # dp_count maps automatically over every partition
        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})
        print(counts.value)
        print(analysis.privacy_usage)
def test_dp_mean():
    """DP mean on synthetic data (with accuracy helpers) and on PUMS income (plug-in)."""
    with sn.Analysis():
        data = generate_synthetic(float, variants=['Random'])
        mean = sn.dp_mean(
            data['F_Random'],
            privacy_usage={'epsilon': 0.1},
            data_lower=0.,
            data_upper=10.,
            data_rows=10)

        print("accuracy", mean.get_accuracy(0.05))
        print(mean.from_accuracy(2.3, .05))

    with sn.Analysis():
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        income_mean = sn.dp_mean(
            sn.to_float(data['income']),
            implementation="plug-in",
            data_lower=0.,
            data_upper=200_000.,
            privacy_usage={"epsilon": 0.5})
        print(income_mean.value)
def test_private_clamped_sum_helpers():
    # Compute the CI with smartnoise
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_DATA_PATH, column_names=TEST_DATA_COLUMNS)
        ages = sn.to_float(data["age"])
        resized = sn.resize(
            sn.clamp(data=ages, lower=0.0, upper=100.0),
            number_rows=1000)
        release = sn.dp_sum(data=sn.impute(resized), privacy_usage={"epsilon": 1.0})
        smartnoise_ci = release.get_accuracy(0.05)

        # compare against the closed-form CI from the helper implementation
        op = PrivateClampedSum(lower_bound=0, upper_bound=100)
        eeprivacy_ci = op.confidence_interval(epsilon=1, confidence=0.95)

        assert pytest.approx(smartnoise_ci, abs=0.001) == eeprivacy_ci
def test_reports():
    """Release a DP mean of age and report the resulting privacy usage."""
    with sn.Analysis() as analysis:
        # load data
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # get mean of age
        age_mean = sn.dp_mean(
            data=sn.to_float(data['age']),
            privacy_usage={'epsilon': .65},
            data_lower=0.,
            data_upper=100.,
            data_rows=1000)

        print("Pre-Release\n")
        print("DP mean of age: {0}".format(age_mean.value))
        print("Privacy usage: {0}\n\n".format(analysis.privacy_usage))
def test_dp_count(run=True):
    """DP count of records with sex == '1'; optionally releases before returning."""
    with sn.Analysis() as analysis:
        dataset_pums = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        count = sn.dp_count(
            dataset_pums['sex'] == '1',
            privacy_usage={'epsilon': 0.5})

        if run:
            analysis.release()
            print(count.value)

        return analysis
def try_sn():
    """Simulate repeated DP mean releases and check accuracy-interval coverage."""
    # establish data information
    #data_path = 'https://raw.githubusercontent.com/opendp/smartnoise-samples/86-requirements-fix/analysis/data/PUMS_california_demographics_1000/data.csv'
    data_path = os.path.abspath(
        os.path.join('.', 'data', 'PUMS_california_demographics_1000', 'data.csv'))
    print('data_path', data_path)

    var_names = ["age", "sex", "educ", "race", "income", "married", "pid"]

    D = pd.read_csv(data_path)['age']
    D_mean_age = np.mean(D)
    print('D_mean_age', D_mean_age)

    # establish extra information for this simulation
    age_lower_bound = 0.
    age_upper_bound = 100.
    D_tilde = np.clip(D, age_lower_bound, age_upper_bound)
    D_tilde_mean_age = np.mean(D_tilde)
    data_size = 1000

    df = pd.read_csv(data_path)
    df_as_array = [list(row) for row in df.itertuples()]  #df.values.tolist()
    print('D.values', df_as_array)

    n_sims = 2
    releases = []
    with sn.Analysis(dynamic=True) as analysis:
        data = sn.Dataset(path=data_path, column_names=var_names)
        #data = sn.Dataset(value=df_as_array, column_names=var_names)
        D = sn.to_float(data['age'])

        # preprocess data (resize is a no-op because we have the correct data size)
        D_tilde = sn.resize(sn.clamp(data=D, lower=0., upper=100.), number_rows=data_size)

        for _ in range(n_sims):
            # get DP mean of age
            releases.append(sn.dp_mean(data=sn.impute(D_tilde), privacy_usage={'epsilon': 1}))

        accuracy = releases[0].get_accuracy(0.05)

        analysis.release()
        dp_values = [release.value for release in releases]
        print(
            'Accuracy interval (with accuracy value {0}) contains the true mean on D_tilde with probability {1}'
            .format(
                round(accuracy, 4),
                np.mean([(D_tilde_mean_age >= val - accuracy)
                         & (D_tilde_mean_age <= val + accuracy)
                         for val in dp_values])))
def test_raw_dataset(run=True):
    """DP mean over a literal (in-memory) dataset; optionally releases before returning."""
    with sn.Analysis() as analysis:
        data = sn.to_float(sn.Dataset(value=[1., 2., 3., 4., 5.]))
        sn.dp_mean(
            data=data,
            privacy_usage={'epsilon': 1},
            data_lower=0.,
            data_upper=10.,
            data_rows=10,
            data_columns=1)

        if run:
            analysis.release()

        return analysis
def test_map_2():
    # map a count over a large number of tuple partitions of dataframes
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # multi-column grouper: every (sex, educ) combination becomes a partition
        grouper = sn.clamp(
            data[['sex', 'educ']],
            categories=[['0', '1'], [str(i) for i in range(14)]],
            null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})
        print(counts.value)
        print(analysis.privacy_usage)
def test_divide():
    """Nullity propagation through division: dividing by data whose range includes
    zero reintroduces nullity; dividing by strictly-positive data does not."""
    with sn.Analysis():
        data_A = generate_synthetic(float, variants=['Random'])

        f_random = data_A['F_Random']
        imputed = sn.impute(f_random, lower=0., upper=10.)
        clamped_nonzero = sn.clamp(imputed, lower=1., upper=10.)
        clamped_zero = sn.clamp(imputed, lower=0., upper=10.)

        # test properties
        assert f_random.nullity
        assert not imputed.nullity
        assert (2. / imputed).nullity
        assert (f_random / imputed).nullity
        # BUG FIX: clamped_nonzero was constructed but never asserted on. Its
        # lower bound of 1. excludes zero, so division by it cannot produce
        # null values — the missing counterpart to the clamped_zero check below.
        assert not (2. / clamped_nonzero).nullity
        assert (2. / clamped_zero).nullity
def test_map_3():
    # chain multiple maps over an array partition with implicit preprocessing
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        by_sex = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(sn.to_float(data['age']), by=by_sex)

        # NOTE(review): data_upper=15. looks low for an age column — confirm intended
        means = sn.dp_mean(
            partitioned,
            privacy_usage={'epsilon': 0.1},
            data_rows=500,
            data_lower=0.,
            data_upper=15.)
        print(means.value)
        print(analysis.privacy_usage)
def test_covariance():
    """Compare the DP covariance/correlation matrix against the exact one."""
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    data = np.genfromtxt(TEST_PUMS_PATH, delimiter=',', names=True)

    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # get full covariance matrix
        cov = sn.dp_covariance(
            data=sn.to_float(wn_data['age', 'sex', 'educ', 'income', 'married']),
            privacy_usage={'epsilon': 10},
            data_lower=[0., 0., 1., 0., 0.],
            data_upper=[100., 1., 16., 500_000., 1.],
            data_rows=1000)
        analysis.release()

        # store DP covariance and correlation matrix
        dp_cov = cov.value
        print(dp_cov)
        dp_corr = dp_cov / np.outer(np.sqrt(np.diag(dp_cov)),
                                    np.sqrt(np.diag(dp_cov)))

        # get non-DP covariance/correlation matrices
        columns = [list(data[:][name])
                   for name in ('age', 'sex', 'educ', 'income', 'married')]
        non_dp_cov = np.cov(columns)
        non_dp_corr = non_dp_cov / np.outer(np.sqrt(np.diag(non_dp_cov)),
                                            np.sqrt(np.diag(non_dp_cov)))

        print('Non-DP Covariance Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_cov)))
        print('Non-DP Correlation Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_corr)))
        print('DP Correlation Matrix:\n{0}'.format(pd.DataFrame(dp_corr)))

        # skip plot step
        if IS_CI_BUILD:
            return
        plt.imshow(non_dp_corr - dp_corr, interpolation='nearest')
        plt.colorbar()
        plt.show()
def test_median_education():
    # import pandas as pd
    # print(pd.read_csv(data_path)['value'].median())
    with sn.Analysis(filter_level="all") as analysis:
        data = sn.Dataset(path=TEST_EDUC_PATH, column_names=TEST_EDUC_NAMES)

        # odd candidate values from 1 to 199
        candidates = [float(i) for i in range(1, 200, 2)]

        median_scores = sn.median(
            sn.impute(sn.to_float(data['value']), 100., 200.),
            candidates=candidates)

        dp_median = sn.exponential_mechanism(
            median_scores,
            candidates=candidates,
            privacy_usage={"epsilon": 100.})
        print(dp_median.value)
        analysis.release()
def test_groupby_1():
    """DP count per sex-partition of the (educ, income) columns."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data[['educ', 'income']], by=is_male)

        counts = {}
        for label in is_male.categories:
            counts[label] = sn.dp_count(partitioned[label], privacy_usage={'epsilon': 0.1})

        analysis.release()
        print(analysis.privacy_usage)
        print({label: counts[label].value for label in counts})
def test_dp_median_raw():
    """DP median over a literal data vector tagged as private."""
    with sn.Analysis() as analysis:
        # create a literal data vector, and tag it as private
        data = sn.Component.of([float(i) for i in range(20)], public=False)

        dp_median = sn.dp_median(
            sn.to_float(data),
            privacy_usage={"epsilon": 1.},
            candidates=[-10., -2., 2., 3., 4., 7., 10., 12.],
            data_lower=0.,
            data_upper=10.,
            data_columns=1).value

        print(dp_median)
        assert dp_median is not None
def test_dp_covariance():
    """Exercise all three dp_covariance forms: scalar, matrix, and cross-covariance."""
    # establish data information
    var_names = ["age", "sex", "educ", "race", "income", "married"]

    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # get scalar covariance
        age_income_cov_scalar = sn.dp_covariance(
            left=sn.to_float(wn_data['age']),
            right=sn.to_float(wn_data['income']),
            privacy_usage={'epsilon': 5000},
            left_lower=0., left_upper=100., left_rows=1000,
            right_lower=0., right_upper=500_000., right_rows=1000)

        data = sn.to_float(wn_data['age', 'income'])

        # get full covariance matrix
        age_income_cov_matrix = sn.dp_covariance(
            data=data,
            privacy_usage={'epsilon': 5000},
            data_lower=[0., 0.],
            data_upper=[100., 500_000.],
            data_rows=1000)

        # get cross-covariance matrix
        cross_covar = sn.dp_covariance(
            left=data,
            right=data,
            privacy_usage={'epsilon': 5000},
            left_lower=[0., 0.], left_upper=[100., 500_000.], left_rows=1_000,
            right_lower=[0., 0.], right_upper=[100., 500_000.], right_rows=1000)

        analysis.release()
        print('scalar covariance:\n{0}\n'.format(age_income_cov_scalar.value))
        print('covariance matrix:\n{0}\n'.format(age_income_cov_matrix.value))
        print('cross-covariance matrix:\n{0}'.format(cross_covar.value))
def test_dp_linear_regression():
    """DP simple linear regression of income on age."""
    with sn.Analysis():
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        wn_data = sn.resize(
            sn.to_float(wn_data[["age", "income"]]),
            number_rows=1000,
            lower=[0., 0.],
            upper=[100., 500_000.])

        fit = sn.dp_linear_regression(
            data_x=sn.index(wn_data, indices=0),
            data_y=sn.index(wn_data, indices=1),
            privacy_usage={'epsilon': 10.},
            lower_slope=0., upper_slope=1000.,
            lower_intercept=0., upper_intercept=1000.)
        print(fit.value)
def test_dataframe_partitioning_1():
    # dataframe partition
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data, by=is_male)

        income_means = sn.union({
            key: sn.dp_mean(
                sn.impute(sn.clamp(sn.to_float(partitioned[key]['income']), 0., 200_000.)),
                implementation="plug-in",
                privacy_usage={"epsilon": 0.5})
            for key in partitioned.partition_keys
        })
        print(income_means.value)
        print(analysis.privacy_usage)