def test_groupby_3():
    """Release one DP mean per sex partition, then union the released values."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")

        # clamp/impute the two numeric columns, then split rows by sex
        clamped = sn.clamp(sn.to_float(data[['educ', 'income']]),
                           lower=[0., 0.], upper=[15., 200_000.])
        partitioned = sn.partition(sn.impute(clamped), by=is_male)

        # one DP mean per partition, each released at epsilon = 1
        means = {
            category: sn.dp_mean(sn.resize(partitioned[category], number_rows=500),
                                 privacy_usage={"epsilon": 1.0})
            for category in is_male.categories
        }
        unioned = sn.union(means)

        analysis.release()
        print(analysis.privacy_usage)
        print(unioned.value)
def test_groupby_4():
    """Union exact per-partition means, then apply one mechanism to the union."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        is_male = sn.to_bool(data['sex'], true_label="1")

        clamped = sn.clamp(sn.to_float(data[['educ', 'income']]),
                           lower=[0., 0.], upper=[15., 200_000.])
        partitioned = sn.partition(sn.impute(clamped), by=is_male)

        # exact (non-private) means per partition; noise is added after the union
        means = {
            category: sn.mean(sn.resize(partitioned[category], number_rows=500))
            for category in is_male.categories
        }
        noised = sn.laplace_mechanism(sn.union(means),
                                      privacy_usage={"epsilon": 1.0})

        analysis.release()
        print(analysis.privacy_usage)
        print(noised.value)
def test_private_clamped_sum_helpers():
    """Compare eeprivacy's clamped-sum CI helper against smartnoise's accuracy."""
    # Compute the CI with smartnoise
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_DATA_PATH, column_names=TEST_DATA_COLUMNS)
        ages = sn.to_float(data["age"])
        prepared = sn.resize(sn.clamp(data=ages, lower=0.0, upper=100.0),
                             number_rows=1000)
        release = sn.dp_sum(data=sn.impute(prepared),
                            privacy_usage={"epsilon": 1.0})
        smartnoise_ci = release.get_accuracy(0.05)

        # Compute the CI with eeprivacy and compare
        op = PrivateClampedSum(lower_bound=0, upper_bound=100)
        eeprivacy_ci = op.confidence_interval(epsilon=1, confidence=0.95)
        assert pytest.approx(smartnoise_ci, abs=0.001) == eeprivacy_ci
def try_sn():
    """Simulate repeated DP mean releases of PUMS ages and report CI coverage.

    Reads the local PUMS demographics CSV, computes the true clamped mean age,
    then makes ``n_sims`` DP mean releases and prints how often the accuracy
    interval around each release covers the true clamped mean.

    Fix: the CSV was previously read from disk twice (once for the age column,
    once for the full frame); it is now read once.
    """
    # establish data information
    #data_path = 'https://raw.githubusercontent.com/opendp/smartnoise-samples/86-requirements-fix/analysis/data/PUMS_california_demographics_1000/data.csv'
    data_path = os.path.join('.', 'data', 'PUMS_california_demographics_1000',
                             'data.csv')
    data_path = os.path.abspath(data_path)
    print('data_path', data_path)
    var_names = ["age", "sex", "educ", "race", "income", "married", "pid"]

    # read the CSV once and derive both the age column and the row list from it
    df = pd.read_csv(data_path)
    D = df['age']
    D_mean_age = np.mean(D)
    print('D_mean_age', D_mean_age)

    # establish extra information for this simulation
    age_lower_bound = 0.
    age_upper_bound = 100.
    D_tilde = np.clip(D, age_lower_bound, age_upper_bound)
    D_tilde_mean_age = np.mean(D_tilde)
    data_size = 1000

    df_as_array = [list(row) for row in df.itertuples()]  #df.values.tolist()
    print('D.values', df_as_array)

    n_sims = 2
    releases = []
    with sn.Analysis(dynamic=True) as analysis:
        data = sn.Dataset(path=data_path, column_names=var_names)
        #data = sn.Dataset(value=df_as_array, column_names=var_names)
        D = sn.to_float(data['age'])

        # preprocess data (resize is a no-op because we have the correct data size)
        D_tilde = sn.resize(sn.clamp(data=D, lower=0., upper=100.),
                            number_rows=data_size)

        for index in range(n_sims):
            # get DP mean of age
            releases.append(
                sn.dp_mean(data=sn.impute(D_tilde),
                           privacy_usage={'epsilon': 1}))

        accuracy = releases[0].get_accuracy(0.05)
        analysis.release()
        dp_values = [release.value for release in releases]
        print(
            'Accuracy interval (with accuracy value {0}) contains the true mean on D_tilde with probability {1}'
            .format(
                round(accuracy, 4),
                np.mean([(D_tilde_mean_age >= val - accuracy)
                         & (D_tilde_mean_age <= val + accuracy)
                         for val in dp_values])))
def test_dp_linear_regression():
    """Release a DP linear regression of income on age over the PUMS data."""
    with sn.Analysis():
        pums = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        prepared = sn.resize(sn.to_float(pums[["age", "income"]]),
                             number_rows=1000,
                             lower=[0., 0.],
                             upper=[100., 500_000.])
        regression = sn.dp_linear_regression(
            data_x=sn.index(prepared, indices=0),
            data_y=sn.index(prepared, indices=1),
            privacy_usage={'epsilon': 10.},
            lower_slope=0., upper_slope=1000.,
            lower_intercept=0., upper_intercept=1000.)
        print(regression.value)
def analyze(data):
    """Per-education-level DP counts and means of income.

    Partitions the income column (index 1) by a clamped integer education
    column (index 0) and, for education levels 5, 8 and 12, releases a DP
    count and a mean over a resize driven by that count.
    """
    educ = sn.clamp(sn.to_int(sn.index(data, indices=0), lower=0, upper=15),
                    categories=list(range(15)), null_value=-1)
    income = sn.index(data, indices=1)
    repartitioned = sn.partition(income, by=educ)

    inner_count = {}
    inner_means = {}
    for level in (5, 8, 12):
        subset = repartitioned[level]
        inner_count[level] = sn.dp_count(subset,
                                         privacy_usage={"epsilon": 0.4})
        # NOTE(review): row_min(1, ...) caps the resize at one row; sibling code
        # uses row_max(1, ...) to enforce a floor of one row — confirm which was
        # intended here before changing.
        estimated_rows = sn.row_min(1, inner_count[level] * 4 // 5)
        inner_means[level] = sn.mean(
            sn.resize(subset, number_rows=estimated_rows))
    return sn.union(inner_means), sn.union(inner_count)
def test_mechanism(args, constructor):
    """Run *constructor* over prepared categorical/numeric PUMS columns and
    assert every released statistic has a value.

    Fix: the mapping returned by *constructor* was bound to ``all``, shadowing
    the builtin; it is now ``released``.
    """
    with sn.Analysis() as analysis:
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        categorical = sn.resize(
            sn.clamp(PUMS['sex'], categories=["0", "1"], null_value="0"),
            number_rows=1000)
        numeric = sn.impute(sn.to_float(PUMS['age']),
                            data_lower=0., data_upper=100., data_rows=1000)

        released = constructor(numeric, categorical, args)
        analysis.release()

        released_values = {stat: released[stat].value for stat in released}
        print()
        pprint(released_values)
        for value in released_values.values():
            assert value is not None
def test_private_clamped_mean_helpers():
    """Compare eeprivacy's clamped-mean CI and epsilon helpers to smartnoise."""
    # Compute the CI with smartnoise
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_DATA_PATH, column_names=TEST_DATA_COLUMNS)
        ages = sn.to_float(data["age"])
        prepared = sn.resize(sn.clamp(data=ages, lower=0.0, upper=100.0),
                             number_rows=1000)
        release = sn.dp_mean(data=sn.impute(prepared),
                             privacy_usage={"epsilon": 1.0})
        smartnoise_ci = release.get_accuracy(0.05)

        # Compute the CI with eeprivacy
        op = PrivateClampedMean(lower_bound=0, upper_bound=100)
        eeprivacy_ci = op.confidence_interval(epsilon=1, N=1000, confidence=0.95)

        # Compare computed confidence intervals
        assert pytest.approx(smartnoise_ci, abs=0.001) == eeprivacy_ci

        smartnoise_epsilon = release.from_accuracy(value=1, alpha=0.05)[0]["epsilon"]
        eeprivacy_epsilon = op.epsilon_for_confidence_interval(
            target_ci=1, N=1000, confidence=0.95)

        # Compare computed epsilons for confidence interval
        assert pytest.approx(smartnoise_epsilon, abs=0.001) == eeprivacy_epsilon
def create_dicts(data, non_income_data, plausible_variable_combinations):
    """Compute exact and DP income statistics for each variable combination.

    Args:
        data: frame containing an 'income' column.
        non_income_data: frame of 0/1 indicator columns, row-aligned with data.
        plausible_variable_combinations: iterable of tuples of indicator names.

    Returns:
        Tuple of ten dicts keyed by '__'.join(combination): exact and private
        count, mean, median, min and max of income for the matching rows.

    Fix: the original selected rows through a 5-way if/elif chain over
    len(combination), duplicating the mask logic and leaving ``dt`` undefined
    (NameError) for combinations longer than five; the selection is now a
    generic conjunction over the combination, of any length >= 1.
    """
    count_dict = {}
    priv_count_dict = {}
    mean_income_dict = {}
    priv_mean_income_dict = {}
    median_income_dict = {}
    priv_median_income_dict = {}
    min_income_dict = {}
    priv_min_income_dict = {}
    max_income_dict = {}
    priv_max_income_dict = {}

    # get number of data elements with each set of variable values
    for combination in plausible_variable_combinations:
        # conjunction of the indicator columns selects the matching rows
        mask = non_income_data[combination[0]] == 1
        for variable in combination[1:]:
            mask &= non_income_data[variable] == 1
        dt = data[mask]

        key = '__'.join(combination)
        count_dict[key] = dt.shape[0]
        mean_income_dict[key] = np.mean(dt['income'])
        median_income_dict[key] = np.median(dt['income'])
        min_income_dict[key] = np.min(dt['income'])
        max_income_dict[key] = np.max(dt['income'])

        with sn.Analysis() as analysis:
            # load data
            priv_data = sn.Dataset(value=dt['income'])

            # estimate sample size
            count = sn.dp_count(priv_data, privacy_usage={'epsilon': .05})

            # preprocess data
            priv_data = sn.resize(sn.to_float(priv_data),
                                  number_columns=1,
                                  number_rows=sn.row_max(1, count),
                                  lower=0.,
                                  upper=100_000.)
            priv_data = sn.impute(sn.clamp(priv_data, lower=0., upper=100_000.))

            # get mean
            mean = sn.dp_mean(priv_data, privacy_usage={'epsilon': 0.1})
            # get median
            median = sn.dp_median(priv_data, privacy_usage={'epsilon': 0.1})
            # get min
            _min = sn.dp_minimum(priv_data, privacy_usage={'epsilon': 0.1})
            # get max
            _max = sn.dp_maximum(priv_data, privacy_usage={'epsilon': 0.1})

            analysis.release()

            # post-process released values into their plausible [0, 100_000] range
            priv_count_dict[key] = max(0, count.value)
            priv_mean_income_dict[key] = min(max(0, mean.value), 100_000)
            priv_median_income_dict[key] = min(max(0, median.value), 100_000)
            priv_min_income_dict[key] = min(max(0, _min.value), 100_000)
            priv_max_income_dict[key] = min(max(0, _max.value), 100_000)

    return (count_dict, priv_count_dict, mean_income_dict,
            priv_mean_income_dict, median_income_dict,
            priv_median_income_dict, min_income_dict, priv_min_income_dict,
            max_income_dict, priv_max_income_dict)
def test_dp_linear_stats(run=True):
    """Exercise a broad range of DP linear statistics over the PUMS data.

    Fixes: a verbatim-duplicated ``custom_maximum = sn.laplace_mechanism(...)``
    statement (which spent privacy budget twice for the same release) is
    removed, and the local ``vars`` no longer shadows the builtin.
    """
    with sn.Analysis() as analysis:
        dataset_pums = sn.Dataset(path=TEST_PUMS_PATH,
                                  column_names=TEST_PUMS_NAMES)
        age = dataset_pums['age']
        analysis.release()

        num_records = sn.dp_count(age,
                                  privacy_usage={'epsilon': .5},
                                  lower=0,
                                  upper=10000)
        analysis.release()
        print("number of records:", num_records.value)

        # renamed from `vars` to avoid shadowing the builtin
        pums_floats = sn.to_float(dataset_pums[["age", "income"]])
        covariance = sn.dp_covariance(data=pums_floats,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=[0., 0.],
                                      data_upper=[150., 150000.],
                                      data_rows=num_records)
        print("covariance released")
        num_means = sn.dp_mean(data=pums_floats,
                               privacy_usage={'epsilon': .5},
                               data_lower=[0., 0.],
                               data_upper=[150., 150000.],
                               data_rows=num_records)
        analysis.release()
        print("covariance:\n", covariance.value)
        print("means:\n", num_means.value)

        age = sn.to_float(age)
        age_variance = sn.dp_variance(age,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=0.,
                                      data_upper=150.,
                                      data_rows=num_records)
        analysis.release()
        print("age variance:", age_variance.value)

        # If I clamp, impute, resize, then I can reuse their properties
        # for multiple statistics
        clamped_age = sn.clamp(age, lower=0., upper=100.)
        imputed_age = sn.impute(clamped_age)
        preprocessed_age = sn.resize(imputed_age, number_rows=num_records)

        # properties necessary for mean are statically known
        mean = sn.dp_mean(preprocessed_age, privacy_usage={'epsilon': .5})
        # properties necessary for variance are statically known
        variance = sn.dp_variance(preprocessed_age,
                                  privacy_usage={'epsilon': .5})
        # sum doesn't need n, so I pass the data in before resizing
        age_sum = sn.dp_sum(imputed_age, privacy_usage={'epsilon': .5})
        # mean with lower, upper properties propagated up from prior bounds
        transformed_mean = sn.dp_mean(-(preprocessed_age + 2.),
                                      privacy_usage={'epsilon': .5})
        analysis.release()
        print("age transformed mean:", transformed_mean.value)

        # releases may be pieced together from combinations of smaller components
        custom_mean = sn.laplace_mechanism(sn.mean(preprocessed_age),
                                           privacy_usage={'epsilon': .5})
        # (the duplicated second release of this maximum was removed)
        custom_maximum = sn.laplace_mechanism(sn.maximum(preprocessed_age),
                                              privacy_usage={'epsilon': .5})
        custom_quantile = sn.laplace_mechanism(
            sn.quantile(preprocessed_age, alpha=.5),
            privacy_usage={'epsilon': 500})

        income = sn.to_float(dataset_pums['income'])
        income_max = sn.laplace_mechanism(
            sn.maximum(income, data_lower=0., data_upper=1000000.),
            privacy_usage={'epsilon': 10})

        # releases may also be postprocessed and reused as arguments
        # to more components
        age_sum + custom_maximum * 23.

        analysis.release()
        print("laplace quantile:", custom_quantile.value)

        age_histogram = sn.dp_histogram(sn.to_int(age, lower=0, upper=100),
                                        edges=list(range(0, 100, 25)),
                                        null_value=150,
                                        privacy_usage={'epsilon': 2.})
        sex_histogram = sn.dp_histogram(sn.to_bool(dataset_pums['sex'],
                                                   true_label="1"),
                                        privacy_usage={'epsilon': 2.})
        education_histogram = sn.dp_histogram(dataset_pums['educ'],
                                              categories=["5", "7", "10"],
                                              null_value="-1",
                                              privacy_usage={'epsilon': 2.})
        analysis.release()
        print("age histogram: ", age_histogram.value)
        print("sex histogram: ", sex_histogram.value)
        print("education histogram: ", education_histogram.value)

    if run:
        analysis.release()
        # get the mean computed when release() was called
        print(mean.value)
        print(variance.value)

    return analysis
def test_everything(run=True):
    """Smoke-test a wide sweep of smartnoise components over the PUMS data.

    Builds (mostly unbound) components for casts, arithmetic, boolean ops,
    clamping, aggregations, DP releases, histograms and resizes; releases the
    analysis only when `run` is true. Returns the analysis object.
    """
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # typed views of individual columns
        age_int = sn.to_int(data['age'], 0, 150)
        sex = sn.to_bool(data['sex'], "1")
        educ = sn.to_float(data['educ'])
        race = data['race']
        income = sn.to_float(data['income'])
        married = sn.to_bool(data['married'], "1")

        numerics = sn.to_float(data[['age', 'income']])

        # intentionally busted component
        # print("invalid component id ", (sex + "a").component_id)

        # broadcast scalar over 2d, broadcast scalar over 1d, columnar
        # broadcasting, left and right mul
        numerics * 2. + 2. * educ

        # add different values for each column
        numerics + [[1., 2.]]

        # index into first column
        age = sn.index(numerics, indices=0)
        income = sn.index(numerics, mask=[False, True])

        # boolean ops and broadcasting
        mask = sex & married | (~married ^ False) | (age > 50.) | (age_int == 25)

        # numerical clamping
        sn.clamp(numerics, 0., [150., 150_000.])
        sn.clamp(data['educ'],
                 categories=[str(i) for i in range(8, 10)],
                 null_value="-1")

        sn.count(mask)
        sn.covariance(age, income)
        sn.digitize(educ, edges=[1., 3., 10.], null_value=-1)

        # checks for safety against division by zero
        income / 2.
        income / sn.clamp(educ, 5., 20.)

        # DP counts/histograms over the dataset and the boolean mask
        sn.dp_count(data, privacy_usage={"epsilon": 0.5})
        sn.dp_count(mask, privacy_usage={"epsilon": 0.5})
        sn.dp_histogram(mask, privacy_usage={"epsilon": 0.5})

        # impute after clamping so later aggregations have known bounds
        age = sn.impute(sn.clamp(age, 0., 150.))
        sn.dp_maximum(age, privacy_usage={"epsilon": 0.5})
        sn.dp_minimum(age, privacy_usage={"epsilon": 0.5})
        sn.dp_median(age, privacy_usage={"epsilon": 0.5})

        # statistics that need a known row count use the resized view
        age_n = sn.resize(age, number_rows=800)
        sn.dp_mean(age_n, privacy_usage={"epsilon": 0.5})
        sn.dp_raw_moment(age_n, order=3, privacy_usage={"epsilon": 0.5})

        sn.dp_sum(age, privacy_usage={"epsilon": 0.5})
        sn.dp_variance(age_n, privacy_usage={"epsilon": 0.5})

        sn.filter(income, mask)

        # categorical and edge-based histograms
        race_histogram = sn.histogram(race,
                                      categories=["1", "2", "3"],
                                      null_value="3")
        sn.histogram(income, edges=[0., 10000., 50000.], null_value=-1)
        sn.dp_histogram(married, privacy_usage={"epsilon": 0.5})

        # privatize the same raw histogram through both mechanisms
        sn.gaussian_mechanism(race_histogram,
                              privacy_usage={
                                  "epsilon": 0.5,
                                  "delta": .000001
                              })
        sn.laplace_mechanism(race_histogram,
                             privacy_usage={
                                 "epsilon": 0.5,
                                 "delta": .000001
                             })

        # non-private transforms and aggregations
        sn.raw_moment(educ, order=3)
        sn.log(sn.clamp(educ, 0.001, 50.))
        sn.maximum(educ)
        sn.mean(educ)
        sn.minimum(educ)
        educ % 2.
        educ**2.
        sn.quantile(educ, .32)

        # resize variants: numeric bounds, categorical weights, multi-column
        sn.resize(educ, number_rows=1200, lower=0., upper=50.)
        sn.resize(race, number_rows=1200, categories=["1", "2"], weights=[1, 2])
        sn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b"]],
                  weights=[1, 2])
        sn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b", "c"]],
                  weights=[[1, 2], [3, 7, 2]])

        sn.sum(educ)
        sn.variance(educ)

    if run:
        analysis.release()

    return analysis
def test_multilayer_analysis(run=True):
    """Chain releases where later bounds depend on an earlier released value."""
    with sn.Analysis() as analysis:
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        age = sn.to_float(PUMS['age'])
        sex = sn.to_bool(PUMS['sex'], true_label="TRUE")

        clamped = sn.clamp(age, lower=0., upper=150.)
        resized_age = sn.resize(clamped, number_rows=1000)

        race = sn.to_float(PUMS['race'])
        mean_age = sn.dp_mean(data=race,
                              privacy_usage={'epsilon': .65},
                              data_lower=0.,
                              data_upper=100.,
                              data_rows=500)
        analysis.release()

        sex_plus_22 = sn.add(sn.to_float(sex), 22.,
                             left_rows=1000, left_lower=0., left_upper=1.)

        # this release's lower bound is derived from the prior released mean
        sn.dp_mean(resized_age / 2. + sex_plus_22,
                   privacy_usage={'epsilon': .1},
                   data_lower=mean_age - 5.2,
                   data_upper=102.,
                   data_rows=500) + 5.

        sn.dp_variance(data=sn.to_float(PUMS['educ']),
                       privacy_usage={'epsilon': .15},
                       data_rows=1000,
                       data_lower=0.,
                       data_upper=12.)

        # (previously there were commented-out dp_raw_moment and dp_covariance
        #  examples here; removed as dead code)

    if run:
        analysis.release()

    return analysis
def analyze(data):
    """Resize *data* to 500 rows and return its (non-private) mean component."""
    resized = sn.resize(data, number_rows=500)
    return sn.mean(resized)
# An earlier, commented-out variant built the same release explicitly:
# impute(clamp(resize(to_float(...)))) followed by sn.mean and a
# laplace_mechanism at epsilon 0.1, printing both the exact and DP mean.

import opendp.smartnoise.core as sn
import numpy as np

# Demo: DP mean of a tiny literal dataset via the "plug-in" implementation.
with sn.Analysis(filter_level="all"):
    raw_data = np.array([1, 3, 7, 3, 2, 3, 1, 7, 7])
    dataset = sn.Dataset(value=raw_data)
    prepared = sn.resize(sn.to_float(dataset),
                         number_columns=1,
                         lower=0.,
                         upper=5.)
    release = sn.dp_mean(prepared,
                         implementation="plug-in",
                         privacy_usage={
                             "epsilon": 0.1
                         },
                         data_lower=0.,
                         data_upper=5.)
    print(release.value)
def generate_bools():
    """Build a private 4x2 boolean dataset covering all True/False pairs."""
    rows = [[True, True], [True, False], [False, True], [False, False]]
    literal = sn.literal(value=rows, value_public=False)
    as_bool = sn.to_bool(literal, true_label=True)
    return sn.resize(as_bool, number_columns=2, categories=[True, False])
def generate_synthetic(var_type,
                       n=10,
                       rand_min=0,
                       rand_max=10,
                       cats_str=None,
                       cats_num=None,
                       variants=None):
    """Generate a smartnoise dataframe of synthetic columns of *var_type*.

    One column per entry in *variants* ('Index', 'Random', 'Constant',
    'Categories'), each with *n* rows; column names are prefixed by the type
    letter (B_/F_/I_/S_). Returns sn.to_dataframe over the cast and resized
    literal dataset.

    Fix: in the str 'Random' generator, the inner comprehension looped
    `for n in range(2)`, shadowing the row-count parameter `n`; it now uses a
    throwaway name.
    """
    cats_str = ['A', 'B', 'C', 'D'] if cats_str is None else cats_str
    cats_num = [0, 1, 2, 3] if cats_num is None else cats_num
    variants = (['Index', 'Random', 'Constant', 'Categories']
                if variants is None else variants)

    data = []
    names = []
    for variant in variants:
        if var_type == bool:
            data.append(
                list({
                    'Index': (bool(i % 2) for i in range(n)),
                    'Random': (random.choice([True, False]) for _ in range(n)),
                    'Constant': (bool(1) for _ in range(n)),
                    'Categories': (bool(random.choice(cats_num))
                                   for _ in range(n))
                }[variant]))
            names.append('B_' + variant)
        if var_type == float:
            data.append(
                list({
                    'Index': (float(i) for i in range(n)),
                    'Random': (rand_min + random.random() *
                               (rand_max - rand_min) for _ in range(n)),
                    'Constant': (float(1) for _ in range(n)),
                    'Categories': (float(random.choice(cats_num))
                                   for _ in range(n)),
                }[variant]))
            names.append('F_' + variant)
        if var_type == int:
            data.append(
                list({
                    'Index': range(n),
                    'Random': (random.randrange(rand_min, rand_max)
                               for _ in range(n)),
                    'Constant': (1 for _ in range(n)),
                    'Categories': (random.choice(cats_num) for _ in range(n)),
                }[variant]))
            names.append('I_' + variant)
        if var_type == str:
            data.append(
                list({
                    'Index': (str(i) for i in range(n)),
                    # two random alphanumeric chars per row; the inner loop
                    # variable no longer shadows the parameter `n`
                    'Random': (''.join(
                        random.choice(string.ascii_letters + string.digits)
                        for _ in range(2)) for _ in range(n)),
                    'Constant': (str(1) for _ in range(n)),
                    'Categories': (random.choice(cats_str) for _ in range(n)),
                }[variant]))
            names.append('S_' + variant)

    # transpose column-major generators into row tuples
    data = list(zip(*data))

    dataset = sn.literal(value=data, value_public=False)
    typed = sn.cast(dataset,
                    atomic_type={
                        bool: 'bool',
                        float: 'float',
                        int: 'int',
                        str: 'str'
                    }[var_type],
                    true_label=True,
                    lower=0,
                    upper=10)
    resized = sn.resize(typed,
                        number_columns=len(variants),
                        lower=0.,
                        upper=10.)
    return sn.to_dataframe(resized, names=names)