예제 #1
0
def test_fail_groupby():
    """Partition by sex, then union per-partition means where only one branch
    declares a privacy budget (expected to be rejected downstream)."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        sex_mask = sn.to_bool(data['sex'], true_label="1")
        as_float = sn.to_float(data[['educ', 'income']])
        clamped = sn.clamp(as_float, lower=[0., 0.], upper=[15., 200_000.])
        educ_inc = sn.impute(clamped)

        partitioned = sn.partition(educ_inc, by=sex_mask)

        # shared resize/bounds arguments for both branch means
        resize_kwargs = {
            "data_lower": [0., 0.],
            "data_upper": [15., 200_000.],
            "data_rows": 500
        }

        # only the True branch is given a privacy budget here
        union = sn.union({
            True: sn.mean(partitioned[True],
                          privacy_usage={"epsilon": 0.1},
                          **resize_kwargs),
            False: sn.mean(partitioned[False], **resize_kwargs),
        })

        sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

        print(analysis.privacy_usage)
예제 #2
0
def test_groupby_3():
    # now union the released output
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        clamped = sn.clamp(sn.to_float(data[['educ', 'income']]),
                           lower=[0., 0.],
                           upper=[15., 200_000.])
        educ_inc = sn.impute(clamped)

        partitioned = sn.partition(educ_inc, by=is_male)

        # release a DP mean per partition, then union the released values
        means = {
            cat: sn.dp_mean(sn.resize(partitioned[cat], number_rows=500),
                            privacy_usage={"epsilon": 1.0})
            for cat in is_male.categories
        }

        union = sn.union(means)

    analysis.release()
    print(analysis.privacy_usage)
    print(union.value)
예제 #3
0
def test_groupby_c_stab():
    # use the same partition multiple times in union
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        clamped = sn.clamp(sn.to_float(data[['educ', 'income']]),
                           lower=[0., 0.],
                           upper=[15., 200_000.])
        educ_inc = sn.impute(clamped)

        partitioned = sn.partition(educ_inc, by=is_male)

        def summarize(part):
            # resize to a fixed row count, then take a (non-private) mean
            return sn.mean(sn.resize(part, number_rows=500))

        # partitioned[True] is referenced twice, which inflates c-stability
        means = {
            True: summarize(partitioned[True]),
            False: summarize(partitioned[False]),
            "duplicate_that_inflates_c_stab": summarize(partitioned[True]),
        }

        union = sn.union(means)
        noised = sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

    analysis.release()
    print(analysis.privacy_usage)
    print(noised.value)
def analytic_gaussian_similarity():
    """Average 100 DP means under AnalyticGaussian vs plain Gaussian and print both."""
    analytic_vals = []
    plain_vals = []
    with sn.Analysis(strict_parameter_checks=False):
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        age = sn.impute(sn.to_float(PUMS['age']),
                        data_lower=0.,
                        data_upper=100.,
                        data_rows=1000)

        for _ in range(100):
            an_gauss_component = sn.dp_mean(
                age,
                mechanism="AnalyticGaussian",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})
            gauss_component = sn.dp_mean(
                age,
                mechanism="Gaussian",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})

            # reading .value triggers an analysis release
            # (which also computes gauss_component)
            analytic_vals.append(an_gauss_component.value)
            plain_vals.append(gauss_component.value)

    print(sum(analytic_vals) / len(analytic_vals))
    print(sum(plain_vals) / len(plain_vals))
def snapping_similarity():
    """Average 100 DP means under the snapping vs laplace mechanisms and print both."""
    snapping_vals = []
    laplace_vals = []
    with sn.Analysis(strict_parameter_checks=False):
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        age = sn.impute(sn.to_float(PUMS['age']),
                        data_lower=0.,
                        data_upper=100.,
                        data_rows=1000)

        for _ in range(100):
            snapping_component = sn.dp_mean(
                age,
                mechanism="snapping",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})
            laplace_component = sn.dp_mean(
                age,
                mechanism="laplace",
                privacy_usage={"epsilon": 1.0, "delta": 1E-6})

            # reading .value triggers an analysis release
            snapping_vals.append(snapping_component.value)
            laplace_vals.append(laplace_component.value)

    print(sum(snapping_vals) / len(snapping_vals))
    print(sum(laplace_vals) / len(laplace_vals))
예제 #6
0
def test_histogram():
    """Release a DP histogram of income (geometric mechanism) and print it
    next to the exact numpy histogram for comparison."""
    import numpy as np

    # establish data information

    data = np.genfromtxt(TEST_PUMS_PATH, delimiter=',', names=True)
    # NOTE(review): education_categories is unused in this test
    education_categories = [
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
        "14", "15", "16", "17"
    ]

    income = list(data[:]['income'])
    income_edges = list(range(0, 100_000, 10_000))

    print('actual', np.histogram(income, bins=income_edges)[0])

    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        # fix: upper was 0, which clamped every income into the lowest bin;
        # the bound must cover the top histogram edge so the DP histogram is
        # comparable to the 'actual' one printed above
        income = sn.to_int(data['income'], lower=0, upper=100_000)
        sex = sn.to_bool(data['sex'], true_label="1")  # NOTE(review): unused

        income_histogram = sn.dp_histogram(income,
                                           edges=income_edges,
                                           privacy_usage={'epsilon': 1.})

    analysis.release()

    print("Income histogram Geometric DP release:   " +
          str(income_histogram.value))
예제 #7
0
def test_groupby_4():
    # now union private data, and apply mechanism after
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        # non-private mean per partition; noise is added once, after the union
        means = {
            cat: sn.mean(sn.resize(partitioned[cat], number_rows=500))
            for cat in is_male.categories
        }

        union = sn.union(means)
        noised = sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

    analysis.release()
    print(analysis.privacy_usage)
    print(noised.value)
예제 #8
0
def test_properties():
    """Probe component properties (nullity, bounds, releasability, type) as a
    column flows through cast, impute, clamp, and a nullable division."""
    with sn.Analysis():
        # load data
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # establish data
        age_dt = sn.cast(data['age'], 'FLOAT')

        # ensure data are non-null
        non_null_age_dt = sn.impute(age_dt,
                                    distribution='Uniform',
                                    lower=0.,
                                    upper=100.)
        clamped = sn.clamp(age_dt, lower=0., upper=100.)

        # create potential for null data again
        potentially_null_age_dt = non_null_age_dt / 0.

        # print('original properties:\n{0}\n\n'.format(age_dt.properties))
        print('properties after imputation:\n{0}\n\n'.format(
            non_null_age_dt.nullity))
        print('properties after nan mult:\n{0}\n\n'.format(
            potentially_null_age_dt.nullity))

        # clamp established numeric bounds above; categories is presumably
        # unset for a numeric clamp — verify against the printed output
        print("lower", clamped.lower)
        print("upper", clamped.upper)
        print("releasable", clamped.releasable)
        # print("props", clamped.properties)
        print("data_type", clamped.data_type)
        print("categories", clamped.categories)
예제 #9
0
def test_dataframe_partitioning_2():
    # dataframe partition with multi-index grouping
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # group on the (sex, educ) pair; values outside the declared
        # categories are mapped to the null_value '-1'
        grouper = sn.clamp(data[['sex', 'educ']],
                           categories=[['0', '1'],
                                       [str(i) for i in range(14)]],
                           null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        # per-group DP count; flatten=False keeps the nested group structure
        sn.union(
            {
                key: sn.dp_count(partitioned[key],
                                 privacy_usage={"epsilon": 0.5})
                for key in partitioned.partition_keys
            },
            flatten=False)

        # per-group plug-in DP mean of income, printed as a flat union
        print(
            sn.union({
                key: sn.dp_mean(
                    sn.to_float(partitioned[key]['income']),
                    implementation="plug-in",
                    # data_rows=100,
                    data_lower=0.,
                    data_upper=200_000.,
                    privacy_usage={"epsilon": 0.5})
                for key in partitioned.partition_keys
            }))
예제 #10
0
def test_multilayer_partition_1():
    # multilayer partition with mechanisms applied inside partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        # outer layer: partition by sex
        partitioned = sn.partition(educ_inc, by=is_male)

        def analyze(data):
            # inner layer: re-partition income by clamped education level
            educ = sn.clamp(sn.to_int(sn.index(data, indices=0),
                                      lower=0,
                                      upper=15),
                            categories=list(range(15)),
                            null_value=-1)
            income = sn.index(data, indices=1)
            repartitioned = sn.partition(income, by=educ)

            inner_count = {}
            inner_means = {}
            # only these education levels are analyzed
            for key in [5, 8, 12]:
                educ_level_part = repartitioned[key]

                # the DP count feeds the row-count estimate for the DP mean,
                # floored at 1 via sn.row_max
                inner_count[key] = sn.dp_count(educ_level_part,
                                               privacy_usage={"epsilon": 0.4})
                inner_means[key] = sn.dp_mean(educ_level_part,
                                              privacy_usage={"epsilon": 0.6},
                                              data_rows=sn.row_max(
                                                  1, inner_count[key]))

            return sn.union(inner_means,
                            flatten=False), sn.union(inner_count,
                                                     flatten=False)

        means = {}
        counts = {}
        for key in partitioned.partition_keys:
            part_means, part_counts = analyze(partitioned[key])
            means[key] = part_means
            counts[key] = part_counts

        # flatten=False keeps results keyed by sex, then by education level
        means = sn.union(means, flatten=False)
        counts = sn.union(counts, flatten=False)

        # analysis.plot()
    print("releasing")
    print(len(analysis.components.items()))
    analysis.release()
    print(analysis.privacy_usage)
    print("Counts:")
    print(counts.value)

    print("Means:")
    print(means.value)
예제 #11
0
def test_index():
    """Index the first column of the boolean dataset and check its released values."""
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()

        first_column = sn.index(data, indices=0)

        analysis.release()
        expected = [True, True, False, False]
        assert all(got == want for got, want in zip(first_column.value, expected))
예제 #12
0
def test_equal():
    """Element-wise equality of the first two columns of the boolean dataset."""
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()

        left = sn.index(data, indices=0)
        right = sn.index(data, indices=1)
        equality = left == right

        analysis.release()
        expected = np.array([True, False, False, True])
        assert np.array_equal(equality.value, expected)
예제 #13
0
def test_partition():
    """Split the boolean dataset into 3 partitions and check each one's contents."""
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()

        partitioned = sn.partition(data, num_partitions=3)
        analysis.release()

        expected = [
            np.array([[True, True], [True, False]]),
            np.array([[False, True]]),
            np.array([[False, False]]),
        ]
        for idx, want in enumerate(expected):
            assert np.array_equal(partitioned.value[idx], want)
예제 #14
0
def test_map_1():
    # map a count over all dataframe partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        sex_mask = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data, by=sex_mask)

        # dp_count maps automatically over each partition
        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})

        print(counts.value)
        print(analysis.privacy_usage)
예제 #15
0
def test_dp_mean():
    """DP mean on synthetic data (exercising the accuracy helpers) and a
    plug-in DP mean on PUMS income."""
    with sn.Analysis():
        data = generate_synthetic(float, variants=['Random'])
        mean = sn.dp_mean(data['F_Random'],
                          privacy_usage={'epsilon': 0.1},
                          data_lower=0.,
                          data_upper=10.,
                          data_rows=10)

        # accuracy <-> privacy-usage conversions at alpha = 0.05
        print("accuracy", mean.get_accuracy(0.05))
        print(mean.from_accuracy(2.3, .05))

    with sn.Analysis():
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        # plug-in implementation does not require a data_rows estimate
        print(
            sn.dp_mean(sn.to_float(data['income']),
                       implementation="plug-in",
                       data_lower=0.,
                       data_upper=200_000.,
                       privacy_usage={
                           "epsilon": 0.5
                       }).value)
예제 #16
0
def test_private_clamped_sum_helpers():
    """Check that smartnoise's DP-sum accuracy matches the eeprivacy
    confidence-interval helper to within 0.001."""
    # Compute the CI with smartnoise
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_DATA_PATH, column_names=TEST_DATA_COLUMNS)
        D = sn.to_float(data["age"])
        D_tilde = sn.resize(sn.clamp(data=D, lower=0.0, upper=100.0), number_rows=1000,)
        release = sn.dp_sum(data=sn.impute(D_tilde), privacy_usage={"epsilon": 1.0})
    smartnoise_ci = release.get_accuracy(0.05)

    # Compute the CI analytically with eeprivacy (alpha=0.05 <-> confidence=0.95)
    op = PrivateClampedSum(lower_bound=0, upper_bound=100)
    eeprivacy_ci = op.confidence_interval(epsilon=1, confidence=0.95)

    assert pytest.approx(smartnoise_ci, abs=0.001) == eeprivacy_ci
예제 #17
0
def test_reports():
    """Release a DP mean of age and print the value plus accumulated privacy usage."""
    with sn.Analysis() as analysis:
        # load data
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # DP mean of age with fixed bounds and row count
        age = sn.to_float(data['age'])
        age_mean = sn.dp_mean(data=age,
                              privacy_usage={'epsilon': .65},
                              data_lower=0.,
                              data_upper=100.,
                              data_rows=1000)

        print("Pre-Release\n")
        print("DP mean of age: {0}".format(age_mean.value))
        print("Privacy usage: {0}\n\n".format(analysis.privacy_usage))
예제 #18
0
def test_dp_count(run=True):
    """DP count of rows where sex == '1'; optionally release and print it."""
    with sn.Analysis() as analysis:
        dataset_pums = sn.Dataset(path=TEST_PUMS_PATH,
                                  column_names=TEST_PUMS_NAMES)

        predicate = dataset_pums['sex'] == '1'
        count = sn.dp_count(predicate, privacy_usage={'epsilon': 0.5})

    if run:
        analysis.release()
        print(count.value)

    return analysis
예제 #19
0
def try_sn():
    """Simulate repeated DP mean releases of age and report how often the
    accuracy interval covers the mean of the clamped data."""
    # establish data information
    #data_path = 'https://raw.githubusercontent.com/opendp/smartnoise-samples/86-requirements-fix/analysis/data/PUMS_california_demographics_1000/data.csv'
    data_path = os.path.join('.', 'data', 'PUMS_california_demographics_1000',
                             'data.csv')
    data_path = os.path.abspath(data_path)
    print('data_path', data_path)
    var_names = ["age", "sex", "educ", "race", "income", "married", "pid"]
    D = pd.read_csv(data_path)['age']
    D_mean_age = np.mean(D)
    print('D_mean_age', D_mean_age)

    # establish extra information for this simulation
    age_lower_bound = 0.
    age_upper_bound = 100.
    D_tilde = np.clip(D, age_lower_bound, age_upper_bound)
    D_tilde_mean_age = np.mean(D_tilde)
    data_size = 1000

    df = pd.read_csv(data_path)
    # NOTE(review): itertuples() includes the dataframe index as the first
    # element of each row — confirm that is intended before using df_as_array
    df_as_array = [list(row) for row in df.itertuples()]
    #df.values.tolist()
    print('D.values', df_as_array)

    n_sims = 2
    releases = []
    with sn.Analysis(dynamic=True) as analysis:
        data = sn.Dataset(path=data_path, column_names=var_names)
        #data = sn.Dataset(value=df_as_array, column_names=var_names)
        D = sn.to_float(data['age'])
        # preprocess data (resize is a no-op because we have the correct data size)
        D_tilde = sn.resize(sn.clamp(data=D, lower=0., upper=100.),
                            number_rows=data_size)

        for index in range(n_sims):
            # get DP mean of age
            releases.append(
                sn.dp_mean(data=sn.impute(D_tilde),
                           privacy_usage={'epsilon': 1}))

    # accuracy at alpha=0.05; same for every release (identical epsilon/bounds)
    accuracy = releases[0].get_accuracy(0.05)

    analysis.release()
    dp_values = [release.value for release in releases]
    print(
        'Accuracy interval (with accuracy value {0}) contains the true mean on D_tilde with probability {1}'
        .format(
            round(accuracy, 4),
            np.mean([(D_tilde_mean_age >= val - accuracy) &
                     (D_tilde_mean_age <= val + accuracy)
                     for val in dp_values])))
예제 #20
0
def test_raw_dataset(run=True):
    """Build a tiny literal dataset and attach a DP mean; optionally release."""
    with sn.Analysis() as analysis:
        raw = sn.Dataset(value=[1., 2., 3., 4., 5.])
        data = sn.to_float(raw)

        sn.dp_mean(data=data,
                   privacy_usage={'epsilon': 1},
                   data_lower=0.,
                   data_upper=10.,
                   data_rows=10,
                   data_columns=1)

    if run:
        analysis.release()

    return analysis
예제 #21
0
def test_map_2():
    # map a count over a large number of tuple partitions of dataframes
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # group on the (sex, educ) pair; out-of-category values become '-1'
        sex_categories = ['0', '1']
        educ_categories = [str(level) for level in range(14)]
        grouper = sn.clamp(data[['sex', 'educ']],
                           categories=[sex_categories, educ_categories],
                           null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        # dp_count maps automatically over each partition
        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})

        print(counts.value)
        print(analysis.privacy_usage)
예제 #22
0
def test_divide():
    """Nullity propagation through division: dividing by a value whose lower
    bound admits zero makes the result nullable again."""
    with sn.Analysis():
        data_A = generate_synthetic(float, variants=['Random'])

        f_random = data_A['F_Random']
        imputed = sn.impute(f_random, lower=0., upper=10.)
        # NOTE(review): clamped_nonzero is never asserted on below — possibly
        # a leftover, or an intended contrast case that lost its assertion
        clamped_nonzero = sn.clamp(imputed, lower=1., upper=10.)
        clamped_zero = sn.clamp(imputed, lower=0., upper=10.)

        # test properties
        assert f_random.nullity
        assert not imputed.nullity
        assert (2. / imputed).nullity
        assert (f_random / imputed).nullity
        # lower bound of 0 admits division by zero, so nullity returns
        assert (2. / clamped_zero).nullity
예제 #23
0
def test_map_3():
    # chain multiple maps over an array partition with implicit preprocessing
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        partitioned = sn.partition(sn.to_float(data['age']),
                                   by=sn.to_bool(data['sex'], true_label="1"))

        # dp_mean maps over each partition; the bounds/rows below drive the
        # implicit clamp/impute/resize preprocessing.
        # NOTE(review): data_upper=15. looks low for an age column — confirm
        # it was not copied from an 'educ' example
        means = sn.dp_mean(partitioned,
                           privacy_usage={'epsilon': 0.1},
                           data_rows=500,
                           data_lower=0.,
                           data_upper=15.)

        print(means.value)
        print(analysis.privacy_usage)
예제 #24
0
def test_covariance():
    """Release a DP covariance matrix over five PUMS columns and compare the
    derived correlation matrix against the non-private one."""
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    # non-private copy of the data for the baseline comparison
    data = np.genfromtxt(TEST_PUMS_PATH, delimiter=',', names=True)

    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        # get full covariance matrix
        cov = sn.dp_covariance(data=sn.to_float(wn_data['age', 'sex', 'educ',
                                                        'income', 'married']),
                               privacy_usage={'epsilon': 10},
                               data_lower=[0., 0., 1., 0., 0.],
                               data_upper=[100., 1., 16., 500_000., 1.],
                               data_rows=1000)
    analysis.release()

    # store DP covariance and correlation matrix
    dp_cov = cov.value
    print(dp_cov)
    # cov / outer(std, std) converts a covariance matrix to correlation
    dp_corr = dp_cov / np.outer(np.sqrt(np.diag(dp_cov)),
                                np.sqrt(np.diag(dp_cov)))

    # get non-DP covariance/correlation matrices
    age = list(data[:]['age'])
    sex = list(data[:]['sex'])
    educ = list(data[:]['educ'])
    income = list(data[:]['income'])
    married = list(data[:]['married'])
    non_dp_cov = np.cov([age, sex, educ, income, married])
    non_dp_corr = non_dp_cov / np.outer(np.sqrt(np.diag(non_dp_cov)),
                                        np.sqrt(np.diag(non_dp_cov)))

    print('Non-DP Covariance Matrix:\n{0}\n\n'.format(
        pd.DataFrame(non_dp_cov)))
    print('Non-DP Correlation Matrix:\n{0}\n\n'.format(
        pd.DataFrame(non_dp_corr)))
    print('DP Correlation Matrix:\n{0}'.format(pd.DataFrame(dp_corr)))

    # skip plot step
    if IS_CI_BUILD:
        return

    plt.imshow(non_dp_corr - dp_corr, interpolation='nearest')
    plt.colorbar()
    plt.show()
예제 #25
0
def test_median_education():
    """Score median candidates over the education values, then select one via
    the exponential mechanism."""
    # import pandas as pd
    # print(pd.read_csv(data_path)['value'].median())
    with sn.Analysis(filter_level="all") as analysis:
        data = sn.Dataset(path=TEST_EDUC_PATH, column_names=TEST_EDUC_NAMES)
        candidates = list(map(float, range(1, 200, 2)))
        # the positional 100., 200. are the imputation bounds for 'value'
        median_scores = sn.median(sn.impute(sn.to_float(data['value']), 100.,
                                            200.),
                                  candidates=candidates)

        # print(list(zip(candidates, median_scores.value[0])))

        dp_median = sn.exponential_mechanism(median_scores,
                                             candidates=candidates,
                                             privacy_usage={"epsilon": 100.})
        print(dp_median.value)
    analysis.release()
예제 #26
0
def test_groupby_1():
    """Partition educ/income by sex and release one DP count per partition."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data[['educ', 'income']], by=is_male)

        # one DP count per sex partition
        counts = {}
        for cat in is_male.categories:
            counts[cat] = sn.dp_count(partitioned[cat],
                                      privacy_usage={'epsilon': 0.1})

    analysis.release()
    print(analysis.privacy_usage)
    print({cat: counts[cat].value for cat in counts})
예제 #27
0
def test_dp_median_raw():
    """DP median over a literal (private-tagged) vector with a fixed candidate set."""
    with sn.Analysis() as analysis:
        # create a literal data vector, and tag it as private
        data = sn.Component.of([float(i) for i in range(20)], public=False)

        # candidates may lie outside the [data_lower, data_upper] range;
        # reading .value triggers the release
        dp_median = sn.dp_median(
            sn.to_float(data),
            privacy_usage={
                "epsilon": 1.
            },
            candidates=[-10., -2., 2., 3., 4., 7., 10., 12.],
            data_lower=0.,
            data_upper=10.,
            data_columns=1).value
        print(dp_median)

        # analysis.plot()
        assert dp_median is not None
예제 #28
0
def test_dp_covariance():
    """Exercise the three dp_covariance call forms: scalar covariance of two
    columns, full covariance matrix, and cross-covariance matrix."""
    # fix: removed unused local var_names (column names come from TEST_PUMS_NAMES)
    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # scalar covariance between two individually-bounded float columns
        age_income_cov_scalar = sn.dp_covariance(
            left=sn.to_float(wn_data['age']),
            right=sn.to_float(wn_data['income']),
            privacy_usage={'epsilon': 5000},
            left_lower=0.,
            left_upper=100.,
            left_rows=1000,
            right_lower=0.,
            right_upper=500_000.,
            right_rows=1000)

        data = sn.to_float(wn_data['age', 'income'])
        # get full covariance matrix
        age_income_cov_matrix = sn.dp_covariance(
            data=data,
            privacy_usage={'epsilon': 5000},
            data_lower=[0., 0.],
            data_upper=[100., 500_000.],
            data_rows=1000)

        # get cross-covariance matrix (here left == right, so it should
        # approximate the full covariance matrix above)
        cross_covar = sn.dp_covariance(left=data,
                                       right=data,
                                       privacy_usage={'epsilon': 5000},
                                       left_lower=[0., 0.],
                                       left_upper=[100., 500_000.],
                                       left_rows=1000,
                                       right_lower=[0., 0.],
                                       right_upper=[100., 500_000.],
                                       right_rows=1000)

    analysis.release()
    print('scalar covariance:\n{0}\n'.format(age_income_cov_scalar.value))
    print('covariance matrix:\n{0}\n'.format(age_income_cov_matrix.value))
    print('cross-covariance matrix:\n{0}'.format(cross_covar.value))
예제 #29
0
def test_dp_linear_regression():
    """DP simple linear regression of income (y) on age (x)."""
    with sn.Analysis():
        raw = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        wn_data = sn.resize(sn.to_float(raw[["age", "income"]]),
                            number_rows=1000,
                            lower=[0., 0.],
                            upper=[100., 500_000.])

        x = sn.index(wn_data, indices=0)
        y = sn.index(wn_data, indices=1)
        dp_linear_regression = sn.dp_linear_regression(
            data_x=x,
            data_y=y,
            privacy_usage={'epsilon': 10.},
            lower_slope=0.,
            upper_slope=1000.,
            lower_intercept=0.,
            upper_intercept=1000.)

        print(dp_linear_regression.value)
예제 #30
0
def test_dataframe_partitioning_1():
    """Partition the raw dataframe by sex and release a plug-in DP mean of
    income per partition."""
    # dataframe partition
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data, by=is_male)

        # positional 0., 200_000. are the clamp bounds for income
        print(
            sn.union({
                key: sn.dp_mean(sn.impute(
                    sn.clamp(sn.to_float(partitioned[key]['income']), 0.,
                             200_000.)),
                                implementation="plug-in",
                                privacy_usage={"epsilon": 0.5})
                for key in partitioned.partition_keys
            }).value)
        print(analysis.privacy_usage)