Exemplo n.º 1
0
def test_dp_covariance():
    """Release DP covariance of ``age`` and ``income`` in three forms.

    A single analysis is built over the dataset at ``TEST_PUMS_PATH`` that
    contains:

    * a scalar covariance, with ``age`` and ``income`` passed separately as
      ``left`` and ``right``;
    * a full covariance matrix, with both columns passed together as ``data``;
    * a cross-covariance matrix, with the same two-column data passed as both
      ``left`` and ``right``.

    The analysis is released once and each value is printed.
    """
    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # get scalar covariance
        age_income_cov_scalar = sn.dp_covariance(
            left=sn.to_float(wn_data['age']),
            right=sn.to_float(wn_data['income']),
            privacy_usage={'epsilon': 5000},
            left_lower=0.,
            left_upper=100.,
            left_rows=1000,
            right_lower=0.,
            right_upper=500_000.,
            right_rows=1000)

        data = sn.to_float(wn_data['age', 'income'])

        # get full covariance matrix
        age_income_cov_matrix = sn.dp_covariance(
            data=data,
            privacy_usage={'epsilon': 5000},
            data_lower=[0., 0.],
            data_upper=[100., 500_000.],
            data_rows=1000)

        # get cross-covariance matrix
        cross_covar = sn.dp_covariance(
            left=data,
            right=data,
            privacy_usage={'epsilon': 5000},
            left_lower=[0., 0.],
            left_upper=[100., 500_000.],
            left_rows=1000,
            right_lower=[0., 0.],
            right_upper=[100., 500_000.],
            right_rows=1000)

    # the values are only read after this release call
    analysis.release()
    print('scalar covariance:\n{0}\n'.format(age_income_cov_scalar.value))
    print('covariance matrix:\n{0}\n'.format(age_income_cov_matrix.value))
    print('cross-covariance matrix:\n{0}'.format(cross_covar.value))
def dp_all(numeric, categorical, args):
    """Build one component per DP statistic and return them keyed by name.

    ``categorical`` is used only for the histogram; every other statistic is
    built from ``numeric``. ``args`` is a mapping that is unpacked as keyword
    arguments into each ``sn.dp_*`` call.
    """
    components = {}
    # Entries are created one at a time, in the order the keys appear below.
    components["covariance"] = sn.dp_covariance(
        left=numeric, right=numeric, **args)
    components["histogram"] = sn.dp_histogram(categorical, **args)
    components["maximum"] = sn.dp_maximum(numeric, **args)
    components["mean"] = sn.dp_mean(numeric, **args)
    components["median"] = sn.dp_median(numeric, **args)
    components["minimum"] = sn.dp_minimum(numeric, **args)
    components["quantile"] = sn.dp_quantile(numeric, .75, **args)
    components["raw_moment"] = sn.dp_raw_moment(numeric, 2, **args)
    components["sum"] = sn.dp_sum(numeric, **args)
    components["variance"] = sn.dp_variance(numeric, **args)
    return components
Exemplo n.º 3
0
def test_covariance():
    """Compare a DP covariance release with the non-private covariance.

    Releases a DP covariance matrix over five columns of the dataset at
    ``TEST_PUMS_PATH``, computes the non-DP covariance of the same columns
    with numpy, converts both to correlation matrices and prints them.
    Unless ``IS_CI_BUILD`` is truthy, the difference between the two
    correlation matrices is also shown as an image.
    """
    import numpy as np
    import pandas as pd

    def as_correlation(cov_matrix):
        """Divide a covariance matrix by the outer product of sqrt(diag)."""
        std_dev = np.sqrt(np.diag(cov_matrix))
        return cov_matrix / np.outer(std_dev, std_dev)

    # one column order shared by the DP and the non-DP computation, so the
    # two matrices line up entry for entry
    columns = ('age', 'sex', 'educ', 'income', 'married')

    data = np.genfromtxt(TEST_PUMS_PATH, delimiter=',', names=True)

    with sn.Analysis() as analysis:
        wn_data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        # get full covariance matrix
        cov = sn.dp_covariance(data=sn.to_float(wn_data[columns]),
                               privacy_usage={'epsilon': 10},
                               data_lower=[0., 0., 1., 0., 0.],
                               data_upper=[100., 1., 16., 500_000., 1.],
                               data_rows=1000)
    analysis.release()

    # store DP covariance and correlation matrix
    dp_cov = cov.value
    print(dp_cov)
    dp_corr = as_correlation(dp_cov)

    # get non-DP covariance/correlation matrices
    non_dp_cov = np.cov([data[name] for name in columns])
    non_dp_corr = as_correlation(non_dp_cov)

    print('Non-DP Covariance Matrix:\n{0}\n\n'.format(
        pd.DataFrame(non_dp_cov)))
    print('Non-DP Correlation Matrix:\n{0}\n\n'.format(
        pd.DataFrame(non_dp_corr)))
    print('DP Correlation Matrix:\n{0}'.format(pd.DataFrame(dp_corr)))

    # skip plot step
    if IS_CI_BUILD:
        return

    # imported only here so that CI builds, which return above, never need
    # matplotlib
    import matplotlib.pyplot as plt

    plt.imshow(non_dp_corr - dp_corr, interpolation='nearest')
    plt.colorbar()
    plt.show()
Exemplo n.º 4
0
def test_dp_linear_stats(run=True):
    """Walk through a sequence of DP releases on the ``TEST_PUMS_PATH`` data.

    Inside one analysis the function builds, releases and prints in stages:
    a record count, a covariance and mean over ``age``/``income``, a variance
    of ``age``, statistics over a clamped/imputed/resized ``age``, hand-built
    Laplace releases, and three histograms.

    Args:
        run: when truthy, release the analysis once more after the ``with``
            block and print the values of the preprocessed-age mean and
            variance.

    Returns:
        The analysis object that all components were built in.
    """
    with sn.Analysis() as analysis:
        dataset_pums = sn.Dataset(path=TEST_PUMS_PATH,
                                  column_names=TEST_PUMS_NAMES)

        age = dataset_pums['age']
        analysis.release()

        num_records = sn.dp_count(age,
                                  privacy_usage={'epsilon': .5},
                                  lower=0,
                                  upper=10000)
        analysis.release()

        print("number of records:", num_records.value)

        # named so that the builtin `vars` is not shadowed
        age_income = sn.to_float(dataset_pums[["age", "income"]])

        # the released count is reused as the row count of later statistics
        covariance = sn.dp_covariance(data=age_income,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=[0., 0.],
                                      data_upper=[150., 150000.],
                                      data_rows=num_records)
        # NOTE(review): this message is printed before the release call below
        print("covariance released")

        num_means = sn.dp_mean(data=age_income,
                               privacy_usage={'epsilon': .5},
                               data_lower=[0., 0.],
                               data_upper=[150., 150000.],
                               data_rows=num_records)

        analysis.release()
        print("covariance:\n", covariance.value)
        print("means:\n", num_means.value)

        age = sn.to_float(age)

        age_variance = sn.dp_variance(age,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=0.,
                                      data_upper=150.,
                                      data_rows=num_records)

        analysis.release()

        print("age variance:", age_variance.value)

        # If I clamp, impute, resize, then I can reuse their properties for multiple statistics
        clamped_age = sn.clamp(age, lower=0., upper=100.)
        imputed_age = sn.impute(clamped_age)
        preprocessed_age = sn.resize(imputed_age, number_rows=num_records)

        # properties necessary for mean are statically known
        mean = sn.dp_mean(preprocessed_age, privacy_usage={'epsilon': .5})

        # properties necessary for variance are statically known
        variance = sn.dp_variance(preprocessed_age,
                                  privacy_usage={'epsilon': .5})

        # sum doesn't need n, so I pass the data in before resizing
        age_sum = sn.dp_sum(imputed_age, privacy_usage={'epsilon': .5})

        # mean with lower, upper properties propagated up from prior bounds
        transformed_mean = sn.dp_mean(-(preprocessed_age + 2.),
                                      privacy_usage={'epsilon': .5})

        analysis.release()
        print("age transformed mean:", transformed_mean.value)

        # releases may be pieced together from combinations of smaller components
        custom_mean = sn.laplace_mechanism(sn.mean(preprocessed_age),
                                           privacy_usage={'epsilon': .5})

        # built once only; an identical second construction of this release
        # was a copy-paste duplicate
        custom_maximum = sn.laplace_mechanism(sn.maximum(preprocessed_age),
                                              privacy_usage={'epsilon': .5})

        custom_quantile = sn.laplace_mechanism(sn.quantile(preprocessed_age,
                                                           alpha=.5),
                                               privacy_usage={'epsilon': 500})

        income = sn.to_float(dataset_pums['income'])
        income_max = sn.laplace_mechanism(sn.maximum(income,
                                                     data_lower=0.,
                                                     data_upper=1000000.),
                                          privacy_usage={'epsilon': 10})

        # releases may also be postprocessed and reused as arguments to more components
        # NOTE(review): the result is discarded; presumably building the
        # expression is enough to register it with the analysis -- confirm
        age_sum + custom_maximum * 23.

        analysis.release()
        print("laplace quantile:", custom_quantile.value)

        # NOTE(review): edges evaluate to [0, 25, 50, 75]; confirm that
        # leaving out an edge at 100 is intended
        age_histogram = sn.dp_histogram(sn.to_int(age, lower=0, upper=100),
                                        edges=list(range(0, 100, 25)),
                                        null_value=150,
                                        privacy_usage={'epsilon': 2.})

        sex_histogram = sn.dp_histogram(sn.to_bool(dataset_pums['sex'],
                                                   true_label="1"),
                                        privacy_usage={'epsilon': 2.})

        education_histogram = sn.dp_histogram(dataset_pums['educ'],
                                              categories=["5", "7", "10"],
                                              null_value="-1",
                                              privacy_usage={'epsilon': 2.})

        analysis.release()

        print("age histogram: ", age_histogram.value)
        print("sex histogram: ", sex_histogram.value)
        print("education histogram: ", education_histogram.value)

    if run:
        analysis.release()

        # get the mean computed when release() was called
        print(mean.value)
        print(variance.value)

    return analysis