Example #1
def test_dataframe_partitioning_2():
    # dataframe partition with multi-index grouping
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        grouper = sn.clamp(data[['sex', 'educ']],
                           categories=[['0', '1'],
                                       [str(i) for i in range(14)]],
                           null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        sn.union(
            {
                key: sn.dp_count(partitioned[key],
                                 privacy_usage={"epsilon": 0.5})
                for key in partitioned.partition_keys
            },
            flatten=False)

        print(
            sn.union({
                key: sn.dp_mean(
                    sn.to_float(partitioned[key]['income']),
                    implementation="plug-in",
                    # data_rows=100,
                    data_lower=0.,
                    data_upper=200_000.,
                    privacy_usage={"epsilon": 0.5})
                for key in partitioned.partition_keys
            }))
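
The snippets in this listing assume a shared module-level setup that is not shown. A minimal sketch, assuming the opendp.smartnoise.core bindings; the dataset path and column names are illustrative placeholders rather than the originals:

import numpy as np
import opendp.smartnoise.core as sn

# assumed fixtures: point these at your copy of the PUMS sample data
TEST_PUMS_PATH = "data/PUMS_california_demographics_1000/data.csv"
TEST_PUMS_NAMES = ["age", "sex", "educ", "race", "income", "married"]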
Example #2
def test_groupby_4():
    # union the still-private partition means, then apply the mechanism to the union
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        means = {}
        for cat in is_male.categories:
            part = partitioned[cat]
            part = sn.resize(part, number_rows=500)
            part = sn.mean(part)
            means[cat] = part

        union = sn.union(means)
        noised = sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

    # analysis.plot()
    analysis.release()
    print(analysis.privacy_usage)
    print(noised.value)
Example #3
def test_groupby_3():
    # release a dp mean inside each partition, then union the released outputs
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        means = {}
        for cat in is_male.categories:
            part = partitioned[cat]
            part = sn.resize(part, number_rows=500)
            part = sn.dp_mean(part, privacy_usage={"epsilon": 1.0})
            # print("mean: ", part.properties)
            means[cat] = part

        union = sn.union(means)

    # analysis.plot()
    analysis.release()
    print(analysis.privacy_usage)
    print(union.value)
Example #4
def test_groupby_c_stab():
    # reuse the same partition member multiple times in the union, inflating c-stability
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        def analyze(data):
            return sn.mean(sn.resize(data, number_rows=500))

        means = {
            True: analyze(partitioned[True]),
            False: analyze(partitioned[False]),
            "duplicate_that_inflates_c_stab": analyze(partitioned[True]),
        }

        union = sn.union(means)
        noised = sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

        # analysis.plot()
    analysis.release()
    print(analysis.privacy_usage)
    print(noised.value)
Example #5
def test_fail_groupby():
    # expected to fail: only one branch of the union specifies privacy_usage
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        bounds = {
            "data_lower": [0., 0.],
            "data_upper": [15., 200_000.],
            "data_rows": 500
        }

        union = sn.union({
            True:
            sn.mean(partitioned[True],
                    privacy_usage={"epsilon": 0.1},
                    **bounds),
            False:
            sn.mean(partitioned[False], **bounds),
        })

        sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

        print(analysis.privacy_usage)
Example #6
def test_multilayer_partition_1():
    # multilayer partition with mechanisms applied inside partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        def analyze(data):
            educ = sn.clamp(sn.to_int(sn.index(data, indices=0),
                                      lower=0,
                                      upper=15),
                            categories=list(range(15)),
                            null_value=-1)
            income = sn.index(data, indices=1)
            repartitioned = sn.partition(income, by=educ)

            inner_count = {}
            inner_means = {}
            for key in [5, 8, 12]:
                educ_level_part = repartitioned[key]

                inner_count[key] = sn.dp_count(educ_level_part,
                                               privacy_usage={"epsilon": 0.4})
                inner_means[key] = sn.dp_mean(educ_level_part,
                                              privacy_usage={"epsilon": 0.6},
                                              data_rows=sn.row_max(
                                                  1, inner_count[key]))

            return sn.union(inner_means,
                            flatten=False), sn.union(inner_count,
                                                     flatten=False)

        means = {}
        counts = {}
        for key in partitioned.partition_keys:
            part_means, part_counts = analyze(partitioned[key])
            means[key] = part_means
            counts[key] = part_counts

        means = sn.union(means, flatten=False)
        counts = sn.union(counts, flatten=False)

        # analysis.plot()
    print("releasing")
    print(len(analysis.components.items()))
    analysis.release()
    print(analysis.privacy_usage)
    print("Counts:")
    print(counts.value)

    print("Means:")
    print(means.value)
Example #7
def test_map_1():
    # map a count over all dataframe partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        partitioned = sn.partition(data,
                                   by=sn.to_bool(data['sex'], true_label="1"))

        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})

        print(counts.value)
        print(analysis.privacy_usage)
Example #8
def test_partition():
    # evenly split a small boolean dataset into three partitions by row
    with sn.Analysis(filter_level='all') as analysis:
        data = generate_bools()

        partitioned = sn.partition(data, num_partitions=3)
        analysis.release()
        # print(partitioned.value)

        assert np.array_equal(partitioned.value[0],
                              np.array([[True, True], [True, False]]))
        assert np.array_equal(partitioned.value[1], np.array([[False, True]]))
        assert np.array_equal(partitioned.value[2], np.array([[False, False]]))
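
generate_bools is defined elsewhere in the test module. Judging only from the asserted partition values, a stand-in fixture could look roughly like the sketch below; the Dataset(value=..., num_columns=...) arguments and the to_bool typing step are assumptions, not the original implementation:

def generate_bools():
    # assumed fixture: a 4x2 boolean literal whose rows match the asserts above
    private_data = [[True, True], [True, False], [False, True], [False, False]]
    dataset = sn.Dataset(value=private_data, num_columns=2)
    return sn.to_bool(dataset, true_label=True)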
Example #9
def test_map_2():
    # map a count over a large number of tuple partitions of dataframes
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        grouper = sn.clamp(data[['sex', 'educ']],
                           categories=[['0', '1'],
                                       [str(i) for i in range(14)]],
                           null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})

        print(counts.value)
        print(analysis.privacy_usage)
Example #10
def test_map_3():
    # chain multiple maps over an array partition with implicit preprocessing
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        partitioned = sn.partition(sn.to_float(data['age']),
                                   by=sn.to_bool(data['sex'], true_label="1"))

        means = sn.dp_mean(partitioned,
                           privacy_usage={'epsilon': 0.1},
                           data_rows=500,
                           data_lower=0.,
                           data_upper=15.)

        print(means.value)
        print(analysis.privacy_usage)
Example #11
def test_groupby_1():
    # dp count within each partition of a dataframe grouped by sex
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data[['educ', 'income']], by=is_male)

        counts = {
            cat: sn.dp_count(partitioned[cat], privacy_usage={'epsilon': 0.1})
            for cat in is_male.categories
        }

    # analysis.plot()
    analysis.release()
    print(analysis.privacy_usage)
    print({cat: counts[cat].value for cat in counts})
Example #12
def test_dataframe_partitioning_1():
    # dataframe partition by a boolean grouping column
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(data, by=is_male)

        print(
            sn.union({
                key: sn.dp_mean(sn.impute(
                    sn.clamp(sn.to_float(partitioned[key]['income']), 0.,
                             200_000.)),
                                implementation="plug-in",
                                privacy_usage={"epsilon": 0.5})
                for key in partitioned.partition_keys
            }).value)
        print(analysis.privacy_usage)
Example #13
def test_map_4():
    # chain multiple mapped releases over a partition with implicit preprocessing
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        partitioned = sn.partition(sn.to_float(data['age']),
                                   by=sn.to_bool(data['sex'], true_label="1"))

        counts = sn.row_max(
            1, sn.dp_count(partitioned, privacy_usage={'epsilon': 0.5}))

        means = sn.dp_mean(partitioned,
                           privacy_usage={'epsilon': 0.7},
                           data_rows=counts,
                           data_lower=0.,
                           data_upper=15.)

        print("counts:", counts.value)
        print("means:", means.value)

        print(analysis.privacy_usage)
Example #14
def analyze(data):
    # inner per-partition helper (compare Example #6): re-partition income by
    # clamped education level, then take a dp count and a non-private mean whose
    # resize target is derived from the noisy count
    educ = sn.clamp(sn.to_int(sn.index(data, indices=0), lower=0, upper=15),
                    categories=list(range(15)),
                    null_value=-1)
    income = sn.index(data, indices=1)
    repartitioned = sn.partition(income, by=educ)

    inner_count = {}
    inner_means = {}
    for key in [5, 8, 12]:
        educ_level_part = repartitioned[key]

        inner_count[key] = sn.dp_count(educ_level_part,
                                       privacy_usage={"epsilon": 0.4})
        inner_means[key] = sn.mean(
            sn.resize(educ_level_part,
                      number_rows=sn.row_min(1, inner_count[key] * 4 // 5)))

    return sn.union(inner_means), sn.union(inner_count)
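
The helper above is only the inner, per-education-level piece; its outer driver is not shown. Presumably it mirrors Example #6, roughly as sketched below. Unlike Example #6, the inner sn.mean results are not differentially private on their own, so a mechanism such as sn.laplace_mechanism (see Example #2) would still need to be applied before the unioned means are released:

# assumed outer driver, modeled on Example #6
with sn.Analysis() as analysis:
    data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

    is_male = sn.to_bool(data['sex'], true_label="1")
    educ_inc = sn.impute(
        sn.clamp(sn.to_float(data[['educ', 'income']]),
                 lower=[0., 0.],
                 upper=[15., 200_000.]))
    partitioned = sn.partition(educ_inc, by=is_male)

    means = {}
    counts = {}
    for key in partitioned.partition_keys:
        part_means, part_counts = analyze(partitioned[key])
        means[key] = part_means
        counts[key] = part_counts

    means = sn.union(means, flatten=False)
    counts = sn.union(counts, flatten=False)

analysis.release()
print(analysis.privacy_usage)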
Example #15
def test_groupby_2():
    # apply a different dp statistic to each branch of the partition
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        partitioned = sn.partition(sn.to_float(data[['educ', 'income']]),
                                   by=is_male)

        counts = {
            True:
            sn.dp_count(partitioned[True], privacy_usage={'epsilon': 0.1}),
            False:
            sn.dp_mean(partitioned[False],
                       privacy_usage={'epsilon': 0.1},
                       data_rows=500,
                       data_lower=[0., 0.],
                       data_upper=[15., 200_000.])
        }

    # analysis.plot()
    analysis.release()
    print(analysis.privacy_usage)
    print({cat: counts[cat].value for cat in counts})