Example #1
def jl_dp_accuracy(clf=KNeighborsClassifier(),
                   params=KNN_PARAMS,
                   test_size=0.25,
                   is_sparse=True):
    x, y = datasets.heart_disease()

    x = MinMaxScaler().fit_transform(x)
    split_random_state = int(random.random() * 100)
    gs, x_train, y_train, x_test, y_test = grid_search(
        x,
        y,
        clf,
        params,
        test_size=test_size,
        standardize=False,
        verbose=1,
        random_state=split_random_state)
    clf = gs.best_estimator_
    y_pred = clf.predict(x_test)
    plot.print_metrics(y_test, y_pred)
    baseline = metrics.balanced_accuracy_score(y_test, y_pred)
    print(baseline)
    num_rounds = 100  # private projections averaged per (eps, k) setting
    dims = x.shape[1]
    scores = defaultdict(list)
    delta = 0.1  # delta of the (eps, delta)-differentially private projection

    for eps in EPS_RANGE:
        print('epsilon = ', eps)
        for k in range(1, dims + 1):
            tmp_k = []
            for i in range(num_rounds):
                z, p, sigma = random_projection.private_projection(
                    x, eps=eps, delta=delta, k=k, is_sparse=is_sparse)
                z_train, z_test, y_train, y_test = train_test_split(
                    z, y, test_size=test_size)
                clf = gs.best_estimator_
                clf.fit(z_train, y_train)
                y_pred = clf.predict(z_test)
                score = metrics.balanced_accuracy_score(y_test, y_pred)
                tmp_k.append(score)

            mean_k = np.mean(tmp_k)
            print('k = %d, mean = %.10f' % (k, mean_k))
            scores[eps].append(mean_k)

    return scores, baseline
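
Example #1 measures how the balanced accuracy of the tuned classifier degrades when the data is replaced by a differentially private Johnson-Lindenstrauss projection, sweeping the privacy budget eps and the target dimension k and averaging each setting over 100 fresh projections. The helper random_projection.private_projection belongs to the surrounding project; the sketch below is only an illustration of what such a helper can look like (a JL transform followed by Gaussian output perturbation, assuming unit sensitivity), and private_projection_sketch is a hypothetical name, not the project's implementation.

# Hedged sketch only: JL projection plus Gaussian output noise, assuming the
# contribution of any single row is bounded by 1; the real
# random_projection.private_projection may differ in mechanism and scaling.
import numpy as np

def private_projection_sketch(x, eps=1.0, delta=0.1, k=5, is_sparse=False):
    n, dims = x.shape
    if is_sparse:
        # Achlioptas-style sparse projection with entries in {-1, 0, +1}
        p = np.random.choice([-1.0, 0.0, 1.0], size=(dims, k), p=[1 / 6, 2 / 3, 1 / 6])
        p *= np.sqrt(3.0 / k)
    else:
        p = np.random.normal(0.0, 1.0 / np.sqrt(k), size=(dims, k))
    z = x @ p
    # Gaussian-mechanism style noise scale for (eps, delta)-DP with sensitivity 1
    sigma = np.sqrt(2.0 * np.log(1.25 / delta)) / eps
    z = z + np.random.normal(0.0, sigma, size=z.shape)
    return z, p, sigma
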
Example #2
def heart_generalized_response(
        clf=KNeighborsClassifier(), params=KNN_PARAMS, test_size=0.25):
    x, y = datasets.heart_disease()

    x_scale = MinMaxScaler().fit_transform(x)

    gs, x_train, y_train, x_test, y_test = grid_search(x_scale,
                                                       y,
                                                       clf,
                                                       params=params,
                                                       test_size=test_size,
                                                       standardize=False,
                                                       verbose=1)
    clf = gs.best_estimator_
    y_pred = clf.predict(x_test)
    plot.print_metrics(y_test, y_pred)
    baseline = metrics.balanced_accuracy_score(y_test, y_pred)
    print(baseline)
    num_rounds = 100
    scores = defaultdict(list)

    for eps in EPS_RANGE:
        for i in range(num_rounds):
            z, ps, qs = randomized_response(x, eps)
            z_scale = MinMaxScaler().fit_transform(z)
            z_train, z_test, y_train, y_test = train_test_split(
                z_scale, y, test_size=test_size)
            clf = gs.best_estimator_
            clf.fit(z_train, y_train)
            y_pred = clf.predict(z_test)
            score = metrics.balanced_accuracy_score(y_test, y_pred)
            scores[eps].append(score)

        print('eps = %.2f, mean = %.10f' % (eps, np.mean(scores[eps])))

    return scores, baseline
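
Example #2 runs the same evaluation, but the private view of the data comes from randomized response instead of a random projection. randomized_response(x, eps) is again a project helper that returns the perturbed matrix together with per-feature reporting probabilities ps and qs; the sketch below covers only the binary-feature case (the heart-disease features are not all binary, so the real helper is presumably more general), and randomized_response_sketch is a hypothetical name.

# Hedged sketch only: classic randomized response applied independently to each
# binary feature, mirroring the (z, ps, qs) return shape of the project helper.
import numpy as np

def randomized_response_sketch(x, eps):
    x = np.asarray(x)
    p = np.exp(eps) / (np.exp(eps) + 1.0)  # probability of keeping the true bit
    keep = np.random.random(x.shape) < p
    z = np.where(keep, x, 1 - x)           # flip the bit when not kept
    ps = np.full(x.shape[1], p)            # P(report 1 | true value is 1)
    qs = np.full(x.shape[1], 1.0 - p)      # P(report 1 | true value is 0)
    return z, ps, qs
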
Example #3
            return curr_top_k.tolist()
        else:
            # res = prev_top_k[0: k - m]
            res = gdv_prev[0:k - m]
            # insert in the output m random values
            start = curr_top_k[k - 1]  # last item
            # end = prev_top_k[k - m]
            end = gdv_prev[k - m]  # k - m th item
            # res = res.tolist()
            for _ in range(m):
                rand = np.random.uniform(start, end)
                res.append(rand)
            return sorted(res)


x, y = heart_disease()
# x = MinMaxScaler().fit_transform(x)
num_nodes = 3
k = 5  # number of k nearest neighbors
num_rounds = 2
p0 = 1.0
d = 0.25

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
global_knn = KNeighborsClassifier(n_neighbors=k)
global_knn.fit(x_train, y_train)
global_y_pred = global_knn.predict(x_test)
baseline = metrics.balanced_accuracy_score(y_test, global_y_pred)
print(baseline)
# Suppose 3 nodes
# For simplicity every node has only one sample x (a minimal sketch of this setup follows below)
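
The fragment above ends with driver code that computes a centralized kNN baseline and then, in the trailing comment, describes the toy scenario of 3 nodes that each hold a single sample. A purely illustrative continuation of that setup (none of these names are part of the original module, apart from heart_disease) could look like this:

# Hedged sketch of the 3-node toy setup described in the comment above.
import numpy as np
from scipy.spatial import distance
from sklearn.model_selection import train_test_split

x, y = heart_disease()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

num_nodes = 3
node_samples = x_train[:num_nodes]  # one training sample per node
query = x_test[0]                   # a single query point

# each node computes the distance from its own sample to the query locally
local_distances = [distance.euclidean(sample, query) for sample in node_samples]
print('local distances:', local_distances)
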
Example #4
def test(p0=1.0, d=0.75, rounds=10, k=5):
    # data preparation

    x, y = heart_disease()
    # x = MinMaxScaler().fit_transform(x)
    randomize = np.random.permutation(len(x))
    x = x[randomize]
    y = y[randomize]
    x = np.squeeze(x)
    y = np.squeeze(y)
    n = 3

    x_split = np.array_split(x, n + 1)
    y_split = np.array_split(y, n + 1)

    x_test_dataset = x_split[n]
    y_test_dataset = y_split[n]

    y_pred = []

    for sample_id in range(len(y_test_dataset)):

        if DEBUG:
            print('SAMPLE %d' % (sample_id))

        x_test_sample = x_test_dataset[sample_id].reshape(1, -1)
        ldv = []

        # Each node computes the distance between x and each point in its database
        for node_idx in range(n):
            dataset = x_split[node_idx]
            ldv_i = []
            tmp = {}

            for point_id in range(len(dataset)):
                point = dataset[point_id]
                # use a separate name so the protocol parameter d is not overwritten
                dist_to_query = distance.euclidean(point, x_test_sample)
                tmp[point_id] = dist_to_query

            sorted_tmp = [(pid, tmp[pid])
                          for pid in sorted(tmp, key=tmp.get, reverse=False)]
            tmp_dists = []
            tmp_ids = []
            for point_id, dist in sorted_tmp:
                tmp_ids.append(point_id)
                tmp_dists.append(dist)
            tmp_ids = np.array(tmp_ids)
            tmp_dists = np.array(tmp_dists)
            ldv_i.append((tmp_dists[:k], tmp_ids[:k]))
            ldv.append(ldv_i)

        # for node_idx in range(n):
        #     local_knn = local_knn_classifiers[node_idx]
        #     ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)
        #     ldv.append(ldv_i)

        # Now we need to find GLOBALLY the k nearest distances and store them in gdv

        # gdv = [[np.random.uniform() for _ in range(k + 1)]]

        gdv = [[np.random.uniform(100.0, 200.0) for _ in range(k + 1)]]

        for r in range(rounds):

            # keep only the last item of gdv at next round
            if r > 0:
                gdv = [gdv[-1]]

            if DEBUG:
                print('Round: %d' % r)

            # alg init: curr = prev, then curr=1 and prev = 0
            node_idx_curr = 0
            node_idx_prev = 0
            gdv_i = knearest_global(node_idx_curr, node_idx_prev, r, ldv, gdv,
                                    p0, d)
            gdv.append(gdv_i)
            gdv = [gdv[1]]  # remove randomly generated list

            for idx in range(n):
                node_idx_prev = idx
                node_idx_curr = idx + 1
                if node_idx_curr >= n:  # then the ring is closed
                    break
                gdv_i = knearest_global(node_idx_curr, node_idx_prev, r, ldv,
                                        gdv, p0, d)
                gdv.append(gdv_i)
                if DEBUG:
                    print(gdv_i, node_idx_prev, node_idx_curr)

            node_idx_curr = 0
            node_idx_prev = n - 1
            gdv_i = knearest_global(node_idx_curr, node_idx_prev, r, ldv, gdv,
                                    p0, d)
            gdv.append(gdv_i)

        gdv = gdv_i
        gdv = np.array(gdv)
        gdv.sort()
        # At the end of the rounds gdv = real_gdv (CHECK FOR PRIVACY BUGS: maybe with this implementation a node can snoop
        # other nodes' topk)

        real_gdv = []
        for ldv_i in ldv:
            dists = ldv_i[0][0].flatten()
            real_gdv.append(dists)

        real_gdv = np.array(real_gdv).flatten()
        real_gdv.sort()

        if DEBUG:
            print('REAL TOP K DISTANCES ', real_gdv[:5])
            print('GLOBAL DISTANCE VECTOR ', gdv)

        # sanity check: to which database the top k belong ?
        # if DEBUG:
        #     for v in real_gdv[:5]:
        #         for node_idx in range(n):
        #             data = ldv[node_idx][0].flatten()
        #             if v in data:
        #                 print(v, node_idx)

        # So now every node knows the k nearest distances

        # CLASSIFICATION
        # After each node determines the points in its database which are within the kth nearest distance from x, each
        # node computes a local classification vector of the query instance, where the ith element is the number of
        # votes the ith class received from the points in this node's database that are among the k nearest neighbors.
        # Note: so, I need to compute for every node the classification of x?
        # The nodes then cooperate to find a global classification vector

        k_point = gdv[-1]

        lcv = []

        for node_idx in range(n):

            lcv_i = np.zeros((1, len(np.unique(y)))).squeeze().tolist()

            # ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)
            ldv_i = ldv[node_idx][0]
            # the paper isn't very clear here. My interpretation:
            # find all points that are within the radius of the k-th nearest point
            dists = ldv_i[0].flatten()
            ids = ldv_i[1].flatten()

            dist_id = 0
            for dist in dists:
                if dist <= k_point:
                    idx = ids[dist_id]
                    cl = y_split[node_idx][idx]
                    if DEBUG:
                        print(node_idx, idx, cl)
                    lcv_i[cl] += 1
                dist_id += 1

            # if DEBUG:
            #     prediction = local_knn.predict(x_test_sample)
            #     my_prediction = np.argmax(lcv_i)
            #     if prediction != my_prediction:
            #         prediction = local_knn.predict(x_test_sample)
            lcv.append(lcv_i)

        # in this case we have that node 0 doesn't have any point in lcv. So it knows that the other nodes have all the
        # other distances. But I think this is part of the algorithm, since the global distance vector is public and
        # every node can compute it. If node 0 colludes with node 1, they will find out that all the classification is
        # made by node 2, which has the most classification power.
        # Maybe with a better randomization (see the previous _todo_) this problem will happen less often,
        # i.e. every node has more or less the same power in deciding the classification of a test point.

        # let's say the random values are known only to a trusted third party
        random_values = np.random.randint(100, size=2)

        gcv = np.copy(random_values)

        # secure sum (this is a local simplification, of course, but the main idea is that every node adds its class
        # values to the global vector it receives; a node doesn't know in which position of the ring it is, nor the
        # local classification values of any other node, though it can still collude with the others).
        # A standalone secure-sum sketch follows after this example.

        for lcv_i in lcv:
            pos = 0
            for val in lcv_i:
                gcv[pos] += val
                pos += 1

        real_gcv = gcv - random_values

        if DEBUG:
            print('final gcv (what the final node sees) ', gcv)
            print('random values ', random_values)
            print('real gcv ', real_gcv)

        if real_gcv[0] == real_gcv[1]:  # flip a coin
            cls = 0 if random.random() < 0.5 else 1
        else:
            cls = np.argmax(real_gcv)

        if DEBUG:
            print('REAL CLASS: %d\n'
                  'PREDICTED CLASS: %d' % (y_test_dataset[sample_id], cls))

        y_pred.append(cls)
    # plot.print_metrics(y_test_dataset, y_pred)
    return metrics.balanced_accuracy_score(y_test_dataset, y_pred)
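
The aggregation step at the end of the function above is, as its comments note, a local simplification of a secure sum: random values stand in for a masking party, every node adds its local classification vector to the running total, and the mask is removed before the argmax. A standalone sketch of just that step, with purely illustrative names:

# Hedged sketch of the masked (secure-sum style) aggregation used above.
import numpy as np

def secure_sum_sketch(local_vectors, num_classes, mask_high=100):
    mask = np.random.randint(mask_high, size=num_classes)  # known only to the masking party
    running = mask.astype(float)                           # float so fractional votes also work
    for lcv_i in local_vectors:                            # each node adds its local class votes
        running = running + np.asarray(lcv_i, dtype=float)
    return running - mask                                  # the masking party removes its mask

# Example: three nodes voting over two classes -> [3. 5.]
print(secure_sum_sketch([[2, 1], [0, 3], [1, 1]], num_classes=2))
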
Example #5
def test_knn(p0=1.0, d=0.75, rounds=10, k=5, n=3):
    '''
    Multi-round distributed kNN over a ring of n nodes.

    :param p0: initial probability parameter passed to knearest_global_knn
    :param d: probability/decay parameter passed to knearest_global_knn
    :param rounds: number of rounds of the ring protocol
    :param k: number of nearest neighbors
    :param n: number of participating nodes (one extra split is held out as the test set)
    :return: (balanced accuracy of the distributed protocol, centralized kNN baseline)
    '''
    # data preparation

    x, y = heart_disease()
    # x, y = datasets.load_breast_cancer(return_X_y=True)
    # x, y = abalone()
    # x = MinMaxScaler().fit_transform(x)
    # x = StandardScaler().fit_transform(x)
    randomize = np.random.permutation(len(x))
    x = x[randomize]
    y = y[randomize]
    x = np.squeeze(x)
    y = np.squeeze(y)

    x_split = np.array_split(x, n + 1)
    y_split = np.array_split(y, n + 1)

    x_test_dataset = x_split[n]
    y_test_dataset = y_split[n]

    true_knn = KNeighborsClassifier(n_neighbors=k)
    true_x = []
    true_y = []
    for node_idx in range(n):
        local_data = x_split[node_idx]
        for value in local_data:
            true_x.append(value)
    for node_idx in range(n):
        local_data = y_split[node_idx]
        for label in local_data:
            true_y.append(label)

    true_knn.fit(true_x, true_y)
    true_pred = true_knn.predict(x_test_dataset)
    true_baseline = metrics.balanced_accuracy_score(y_test_dataset, true_pred)
    local_classifiers = []
    for node_idx in range(n):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_split[node_idx], y_split[node_idx])
        local_classifiers.append(knn)

    y_pred = []

    for sample_id in range(len(y_test_dataset)):

        if DEBUG:
            print('SAMPLE %d' % (sample_id))

        x_test_sample = x_test_dataset[sample_id].reshape(1, -1)
        ldv = []

        for node_idx in range(n):
            node_knn = local_classifiers[node_idx]
            ldv_i = node_knn.kneighbors(x_test_sample, n_neighbors=k)[0]
            ldv.append(ldv_i)

        gdv = [[np.random.uniform(100) for _ in range(k)]]

        for r in range(rounds):

            # keep only the last item of gdv at next round
            if r > 0:
                gdv = [gdv[-1]]

            if DEBUG:
                print('Round: %d' % r)

            # alg init: curr = prev, then curr=1 and prev = 0
            node_idx_curr = 0
            node_idx_prev = 0
            gdv_i = knearest_global_knn(node_idx_curr,
                                        node_idx_prev,
                                        r,
                                        ldv,
                                        gdv,
                                        p0,
                                        d,
                                        k=k)
            gdv.append(gdv_i)
            gdv = [gdv[1]]  # remove randomly generated list

            for idx in range(n):
                node_idx_prev = idx
                node_idx_curr = idx + 1
                if node_idx_curr >= n:  # then the ring is closed
                    break
                gdv_i = knearest_global_knn(node_idx_curr,
                                            node_idx_prev,
                                            r,
                                            ldv,
                                            gdv,
                                            p0,
                                            d,
                                            k=k)
                gdv.append(gdv_i)
                if DEBUG:
                    print(gdv_i, node_idx_prev, node_idx_curr)

            node_idx_curr = 0
            node_idx_prev = n - 1
            gdv_i = knearest_global_knn(node_idx_curr,
                                        node_idx_prev,
                                        r,
                                        ldv,
                                        gdv,
                                        p0,
                                        d,
                                        k=k)
            gdv.append(gdv_i)

        gdv = gdv_i
        gdv = np.array(gdv)
        gdv.sort()
        # At the end of the rounds gdv = real_gdv (CHECK FOR PRIVACY BUGS: maybe with this implementation a node can snoop
        # other nodes' topk)

        real_gdv = []
        for ldv_i in ldv:
            dists = ldv_i[0].flatten()
            real_gdv.append(dists)

        real_gdv = np.array(real_gdv).flatten()
        real_gdv.sort()
        # real_gdv = true_knn.kneighbors(x_test_sample, n_neighbors=k)
        if DEBUG:
            print('REAL TOP K DISTANCES ', real_gdv[:k])
            print('GLOBAL DISTANCE VECTOR ', gdv)

        k_point = gdv[0]

        lcv = []

        for node_idx in range(n):
            lcv_i = np.zeros((1, len(np.unique(y)))).squeeze().tolist()

            local_knn = local_classifiers[node_idx]
            ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)

            dists = ldv_i[0].flatten()
            ids = ldv_i[1].flatten()

            dist_id = 0
            for dist in dists:
                if dist <= k_point:
                    idx = ids[dist_id]
                    cl = y_split[node_idx][idx]
                    if DEBUG:
                        print(node_idx, idx, cl)
                    lcv_i[cl] += dist

                dist_id += 1

            lcv.append(lcv_i)
            # local_pred = int(local_knn.predict(x_test_sample))
            # lcv_i[local_pred] += 1
            # lcv.append(lcv_i)
            # lcv_i = local_knn.predict_proba(x_test_sample)
            # lcv.append(lcv_i)

        random_values = np.random.randint(100, size=len(np.unique(y)))
        gcv = np.copy(random_values)

        for lcv_i in lcv:
            pos = 0
            for val in lcv_i:
                gcv[pos] += val
                pos += 1

        real_gcv = gcv - random_values

        if DEBUG:
            print('final gcv (what the final node sees) ', gcv)
            print('random values ', random_values)
            print('real gcv ', real_gcv)

        cls = np.argmax(real_gcv)

        if DEBUG:
            print('REAL CLASS: %d\n'
                  'PREDICTED CLASS: %d' % (y_test_dataset[sample_id], cls))

        y_pred.append(cls)
    # plot.print_metrics(y_test_dataset, y_pred)
    return metrics.balanced_accuracy_score(y_test_dataset,
                                           y_pred), true_baseline
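
Because the protocol is randomized, a single run of test_knn is noisy. A hedged usage sketch that repeats it a few times and compares the distributed result with the centralized baseline the function returns (the repetition count and parameter values below are illustrative):

# Hedged usage sketch for test_knn defined above.
import numpy as np

private_scores, baselines = [], []
for _ in range(5):
    score, baseline = test_knn(p0=1.0, d=0.75, rounds=10, k=5, n=3)
    private_scores.append(score)
    baselines.append(baseline)

print('distributed kNN mean balanced accuracy: %.4f' % np.mean(private_scores))
print('centralized kNN mean balanced accuracy: %.4f' % np.mean(baselines))
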
Example #6
def lap_knn(n=3, k=5, eps=1.0):
    x, y = heart_disease()
    # x, y = datasets.load_breast_cancer(return_X_y=True)
    # x, y = abalone()
    # x = MinMaxScaler().fit_transform(x)
    # x = StandardScaler().fit_transform(x)
    randomize = np.random.permutation(len(x))
    x = x[randomize]
    y = y[randomize]
    x = np.squeeze(x)
    y = np.squeeze(y)

    x_split = np.array_split(x, n + 1)
    y_split = np.array_split(y, n + 1)

    x_test_dataset = x_split[n]
    y_test_dataset = y_split[n]

    true_knn = KNeighborsClassifier(n_neighbors=k)
    true_x = []
    true_y = []
    for node_idx in range(n):
        local_data = x_split[node_idx]
        for value in local_data:
            true_x.append(value)
    for node_idx in range(n):
        local_data = y_split[node_idx]
        for label in local_data:
            true_y.append(label)

    true_knn.fit(true_x, true_y)
    true_pred = true_knn.predict(x_test_dataset)
    true_baseline = metrics.balanced_accuracy_score(y_test_dataset, true_pred)
    local_classifiers = []
    for node_idx in range(n):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_split[node_idx], y_split[node_idx])
        local_classifiers.append(knn)

    y_pred = []

    for sample_id in range(len(y_test_dataset)):

        if DEBUG:
            print('SAMPLE %d' % (sample_id))

        x_test_sample = x_test_dataset[sample_id].reshape(1, -1)
        ldv = []

        for node_idx in range(n):
            node_knn = local_classifiers[node_idx]
            ldv_i = node_knn.kneighbors(x_test_sample, n_neighbors=k)[0]
            ldv.append(ldv_i + np.random.laplace(scale=k / eps, size=k))

            # relative size of the Laplace perturbation on this node's top-k distances
            print('relative perturbation: %.4f' %
                  (np.linalg.norm(ldv_i - ldv[node_idx]) / np.linalg.norm(ldv_i)))

        # instead of the multi-round algorithm, each node publishes its top-k distances with Laplace noise added;
        # the released top-k distances are differentially private with Lap(k / eps) noise,
        # and a single round is enough (a standalone sketch of this release step follows after this example)

        ldv = np.array(ldv)
        gdv = ldv.flatten()
        gdv.sort()
        gdv = gdv[:k]

        k_point = gdv[-1]

        lcv = []

        for node_idx in range(n):
            lcv_i = np.zeros((1, len(np.unique(y)))).squeeze().tolist()

            local_knn = local_classifiers[node_idx]
            ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)

            dists = ldv_i[0].flatten()
            ids = ldv_i[1].flatten()

            dist_id = 0
            for dist in dists:
                if dist <= k_point:
                    idx = ids[dist_id]
                    cl = y_split[node_idx][idx]
                    if DEBUG:
                        print(node_idx, idx, cl)
                    lcv_i[cl] += dist

                dist_id += 1

            lcv.append(lcv_i)
            # local_pred = int(local_knn.predict(x_test_sample))
            # lcv_i[local_pred] += 1
            # lcv.append(lcv_i)
            # lcv_i = local_knn.predict_proba(x_test_sample)
            # lcv.append(lcv_i)

        random_values = np.random.randint(100, size=len(np.unique(y)))
        gcv = np.copy(random_values)

        for lcv_i in lcv:
            pos = 0
            for val in lcv_i:
                gcv[pos] += val
                pos += 1

        real_gcv = gcv - random_values

        if DEBUG:
            print('final gcv (what the final node sees) ', gcv)
            print('random values ', random_values)
            print('real gcv ', real_gcv)

        cls = np.argmax(real_gcv)

        if DEBUG:
            print('REAL CLASS: %d\n'
                  'PREDICTED CLASS: %d' % (y_test_dataset[sample_id], cls))

        y_pred.append(cls)
    # plot.print_metrics(y_test_dataset, y_pred)
    return metrics.balanced_accuracy_score(y_test_dataset,
                                           y_pred), true_baseline
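
The comments inside lap_knn describe the single-round alternative to the multi-round protocol: every node perturbs its k local nearest-neighbor distances with Lap(k / eps) noise and publishes them, and the k smallest published values form the global distance vector. A standalone sketch of just that release step, with illustrative names:

# Hedged sketch of the single-round Laplace release of top-k distances.
import numpy as np

def laplace_top_k_sketch(local_top_k_per_node, k, eps):
    noisy = [np.asarray(d, dtype=float) +
             np.random.laplace(scale=k / eps, size=len(d))
             for d in local_top_k_per_node]
    merged = np.sort(np.concatenate(noisy))
    return merged[:k]  # the k smallest published (noisy) distances

# Example: three nodes, each publishing its own k=3 smallest distances.
print(laplace_top_k_sketch([[0.2, 0.5, 0.9], [0.1, 0.4, 1.2], [0.3, 0.6, 0.8]],
                           k=3, eps=1.0))
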