# Imports assumed by this module: the standard scientific stack plus the
# repo's own helpers (datasets, plot, grid_search, random_projection,
# randomized_response, heart_disease, knearest_global, knearest_global_knn),
# which are not shown in this section.
import random
from collections import defaultdict

import numpy as np
from scipy.spatial import distance
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


def jl_dp_accuracy(clf=KNeighborsClassifier(), params=KNN_PARAMS,
                   test_size=0.25, is_sparse=True):
    # Baseline: grid-searched classifier on the min-max scaled original data.
    x, y = datasets.heart_disease()
    x = MinMaxScaler().fit_transform(x)
    split_random_state = int(random.random() * 100)
    gs, x_train, y_train, x_test, y_test = grid_search(
        x, y, clf, params, test_size=test_size, standardize=False, verbose=1,
        random_state=split_random_state)
    clf = gs.best_estimator_
    y_pred = clf.predict(x_test)
    plot.print_metrics(y_test, y_pred)
    baseline = metrics.balanced_accuracy_score(y_test, y_pred)
    print(baseline)
    num_rounds = 100
    dims = x.shape[1]
    scores = defaultdict(list)
    delta = 0.1
    # For each privacy budget eps and projection dimension k, average the
    # balanced accuracy on privately projected data over num_rounds runs.
    for eps in EPS_RANGE:
        print('epsilon = ', eps)
        for k in range(1, dims + 1):
            tmp_k = []
            for i in range(num_rounds):
                z, p, sigma = random_projection.private_projection(
                    x, eps=eps, delta=delta, k=k, is_sparse=is_sparse)
                z_train, z_test, y_train, y_test = train_test_split(
                    z, y, test_size=test_size)
                clf = gs.best_estimator_
                clf.fit(z_train, y_train)
                y_pred = clf.predict(z_test)
                score = metrics.balanced_accuracy_score(y_test, y_pred)
                tmp_k.append(score)
            mean_k = np.mean(tmp_k)
            print('k = %d, mean = %.10f' % (k, mean_k))
            scores[eps].append(mean_k)
    return scores, baseline
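# For reference, a minimal sketch of what random_projection.private_projection
# is assumed to do above: project onto k Johnson-Lindenstrauss dimensions and
# perturb the result with Gaussian noise calibrated to (eps, delta). The
# calibration below (Gaussian mechanism with L2 sensitivity 1) and the
# function name are illustrative assumptions; the repo's implementation is
# authoritative.
def private_projection_sketch(x, eps, delta, k):
    n, dims = x.shape
    p = np.random.normal(0.0, 1.0 / np.sqrt(k), size=(dims, k))  # JL matrix
    z = x.dot(p)  # project onto k dimensions
    sigma = np.sqrt(2.0 * np.log(1.25 / delta)) / eps  # Gaussian mechanism
    z = z + np.random.normal(0.0, sigma, size=z.shape)
    return z, p, sigma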
def heart_generalized_response(
        clf=KNeighborsClassifier(), params=KNN_PARAMS, test_size=0.25):
    x, y = datasets.heart_disease()
    x_scale = MinMaxScaler().fit_transform(x)
    gs, x_train, y_train, x_test, y_test = grid_search(
        x_scale, y, clf, params=params, test_size=test_size,
        standardize=False, verbose=1)
    clf = gs.best_estimator_
    y_pred = clf.predict(x_test)
    plot.print_metrics(y_test, y_pred)
    baseline = metrics.balanced_accuracy_score(y_test, y_pred)
    print(baseline)
    num_rounds = 100
    scores = defaultdict(list)
    for eps in EPS_RANGE:
        for i in range(num_rounds):
            z, ps, qs = randomized_response(x, eps)
            z_scale = MinMaxScaler().fit_transform(z)
            z_train, z_test, y_train, y_test = train_test_split(
                z_scale, y, test_size=test_size)
            clf = gs.best_estimator_
            clf.fit(z_train, y_train)
            y_pred = clf.predict(z_test)
            score = metrics.balanced_accuracy_score(y_test, y_pred)
            scores[eps].append(score)
        print('eps = %.2f, mean = %.10f' % (eps, np.mean(scores[eps])))
    return scores, baseline
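# A minimal sketch of the randomized_response helper assumed above, restricted
# to 0/1 attributes: each entry is reported truthfully with probability
# p = e^eps / (e^eps + 1) and flipped otherwise, which is eps-differentially
# private per attribute. The repo's randomized_response (which also returns
# per-feature probabilities ps and qs) is the real implementation; this
# binary version is illustrative only.
def randomized_response_sketch(x, eps):
    p = np.exp(eps) / (np.exp(eps) + 1.0)  # probability of telling the truth
    keep = np.random.random(x.shape) < p
    z = np.where(keep, x, 1 - x)  # flip the entries that are not kept
    return z, p, 1.0 - p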
        # Tail of the top-k merge helper (presumably knearest_global); the
        # if-branch that returns the current top-k unchanged is not shown in
        # this section.
        return curr_top_k.tolist()
    else:
        # res = prev_top_k[0: k - m]
        res = gdv_prev[0:k - m]
        # insert m random values in the output
        start = curr_top_k[k - 1]  # last item
        # end = prev_top_k[k - m]
        end = gdv_prev[k - m]  # (k - m)-th item
        # res = res.tolist()
        for _ in range(m):
            rand = np.random.uniform(start, end)
            res.append(rand)
        return sorted(res)


x, y = heart_disease()
# x = MinMaxScaler().fit_transform(x)
num_nodes = 3
k = 5  # number of nearest neighbors
num_rounds = 2
p0 = 1.0
d = 0.25
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
global_knn = KNeighborsClassifier(n_neighbors=k)
global_knn.fit(x_train, y_train)
global_y_pred = global_knn.predict(x_test)
baseline = metrics.balanced_accuracy_score(y_test, global_y_pred)
print(baseline)
# Suppose 3 nodes.
# For simplicity, every node has only one sample x.
def test(p0=1.0, d=0.75, rounds=10, k=5):
    # data preparation
    x, y = heart_disease()
    # x = MinMaxScaler().fit_transform(x)
    randomize = np.random.permutation(len(x))
    x = x[randomize]
    y = y[randomize]
    x = np.squeeze(x)
    y = np.squeeze(y)
    n = 3
    x_split = np.array_split(x, n + 1)
    y_split = np.array_split(y, n + 1)
    x_test_dataset = x_split[n]
    y_test_dataset = y_split[n]
    y_pred = []
    for sample_id in range(len(y_test_dataset)):
        if DEBUG:
            print('SAMPLE %d' % (sample_id))
        x_test_sample = x_test_dataset[sample_id].reshape(1, -1)
        ldv = []
        # Each node computes the distance between x and each point in its
        # database.
        for node_idx in range(n):
            dataset = x_split[node_idx]
            ldv_i = []
            tmp = {}
            for point_id in range(len(dataset)):
                point = dataset[point_id]
                # NOTE: the original assigned this to `d`, silently
                # overwriting the randomization parameter `d` that is passed
                # to knearest_global below; renamed to fix the shadowing bug.
                dist = distance.euclidean(point, x_test_sample)
                tmp[point_id] = dist
            sorted_tmp = [(pid, tmp[pid])
                          for pid in sorted(tmp, key=tmp.get, reverse=False)]
            tmp_dists = []
            tmp_ids = []
            for point_id, dist in sorted_tmp:
                tmp_ids.append(point_id)
                tmp_dists.append(dist)
            tmp_ids = np.array(tmp_ids)
            tmp_dists = np.array(tmp_dists)
            ldv_i.append((tmp_dists[:k], tmp_ids[:k]))
            ldv.append(ldv_i)
        # for node_idx in range(n):
        #     local_knn = local_knn_classifiers[node_idx]
        #     ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)
        #     ldv.append(ldv_i)
        # Now we need to find GLOBALLY the k nearest distances and store them
        # in gdv.
        # gdv = [[np.random.uniform() for _ in range(k + 1)]]
        gdv = [[np.random.uniform(100.0, 200.0) for _ in range(k + 1)]]
        for r in range(rounds):
            # keep only the last item of gdv at the next round
            if r > 0:
                gdv = [gdv[-1]]
            if DEBUG:
                print('Round: %d' % r)
            # alg init: curr = prev, then curr = 1 and prev = 0
            node_idx_curr = 0
            node_idx_prev = 0
            gdv_i = knearest_global(node_idx_curr, node_idx_prev, r, ldv,
                                    gdv, p0, d)
            gdv.append(gdv_i)
            gdv = [gdv[1]]  # remove the randomly generated list
            for idx in range(n):
                node_idx_prev = idx
                node_idx_curr = idx + 1
                if node_idx_curr >= n:  # then the ring is closed
                    break
                gdv_i = knearest_global(node_idx_curr, node_idx_prev, r, ldv,
                                        gdv, p0, d)
                gdv.append(gdv_i)
                if DEBUG:
                    print(gdv_i, node_idx_prev, node_idx_curr)
            node_idx_curr = 0
            node_idx_prev = n - 1
            gdv_i = knearest_global(node_idx_curr, node_idx_prev, r, ldv,
                                    gdv, p0, d)
            gdv.append(gdv_i)
        gdv = gdv_i
        gdv = np.array(gdv)
        gdv.sort()
        # At the end of the rounds gdv == real_gdv (CHECK FOR PRIVACY BUGS:
        # maybe with this implementation a node can snoop on other nodes'
        # top-k).
        real_gdv = []
        for ldv_i in ldv:
            dists = ldv_i[0][0].flatten()
            real_gdv.append(dists)
        real_gdv = np.array(real_gdv).flatten()
        real_gdv.sort()
        if DEBUG:
            print('REAL TOP K DISTANCES ', real_gdv[:5])
            print('GLOBAL DISTANCE VECTOR ', gdv)
        # sanity check: to which database does each of the top k belong?
        # if DEBUG:
        #     for v in real_gdv[:5]:
        #         for node_idx in range(n):
        #             data = ldv[node_idx][0].flatten()
        #             if v in data:
        #                 print(v, node_idx)
        # So now every node knows the k nearest distances.
        # CLASSIFICATION
        # After each node determines the points in its database which are
        # within the k-th nearest distance from x, each node computes a local
        # classification vector of the query instance, where the i-th element
        # is the number of votes the i-th class received from the points in
        # this node's database which are among the k nearest neighbors.
        # Note: so, I need to compute for every node the classification of x?
        # The nodes then participate to find a global classification vector.
        k_point = gdv[-1]
        lcv = []
        for node_idx in range(n):
            lcv_i = np.zeros((1, len(np.unique(y)))).squeeze().tolist()
            # ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)
            ldv_i = ldv[node_idx][0]
            # The paper isn't very clear here. My interpretation: find all
            # points that are within the radius of the k-th point.
            dists = ldv_i[0].flatten()
            ids = ldv_i[1].flatten()
            dist_id = 0
            for dist in dists:
                if dist <= k_point:
                    idx = ids[dist_id]
                    cl = y_split[node_idx][idx]
                    if DEBUG:
                        print(node_idx, idx, cl)
                    lcv_i[cl] += 1
                dist_id += 1
            # if DEBUG:
            #     prediction = local_knn.predict(x_test_sample)
            #     my_prediction = np.argmax(lcv_i)
            #     if prediction != my_prediction:
            #         prediction = local_knn.predict(x_test_sample)
            lcv.append(lcv_i)
        # In this case node 0 doesn't have any point in lcv, so it knows that
        # the other nodes hold all the other distances. But I think this is
        # part of the algorithm, since the global distance vector is public
        # and every node can compute it. If node 0 colludes with node 1, they
        # will find out that all the classification is made by node 2, which
        # has the most classification power.
        # Maybe with a better randomization (see previous _todo_) the problem
        # will happen less often, i.e. every node has more or less the same
        # power in deciding the classification of a test point.
        # Let's say the random values are known only to a trusted third party.
        random_values = np.random.randint(100, size=2)
        gcv = np.copy(random_values)
        # Secure sum (this is a local simplification, of course; the main
        # idea is that every node adds its class counts to the global vector
        # it receives. A node doesn't know at which position of the ring it
        # sits, nor the local classification values of any other node. Still,
        # nodes can collude.)
        for lcv_i in lcv:
            pos = 0
            for val in lcv_i:
                gcv[pos] += val
                pos += 1
        real_gcv = gcv - random_values
        if DEBUG:
            print('final gcv (what the final node sees) ', gcv)
            print('random values ', random_values)
            print('real gcv ', real_gcv)
        if real_gcv[0] == real_gcv[1]:
            # flip a coin on a tie
            cls = 0 if random.random() < 0.5 else 1
        else:
            cls = np.argmax(real_gcv)
        if DEBUG:
            print('REAL CLASS: %d\n'
                  'PREDICTED CLASS: %d' % (y_test_dataset[sample_id], cls))
        y_pred.append(cls)
    # plot.print_metrics(y_test_dataset, y_pred)
    return metrics.balanced_accuracy_score(y_test_dataset, y_pred)
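# Standalone sketch of the secure-sum idea simulated above: a trusted party
# seeds the running vector with random offsets, each node in the ring adds
# its local vote vector to whatever it receives, and the offsets are removed
# at the end, so no intermediate node sees another node's raw votes. Names
# and the offset range are illustrative.
def secure_sum_sketch(local_vectors, num_classes):
    offsets = np.random.randint(100, size=num_classes)
    running = offsets.astype(float)
    for lcv_i in local_vectors:  # pass the running vector around the ring
        running = running + np.asarray(lcv_i, dtype=float)
    return running - offsets  # the trusted party removes the offsets

# e.g. secure_sum_sketch([[3, 1], [0, 2], [1, 1]], 2) -> array([4., 4.])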
def test_knn(p0=1.0, d=0.75, rounds=10, k=5, n=3):
    '''
    :param p0: initial randomization probability of the multi-round top-k
        protocol (forwarded to knearest_global_knn)
    :param d: decay factor applied to that probability across rounds
        (forwarded to knearest_global_knn)
    :param rounds: number of rounds of the ring protocol
    :param k: number of nearest neighbors
    :param n: number of participating nodes
    :return: (balanced accuracy of the private kNN, non-private kNN baseline)
    '''
    # data preparation
    x, y = heart_disease()
    # x, y = datasets.load_breast_cancer(return_X_y=True)
    # x, y = abalone()
    # x = MinMaxScaler().fit_transform(x)
    # x = StandardScaler().fit_transform(x)
    randomize = np.random.permutation(len(x))
    x = x[randomize]
    y = y[randomize]
    x = np.squeeze(x)
    y = np.squeeze(y)
    x_split = np.array_split(x, n + 1)
    y_split = np.array_split(y, n + 1)
    x_test_dataset = x_split[n]
    y_test_dataset = y_split[n]
    # non-private baseline: one kNN trained on the union of all local datasets
    true_knn = KNeighborsClassifier(n_neighbors=k)
    true_x = []
    true_y = []
    for node_idx in range(n):
        local_data = x_split[node_idx]
        for value in local_data:
            true_x.append(value)
    for node_idx in range(n):
        local_data = y_split[node_idx]
        for label in local_data:
            true_y.append(label)
    true_knn.fit(true_x, true_y)
    true_pred = true_knn.predict(x_test_dataset)
    true_baseline = metrics.balanced_accuracy_score(y_test_dataset, true_pred)
    local_classifiers = []
    for node_idx in range(n):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_split[node_idx], y_split[node_idx])
        local_classifiers.append(knn)
    y_pred = []
    for sample_id in range(len(y_test_dataset)):
        if DEBUG:
            print('SAMPLE %d' % (sample_id))
        x_test_sample = x_test_dataset[sample_id].reshape(1, -1)
        ldv = []
        for node_idx in range(n):
            node_knn = local_classifiers[node_idx]
            ldv_i = node_knn.kneighbors(x_test_sample, n_neighbors=k)[0]
            ldv.append(ldv_i)
        # NOTE: the original seeded with np.random.uniform(100), i.e.
        # low=100 > high=1.0, so the fake seed distances could fall below the
        # real ones; seed with an out-of-range interval as in test() instead.
        gdv = [[np.random.uniform(100.0, 200.0) for _ in range(k)]]
        for r in range(rounds):
            # keep only the last item of gdv at the next round
            if r > 0:
                gdv = [gdv[-1]]
            if DEBUG:
                print('Round: %d' % r)
            # alg init: curr = prev, then curr = 1 and prev = 0
            node_idx_curr = 0
            node_idx_prev = 0
            gdv_i = knearest_global_knn(node_idx_curr, node_idx_prev, r, ldv,
                                        gdv, p0, d, k=k)
            gdv.append(gdv_i)
            gdv = [gdv[1]]  # remove the randomly generated list
            for idx in range(n):
                node_idx_prev = idx
                node_idx_curr = idx + 1
                if node_idx_curr >= n:  # then the ring is closed
                    break
                gdv_i = knearest_global_knn(node_idx_curr, node_idx_prev, r,
                                            ldv, gdv, p0, d, k=k)
                gdv.append(gdv_i)
                if DEBUG:
                    print(gdv_i, node_idx_prev, node_idx_curr)
            node_idx_curr = 0
            node_idx_prev = n - 1
            gdv_i = knearest_global_knn(node_idx_curr, node_idx_prev, r, ldv,
                                        gdv, p0, d, k=k)
            gdv.append(gdv_i)
        gdv = gdv_i
        gdv = np.array(gdv)
        gdv.sort()
        # At the end of the rounds gdv == real_gdv (CHECK FOR PRIVACY BUGS:
        # maybe with this implementation a node can snoop on other nodes'
        # top-k).
        real_gdv = []
        for ldv_i in ldv:
            dists = ldv_i[0].flatten()
            real_gdv.append(dists)
        real_gdv = np.array(real_gdv).flatten()
        real_gdv.sort()
        # real_gdv = true_knn.kneighbors(x_test_sample, n_neighbors=k)
        if DEBUG:
            print('REAL TOP K DISTANCES ', real_gdv[:k])
            print('GLOBAL DISTANCE VECTOR ', gdv)
        # NOTE: the original took k_point = gdv[0], the smallest of the top-k
        # distances, which keeps almost no neighbors; the k-th nearest
        # distance is the last entry of the sorted vector, as in lap_knn.
        k_point = gdv[-1]
        lcv = []
        for node_idx in range(n):
            lcv_i = np.zeros((1, len(np.unique(y)))).squeeze().tolist()
            local_knn = local_classifiers[node_idx]
            ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)
            dists = ldv_i[0].flatten()
            ids = ldv_i[1].flatten()
            dist_id = 0
            for dist in dists:
                if dist <= k_point:
                    idx = ids[dist_id]
                    cl = y_split[node_idx][idx]
                    if DEBUG:
                        print(node_idx, idx, cl)
                    lcv_i[cl] += dist  # distance-weighted vote
                dist_id += 1
            lcv.append(lcv_i)
            # local_pred = int(local_knn.predict(x_test_sample))
            # lcv_i[local_pred] += 1
            # lcv.append(lcv_i)
            # lcv_i = local_knn.predict_proba(x_test_sample)
            # lcv.append(lcv_i)
        random_values = np.random.randint(100, size=len(np.unique(y)))
        # float copy: the distance-weighted votes are floats, and adding them
        # into the int array returned by randint would silently truncate them
        gcv = np.copy(random_values).astype(float)
        for lcv_i in lcv:
            pos = 0
            for val in lcv_i:
                gcv[pos] += val
                pos += 1
        real_gcv = gcv - random_values
        if DEBUG:
            print('final gcv (what the final node sees) ', gcv)
            print('random values ', random_values)
            print('real gcv ', real_gcv)
        cls = np.argmax(real_gcv)
        if DEBUG:
            print('REAL CLASS: %d\n'
                  'PREDICTED CLASS: %d' % (y_test_dataset[sample_id], cls))
        y_pred.append(cls)
    # plot.print_metrics(y_test_dataset, y_pred)
    return metrics.balanced_accuracy_score(y_test_dataset, y_pred), true_baseline
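# Example driver (an assumed usage, not part of the original module): repeat
# the private protocol a few times, since both the data split and the
# protocol's randomization vary between runs, and compare against the
# non-private kNN baseline.
accs, bases = [], []
for _ in range(5):
    acc, base = test_knn(p0=1.0, d=0.75, rounds=10, k=5, n=3)
    accs.append(acc)
    bases.append(base)
print('private kNN: %.4f, non-private baseline: %.4f'
      % (np.mean(accs), np.mean(bases)))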
def lap_knn(n=3, k=5, eps=1.0):
    x, y = heart_disease()
    # x, y = datasets.load_breast_cancer(return_X_y=True)
    # x, y = abalone()
    # x = MinMaxScaler().fit_transform(x)
    # x = StandardScaler().fit_transform(x)
    randomize = np.random.permutation(len(x))
    x = x[randomize]
    y = y[randomize]
    x = np.squeeze(x)
    y = np.squeeze(y)
    x_split = np.array_split(x, n + 1)
    y_split = np.array_split(y, n + 1)
    x_test_dataset = x_split[n]
    y_test_dataset = y_split[n]
    true_knn = KNeighborsClassifier(n_neighbors=k)
    true_x = []
    true_y = []
    for node_idx in range(n):
        local_data = x_split[node_idx]
        for value in local_data:
            true_x.append(value)
    for node_idx in range(n):
        local_data = y_split[node_idx]
        for label in local_data:
            true_y.append(label)
    true_knn.fit(true_x, true_y)
    true_pred = true_knn.predict(x_test_dataset)
    true_baseline = metrics.balanced_accuracy_score(y_test_dataset, true_pred)
    local_classifiers = []
    for node_idx in range(n):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_split[node_idx], y_split[node_idx])
        local_classifiers.append(knn)
    y_pred = []
    for sample_id in range(len(y_test_dataset)):
        if DEBUG:
            print('SAMPLE %d' % (sample_id))
        x_test_sample = x_test_dataset[sample_id].reshape(1, -1)
        # Instead of the multi-round algorithm, each node publishes its top-k
        # distances with Laplace noise added: the released top-k values are
        # differentially private with Lap(k / eps) noise, and a single round
        # suffices.
        ldv = []
        for node_idx in range(n):
            node_knn = local_classifiers[node_idx]
            ldv_i = node_knn.kneighbors(x_test_sample, n_neighbors=k)[0]
            ldv.append(ldv_i + np.random.laplace(scale=k / eps, size=k))
            if DEBUG:
                print('DIFFERENCE')
                print(np.linalg.norm(ldv_i - ldv[node_idx]) /
                      np.linalg.norm(ldv_i))
        ldv = np.array(ldv)
        gdv = ldv.flatten()
        gdv.sort()
        gdv = gdv[:k]
        k_point = gdv[-1]
        lcv = []
        for node_idx in range(n):
            lcv_i = np.zeros((1, len(np.unique(y)))).squeeze().tolist()
            local_knn = local_classifiers[node_idx]
            ldv_i = local_knn.kneighbors(x_test_sample, n_neighbors=k)
            dists = ldv_i[0].flatten()
            ids = ldv_i[1].flatten()
            dist_id = 0
            for dist in dists:
                if dist <= k_point:
                    idx = ids[dist_id]
                    cl = y_split[node_idx][idx]
                    if DEBUG:
                        print(node_idx, idx, cl)
                    lcv_i[cl] += dist  # distance-weighted vote
                dist_id += 1
            lcv.append(lcv_i)
            # local_pred = int(local_knn.predict(x_test_sample))
            # lcv_i[local_pred] += 1
            # lcv.append(lcv_i)
            # lcv_i = local_knn.predict_proba(x_test_sample)
            # lcv.append(lcv_i)
        random_values = np.random.randint(100, size=len(np.unique(y)))
        # float copy: distance-weighted votes would be silently truncated in
        # the int array returned by randint
        gcv = np.copy(random_values).astype(float)
        for lcv_i in lcv:
            pos = 0
            for val in lcv_i:
                gcv[pos] += val
                pos += 1
        real_gcv = gcv - random_values
        if DEBUG:
            print('final gcv (what the final node sees) ', gcv)
            print('random values ', random_values)
            print('real gcv ', real_gcv)
        cls = np.argmax(real_gcv)
        if DEBUG:
            print('REAL CLASS: %d\n'
                  'PREDICTED CLASS: %d' % (y_test_dataset[sample_id], cls))
        y_pred.append(cls)
    # plot.print_metrics(y_test_dataset, y_pred)
    return metrics.balanced_accuracy_score(y_test_dataset, y_pred), true_baseline
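# Minimal sketch of the noise calibration used in lap_knn: releasing k
# distances is treated as k queries, so each distance gets Laplace noise of
# scale k / eps and the per-query budgets compose to eps overall. This
# assumes each distance query has sensitivity 1, which is an assumption made
# here for illustration rather than something verified by the code.
def laplace_topk_sketch(dists, k, eps):
    topk = np.sort(np.asarray(dists, dtype=float))[:k]
    return topk + np.random.laplace(scale=k / eps, size=k)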