def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
    """Accumulates the data-independent privacy cost of Confident GNMax.

    For every query the data-independent Gaussian RDP bounds (as returned by
    pate.rdp_data_independent_gaussian) are added to a running total and the
    total is converted to an epsilon with pate.compute_eps_from_delta.

    Args:
      votes: Array of votes, one row per query. Only votes.shape[0] and row
        indexing are used here.
      threshold: Threshold of the confidence check. If either threshold or
        sigma1 is None, the check is skipped and every query is answered.
      sigma1: Noise parameter of the confidence check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta used to convert RDP to epsilon.

    Returns:
      A pair (eps_cum, answered) of arrays with one entry per query: the
      cumulative epsilon and the cumulative expected number of answered
      queries.
    """
    orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
    num_queries = votes.shape[0]
    use_threshold = threshold is not None and sigma1 is not None

    eps_cum = np.full(num_queries, None, dtype=float)
    answered = np.zeros(num_queries)
    rdp_total = np.zeros(len(orders))
    answered_total = 0

    for idx in range(num_queries):
        row = votes[idx]
        if use_threshold:
            q_step1 = np.exp(
                pate.compute_logpr_answered(threshold, sigma1, row))
            # The check itself is paid for on every query.
            rdp_total += pate.rdp_data_independent_gaussian(sigma1, orders)
        else:
            q_step1 = 1.  # Without a check every query is answered.

        answered_total += q_step1
        answered[idx] = answered_total

        # The GNMax step is paid for in proportion to the probability that
        # the query passes the check.
        rdp_total += q_step1 * pate.rdp_data_independent_gaussian(
            sigma2, orders)

        eps_cum[idx], order_opt = pate.compute_eps_from_delta(
            orders, rdp_total, delta)

        seen = idx + 1
        if idx > 0 and seen % 1000 == 0:
            message = ('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
                       'at order = {:.2f}.')
            print(message.format(seen, answered[idx], eps_cum[idx], order_opt))
            sys.stdout.flush()

    return eps_cum, answered
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
    """Accumulates the data-independent privacy cost of Confident GNMax.

    For every query the data-independent Gaussian RDP bounds (as returned by
    pate.rdp_data_independent_gaussian) are added to a running total and the
    total is converted to an epsilon with pate.compute_eps_from_delta.

    Args:
      votes: Array of votes, one row per query. Only votes.shape[0] and row
        indexing are used here.
      threshold: Threshold of the confidence check. If either threshold or
        sigma1 is None, the check is skipped and every query is answered.
      sigma1: Noise parameter of the confidence check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta used to convert RDP to epsilon.

    Returns:
      A pair (eps_cum, answered) of arrays with one entry per query: the
      cumulative epsilon and the cumulative expected number of answered
      queries.
    """
    orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
    num_queries = votes.shape[0]
    use_threshold = threshold is not None and sigma1 is not None

    eps_cum = np.full(num_queries, None, dtype=float)
    answered = np.zeros(num_queries)
    rdp_total = np.zeros(len(orders))
    answered_total = 0

    for idx in range(num_queries):
        row = votes[idx]
        if use_threshold:
            q_step1 = np.exp(
                pate.compute_logpr_answered(threshold, sigma1, row))
            # The check itself is paid for on every query.
            rdp_total += pate.rdp_data_independent_gaussian(sigma1, orders)
        else:
            q_step1 = 1.  # Without a check every query is answered.

        answered_total += q_step1
        answered[idx] = answered_total

        # The GNMax step is paid for in proportion to the probability that
        # the query passes the check.
        rdp_total += q_step1 * pate.rdp_data_independent_gaussian(
            sigma2, orders)

        eps_cum[idx], order_opt = pate.compute_eps_from_delta(
            orders, rdp_total, delta)

        seen = idx + 1
        if idx > 0 and seen % 1000 == 0:
            message = ('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
                       'at order = {:.2f}.')
            print(message.format(seen, answered[idx], eps_cum[idx], order_opt))
            sys.stdout.flush()

    return eps_cum, answered
def scatter_plot(votes, threshold, sigma1, sigma2, order):
    """Scatter-plots per-query RDP cost against the top vote count.

    Every query is kept with the probability that it passes the confidence
    check (always kept when threshold or sigma1 is None). For each kept query
    one point is drawn: x is the largest vote count of the row, y is the value
    of pate.rdp_gaussian at the given order.

    Args:
      votes: Iterable of vote rows, one per query.
      threshold: Threshold of the confidence check, or None to skip it.
      sigma1: Noise parameter of the confidence check, or None to skip it.
      sigma2: Noise parameter of the GNMax step.
      order: The RDP order shown on the y axis.
    """
    _, axes = setup_plot()
    check_enabled = threshold is not None and sigma1 is not None

    max_counts = []
    rdp_values = []
    for row in votes:
        pr_answered = 1.
        if check_enabled:
            pr_answered = math.exp(
                pate.compute_logpr_answered(threshold, sigma1, row))

        # Sample whether this query would have been answered.
        if not random.random() < pr_answered:
            continue

        logq = pate.compute_logq_gaussian(row, sigma2)
        max_counts.append(max(row))
        rdp_values.append(pate.rdp_gaussian(logq, sigma2, order))

    print('Selected {} queries.'.format(len(max_counts)))

    # Plot the data-independent curve:
    # data_ind = pate.rdp_data_independent_gaussian(sigma, order)
    # plt.plot([0, 5000], [data_ind, data_ind], color='tab:blue',
    #          linestyle='-', linewidth=2)

    axes.set_yscale('log')
    plt.xlim(xmin=0, xmax=5000)
    plt.ylim(ymin=1e-300, ymax=1)
    plt.yticks([1, 1e-100, 1e-200, 1e-300])
    plt.scatter(max_counts, rdp_values, s=1, alpha=0.5)
    plt.ylabel(r'RDP at $\alpha={}$'.format(order), fontsize=16)
    plt.xlabel(r'max count', fontsize=16)
    axes.tick_params(labelsize=14)
    plt.show()
def compute_expected_answered_per_bin(bin_num, votes, threshold, sigma1):
    """Computes expected number of answers per bin.

    Each instance is placed in a bin according to the share of its votes that
    went to the most popular class, max(v) / sum(v), and contributes its
    probability of being answered to that bin.

    Args:
      bin_num: Number of bins.
      votes: A matrix of votes, where each row contains votes in one instance.
      threshold: The threshold against which check is performed.
      sigma1: The std of the Gaussian noise with which check is performed.
        (Same as sigma_1 in Algorithms 1 and 2.)

    Returns:
      Expected number of queries answered per bin.
    """
    n = votes.shape[0]
    bin_answered = np.zeros(bin_num)

    # `range` rather than the Python-2-only `xrange`, matching the other
    # analysis functions.
    for i in range(n):
        v = votes[i]
        p = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))

        bin_idx = int(math.floor(max(v) * bin_num / sum(v)))
        # A unanimous row has max(v) == sum(v), which maps to bin_num; it
        # belongs to the last bin rather than past the end of the array.
        bin_idx = min(bin_idx, bin_num - 1)
        assert 0 <= bin_idx < bin_num
        bin_answered[bin_idx] += p

        if (i + 1) % 1000 == 0:
            print('example {}'.format(i + 1))
            sys.stdout.flush()

    return bin_answered
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
    """Computes the (data-dependent) RDP curve for Confident GNMax.

    Accumulates, over all queries, the expected RDP cost of the threshold
    check (step 1) plus the cost of GNMax (step 2) weighted by the probability
    that the query passes the check. Progress is printed every 1000 queries
    and after the last one.

    Args:
      votes: Array of votes, one row per query.
      baseline: Array indexed like votes; row i is subtracted from the votes
        before the threshold check. Not read when threshold is None.
      threshold: Threshold of the check, or None to always answer.
      sigma1: Noise parameter of the threshold check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta used to convert RDP to epsilon.
      orders: RDP orders at which the curve is evaluated. Looked up with
        np.searchsorted, so presumably sorted in ascending order.
      data_ind: If true, data-independent bounds are used for both steps.

    Returns:
      The optimal order reported by pate.compute_eps_from_delta for the
      cumulative RDP curve.
    """
    rdp_cum = np.zeros(len(orders))
    rdp_sqrd_cum = np.zeros(len(orders))
    answered = 0

    for i, v in enumerate(votes):
        if threshold is None:
            logq_step1 = 0  # No thresholding, always proceed to step 2.
            rdp_step1 = np.zeros(len(orders))
        else:
            logq_step1 = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if data_ind:
                rdp_step1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, orders)
            else:
                rdp_step1 = pate.compute_rdp_threshold(
                    logq_step1, sigma1, orders)

        if data_ind:
            rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

        q_step1 = np.exp(logq_step1)
        rdp = rdp_step1 + rdp_step2 * q_step1
        # The expression below evaluates
        #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
        rdp_sqrd = (rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
                    q_step1 * rdp_step2**2)
        rdp_sqrd_cum += rdp_sqrd
        rdp_cum += rdp
        answered += q_step1

        num_seen = i + 1
        if (num_seen % 1000 == 0) or (i == votes.shape[0] - 1):
            # Biased sample variance (no Bessel's correction) over the
            # num_seen queries accumulated so far. Dividing by i instead
            # would be off by one and divide by zero for a single query.
            rdp_var = rdp_sqrd_cum / num_seen - (rdp_cum / num_seen)**2
            # Rounding can push a tiny variance below zero; keep sqrt real.
            rdp_var = np.maximum(rdp_var, 0.)
            eps_total, order_opt = pate.compute_eps_from_delta(
                orders, rdp_cum, delta)
            order_opt_idx = np.searchsorted(orders, order_opt)
            # Std of the sum.
            eps_std = (num_seen * rdp_var[order_opt_idx])**.5
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    num_seen, answered, eps_total, eps_std, order_opt,
                    -math.log(delta) / (order_opt - 1)))
            sys.stdout.flush()

    _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
    return order_opt
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
    """Computes the (data-dependent) RDP curve for Confident GNMax.

    Accumulates, over all queries, the expected RDP cost of the threshold
    check (step 1) plus the cost of GNMax (step 2) weighted by the probability
    that the query passes the check. Progress is printed every 1000 queries
    and after the last one.

    Args:
      votes: Array of votes, one row per query.
      baseline: Array indexed like votes; row i is subtracted from the votes
        before the threshold check. Not read when threshold is None.
      threshold: Threshold of the check, or None to always answer.
      sigma1: Noise parameter of the threshold check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta used to convert RDP to epsilon.
      orders: RDP orders at which the curve is evaluated. Looked up with
        np.searchsorted, so presumably sorted in ascending order.
      data_ind: If true, data-independent bounds are used for both steps.

    Returns:
      The optimal order reported by pate.compute_eps_from_delta for the
      cumulative RDP curve.
    """
    rdp_cum = np.zeros(len(orders))
    rdp_sqrd_cum = np.zeros(len(orders))
    answered = 0

    for i, v in enumerate(votes):
        if threshold is None:
            logq_step1 = 0  # No thresholding, always proceed to step 2.
            rdp_step1 = np.zeros(len(orders))
        else:
            logq_step1 = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if data_ind:
                rdp_step1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, orders)
            else:
                rdp_step1 = pate.compute_rdp_threshold(
                    logq_step1, sigma1, orders)

        if data_ind:
            rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

        q_step1 = np.exp(logq_step1)
        rdp = rdp_step1 + rdp_step2 * q_step1
        # The expression below evaluates
        #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
        rdp_sqrd = (rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
                    q_step1 * rdp_step2**2)
        rdp_sqrd_cum += rdp_sqrd
        rdp_cum += rdp
        answered += q_step1

        num_seen = i + 1
        if (num_seen % 1000 == 0) or (i == votes.shape[0] - 1):
            # Biased sample variance (no Bessel's correction) over the
            # num_seen queries accumulated so far. Dividing by i instead
            # would be off by one and divide by zero for a single query.
            rdp_var = rdp_sqrd_cum / num_seen - (rdp_cum / num_seen)**2
            # Rounding can push a tiny variance below zero; keep sqrt real.
            rdp_var = np.maximum(rdp_var, 0.)
            eps_total, order_opt = pate.compute_eps_from_delta(
                orders, rdp_cum, delta)
            order_opt_idx = np.searchsorted(orders, order_opt)
            # Std of the sum.
            eps_std = (num_seen * rdp_var[order_opt_idx])**.5
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    num_seen, answered, eps_total, eps_std, order_opt,
                    -math.log(delta) / (order_opt - 1)))
            sys.stdout.flush()

    _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
    return order_opt
def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
                      target_answered):
    """Accumulates the RDP curve until enough queries are expected answered.

    Queries are processed in order. Each adds its GNMax RDP cost (from
    pate.rdp_gaussian), weighted by the probability of passing the threshold
    check, to the running curve. Processing stops as soon as the expected
    number of answered queries reaches target_answered.

    Args:
      votes: Iterable of vote rows, one per query. Each row is sorted in
        descending order before use.
      threshold: Threshold of the confidence check.
      sigma1: Noise parameter of the confidence check.
      sigma2: Noise parameter of the GNMax step.
      orders: RDP orders at which the curve is evaluated.
      target_answered: Expected number of answered queries to reach.

    Returns:
      Array with the cumulative RDP cost at each order.

    Raises:
      AssertionError: If all votes are consumed before target_answered is
        reached.
    """
    rdp_cum = np.zeros(len(orders))
    answered = 0

    for i, v in enumerate(votes):
        v = sorted(v, reverse=True)
        q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
        logq_step2 = pate.compute_logq_gaussian(v, sigma2)
        rdp = pate.rdp_gaussian(logq_step2, sigma2, orders)
        rdp_cum += q_step1 * rdp

        answered += q_step1
        if answered >= target_answered:
            # i is a zero-based index, so i + 1 queries have been processed.
            print('Processed {} queries to answer {}.'.format(
                i + 1, target_answered))
            return rdp_cum

    # Raised explicitly rather than via `assert False`, which is stripped
    # under `python -O` and would let the function silently return None.
    raise AssertionError(
        'Never reached {} answered queries.'.format(target_answered))
def run_analysis(votes, mechanism, noise_scale, params):
    """Computes data-dependent privacy.

    Args:
      votes: A matrix of votes, where each row contains votes in one instance.
      mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
      noise_scale: A mechanism privacy parameter.
      params: Other privacy parameters. Only read for 'gnmax_conf', which
        uses params['t'] and params['sigma1'].

    Returns:
      Four lists: cumulative privacy cost epsilon, how privacy budget is split,
      how many queries were answered, optimal order.

    Raises:
      ValueError: If mechanism is not a supported name. The check happens
        while processing the first row of votes.
    """

    def compute_partition(order, eps):
        """Splits eps into the shares contributed by each cost component."""
        order_idx = np.searchsorted(orders, order)
        delta_cost = -math.log(delta) / (order - 1)
        if mechanism == 'gnmax_conf':
            p = (rdp_select_cum[order_idx],
                 rdp_cum[order_idx] - rdp_select_cum[order_idx],
                 delta_cost)
        else:
            p = (rdp_cum[order_idx], delta_cost)
        return [x / eps for x in p]  # Ensures that sum(x) == 1

    # Short list of orders.
    # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
    #   np.logspace(np.log10(50), np.log10(1000), num=20))))

    # Long list of orders.
    orders = np.concatenate((np.arange(2, 100 + 1, .5),
                             np.logspace(np.log10(100), np.log10(500),
                                         num=100)))
    delta = 1e-8

    n = votes.shape[0]
    eps_total = np.zeros(n)
    partition = [None] * n
    order_opt = np.full(n, np.nan, dtype=float)
    answered = np.zeros(n, dtype=float)

    rdp_cum = np.zeros(len(orders))
    rdp_sqrd_cum = np.zeros(len(orders))
    rdp_select_cum = np.zeros(len(orders))
    answered_sum = 0

    for i in range(n):
        v = votes[i, ]
        if mechanism == 'lnmax':
            logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
            rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
            rdp_sqrd = rdp_query ** 2
            pr_answered = 1
        elif mechanism == 'gnmax':
            logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
            rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
            rdp_sqrd = rdp_query ** 2
            pr_answered = 1
        elif mechanism == 'gnmax_conf':
            logq_step1 = pate.compute_logpr_answered(
                params['t'], params['sigma1'], v)
            logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
            q_step1 = np.exp(logq_step1)
            # NOTE(review): math.log1p raises ValueError if q_step1 rounds
            # to exactly 1 -- confirm compute_logpr_answered cannot return 0.
            logq_step1_min = min(logq_step1, math.log1p(-q_step1))
            rdp_gnmax_step1 = pate.rdp_gaussian(
                logq_step1_min, 2 ** .5 * params['sigma1'], orders)
            rdp_gnmax_step2 = pate.rdp_gaussian(
                logq_step2, noise_scale, orders)
            rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
            # The expression below evaluates
            #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
            rdp_sqrd = (
                rdp_gnmax_step1 ** 2
                + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2
                + q_step1 * rdp_gnmax_step2 ** 2)
            rdp_select_cum += rdp_gnmax_step1
            pr_answered = q_step1
        else:
            raise ValueError(
                'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]')

        rdp_cum += rdp_query
        rdp_sqrd_cum += rdp_sqrd
        answered_sum += pr_answered

        answered[i] = answered_sum
        eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
            orders, rdp_cum, delta)
        partition[i] = compute_partition(order_opt[i], eps_total[i])

        if i > 0 and (i + 1) % 1000 == 0:
            num_seen = i + 1
            # Biased sample variance (no Bessel's correction) over the
            # num_seen queries accumulated so far; dividing by i would be
            # off by one.
            rdp_var = rdp_sqrd_cum / num_seen - (rdp_cum / num_seen) ** 2
            # Rounding can push a tiny variance below zero; keep sqrt real.
            rdp_var = np.maximum(rdp_var, 0.)
            order_opt_idx = np.searchsorted(orders, order_opt[i])
            # Std of the sum.
            eps_std = (num_seen * rdp_var[order_opt_idx]) ** .5
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    num_seen, answered_sum, eps_total[i], eps_std,
                    order_opt[i], -math.log(delta) / (order_opt[i] - 1)))
            sys.stdout.flush()

    return eps_total, partition, answered, order_opt
def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
    """Tracks the data-dependent privacy cost of Confident GNMax per query.

    For each query the cumulative RDP of the threshold check (step 1), of
    GNMax (step 2) and of releasing the smooth sensitivity is evaluated over
    a grid of orders, and the order chosen by pate.compute_eps_from_delta is
    recorded. After each query the grid is truncated at the chosen order.

    Args:
      votes: Matrix of votes, one row per query. The number of teachers is
        taken to be the sum of the first row.
      threshold: Threshold of the confidence check. If either threshold or
        sigma1 is None, the check is skipped and every query is answered.
      sigma1: Noise parameter of the confidence check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta used to convert RDP to epsilon.

    Returns:
      A tuple (eps_partitioned, answered, ss_std_opt, order_opt) of arrays
      with one entry per query: the Partition of the privacy cost at the
      chosen order, the cumulative expected number of answered queries, the
      product ss * sigma_ss at the chosen order, and the chosen order.
    """
    # Short list of orders.
    # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20))

    # Long list of orders.
    orders = np.concatenate((
        np.arange(20, 40, .2),
        np.arange(40, 75, .5),
        np.logspace(np.log10(75), np.log10(200), num=20),
    ))

    num_queries, num_classes = votes.shape[0], votes.shape[1]
    num_teachers = int(sum(votes[0]))
    use_threshold = threshold is not None and sigma1 is not None

    if use_threshold:
        step1_is_data_ind = pate.is_data_independent_always_opt_gaussian(
            num_teachers, num_classes, sigma1, orders)
    else:
        step1_is_data_ind = [True] * len(orders)
    step2_is_data_ind = pate.is_data_independent_always_opt_gaussian(
        num_teachers, num_classes, sigma2, orders)

    # Per-query outputs.
    eps_partitioned = np.full(num_queries, None, dtype=Partition)
    order_opt = np.full(num_queries, None, dtype=float)
    ss_std_opt = np.full(num_queries, None, dtype=float)
    answered = np.zeros(num_queries)

    # Running totals, one entry (or row) per order.
    rdp_step1_total = np.zeros(len(orders))
    rdp_step2_total = np.zeros(len(orders))
    ls_total = np.zeros((len(orders), num_teachers))
    answered_total = 0

    for idx in range(num_queries):
        row = votes[idx]

        if use_threshold:
            logq_step1 = pate.compute_logpr_answered(threshold, sigma1, row)
            rdp_step1_total += pate.compute_rdp_threshold(
                logq_step1, sigma1, orders)
        else:
            logq_step1 = 0.  # always answer

        pr_answered = np.exp(logq_step1)
        logq_step2 = pate.compute_logq_gaussian(row, sigma2)
        rdp_step2_total += pr_answered * pate.rdp_gaussian(
            logq_step2, sigma2, orders)
        answered_total += pr_answered

        rdp_ss = np.zeros(len(orders))
        ss_std = np.zeros(len(orders))
        for j, order in enumerate(orders):
            # Data-independent steps contribute no local sensitivity.
            if step1_is_data_ind[j]:
                ls_step1 = np.full(num_teachers, 0, dtype=float)
            else:
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    row, num_teachers, threshold, sigma1, order)

            if step2_is_data_ind[j]:
                ls_step2 = np.full(num_teachers, 0, dtype=float)
            else:
                ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                    row, num_teachers, sigma2, order)

            ls_total[j] += ls_step1 + pr_answered * ls_step2

            beta_ss = .49 / order
            ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j])
            sigma_ss = ((order * math.exp(2 * beta_ss)) / ss) ** (1 / 3)
            rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                beta_ss, sigma_ss, order)
            ss_std[j] = ss * sigma_ss

        rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss
        answered[idx] = answered_total
        _, order_opt[idx] = pate.compute_eps_from_delta(
            orders, rdp_total, delta)
        order_idx = np.searchsorted(orders, order_opt[idx])

        # Since optimal orders are always non-increasing, shrink orders array
        # and all cumulative arrays to speed up computation.
        if order_idx < len(orders):
            keep = order_idx + 1
            orders = orders[:keep]
            rdp_step1_total = rdp_step1_total[:keep]
            rdp_step2_total = rdp_step2_total[:keep]

        eps_partitioned[idx] = Partition(
            step1=rdp_step1_total[order_idx],
            step2=rdp_step2_total[order_idx],
            ss=rdp_ss[order_idx],
            delta=-math.log(delta) / (order_opt[idx] - 1))
        ss_std_opt[idx] = ss_std[order_idx]

        # The modulus of 1 is always satisfied, so this reports every query
        # after the first.
        if idx > 0 and (idx + 1) % 1 == 0:
            part = eps_partitioned[idx]
            message = (
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
                'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
                'step2 = {:.3f}, ss = {:.3f}')
            print(message.format(
                idx + 1, answered[idx], sum(part), ss_std_opt[idx],
                order_opt[idx], part.delta, part.step1, part.step2, part.ss))
            sys.stdout.flush()

    return eps_partitioned, answered, ss_std_opt, order_opt
def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
    """Tracks the data-dependent privacy cost of Confident GNMax per query.

    For each query the cumulative RDP of the threshold check (step 1), of
    GNMax (step 2) and of releasing the smooth sensitivity is evaluated over
    a grid of orders, and the order chosen by pate.compute_eps_from_delta is
    recorded. After each query the grid is truncated at the chosen order.

    Args:
      votes: Matrix of votes, one row per query. The number of teachers is
        taken to be the sum of the first row.
      threshold: Threshold of the confidence check. If either threshold or
        sigma1 is None, the check is skipped and every query is answered.
      sigma1: Noise parameter of the confidence check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta used to convert RDP to epsilon.

    Returns:
      A tuple (eps_partitioned, answered, ss_std_opt, order_opt) of arrays
      with one entry per query: the Partition of the privacy cost at the
      chosen order, the cumulative expected number of answered queries, the
      product ss * sigma_ss at the chosen order, and the chosen order.
    """
    # Short list of orders.
    # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20))

    # Long list of orders.
    orders = np.concatenate((
        np.arange(20, 40, .2),
        np.arange(40, 75, .5),
        np.logspace(np.log10(75), np.log10(200), num=20),
    ))

    num_queries, num_classes = votes.shape[0], votes.shape[1]
    num_teachers = int(sum(votes[0]))
    use_threshold = threshold is not None and sigma1 is not None

    if use_threshold:
        step1_is_data_ind = pate.is_data_independent_always_opt_gaussian(
            num_teachers, num_classes, sigma1, orders)
    else:
        step1_is_data_ind = [True] * len(orders)
    step2_is_data_ind = pate.is_data_independent_always_opt_gaussian(
        num_teachers, num_classes, sigma2, orders)

    # Per-query outputs.
    eps_partitioned = np.full(num_queries, None, dtype=Partition)
    order_opt = np.full(num_queries, None, dtype=float)
    ss_std_opt = np.full(num_queries, None, dtype=float)
    answered = np.zeros(num_queries)

    # Running totals, one entry (or row) per order.
    rdp_step1_total = np.zeros(len(orders))
    rdp_step2_total = np.zeros(len(orders))
    ls_total = np.zeros((len(orders), num_teachers))
    answered_total = 0

    for idx in range(num_queries):
        row = votes[idx]

        if use_threshold:
            logq_step1 = pate.compute_logpr_answered(threshold, sigma1, row)
            rdp_step1_total += pate.compute_rdp_threshold(
                logq_step1, sigma1, orders)
        else:
            logq_step1 = 0.  # always answer

        pr_answered = np.exp(logq_step1)
        logq_step2 = pate.compute_logq_gaussian(row, sigma2)
        rdp_step2_total += pr_answered * pate.rdp_gaussian(
            logq_step2, sigma2, orders)
        answered_total += pr_answered

        rdp_ss = np.zeros(len(orders))
        ss_std = np.zeros(len(orders))
        for j, order in enumerate(orders):
            # Data-independent steps contribute no local sensitivity.
            if step1_is_data_ind[j]:
                ls_step1 = np.full(num_teachers, 0, dtype=float)
            else:
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    row, num_teachers, threshold, sigma1, order)

            if step2_is_data_ind[j]:
                ls_step2 = np.full(num_teachers, 0, dtype=float)
            else:
                ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                    row, num_teachers, sigma2, order)

            ls_total[j] += ls_step1 + pr_answered * ls_step2

            beta_ss = .49 / order
            ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j])
            sigma_ss = ((order * math.exp(2 * beta_ss)) / ss) ** (1 / 3)
            rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                beta_ss, sigma_ss, order)
            ss_std[j] = ss * sigma_ss

        rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss
        answered[idx] = answered_total
        _, order_opt[idx] = pate.compute_eps_from_delta(
            orders, rdp_total, delta)
        order_idx = np.searchsorted(orders, order_opt[idx])

        # Since optimal orders are always non-increasing, shrink orders array
        # and all cumulative arrays to speed up computation.
        if order_idx < len(orders):
            keep = order_idx + 1
            orders = orders[:keep]
            rdp_step1_total = rdp_step1_total[:keep]
            rdp_step2_total = rdp_step2_total[:keep]

        eps_partitioned[idx] = Partition(
            step1=rdp_step1_total[order_idx],
            step2=rdp_step2_total[order_idx],
            ss=rdp_ss[order_idx],
            delta=-math.log(delta) / (order_opt[idx] - 1))
        ss_std_opt[idx] = ss_std[order_idx]

        # The modulus of 1 is always satisfied, so this reports every query
        # after the first.
        if idx > 0 and (idx + 1) % 1 == 0:
            part = eps_partitioned[idx]
            message = (
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
                'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
                'step2 = {:.3f}, ss = {:.3f}')
            print(message.format(
                idx + 1, answered[idx], sum(part), ss_std_opt[idx],
                order_opt[idx], part.delta, part.step1, part.step2, part.ss))
            sys.stdout.flush()

    return eps_partitioned, answered, ss_std_opt, order_opt
def _find_optimal_smooth_sensitivity_parameters(votes, baseline, num_teachers,
                                                threshold, sigma1, sigma2,
                                                delta, ind_step1, ind_step2,
                                                order):
    """Optimizes smooth sensitivity parameters by minimizing a cost function.

    The cost function is
          exact_eps + cost of GNSS + two stds of noise,
    which captures that upper bound of the confidence interval of the
    sanitized privacy budget.

    Since optimization is done with full view of sensitive data, the results
    cannot be released.

    Args:
      votes: Array of votes, one row per query. Must contain at least one
        row, since the returned values are only set while processing rows.
      baseline: Array indexed like votes; row i is subtracted from the votes
        before the threshold check. Not read when threshold is None.
      num_teachers: Length of the local sensitivity vectors.
      threshold: Threshold of the check, or None to always answer.
      sigma1: Noise parameter of the threshold check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta; only used for the reported epsilon.
      ind_step1: Whether to use the data-independent bound for step 1.
      ind_step2: Whether to use the data-independent bound for step 2.
      order: The RDP order at which costs are evaluated.

    Returns:
      A tuple (beta_opt, ss_opt, sigma_ss_opt) with the parameters found
      after the last query. If both steps are data-independent this is
      (0., 0., np.inf).
    """
    rdp_cum = 0
    answered_cum = 0
    ls_cum = 0

    # Define a plausible range for the beta values.
    betas = np.arange(.3 / order, .495 / order, .01 / order)
    cost_delta = math.log(1 / delta) / (order - 1)

    for i, v in enumerate(votes):
        if threshold is None:
            log_pr_answered = 0
            rdp1 = 0
            ls_step1 = np.zeros(num_teachers)
        else:
            log_pr_answered = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if ind_step1:
                # Apply data-independent bound for step 1 (thresholding).
                rdp1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, order)
                ls_step1 = np.zeros(num_teachers)
            else:
                rdp1 = pate.compute_rdp_threshold(
                    log_pr_answered, sigma1, order)
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    v - baseline[i, ], num_teachers, threshold, sigma1, order)

        pr_answered = math.exp(log_pr_answered)
        answered_cum += pr_answered

        if ind_step2:
            # Apply data-independent bound for step 2 (GNMax).
            rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
            ls_step2 = np.zeros(num_teachers)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
            # Compute smooth sensitivity.
            ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                v, num_teachers, sigma2, order)

        rdp_cum += rdp1 + pr_answered * rdp2
        ls_cum += ls_step1 + pr_answered * ls_step2  # Expected local sensitivity.

        # The optimal parameters depend only on the cumulative totals and
        # are used only when reporting and after the last query (which is
        # always a reporting iteration), so the search over beta is skipped
        # on all other iterations.
        if not (((i + 1) % 100 == 0) or (i == votes.shape[0] - 1)):
            continue

        if ind_step1 and ind_step2:
            # Data-independent bounds.
            beta_opt, ss_opt, sigma_ss_opt = 0., 0., np.inf
        else:
            # Data-dependent bounds.
            cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None

            for beta in betas:
                ss = pate_ss.compute_discounted_max(beta, ls_cum)

                # Solution to the minimization problem:
                #   min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
                # The float literal keeps the exponent a true cube root even
                # under Python 2 integer division.
                sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1. / 3)
                cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                    beta, sigma_ss, order)

                # Cost captures exact_eps + cost of releasing SS + two stds
                # of noise.
                cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
                if cost < cost_opt:
                    cost_opt, beta_opt, ss_opt, sigma_ss_opt = (
                        cost, beta, ss, sigma_ss)

        eps_before_ss = rdp_cum + cost_delta
        eps_with_ss = (
            eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                beta_opt, sigma_ss_opt, order))
        print(
            '{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
            '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'
            .format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
                    ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
        sys.stdout.flush()

    # Return optimal parameters for the last iteration.
    return beta_opt, ss_opt, sigma_ss_opt
def _find_optimal_smooth_sensitivity_parameters(
        votes, baseline, num_teachers, threshold, sigma1, sigma2, delta,
        ind_step1, ind_step2, order):
    """Optimizes smooth sensitivity parameters by minimizing a cost function.

    The cost function is
          exact_eps + cost of GNSS + two stds of noise,
    which captures that upper bound of the confidence interval of the
    sanitized privacy budget.

    Since optimization is done with full view of sensitive data, the results
    cannot be released.

    Args:
      votes: Array of votes, one row per query. Must contain at least one
        row, since the returned values are only set while processing rows.
      baseline: Array indexed like votes; row i is subtracted from the votes
        before the threshold check. Not read when threshold is None.
      num_teachers: Length of the local sensitivity vectors.
      threshold: Threshold of the check, or None to always answer.
      sigma1: Noise parameter of the threshold check.
      sigma2: Noise parameter of the GNMax step.
      delta: Target delta; only used for the reported epsilon.
      ind_step1: Whether to use the data-independent bound for step 1.
      ind_step2: Whether to use the data-independent bound for step 2.
      order: The RDP order at which costs are evaluated.

    Returns:
      A tuple (beta_opt, ss_opt, sigma_ss_opt) with the parameters found
      after the last query. If both steps are data-independent this is
      (0., 0., np.inf).
    """
    rdp_cum = 0
    answered_cum = 0
    ls_cum = 0

    # Define a plausible range for the beta values.
    betas = np.arange(.3 / order, .495 / order, .01 / order)
    cost_delta = math.log(1 / delta) / (order - 1)

    for i, v in enumerate(votes):
        if threshold is None:
            log_pr_answered = 0
            rdp1 = 0
            ls_step1 = np.zeros(num_teachers)
        else:
            log_pr_answered = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if ind_step1:
                # Apply data-independent bound for step 1 (thresholding).
                rdp1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, order)
                ls_step1 = np.zeros(num_teachers)
            else:
                rdp1 = pate.compute_rdp_threshold(
                    log_pr_answered, sigma1, order)
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    v - baseline[i, ], num_teachers, threshold, sigma1, order)

        pr_answered = math.exp(log_pr_answered)
        answered_cum += pr_answered

        if ind_step2:
            # Apply data-independent bound for step 2 (GNMax).
            rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
            ls_step2 = np.zeros(num_teachers)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
            # Compute smooth sensitivity.
            ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                v, num_teachers, sigma2, order)

        rdp_cum += rdp1 + pr_answered * rdp2
        ls_cum += ls_step1 + pr_answered * ls_step2  # Expected local sensitivity.

        # The optimal parameters depend only on the cumulative totals and
        # are used only when reporting and after the last query (which is
        # always a reporting iteration), so the search over beta is skipped
        # on all other iterations.
        if not (((i + 1) % 100 == 0) or (i == votes.shape[0] - 1)):
            continue

        if ind_step1 and ind_step2:
            # Data-independent bounds.
            beta_opt, ss_opt, sigma_ss_opt = 0., 0., np.inf
        else:
            # Data-dependent bounds.
            cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None

            for beta in betas:
                ss = pate_ss.compute_discounted_max(beta, ls_cum)

                # Solution to the minimization problem:
                #   min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
                # The float literal keeps the exponent a true cube root even
                # under Python 2 integer division.
                sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1. / 3)
                cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                    beta, sigma_ss, order)

                # Cost captures exact_eps + cost of releasing SS + two stds
                # of noise.
                cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
                if cost < cost_opt:
                    cost_opt, beta_opt, ss_opt, sigma_ss_opt = (
                        cost, beta, ss, sigma_ss)

        eps_before_ss = rdp_cum + cost_delta
        eps_with_ss = (
            eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                beta_opt, sigma_ss_opt, order))
        print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
              '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'.
              format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
                     ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
        sys.stdout.flush()

    # Return optimal parameters for the last iteration.
    return beta_opt, ss_opt, sigma_ss_opt