def SPA(data, l2_sens, eps):
    coeffs = dft(data)
    n = len(coeffs)
    lamb = math.sqrt(2) * l2_sens / eps
    priv_items = []
    # score each candidate k by the spectral energy left out plus the noise cost
    s = sum([abs(x)**2 for x in coeffs])
    d = 0
    for i in range(1, n + 1):
        d += abs(coeffs[i - 1])**2
        Fni = s - d
        U = math.sqrt(Fni) + float(i) * math.sqrt(n) / eps
        priv_items.append(PrivItem(-U, i))
    # choose the number of retained coefficients with the exponential mechanism
    item = ExpMechanism.basic(priv_items, util.old_div(1.0, lamb))
    k = item.id
    # perturb the magnitudes of the first k coefficients, zero out the rest
    g = random.gammavariate(util.old_div((k + 1), 2.0), util.old_div(1.0, lamb**2))
    for j in range(n):
        if j < k:
            (magn, phi) = cmath.polar(coeffs[j])
            coeffs[j] = cmath.rect(magn + random.normalvariate(0, math.sqrt(g)), phi)
        else:
            coeffs[j] = 0
    return [x.real for x in idft(coeffs)]
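# Illustration only: stripped of the noise, SPA's reconstruction step is a
# plain "keep the first k DFT coefficients, zero the rest, take real parts"
# truncation. A minimal sketch using numpy.fft, assuming the dft/idft
# helpers above behave like numpy's:
#
#   import numpy
#
#   def truncate_dft(data, k):
#       coeffs = numpy.fft.fft(data)
#       coeffs[k:] = 0                       # drop all but the first k
#       return numpy.fft.ifft(coeffs).real   # approximate reconstruction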
def hilbert(N):
    """
    Produce coordinates of an NxN Hilbert curve.

    @param N: the length of a side, assumed to be a power of 2 (>= 2)
    @returns: x and y, each as an array of integers representing coordinates
    of points along the Hilbert curve. Calling plot(x, y) will plot the
    Hilbert curve.

    From Wikipedia
    """
    assert 2**int(math.ceil(math.log(N, 2))) == N, \
        "N={0} is not a power of 2!".format(N)
    if N == 2:
        return numpy.array((0, 0, 1, 1)), numpy.array((0, 1, 1, 0))
    else:
        # build the four transformed copies of the (N/2)-curve and glue them
        x, y = hilbert(util.old_div(N, 2))
        xl = numpy.r_[y, x, util.old_div(N, 2) + x, N - 1 - y]
        yl = numpy.r_[x, util.old_div(N, 2) + y, util.old_div(N, 2) + y,
                      util.old_div(N, 2) - 1 - x]
        return xl, yl
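# Usage sketch: for N = 4 the recursion assembles four transformed copies of
# the N = 2 curve. The 16 returned points visit every cell of the 4x4 grid,
# each step moving to an adjacent cell:
#
#   x, y = hilbert(4)
#   # x = [0 1 1 0 0 0 1 1 2 2 3 3 3 2 2 3]
#   # y = [0 0 1 1 2 3 3 2 2 3 3 2 1 1 0 0]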
def inference(self):
    k = self.k
    # go bottom up to compute z[v]
    for (node, h) in self.postorder_iter(with_height=True):
        if node.isleaf():
            node.hbar = node.noisy
        else:
            alpha = k**(h - 1)
            a = ((k - 1) * alpha) * node.noisy
            total = reduce(lambda total, child: total + child.hbar,
                           node.children, 0)
            node.total_z_children = total
            b = (alpha - 1) * total
            node.hbar = util.old_div(float(a + b), (k * alpha - 1))
    # go top down to compute hbar[v]
    leaves = [None] * (k**(self.height - 1))
    for node in self.preorder_iter():
        if node == self.root:
            continue
        parent = node.parent
        sum_z = parent.total_z_children
        node.hbar += util.old_div((parent.hbar - sum_z), k)
        if node.isleaf():
            assert leaves[node.start] is None
            leaves[node.start] = node.hbar
    # check that all leaves are filled in
    assert reduce(lambda total, leaf: (leaf is None) + total, leaves, 0) == 0
    return leaves
def Run(self, Q, x, epsilon, seed):
    assert seed is not None, 'seed must be set'
    prng = numpy.random.RandomState(seed)
    assert len(x.shape) == 2, \
        '%s is defined for 2D data only' % self.__class__.__name__
    n, m = x.shape
    N = sum(sum(x))
    # compute the number of cells in the first level
    m1 = int(util.old_div(math.sqrt(util.old_div((N * epsilon), self.c)), 4) - 1) + 1
    if m1 < 10:
        m1 = 10
    M = m1**2
    grid = int(math.sqrt(n * m * 1.0 / M) - 1) + 1
    if grid <= 0:
        grid = 1
    num1 = int(util.old_div((n - 1), grid) + 1)
    num2 = int(util.old_div((m - 1), grid) + 1)
    cells, counts = AG_engine.GenerateCells(x, n, m, num1, num2, grid)
    y = AG_engine.CountPerturb(x, cells, counts, epsilon, self.alpha,
                               self.c2, prng)
    return y
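# Worked example of the first-level sizing above (illustrative values):
# with N = 160000 records, epsilon = 0.1 and c = 10,
# sqrt(N * epsilon / c) / 4 = sqrt(1600) / 4 = 10, so
# m1 = int(10 - 1) + 1 = 10 and the first level is a 10x10 grid (M = 100).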
def Run(self, Q, x, epsilon, seed):
    assert seed is not None, 'seed must be set'
    prng = numpy.random.RandomState(seed)
    Q1 = list(Q.query_list)  # leave Q as it is for future evaluation
    # here we assume the total count is known;
    # create a uniform estimate based on the total count
    hatx = numpy.empty_like(x)
    hatx.fill(util.old_div(x.sum(), float(x.size)))
    selepsilon = epsilon * self._ratio
    queryepsilon = epsilon - selepsilon
    # the maximum possible number of rounds is the size of Q
    assert self._nrounds <= len(Q1)
    # selected queries are removed from Q1 and added to the list of
    # estimated queries
    estQ = []
    nrounds = self._nrounds
    for c in range(nrounds):
        # get the index of the selected query
        i = self._exponentialMechanism(x, hatx, Q1,
                                       util.old_div(selepsilon, nrounds), prng)
        q = Q1[i]
        del Q1[i]  # no longer a candidate in the next round
        sens = q.sens()
        est = q.eval(x) + prng.laplace(0.0, sens * nrounds / queryepsilon, 1)
        estQ.append((q, est))
        hatx = self._update(hatx, q, est)  # update using only current q and estimate
    return hatx
def dpcube(epsilon, p, pp, rp, X2, left, right, prng):
    # compute the noisy counts over the interval [left, right]
    length = right - left + 1  # avoid shadowing the builtin len()
    bias = DPcube1D_engine.Compute(p, pp, left, right)
    cur = bias + util.old_div(1.0, epsilon)
    flag = False
    pos = left
    # split at k only if the reduction in approximation bias outweighs the
    # extra 1/epsilon noise cost of answering two intervals instead of one
    for k in range(left, right):
        bias1 = DPcube1D_engine.Compute(p, pp, left, k)
        bias2 = DPcube1D_engine.Compute(p, pp, k + 1, right)
        if bias1 + bias2 + util.old_div(2.0, epsilon) < cur:
            cur = bias1 + bias2 + util.old_div(2.0, epsilon)
            flag = True
            pos = k
    if flag:
        DPcube1D_engine.dpcube(epsilon, p, pp, rp, X2, left, pos, prng)
        DPcube1D_engine.dpcube(epsilon, p, pp, rp, X2, pos + 1, right, prng)
    else:
        ncnt = rp[right + 1] - rp[left] + prng.laplace(
            0.0, util.old_div(1.0, epsilon))
        navg = ncnt * 1.0 / length
        for i in range(left, right + 1):
            X2[i] = navg
def Run(self, Q, x, epsilon, seed):
    assert seed is not None, 'seed must be set'
    prng = numpy.random.RandomState(seed)
    assert len(x.shape) == 2, \
        '%s is defined for 2D data only' % self.__class__.__name__
    n, m = x.shape
    # assume the data scale is non-private information
    N = x.sum()
    # compute the number of cells
    M = util.old_div((N * epsilon), self.c)
    if self.gz == 0:
        grid = int(math.sqrt(n * m / M) - 1) + 1
    else:
        grid = int(self.gz)
    if grid < 1:
        grid = 1
    num1 = int(util.old_div((n - 1), grid) + 1)
    num2 = int(util.old_div((m - 1), grid) + 1)
    cells = UG_engine.GenerateCells(n, m, num1, num2, grid)
    y = UG_engine.CountPerturb(x, cells, epsilon, prng)
    return y
def CountPerturb(x, cells, counts, epsilon, alpha, c2, prng):
    # generate second-level grids and compute the noisy counts
    n, m = x.shape
    y = numpy.ndarray((n, m), 'float32')
    y.fill(0)
    # first-level counts, perturbed with budget alpha * epsilon
    noisycnt = counts + prng.laplace(0, util.old_div(1.0, (alpha * epsilon)),
                                     len(counts))
    # second-level grids and noisy counts, with postprocessing
    for k in range(len(cells)):
        x1, y1 = cells[k][0][0], cells[k][0][1]
        x2, y2 = cells[k][1][0], cells[k][1][1]
        nn = x2 - x1 + 1
        mm = y2 - y1 + 1
        # compute the second-level grid size
        if noisycnt[k] <= 0:
            m2 = 1
        else:
            m2 = int(math.sqrt(noisycnt[k] * (1 - alpha) * epsilon / c2) - 1) + 1
        M2 = m2**2
        newgrid = int(math.sqrt(nn * mm * 1.0 / M2) - 1) + 1
        if newgrid <= 0:
            newgrid = 1
        num1 = int(util.old_div((nn - 1), newgrid) + 1)
        num2 = int(util.old_div((mm - 1), newgrid) + 1)
        curX = numpy.ndarray((nn, mm), 'float32')
        for xx in range(x1, x2 + 1):
            curX[xx - x1] = x[xx][y1:y2 + 1]
        newcells, newcounts = AG_engine.GenerateCells(curX, nn, mm, num1,
                                                      num2, newgrid)
        ncounts = newcounts + prng.laplace(
            0, util.old_div(1.0, ((1 - alpha) * epsilon)), len(newcounts))
        # postprocessing: inverse-variance weighting of the first-level count
        # against the sum of the second-level counts
        newncnt = (alpha * m2)**2 / ((1 - alpha)**2 + (alpha * m2)**2) \
            * noisycnt[k] \
            + (1 - alpha)**2 / ((1 - alpha)**2 + (alpha * m2)**2) * sum(ncounts)
        for i in range(len(newcells)):
            # redistribute the adjustment uniformly over the subcells
            upcnt = ncounts[i] + 1.0 / len(newcells) * (newncnt - sum(ncounts))
            xx1, yy1 = x1 + newcells[i][0][0], y1 + newcells[i][0][1]
            xx2, yy2 = x1 + newcells[i][1][0], y1 + newcells[i][1][1]
            upavg = upcnt * 1.0 / ((xx2 - xx1 + 1) * (yy2 - yy1 + 1))
            for j in range(xx1, xx2 + 1):
                y[j][yy1:yy2 + 1] = upavg
    return y
def test_inference3(self):
    htree = HTree(2, [0] * 2)
    [parent, child1, child2] = htree.preorder_iter()
    parent.noisy = 0
    child1.noisy = 1
    child2.noisy = 0
    leaves = htree.inference()
    self.assertEqual(2, len(leaves))
    self.assertAlmostEqual(util.old_div(-1., 3), leaves[0])
    self.assertAlmostEqual(util.old_div(2., 3), leaves[1])
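# Where the expected values come from: with k = 2, the bottom-up pass gives
# hbar(root) = (a + b) / (k*alpha - 1) = (0 + 1) / 3 = 1/3 (own noisy count 0,
# children summing to 1), and the top-down pass adds (1/3 - 1) / 2 = -1/3 to
# each child, yielding leaf estimates of 2/3 and -1/3. These are consistent:
# they sum to the root's estimate of 1/3.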
def EFPA(data, l2_sens, eps, prng):
    data = list(map(float, data))
    coeffs = rfft(data)
    n = len(coeffs)
    eps_1 = eps_2 = eps * 0.5
    # total spectral energy; interior rfft coefficients count twice
    s = abs(coeffs[0])**2
    for i in range(1, n - 1):
        s += 2 * abs(coeffs[i])**2
    if len(data) % 2 == 0:
        s += abs(coeffs[n - 1])**2
    else:
        s += 2 * abs(coeffs[n - 1])**2
    # DC coefficient
    kept = 1
    d = abs(coeffs[0])**2
    score = sqrt(s - d) + sqrt(2) * kept * (util.old_div(l2_sens, eps_2))
    priv_items = [PrivItem(-score, [0, kept])]
    # other coefficients except the last one
    for i in range(1, n - 1):
        kept += 2
        d += 2 * abs(coeffs[i])**2
        score = sqrt(s - d) + sqrt(2) * kept * (util.old_div(l2_sens, eps_2))
        priv_items.append(PrivItem(-score, [i, kept]))
    # the last coefficient needs special treatment depending on whether
    # len(data) is even or odd
    if len(data) % 2 == 0:
        kept += 1
        d += abs(coeffs[n - 1])**2
    else:
        kept += 2
        d += 2 * abs(coeffs[n - 1])**2
    score = sqrt(s - d) + sqrt(2) * kept * (util.old_div(l2_sens, eps_2))
    # the maximal number of kept coefficients is bounded by the data length
    priv_items.append(PrivItem(-score, [n, kept]))
    item = ExpMechanism.run(priv_items, eps_1, l2_sens)
    lamb = sqrt(item.id[1]) * l2_sens / eps_2
    k = item.id[0] + 1
    # keep the first k coefficients (with noisy magnitudes), zero the rest
    for j in range(n):
        if j < k:
            (magn, phi) = cmath.polar(coeffs[j])
            coeffs[j] = cmath.rect(magn + prng.laplace(0, lamb), phi)
        else:
            coeffs[j] = 0
    return [x.real for x in irfft(coeffs, len(data))]
def _dewave(t, m):
    y = numpy.array(t)
    n = 2
    half_n = 1
    # invert the Haar-style transform one level at a time, coarsest first
    for c in range(m):
        y[:n:2], y[1:n:2] = \
            util.old_div((y[:half_n] + y[half_n:n]), 2.0), \
            util.old_div((y[:half_n] - y[half_n:n]), 2.0)
        n = n * 2
        half_n = half_n * 2
    return y
def L1partition_approx(x, epsilon, ratio=0.5, gethist=False, seed=None):
    """Compute the noisy L1 histogram using interval buckets of size 2^k.

    Args:
        x - numpy int array. The input data vector.
        epsilon - double. Total privacy budget.
        ratio - double in (0, 1). Use ratio*epsilon for the partition
                computation and (1-ratio)*epsilon for querying the count
                in each partition.
        gethist - boolean. If set to True, return the partition directly
                  (the privacy budget used is still ratio*epsilon).

    Returns:
        if gethist == False, an estimated data vector; otherwise, the
        partition.
    """
    assert seed is not None, "seed must be set"
    prng = numpy.random.RandomState(seed)
    n = len(x)
    # check that the input vector x is of appropriate type
    assert (x.dtype == numpy.dtype(int) or x.dtype == numpy.dtype("int32")), \
        "Input vector must be int! %s given" % x.dtype
    y = x.astype('int32')  # numpy type int32 is not JSON serializable
    check = (x == y)
    assert check.sum() == len(check), "Casting error from int to int32"
    x = y
    hist = cutil.L1partition_approx(n + 1, x, epsilon, ratio,
                                    prng.randint(500000))
    hatx = numpy.zeros(n)
    rb = n
    if gethist:
        bucks = []
        for lb in hist[1:]:
            bucks.insert(0, [lb, rb - 1])
            rb = lb
            if lb == 0:
                break
        return bucks
    else:
        # answer each bucket with Laplace noise, then spread it uniformly
        for lb in hist[1:]:
            hatx[lb:rb] = util.old_div(
                max(0,
                    sum(x[lb:rb]) + prng.laplace(
                        0, util.old_div(1.0, (epsilon * (1 - ratio))), 1)),
                float(rb - lb))
            rb = lb
            if lb == 0:
                break
        return hatx
def _dewave(y, m):
    """Compute the original dataset from a set of wavelet parameters y
    with size 2^m.
    """
    x = numpy.array(y)
    n = 2
    half_n = 1
    for c in range(m):
        x[:n:2], x[1:n:2] = \
            util.old_div((x[:half_n] + x[half_n:n]), 2.0), \
            util.old_div((x[:half_n] - x[half_n:n]), 2.0)
        n *= 2
        half_n *= 2
    return x
def Run(self, QtQ, x, epsilon, seed):
    """
    QtQ - given the workload Q in matrix form, QtQ is the product of the
    transpose of Q with Q.
    """
    assert seed is not None, 'seed must be set'
    prng = numpy.random.RandomState(seed)
    x = numpy.array(x)
    assert len(x.shape) == 1, \
        '%s is defined for 1D data only' % self.__class__.__name__
    n = len(x)
    err, inv, dist, query = self._GreedyHierByLv(QtQ, n, 0, withRoot=False)
    # assemble the measurement matrix and the exact interval answers
    qmat = []
    y2 = []
    for c in range(len(dist)):
        if dist[c] > 0:
            lb, rb = query[c]
            currow = numpy.zeros(n)
            currow[lb:rb + 1] = dist[c]
            qmat.append(currow)
            y2.append(sum(x[lb:(rb + 1)]) * dist[c])
    qmat = numpy.array(qmat)
    # convert to an array before adding noise: list += ndarray would
    # extend the list instead of adding elementwise
    y2 = numpy.array(y2)
    y2 += prng.laplace(0.0, util.old_div(1.0, epsilon), len(y2))
    return numpy.dot(inv, numpy.dot(qmat.T, y2))
def split(self, clusters):
    err = fsum([x.error for x in clusters])
    priv_items = [PrivItem(-err, [0, 0])]
    # navigate to the first non-ready cluster
    for cluster in clusters:
        if not cluster.ready:
            break
    # calling the C implementation from cutils
    split_errors = clustersplit(cluster.noisy_counts())
    for i in range(len(split_errors)):
        priv_items.append(
            PrivItem(-err + cluster.error - split_errors[i][0] - 2 * cluster.b,
                     [cluster, i + 1]))
    # item = max(priv_items, key=lambda x: x.q)  # non-private alternative
    item = ExpMechanism.run(priv_items,
                            util.old_div((self.eps_cluster * 0.5), self.max_depth),
                            2 * self.sensitivity)
    old_cluster = cluster
    cluster = item.id[0]
    if cluster == 0:
        # the "no split" option won: mark the cluster as final
        old_cluster.ready = True
        return False
    else:
        clusters.remove(cluster)
        children = cluster.split(item.id[1])
        for c in children:
            if len(c) == 1 or c.level == self.max_depth:
                c.ready = True
        clusters.extend(children)
        return True
def calculate_error(prefix, diff, norm_factor):
    """
    Error calculations are implemented once, here.
    They are performed on a vector of query differences; 'diff' should be a
    vector, not a matrix, or the norms will differ.
    For L1 and L2, per-query error is reported.
    """
    assert len(diff) == diff.size, 'diff should be a vector'
    d = {}
    d[prefix + '.Linf'] = util.old_div(la.norm(diff, np.inf), norm_factor)
    d[prefix + '.L1'] = util.old_div(
        util.old_div(la.norm(diff, 1), float(diff.size)), norm_factor)
    d[prefix + '.L2'] = util.old_div(
        util.old_div(la.norm(diff), float(diff.size)), norm_factor)
    return d
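# Minimal sketch of the three metrics (assumes the np/la aliases used above):
#
#   import numpy as np
#   import numpy.linalg as la
#
#   diff = np.array([1.0, -2.0, 2.0])
#   la.norm(diff, np.inf)         # Linf: 2.0, the worst single query
#   la.norm(diff, 1) / diff.size  # L1:   5/3, mean absolute error per query
#   la.norm(diff) / diff.size     # L2:   1.0, Euclidean error scaled by size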
def GetsynData(x, gz, epsilon, prng):
    l = len(x)
    y = numpy.zeros(l)
    p = util.old_div(l, gz)
    # noisy count per full-size group, spread uniformly within the group
    for i in range(p):
        nc = sum(x[i * gz:(i + 1) * gz])
        nc = nc + prng.laplace(0.0, util.old_div(1.0, epsilon))
        for j in range(gz):
            y[i * gz + j] = nc * 1.0 / gz
    # handle the trailing partial group, if any
    if l % gz != 0:
        nc = sum(x[p * gz:])
        nc = nc + prng.laplace(0.0, util.old_div(1.0, epsilon))
        for j in range(l - p * gz):
            y[p * gz + j] = nc * 1.0 / (l - p * gz)
    return y
def setUp(self):
    n = 1024
    scale = 1E5
    self.hist = numpy.array(list(range(n)))
    self.d = dataset.Dataset(self.hist, None)
    self.dist = numpy.random.exponential(1, n)
    self.dist = util.old_div(self.dist, float(self.dist.sum()))
    self.ds = dataset.DatasetSampled(self.dist, scale, None, 1001)
def build_tree(x, epsilon, prng, b=2):
    # the tree code requires len(x) to be a power of b;
    # if it is not, pad x with 0s and remove them at the end
    n = len(x)
    target_n = b**int(math.ceil(math.log(n, b)))
    if n < target_n:
        x = np.append(x, [0] * (target_n - n))
    H = h_tree.HTree(b, x)
    # add noise: uniform budget allocation across the levels
    epsilon = util.old_div(float(epsilon), H.height)
    for node in H.postorder_iter():
        node.noisy = node.count + prng.laplace(0, util.old_div(1, epsilon))
    est_x = H.inference()
    return est_x[:n]  # truncate any padded zeros
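# Padding sketch: len(x) is rounded up to the next power of b. For example,
# with b = 2 and n = 5,
#
#   target_n = 2**int(math.ceil(math.log(5, 2)))   # = 8
#
# so three zeros are appended before building the tree, and the final
# est_x[:5] strips their estimates off again.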
def __init__(self, nickname, sample_to_scale, reduce_to_dom_shape=None, seed=None):
    self.init_params = util.init_params_from_locals(locals())
    self.fname = nickname
    assert nickname in filenameDict, \
        'Filename parameter not recognized: %s' % nickname
    hist = load(filenameDict[self.fname])
    dist = util.old_div(hist, float(hist.sum()))
    super(DatasetSampledFromFile, self).__init__(dist, sample_to_scale,
                                                 reduce_to_dom_shape, seed)
def _rebuild(partition, counts, n):
    """Rebuild an estimated dataset using uniform expansion."""
    estx = numpy.zeros(n)
    n2 = len(counts)
    for c in range(n2):
        lb, rb = partition[c]
        estx[lb:(rb + 1)] = util.old_div(counts[c], float(rb - lb + 1))
    return estx
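# Example with hypothetical inputs: the partition [(0, 1), (2, 4)] with
# counts [4.0, 9.0] spreads each count evenly over its bucket:
#
#   _rebuild([(0, 1), (2, 4)], [4.0, 9.0], 5)
#   # -> array([2., 2., 3., 3., 3.])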
def CountPerturb(x, cells, epsilon, prng):
    # perturb the count of each generated grid cell
    n, m = x.shape
    y = numpy.ndarray((n, m), 'float32')
    y.fill(0)
    for cell in cells:
        x1, y1 = int(cell[0][0]), int(cell[0][1])
        x2, y2 = int(cell[1][0]), int(cell[1][1])
        cnt = 0
        for i in range(x1, x2 + 1):
            cnt = cnt + sum(x[i][y1:y2 + 1])
        # noisy cell count, spread uniformly over the cell's area
        navg = util.old_div(prng.laplace(cnt, util.old_div(1.0, epsilon)),
                            ((x2 - x1 + 1) * (y2 - y1 + 1)))
        for i in range(x1, x2 + 1):
            y[i][y1:y2 + 1] = navg
    return y
def g(*idx):
    """
    This function will receive an index tuple from numpy.fromfunction.
    Its behavior depends on grid_shape: take (i, j) and divide by grid_shape
    (in each dimension). That becomes an identifier of the block; a unique
    integer is then assigned to it using pairing.
    """
    x = numpy.array(idx)
    y = numpy.array(grid_shape)
    return general_pairing(util.old_div(x, y))  # broadcasting integer division
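# Self-contained sketch of the same idea. general_pairing is assumed to be
# some injective map from index tuples to integers; a Cantor pairing stands
# in for it here, and the 2x2 block size is a hypothetical choice:

def _cantor(i, j):
    # injective pairing of two non-negative integers
    return (i + j) * (i + j + 1) // 2 + j

def _block_id(i, j, grid_shape=(2, 2)):
    # map cell (i, j) to its block index, then to a unique block label
    return _cantor(i // grid_shape[0], j // grid_shape[1])

# numpy.fromfunction(_block_id, (4, 4), dtype=int) labels the 4x4 domain
# with four distinct ids, one per 2x2 block: 0, 2, 1, 4.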
def Run(self, Q, x, epsilon, seed=None):
    # rewritten to support nd-array input x
    assert seed is not None, 'seed must be set'
    prng = numpy.random.RandomState(seed)
    m = x.sum() + prng.laplace(0.0, util.old_div(1.0, epsilon), 1)
    # spread the noisy total uniformly over the domain
    return numpy.ones_like(x, dtype=numpy.float32) * m / x.size  # assuming m is known
def _wave(t, m):
    y = numpy.array(t)
    n = len(t)
    # one level per iteration: pairwise sums in the front half,
    # pairwise differences in the back half (unnormalized Haar)
    for c in range(m):
        y[:n] = numpy.hstack(
            [y[:n][0::2] + y[:n][1::2], y[:n][0::2] - y[:n][1::2]])
        n = util.old_div(n, 2)
    return y
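# Round-trip sketch: _wave is an unnormalized Haar-style transform and
# _dewave (above) inverts it, so for a float vector of length 2**m the
# composition is the identity. Floats matter here: on an integer array the
# in-place /2.0 assignments in _dewave would silently truncate.
#
#   x = numpy.array([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0])
#   numpy.allclose(_dewave(_wave(x, 3), 3), x)   # True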
def quantile(self, p):
    count = self.count()
    s = 0
    # scan until the cumulative fraction of items first exceeds p
    for i in range(len(self)):
        s += self.bins[i]
        if count > 0 and util.old_div(float(s), count) > p:
            return i - 1
    return 0
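# Standalone sketch of the same scan, on plain lists (hypothetical helper):

def _quantile_of(bins, p):
    count = sum(bins)
    s = 0
    for i, b in enumerate(bins):
        s += b
        if count > 0 and float(s) / count > p:
            return i - 1
    return 0

# _quantile_of([1, 3, 4, 2], 0.5) -> 1: the cumulative fraction
# (1 + 3 + 4) / 10 = 0.8 first exceeds 0.5 at bin 2, so 2 - 1 = 1 is returned.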
def setUp(self):
    n = 1024
    self.hist = numpy.array(list(range(n)))
    self.d = dataset.Dataset(self.hist, None)
    self.dist = numpy.random.exponential(1, n)
    self.dist = util.old_div(self.dist, float(self.dist.sum()))
    self.epsilon = 0.1
    self.w1 = workload.Identity.oneD(1024, weight=1.0)
    self.w2 = workload.Prefix1D(1024)
    self.eng = identity.identity_engine()
def _update(hatx, q, est):
    """Basic multiplicative weights update: one single query, one round."""
    total = hatx.sum()
    # difference between the query answer on the current estimate
    # and the observed (noisy) answer
    error = est - q.eval(hatx)
    q1 = q.asArray(hatx.shape)  # transform a query object into an nd-array
    hatx = hatx * numpy.exp(q1 * error / (2.0 * total))
    hatx *= util.old_div(total, hatx.sum())  # renormalize to the old total
    return hatx
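# One toy multiplicative-weights round, with the query given directly as an
# indicator array (hypothetical values):
#
#   import numpy
#   hatx = numpy.array([2.5, 2.5, 2.5, 2.5])    # uniform start, total = 10
#   q1 = numpy.array([1.0, 1.0, 0.0, 0.0])      # "sum of the first two cells"
#   error = 7.0 - (q1 * hatx).sum()             # observed 7, current 5
#   hatx = hatx * numpy.exp(q1 * error / (2.0 * 10.0))
#   hatx *= 10.0 / hatx.sum()
#   # cells covered by the query grew toward the observed answer,
#   # the others shrank, and the total stayed 10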
def Run(self, Q, x, epsilon, seed): assert seed is not None, 'seed must be set' prng = numpy.random.RandomState(seed) assert len( x.shape ) == 1, '%s is defined for 1D data only' % self.__class__.__name__ n = len(x) if n <= 16: # don't convert to wavelet parameters for small domains return x + prng.laplace(0.0, util.old_div(1.0, epsilon), len(x)) else: m = int(math.ceil(math.log(n, 2))) x1 = numpy.zeros(2**m) x1[:n] = x y1 = privelet_engine._wave(x1, m) + \ prng.laplace(0.0, util.old_div((m+1.0), epsilon), len(x1)) return privelet_engine._dewave(y1, m)[:n]
def WeightAvg(Node, epsilon, toth):
    '''First postprocessing: weighted averaging (see section 3.3).'''
    if Node.isleaf():
        return
    for ch in Node.children:
        WeightAvg(ch, epsilon, toth)
    if Node.count is not None:
        # geometric budget allocation across the levels of the tree
        eps1 = 2**((toth - Node.height) * 1.0 / 3) * epsilon * \
            (2**(util.old_div(1.0, 3)) - 1) / (2**((toth + 1) * 1.0 / 3) - 1)
        eps2 = 2**((toth - Node.height + 1) * 1.0 / 3) * epsilon * \
            (2**(util.old_div(1.0, 3)) - 1) / (2**((toth + 1) * 1.0 / 3) - 1)
        alpha = 4 * eps1**2 / (4 * eps1**2 + eps2**2)
        tot = 0
        for x in Node.children:
            tot = tot + x.count
        Node.count = alpha * Node.count + (1 - alpha) * tot
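# Why alpha = 4*eps1**2 / (4*eps1**2 + eps2**2): with Laplace noise the
# node's own count has variance 2/eps1**2, and the sum of its children's
# counts has variance 4 * (2/eps2**2) assuming the fanout-4 (quadtree)
# structure this engine appears to build; inverse-variance weighting of the
# two estimates then gives exactly this alpha.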