def privbayes_inference(domain, measurements, total):
    """ Generate synthetic data by sampling from the PrivBayes model,
    one column at a time in measurement order. """
    synthetic = pd.DataFrame()

    # the first measurement is a (noisy) 1-way marginal; sample its column directly
    _, y, _, proj = measurements[0]
    y = np.maximum(y, 0)
    y /= y.sum()
    col = proj[0]
    synthetic[col] = np.random.choice(domain[col], total, True, y)

    for _, y, _, proj in measurements[1:]:
        # build the conditional probability table P(col | dep)
        col, dep = proj[0], proj[1:]
        y = np.maximum(y, 0)
        dom = domain.project(proj)
        cpt = Factor(dom, y.reshape(dom.shape))
        marg = cpt.project(dep)
        cpt /= marg
        # move the axis for col to the end, so cpt2[v] is the distribution of col given dep = v
        cpt2 = np.moveaxis(cpt.project(proj).values, 0, -1)

        # sample the current column, conditioned on the parent columns
        synthetic[col] = 0
        rng = itertools.product(*[range(domain[a]) for a in dep])
        for v in rng:
            idx = (synthetic.loc[:, list(dep)].values == np.array(v)).all(axis=1)
            p = cpt2[v].flatten()
            if p.sum() == 0:
                # no mass on this parent configuration; fall back to uniform
                p = np.ones(p.size) / p.size
            n = domain[col]
            N = idx.sum()
            if N > 0:
                synthetic.loc[idx, col] = np.random.choice(n, N, True, p)

    return Dataset(synthetic, domain)
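# A minimal, hypothetical invocation of privbayes_inference on a toy
# two-attribute domain, assuming mbi's Domain. The measurement tuples follow
# the (Q, y, noise, proj) format unpacked above, with placeholder noisy
# marginals rather than real PrivBayes measurement output.
#
#   import numpy as np
#   from mbi import Domain
#
#   toy = Domain(['A', 'B'], [2, 3])
#   yA = np.array([60.0, 40.0])                    # fake noisy 1-way marginal on A
#   yBA = np.array([10., 20., 5., 30., 25., 10.])  # fake noisy 2-way marginal on (B, A)
#   measurements = [
#       (None, yA, 1.0, ('A',)),       # first column is sampled directly
#       (None, yBA, 1.0, ('B', 'A')),  # then B is sampled from P(B | A)
#   ]
#   synth = privbayes_inference(toy, measurements, total=100)
#   print(synth.df.head())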
def marginal_loss(marginals, workload, cache):
    """ Compute the loss and gradient of the log-likelihood with respect to
    the clique marginals, for a workload of (projection, W) pairs. """
    # answer each workload query using a supporting clique marginal
    answers = []
    for proj, W in workload:
        for cl in marginals:
            if set(proj) <= set(cl):
                mu = marginals[cl].project(proj)
                x = mu.values.flatten()
                answers.append(W.dot(x))
                break
    # every marginal sums to the same total, so the last projection suffices
    total = x.sum()
    answers = np.concatenate(answers) / total

    gradient = grad(log_likelihood, argnum=0)
    loss = log_likelihood(answers, cache)
    danswers = gradient(answers, cache)

    # backpropagate the gradient from the query answers to the clique marginals
    i = 0
    gradients = {cl: Factor.zeros(marginals[cl].domain) for cl in marginals}
    for proj, W in workload:
        for cl in marginals:
            if set(proj) <= set(cl):
                m = W.shape[0]
                dmu = W.T.dot(danswers[i:i + m]) / total
                dom = gradients[cl].domain.project(proj)
                gradients[cl] += Factor(dom, dmu)
                i += m
                break

    return loss, graphical_model.CliqueVector(gradients)
def DualQuery(data, workload, eps=1.0, delta=0.001, seed=0):
    prng = np.random.RandomState(seed)
    total = data.df.shape[0]
    domain = data.domain
    answers = answer_workload(workload, data) / total

    nu = 2.0
    s = 50
    # choose the largest T whose advanced-composition privacy cost
    # eps0 * (sqrt(2*k*log(1/delta)) + k*(exp(eps0) - 1)), with
    # eps0 = 2*nu*(T-1)/total and k = s*(T-1), stays below eps
    T = 2
    while 2 * nu * (T - 1) / total * (
            np.sqrt(2 * s * (T - 1) * np.log(1.0 / delta))
            + s * (T - 1) * (np.exp(2 * nu * (T - 1) / total) - 1)) < eps:
        T = T + 1
    T = T - 1

    Qsize = sum(W.shape[0] for _, W in workload)
    Q = np.ones(Qsize) / Qsize  # multiplicative-weights distribution over queries
    cache = []
    lookup = [(cl, W, i) for cl, W in workload for i in range(W.shape[0])]

    results = []
    for t in range(T):
        # sample s queries from the current distribution over workload rows
        idx = prng.choice(Qsize, s, True, Q)
        queries = []
        for i in idx:
            cl, W, e = lookup[i]
            dom = domain.project(cl)
            n = W.shape[0]
            z = np.zeros(n)
            z[e] = 1.0
            q = W.T.dot(z)
            queries.append(Factor(dom, -q))
        # best-respond with the single record maximizing the sampled queries
        best = max_sum_ve(queries, data.domain)
        curr = answer_workload(workload, best)
        # multiplicative-weights update on the query distribution
        Q *= np.exp(-nu * (answers - curr))
        Q /= Q.sum()
        cache.append((idx, curr))
        results.append(best.df)

    synthetic = Dataset(pd.concat(results), data.domain)

    print('Iterations', T)
    print('Privacy level', nu * T * (T - 1) * s / total)
    eps = 2 * nu * (T - 1) / total * (
        np.sqrt(2 * s * (T - 1) * np.log(1.0 / delta))
        + s * (T - 1) * (np.exp(2 * nu * (T - 1) / total) - 1))
    print('Approx privacy level', eps, delta)
    return synthetic, cache
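# A minimal, hypothetical invocation of DualQuery on random toy data. It
# assumes the mbi Dataset/Domain API and that answer_workload (not shown
# here) and max_sum_ve below are in scope. The workload pairs a clique with
# an identity query matrix over its marginal.
#
#   import numpy as np
#   import pandas as pd
#   from mbi import Dataset, Domain
#
#   dom = Domain(['A', 'B'], [2, 3])
#   df = pd.DataFrame({'A': np.random.randint(0, 2, 1000),
#                      'B': np.random.randint(0, 3, 1000)})
#   data = Dataset(df, dom)
#   workload = [(('A', 'B'), np.eye(6))]
#   synth, cache = DualQuery(data, workload, eps=1.0, delta=1e-3)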
def max_sum_ve(factors, domain=None, elim=None):
    """ Run max-sum (max-product in log space) variable elimination on the
    given factors, and return the most likely assignment as a one-row
    Dataset over the domain.
    """
    # step 0: choose an elimination order
    if domain is None:
        domain = reduce(Domain.merge, [F.domain for F in factors])
    if elim is None:
        cliques = [F.domain.attrs for F in factors]
        elim = graphical_model.greedy_order(domain, cliques, domain.attrs)

    # step 1: variable elimination
    k = len(factors)
    phi = dict(zip(range(k), factors))
    psi = {}
    for z in elim:
        # collect all factors that mention z, combine them, and max z out
        keys = [i for i in phi if z in phi[i].domain]
        phi2 = [phi.pop(i) for i in keys]
        psi[z] = sum(phi2, Factor.ones(domain.project(z)))
        phi[k] = psi[z].max([z])
        k += 1

    # step 2: traceback to recover the MAP assignment
    x = {}
    for z in reversed(elim):
        x[z] = psi[z].condition(x).values.argmax()

    # step 3: convert to a Dataset object
    df = pd.DataFrame(x, index=[0])
    return Dataset(df, domain)
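# A minimal sketch of max_sum_ve on two hand-built log-space factors,
# assuming mbi's Factor and Domain. The result is a one-row Dataset holding
# the most likely joint assignment.
#
#   import numpy as np
#   from mbi import Domain, Factor
#
#   dom = Domain(['A', 'B'], [2, 2])
#   fAB = Factor(dom, np.log(np.array([[0.1, 0.4], [0.3, 0.2]])))
#   fA = Factor(dom.project(['A']), np.log(np.array([0.7, 0.3])))
#   best = max_sum_ve([fAB, fA], domain=dom)
#   print(best.df)  # expect A=0, B=1 for these potentials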
def fit(self, data):
    from mbi import Factor
    assert data.domain.contains(self.domain), \
        'model domain not compatible with data domain'
    marginals = {}
    for cl in self.cliques:
        x = data.project(cl).datavector()
        dom = self.domain.project(cl)
        marginals[cl] = Factor(dom, x)
    self.potentials = self.mle(marginals)
def multWeightsFast(self, measurements, total):
    domain = self.domain
    groups, projections = _cluster(measurements)
    factors = []
    for group, proj in zip(groups, projections):
        dom = self.domain.project(proj)
        fact = Factor.uniform(dom)
        for _ in range(self.iters):
            # accumulate the residual error of every measurement in the group
            update = Factor.zeros(dom)
            for Q, y, noise_scale, p in group:
                dom2 = dom.project(p)
                hatx = fact.project(p).values.flatten() * total
                error = y - Q.dot(hatx)
                update += Factor(dom2, Q.T.dot(error).reshape(dom2.shape))
            # multiplicative weights update, then renormalize
            fact *= np.exp(update / (2 * total))
            fact /= fact.sum()
        factors.append(fact)
    self.model = ProductDist(factors, self.domain, total)
def __init__(self, factors, domain, total):
    """
    :param factors: a list of contingency tables defined over disjoint subsets of attributes
    :param domain: the domain object
    :param total: the known or estimated total
    """
    # copy, so filling in uniform factors doesn't mutate the caller's list
    factors = list(factors)
    for a in domain:
        if not any(a in f.domain for f in factors):
            sub = domain.project([a])
            x = np.ones(domain[a]) / domain[a]
            factors.append(Factor(sub, x))
    self.factors = factors
    self.domain = domain
    self.total = total
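# A minimal sketch, assuming mbi's Domain and Factor: any attribute not
# covered by a supplied factor ('C' here) gets a uniform marginal from the
# constructor, so the product distribution covers the whole domain.
#
#   import numpy as np
#   from mbi import Domain, Factor
#
#   dom = Domain(['A', 'B', 'C'], [2, 3, 4])
#   fAB = Factor(dom.project(['A', 'B']), np.ones((2, 3)) / 6)
#   dist = ProductDist([fAB], dom, total=1000)
#   print(len(dist.factors))  # 2: the supplied factor plus a uniform factor over C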
def postprocess(self):
    domain = self.domain
    engine = FactoredInference(domain,
                               structural_zeros=None,
                               iters=500,
                               log=True,
                               warm_start=True,
                               elim_order=self.elimination_order)
    self.engine = engine
    cb = mbi.callbacks.Logger(engine)

    if self.warmup:
        engine._setup(self.measurements, None)
        # initialize the model potentials from the round-1 (1-way) measurements
        oneway = {}
        for i in range(len(self.round1)):
            p = self.round1[i]
            y = self.measurements[i][1]
            y = np.maximum(y, 1)
            y /= y.sum()
            oneway[p] = Factor(self.domain.project(p), y)

        marginals = {}
        for cl in engine.model.cliques:
            marginals[cl] = reduce(lambda x, y: x * y, [oneway[p] for p in cl])

        theta = engine.model.mle(marginals)
        engine.potentials = theta
        engine.marginals = engine.model.belief_prop_fast(theta)

    # run inference in blocks of 500 iterations, checkpointing periodically
    checkpt = self.save[:-4] + '-checkpt.csv'
    for i in range(self.iters // 500):
        engine.infer(self.measurements, engine='MD', callback=cb)
        if i % 4 == 3:
            self.synthetic = engine.model.synthetic_data()
            self.synthetic = reverse_data(self.synthetic, self.supports)
            self.transform_domain()
            self.synthetic.to_csv(checkpt, index=False)

    if os.path.exists(checkpt):
        os.remove(checkpt)

    self.synthetic = engine.model.synthetic_data()
    self.synthetic = reverse_data(self.synthetic, self.supports)
def krondot(self, matrices):
    """ Compute the answers to the set of queries Q1 x Q2 x ... x Qd, where
    Qi is a query matrix on the ith attribute and "x" is the Kronecker
    product. This may be more efficient than materializing a supporting
    marginal and multiplying it by Q, in particular when each Qi has only a
    few rows.

    :param matrices: a list of matrices, one per attribute in the domain
    :return: the array of query answers
    """
    assert all(M.shape[1] == n for M, n in zip(matrices, self.domain.shape)), \
        'matrices must conform to the shape of the domain'
    logZ = self.belief_propagation(self.potentials, logZ=True)
    factors = [self.potentials[cl].exp() for cl in self.cliques]
    Factor = type(factors[0])  # infer the type of the factors
    elim = self.domain.attrs
    # attach each query matrix as a factor with a fresh "answer" attribute,
    # then eliminate the original attributes
    for attr, Q in zip(elim, matrices):
        d = Domain(['%s-answer' % attr, attr], Q.shape)
        factors.append(Factor(d, Q))
    result = variable_elimination(factors, elim)
    result = result.transpose(['%s-answer' % a for a in elim])
    return result.datavector(flatten=False) * self.total / np.exp(logZ)
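# A minimal sketch, assuming `model` is an already-fitted GraphicalModel over
# a two-attribute domain with shapes (2, 3). The single row of QA sums
# attribute A out and QB is the identity, so the answers form the marginal
# of B scaled to the model total.
#
#   import numpy as np
#
#   QA = np.array([[1.0, 1.0]])    # 1 x 2: sums over A
#   QB = np.eye(3)                 # 3 x 3: identity on B
#   ans = model.krondot([QA, QB])  # shape (1, 3)
#   print(ans, ans.sum())          # ans.sum() should be close to model.total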