def __init__(self, domain, key):
    """
    :param domain: a d-tuple containing the domain size of the d attributes
    :param key: an integer key 0 <= key < 2^d identifying the marginal
    """
    self.domain = domain
    self.key = key
    binary = self.binary()
    subs = []
    for i, n in enumerate(domain):
        if binary[i] == 0:
            subs.append(Total(n))
        else:
            subs.append(Identity(n))
    Kronecker.__init__(self, subs)
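# Usage sketch (hypothetical values): on a 3-attribute domain, a key whose
# bits are [1, 0, 1] keeps attributes 0 and 2, giving the Kronecker product
# Identity(10) x Total(20) x Identity(30). This assumes self.binary() unpacks
# `key` into d bits with binary[i] == 1 meaning attribute i is materialized.
M = Marginal((10, 20, 30), key=0b101)
print(M.shape)  # (300, 6000): 10*1*30 query rows over a 10*20*30 domain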
def process_workload(wd, eps):
    blockinfo = {'columnNames': [], 'buildingBlock': [], 'p': []}
    for bb in wd['data']:
        blockinfo['columnNames'].append(bb['name'])
        size = int((float(bb['maximum']) - float(bb['minimum'])) / float(bb['bucketSize']) + 1)
        # default parameter count: one parameter per block of 16 cells,
        # except that a value of exactly 2 is reduced to 1
        pv = math.ceil(size / 16.0)
        if pv == 2:
            pv = 1
        if bb['buildingBlock'] == 'identity':
            blockinfo['buildingBlock'].append(Identity(size))
            pv = 1
        elif bb['buildingBlock'] == 'allrange':
            blockinfo['buildingBlock'].append(AllRange(size))
        elif bb['buildingBlock'] == 'prefix':
            blockinfo['buildingBlock'].append(Prefix(size))
        elif bb['buildingBlock'] == 'customized':
            domainMatrix = parse_customized(bb, size)
            blockinfo['buildingBlock'].append(EkteloMatrix(domainMatrix))
            pv = 1
        else:
            blockinfo['buildingBlock'].append(Total(size))
            pv = 1
        blockinfo['p'].append(pv)
        gc.collect()
    gc.collect()
    wgt = np.sqrt(float(wd['weight']))
    return wgt * Kronecker(blockinfo['buildingBlock']), blockinfo
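# Hypothetical input matching the schema consumed above: `wd` carries a
# 'weight' plus one building-block descriptor per attribute under 'data'.
# All names and values here are illustrative.
wd = {
    'weight': 1.0,
    'data': [
        {'name': 'age', 'minimum': 0, 'maximum': 99, 'bucketSize': 1,
         'buildingBlock': 'prefix'},
        {'name': 'income', 'minimum': 0, 'maximum': 15, 'bucketSize': 1,
         'buildingBlock': 'identity'},
    ],
}
W, info = process_workload(wd, eps=1.0)
print(W.shape, info['p'])  # Prefix(100) x Identity(16) -> (1600, 1600), p = [7, 1]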
def Laplace():
    eps = 1.0
    wk, wds = get_workload()
    identity = Kronecker([Identity(n) for n in domain(wk)])
    metrics = calculate_workload_error_default(wk, identity, eps)[1]
    metrics['stage'] = 'Identity Baseline Complete'
    metrics['method'] = 'Identity'
    return json.dumps(metrics)
def get_measurements(domain, workload):
    # get measurements using OPT+ parameterization
    lookup = {}
    # optimal strategy for Identity is Identity
    for attr in domain:
        n = domain.size(attr)
        lookup[attr] = Identity(n)
    # optimal strategy for Prefix is precomputed and loaded
    lookup['age'] = EkteloMatrix(np.load('prefix-85.npy'))
    lookup['fnlwgt'] = EkteloMatrix(np.load('prefix-100.npy'))
    lookup['capital-gain'] = EkteloMatrix(np.load('prefix-100.npy'))
    lookup['capital-loss'] = EkteloMatrix(np.load('prefix-100.npy'))
    lookup['hours-per-week'] = EkteloMatrix(np.load('prefix-99.npy'))
    measurements = []
    for proj, _ in workload:
        Q = Kronecker([lookup[a] for a in proj])
        measurements.append((proj, Q.sparse_matrix()))
    return measurements
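# Usage sketch (assumptions: `domain` is an mbi-style Domain that iterates
# over attribute names and exposes .size(), as the loop above requires; the
# prefix-*.npy strategy files exist on disk; workload entries are
# (projection, weight) pairs whose second element is ignored here).
dom = Domain(['age', 'race'], [85, 5])
workload = [(('age',), 1.0), (('age', 'race'), 1.0)]
for proj, Q in get_measurements(dom, workload):
    print(proj, Q.shape)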
def per_query_error_sampling(W, A, number=100000, eps=np.sqrt(2), normalize=False):
    # note: this only works for Kronecker or explicit strategy
    W, A = convert_implicit(W), convert_implicit(A)
    if isinstance(W, Weighted):
        ans = W.weight**2 * per_query_error_sampling(W.base, A, number)
    #elif isinstance(W, VStack) and type(A) == VStack:
    #    m = W.shape[0]
    #    num = lambda Wi: int(number*Wi.shape[0]/m + 1)
    #    samples = [per_query_error_sampling(Wi, Ai.base, num(Wi)) for Wi, Ai in zip(W.matrices, A.matrices)]
    #    weights = [Ai.weight for Ai in A.matrices]
    #    ans = np.concatenate([err/w**2 for w, err in zip(weights, samples)])
    elif isinstance(W, VStack):
        # sample from each sub-workload in proportion to its size
        m = W.shape[0]
        num = lambda Wi: int(number * Wi.shape[0] / m + 1)
        samples = [per_query_error_sampling(Wi, A, num(Wi)) for Wi in W.matrices]
        ans = np.concatenate(samples)
    elif isinstance(W, Kronecker) and isinstance(A, Kronecker):
        # per-query error of a Kronecker product is the product of the
        # per-query errors of its factors
        pieces = [per_query_error_sampling(Wi, Ai, number)
                  for Wi, Ai in zip(W.matrices, A.matrices)]
        ans = np.prod(pieces, axis=0)
    elif isinstance(W, Kronecker) and isinstance(A, workload.Marginals):
        # optimization: if W is Marginals, all errors are the same
        if all(type(Wi) in [workload.Identity, workload.Ones] for Wi in W.matrices):
            err = expected_error(W, A)
            ans = np.repeat(err, number)
        else:
            # will be very slow, uses for loop
            AtA1 = A.gram().pinv()
            ans = np.zeros(number)
            for i in range(number):
                idx = [np.random.randint(Wi.shape[0]) for Wi in W.matrices]
                w = Kronecker([Wi[j] for Wi, j in zip(W.matrices, idx)])
                ans[i] = expected_error(w, A)
    else:
        ans = np.random.choice(per_query_error(W, A), number)
    delta = A.sensitivity()
    ans *= 2.0 / eps**2
    return np.sqrt(ans) if normalize else ans
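# Example (hypothetical parameters): sampled per-query errors for a 2D
# all-range workload measured with the identity strategy. Both arguments hit
# the Kronecker branch above, and each 1D factor falls through to exact
# per_query_error sampling.
W = Kronecker([AllRange(32), AllRange(32)])
A = Kronecker([Identity(32), Identity(32)])
errs = per_query_error_sampling(W, A, number=1000, normalize=True)
print(errs.shape, errs.mean())  # 1000 sampled root errors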
def __init__(self, domain, lower, higher, dtype=np.float64):
    """
    :param domain: the domain size, as an int for 1D or a tuple for d-dimensional domains
    :param lower: a q x d array of lower boundaries for the q queries
    :param higher: a q x d array of upper boundaries for the q queries
    """
    assert lower.shape == higher.shape, 'lower and higher must have same shape'
    #assert np.all(lower <= higher), 'lower index must be <= than higher index'
    if type(domain) is int:
        domain = (domain,)
        lower = lower[:, None]
        higher = higher[:, None]
    self.domain = domain
    self.shape = (lower.shape[0], np.prod(domain))
    self.dtype = dtype
    self._lower = lower
    self._higher = higher

    idx = np.arange(np.prod(domain), dtype=np.int32).reshape(domain)
    shape = (lower.shape[0], np.prod(domain))
    corners = np.array(list(itertools.product(*[(False, True)] * len(domain))))
    size = len(corners) * lower.shape[0]
    row_ind = np.zeros(size, dtype=np.int32)
    col_ind = np.zeros(size, dtype=np.int32)
    data = np.zeros(size, dtype=dtype)
    queries = np.arange(shape[0], dtype=np.int32)
    start = 0
    for corner in corners:
        # inclusion-exclusion over box corners: each corner maps a range sum
        # to a signed prefix sum, dropping corners that fall off the domain
        tmp = np.where(corner, lower - 1, higher)
        keep = np.all(tmp >= 0, axis=1)
        index = idx[tuple(tmp.T)]
        coef = np.sum(corner) % 2 * 2 - 1
        end = start + keep.sum()
        row_ind[start:end] = queries[keep]
        col_ind[start:end] = index[keep]
        data[start:end] = -coef
        start = end
    self._transformer = sparse.csr_matrix(
        (data[:end], (row_ind[:end], col_ind[:end])), shape, dtype)
    P = Kronecker([Prefix(n, dtype) for n in domain])
    T = EkteloMatrix(self._transformer)
    Product.__init__(self, T, P)
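# Usage sketch (assuming this constructor belongs to a RangeQueries(Product)
# class, as the final Product.__init__ call suggests): three axis-aligned
# boxes on a 10 x 10 grid, with inclusive lower/upper corner coordinates.
lower = np.array([[0, 0], [2, 3], [5, 5]])
higher = np.array([[4, 4], [7, 9], [9, 9]])
W = RangeQueries((10, 10), lower, higher)
print(W.shape)  # (3, 100): one row per box, one column per grid cell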
def synthesize(self, file_path, eps, seed):
    # setup random state
    prng = np.random.RandomState(seed)

    # load data vector
    relation = Relation(self.config)
    relation.load_csv(file_path)
    self._numerize(relation._df)

    # perform measurement
    attributes = list(self.config.keys())
    measurements = []
    w_sum = sum(Ai.weight for Ai in self.strategy.matrices)
    for Ai in self.strategy.matrices:
        w = Ai.weight
        proj = [attributes[i] for i, B in enumerate(Ai.base.matrices)
                if type(B).__name__ != 'Ones']
        matrix = [B for B in Ai.base.matrices if type(B).__name__ != 'Ones']
        matrix = EkteloMatrix(np.ones((1, 1))) if len(matrix) == 0 else Kronecker(matrix)
        proj_rel = copy.deepcopy(relation)
        proj_rel.project(proj)
        if proj_rel.df.shape[1] == 0:
            x = np.array([proj_rel.df.shape[0]])
        else:
            x = Vectorize('').transform(proj_rel).flatten()
        y = Laplace(matrix, w * eps / w_sum).measure(x, prng)
        measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))

    # generate synthetic data
    sizes = [field['bins'] for field in self.config.values()]
    dom = Domain(attributes, sizes)
    engine = FactoredInference(dom)
    model = engine.estimate(measurements)
    df = model.synthetic_data().df
    self._denumerize(df)
    self._sample_numerical(df)
    return df
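# Shape of the inputs this method expects (hypothetical values): self.config
# maps attribute name -> metadata with at least a 'bins' entry (the domain
# size used to build `Domain` above), and self.strategy is a union of
# Weighted Kronecker terms exposing .weight and .base. Any extra config
# fields needed by _numerize / _denumerize / _sample_numerical are not shown.
config = {
    'age': {'bins': 85},
    'income': {'bins': 16},
}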
class Disjuncts(Sum):
    """
    Just like the Kronecker workload class can represent a cartesian product
    of predicate counting queries where the predicates are conjunctions, this
    workload class can represent a cartesian product of predicate counting
    queries where the predicates are disjunctions.
    """
    #TODO: check implementation after refactoring
    def __init__(self, workloads):
        # De Morgan: q OR r = 1 - ((1 - q) AND (1 - r))
        # W = (1 x 1) - ((1 - Q1) x (1 - R1))
        self.A = Kronecker([Ones(*W.shape) for W in workloads])  # totals
        self.B = -1 * Kronecker([Ones(*W.shape) - W for W in workloads])  # negations
        Sum.__init__(self, [self.A, self.B])

    def gram(self):
        return Sum([self.A.gram(), self.A.T @ self.B,
                    self.B.T @ self.A, self.B.gram()])
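# Sanity check of the De Morgan construction on tiny 0/1 predicate matrices
# (illustrative values): each row of Disjuncts([Q, R]) should equal the OR of
# a row of Q with a row of R over the product domain.
Q = EkteloMatrix(np.array([[1.0, 0.0], [0.0, 1.0]]))
R = EkteloMatrix(np.array([[1.0, 1.0, 0.0]]))
D = Disjuncts([Q, R])
expected = 1 - np.kron(1 - Q.dense_matrix(), 1 - R.dense_matrix())
assert np.allclose(D.dense_matrix(), expected)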
def Prefix2D(n):
    return Kronecker([Prefix(n), Prefix(n)])
def Range2D(n):
    return Kronecker([AllRange(n), AllRange(n)])
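# Example shapes (assuming Prefix(n) is n x n and AllRange(n) enumerates all
# n*(n+1)/2 intervals): both helpers square those counts via the Kronecker
# product.
print(Prefix2D(8).shape)  # (64, 64)
print(Range2D(8).shape)   # (1296, 64): 36**2 range pairs over an 8x8 grid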
def _transpose(self):
    ans = Kronecker._transpose(self)
    ans._matmat = self._rmatmat
    return ans