Example #1
    def __init__(self, workloads):
        # De Morgan: q or r = not(not q and not r)
        # W = (1 x 1) - ((1 - Q) x (1 - R)), where x is the Kronecker product

        self.A = Kronecker([Ones(*W.shape) for W in workloads])  # totals
        self.B = -1 * Kronecker([Ones(*W.shape) - W
                                 for W in workloads])  # negations
        Sum.__init__(self, [self.A, self.B])
Example #2
    def __init__(self, domain, key):
        """
        :param domain: a d-tuple containing the domain size of the d attributes
        :param key: an integer key 0 <= key < 2^d identifying the marginal
        """
        self.domain = domain
        self.key = key
        binary = self.binary()
        subs = []
        for i, n in enumerate(domain):
            if binary[i] == 0:
                subs.append(Total(n))
            else:
                subs.append(Identity(n))
        Kronecker.__init__(self, subs)
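
Each bit of `key` selects an Identity factor (keep the attribute) or a Total factor (sum it out). A minimal sketch of the same pattern in plain numpy, with dense stand-ins for those factors; the bit ordering of `binary()` is an assumption here.

import numpy as np

def marginal_matrix(domain, bits):
    # Identity(n) ~ n x n identity; Total(n) ~ 1 x n row of ones
    subs = [np.eye(n) if b else np.ones((1, n)) for n, b in zip(domain, bits)]
    out = subs[0]
    for s in subs[1:]:
        out = np.kron(out, s)
    return out

# marginal over attributes 0 and 2 of a (2, 3, 4) domain: Identity x Total x Identity
W = marginal_matrix((2, 3, 4), (1, 0, 1))
print(W.shape)  # (8, 24): one counting query per cell of the 2 x 4 marginal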
Example #3
File: app.py  Project: dpcomp-org/hdmm
def process_workload(wd, eps):
    blockinfo = {"columnNames": [], 'buildingBlock': [], 'p': []}
    for bb in wd['data']:
        blockinfo['columnNames'].append(bb['name'])
        size = int((float(bb['maximum']) - float(bb['minimum'])) /
                   float(bb['bucketSize']) + 1)
        pv = math.ceil(size / 16.0) if math.ceil(
            size / 16.0) != 2 else math.ceil(size / 16.0) - 1
        if bb['buildingBlock'] == 'identity':
            blockinfo['buildingBlock'].append(Identity(size))
            pv = 1
        elif bb['buildingBlock'] == 'allrange':
            blockinfo['buildingBlock'].append(AllRange(size))
        elif bb['buildingBlock'] == 'prefix':
            blockinfo['buildingBlock'].append(Prefix(size))
        elif bb['buildingBlock'] == 'customized':
            domainMatrix = parse_customized(bb, size)
            blockinfo['buildingBlock'].append(EkteloMatrix(domainMatrix))
            pv = 1
        else:
            blockinfo['buildingBlock'].append(Total(size))
            pv = 1
        blockinfo['p'].append(pv)
        gc.collect()
    gc.collect()
    wgt = np.sqrt(float(wd['weight']))
    return wgt * Kronecker(blockinfo['buildingBlock']), blockinfo
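
For reference, a hypothetical `wd` payload matching the keys this function reads; the field names come from the code above, while all values are purely illustrative.

wd = {
    'weight': 1.0,
    'data': [
        {'name': 'age', 'minimum': '0', 'maximum': '99',
         'bucketSize': '1', 'buildingBlock': 'prefix'},
        {'name': 'income', 'minimum': '0', 'maximum': '500000',
         'bucketSize': '5000', 'buildingBlock': 'identity'},
    ],
}
strategy, blockinfo = process_workload(wd, eps=1.0)
# strategy: weighted Kronecker workload; blockinfo['p']: per-attribute parameters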
Example #4
File: app.py  Project: dpcomp-org/hdmm
def Laplace():
    eps = 1.0
    wk, wds = get_workload()
    identity = Kronecker([Identity(n) for n in domain(wk)])
    metrics = calculate_workload_error_default(wk, identity, eps)[1]
    metrics['stage'] = 'Identity Baseline Complete'
    metrics['method'] = 'Identity'
    return json.dumps(metrics)
Example #5
def get_measurements(domain, workload):
    # get measurements using OPT+ parameterization
    lookup = {}
    # optimal strategy for Identity is Identity
    for attr in domain:
        n = domain.size(attr)
        lookup[attr] = Identity(n)
    # optimal strategy for Prefix is precomputed and loaded
    lookup['age'] = EkteloMatrix(np.load('prefix-85.npy'))
    lookup['fnlwgt'] = EkteloMatrix(np.load('prefix-100.npy'))
    lookup['capital-gain'] = EkteloMatrix(np.load('prefix-100.npy'))
    lookup['capital-loss'] = EkteloMatrix(np.load('prefix-100.npy'))
    lookup['hours-per-week'] = EkteloMatrix(np.load('prefix-99.npy'))

    measurements = []
    for proj, _ in workload:
        Q = Kronecker([lookup[a] for a in proj])
        measurements.append((proj, Q.sparse_matrix()))

    return measurements
Example #6
File: error.py  Project: dpcomp-org/hdmm
def per_query_error_sampling(W,
                             A,
                             number=100000,
                             eps=np.sqrt(2),
                             normalize=False):
    # note: this only works for Kronecker or explicit strategy
    W, A = convert_implicit(W), convert_implicit(A)
    if isinstance(W, Weighted):
        ans = W.weight**2 * per_query_error_sampling(W.base, A, number)
    #elif isinstance(W, VStack) and type(A) == VStack:
    #    m = W.shape[0]
    #    num = lambda Wi: int(number*Wi.shape[0]/m + 1)
    #    samples = [per_query_error_sampling(Wi,Ai.base,num(Wi)) for Wi,Ai in zip(W.matrices,A.matrices)]
    #    weights = [Ai.weight for Ai in A.matrices]
    #    ans = np.concatenate([err/w**2 for w, err in zip(weights, samples)])
    elif isinstance(W, VStack):
        m = W.shape[0]
        num = lambda Wi: int(number * Wi.shape[0] / m + 1)
        samples = [
            per_query_error_sampling(Wi, A, num(Wi)) for Wi in W.matrices
        ]
        ans = np.concatenate(samples)
    elif isinstance(W, Kronecker) and isinstance(A, Kronecker):
        assert isinstance(A, Kronecker)
        pieces = [
            per_query_error_sampling(Wi, Ai, number)
            for Wi, Ai in zip(W.matrices, A.matrices)
        ]
        ans = np.prod(pieces, axis=0)
    elif isinstance(W, Kronecker) and isinstance(A, workload.Marginals):
        # optimization: if W is Marginals, all errors are the same
        if all(
                type(Wi) in [workload.Identity, workload.Ones]
                for Wi in W.matrices):
            err = expected_error(W, A)
            ans = np.repeat(err, number)
        else:
            # will be very slow, uses for loop
            AtA1 = A.gram().pinv()
            ans = np.zeros(number)
            for i in range(number):
                idx = [np.random.randint(Wi.shape[0]) for Wi in W.matrices]
                w = Kronecker([Wi[j] for Wi, j in zip(W.matrices, idx)])
                ans[i] = expected_error(w, A)
    else:
        ans = np.random.choice(per_query_error(W, A), number)
        delta = A.sensitivity()
    ans *= 2.0 / eps**2
    return np.sqrt(ans) if normalize else ans
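
A minimal usage sketch, assuming this function is importable as hdmm.error.per_query_error_sampling and the workload classes live in hdmm.workload:

from hdmm import error, workload

W = workload.AllRange(256)   # workload: all range queries on a 1D domain of size 256
A = workload.Identity(256)   # strategy: measure every cell directly
errs = error.per_query_error_sampling(W, A, number=1000, normalize=True)
print(errs.min(), errs.mean(), errs.max())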
Example #7
    def __init__(self, domain, lower, higher, dtype=np.float64):
        """
        :param domain: the domain size, as an int for 1D domains or a d-tuple for
            d-dimensional domains; each query bound below has one entry per attribute
        :param lower: a q x d array of lower boundaries for the q queries
        :param higher: a q x d array of upper boundaries for the q queries
        """
        assert lower.shape == higher.shape, 'lower and higher must have same shape'
        #assert np.all(lower <= higher), 'lower index must be <= than higher index'

        if type(domain) is int:
            domain = (domain, )
            lower = lower[:, None]
            higher = higher[:, None]
        self.domain = domain
        self.shape = (lower.shape[0], np.prod(domain))
        self.dtype = dtype
        self._lower = lower
        self._higher = higher

        idx = np.arange(np.prod(domain), dtype=np.int32).reshape(domain)
        shape = (lower.shape[0], np.prod(domain))
        corners = np.array(
            list(itertools.product(*[(False, True)] * len(domain))))
        size = len(corners) * lower.shape[0]
        row_ind = np.zeros(size, dtype=np.int32)
        col_ind = np.zeros(size, dtype=np.int32)
        data = np.zeros(size, dtype=dtype)
        queries = np.arange(shape[0], dtype=np.int32)
        start = 0

        for corner in corners:
            tmp = np.where(corner, lower - 1, higher)
            keep = np.all(tmp >= 0, axis=1)
            index = idx[tuple(tmp.T)]
            coef = np.sum(corner) % 2 * 2 - 1
            end = start + keep.sum()
            row_ind[start:end] = queries[keep]
            col_ind[start:end] = index[keep]
            data[start:end] = -coef
            start = end

        self._transformer = sparse.csr_matrix(
            (data[:end], (row_ind[:end], col_ind[:end])), shape, dtype)

        P = Kronecker([Prefix(n, dtype) for n in domain])
        T = EkteloMatrix(self._transformer)
        Product.__init__(self, T, P)
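
The corner loop is the usual inclusion-exclusion trick: a d-dimensional range sum is a signed combination of prefix sums evaluated at the 2^d corners of the query box. A small numpy check of that identity in 2D:

import numpy as np

x = np.arange(12.0).reshape(3, 4)
P = x.cumsum(axis=0).cumsum(axis=1)      # 2D prefix sums: P[i, j] = sum of x[:i+1, :j+1]
l0, h0, l1, h1 = 1, 2, 0, 2              # query box: rows 1..2, columns 0..2

def pref(i, j):
    return 0.0 if i < 0 or j < 0 else P[i, j]

range_sum = pref(h0, h1) - pref(l0 - 1, h1) - pref(h0, l1 - 1) + pref(l0 - 1, l1 - 1)
assert range_sum == x[l0:h0 + 1, l1:h1 + 1].sum()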
Example #8
    def synthesize(self, file_path, eps, seed):
        # setup random state
        prng = np.random.RandomState(seed)

        # load data vector
        relation = Relation(self.config)
        relation.load_csv(file_path)
        self._numerize(relation._df)

        # perform measurement
        attributes = [field_name for field_name in self.config.keys()]
        measurements = []
        w_sum = sum(Ai.weight for Ai in self.strategy.matrices)
        for Ai in self.strategy.matrices:
            w = Ai.weight
            proj = [
                attributes[i] for i, B in enumerate(Ai.base.matrices)
                if type(B).__name__ != 'Ones'
            ]
            matrix = [
                B for B in Ai.base.matrices if type(B).__name__ != 'Ones'
            ]
            matrix = EkteloMatrix(np.ones(
                (1, 1))) if len(matrix) == 0 else Kronecker(matrix)
            proj_rel = copy.deepcopy(relation)
            proj_rel.project(proj)
            if proj_rel.df.shape[1] == 0:
                x = np.array([proj_rel.df.shape[0]])
            else:
                x = Vectorize('').transform(proj_rel).flatten()
            y = Laplace(matrix, w * eps / w_sum).measure(x, prng)
            measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))

        # generate synthetic data
        sizes = [field['bins'] for field in self.config.values()]
        dom = Domain(attributes, sizes)
        engine = FactoredInference(dom)
        model = engine.estimate(measurements)
        df = model.synthetic_data().df
        self._denumerize(df)
        self._sample_numerical(df)

        return df
Example #9
class Disjuncts(Sum):
    """
    Just as the Kron workload class can represent a Cartesian product of predicate counting
    queries where the predicates are conjunctions, this workload class represents a Cartesian
    product of predicate counting queries where the predicates are disjunctions.
    """

    #TODO: check implementation after refactoring
    def __init__(self, workloads):
        # De Morgan: q or r = not(not q and not r)
        # W = (1 x 1) - ((1 - Q) x (1 - R)), where x is the Kronecker product

        self.A = Kronecker([Ones(*W.shape) for W in workloads])  # totals
        self.B = -1 * Kronecker([Ones(*W.shape) - W
                                 for W in workloads])  # negations
        Sum.__init__(self, [self.A, self.B])

    def gram(self):
        return Sum([
            self.A.gram(), self.A.T @ self.B, self.B.T @ self.A,
            self.B.gram()
        ])
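
A small numpy check of the identity the constructor relies on, using 0/1 predicate matrices (the particular Q and R below are illustrative):

import numpy as np

Q = np.array([[1, 0, 1],
              [0, 1, 1]], dtype=float)      # predicates over attribute 1
R = np.array([[1, 1, 0, 0],
              [0, 0, 1, 1]], dtype=float)   # predicates over attribute 2

ones = lambda M: np.ones(M.shape)
W_or = np.kron(ones(Q), ones(R)) - np.kron(ones(Q) - Q, ones(R) - R)

# entry for query pair (i, j) and cell (a, b) should equal Q[i, a] OR R[j, b]
i, j, a, b = 0, 1, 1, 0
assert W_or[i * R.shape[0] + j, a * R.shape[1] + b] == max(Q[i, a], R[j, b])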
Example #10
def Prefix2D(n):
    return Kronecker([Prefix(n), Prefix(n)])
Example #11
def Range2D(n):
    return Kronecker([AllRange(n), AllRange(n)])
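
Both helpers are Kronecker products of 1D building blocks. A sketch with dense stand-ins for those blocks (the exact query ordering inside AllRange is an assumption), showing that every 2D query is the product of a row interval and a column interval:

import numpy as np

def prefix_dense(n):
    return np.tril(np.ones((n, n)))              # query i sums cells 0..i

def allrange_dense(n):
    rows = []
    for i in range(n):
        for j in range(i, n):
            q = np.zeros(n)
            q[i:j + 1] = 1                       # query sums cells i..j
            rows.append(q)
    return np.array(rows)

P2 = np.kron(prefix_dense(4), prefix_dense(4))       # 16 x 16, analogous to Prefix2D(4)
R2 = np.kron(allrange_dense(4), allrange_dense(4))   # 100 x 16, analogous to Range2D(4)
print(P2.shape, R2.shape)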
Example #12
    def _transpose(self):
        ans = Kronecker._transpose(self)
        ans._matmat = self._rmatmat
        return ans