Example #1
# Imports assumed by this snippet; permute_and_flip is defined alongside
# select in the MST source.
import itertools

import networkx as nx
import numpy as np
from disjoint_set import DisjointSet
from mbi import FactoredInference


def select(data, epsilon, measurement_log, cliques=[]):
    # Fit a graphical model to the measurements taken so far
    engine = FactoredInference(data.domain, iters=1000)
    est = engine.estimate(measurement_log)

    # Score every attribute pair by the L1 distance between its true
    # 2-way marginal and the model's estimate of that marginal
    weights = {}
    candidates = list(itertools.combinations(data.domain.attrs, 2))
    for a, b in candidates:
        xhat = est.project([a, b]).datavector()
        x = data.project([a, b]).datavector()
        weights[a, b] = np.linalg.norm(x - xhat, 1)

    # Start from a forest that already contains any required cliques
    T = nx.Graph()
    T.add_nodes_from(data.domain.attrs)
    ds = DisjointSet()

    for e in cliques:
        T.add_edge(*e)
        ds.union(*e)

    # r components remain, so r - 1 more edges yield a spanning tree
    r = len(list(nx.connected_components(T)))

    # Privately add one edge at a time with permute-and-flip,
    # splitting the budget evenly across the r - 1 selections
    for i in range(r - 1):
        candidates = [e for e in candidates if not ds.connected(*e)]
        wgts = np.array([weights[e] for e in candidates])
        idx = permute_and_flip(wgts, epsilon / (r - 1), sensitivity=1.0)
        e = candidates[idx]
        T.add_edge(*e)
        ds.union(*e)

    return list(T.edges)
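
# A minimal usage sketch (hypothetical): measure all 1-way marginals first,
# then let select() pick a tree of 2-way cliques with part of the budget.
# `measure` is the helper from the same MST module (see Example #4), and the
# file paths are placeholders.
from mbi import Dataset

data = Dataset.load('adult.csv', 'adult-domain.json')
log1 = measure(data, [(col,) for col in data.domain], 1.0)
cliques = select(data, 0.5, log1)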
Example #2
    def postprocess(self):
        # Use the noisy measurements to fit a PGM inference engine,
        # then generate synthetic data from the fitted model
        iters = self.iters
        domain = self.domain
        temp_domain = Domain.fromdict(domain)
        engine = FactoredInference(temp_domain,
                                   structural_zeros=None,
                                   iters=iters,
                                   log=True,
                                   warm_start=False,
                                   elim_order=self.elimination_order)
        self.engine = engine
        engine.estimate(self.measurements)

        self.synthetic = self.engine.model.synthetic_data()
        self.synthetic = reverse_data(self.synthetic, self.supports)
Example #3
# Imports assumed by this snippet; Logger is the callback from mbi's
# callbacks module.
import numpy as np
from scipy import sparse
from scipy.stats import laplace, norm
from mbi import FactoredInference
from mbi.callbacks import Logger


def run(dataset, measurements, eps=1.0, delta=0.0, bounded=True, engine='MD',
        options={}, iters=10000, seed=None, metric='L2', elim_order=None,
        frequency=1, workload=None):
    """
    Run a mechanism that takes the given measurements under differential
    privacy and then fits a model to them via inference.
    This is a convenience method for running end-to-end experiments.
    """

    domain = dataset.domain
    total = None

    state = np.random.RandomState(seed)

    # If bare projections were passed, measure each one with an
    # identity query over the corresponding marginal
    if len(measurements) >= 1 and type(measurements[0][0]) is str:
        matrix = lambda proj: sparse.eye(domain.project(proj).size())
        measurements = [(proj, matrix(proj)) for proj in measurements]

    # Accumulate the L1 and L2 sensitivities of the measurements
    l1 = 0
    l2 = 0
    for _, Q in measurements:
        l1 += np.abs(Q).sum(axis=0).max()
        try:
            l2 += Q.power(2).sum(axis=0).max()    # for sparse matrices
        except AttributeError:
            l2 += np.square(Q).sum(axis=0).max()  # for dense matrices

    if bounded:
        # Bounded DP: a neighboring dataset changes one record rather than
        # adding/removing one, which doubles the sensitivity
        total = dataset.df.shape[0]
        l1 *= 2
        l2 *= 2

    if delta > 0:
        # Gaussian noise calibrated for (eps, delta)-DP
        noise = norm(loc=0, scale=np.sqrt(l2 * 2 * np.log(2 / delta)) / eps)
    else:
        # Laplace noise calibrated for pure eps-DP
        noise = laplace(loc=0, scale=l1 / eps)

    if workload is None:
        workload = measurements
   
    # Compute exact workload answers for logging/evaluation
    truth = []
    for proj, W in workload:
        x = dataset.project(proj).datavector()
        y = W.dot(x)
        truth.append((W, y, proj))

    # Take the noisy measurements; each entry is (Q, noisy answer, noise scale, proj)
    answers = []
    for proj, Q in measurements:
        x = dataset.project(proj).datavector()
        z = noise.rvs(size=Q.shape[0], random_state=state)
        y = Q.dot(x)
        answers.append((Q, y + z, 1.0, proj))

    estimator = FactoredInference(domain, metric=metric, iters=iters, warm_start=False, elim_order=elim_order)
    logger = Logger(estimator, true_answers=truth, frequency=frequency)
    model = estimator.estimate(answers, total, engine=engine, callback=logger, options=options)
        
    return model, logger, answers
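
# Hypothetical invocation (paths are placeholders): measure two marginals of
# an mbi Dataset under pure eps-DP and fit a model to the noisy answers.
from mbi import Dataset

data = Dataset.load('adult.csv', 'adult-domain.json')
model, logger, answers = run(data, [('age',), ('sex', 'income>50K')],
                             eps=1.0, iters=2500)
est = model.project(('sex', 'income>50K')).datavector()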
Example #4
def MST(data, epsilon, delta):
    # This mechanism is designed for relatively large, high-dimensional
    # datasets; for lower-dimensional datasets (like adult), simpler
    # mechanisms may be better.
    # 2/3 of the budget goes to the two rounds of marginal measurements,
    # 1/3 to clique selection.
    sigma = calibrate_gaussian_noise(epsilon * 2.0 / 3.0, delta)
    # Round 1: measure every 1-way marginal
    cliques = [(col,) for col in data.domain]
    log1 = measure(data, cliques, sigma)
    data, log1, undo_compress_fn = compress_domain(data, log1)
    # Round 2: privately select a tree of 2-way cliques and measure them
    cliques = select(data, epsilon / 3.0, log1)
    log2 = measure(data, cliques, sigma)
    # Fit a graphical model to both rounds and sample synthetic records
    engine = FactoredInference(data.domain, iters=5000)
    est = engine.estimate(log1 + log2)
    synth = est.synthetic_data()
    return undo_compress_fn(synth)
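
# Hypothetical end-to-end call (paths are placeholders). MST returns an mbi
# Dataset, so the synthetic records live in synth.df.
from mbi import Dataset

data = Dataset.load('adult.csv', 'adult-domain.json')
synth = MST(data, epsilon=1.0, delta=1e-9)
print(synth.df.head())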
Example #5
    def synthesize(self, file_path, eps, seed):
        # setup random state
        prng = np.random.RandomState(seed)

        # load data vector
        relation = Relation(self.config)
        relation.load_csv(file_path)
        self._numerize(relation._df)

        # perform measurement: split the budget across the strategy's
        # sub-workloads in proportion to their weights
        attributes = list(self.config.keys())
        measurements = []
        w_sum = sum(Ai.weight for Ai in self.strategy.matrices)
        for Ai in self.strategy.matrices:
            w = Ai.weight
            # keep only the attributes this sub-workload actually queries
            # (Ones factors marginalize an attribute out)
            proj = [
                attributes[i] for i, B in enumerate(Ai.base.matrices)
                if type(B).__name__ != 'Ones'
            ]
            matrix = [
                B for B in Ai.base.matrices if type(B).__name__ != 'Ones'
            ]
            if len(matrix) == 0:
                matrix = EkteloMatrix(np.ones((1, 1)))
            else:
                matrix = Kronecker(matrix)
            proj_rel = copy.deepcopy(relation)
            proj_rel.project(proj)
            if proj_rel.df.shape[1] == 0:
                # empty projection: the data vector is just the total count
                x = np.array([proj_rel.df.shape[0]])
            else:
                x = Vectorize('').transform(proj_rel).flatten()
            y = Laplace(matrix, w * eps / w_sum).measure(x, prng)
            measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))

        # generate synthetic data
        sizes = [field['bins'] for field in self.config.values()]
        dom = Domain(attributes, sizes)
        engine = FactoredInference(dom)
        model = engine.estimate(measurements)
        df = model.synthetic_data().df
        self._denumerize(df)
        self._sample_numerical(df)

        return df
Example #6
    """ Efficiently take measurements from HDMM strategy and convert to a PGM-compatable form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        proj = [
            attributes[i] for i, B in enumerate(Ai.base.matrices)
            if type(B) != workload.Ones
        ]
        print(proj)
        matrix = workload.Kronecker(
            [B for B in Ai.base.matrices if type(B) != workload.Ones])
        matrix = w * matrix.sparse_matrix()
        x = data.project(
            proj).datavector()  # does Relation have this functionality?
        y = matrix.dot(x) + np.random.laplace(
            loc=0, scale=1, size=matrix.shape[0])
        measurements.append((matrix, y, 1.0, proj))
    return measurements


measurements = take_measurements(A, data)

engine = FactoredInference(dom)
model = engine.estimate(measurements)

df = model.synthetic_data().df
print(df.head())

# Then you can post-process to map category/bin ids back to real values
Example #7
# This fragment assumes numpy (np), an mbi Domain over attributes A, B, C,
# a privacy budget `epsilon`, and the true marginal vectors `ab` and `bc`.
sigma = np.sqrt(2.0) / epsilon

np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)

measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')

# recover consistent estimates of measurements
ab2 = model.project(['A', 'B']).datavector()
bc2 = model.project(['B', 'C']).datavector()

print(ab2)
print(bc2)

# estimate answer to unmeasured queries
ac2 = model.project(['A','C']).datavector()
print(ac2)

# generate synthetic data
synth = model.synthetic_data(rows=10)
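
# synthetic_data() returns an mbi Dataset wrapping a pandas DataFrame, so the
# sampled rows can be inspected directly:
print(synth.df.head())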
Example #8
# This fragment assumes an mbi Dataset `data` (e.g. adult) with its `domain`,
# the total record count `total`, a noise scale `sigma` from the first half of
# the budget, and an Identity matrix helper imported in the full example.

# spend half of the privacy budget measuring all 1-way marginals
measurements = []
for col in data.domain:
    x = data.project(col).datavector()
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    I = Identity(x.size)
    measurements.append((I, y, sigma, (col, )))

# spend the other half of the privacy budget measuring some 2- and 3-way marginals

cliques = [('age', 'education-num'), ('marital-status', 'race'),
           ('sex', 'hours-per-week'), ('hours-per-week', 'income>50K'),
           ('native-country', 'marital-status', 'occupation')]

sigma = 1.0 / len(cliques) / 2.0

for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    I = Identity(x.size)
    measurements.append((I, y, sigma, cl))

# now perform inference to estimate the data distribution

engine = FactoredInference(domain, backend='torch', log=True, iters=10000)
model = engine.estimate(measurements, total=total, engine='RDA')

# now answer new queries

y1 = model.project(('sex', 'income>50K')).datavector()
y2 = model.project(('race', 'occupation')).datavector()
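
# A quick sanity check (sketch): compare the model's answer for one new query
# against the true marginal computed directly from the sensitive data.
x1 = data.project(('sex', 'income>50K')).datavector()
print(np.linalg.norm(x1 - y1, 1) / x1.sum())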