Example #1
def get_pub_dataset_biased(data, pub_frac, frac_seed, bias_attr, perturb):
    prng = np.random.RandomState(frac_seed)

    df_priv = data.df
    N = data.df.shape[0]

    attr_distr = np.bincount(df_priv[bias_attr])
    attr_distr = attr_distr / attr_distr.sum()
    orig_attr_distr = attr_distr.copy()

    attr_distr[0] += perturb
    attr_distr[1] = 1 - attr_distr[0]

    df_pub = []
    for i in range(attr_distr.shape[0]):
        mask = df_priv[bias_attr] == i
        df_attr = df_priv[mask].reset_index(drop=True)
        size = int(pub_frac * N * attr_distr[i])
        idxs = prng.choice(df_attr.index, size=size, replace=True)
        df_pub.append(df_attr.loc[idxs])
    df_pub = pd.concat(df_pub).reset_index(drop=True)

    cols = list(df_pub.columns)
    df_pub = df_pub.reset_index().groupby(cols).count()
    df_pub.reset_index(inplace=True)
    A_init = df_pub['index'].values
    A_init = A_init / A_init.sum()
    del df_pub['index']  # drop the count column so only domain attributes remain

    data_pub = Dataset(df_pub, data.domain)

    return data_pub, A_init, orig_attr_distr
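
A minimal invocation sketch (not from the source; the dataset paths and the binary attribute name 'sex' are assumptions):

# Hypothetical usage: draw a half-sized "public" sample whose marginal on a
# binary attribute is shifted up by 0.2.
data = Dataset.load('Datasets/adult.csv', 'Datasets/adult-domain.json')
data_pub, A_init, orig_distr = get_pub_dataset_biased(
    data, pub_frac=0.5, frac_seed=0, bias_attr='sex', perturb=0.2)
print(orig_distr, A_init.sum())  # original marginal; weights sum to 1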
Example #2
def privbayes_inference(domain, measurements, total):
    synthetic = pd.DataFrame()

    _, y, _, proj = measurements[0]
    y = np.maximum(y, 0)
    y /= y.sum()
    col = proj[0]
    synthetic[col] = np.random.choice(domain[col], total, True, y)
        
    for _, y, _, proj in measurements[1:]:
        # find the CPT
        col, dep = proj[0], proj[1:]
        print(col)
        y = np.maximum(y, 0)
        dom = domain.project(proj)
        cpt = Factor(dom, y.reshape(dom.shape))
        marg = cpt.project(dep)
        cpt /= marg
        cpt2 = np.moveaxis(cpt.project(proj).values, 0, -1)
        
        # sample current column
        synthetic[col] = 0
        rng = itertools.product(*[range(domain[a]) for a in dep])
        for v in rng:
            idx = (synthetic.loc[:,dep].values == np.array(v)).all(axis=1)
            p = cpt2[v].flatten()
            if p.sum() == 0:
                p = np.ones(p.size) / p.size
            n = domain[col]
            N = idx.sum()
            if N > 0:
                synthetic.loc[idx,col] = np.random.choice(n, N, True, p)

    return Dataset(synthetic, domain)
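
A minimal sketch of the input format this function consumes, using the mbi Domain API seen in the other examples; each measurement is a (Q, y, sigma, proj) tuple whose proj lists the sampled column first and its parents after (values here are illustrative, not from the source):

# Hypothetical two-column run: a noisy root marginal over A, then a (B, A)
# measurement from which the CPT of B given A is derived.
domain = Domain(['A', 'B'], [2, 3])
yA = np.array([55.0, 45.0])              # noisy 1-way counts for the root A
yBA = np.abs(np.random.laplace(size=6))  # noisy 2-way counts over (B, A)
measurements = [(None, yA, 1.0, ('A',)),
                (None, yBA, 1.0, ('B', 'A'))]
synth = privbayes_inference(domain, measurements, total=1000)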
Example #3
def max_sum_ve(factors, domain=None, elim=None):
    """ run max-product variable elimination on the factors
    return the most likely assignment as a dictionary where
        keys are attributes
        values are elements of the domain
    """
    # step 0: choose an elimination order
    if domain is None:
        domain = reduce(Domain.merge, [F.domain for F in factors])

    if elim is None:
        cliques = [F.domain.attrs for F in factors]
        elim = graphical_model.greedy_order(domain, cliques, domain.attrs)

    # step 1: variable elimination
    k = len(factors)
    phi = dict(zip(range(k), factors))
    psi = {}
    for z in elim:
        phi2 = [phi.pop(i) for i in list(phi.keys()) if z in phi[i].domain]
        psi[z] = sum(phi2, Factor.ones(domain.project(z)))
        phi[k] = psi[z].max([z])
        k += 1

    value = phi[k - 1]

    # step 2: traceback-MAP
    x = {}
    for z in reversed(elim):
        x[z] = psi[z].condition(x).values.argmax()

    # step 3: convert to a Dataset object
    df = pd.DataFrame(x, index=[0])
    return Dataset(df, domain)
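
A toy call pattern (factor values hypothetical) with two mbi Factors over a 2 x 2 domain:

# The MAP assignment maximizes the sum of the factors: here (A=0, B=1).
dom = Domain(['A', 'B'], [2, 2])
f1 = Factor(dom.project(['A']), np.array([0.0, 1.0]))    # favors A = 1
f2 = Factor(dom, np.array([[0.0, 2.5], [1.0, 0.0]]))     # favors (A=0, B=1)
best = max_sum_ve([f1, f2], domain=dom)
print(best.df)  # one-row DataFrame holding the most likely assignment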
Example #4
def DualQuery(data, workload, eps=1.0, delta=0.001, seed=0):
    prng = np.random.RandomState(seed)
    total = data.df.shape[0]
    domain = data.domain
    answers = answer_workload(workload, data) / total

    nu = 2.0
    s = 50
    #T = int(0.5 * ( np.sqrt(4 * eps * total + s * nu) / np.sqrt(s*nu) + 1 ))
    T = 2
    while 2 * nu * (T - 1) / total * (
            np.sqrt(2 * s * (T - 1) * np.log(1.0 / delta) + s *
                    (T - 1) * np.exp(2 * nu * (T - 1) / total) - 1)) < eps:
        T = T + 1
    T = T - 1

    Qsize = sum(W.shape[0] for _, W in workload)
    Xsize = data.domain.size()

    Q = np.ones(Qsize) / Qsize
    cache = []
    #lookup = [Factor(domain.project(cl), q) for cl, W in workload for q in W]
    lookup = [(cl, W, i) for cl, W in workload for i in range(W.shape[0])]
    results = []

    for i in range(T):
        idx = prng.choice(Qsize, s, True, Q)

        #queries = [lookup[i] for i in idx]
        queries = []
        for j in idx:  # renamed from i to avoid shadowing the outer loop variable
            cl, W, e = lookup[j]
            dom = domain.project(cl)
            n = W.shape[0]
            z = np.zeros(n)
            z[e] = 1.0
            q = W.T.dot(z)
            queries.append(Factor(dom, -q))

        best = max_sum_ve(queries, data.domain)
        curr = answer_workload(workload, best)

        Q *= np.exp(-nu * (answers - curr))
        Q /= Q.sum()

        cache.append((idx, curr))
        results.append(best.df)

    synthetic = Dataset(pd.concat(results), data.domain)

    print('Iterations', T)
    print('Privacy level', nu * T * (T - 1) * s / total)

    delta = 1e-3
    eps = 2 * nu * (T - 1) / total * (
        np.sqrt(2 * s * (T - 1) * np.log(1.0 / delta) + s *
                (T - 1) * np.exp(2 * nu * (T - 1) / total) - 1))
    print('Approx privacy level', eps, delta)

    return synthetic, cache
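
A hypothetical driver, reusing the adult_benchmark helper from Example #14 below and assuming answer_workload is available in the same module:

# Sketch: run DualQuery on the adult benchmark workloads.
data, workloads = adult_benchmark()
synthetic, cache = DualQuery(data, workloads, eps=1.0, delta=1e-3, seed=0)
print(synthetic.df.shape)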
Example #5
def randomKway(name,
               number,
               marginal,
               proj=None,
               seed=0,
               filter=None,
               root_path='./',
               args=None):
    check_size = name in ['adult_orig', 'loans']
    path = os.path.join(root_path, "Datasets/{}.csv".format(name))
    df = pd.read_csv(path)

    domain = os.path.join(root_path, "Datasets/{}-domain.json".format(name))
    config = json.load(open(domain))
    domain = Domain(config.keys(), config.values())

    if name == 'adult':
        if args.adult_seed is not None:
            prng = np.random.RandomState(args.adult_seed)
            mask = prng.binomial(1, 0.9, size=len(df))
            df.loc[:, '_split'] = mask
        else:
            df.loc[:, '_split'] = 1

    if filter is not None:
        col, val = filter
        df = df[df[col] == val].reset_index(drop=True)
        del df[col]

    domain_max = max(domain.config.values())
    dtype = get_min_dtype(domain_max)
    df = df.astype(dtype)

    data = Dataset(df, domain)
    if proj is not None:
        data = data.project(proj)
    return data, randomKwayData(data,
                                number,
                                marginal,
                                seed,
                                check_size=check_size)
Example #6
def get_support(data):
    df_support = []
    for val in list(data.domain.config.values()):
        df_support.append(np.arange(val))
    df_support = list(itertools.product(*df_support))
    df_support = np.array(df_support)
    df_support = pd.DataFrame(df_support, columns=data.df.columns)
    data_support = Dataset(df_support, data.domain)
    A_init = np.ones(len(df_support))
    A_init /= len(A_init)

    return data_support, A_init
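
Note that the support enumerates the full cross-product of the domain, so this only scales to small domains. A tiny sketch of the expected output:

# A 2 x 3 domain yields 6 support rows, each with initial weight 1/6.
data = Dataset.synthetic(Domain(['A', 'B'], [2, 3]), 10)
data_support, A_init = get_support(data)
assert len(data_support.df) == 6 and np.isclose(A_init.sum(), 1.0)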
Example #7
    def load_data(self, path=None):
        """ load the data and discretize the integer/float attributes """
        if path is None:
            path = self.dataset
        df = pd.read_csv(path)
        self.column_order = df.columns

        for col in self.domain_info:
            vals = self.domain_info[col]
            mapping = dict(zip(vals, range(len(vals))))
            df[col] = df[col].map(mapping)

        mapping = { k : k // 100 for k in range(5000) }
        mapping[999998] = 51
        mapping.update({ i : 50 for i in range(5000, 999998) })
        df['INCWAGE_A'] = df['INCWAGE'].map(mapping)

        mod_mapping = { k : 0 for k in range(5000, 999999) }
        for i in range(5001):
            if i % 100 == 0:
                mod_mapping[i] = 0
            elif i % 20 == 0:
                mod_mapping[i] = 1
            elif i % 50 == 0:
                mod_mapping[i] = 2
            elif i % 25 == 0:
                mod_mapping[i] = 3
            elif i % 10 == 0:
                mod_mapping[i] = 4
            elif i % 5 == 0:
                mod_mapping[i] = 5
            elif i % 2 == 0:
                mod_mapping[i] = 6
            else:
                mod_mapping[i] = 7

        df['INCWAGE_B'] = df['INCWAGE'].map(mod_mapping)

        mapping = {}
        for i in range(9999998):
            if i <= 25000:
                mapping[i] = i // 5
            else:
                mapping[i] = 5000

        mapping[9999998] = 5001
        mapping[9999999] = 5002
        df['VALUEH'] = df['VALUEH'].map(mapping)

        return Dataset(df, self.domain)
Example #8
def get_A_init(data, df):
    cols = list(df.columns)
    df = df.groupby(cols).size().reset_index(name='Count')
    A_init = df['Count'].values
    A_init = A_init / A_init.sum()
    del df['Count']
    data_pub = Dataset(df, data.domain)

    # A_init = df.groupby(cols, sort=False).size().values
    # A_init = A_init / A_init.sum()
    # df = df.drop_duplicates()
    # data_pub = Dataset(df, data.domain)

    return data_pub, A_init
Example #9
    def synthetic_data(self, rows=None):
        """ Generate synthetic tabular data from the distribution """
        total = int(self.total) if rows is None else rows
        cols = self.domain.attrs
        data = np.zeros((total, len(cols)), dtype=int)
        df = pd.DataFrame(data, columns=cols)
        cliques = [set(cl) for cl in self.cliques]

        def synthetic_col(counts, total):
            counts *= total / counts.sum()
            frac, integ = np.modf(counts)
            integ = integ.astype(int)
            extra = total - integ.sum()
            #if extra > 0:
            #    o = np.argsort(frac)
            #    integ[o[-extra:]] += 1
            if extra > 0:
                idx = np.random.choice(counts.size, extra, False,
                                       frac / frac.sum())
                integ[idx] += 1
            vals = np.repeat(np.arange(counts.size), integ)
            np.random.shuffle(vals)
            return vals

        order = self.elimination_order[::-1]
        col = order[0]
        marg = self.project([col]).datavector(flatten=False)
        df.loc[:, col] = synthetic_col(marg, total)
        used = {col}

        for col in order[1:]:
            relevant = [cl for cl in cliques if col in cl]
            relevant = used.intersection(set.union(*relevant))
            proj = tuple(relevant)
            used.add(col)
            marg = self.project(proj + (col, )).datavector(flatten=False)

            def foo(group):
                idx = group.name
                vals = synthetic_col(marg[idx], group.shape[0])
                group[col] = vals
                return group

            if len(proj) >= 1:
                df = df.groupby(list(proj)).apply(foo)
            else:
                df[col] = synthetic_col(marg, df.shape[0])

        return Dataset(df, self.domain)
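
The inner synthetic_col helper is a randomized-rounding step: scale the marginal counts to the target total, keep the integer parts, then hand out the leftover units at random in proportion to the fractional parts. A standalone sketch of just that step:

# Round three fractional counts so they sum to exactly total = 7.
counts = np.array([2.0, 3.0, 5.0])
total = 7
scaled = counts * total / counts.sum()  # [1.4, 2.1, 3.5]
frac, integ = np.modf(scaled)
integ = integ.astype(int)               # [1, 2, 3], one unit short of 7
extra = total - integ.sum()
idx = np.random.choice(counts.size, extra, False, frac / frac.sum())
integ[idx] += 1
assert integ.sum() == total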
Example #10
def reverse_data(data, supports):
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        mx = support.sum()
        newdom[col] = int(support.size)
        idx, extra = np.where(support)[0], np.where(~support)[0]
        mask = df[col] == mx
        if extra.size == 0:
            pass
        else:
            df.loc[mask, col] = np.random.choice(extra, mask.sum())
        df.loc[~mask, col] = idx[df.loc[~mask, col]]
    newdom = Domain.fromdict(newdom)
    return Dataset(df, newdom)
Example #11
def get_dummy_data(domain, data_size, query_manager=None):
    dis = {}
    for attr, n in zip(domain.attrs, domain.shape):
        random_dist = np.random.exponential(10, n)
        random_dist = random_dist / np.sum(random_dist)
        dis[attr] = random_dist
    arr = [np.random.choice(n, data_size, p=dis[attr]) for attr, n in zip(domain.attrs, domain.shape)]
    values = np.array(arr).T
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)
    if query_manager is not None:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plt.hist(ans)
        plt.show()

    return data
Example #12
def transform_data(data, supports):
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        size = support.sum()
        newdom[col] = int(size)
        if size < support.size:
            newdom[col] += 1
        mapping = {}
        idx = 0
        for i in range(support.size):
            mapping[i] = size
            if support[i]:
                mapping[i] = idx
                idx += 1
        assert idx == size
        df[col] = df[col].map(mapping)
    newdom = Domain.fromdict(newdom)
    return Dataset(df, newdom)
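
transform_data and reverse_data (Example #10 above) are near-inverses: unsupported values are folded into a single overflow code going down and re-drawn uniformly at random coming back, so the round trip is lossless only on supported values. A hypothetical round trip:

# A 4-value attribute with value 2 unsupported: transform folds 2 into an
# overflow code; reverse re-expands overflow rows to a random unsupported value.
data = Dataset.synthetic(Domain(['A'], [4]), 20)
supports = {'A': np.array([True, True, False, True])}
small = transform_data(data, supports)  # new domain: 3 supported values + 1 overflow
back = reverse_data(small, supports)    # back on the original 4-value domain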
Example #13
def get_pub_dataset_corrupt(data,
                            pub_frac,
                            frac_seed,
                            perturb,
                            perturb_seed,
                            asymmetric=False):
    prng_frac = np.random.RandomState(frac_seed)
    prng_perturb = np.random.RandomState(perturb_seed)

    df_pub = data.df.copy()
    pub_data_size = int(pub_frac * df_pub.shape[0])

    idxs = prng_frac.choice(df_pub.index, size=pub_data_size, replace=False)
    df_pub = df_pub.loc[idxs].reset_index(drop=True)

    mask = prng_perturb.binomial(1, p=perturb, size=df_pub.shape).astype(bool)

    domain = data.domain
    for i, attr in enumerate(df_pub.columns):
        mask_attr = mask[:, i]
        if asymmetric:
            perturbation = 1
        else:
            perturbation = prng_perturb.choice(np.arange(1, domain[attr]),
                                               size=mask_attr.sum(),
                                               replace=True)
        df_pub.loc[mask_attr, attr] += perturbation
        df_pub.loc[mask_attr, attr] %= domain[attr]

    cols = list(df_pub.columns)
    df_pub = df_pub.groupby(cols).size().reset_index(name='Count')
    A_init = df_pub['Count'].values
    A_init = A_init / A_init.sum()
    del df_pub['Count']  # drop the count column so only domain attributes remain

    data_pub = Dataset(df_pub, data.domain)

    return data_pub, A_init
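
A hypothetical call, given a loaded mbi Dataset `data`: keep half of the rows as the public sample and corrupt each cell independently with probability 0.1 (with asymmetric=True every corrupted cell is shifted by exactly 1 modulo the attribute size instead of to a random other value):

# Sketch: public sample with 10% symmetric cell-level noise.
data_pub, A_init = get_pub_dataset_corrupt(
    data, pub_frac=0.5, frac_seed=0, perturb=0.1, perturb_seed=1)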
Example #14
def adult_benchmark():
    data = Dataset.load('../data/adult.csv', '../data/adult-domain.json')

    projections = [('occupation', 'race', 'capital-loss'),
                   ('occupation', 'sex', 'native-country'),
                   ('marital-status', 'relationship', 'income>50K'),
                   ('age', 'education-num', 'sex'),
                   ('workclass', 'education-num', 'occupation'),
                   ('marital-status', 'occupation', 'income>50K'),
                   ('race', 'native-country', 'income>50K'),
                   ('occupation', 'capital-gain', 'income>50K'),
                   ('marital-status', 'hours-per-week', 'income>50K'),
                   ('workclass', 'race', 'capital-gain'),
                   ('marital-status', 'relationship', 'capital-gain'),
                   ('workclass', 'education-num', 'capital-gain'),
                   ('education-num', 'relationship', 'race'),
                   ('fnlwgt', 'hours-per-week', 'income>50K'),
                   ('workclass', 'sex', 'native-country')]

    lookup = {}
    for attr in data.domain:
        n = data.domain.size(attr)
        lookup[attr] = workload.Identity(n)

    lookup['age'] = workload.Prefix(85)
    lookup['fnlwgt'] = workload.Prefix(100)
    lookup['capital-gain'] = workload.Prefix(100)
    lookup['capital-loss'] = workload.Prefix(100)
    lookup['hours-per-week'] = workload.Prefix(99)

    workloads = []

    for proj in projections:
        W = workload.Kronecker([lookup[a] for a in proj])
        workloads.append((proj, W))

    return data, workloads
Example #15
def get_dummy_data2(domain, data_size, query_manager, display=False):
    num_attr = len(domain.attrs)

    bag = {}
    for i in range(len(query_manager.workloads)):
        if len(bag) >= num_attr // 2:
            break
        for attr in query_manager.workloads[i]:
            id = query_manager.att_id[attr]
            if id not in bag:
                attr_size = domain.shape[id]
                bag[id] = np.random.randint(0, attr_size)

    arr = []
    for _ in range(data_size):
        arr.append(get_dummy_row(domain, bag))
    values = np.array(arr)
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)
    if display:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plot_bins(ans, title='Dummy')

    return data
Example #16
def generate(data,
             query_manager,
             epsilon,
             epsilon_0,
             exponential_scale,
             adaptive,
             samples,
             alpha=0,
             timeout=None,
             show_progress=True):
    domain = data.domain
    D = np.sum(domain.shape)
    N = data.df.shape[0]
    Q_size = query_manager.num_queries
    delta = 1.0 / N**2
    beta = 0.05  ## Fail probability

    prev_queries = []
    neg_queries = []
    rho_comp = 0.0000

    q1 = util2.sample(np.ones(Q_size) / Q_size)
    q2 = util2.sample(np.ones(Q_size) / Q_size)
    prev_queries.append(q1)  ## Sample a query from the uniform distribution
    neg_queries.append(q2)  ## Sample a query from the uniform distribution

    real_answers = query_manager.get_answer(data, debug=False)
    neg_real_answers = 1 - real_answers

    final_syn_data = []
    fem_start_time = time.time()
    temp = []

    # T = util.get_rounds(epsilon, epsilon_0, delta)
    T = util2.get_rounds_zCDP(epsilon, epsilon_0, adaptive, delta)
    if show_progress:
        progress_bar = tqdm(total=T)
    status = 'OK'
    for t in range(T):
        eps_t = epsilon_0 + adaptive * t

        if show_progress: progress_bar.update()
        """
        End early after timeout seconds 
        """
        if (timeout is not None) and time.time() - fem_start_time > timeout:
            status = 'Timeout'
            break
        if (timeout is not None
            ) and t >= 1 and (time.time() - fem_start_time) * T / t > timeout:
            status = 'Ending Early ({:.2f}s) '.format(
                (time.time() - fem_start_time) * T / t)
            break
        """
        Sample s times from FTPL
        """
        util2.blockPrint()
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()

        query_workload = query_manager.get_query_workload(prev_queries)
        neg_query_workload = query_manager.get_query_workload(neg_queries)

        for __ in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload,
                                    neg_query_workload, noise, domain, alpha,
                                    temp_s))

            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()

        util2.enablePrint()
        oh_fake_data = []
        assert len(fake_temp) > 0
        for x in fake_temp:
            oh_fake_data.append(x)
            temp.append(x)
            # if current_eps >= epsilon / 2:  ## this trick halves the final error
            # if t >= T / 2:  ## this trick halves the final error
            final_syn_data.append(x)

        assert len(oh_fake_data
                   ) == samples, "len(oh_fake_data) = {} len(fake_temp) = {}".format(
                       len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(
                len(oh_fake_data[0]))
        assert not final_syn_data or len(
            final_syn_data[0]) == D, "D_hat dim = {}".format(
                len(oh_fake_data[0]))

        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(oh_fake_data, domain),
                         columns=domain.attrs), domain)
        """
        Compute Exponential Mechanism distribution
        """
        fake_answers = query_manager.get_answer(fake_data, debug=False)
        neg_fake_answers = 1 - fake_answers

        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        EM_dist_0 = np.exp(eps_t * score * N / 2, dtype=np.float128)
        em_sum = np.sum(EM_dist_0)  # renamed from `sum` to avoid shadowing the builtin
        assert em_sum > 0 and not np.isinf(em_sum)
        EM_dist = EM_dist_0 / em_sum
        assert not np.isnan(
            EM_dist).any(), "EM_dist_0 = {} EM_dist = {} em_sum = {}".format(
                EM_dist_0, EM_dist, em_sum)
        assert not np.isinf(
            EM_dist).any(), "EM_dist_0 = {} EM_dist = {} em_sum = {}".format(
                EM_dist_0, EM_dist, em_sum)
        """
        Sample from EM
        """
        q_t_ind = util2.sample(EM_dist)

        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

    if len(final_syn_data) == 0:
        status = status + '---syn data.'
        fake_data = Dataset.synthetic(domain, 100)
    else:
        if status == 'OK':
            # Return the top half
            final_syn_data = np.array(final_syn_data)
            final_syn_data = final_syn_data[T // 2:, :]
        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(final_syn_data, domain),
                         columns=domain.attrs), domain)
    if show_progress: progress_bar.close()
    return fake_data, status
Example #17
                        help='bounded or unbounded privacy definition')
    parser.add_argument('--frequency', type=int, help='logging frequency')
    parser.add_argument('--seed', type=int, help='random seed')
    parser.add_argument('--save', type=str, help='path to save results')
    parser.add_argument('--load',
                        type=str,
                        help='path to load results from (skips experiment)')
    parser.add_argument('--plot', type=str, help='path to save plot')

    parser.set_defaults(**default_params())
    args = parser.parse_args()

    if args.load:
        results = pickle.load(open(args.load, 'rb'))
    else:
        data = Dataset.load('../data/adult.csv', '../data/adult-domain.json')
        projections = [['race', 'capital-loss', 'income>50K'],
                       ['marital-status', 'capital-gain', 'income>50K'],
                       ['race', 'native-country', 'income>50K'],
                       ['workclass', 'sex', 'hours-per-week'],
                       ['fnlwgt', 'marital-status', 'relationship'],
                       ['workclass', 'education-num', 'occupation'],
                       ['age', 'relationship', 'sex'],
                       ['occupation', 'sex', 'hours-per-week'],
                       ['occupation', 'relationship', 'income>50K']]

        measurements = []
        for p in projections:
            Q = sparse.eye(data.domain.size(p))
            measurements.append((p, Q))
Example #18
from hdmm.templates import DefaultKron, Marginals, DefaultUnionKron
from hdmm import workload
from mbi import FactoredInference, Domain, Dataset
import numpy as np
from IPython import embed

# set up domain and workload
attributes = ['A', 'B', 'C']  # should be the names of the columns; 'A', 'B', 'C' are placeholders
sizes = [32, 32, 32]
dom = Domain(attributes, sizes)
#W = workload.Prefix2D(32)
W = workload.DimKMarginals(sizes, 1)
data = Dataset.synthetic(dom, 1000)

# optimize strategy using HDMM
#template = DefaultKron(sizes)
#template = Marginals(sizes)
template = DefaultUnionKron(sizes, 3)
template.optimize(W)
A = template.strategy()


def take_measurements(A, data):
    """ Efficiently take measurements from HDMM strategy and convert to a PGM-compatable form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        proj = [
Example #19
def randomKway(name, number, marginal, seed=0):
    path = "Datasets/{}.csv".format(name)
    domain = "Datasets/{}-domain.json".format(name)
    data = Dataset.load(path, domain)
    return data, randomKwayData(data, number, marginal, seed)
Example #20
from mbi import Dataset, FactoredInference, Domain
import numpy as np

# discrete domain with attributes A, B, C and corresponding sizes 2 x 3 x 4
domain = Domain(['A','B','C'], [2, 3, 4])

# synthetic dataset with 1000 rows
data = Dataset.synthetic(domain, 1000) 

# project data onto subset of cols, and vectorize
ab = data.project(['A','B']).datavector()
bc = data.project(['B','C']).datavector()

# add noise to preserve differential privacy
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon

np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)

measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')
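
A possible follow-up (a sketch, assuming the fitted model exposes the synthetic_data method shown in Example #9): sample records from the estimated distribution and re-answer a marginal on them.

# Sample synthetic rows from the fitted graphical model and compare the
# (A, B) marginal against the noisy measurement.
synth = model.synthetic_data()
ab_est = synth.project(['A', 'B']).datavector()
print(np.abs(ab_est - yab).sum())  # L1 gap between synthetic and noisy answers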
Example #21
    pb_path+=str(i)
    pb_path+=".csv"
    print(pb_path)
    syn_data_privbayes = Dataset.load(pb_path, domain)

    dq_path=dualquerydata
    dq_path+=str(i)
    dq_path+=".csv"
    print(dq_path)
    syn_data_dualquery= Dataset.load(dq_path, domain)
    '''
    gm_path = gmdata
    gm_path += str(i + 1)
    gm_path += " .csv"
    print(gm_path)
    syn_data_r = Dataset.load(gm_path, domain)

    # err_pb = []
    # err_dq = []
    err_r = []
    print("ss")
    for p, W in workload:
        true = W.dot(data.project(p).datavector())
        #    print(data.project(p).datavector())
        #    pb = W.dot(syn_data_privbayes.project(p).datavector())
        #   print(syn_data_privbayes.project(p).datavector())
        #    dq_data=syn_data_dualquery.project(p).datavector()
        #    dq_data*=total/dq_data.sum()
        #   dq = W.dot(dq_data)
        #  print(syn_data_dualquery.project(p).datavector())
        r = W.dot(syn_data_r.project(p).datavector())
Example #22
def generate(data,
             query_manager,
             epsilon,
             epsilon_0,
             exponential_scale,
             samples,
             alpha=0,
             show_progress=True):
    domain = data.domain
    D = np.sum(domain.shape)
    N = data.df.shape[0]
    Q_size = query_manager.num_queries
    delta = 1.0 / N**2
    beta = 0.05  ## Fail probability

    prev_queries = []
    neg_queries = []
    rho_comp = 0.0000

    q1 = util.sample(np.ones(Q_size) / Q_size)
    q2 = util.sample(np.ones(Q_size) / Q_size)
    prev_queries.append(q1)  ## Sample a query from the uniform distribution
    neg_queries.append(q2)  ## Sample a query from the uniform distribution

    real_answers = query_manager.get_answer(data, debug=False)
    neg_real_answers = 1 - real_answers

    final_syn_data = []
    t = -1
    start_time = time.time()
    temp = []
    if show_progress:
        # progress = tqdm(total=0.5 * epsilon ** 2)
        progress = tqdm(total=epsilon)
    last_eps = 0
    while True:
        """
        End early after 10 minutes
        """
        if time.time() - start_time > 600: break

        t += 1
        rho = 0.5 * epsilon_0**2
        rho_comp += rho  ## EM privacy
        current_eps = rho_comp + 2 * np.sqrt(rho_comp * np.log(1 / delta))

        if current_eps > epsilon:
            break
        if show_progress:
            progress.update(current_eps - last_eps)
            last_eps = current_eps
        """
        Sample s times from FTPL
        """
        util.blockPrint()
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()

        query_workload = query_manager.get_query_workload(prev_queries)
        neg_query_workload = query_manager.get_query_workload(neg_queries)

        for i in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload,
                                    neg_query_workload, noise, domain, alpha,
                                    temp_s))

            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()

        util.enablePrint()
        oh_fake_data = []
        assert len(fake_temp) > 0
        for x in fake_temp:
            oh_fake_data.append(x)
            temp.append(x)
            if current_eps >= epsilon / 2:  ## this trick halves the final error
                final_syn_data.append(x)

        assert len(oh_fake_data
                   ) == samples, "len(oh_fake_data) = {} len(fake_temp) = {}".format(
                       len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(
                len(oh_fake_data[0]))
        assert not final_syn_data or len(
            final_syn_data[0]) == D, "D_hat dim = {}".format(
                len(oh_fake_data[0]))

        fake_data = Dataset(
            pd.DataFrame(util.decode_dataset(oh_fake_data, domain),
                         columns=domain.attrs), domain)
        """
        Compute Exponential Mechanism distribution
        """
        fake_answers = query_manager.get_answer(fake_data, debug=False)
        neg_fake_answers = 1 - fake_answers

        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        EM_dist_0 = np.exp(epsilon_0 * score * N / 2, dtype=np.float128)
        em_sum = np.sum(EM_dist_0)  # renamed from `sum` to avoid shadowing the builtin
        assert em_sum > 0 and not np.isinf(em_sum)
        EM_dist = EM_dist_0 / em_sum
        assert not np.isnan(
            EM_dist).any(), "EM_dist_0 = {} EM_dist = {} em_sum = {}".format(
                EM_dist_0, EM_dist, em_sum)
        assert not np.isinf(
            EM_dist).any(), "EM_dist_0 = {} EM_dist = {} em_sum = {}".format(
                EM_dist_0, EM_dist, em_sum)
        """
        Sample from EM
        """
        q_t_ind = util.sample(EM_dist)

        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

    if len(final_syn_data) == 0:
        final_syn_data = temp
    fake_data = Dataset(
        pd.DataFrame(util.decode_dataset(final_syn_data, domain),
                     columns=domain.attrs), domain)

    return fake_data