Example #1
File: util.py Project: cgebest/kamino
def _analyze_privacy(paras):
    epsilon2 = analysis.epsilon(N=paras['n_row'],
                                batch_size=paras['minibatch_size'],
                                noise_multiplier=paras['noise_multiplier'],
                                delta=paras['delta'],
                                iterations=paras['iterations'] *
                                (paras['n_col'] - 1))

    return epsilon2
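
The helper above simply forwards the DP-SGD hyperparameters to analysis.epsilon, multiplying the iteration count by n_col - 1, apparently because one model is trained per remaining attribute while the first attribute is released separately. A minimal usage sketch follows; the dict keys are the ones the helper actually reads, and the numeric values are hypothetical:

# Minimal usage sketch for _analyze_privacy (values are hypothetical).
paras = {
    'n_row': 30000,            # rows in the true dataset
    'n_col': 10,               # columns in the true dataset
    'minibatch_size': 20,      # rows sampled per DP-SGD iteration
    'noise_multiplier': 1.1,   # DP-SGD noise multiplier
    'delta': 1e-6,             # target delta
    'iterations': 2000,        # iterations per attribute model
}
epsilon2 = _analyze_privacy(paras)  # epsilon spent by DP-SGD across the n_col - 1 models
print(f'epsilon2= {epsilon2:.2f}')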
Example #2
def syn_br2000():
    """
    Group binary attributes in the preprocessing step, and restore them after the HoloMake process.
    """
    start = time.time()

    path_data = f'./testdata/br2000/br2000.csv'
    path_ic = f'./testdata/br2000/br2000.ic'

    # Group binary attributes into one attribute; the returned path includes _concat
    path_data_preproc = preproc_br2000_separate(path_data)

    n_row, n_col = pd.read_csv(path_data_preproc).shape
    n_len = len(str(n_row)) + 1

    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col,  # number of columns in the true dataset
        'epsilon1': .4,  # epsilon1
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 29,  # batch size to sample for each iteration.
        'microbatch_size': 1,  # micro batch size
        'delta': float(f'1e-{n_len}'),  # depends on data size. Do not change for now
        'learning_rate': 1e-4,
        'iterations': 1500  # =1500 for comparison, =1000 for testing learned weights
    }

    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)

        gaussian_std = []
        sensitivity_hist = 2
        std1 = np.sqrt(sensitivity_hist * 2. *
                       np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(1):
            gaussian_std.append(std1)

        sensitivity = 100 * 3  # sample size * number of DCs (denial constraints)
        std_learnW = np.sqrt(sensitivity * 2. * np.log(1.25 / 1e-3)) / 1.
        gaussian_std.append(std_learnW)

        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] *
                                   (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)

        paras['epsilon2'] = epsilon2
        msg = f"epsilon1= {paras['epsilon1']}\t epsilon2= {epsilon2:.2f}\t delta= {paras['delta']}\n" \
              f"epsilon= {epsilon}"
        print(msg)

    syn_data(path_data_preproc, path_ic, paras)
    path_syn = paras['path_syn']

    path_data_postproc = postproc_br2000(path_syn)

    end = time.time()
    logging.info(f'TIME_WALL= {end - start}')

    evaluate_data(path_data, path_data_postproc, path_ic)
    copy_log(paras)
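
std1 above follows the classic Gaussian-mechanism calibration sigma = Delta * sqrt(2 * ln(1.25 / delta)) / epsilon, with the sensitivity variable playing the role of Delta squared. A small sketch of that formula, not a function from the kamino repo; the example delta is hypothetical:

import numpy as np

def gaussian_std(sq_sensitivity, epsilon, delta):
    # sigma = Delta * sqrt(2 * ln(1.25 / delta)) / epsilon, where sq_sensitivity = Delta ** 2
    return np.sqrt(sq_sensitivity * 2. * np.log(1.25 / delta)) / epsilon

# e.g. the grouped-attribute histogram release in syn_br2000
std1 = gaussian_std(2, epsilon=.4, delta=1e-5)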
Example #3
def syn_tpch():
    """
    Synthesize the first three attributes i.i.d., then use tuple embedding for the rest.
    The last two numerical attributes are also synthesized i.i.d.
    """
    start = time.time()

    path_data = f'./testdata/tpch/tpch_order.csv'
    path_data_cat = f'./testdata/tpch/tpch_order_cat.csv'
    path_data_num = f'./testdata/tpch/tpch_order_num.csv'
    path_ic = f'./testdata/tpch/tpch_order.ic'

    # c_custkey, c_mktsegment, c_nationkey,
    # n_name, n_regionkey, o_orderstatus, o_orderpriority,
    # c_acctbal, o_total_price

    df = pd.read_csv(path_data_cat)
    n_row, n_col = df.shape
    n_len = len(str(n_row)) + 1

    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col - 3,  # number of columns in the true dataset
        'epsilon1': .2,  # epsilon1, for iid
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 19,  # batch size to sample for each iteration.
        'microbatch_size': 1,  # micro batch size
        'delta': float(f'1e-{n_len}'),  # depends on data size. Do not change for now
        'learning_rate': 1e-4,
        'iterations': 2000  # number of iterations. Should be large enough: iterations * minibatch_size > n_row
    }

    std1 = None
    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        paras['epsilon2'] = epsilon2

        gaussian_std = []
        sensitivity = 2
        std1 = np.sqrt(sensitivity * 2. *
                       np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(5):
            gaussian_std.append(std1)

        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] *
                                   (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)

        msg = f"epsilon1= {5 * paras['epsilon1']}\t epsilon2= {epsilon2:.2f}\n" \
              f"epsilon= {epsilon}\t delta= {paras['delta']}"
        print(msg)

    # c_custkey, c_mktsegment, c_nationkey,
    custkeys = list(df['c_custkey'].unique())
    mkesgements = list(df['c_mktsegment'].unique())
    nationkeys = list(df['c_nationkey'].unique())
    probas_custkeys = []
    probas_mkesgements = []
    probas_nationkeys = []

    temp_df_custkeys = df['c_custkey'].value_counts()
    for custkey in custkeys:
        count = temp_df_custkeys[custkey]
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_custkeys.append(noisy_count)
    probas_custkeys = np.array(probas_custkeys) / np.sum(probas_custkeys)

    temp_df_mkesgements = df['c_mktsegment'].value_counts()
    for mkesgement in mkesgements:
        count = temp_df_mkesgements[mkesgement]
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_mkesgements.append(noisy_count)
    probas_mkesgements = np.array(probas_mkesgements) / np.sum(
        probas_mkesgements)

    temp_df_nationkeys = df['c_nationkey'].value_counts()
    for nationkey in nationkeys:
        count = temp_df_nationkeys[nationkey]
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_nationkeys.append(noisy_count)
    probas_nationkeys = np.array(probas_nationkeys) / np.sum(probas_nationkeys)

    syn_custkeys = []
    syn_mkesgements = []
    syn_nationkeys = []

    while len(syn_custkeys) < df.shape[0]:
        custkey = np.random.choice(custkeys, p=probas_custkeys)

        if custkey in syn_custkeys:
            idx = syn_custkeys.index(custkey)
            nationkey = syn_nationkeys[idx]
            mkesgement = syn_mkesgements[idx]
        else:
            mkesgement = np.random.choice(mkesgements, p=probas_mkesgements)
            nationkey = np.random.choice(nationkeys, p=probas_nationkeys)

        syn_custkeys.append(custkey)
        syn_mkesgements.append(mkesgement)
        syn_nationkeys.append(nationkey)

    dicti = {
        'c_custkey': syn_custkeys,
        'c_mktsegment': syn_mkesgements,
        'c_nationkey': syn_nationkeys
    }
    syn = pd.DataFrame(dicti)

    path_tmp = '/tmp/tpch_3.csv'
    syn.to_csv(path_tmp, index=False)

    synthesize_continue(path_data=path_data_cat,
                        path_syn_old=path_tmp,
                        path_constraint=path_ic,
                        paras=paras,
                        rand_sequence=False,
                        ic_sampling=True)

    path_syn = paras['path_syn']

    # for the numerical attributes
    df_num = _syn_tpch_num(path_data_num, std1)
    df_cat = pd.read_csv(path_syn)

    df_syn = df_cat.copy()
    for num_attr in list(df_num.columns):
        df_syn[num_attr] = df_num[num_attr].copy()

    df_syn.to_csv(path_syn, index=False)

    logging.info(
        f'path_data_cat= {path_data_cat}\npath_data_num= {path_data_num}\n'
        f'path_ic= {path_ic}\npath_syn= {path_syn}')
    logging.info(f'{paras}')

    end = time.time()
    time_wall = end - start
    time_train = paras['TIME_TRAINING']
    time_seq = paras['TIME_SEQ']
    time_sampling = time_wall - time_seq * 2 - time_train
    logging.info(f'TIME_WALL= {time_wall}')
    logging.info(f'TIME_SEQ= {time_seq}')
    logging.info(f'TIME_TRAINING= {time_train}')
    logging.info(f'TIME_SAMPLING= {time_sampling}')

    evaluate_data(path_data, path_syn, path_ic)
    copy_log(paras)
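
The three per-attribute blocks for c_custkey, c_mktsegment and c_nationkey above repeat the same pattern: take the value counts, add Gaussian noise with standard deviation std1, clip at zero, and normalize into a sampling distribution. A hypothetical helper that makes the pattern explicit (not part of the repo):

import numpy as np
import pandas as pd

def noisy_marginal(column: pd.Series, std: float):
    # Gaussian-noised, clipped and normalized marginal of a categorical column (sketch).
    values = list(column.unique())
    counts = column.value_counts()
    noisy = np.array([max(0, counts[v] + np.random.normal(0, std)) for v in values])
    return values, noisy / np.sum(noisy)

# e.g. custkeys, probas_custkeys = noisy_marginal(df['c_custkey'], std1)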
Example #4
def syn_adult100():
    """
    Quick run on the adult data with 100 rows.
    """
    start = time.time()

    path_data = f'./testdata/a100/adult100.csv'
    path_ic = f'./testdata/a100/adult100.ic'

    path_data_preproc = preproc_adult(path_data)

    n_row, n_col = pd.read_csv(path_data_preproc).shape
    n_len = len(str(n_row)) + 1

    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col,  # number of columns in the true dataset
        'epsilon1': .1,
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 23,
        'microbatch_size': 1,
        'delta': float(f'1e-{n_len}'),
        'learning_rate': 1e-4,
        'iterations': 10
    }

    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        paras['epsilon2'] = epsilon2

        gaussian_std = []
        sensitivity = 2
        std1 = np.sqrt(sensitivity * 2. *
                       np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(1):
            gaussian_std.append(std1)

        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] *
                                   (paras['n_col'] - 1),
                                   delta=paras['delta'],
                                   gaussian_std=gaussian_std)

        msg = f"epsilon1= {paras['epsilon1']}\t epsilon2= {epsilon2:.2f}\t delta= {paras['delta']}\n" \
              f"epsilon= {epsilon}"
        print(msg)

    syn_data(path_data_preproc, path_ic, paras)
    path_syn = paras['path_syn']

    path_data_postproc = postproc_adult(path_syn)

    end = time.time()
    logging.info(f'TIME_WALL= {end - start}')

    evaluate_data(path_data, path_data_postproc, path_ic)
    copy_log(paras)
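
All of the configurations above derive delta from the data size in the same way: n_len is the number of decimal digits of n_row plus one, so delta = 10 ** (-n_len) is strictly smaller than 1 / n_row. A worked example with the 100-row adult data:

n_row = 100                   # adult100 has 100 rows
n_len = len(str(n_row)) + 1   # 3 digits + 1 = 4
delta = float(f'1e-{n_len}')  # 1e-4, which is < 1 / n_row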
Example #5
File: syn_tax.py Project: cgebest/kamino
def syn_tax():
    """
    The full tax dataset with City and Zip, under the private setup.
    Zip and City are synthesized i.i.d.
    Revision note: this is the chosen setup.
    """
    start = time.time()

    path_data = f'./testdata/tax/tax30k.csv'
    path_ic = f'./testdata/tax/tax30k.ic'

    path_data_part = f'./testdata/tax/tax30k_part.csv'
    path_ic_part = f'./testdata/tax/tax30k_part.ic'

    df = pd.read_csv(path_data)

    n_row, n_col = df.shape
    n_len = len(str(n_row)) + 1

    paras = {
        'reuse_embedding': True,  # set True to reuse the embedding
        'dp': True,  # set True to enable privacy
        'n_row': n_row,  # number of rows in the true dataset
        'n_col': n_col - 2,  # number of columns in the true dataset
        'epsilon1': .4,  # epsilon1, for iid
        'l2_norm_clip': 1.0,
        'noise_multiplier': 1.1,
        'minibatch_size': 18,  # batch size to sample for each iteration.
        'microbatch_size': 1,  # micro batch size
        'delta': float(f'1e-{n_len}'),  # depends on data size. Do not change for now
        'learning_rate': 1e-4,
        'iterations': 2000  # number of iterations. Should be large enough: iterations * minibatch_size > N
    }

    if paras['dp']:
        epsilon2 = _analyze_privacy(paras)
        paras['epsilon2'] = epsilon2

        gaussian_std = []
        sensitivity = 2
        std1 = np.sqrt(sensitivity * 2. * np.log(1.25 / paras['delta'])) / paras['epsilon1']
        for i in range(3):
            gaussian_std.append(std1)

        epsilon = analysis.epsilon(N=n_row,
                                   batch_size=paras['minibatch_size'],
                                   noise_multiplier=paras['noise_multiplier'],
                                   iterations=paras['iterations'] * (paras['n_col']-1),
                                   delta=paras['delta'], gaussian_std=gaussian_std)

        msg = f"epsilon1= {3 * paras['epsilon1']}\tepsilon2= {epsilon2:.2f}\t" \
              f"delta= {paras['delta']}\nepsilon= {epsilon}"
        print(msg)

    syn_data(path_data_part, path_ic_part, paras)
    path_syn_part = paras['path_syn']

    # continue on the 'Zip', 'City'
    zips = list(df['Zip'].unique())
    cities = list(df['City'].unique())
    probas_zips = []
    probas_cities = []

    temp_df_zips = df['Zip'].value_counts()
    for crnt_zip in zips:
        count = temp_df_zips[crnt_zip]
        # noise = np.random.laplace(0, 1.0 / paras['epsilon1'])
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_zips.append(noisy_count)
    probas_zips = np.array(probas_zips) / np.sum(probas_zips)

    temp_df_cities = df['City'].value_counts()
    for city in cities:
        count = temp_df_cities[city]
        # noise = np.random.laplace(0, 1.0 / paras['epsilon1'])
        noise = np.random.normal(0, std1)
        noisy_count = max(0, count + noise)
        probas_cities.append(noisy_count)
    probas_cities = np.array(probas_cities) / np.sum(probas_cities)

    df_syn = pd.read_csv(path_syn_part)

    syn_zips = []
    syn_cities = []
    syn_states = df_syn['State'].tolist()

    while len(syn_zips) < len(syn_states):
        idx = len(syn_zips)
        crnt_state = syn_states[idx]
        crnt_zip = np.random.choice(zips, p=probas_zips)
        previous_city = None
        # check if crnt_zip violates zip->state
        if crnt_zip in syn_zips:
            previous_idx = syn_zips.index(crnt_zip)
            previous_state = syn_states[previous_idx]
            previous_city = syn_cities[previous_idx]
            if previous_state != crnt_state:
                continue

        # at this point, Zip -> State holds
        if previous_city is None:
            crnt_city = np.random.choice(cities, p=probas_cities)
        else:
            crnt_city = previous_city

        syn_zips.append(crnt_zip)
        syn_cities.append(crnt_city)

    df_syn['Zip'] = syn_zips
    df_syn['City'] = syn_cities
    tmp_syn_path = '/tmp/tax_iid.csv'
    df_syn.to_csv(tmp_syn_path, index=False)

    synthesize_continue(path_data=path_data, path_syn_old=tmp_syn_path, path_constraint=path_ic,
                        paras=paras, rand_sequence=False, ic_sampling=True)

    path_syn = paras['path_syn']

    end = time.time()
    time_wall = end - start
    paras['TIME_WALL'] = time_wall
    time_sampling = time_wall - paras['TIME_TRAINING']
    logging.info(f"TIME_TRAINING= {paras['TIME_TRAINING']}")
    logging.info(f'TIME_SAMPLING= {time_sampling}')
    logging.info(f'TIME_ALL= {time_wall}')

    evaluate_data(path_data, path_syn, path_ic)

    copy_log(paras)
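
The while loop in syn_tax enforces two dependencies on the i.i.d. part: Zip -> State (a drawn Zip already paired with a different State is rejected and redrawn) and Zip -> City (a repeated Zip reuses the City previously assigned to it). A condensed sketch of that logic for a single row, written as a hypothetical helper (numpy assumed imported as np, as in the code above):

def draw_zip_city(crnt_state, syn_zips, syn_states, syn_cities,
                  zips, probas_zips, cities, probas_cities):
    # Draw a (Zip, City) pair consistent with Zip -> State and Zip -> City (sketch).
    while True:
        crnt_zip = np.random.choice(zips, p=probas_zips)
        if crnt_zip in syn_zips:
            prev = syn_zips.index(crnt_zip)
            if syn_states[prev] != crnt_state:
                continue                        # would violate Zip -> State, redraw
            return crnt_zip, syn_cities[prev]   # reuse the City to keep Zip -> City
        return crnt_zip, np.random.choice(cities, p=probas_cities)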