Exemplo n.º 1
0
def test_bm3d(data1, data2, domain):
    """Print TVDs of a noisy marginal before and after BM3D denoising.

    For each hard-coded ``(marginal, noise)`` case the marginal of ``data2``
    is perturbed with Gaussian noise of scale ``noise``.  Three values are
    printed: the TVD of the raw noisy histogram, the TVD after calling
    ``tools.bm3d_denoise`` on it, and the TVD after denoising the noisy
    histogram concatenated (along axis 1) with the marginal of ``data1``.

    Args:
        data1: Dataset whose marginal is placed next to the noisy one.
        data2: Dataset whose marginal is perturbed and evaluated.
        domain: Domain object passed through to ``tools.get_marginal``.
    """
    cases = (
        ([5, 7, 8], 1000),
        ([2, 3], 800),
        ([6, 7, 9], 1800),
        ([4, 5, 7], 1600),
        ([0, 2], 630),
        ([2, 3, 8], 920),
        ([0, 2, 3], 430),
        ([2, 4, 7], 1200),
        ([3, 5], 1500),
    )
    for marginal, noise in cases:
        print(marginal)
        side_hist = tools.get_marginal(data1, domain, marginal)
        true_hist = tools.get_marginal(data2, domain, marginal)

        perturbation = np.random.normal(scale=noise, size=true_hist.shape)
        noisy_hist = true_hist + perturbation
        query_tvd = tools.get_TVD(true_hist, noisy_hist)
        print('        query TVD: {:.4f}'.format(query_tvd))

        denoised = tools.bm3d_denoise(marginal, noisy_hist, noise, 'normal')
        denoised_tvd = tools.get_TVD(true_hist, denoised)
        print('        bm3d TVD: {:.4f}'.format(denoised_tvd))

        # Denoise with the data1 marginal placed in front along axis 1, then
        # keep only the columns from index true_hist.shape[1] onwards.
        stacked = np.concatenate([side_hist, noisy_hist], axis=1)
        stacked_denoised = tools.bm3d_denoise(marginal, stacked, noise,
                                              'normal')
        first_kept_column = true_hist.shape[1]
        recovered = stacked_denoised[:, first_kept_column:]

        recovered_tvd = tools.get_TVD(true_hist, recovered)
        print('        concatenated bm3d TVD: {:.4f}'.format(recovered_tvd))
Exemplo n.º 2
0
def test_data_TVD(data1, data2, domain, test_num=10, k=4, attr_num=10):
    """Print the TVD between two datasets on randomly chosen k-way marginals.

    All ``k``-combinations of the attribute ids ``0 .. attr_num - 1`` are
    shuffled and the first ``test_num`` of them are evaluated.  The marginal
    of ``data2`` is rescaled to the size of ``data1`` before comparison.

    The printed average is taken over the marginals actually evaluated, so
    it stays correct when fewer than ``test_num`` combinations exist, and
    ``test_num=0`` no longer raises ``ZeroDivisionError``.

    Args:
        data1: Reference dataset; its length normalizes the TVD.
        data2: Dataset compared against ``data1``.
        domain: Domain object passed through to ``tools.get_marginal``.
        test_num: Maximum number of marginals to evaluate.
        k: Number of attributes per marginal.
        attr_num: Number of attributes to draw combinations from.
    """
    data_num1 = len(data1)
    data_num2 = len(data2)
    print(f"data num1: {data_num1}, data num2: {data_num2}")

    candidates = list(itertools.combinations(range(attr_num), k))
    random.shuffle(candidates)
    # The slice already limits the number of marginals, so no counter is
    # needed; max() keeps a negative test_num from silently slicing off the
    # tail of the list.
    sampled = candidates[:max(test_num, 0)]

    total_tvd = 0
    for marginal in sampled:
        hist1 = tools.get_marginal(data1, domain, marginal)
        hist2 = tools.get_marginal(data2, domain, marginal)

        # Bring data2's counts onto data1's scale before comparing.
        hist2 = hist2 * data_num1 / data_num2
        tvd = np.sum(np.abs(hist1 - hist2)) / 2 / data_num1
        print("    {} TVD: {:.4f}".format(marginal, tvd))

        total_tvd += tvd

    average_tvd = total_tvd / len(sampled) if sampled else 0.0
    print("average TVD: {:.4f}".format(average_tvd))
Exemplo n.º 3
0
def plot_miles(df):
    """Plot the log10 histogram of the last column of ``df``, capped at 400.

    The last column is taken as a single-attribute dataset, values above 400
    are clamped to 400, and a one-way marginal over ``0 .. max`` is computed
    with ``tools.get_marginal``.  Empty cells are set to 1 so the log10 is
    defined, and the result is written to ``./evaluate/trip_miles.pdf``.

    The clamp is applied to a new array, so ``df`` is never modified even
    when ``DataFrame.to_numpy`` returns a view of its data.

    Args:
        df: pandas DataFrame whose last column holds the values to plot.
    """
    miles = df.to_numpy()[:, -1].reshape((-1, 1))

    # np.minimum allocates a fresh array; an in-place assignment here could
    # write through to the caller's DataFrame.
    miles = np.minimum(miles, 400)

    # Reduce with ndarray.max() so a scalar (not a 1-element array) is
    # converted to int.
    max_mile = int(miles.max())

    print('max mile', max_mile)

    domain = Domain({0: max_mile + 1}, [0])

    hist = tools.get_marginal(miles, domain, (0, ))

    # Avoid log10(0) for empty cells.
    hist[hist == 0] = 1
    hist = np.log10(hist)

    ptools.plot_list(hist,
                     './evaluate/trip_miles.pdf',
                     size=(20.0, 2.5),
                     zero_line=True)
Exemplo n.º 4
0
def plot_marginal2(data, domain):
    """Print and plot the log10 one-way marginal of attributes 2, 5, 7, 8, 9.

    For each attribute the raw counts (with zeros replaced by 1) and their
    log10 are printed, and the log10 curve is written to
    ``./evaluate/<attr>.pdf`` through ``ptools.plot_list``.

    Args:
        data: Dataset passed through to ``tools.get_marginal``.
        domain: Domain object passed through to ``tools.get_marginal``.
    """
    attrs_to_plot = (2, 5, 7, 8, 9)
    for attr in attrs_to_plot:
        counts = tools.get_marginal(data, domain, (attr, ))
        empty_cells = counts == 0
        counts[empty_cells] = 1  # keep log10 finite for empty cells
        print(attr)
        print(counts)
        log_counts = np.log10(counts)
        print(log_counts)
        out_path = f'./evaluate/{attr}.pdf'
        ptools.plot_list(log_counts, out_path)
Exemplo n.º 5
0
def check_marginal(data0, data1, data2, domain):
    """Print diagnostics on the (0, 2) and (3,) marginals and run a cell TVD.

    First prints how many cells of the (0, 2) marginal hold fewer than 10
    records in ``data1`` and ``data2``.  Then normalizes the (3,) marginal
    of ``data2``, prints the positions whose share exceeds 0.06 and the
    shares at a fixed list of positions.  Finally, for the (0, 2, 3)
    marginal, calls ``tools.get_cell_TVD`` with the three datasets'
    histograms and the normalized (3,) distribution.

    Args:
        data0: Dataset labelled "2019" in the original comments.
        data1: Dataset labelled "2018" in the original comments.
        data2: Dataset labelled "dp" in the original comments.
        domain: Domain object passed through to the ``tools`` helpers.
    """
    pair = (0, 2)
    pair_hist1 = tools.get_marginal(data1, domain, pair)
    pair_hist2 = tools.get_marginal(data2, domain, pair)
    for pair_hist in (pair_hist1, pair_hist2):
        print(np.sum(pair_hist < 10))

    attr3_counts = tools.get_marginal(data2, domain, (3, ))
    attr3_dist = attr3_counts / np.sum(attr3_counts)
    print(np.where(attr3_dist > 0.06))
    feature_pos = [30, 36, 37, 42, 50, 51, 52]
    print(attr3_dist[feature_pos])

    marginals = [(0, 2, 3)]
    for marginal in marginals:
        # One histogram per dataset, in the order 2019, 2018, dp.
        hist_2019, hist_2018, hist_dp = (tools.get_marginal(dataset, domain,
                                                            marginal)
                                         for dataset in (data0, data1, data2))

        print(hist_2018.shape, hist_dp.shape)

        # The return value is not used, matching the original.
        tools.get_cell_TVD(domain, marginal, hist_2019, hist_2018, hist_dp,
                           attr3_dist, [0, 2])
Exemplo n.º 6
0
def plot_low_p_cell_data(data, domain):
    """Plot the attribute-1 distribution for low- and high-count cells.

    Cells of attribute 2 whose count is below 30000 are treated as "low"
    cells.  The rows of ``data`` are split by whether column 2 falls in a
    low cell, and the normalized attribute-1 marginals of all rows (black),
    low-cell rows (green) and high-cell rows (yellow, shifted up by 0.02)
    are drawn together with a red zero line into ``./temp/company.pdf``.

    Args:
        data: 2-D array; column 2 is compared against the low-cell ids.
        domain: Domain object passed through to ``tools.get_marginal``.
    """
    attr2_hist = tools.get_marginal(data, domain, (2, ))
    low_p_cells = np.where(attr2_hist < 30000)[0]
    print(low_p_cells)

    in_low_cell = np.isin(data[:, 2], low_p_cells)
    low_rows = data[in_low_cell]
    high_rows = data[~in_low_cell]

    print('low cell: ', low_rows.shape)
    print('high cell: ', high_rows.shape)

    # Attribute-1 marginals for: everything, low-cell rows, high-cell rows.
    raw_hists = [
        tools.get_marginal(rows, domain, (1, ))
        for rows in (data, low_rows, high_rows)
    ]
    all_dist, low_dist, high_dist = (counts / np.sum(counts)
                                     for counts in raw_hists)
    # Offset the high-cell curve by 0.02, as the original does.
    high_dist = high_dist + 0.02

    plt.rcParams['figure.figsize'] = (11.0, 2.5)
    plt.rcParams['savefig.dpi'] = 200
    plt.locator_params(nbins=10)
    plt.figure()

    curves = ((all_dist, 'black'), (low_dist, 'green'), (high_dist, 'yellow'))
    for dist, colour in curves:
        plt.plot(dist, color=colour)

    # Red zero line for reference.
    plt.plot([0] * len(all_dist), 'r')

    plt.savefig('./temp/company.pdf', bbox_inches='tight')
Exemplo n.º 7
0
def check_marginal_distribution(path1, path2, attr_list):
    """Print per-attribute differences between the marginals of two files.

    Both files are read with ``tools.read_data`` using the module-level
    ``domain_path``.  Only the ``anchor_attr`` column(s) of the second file
    are copied into an otherwise all-zero array before comparison.  For each
    attribute the first 20 entries of both marginals, of their difference
    and of the relative difference ``(m1 - m2) / (m1 + 1)`` are printed,
    followed by how many cells exceed 1%, 5% and 10% relative difference.

    Args:
        path1: Path of the first data file.
        path2: Path of the second data file (read with ``dtype=None``).
        attr_list: Attribute ids whose one-way marginals are compared.
    """
    data1, _, _ = tools.read_data(path1, domain_path)
    raw_data2, domain, _ = tools.read_data(path2, domain_path, dtype=None)

    # Keep only the anchor column(s) of the second dataset; the rest is 0.
    data2 = np.zeros(shape=raw_data2.shape)
    data2[:, anchor_attr] = raw_data2[:, anchor_attr]

    preview = slice(None, 20)
    for attr in attr_list:
        single = (attr, )
        marginal1 = tools.get_marginal(data1, domain, single)
        marginal2 = tools.get_marginal(data2, domain, single)

        print(f'attr {attr}')
        print(marginal1[preview])
        print(marginal2[preview])

        dist_array = marginal1 - marginal2
        print(dist_array[preview])

        # +1 in the denominator avoids dividing by zero on empty cells.
        check_array = (marginal1 - marginal2) / (marginal1 + 1)
        print(check_array[preview])

        abs_array = np.abs(check_array)
        over_1, over_5, over_10 = (np.sum(abs_array > level)
                                   for level in (0.01, 0.05, 0.10))
        print(f'{len(check_array)}: {over_1} {over_5} {over_10}\n')
Exemplo n.º 8
0
def compare_data_marginal(data1, data2, domain):
    """Print TVD, and cell TVD where applicable, for a fixed marginal list.

    The cell TVD is printed only for marginals containing attribute 0 (head
    attributes ``[0, 2]``) or, failing that, attribute 3 (head attributes
    ``[2, 3]``).

    Args:
        data1: First dataset.
        data2: Second dataset.
        domain: Domain object passed through to the ``tools`` helpers.
    """
    marginal_list = [
        (0, 2),
        (0, 1, 2),
        (0, 2, 3),
        (2, 3),
        (0, 2, 3, 5),
        (0, 2, 3, 6),
        (0, 2, 3, 7),
        (0, 2, 3, 8),
    ]

    for marginal in marginal_list:
        print(marginal)
        hist1 = tools.get_marginal(data1, domain, marginal)
        hist2 = tools.get_marginal(data2, domain, marginal)

        print("    tvd: {:.4f}".format(tools.get_TVD(hist1, hist2)))

        # Pick the head attributes; marginals matching neither rule skip
        # the cell-level comparison.
        if 0 in marginal:
            head_attrs = [0, 2]
        elif 3 in marginal:
            head_attrs = [2, 3]
        else:
            continue

        cell_tvd = tools.get_cell_TVD(domain, marginal, hist1, hist2,
                                      head_attrs)
        print("    cell tvd: {:.4f}".format(cell_tvd))
Exemplo n.º 9
0
def truncate_and_plot(data_path, domain_path, prefix):
    """Truncate selected numeric attributes, then plot every common attribute.

    The data is read with ``tools.read_data``; each attribute listed in the
    threshold table is truncated via ``atools.truncate_data``.  One PDF per
    attribute in the module-level ``common_attrs`` is written to ``./info/``
    and the ``payment_type`` marginal is printed.

    Args:
        data_path: Path of the data file.
        domain_path: Path of the domain description.
        prefix: Prefix used in the names of the generated PDF files.
    """
    data, domain, headings = tools.read_data(data_path, domain_path)

    threshold_map = {
        'fare': 100,
        'tips': 20,
        'trip_miles': 40,
        'trip_total': 100,
        'trip_seconds': 10000,
    }
    for attr, threshold in threshold_map.items():
        data, domain = atools.truncate_data(data, domain,
                                            headings.index(attr), threshold)

    for attr in common_attrs:
        attr_id = headings.index(attr)
        out_path = f'./info/{prefix}_{attr}_{attr_id}.pdf'
        ptools.plot_attr(attr_id, data, domain, path=out_path)

    payment_marginal = tools.get_marginal(
        data, domain, (headings.index('payment_type'), ))
    print(payment_marginal)
Exemplo n.º 10
0
def truncate_and_plot(data_path, domain_path, prefix):
    """Truncate selected numeric attributes, then plot every schema attribute.

    The data is read with ``tools.read_data``; each attribute listed in
    ``threshold_map`` is truncated via ``atools.truncate_data``.  The list
    of attributes to plot is taken from the keys of ``schema`` in
    ``./data/parameters.json``.  One PDF per attribute is written to
    ``./info/`` and the ``payment_type`` marginal is printed.

    Args:
        data_path: Path of the data file.
        domain_path: Path of the domain description.
        prefix: Prefix used in the names of the generated PDF files.
    """
    data, domain, headings = tools.read_data(data_path, domain_path)

    threshold_map = {
        'fare': 100,
        'tips': 20,
        'trip_miles': 40,
        'trip_total': 100,
        'trip_seconds': 10000,
    }
    for attr in threshold_map:
        data, domain = atools.truncate_data(data, domain,
                                            headings.index(attr),
                                            threshold_map[attr])

    # Use a context manager so the parameters file is closed right after it
    # is parsed instead of being left to the garbage collector.
    with open('./data/parameters.json', 'r') as parameters_file:
        parameters_json = json.load(parameters_file)
    attrs = list(parameters_json['schema'].keys())

    for attr in attrs:
        attr_id = headings.index(attr)
        ptools.plot_attr(attr_id, data, domain,
                         path=f'./info/{prefix}_{attr}_{attr_id}.pdf')

    print(tools.get_marginal(data, domain,
                             (headings.index('payment_type'), )))
Exemplo n.º 11
0
def _load_json(path):
    """Return the parsed JSON document at ``path``, closing the file."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


def _dump_json(obj, path):
    """Write ``obj`` as JSON to ``path``, closing (and flushing) the file."""
    with open(path, 'w') as json_file:
        json.dump(obj, json_file)


def train_MRF(root, weights, public_trip_data, public_taxi_id, trip_data,
              trip_domain, taxi_id, epsilon, budget, downsample_data_dict):
    """Fit two Markov random fields and return the merged synthetic trips.

    Cells of attribute 2 are split by their count in ``public_trip_data``
    (below 30000 is "low", otherwise "high").  One ``MarkovRandomField`` is
    built for the high cells and one for the low cells; each is either
    trained with ``entropy_descent`` or restored from a saved parameter
    file, depending on the ``load_high_p_model`` / ``load_low_p_model``
    config flags.  Both models generate synthetic data, attribute 2 is
    mapped back to the original cell ids, the result is written to a CSV
    file and taxi ids are attached with ``assign_taxi_id``.

    The module-level names ``base_config`` and ``submit`` are read.  The
    same ``config`` dict is handed to both models and is mutated between
    the two constructions, so statement order matters.

    Args:
        root: Path prefix for every file read or written.
        weights: Forwarded to ``MarkovRandomField`` as ``weights``.
        public_trip_data: Public data; provides the cell split and masks.
        public_taxi_id: Forwarded to ``assign_taxi_id``.
        trip_data: Private data the models are fitted to.
        trip_domain: Domain object of the trip data.
        taxi_id: Forwarded to ``MarkovRandomField``.
        epsilon: Privacy parameter; stored in the config and used to pick
            which saved marginal lists / graph are loaded.
        budget: Stored in the config as ``beta0``.
        downsample_data_dict: Attached to both models.

    Returns:
        An ``int32`` array with one more column than the synthetic data:
        column 0 holds the assigned taxi ids, the rest the synthetic rows
        (low-cell rows first, then high-cell rows).
    """
    print('training MRF')
    print(trip_domain)
    config = base_config.copy()
    config['epsilon'] = epsilon
    config['beta0'] = budget

    if not submit:
        print(sys.argv[1])

    # Saved marginal lists and graphs are looked up under one of three
    # epsilon buckets rather than under the exact epsilon.
    if epsilon < 0.5:
        model_epsilon = 0.1
    elif epsilon < 4.0:
        model_epsilon = 1.0
    else:
        model_epsilon = 10.0

    print(public_trip_data.shape, trip_data.shape)

    marginal_list = _load_json(root + 'temp/trip' + str(model_epsilon) +
                               '_marginal_list_save.json')

    # Attributes not covered by any loaded marginal get a one-way marginal.
    in_attr_set = set()
    for marginal in marginal_list:
        in_attr_set |= set(marginal)
    for attr in set(range(trip_data.shape[1])) - in_attr_set:
        marginal_list.append([attr])

    bins_map = {}
    re_bins_map = {}
    hierarchy_marginals = []
    # Maps a marginal to a count threshold; empty, so no masks are built.
    # Entries tried previously: (2, 3): 5, (0, 2, 3): 0, (5, 7): 10.
    mask_marginal = {}
    bm3d_marginal = []
    gpu = True

    noisy_taxi_num = int(2.8e5)

    test_marginal_list = [(2, 3), (0, 1, 2), (0, 2, 3)]

    if config['load_graph']:
        graph = nx.node_link_graph(
            _load_json(root + 'temp/' + config['exp_name'] + '_' +
                       str(model_epsilon) + '_graph.json'))
    else:
        init_model = NestedGraphicalModel(trip_data, trip_domain, config,
                                          config['data_name'])
        graph, _ = init_model.construct_model()
        # NOTE(review): the graph is saved under the exact epsilon but the
        # load branch above reads the model_epsilon bucket - confirm that
        # this asymmetry is intended.
        _dump_json(
            nx.node_link_data(graph),
            root + 'temp/' + config['exp_name'] + '_' + str(epsilon) +
            '_graph.json')

    marginal_mask = {}
    for marginal in mask_marginal:
        threshold = mask_marginal[marginal]
        public_hist = tools.get_marginal(public_trip_data, trip_domain,
                                         marginal)
        mask = public_hist <= threshold

        marginal_mask[marginal] = mask

        print("generate mask {}, threshold: {:.2f}, mask ratio: {:.4f}".format(
            marginal, threshold,
            np.sum(mask) / mask.size))

    hist = tools.get_marginal(public_trip_data, trip_domain, (2, ))
    low_p_cells = np.where(hist < 30000)[0]
    print('low_p', len(low_p_cells), low_p_cells)

    high_p_cells = np.array(
        list(set(list(range(78))) - set(list(low_p_cells))))
    print('high_p', len(high_p_cells), high_p_cells)

    # Each model works on compact cell indices; these functions map them
    # back to the original attribute-2 ids (-1 for an unknown index).
    low_p_map = dict(enumerate(low_p_cells))
    low_f = np.vectorize(lambda x: low_p_map.get(x, -1))

    high_p_map = dict(enumerate(high_p_cells))
    high_f = np.vectorize(lambda x: high_p_map.get(x, -1))

    # ---- first model: built with p_cells=high_p_cells ----
    config['marginal_coefficent'] = 1
    config['divide_data'] = 'low_p'
    config['estimation_iter_num'] = 5000
    config['final_iter_num'] = 8000
    if epsilon < 5.0:
        config['total_marginal_num'] = 13
        config['ed_step_num'] = 2
    else:
        config['total_marginal_num'] = 14
        config['ed_step_num'] = 2

    config['theta1'] = 2.0
    config['theta2'] = 2.0
    model = MarkovRandomField(public_trip_data, trip_data, trip_domain, graph,
                              marginal_list, config, bins_map, re_bins_map,
                              hierarchy_marginals, mask_marginal,
                              marginal_mask, bm3d_marginal, taxi_id, gpu=gpu,
                              noisy_taxi_num=noisy_taxi_num, weights=weights,
                              p_cells=high_p_cells)
    # Restrict masks that involve attribute 2 to the high cells.
    p_marginal_mask = {}
    for marginal in mask_marginal:
        if 2 in marginal:
            idx = list(marginal).index(2)
            print(idx, marginal, marginal_mask[marginal].shape)
            p_marginal_mask[marginal] = np.take(marginal_mask[marginal],
                                                high_p_cells,
                                                axis=idx)
        else:
            p_marginal_mask[marginal] = marginal_mask[marginal]
    model.marginal_mask = p_marginal_mask
    model.public_marginal_list = []
    model.downsample_data_dict = downsample_data_dict
    model.init()
    if not config['load_high_p_model']:
        model.entropy_descent([1] * config['ed_step_num'])
        _dump_json(
            model.measure_list,
            root + 'temp/' + config['exp_name'] + str(config['epsilon']) +
            '_marginal_list.json')
        model.save_parameters(root + 'temp/model1' + str(config['exp_name']) +
                              '_high.mdl')
    else:
        model.load_parameters(root + 'temp/model1' + str(config['exp_name']) +
                              '_high.mdl')
    model.test_TVD(test_marginal_list)

    # ---- second model: built with p_cells=low_p_cells ----
    config['marginal_coefficent'] = 1
    config['divide_data'] = 'high_p'
    config['estimation_iter_num'] = 2000
    config['final_iter_num'] = 4000
    if epsilon < 5.0:
        config['total_marginal_num'] = 13
        config['ed_step_num'] = 1
    else:
        config['total_marginal_num'] = 14
        config['ed_step_num'] = 1

    config['theta1'] = 1.0
    config['theta2'] = 1.0
    marginal_list = _load_json(root + 'temp/trip' + str(model_epsilon) +
                               '_marginal_list_save_low_p.json')
    in_attr_set = set()
    for marginal in marginal_list:
        in_attr_set |= set(marginal)
    for attr in set(range(trip_data.shape[1])) - in_attr_set:
        marginal_list.append([attr])

    model2 = MarkovRandomField(public_trip_data, trip_data, trip_domain,
                               graph, marginal_list, config, bins_map,
                               re_bins_map, hierarchy_marginals,
                               mask_marginal, marginal_mask, bm3d_marginal,
                               taxi_id, gpu=gpu,
                               noisy_taxi_num=noisy_taxi_num, weights=weights,
                               p_cells=low_p_cells)
    model2.downsample_data_dict = model.downsample_data_dict
    # Restrict masks that involve attribute 2 to the low cells.
    p_marginal_mask = {}
    for marginal in mask_marginal:
        if 2 in marginal:
            idx = list(marginal).index(2)
            p_marginal_mask[marginal] = np.take(marginal_mask[marginal],
                                                low_p_cells,
                                                axis=idx)
        else:
            p_marginal_mask[marginal] = marginal_mask[marginal]
    model2.marginal_mask = p_marginal_mask
    if epsilon > 5.0:
        model2.public_marginal_list = [
            (0, 2, 3, 5),
            (2, 3, 7),
        ]
    else:
        model2.public_marginal_list = [
            (0, 2, 3, 5),
            (2, 3, 7),
            (5, 7, 8)
        ]
    model2.init()
    if not config['load_low_p_model']:
        model2.entropy_descent([1] * config['ed_step_num'])
        # NOTE(review): this is the same path the first model's measure
        # list is written to, so that file is overwritten - confirm intent.
        _dump_json(
            model2.measure_list,
            root + 'temp/' + config['exp_name'] + str(config['epsilon']) +
            '_marginal_list.json')
        model2.save_parameters(root + 'temp/model2' + str(config['exp_name']) +
                               '_low.mdl')
    else:
        model2.load_parameters(root + 'temp/model2' + str(config['exp_name']) +
                               '_low.mdl')
    model2.test_TVD(test_marginal_list)

    high_p_data = model.synthetic_data(total=int(config['data_num_ratio'] *
                                                 model.noisy_trip_num))

    low_p_data = model2.synthetic_data(total=int(config['data_num_ratio'] *
                                                 model2.noisy_trip_num))

    # Translate compact cell indices back to the original attribute-2 ids.
    low_p_data[:, 2] = low_f(low_p_data[:, 2])
    high_p_data[:, 2] = high_f(high_p_data[:, 2])

    if not submit:
        config['temp_save_data_name'] = sys.argv[1]
        print(
            'write csv',
            root + config['temp_save_data_name'] + '_' + str(epsilon) + '.csv')

    data = np.concatenate([low_p_data, high_p_data], axis=0)
    df = pd.DataFrame(data, columns=list(range(10)))
    df.to_csv(root + config['temp_save_data_name'] + '_' + str(epsilon) +
              '.csv',
              index=False)

    # Column 0 is reserved for the taxi id; the synthetic rows follow.
    dp_trip_data = np.zeros(shape=(data.shape[0], data.shape[1] + 1),
                            dtype=np.int32)
    dp_trip_data[:, 1:] = data

    dp_taxi_id = assign_taxi_id(data, public_trip_data, public_taxi_id)
    dp_trip_data[:, 0] = dp_taxi_id

    return dp_trip_data
Exemplo n.º 12
0
def _print_public_evaluation():
    """Run the public-data evaluation, printing every result to stdout."""
    df_2018 = pd.read_csv('./preprocess/2014.csv')
    df_public = pd.read_csv('./preprocess/ground_truth.csv')

    dropped_columns = ['taxi_id', 'trip_day_of_week', 'trip_hour_of_day']
    df_2018 = df_2018.drop(columns=dropped_columns)
    df_public = df_public.drop(columns=dropped_columns)

    data_2018 = df_2018.to_numpy()
    data_public = df_public.to_numpy()

    with open('./preprocess/domain.json', 'r') as domain_file:
        domain_dict = json.load(domain_file)
    # Re-key the domain sizes by column position instead of column name.
    domain_dict = {
        i: domain_dict[column]
        for i, column in enumerate(df_2018.columns)
    }
    domain = Domain(domain_dict, list(range(len(domain_dict))))
    print(str(domain))

    hist = tools.get_marginal(data_public, domain, (2, ))
    low_p_cells = np.where(hist < 30000)[0]
    print('low p:', low_p_cells)

    high_p_cells = np.array(
        list(set(list(range(78))) - set(list(low_p_cells))))
    print('high p:', high_p_cells)

    # The original built a copy with an extra id column and then discarded
    # it; only the plain CSV contents were ever evaluated.
    data_dp = pd.read_csv('./save_2_10.0.csv').to_numpy()

    print('evaluate overall')
    evaluate(data_2018, data_dp, domain)

    labelled_cells = (('evaluate low p', low_p_cells),
                      ('evaluate high p', high_p_cells))
    for label, p_cells in labelled_cells:
        print(label)

        # Boolean indexing copies, so the full arrays are left untouched.
        p_2018 = data_2018[np.isin(data_2018[:, 2], p_cells)]
        p_dp = data_dp[np.isin(data_dp[:, 2], p_cells)]

        # Renumber the selected cells as 0 .. len(p_cells) - 1.
        p_map = {cell: index for index, cell in enumerate(p_cells)}
        f = np.vectorize(lambda x: p_map.get(x, -1))

        p_2018[:, 2] = f(p_2018[:, 2])
        p_dp[:, 2] = f(p_dp[:, 2])

        assert (not (p_2018[:, 2] == -1).any())
        assert (not (p_dp[:, 2] == -1).any())
        print(np.max(p_2018[:, 2]))
        print(np.max(p_dp[:, 2]))

        p_domain_dict = domain.dict.copy()
        p_domain_dict[2] = len(p_cells) + 1
        p_domain = Domain(p_domain_dict, domain.attr_list)

        evaluate(p_2018, p_dp, p_domain)


def evaluate_public():
    """Evaluate ``./save_2_10.0.csv`` against ``./preprocess/2014.csv``.

    All output is written to ``./evaluate/evaluate_log_2014_gt.txt``.  The
    evaluation runs once on the whole data and once each for the low- and
    high-count cells of attribute 2, where the split is derived from
    ``./preprocess/ground_truth.csv`` (count below 30000 means "low").

    Unlike a bare ``sys.stdout = open(...)``, the redirection is scoped to
    this call: the log file is closed and the previous ``sys.stdout`` is
    restored on return, including when an exception is raised.
    """
    import contextlib

    with open('./evaluate/evaluate_log_2014_gt.txt', 'w') as log_file:
        with contextlib.redirect_stdout(log_file):
            _print_public_evaluation()
Exemplo n.º 13
0
def test_marginal(data1, data2, domain):
    """Print cell TVDs between per-shift slices and their group sum.

    Attribute 0 is prepended to the marginal, so the histogram's first axis
    is indexed by the attribute-0 value.  For each group of attribute-0
    values the slices are summed into a reference histogram, and every
    single slice is compared against that reference with
    ``tools.get_cell_TVD`` (head attribute ``[2]``).  The mean and the
    individual values are printed per group.

    Because of the ``break`` statements only the first marginal,
    ``(2, 3, 5)``, and only ``data1`` are processed; ``data2`` is unused.

    Args:
        data1: Dataset that is analysed.
        data2: Unused.
        domain: Domain object passed through to the ``tools`` helpers.
    """
    shift_groups = [
        [0, 3, 6, 9, 12, 15, 18],  # all nights
        [5, 8, 11, 14, 17],  # weekday morning
        [4, 7, 10, 13, 16],  # weekday afternoon
        [1, 2, 19, 20],
    ]

    for marginal in [(2, 3, 5), (2, 3, 7), (2, 3, 8)]:
        print(marginal)
        for data in [data1]:
            with_shift = [0]
            with_shift.extend(marginal)
            hist = tools.get_marginal(data, domain, with_shift)

            for shifts in shift_groups:
                tvd_array = np.full(shape=len(shifts),
                                    fill_value=-1,
                                    dtype=float)
                std_hist = np.sum(hist[shifts], axis=0)
                for pos, shift in enumerate(shifts):
                    _, cell_tvd = tools.get_cell_TVD(domain, marginal,
                                                     std_hist, hist[shift],
                                                     [2])
                    tvd_array[pos] = cell_tvd

                print(np.mean(tvd_array), tvd_array)
            # Only the first dataset is examined.
            break
        # Only the first marginal is examined.
        break
Exemplo n.º 14
0
def get_mask(data, domain, marginal, threshold=0):
    """Return a 0/1 mask marking the marginal cells above ``threshold``.

    Args:
        data: Dataset passed through to ``tools.get_marginal``.
        domain: Domain object passed through to ``tools.get_marginal``.
        marginal: Attribute ids of the marginal.
        threshold: Cells with a count greater than this value get 1, cells
            with a count less than or equal to it get 0.

    Returns:
        An array with the shape and dtype of the marginal histogram.
    """
    hist = tools.get_marginal(data, domain, marginal)
    above = hist > threshold
    at_or_below = hist <= threshold

    mask = hist.copy()
    mask[above] = 1
    mask[at_or_below] = 0
    return mask
Exemplo n.º 15
0
def plot_marginal(data1, data2, domain):
    """Plot how cell counts of ``data1`` are distributed.

    For the marginal ``(2, 5, 7, 8, 9)`` the distinct cell counts and how
    often each occurs are computed; the running sum of the log10 of those
    occurrence numbers is plotted against the distinct counts and saved as
    ``./<marginal>_counts_dist.pdf``.  Afterwards the flattened one-way
    marginal of attribute 1 of ``data1`` is plotted to
    ``./(1,)_dist_dp.pdf``.  The attribute-1 marginal of ``data2`` is
    computed but not drawn.

    Args:
        data1: Dataset that is plotted.
        data2: Dataset whose attribute-1 marginal is computed only.
        domain: Domain object passed through to ``tools.get_marginal``.
    """

    def start_figure():
        # Same rcParams / locator / figure sequence as the original uses
        # before each plot.
        plt.rcParams['figure.figsize'] = (11.0, 2.5)
        plt.rcParams['savefig.dpi'] = 200
        plt.locator_params(nbins=10)
        plt.figure()

    for marginal in [(2, 5, 7, 8, 9)]:
        cell_counts = tools.get_marginal(data1, domain, marginal)
        unique, cnt = np.unique(cell_counts, return_counts=True)
        print(marginal)
        print(cnt)
        cnt = np.log10(cnt)
        print(cnt)

        # Replace each entry by the running total up to and including it.
        running = 0
        for pos, value in enumerate(cnt):
            running += value
            cnt[pos] = running

        start_figure()
        plt.plot(unique, cnt, 'o', color='blue')
        plt.savefig(f'./{marginal}_counts_dist.pdf', bbox_inches='tight')

    start_figure()

    flat = (-1, )
    hist1 = tools.get_marginal(data1, domain, (1, )).reshape(flat)
    hist2 = tools.get_marginal(data2, domain, (1, )).reshape(flat)

    plt.plot(hist1, 'o', color='blue')

    plt.savefig('./(1,)_dist_dp.pdf', bbox_inches='tight')
Exemplo n.º 16
0
def test_mask(gt_trip_data, trip_data_list, trip_domain, taxi_id_list, domain):
    """Print how masking low-count cells affects noisy marginal queries.

    For every ``(marginal, threshold, noise)`` entry of the module-level
    ``test_mask_list`` a mask is built from ``gt_trip_data`` with
    ``get_mask``.  Gaussian noise of scale ``noise`` is added to the
    marginal (negative values clipped to 0) and TVDs with and without
    masking are printed, first for the ground-truth data itself and then
    for every dataset in ``trip_data_list``.  Several diagnostic plots are
    written below ``./temp/`` and ``./info/``.

    Args:
        gt_trip_data: Data the mask is derived from.
        trip_data_list: Datasets the mask is tested on.
        trip_domain: Domain used for all marginals and masks.
        taxi_id_list: Unused.
        domain: Domain handed to ``tools.get_cell_TVD``.
    """
    data_num = len(gt_trip_data)

    for marginal, threshold, noise in test_mask_list:
        mask = get_mask(gt_trip_data,
                        trip_domain,
                        marginal,
                        threshold=threshold)
        domain_size = trip_domain.project(marginal).size()
        masked_ratio = 1 - np.sum(mask) / mask.size
        print("{} mask ratio: {:.2f}, domain size: {}, average num: {}".format(
            marginal, masked_ratio, domain_size, data_num / domain_size))
        two_way = len(marginal) == 2
        if two_way:
            ptools.plot_img(mask, f'./temp/{marginal}_mask.pdf')

        # Cells that the mask removes.
        masked_out = mask == 0

        gt_hist = tools.get_marginal(gt_trip_data, trip_domain, marginal)
        gt_noisy = gt_hist + np.random.normal(scale=noise,
                                              size=gt_hist.shape)
        gt_noisy[gt_noisy < 0] = 0
        print('    gt query TVD: {:.4f}'.format(
            tools.get_TVD(gt_hist, gt_noisy)))

        gt_masked = gt_noisy.copy()
        gt_masked[masked_out] = 0
        print('    gt masked TVD: {:.4f}'.format(
            tools.get_TVD(gt_hist, gt_masked)))

        for trip_data in trip_data_list:
            # Cell-level TVDs are only reported when a head-attribute rule
            # matches the marginal.
            if 0 in marginal:
                head_attrs = [0, 2]
            elif 3 in marginal:
                head_attrs = [2, 3]
            else:
                head_attrs = None
            cell = head_attrs is not None

            test_hist = tools.get_marginal(trip_data, trip_domain, marginal)
            noisy_hist = test_hist + np.random.normal(scale=noise,
                                                      size=test_hist.shape)
            noisy_hist[noisy_hist < 0] = 0
            print('        test query TVD: {:.4f}'.format(
                tools.get_TVD(test_hist, noisy_hist)))
            if cell:
                query_cell_tvd = tools.get_cell_TVD(domain, marginal,
                                                    test_hist, noisy_hist,
                                                    head_attrs)
                print('        test query cell TVD: {:.4f}'.format(
                    query_cell_tvd))

            masked_test_hist = test_hist.copy()
            masked_test_hist[masked_out] = 0

            masked_hist = noisy_hist.copy()
            masked_hist[masked_out] = 0

            if cell:
                inner_cell_tvd = tools.get_cell_TVD(domain, marginal,
                                                    test_hist,
                                                    masked_test_hist,
                                                    head_attrs)
                print('        test inner cell TVD: {:.4f}, head attrs: {}'.
                      format(inner_cell_tvd, head_attrs))

            print('        test inner TVD: {:.4f}'.format(
                tools.get_TVD(test_hist, masked_test_hist)))
            print('        test masked TVD: {:.4f}'.format(
                tools.get_TVD(test_hist, masked_hist)))

            if cell:
                masked_cell_tvd = tools.get_cell_TVD(domain, marginal,
                                                     test_hist, masked_hist,
                                                     head_attrs)
                print('        test masked cell TVD: {:.4f}'.format(
                    masked_cell_tvd))

            removed_counts = test_hist[masked_out]
            print('        test avearge num: {:.4f}'.format(
                np.sum(removed_counts) / np.sum(masked_out)))
            print('        test large cell num: {:.4f}'.format(
                np.sum(removed_counts > 100)))

            mask2 = get_mask(trip_data, trip_domain, marginal, threshold=10)

            print(np.sum((mask2 == 0) & masked_out))
            print(np.sum(masked_out))

            if two_way:
                ptools.plot_img(mask, f'./info/{marginal}_mask.pdf')
                ptools.plot_img(mask2, f'./info/{marginal}_mask_dp.pdf')

                ptools.plot_img(gt_hist, f'./info/{marginal}_marginal.pdf')
                ptools.plot_img(test_hist,
                                f'./info/{marginal}_marginal_test.pdf')

            uniques, cnts = np.unique(removed_counts, return_counts=True)
            print(list(zip(uniques, cnts)))
            ptools.plot_x_y(uniques,
                            cnts,
                            f'./info/{marginal}_masked_counts.pdf',
                            zero_line=True)

            if marginal == (0, 2, 3):
                # Keep only the masked-out cells and report non-empty
                # entries where the second and third index coincide.
                temp_hist = test_hist.copy()
                temp_hist[~masked_out] = 0
                for j in range(78):
                    diagonal = temp_hist[:, j, j]
                    pos = np.where(diagonal > 0)[0]
                    if len(pos):
                        print(j, pos, len(pos))
                        for value in diagonal[pos]:
                            print(value)

        print('')
Exemplo n.º 17
0
def evaluate(gt_data, dp_data, domain):
    """Print TVD and score statistics comparing ``dp_data`` to ``gt_data``.

    First, the TVD (normalized by ``len(gt_data)``) is printed for four
    fixed marginals, together with the cell TVD returned by
    ``tools.get_cell_TVD`` where applicable.  Then, for every 4-way
    marginal ``[0, 2, a, b]`` with ``a < b`` drawn from the remaining
    attributes of ``0 .. 9``, both histograms are reshaped so that each row
    is one (attr 0, attr 2) cell, rows are normalized by their sum plus 1,
    and the score is one minus the mean row-wise TVD.  The average score
    and the per-attribute mean scores are printed at the end.

    Args:
        gt_data: Ground-truth dataset (2-D array).
        dp_data: Dataset under evaluation (2-D array).
        domain: Domain object; its ``dict`` is used to build sub-domains.
    """
    print(gt_data.shape, dp_data.shape)

    data_num = len(gt_data)
    fixed_marginals = [(0, 1, 2), (0, 2, 3), (0, 2, 5), (2, 3)]
    for marginal in fixed_marginals:
        gt_marginal = tools.get_marginal(gt_data, domain, marginal)
        dp_marginal = tools.get_marginal(dp_data, domain, marginal)
        marginal_tvd = np.sum(np.abs(gt_marginal - dp_marginal)) / 2 / data_num
        print("{} TVD: {:.4f}".format(marginal, marginal_tvd))

        if len(marginal) == 1:
            ptools.plot_list_list([gt_marginal, dp_marginal],
                                  path=f'./evaluate/{marginal}.pdf')

        if {0, 2} <= set(marginal):
            _, cell_tvd = tools.get_cell_TVD(domain, marginal, gt_marginal,
                                             dp_marginal, [0, 2],
                                             f'./{marginal}_cell_tvd.pdf')
            print("    cell TVD: {:.4f}".format(cell_tvd))
        elif marginal == (2, 3):
            _, cell_tvd = tools.get_cell_TVD(domain, marginal, gt_marginal,
                                             dp_marginal, [2])
            print("    cell TVD: {:.4f}".format(cell_tvd))

    # Every pair of attributes other than 0 and 2, combined with 0 and 2.
    other_attrs = [attr for attr in range(10) if attr not in (0, 2)]
    marginal_list = [[0, 2, first, second]
                     for first, second in itertools.combinations(other_attrs, 2)]

    result_dict = {attr: [] for attr in range(10)}
    all_scores = []

    for marginal in marginal_list:
        temp_domain = Domain(domain.dict, list(marginal))
        gt_marginal, _ = np.histogramdd(gt_data[:, marginal],
                                        bins=temp_domain.edge())
        dp_marginal, _ = np.histogramdd(dp_data[:, marginal],
                                        bins=temp_domain.edge())

        tvd = tools.get_TVD(gt_marginal, dp_marginal)

        # One row per (attr 0, attr 2) cell, one column per joint value of
        # the remaining two attributes.
        s1, s2, s3, s4 = gt_marginal.shape
        flat_shape = (s1 * s2, s3 * s4)
        gt_rows = gt_marginal.reshape(flat_shape)
        dp_rows = dp_marginal.reshape(flat_shape)

        # +1 keeps empty rows from dividing by zero.
        gt_rows = gt_rows / (np.sum(gt_rows, keepdims=True, axis=1) + 1)
        dp_rows = dp_rows / (np.sum(dp_rows, keepdims=True, axis=1) + 1)

        scores = np.sum(np.abs(gt_rows - dp_rows), axis=1) / 2

        score = 1 - np.sum(scores) / scores.size
        print("  {} score: {:.4f}, tvd: {:.4f}".format(marginal, score, tvd))

        all_scores.append(score)
        for attr in marginal:
            result_dict[attr].append(score)

    if marginal_list:
        print("average score: {:.4f}".format(
            sum(all_scores) / len(marginal_list)))

    for attr, attr_scores in result_dict.items():
        print("{} score: {:.4f}".format(attr,
                                        sum(attr_scores) / len(attr_scores)))