示例#1
0
def evaluation_occupancy_statistics(n, df_subsampled_from, day_profile):
    """Train linear and deep metrics on arrival statistics and sanitize ``day_profile``.

    Pairs are sampled from ``df_subsampled_from``, labelled through
    ``Similarity.label_via_silhouette_analysis`` on the ``"arrival"``
    statistic, and used to train one ``Linear_Metric`` and one
    ``Deep_Metric``.  Each trained model is passed to ``util.sanitize_data``
    and the statistics loss of the rounded result is measured against
    ``day_profile``.

    Args:
        n: anonymity level forwarded to ``util.sanitize_data``.
        df_subsampled_from: data from which the training pairs are sampled.
        day_profile: database to sanitize; also the ground truth for the loss.

    Returns:
        ``(profiles, losses, subsample_size)`` where ``profiles`` and
        ``losses`` are 4-tuples ordered best / baseline / linear / deep.

    Note:
        ``util``, ``pe``, ``sanitized_profile_best``,
        ``sanitized_profile_baseline``, ``loss_best_metric`` and
        ``loss_generic_metric`` are not assigned in this function; they are
        resolved from the enclosing scope when the function runs.
    """
    stat_mode = "arrival"

    # step 4: the requested sample size equals the number of 2-combinations
    # of the rows in the source data.
    pair_count = int(round(int(comb(len(df_subsampled_from), 2))))
    sampler = Subsampling(data=df_subsampled_from)
    pairs, _pair_index = sampler.uniform_sampling(subsample_size=pair_count,
                                                  seed=None)

    # The (simulated) user labels each pair as similar / dissimilar.
    labeller = Similarity(data=pairs)
    labeller.extract_interested_attribute(interest='statistics',
                                          stat_type=stat_mode)
    labels, _ = labeller.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: learn both metrics from the same labelled pairs.
    linear_metric = Linear_Metric()
    linear_metric.train(pairs, labels)

    deep_metric = Deep_Metric()
    deep_metric.train(pairs, labels)

    def _sanitize_with(model):
        # Both models go through the "deep" code path of sanitize_data.
        return util.sanitize_data(day_profile,
                                  distance_metric="deep",
                                  anonymity_level=n,
                                  rep_mode='mean',
                                  deep_model=model)

    def _loss_of(profile):
        # The sanitized profile is rounded before the loss is computed.
        return pe.get_statistics_loss(data_gt=day_profile,
                                      data_sanitized=profile.round(),
                                      mode=stat_mode)

    # step 6: sanitize the database with each learned metric.
    profile_linear = _sanitize_with(linear_metric)
    profile_deep = _sanitize_with(deep_metric)

    # Evaluation: information loss of each sanitized database.
    loss_linear = _loss_of(profile_linear)
    loss_deep = _loss_of(profile_deep)

    print('anonymity level %s' % n)
    print("sampled size %s" % pair_count)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_linear)
    print("information loss with learned metric deep  %s" % (loss_deep))

    profiles = (sanitized_profile_best, sanitized_profile_baseline,
                profile_linear, profile_deep)
    losses = (loss_best_metric, loss_generic_metric, loss_linear, loss_deep)
    return profiles, losses, pair_count
示例#2
0
    # Active-sampling run for the deep metric: pairs and their labels are
    # accumulated across iterations and a new Deep_Metric is trained on the
    # accumulated set each time.
    pairdata_active_deep = []
    pairdata_label_active_deep = []
    sp.reset()
    k = k_init
    # NOTE(review): `k` is not modified in the lines visible here; presumably
    # it is advanced further down in the loop body — confirm.
    while k <= run_sample_size:
        # `dm` on the first pass is whatever was bound before this excerpt;
        # on later passes it is the model trained at the end of the previous
        # iteration.
        pairdata, pairdata_idx = sp.active_sampling(dist_metric=dm,
                                                    k_init=k_init,
                                                    batch_size=1,
                                                    seed=seed_vec[mc_i])
        # `+` builds a new object rather than extending in place, so the
        # object appended to `pairdata_each_mc_deep` below is not mutated by
        # later iterations (assuming these are plain lists).
        pairdata_active_deep = pairdata_active_deep + pairdata

        # Look up the precomputed labels for the newly selected pairs.
        similarity_label = similarity_label_all_series.loc[
            pairdata_idx].tolist()
        pairdata_label_active_deep = pairdata_label_active_deep + similarity_label

        # Retrain from scratch on everything collected so far.
        dm = Deep_Metric()
        dm.train(pairdata_active_deep, pairdata_label_active_deep)

        sanitized_profile_deep = util.sanitize_data(
            day_profile,
            distance_metric="deep",
            anonymity_level=anonymity_level,
            rep_mode=rep_mode,
            deep_model=dm)

        # Loss is computed on the rounded sanitized profile.
        loss_learned_metric_deep = pe.get_information_loss(
            data_gt=day_profile,
            data_sanitized=sanitized_profile_deep.round(),
            window=window)
        # Record this iteration's loss and the pairs used to obtain it.
        loss_active_mc_deep.append(loss_learned_metric_deep)
        pairdata_each_mc_deep.append(pairdata_active_deep)
示例#3
0
def evaluation_occupancy_statistics(n, mode="arrival"):
    """Compare four sanitization metrics on an occupancy statistic.

    Loads the binary occupancy database, sanitizes its first 90 rows with a
    self-defined metric, a Euclidean metric, a learned ``Linear_Metric`` and
    a learned ``Deep_Metric``, and reports the statistics loss of each.  The
    learned metrics are trained on pairs drawn from the remaining rows.

    Args:
        n: anonymity level forwarded to ``util.sanitize_data``.
        mode: statistic of interest, forwarded as ``mode`` / ``stat_type`` to
            the sanitization, labelling and loss routines.

    Returns:
        ``(profiles, losses, subsample_size)`` where ``profiles`` and
        ``losses`` are 4-tuples ordered best / baseline / linear / deep.

    Note:
        ``util`` and ``pe`` are not created here; they are resolved from the
        enclosing scope when the function runs.
    """
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    # Clamp every positive entry to 1 so the data is binary.
    day_profile1[day_profile1 > 0] = 1

    res = 15

    # Keep every `res`-th column to improve the speed for demonstration
    # purposes.  The first 90 rows are the database to publish.
    day_profile = day_profile1.iloc[:90, 0::res]
    # The remaining rows are used to train the metrics.
    # NOTE(review): the `-1` stop excludes the final row — confirm intended.
    day_profile2 = day_profile1.iloc[90:-1, 0::res]

    rep_mode = 'mean'
    anonymity_level = n

    sanitized_profile_best = util.sanitize_data(
        day_profile,
        distance_metric='self-defined',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        mode=mode)

    sanitized_profile_baseline = util.sanitize_data(
        day_profile,
        distance_metric='euclidean',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile, data_sanitized=sanitized_profile_best, mode=mode)

    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_baseline,
        mode=mode)

    # Shuffle the training rows.
    df_subsampled_from = day_profile2.sample(frac=1)

    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    # Report the pair count itself; the row count was printed here before,
    # which contradicted the message.
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=mode)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric.
    # Both models are passed through the "deep" path of sanitize_data.
    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm)

    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile.round(),
        mode=mode)

    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=mode)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size
示例#4
0
def evaluation_total_usage(n):
    """
    Demo of special-purpose publication for an energy database.

    The data user wants the published database to retain as much information
    as possible about cumulative energy use inside a time window.  Four
    sanitizations of the same database are produced (self-defined metric,
    Euclidean metric, learned linear metric, learned deep metric) and the
    statistics loss of each is printed and returned.

    Args:
        n: desired anonymity level.

    Returns:
        ``(profiles, losses, subsample_size)`` where ``profiles`` and
        ``losses`` are 4-tuples ordered best / baseline / linear / deep.
    """
    # Helper objects used throughout.
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: load the database to be published; keep the first 90 rows and
    # every 4th column to speed up the demonstration, then renumber the rows.
    energy = pd.read_pickle('dataset/dataframe_all_energy.pkl').fillna(0)
    energy = energy.iloc[:90, ::4]
    energy.index = range(len(energy.index))

    # step 2: the user's interest is the usage inside `window`, given as the
    # start and end of the period of interest.
    interest = 'window-usage'
    window = [17, 21]

    # Arguments shared by several calls below.
    sanitize_kwargs = {'anonymity_level': n, 'rep_mode': 'mean'}
    loss_kwargs = {'data_gt': energy, 'mode': interest, 'window': window}

    profile_best = util.sanitize_data(energy,
                                      distance_metric='self-defined',
                                      mode=interest,
                                      window=window,
                                      **sanitize_kwargs)

    # step 3: pre-sanitize the database with the generic metric.
    profile_baseline = util.sanitize_data(energy,
                                          distance_metric='euclidean',
                                          **sanitize_kwargs)

    loss_best = pe.get_statistics_loss(data_sanitized=profile_best,
                                       **loss_kwargs)
    loss_baseline = pe.get_statistics_loss(data_sanitized=profile_baseline,
                                           **loss_kwargs)

    # Training pairs come from the distinct, shuffled rows of the
    # pre-sanitized database.
    pair_source = profile_baseline.drop_duplicates().sample(frac=1)
    pair_total = int(comb(len(pair_source), 2))

    print('total number of pairs is %s' % pair_total)

    # step 4: sample pairs from the pre-sanitized database.
    sample_count = int(round(pair_total))
    sampler = Subsampling(data=pair_source)
    pairs, _pair_index = sampler.uniform_sampling(subsample_size=sample_count,
                                                  seed=None)

    # The (simulated) user labels each pair as similar / dissimilar.
    labeller = Similarity(data=pairs)
    labeller.extract_interested_attribute(interest='statistics',
                                          stat_type=interest,
                                          window=window)
    labels, _ = labeller.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: learn both metrics from the same labelled pairs.
    linear_metric = Linear_Metric()
    linear_metric.train(pairs, labels)

    deep_metric = Deep_Metric()
    deep_metric.train(pairs, labels)

    # step 6: sanitize the original database with each learned metric; both
    # go through the "deep" path of sanitize_data.
    learned_kwargs = dict(distance_metric="deep", window=window,
                          **sanitize_kwargs)
    profile_deep = util.sanitize_data(energy,
                                      deep_model=deep_metric,
                                      **learned_kwargs)
    profile_linear = util.sanitize_data(energy,
                                        deep_model=linear_metric,
                                        **learned_kwargs)

    # Evaluation.  Only the deep profile is rounded before its loss is
    # computed; the linear profile is evaluated as-is.
    loss_deep = pe.get_statistics_loss(data_sanitized=profile_deep.round(),
                                       **loss_kwargs)
    loss_linear = pe.get_statistics_loss(data_sanitized=profile_linear,
                                         **loss_kwargs)

    print('anonymity level %s' % n)
    print("sampled size %s" % sample_count)
    print("information loss with best metric %s" % loss_best)
    print("information loss with generic metric %s" % loss_baseline)
    print("information loss with learned metric %s" % loss_linear)
    print("information loss with learned metric deep  %s" % (loss_deep))

    profiles = (profile_best, profile_baseline, profile_linear, profile_deep)
    losses = (loss_best, loss_baseline, loss_linear, loss_deep)
    return profiles, losses, sample_count
示例#5
0
    # Uniform-sampling run: at each sample size `k`, draw `k` pairs, train a
    # linear and a deep metric on them, and sanitize the database with each.
    pairlabel_each_mc = []
    k = k_init
    # NOTE(review): `k` is not modified in the lines visible here; presumably
    # it is advanced further down in the loop body — confirm.
    while k <= run_sample_size:
        # The two branches differ only in the seed: the first draw uses the
        # seed assigned to this run, later draws are unseeded.
        if k == k_init:
            pairdata, pairdata_idx = sp.uniform_sampling(subsample_size=k,
                                                         seed=seed_vec[mc_i])
            pairdata_label = similarity_label_all_series.loc[pairdata_idx]
        else:
            pairdata, pairdata_idx = sp.uniform_sampling(subsample_size=k,
                                                         seed=None)
            pairdata_label = similarity_label_all_series.loc[pairdata_idx]

        # Train fresh models on the current sample only.
        lm = Linear_Metric()
        lm.train(pairdata, pairdata_label)

        dm = Deep_Metric()
        dm.train(pairdata, pairdata_label)

        # Both models are passed through the "deep" path of sanitize_data.
        sanitized_profile = util.sanitize_data(day_profile,
                                               distance_metric="deep",
                                               anonymity_level=anonymity_level,
                                               rep_mode=rep_mode,
                                               deep_model=lm)

        sanitized_profile_deep = util.sanitize_data(
            day_profile,
            distance_metric="deep",
            anonymity_level=anonymity_level,
            rep_mode=rep_mode,
            deep_model=dm)
示例#6
0
def evaluation_occupancy_window(n):
    """Compare four sanitization metrics on a time-series segment of interest.

    Loads the binary occupancy database, sanitizes rows 0-89 with a
    self-defined metric, a Euclidean metric, a learned ``Linear_Metric`` and
    a learned ``Deep_Metric``, and reports the information loss of each
    inside ``window``.  The learned metrics are trained on pairs drawn from
    rows 90-134.

    Args:
        n: desired anonymity level.

    Returns:
        ``(profiles, losses, subsample_size)`` where ``profiles`` and
        ``losses`` are 4-tuples ordered best / baseline / linear / deep.

    Note:
        ``util`` and ``pe`` are not created here; they are resolved from the
        enclosing scope when the function runs.
    """
    # step 1: get the database to be published
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    # Every 60th column is kept.  Missing values were already replaced by 0
    # above, so no dropna() is needed (the former calls discarded their
    # result and had no effect).
    day_profile = day_profile1.iloc[0:90, 0::60]
    day_profile2 = day_profile1.iloc[90:135, 0::60]

    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
    # information of a segment of entire time series. In this case, he/she would also need to specify the starting and
    # ending time of the time series segment of interest.
    interest = 'segment'
    window = [11, 15]  # window specifies the starting and ending time of the period that the data user is interested in

    # step 3: pre-sanitize the database
    sanitized_profile_best = util.sanitize_data(day_profile, distance_metric='self-defined',
                                                anonymity_level=anonymity_level, rep_mode=rep_mode,
                                                mode=interest, window=window)

    sanitized_profile_baseline = util.sanitize_data(day_profile, distance_metric='euclidean',
                                                    anonymity_level=anonymity_level, rep_mode=rep_mode)

    # NOTE(review): the best-metric profile is evaluated unrounded while the
    # other three are rounded — confirm this asymmetry is intended.
    loss_best_metric = pe.get_information_loss(data_gt=day_profile, data_sanitized=sanitized_profile_best,
                                               window=window)

    loss_generic_metric = pe.get_information_loss(data_gt=day_profile,
                                                  data_sanitized=sanitized_profile_baseline.round(),
                                                  window=window)

    # Shuffle the training rows.
    df_subsampled_from = day_profile2.sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    # Report the pair count itself; the row count was printed here before,
    # which contradicted the message.
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest=interest, window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric.
    # Both models are passed through the "deep" path of sanitize_data.
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep", anonymity_level=anonymity_level,
                                           rep_mode=rep_mode, deep_model=lm)

    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep", anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, deep_model=dm)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric = pe.get_information_loss(data_gt=day_profile,
                                                  data_sanitized=sanitized_profile.round(),
                                                  window=window)

    loss_learned_metric_deep = pe.get_information_loss(data_gt=day_profile,
                                                       data_sanitized=sanitized_profile_deep.round(),
                                                       window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" % (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size
示例#7
0
# step 4: sample a subset of pre-sanitized database and form the data points into pairs
# The requested sample size is `subsample_size_max` coerced to an int.
subsample_size = int(round(subsample_size_max))
sp = Subsampling(data=df_subsampled_from)
# NOTE(review): the whole return value is bound to `data_pair` here. If this
# version of `uniform_sampling` returns a (pairs, indices) tuple, the tuple
# itself is what gets passed on below — confirm against the Subsampling API.
data_pair = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

# User receives the data pairs and labels the similarity
sim = Similarity(data=data_pair)
sim.extract_interested_attribute(interest=interest, window=window)
similarity_label, data_subsample = sim.label_via_silhouette_analysis(
    range_n_clusters=range(2, 8))

# step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
# A linear and a deep metric are trained on the same labelled pairs.
lm = Linear_Metric()
lm.train(data_pair, similarity_label)

dm = Deep_Metric()
dm.train(data_pair, similarity_label)

# Disabled alternative for step 5 (kept for reference, not executed):
# lam_vec is a set of candidate lambda's for weighting the l1-norm penalty in the metric learning optimization problem.
# The lambda that achieves lowest testing error will be selected for generating the distance metric

# dist_metric = mel.learn_with_simialrity_label_regularization(data=data_pair,
#                                                              label=similarity_label,
#                                                              lam_vec=[0, 0.1, 1, 10],
#                                                              train_portion=0.8)

# step 6 (disabled): the original database is privatized using the learned metric
# sanitized_profile = util.sanitize_data(day_profile, distance_metric="mahalanobis",
#                                        anonymity_level=anonymity_level, rep_mode=rep_mode, VI=dist_metric)
示例#8
0
def evaluation_occupancy_window(n, df_subsampled_from, day_profile):
    """Train linear and deep metrics on a time segment and sanitize ``day_profile``.

    Pairs are sampled from ``df_subsampled_from``, labelled through
    ``Similarity.label_via_silhouette_analysis`` on the ``'segment'``
    interest restricted to the window ``[11, 15]``, and used to train one
    ``Linear_Metric`` and one ``Deep_Metric``.  Each trained model is passed
    to ``util.sanitize_data`` and the information loss of the rounded result
    is measured against ``day_profile`` inside the same window.

    Args:
        n: desired anonymity level.
        df_subsampled_from: data from which the training pairs are sampled.
        day_profile: database to sanitize; also the ground truth for the loss.

    Returns:
        ``(profiles, losses, subsample_size)`` where ``profiles`` and
        ``losses`` are 4-tuples ordered best / baseline / linear / deep.

    Note:
        ``util``, ``pe``, ``sanitized_profile_best``,
        ``sanitized_profile_baseline``, ``loss_best_metric`` and
        ``loss_generic_metric`` are not assigned in this function; they are
        resolved from the enclosing scope when the function runs.
    """
    segment_interest = 'segment'
    segment_window = [11, 15]

    # step 4: the requested sample size equals the number of 2-combinations
    # of the rows in the source data.
    pair_count = int(round(int(comb(len(df_subsampled_from), 2))))
    sampler = Subsampling(data=df_subsampled_from)
    pairs, _pair_index = sampler.uniform_sampling(subsample_size=pair_count,
                                                  seed=None)

    # The (simulated) user labels each pair as similar / dissimilar.
    labeller = Similarity(data=pairs)
    labeller.extract_interested_attribute(interest=segment_interest,
                                          window=segment_window)
    labels, _ = labeller.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: learn both metrics from the same labelled pairs.
    linear_metric = Linear_Metric()
    linear_metric.train(pairs, labels)

    deep_metric = Deep_Metric()
    deep_metric.train(pairs, labels)

    def _sanitize_with(model):
        # Both models go through the "deep" code path of sanitize_data.
        return util.sanitize_data(day_profile,
                                  distance_metric="deep",
                                  anonymity_level=n,
                                  rep_mode='mean',
                                  deep_model=model)

    def _loss_of(profile):
        # The sanitized profile is rounded before the loss is computed.
        return pe.get_information_loss(data_gt=day_profile,
                                       data_sanitized=profile.round(),
                                       window=segment_window)

    # step 6: sanitize the database with each learned metric.
    profile_linear = _sanitize_with(linear_metric)
    profile_deep = _sanitize_with(deep_metric)

    # Evaluation: information loss of each sanitized database.
    loss_linear = _loss_of(profile_linear)
    loss_deep = _loss_of(profile_deep)

    print('anonymity level %s' % n)
    print("sampled size %s" % pair_count)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_linear)
    print("information loss with learned metric deep  %s" % (loss_deep))

    profiles = (sanitized_profile_best, sanitized_profile_baseline,
                profile_linear, profile_deep)
    losses = (loss_best_metric, loss_generic_metric, loss_linear, loss_deep)
    return profiles, losses, pair_count