def evaluation_occupancy_statistics(n, df_subsampled_from, day_profile):
    anonymity_level = n
    mode = "arrival"
    rep_mode = 'mean'
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    # step 4: sample a subset of the pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # The data user receives the data pairs and labels their similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=mode)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metrics
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode, deep_model=lm)
    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, deep_model=dm)

    # (optional, for evaluation) evaluate the information loss of the sanitized databases
    loss_learned_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                 data_sanitized=sanitized_profile.round(),
                                                 mode=mode)
    loss_learned_metric_deep = pe.get_statistics_loss(data_gt=day_profile,
                                                      data_sanitized=sanitized_profile_deep.round(),
                                                      mode=mode)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep %s" % loss_learned_metric_deep)

    return (sanitized_profile, sanitized_profile_deep), \
           (loss_learned_metric, loss_learned_metric_deep), subsample_size
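# Example call (a minimal sketch, not part of the original script; it assumes the binary
# occupancy dataset and the module-level `util`/`pe` helpers used above, and `2` is just a
# hypothetical anonymity level):
#
#   day_profile_all = pd.read_pickle('./dataset/dataframe_all_binary.pkl').fillna(0)
#   day_profile_all[day_profile_all > 0] = 1
#   day_profile = day_profile_all.iloc[0:90, 0::15]                         # database to publish
#   df_subsampled_from = day_profile_all.iloc[90:-1, 0::15].sample(frac=1)  # labeling pool
#   evaluation_occupancy_statistics(2, df_subsampled_from, day_profile)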
    loss_generic_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                 data_sanitized=sanitized_profile_baseline,
                                                 mode=interest, window=window)
    print("information loss with generic metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of the pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max / 2))
    sp = Subsampling(data=df_subsampled_from)
    data_pair = sp.uniform_sampling(subsample_size=subsample_size)

    # The data user receives the data pairs and labels their similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=interest, window=window)
    similarity_label, class_label = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs.
    # lam_vec is a set of candidate lambdas for weighting the l1-norm penalty in the metric learning
    # optimization problem; the lambda that achieves the lowest testing error is selected for
    # generating the distance metric.
    dist_metric = mel.learn_with_simialrity_label_regularization(data=data_pair,
                                                                 label=similarity_label,
                                                                 lam_vec=[10],
                                                                 train_portion=0.8)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="mahalanobis",
    loss_best_metric = pe.get_information_loss(data_gt=day_profile,
                                               data_sanitized=sanitized_profile_best,
                                               window=window)
    loss_generic_metric = pe.get_information_loss(data_gt=day_profile,
                                                  data_sanitized=sanitized_profile_baseline.round(),
                                                  window=window)

    df_subsampled_from = day_profile_metric_learn
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)

    ## obtain ground truth similarity labels
    sp = Subsampling(data=df_subsampled_from)
    data_pair_all, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size_max, seed=0)
    sim = Similarity(data=data_pair_all)
    sim.extract_interested_attribute(interest=interest, window=window)
    similarity_label_all, class_label_all = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))
    similarity_label_all_series = pd.Series(similarity_label_all)
    similarity_label_all_series.index = data_pair_all_index
    print('similarity balance is %s' % [sum(similarity_label_all), len(similarity_label_all)])

    k_init = 50
    mc_num = 5
    seed_vec = np.arange(mc_num)
    run_sample_size = 300

    ## active sampling
def evaluation_occupancy_statistics(n, mode="arrival"):
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile1[day_profile1 > 0] = 1
    res = 15
    # subsample the database to improve the speed for demonstration purposes
    day_profile = day_profile1.iloc[:90, 0::res]
    day_profile2 = day_profile1.iloc[90:-1, 0::res]
    rep_mode = 'mean'
    anonymity_level = n

    sanitized_profile_best = util.sanitize_data(day_profile, distance_metric='self-defined',
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, mode=mode)
    sanitized_profile_baseline = util.sanitize_data(day_profile, distance_metric='euclidean',
                                                    anonymity_level=anonymity_level,
                                                    rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(data_gt=day_profile,
                                              data_sanitized=sanitized_profile_best,
                                              mode=mode)
    loss_generic_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                 data_sanitized=sanitized_profile_baseline,
                                                 mode=mode)

    df_subsampled_from = day_profile2.sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of the pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # The data user receives the data pairs and labels their similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=mode)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metrics
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode, deep_model=lm)
    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, deep_model=dm)

    # (optional, for evaluation) evaluate the information loss of the sanitized databases
    loss_learned_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                 data_sanitized=sanitized_profile.round(),
                                                 mode=mode)
    loss_learned_metric_deep = pe.get_statistics_loss(data_gt=day_profile,
                                                      data_sanitized=sanitized_profile_deep.round(),
                                                      mode=mode)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep %s" % loss_learned_metric_deep)

    return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), \
           (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), \
           subsample_size
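# Example sweep (a sketch, not part of the original script): this version loads and slices the
# dataset itself, so only `n` (and optionally `mode`) is needed; the anonymity levels below are
# hypothetical.
#
#   for k in [2, 5, 10]:
#       evaluation_occupancy_statistics(k, mode="arrival")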
def evaluation_total_usage(n):
    """
    In the demo, we will showcase an example of special purpose publication.
    The data user wants the published energy database to maximally retain the
    information about peak-time energy usage.
    """
    # Initialization of some useful classes
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: get the database to be published
    day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl')
    day_profile = day_profile.fillna(0)
    # subsample the database to improve the speed for demonstration purpose
    day_profile = day_profile.iloc[0:90, 0::4]
    day_profile.index = range(len(day_profile.index))
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in
    # preserving the information of the cumulative energy use during peak time. In this case,
    # he/she would also need to specify the starting and ending time of the peak usage time.
    interest = 'window-usage'
    window = [17, 21]

    sanitized_profile_best = util.sanitize_data(day_profile, distance_metric='self-defined',
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, mode=interest, window=window)

    # step 3: pre-sanitize the database
    sanitized_profile_baseline = util.sanitize_data(day_profile, distance_metric='euclidean',
                                                    anonymity_level=anonymity_level,
                                                    rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(data_gt=day_profile,
                                              data_sanitized=sanitized_profile_best,
                                              mode=interest, window=window)
    loss_generic_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                 data_sanitized=sanitized_profile_baseline,
                                                 mode=interest, window=window)
    # print("information loss with learned metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=interest, window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, deep_model=dm, window=window)
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode, deep_model=lm, window=window)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(data_gt=day_profile,
                                                      data_sanitized=sanitized_profile_deep.round(),
                                                      mode=interest, window=window)
    loss_learned_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                 data_sanitized=sanitized_profile,
                                                 mode=interest, window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep %s" % loss_learned_metric_deep)

    return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), \
           (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), \
           subsample_size
def evaluation_occupancy_window(n):
    # step 1: get the database to be published
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile = day_profile1.iloc[0:90, 0::60]
    day_profile2 = day_profile1.iloc[90:135, 0::60]
    day_profile = day_profile.dropna()
    day_profile2 = day_profile2.dropna()
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in
    # preserving the information of a segment of the entire time series. In this case, he/she would
    # also need to specify the starting and ending time of the time series segment of interest.
    interest = 'segment'
    window = [11, 15]  # starting and ending time of the period the data user is interested in

    # step 3: pre-sanitize the database
    sanitized_profile_best = util.sanitize_data(day_profile, distance_metric='self-defined',
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, mode=interest, window=window)
    sanitized_profile_baseline = util.sanitize_data(day_profile, distance_metric='euclidean',
                                                    anonymity_level=anonymity_level,
                                                    rep_mode=rep_mode)

    loss_best_metric = pe.get_information_loss(data_gt=day_profile,
                                               data_sanitized=sanitized_profile_best,
                                               window=window)
    loss_generic_metric = pe.get_information_loss(data_gt=day_profile,
                                                  data_sanitized=sanitized_profile_baseline.round(),
                                                  window=window)

    df_subsampled_from = day_profile2.sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # User receives the data pairs and labels the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest=interest, window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metrics
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode, deep_model=lm)
    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, deep_model=dm)

    # (optionally for evaluation purpose) evaluate the information loss of the sanitized database
    loss_learned_metric = pe.get_information_loss(data_gt=day_profile,
                                                  data_sanitized=sanitized_profile.round(),
                                                  window=window)
    loss_learned_metric_deep = pe.get_information_loss(data_gt=day_profile,
                                                       data_sanitized=sanitized_profile_deep.round(),
                                                       window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep %s" % loss_learned_metric_deep)

    return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), \
           (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), \
           subsample_size
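# Example call (a sketch): evaluation_occupancy_window(n) loads and slices the occupancy dataset
# itself, so a single hypothetical anonymity level is enough, e.g.
#
#   profiles, losses, n_pairs = evaluation_occupancy_window(2)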
def evaluation_occupancy_window(n, df_subsampled_from, day_profile):
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level
    interest = 'segment'
    window = [11, 15]
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    # step 4: sample a subset of the pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # The data user receives the data pairs and labels their similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest=interest, window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metrics
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode, deep_model=lm)
    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",
                                                anonymity_level=anonymity_level,
                                                rep_mode=rep_mode, deep_model=dm)

    # (optional, for evaluation) evaluate the information loss of the sanitized databases
    loss_learned_metric = pe.get_information_loss(data_gt=day_profile,
                                                  data_sanitized=sanitized_profile.round(),
                                                  window=window)
    loss_learned_metric_deep = pe.get_information_loss(data_gt=day_profile,
                                                       data_sanitized=sanitized_profile_deep.round(),
                                                       window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep %s" % loss_learned_metric_deep)

    return (sanitized_profile, sanitized_profile_deep), \
           (loss_learned_metric, loss_learned_metric_deep), subsample_size
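if __name__ == '__main__':
    # Minimal driver (a sketch, not part of the original scripts): run the occupancy-window
    # variant above on the binary occupancy dataset, mirroring the row/column slicing used in
    # evaluation_occupancy_window(n); `5` is just a hypothetical anonymity level.
    day_profile_all = pd.read_pickle('./dataset/dataframe_all_binary.pkl').fillna(0)
    published_profiles = day_profile_all.iloc[0:90, 0::60]                # database to publish
    labeling_pool = day_profile_all.iloc[90:135, 0::60].sample(frac=1)    # profiles used only for labeling
    profiles, losses, n_pairs = evaluation_occupancy_window(
        5, df_subsampled_from=labeling_pool, day_profile=published_profiles)
    print('learned metric loss (linear, deep): %s / %s' % losses)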