Python Subsampling примеры использования

Язык программирования: Python

Пространство имен/Пакет: subsampling

Класс/Тип: Subsampling

Примеров на hotexamples.com: 7

Python Subsampling - 7 примеров найдено. Это лучшие примеры Python кода для subsampling.Subsampling, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

uniform_sampling(7)

Subsampling(6)

active_sampling(1)

reset(1)

Пример #1

Показать файл

def evaluation_occupancy_statistics(n, df_subsampled_from,day_profile):

    anonymity_level = n
    mode = "arrival"
    rep_mode = 'mean'
    subsample_size_max = int(comb(len(df_subsampled_from),2))

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed = None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=mode)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2,8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",anonymity_level=anonymity_level,
                                        rep_mode=rep_mode, deep_model=lm)

    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",anonymity_level=anonymity_level,
                                        rep_mode=rep_mode, deep_model=dm)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric = pe.get_statistics_loss(data_gt=day_profile,
                                                data_sanitized=sanitized_profile.round(),
                                                mode = mode)

    loss_learned_metric_deep = pe.get_statistics_loss(data_gt=day_profile,
                                                data_sanitized=sanitized_profile_deep.round(),
                                                mode = mode)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" %  loss_learned_metric)
    print("information loss with learned metric deep  %s" % (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size

Пример #2

Показать файл

    data_gt=day_profile,
    data_sanitized=sanitized_profile_best,
    mode=interest,
    window=window)

loss_generic_metric = pe.get_information_loss(
    data_gt=day_profile,
    data_sanitized=sanitized_profile_baseline.round(),
    window=window)

df_subsampled_from = day_profile_metric_learn
subsample_size_max = int(comb(len(df_subsampled_from), 2))
print('total number of pairs is %s' % len(df_subsampled_from))

## obtain ground truth similarity labels
sp = Subsampling(data=df_subsampled_from)
data_pair_all, data_pair_all_index = sp.uniform_sampling(
    subsample_size=subsample_size_max, seed=0)
sim = Similarity(data=data_pair_all)
sim.extract_interested_attribute(interest=interest, window=window)
similarity_label_all, class_label_all = sim.label_via_silhouette_analysis(
    range_n_clusters=range(2, 8))
similarity_label_all_series = pd.Series(similarity_label_all)
similarity_label_all_series.index = data_pair_all_index
print('similarity balance is %s' %
      [sum(similarity_label_all),
       len(similarity_label_all)])

k_init = 50
mc_num = 5
seed_vec = np.arange(mc_num)

Пример #3

Показать файл

def evaluation_occupancy_statistics(n, mode="arrival"):
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile1[day_profile1 > 0] = 1

    res = 15

    day_profile = day_profile1.iloc[:90, 0::
                                    res]  # subsample the database to improve the speed for demonstration purpose
    day_profile2 = day_profile1.iloc[
        90:-1, 0::
        res]  # subsample the database to improve the speed for demonstration purpose

    rep_mode = 'mean'
    anonymity_level = n

    sanitized_profile_best = util.sanitize_data(
        day_profile,
        distance_metric='self-defined',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        mode=mode)

    sanitized_profile_baseline = util.sanitize_data(
        day_profile,
        distance_metric='euclidean',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile, data_sanitized=sanitized_profile_best, mode=mode)

    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_baseline,
        mode=mode)

    df_subsampled_from = day_profile2.sample(frac=1)

    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % len(df_subsampled_from))

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=mode)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm)

    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile.round(),
        mode=mode)

    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=mode)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size

Пример #4

Показать файл

def evaluation_total_usage(n):
    """
    In the demo, we will showcase an example of special purpose publication.
    The data user wants the published energy database to maximally retain the information about peak-time energy usage
    """

    # Initialization of some useful classes
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: get the database to be published
    day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl')
    day_profile = day_profile.fillna(0)
    day_profile = day_profile.iloc[
        0:90, 0::
        4]  # subsample the database to improve the speed for demonstration purpose
    day_profile.index = range(len(day_profile.index))
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
    # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the
    # starting and ending time of the peak usage time
    interest = 'window-usage'
    window = [17, 21]

    sanitized_profile_best = util.sanitize_data(
        day_profile,
        distance_metric='self-defined',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        mode=interest,
        window=window)

    # step 3: pre-sanitize the database
    sanitized_profile_baseline = util.sanitize_data(
        day_profile,
        distance_metric='euclidean',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_best,
        mode=interest,
        window=window)

    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_baseline,
        mode=interest,
        window=window)
    # print("information loss with learned metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(
        frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics',
                                     stat_type=interest,
                                     window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm,
        window=window)

    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm,
                                           window=window)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=interest,
        window=window)

    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile,
        mode=interest,
        window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size

Пример #5

Показать файл

    data_gt=day_profile,
    data_sanitized=sanitized_profile_best,
    mode=interest,
    window=window)

loss_generic_metric = pe.get_information_loss(
    data_gt=day_profile,
    data_sanitized=sanitized_profile_baseline.round(),
    window=window)

df_subsampled_from = day_profile_metric_learn
subsample_size_max = int(comb(len(df_subsampled_from), 2))
print('total number of pairs is %s' % subsample_size_max)

## obtain ground truth similarity labels
sp = Subsampling(data=df_subsampled_from)
data_pair_all, data_pair_all_index = sp.uniform_sampling(
    subsample_size=subsample_size_max, seed=0)
sim = Similarity(data=data_pair_all)
sim.extract_interested_attribute(interest=interest, window=window)
similarity_label_all, class_label_all = sim.label_via_silhouette_analysis(
    range_n_clusters=range(2, 8))
similarity_label_all_series = pd.Series(similarity_label_all)
similarity_label_all_series.index = data_pair_all_index
print('similarity balance is %s' %
      [sum(similarity_label_all),
       len(similarity_label_all)])

k_init = 50
batch_size = 10
mc_num = 5

Пример #6

Показать файл

def evaluation_occupancy_window(n):
    # step 1: get the database to be published
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile = day_profile1.iloc[0:90,0::60]
    day_profile2 = day_profile1.iloc[90:135,0::60] 
    day_profile.dropna()
    day_profile2.dropna()

    rep_mode = 'mean'
    anonymity_level = n # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
    # information of a segment of entire time series. In this case, he/she would also need to specify the starting and
    # ending time of the time series segment of interest.
    interest = 'segment'
    window = [11,15] # window specifies the starting and ending time of the period that the data user is interested in

    # step 3: pre-sanitize the database
    sanitized_profile_best = util.sanitize_data(day_profile, distance_metric = 'self-defined',
                                                anonymity_level = anonymity_level, rep_mode = rep_mode,
                                                mode = interest, window = window)

    sanitized_profile_baseline = util.sanitize_data(day_profile, distance_metric='euclidean',
                                                    anonymity_level=anonymity_level, rep_mode = rep_mode)
    
    loss_best_metric = pe.get_information_loss(data_gt=day_profile, data_sanitized=sanitized_profile_best,
                                                  window=window)

    loss_generic_metric = pe.get_information_loss(data_gt=day_profile,
                                                data_sanitized=sanitized_profile_baseline.round(),
                                                window=window)



    df_subsampled_from = day_profile2.sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from),2))
    print('total number of pairs is %s' % len(df_subsampled_from))

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed = None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest=interest, window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2,8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)        
    
    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    # lam_vec is a set of candidate lambda's for weighting the l1-norm penalty in the metric learning optimization problem.
    # The lambda that achieves lowest testing error will be selected for generating the distance metric

    # step 6: the original database is privatized using the learned metric
    sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep",anonymity_level=anonymity_level,
                                        rep_mode=rep_mode, deep_model=lm)

    sanitized_profile_deep = util.sanitize_data(day_profile, distance_metric="deep",anonymity_level=anonymity_level,
                                        rep_mode=rep_mode, deep_model=dm)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric = pe.get_information_loss(data_gt=day_profile,
                                                data_sanitized=sanitized_profile.round(),
                                                window=window)

    loss_learned_metric_deep = pe.get_information_loss(data_gt=day_profile,
                                                data_sanitized=sanitized_profile_deep.round(),
                                                window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" %  loss_learned_metric)
    print("information loss with learned metric deep  %s" % (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size

Пример #7

Показать файл

def evaluation_occupancy_window(n, df_subsampled_from, day_profile):
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level
    interest = 'segment'
    window = [11, 15]
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest=interest, window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    # lam_vec is a set of candidate lambda's for weighting the l1-norm penalty in the metric learning optimization problem.
    # The lambda that achieves lowest testing error will be selected for generating the distance metric

    # step 6: the original database is privatized using the learned metric
    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm)

    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric = pe.get_information_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile.round(),
        window=window)

    loss_learned_metric_deep = pe.get_information_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size