Example #1
def _calculate_privacy_tests(tgt_data: DataFrame, syn_data: DataFrame):
    """
    Compute privacy tests for a given target and synthetic data set
    """

    assert sorted(tgt_data.columns) == sorted(
        syn_data.columns
    ), "Target and Synthetic have different columns"

    tgt_dict = _generate_column_type_dictionary(tgt_data)
    syn_dict = _generate_column_type_dictionary(syn_data)
    assert tgt_dict == syn_dict, "Target and Synthetic have different types"

    flat_table_target = _flatten_table(tgt_data.reset_index(), tgt_dict)
    flat_table_syn = _flatten_table(syn_data.reset_index(), tgt_dict)

    # columns now include 1st and 2nd record
    column_dict = _generate_column_type_dictionary(flat_table_target)

    smoothing_factor = 1e-8
    tgt_data_p, syn_data_p = _prepare_data_for_privacy_metrics(
        flat_table_target, flat_table_syn, column_dict, smoothing_factor
    )

    # only the aggregated checks are returned; the detailed per-test results are discarded
    checks, _ = _calculate_dcr_nndr(
        tgt_data_p, syn_data_p, column_dict, smoothing_factor
    )
    return checks
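
A minimal usage sketch, assuming both frames follow the project's (id, sequence_pos) MultiIndex convention with identical columns and dtypes; the data below is a hypothetical stand-in:

import pandas as pd

# hypothetical sequential data: MultiIndex (id, sequence_pos), matching columns and dtypes
idx = pd.MultiIndex.from_product([range(3), range(2)], names=["id", "sequence_pos"])
tgt = pd.DataFrame({"amount": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}, index=idx)
syn = pd.DataFrame({"amount": [1.2, 2.1, 3.3, 3.9, 5.2, 6.1]}, index=idx)

checks = _calculate_privacy_tests(tgt, syn)  # aggregated DCR/NNDR check results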
Example #2
def _calculate_coherence(
    tgt_data: DataFrame,
    syn_data: DataFrame,
    type: str,  # users_per_category | categories_per_user
    number_of_bins: int = 10,
):

    assert type in ["users_per_category", "categories_per_user"], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(
        syn_data.columns
    ), "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)

    tgt_binned, syn_binned = _bin_looped(
        tgt_data, syn_data, column_dict, number_of_bins
    )
    tgt_binned = tgt_binned.astype(str).reset_index()
    syn_binned = syn_binned.astype(str).reset_index()

    result = list()
    for col in list(column_dict):
        if type == "users_per_category":
            tgt_shares = tgt_binned.groupby(by=col, as_index=False).agg(
                {"id": pd.Series.nunique}
            )
            tgt_shares["id"] = tgt_shares["id"] / tgt_binned["id"].nunique()
            syn_shares = syn_binned.groupby(by=col, as_index=False).agg(
                {"id": pd.Series.nunique}
            )
            syn_shares["id"] = syn_shares["id"] / syn_binned["id"].nunique()
        elif type == "categories_per_user":
            tgt_shares = tgt_binned.groupby(by="id").agg({col: pd.Series.nunique})
            tgt_shares = (
                tgt_shares[col].value_counts(normalize=True).to_frame().reset_index()
            )
            syn_shares = syn_binned.groupby(by="id").agg({col: pd.Series.nunique})
            syn_shares = (
                syn_shares[col].value_counts(normalize=True).to_frame().reset_index()
            )
        tgt_shares.columns = ["value", "tgt"]
        syn_shares.columns = ["value", "syn"]
        shares = pd.merge(tgt_shares, syn_shares, on="value", how="left")
        shares["syn"] = shares["syn"].fillna(0)
        diff = np.abs(shares["tgt"] - shares["syn"])
        out = pd.DataFrame(
            {
                "label": [col],
                "type": type,
                "max": np.max(diff),
                "l1d": np.sum(diff),
                "l2d": np.sqrt(np.sum(diff ** 2)),
            }
        )

        result.append(out)

    return pd.concat(result)
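
A usage sketch under the same assumptions, relying on the module's _generate_column_type_dictionary and _bin_looped helpers; the frames are hypothetical:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([range(100), range(5)], names=["id", "sequence_pos"])
tgt = pd.DataFrame({"amount": np.random.rand(500)}, index=idx)
syn = pd.DataFrame({"amount": np.random.rand(500)}, index=idx)

per_category = _calculate_coherence(tgt, syn, type="users_per_category")
per_user = _calculate_coherence(tgt, syn, type="categories_per_user", number_of_bins=5)
# each row of the result holds label, type, and the max/l1d/l2d share differences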
Example #3
def _calculate_coherence(tgt_data:DataFrame,
                         syn_data:DataFrame,
                         type: str, # users_per_category | categories_per_user
                         number_of_bins:int = 10):

    assert type in ['users_per_category', 'categories_per_user'], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(syn_data.columns), "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)

    tgt_binned, syn_binned = _bin_looped(tgt_data, syn_data, column_dict, number_of_bins)
    tgt_binned = tgt_binned.astype(str).reset_index()
    syn_binned = syn_binned.astype(str).reset_index()

    result = list()
    for col in list(column_dict):
        if type == 'users_per_category':
            tgt_shares = tgt_binned.groupby(by=col, as_index=False).agg({'id': pd.Series.nunique})
            tgt_shares['id'] = tgt_shares['id'] / tgt_binned['id'].nunique()
            syn_shares = syn_binned.groupby(by=col, as_index=False).agg({'id': pd.Series.nunique})
            syn_shares['id'] = syn_shares['id'] / syn_binned['id'].nunique()
        elif type == 'categories_per_user':
            tgt_shares = tgt_binned.groupby(by='id').agg({col: pd.Series.nunique})
            tgt_shares = tgt_shares[col].value_counts(normalize=True).to_frame().reset_index()
            syn_shares = syn_binned.groupby(by='id').agg({col: pd.Series.nunique})
            syn_shares = syn_shares[col].value_counts(normalize=True).to_frame().reset_index()
        tgt_shares.columns = ['value', 'tgt']
        syn_shares.columns = ['value', 'syn']
        shares = pd.merge(tgt_shares, syn_shares, on='value', how='left')
        shares['syn'] = shares['syn'].fillna(0)
        diff = np.abs(shares['tgt'] - shares['syn'])
        out = pd.DataFrame({'label': [col],
                            'type': type,
                            'tvd': np.max(diff),
                            'l1d': np.sum(diff),
                            'l2d': np.sqrt(np.sum(diff ** 2))})

        result.append(out)

    return pd.concat(result)
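
A tiny worked check of the "users_per_category" share logic above, on hypothetical binned data:

import pandas as pd

binned = pd.DataFrame({"id": [1, 1, 2, 3], "amount": ["low", "high", "low", "low"]})
shares = binned.groupby(by="amount", as_index=False).agg({"id": pd.Series.nunique})
shares["id"] = shares["id"] / binned["id"].nunique()
print(shares)  # "low" is used by all 3 users (1.0), "high" by 1 of 3 (~0.33)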
Example #4
    def generate(self, number_of_subjects):
        super().generate(number_of_subjects)

        df_copy = self.target_data_.copy(deep=True)
        # shuffle each column independently to break correlations across columns
        for x in df_copy.columns:
            df_copy[[x]] = np.random.permutation(df_copy[[x]])

        unique_ids = df_copy.index.get_level_values(0).unique()
        sampled_ids = np.random.choice(unique_ids,
                                       size=number_of_subjects,
                                       replace=True)
        grid = pd.DataFrame({'id': sampled_ids}).sort_values('id')
        grid['id_new_'] = range(0, number_of_subjects)
        df = pd.merge(df_copy.reset_index(), grid, on='id')
        df['id'] = df['id_new_']
        df = df.drop(['id_new_'], axis=1).sort_values('id')
        df = df.set_index(['id', 'sequence_pos'])
        # merge causes dataframes to lose their dtypes
        reference = _generate_column_type_dictionary(self.target_data_)
        df = _assign_column_type(df, reference)

        return df
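
A hedged sketch of driving this generate method, assuming the enclosing class follows a fit/generate pattern and stores the training frame in self.target_data_; PermutationSynthesizer is a hypothetical name:

# hypothetical driver; the real class and its fit() signature live in the source repo
synthesizer = PermutationSynthesizer()
synthesizer.fit(tgt)  # presumably populates self.target_data_
syn = synthesizer.generate(number_of_subjects=100)
assert syn.index.names == ["id", "sequence_pos"]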
Example #5
def _calculate_statistical_distances(
    tgt_data: DataFrame,
    syn_data: DataFrame,
    type: str,  # 1dim | 2dim | 3dim | 4dim
    number_of_bins: int = 10,
    max_combinations: int = 100,
):

    assert type in ["1dim", "2dim", "3dim", "4dim"], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(
        syn_data.columns
    ), "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)

    def _sample_one_event(data: DataFrame) -> DataFrame:
        """
        Randomly sample one record for each id.
        """
        # seed numpy's RNG so the same rows are drawn across synthesizers
        # (np.random.rand below uses numpy's global RNG, which random.seed does not touch)
        np.random.seed(123)
        # determine sequence length for each id
        seq_lens = data.reset_index().groupby("id").size()
        # randomly draw a sequence_pos for each id
        draws = pd.DataFrame(
            {
                "sequence_pos": np.floor(
                    np.random.rand(len(seq_lens)) * seq_lens
                ).astype(int)
            }
        ).reset_index()
        # inner join with provided dataframe to filter to drawn records
        out = (
            pd.merge(draws, data, on=["id", "sequence_pos"])
            .drop(columns="sequence_pos")
            .set_index("id")
        )
        return out

    tgt_sample = _sample_one_event(tgt_data)
    syn_sample = _sample_one_event(syn_data)
    tgt_binned, syn_binned = _bin_looped(
        tgt_sample, syn_sample, column_dict, number_of_bins
    )
    tgt_binned = tgt_binned.astype(str)
    syn_binned = syn_binned.astype(str)

    def calc_shares(df_binned, label):
        shares = df_binned.value_counts().reset_index()
        shares.columns = ["bins", label]
        shares[label] = shares[label] / shares[label].sum()
        return shares

    if type == "1dim":
        cross = pd.DataFrame({"col_1": tgt_data.columns.to_list()})
        cross["col_2"] = "all"
        cross["col_3"] = "all"
        cross["col_4"] = "all"
    elif type == "2dim":
        cols_1 = pd.DataFrame({"col_1": tgt_data.columns.to_list(), "key": "xyz"})
        cols_2 = pd.DataFrame({"col_2": tgt_data.columns.to_list(), "key": "xyz"})
        cross = pd.merge(cols_1, cols_2, on="key").drop("key", axis=1)
        cross["col_3"] = "all"
        cross["col_4"] = "all"
    elif type == "3dim":
        cols_1 = pd.DataFrame({"col_1": tgt_data.columns.to_list(), "key": "xyz"})
        cols_2 = pd.DataFrame({"col_2": tgt_data.columns.to_list(), "key": "xyz"})
        cols_3 = pd.DataFrame({"col_3": tgt_data.columns.to_list(), "key": "xyz"})
        cross = pd.merge(pd.merge(cols_1, cols_2, on="key"), cols_3, on="key").drop(
            "key", axis=1
        )
        cross["col_4"] = "all"
    elif type == "4dim":
        cols_1 = pd.DataFrame({"col_1": tgt_data.columns.to_list(), "key": "xyz"})
        cols_2 = pd.DataFrame({"col_2": tgt_data.columns.to_list(), "key": "xyz"})
        cols_3 = pd.DataFrame({"col_3": tgt_data.columns.to_list(), "key": "xyz"})
        cols_4 = pd.DataFrame({"col_4": tgt_data.columns.to_list(), "key": "xyz"})
        cross = pd.merge(
            pd.merge(pd.merge(cols_1, cols_2, on="key"), cols_3, on="key"),
            cols_4,
            on="key",
        ).drop("key", axis=1)

    # fix random_state so the same variable combinations are considered across synthesizers
    # (DataFrame.sample draws from numpy's RNG, which random.seed does not touch)
    cross = cross.sample(min(cross.shape[0], max_combinations), random_state=123)
    tgt_binned["all"] = "all"
    syn_binned["all"] = "all"

    result = list()
    for _, row in cross.iterrows():
        col_1, col_2, col_3, col_4 = row["col_1"], row["col_2"], row["col_3"], row["col_4"]
        col_label = col_1 + "|" + col_2 + "|" + col_3 + "|" + col_4
        tgt_values = (
            tgt_binned[col_1]
            + "|"
            + tgt_binned[col_2]
            + "|"
            + tgt_binned[col_3]
            + "|"
            + tgt_binned[col_4]
        )
        syn_values = (
            syn_binned[col_1]
            + "|"
            + syn_binned[col_2]
            + "|"
            + syn_binned[col_3]
            + "|"
            + syn_binned[col_4]
        )
        shares = pd.merge(
            calc_shares(tgt_values, "tgt"), calc_shares(syn_values, "syn"), how="left"
        )
        shares["syn"] = shares["syn"].fillna(0)
        diff = np.abs(shares["tgt"] - shares["syn"])
        hell = np.sqrt(
            np.sum((np.sqrt(shares["tgt"]) - np.sqrt(shares["syn"])) ** 2)
        ) / np.sqrt(2)
        out = pd.DataFrame(
            {
                "label": [col_label],
                "type": type,
                "max": np.max(diff),
                "l1d": np.sum(diff),
                "l2d": np.sqrt(np.sum(diff ** 2)),
                "hellinger": hell,
            }
        )

        result.append(out)

    # reset numpy's global seed
    np.random.seed(None)

    return pd.concat(result)
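
A usage sketch, again assuming the (id, sequence_pos) MultiIndex convention and the module's binning helpers; tgt and syn are hypothetical frames as in the earlier sketches:

distances_1d = _calculate_statistical_distances(tgt, syn, type="1dim")
distances_3d = _calculate_statistical_distances(
    tgt, syn, type="3dim", number_of_bins=10, max_combinations=50
)
# each row: label "colA|colB|colC|colD" plus the max/l1d/l2d/hellinger distances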
Example #6
def _calculate_statistical_distances(tgt_data:DataFrame,
                                     syn_data:DataFrame,
                                     type: str, # 1dim | 2dim | 3dim | 4dim
                                     number_of_bins:int = 10,
                                     max_combinations:int = 100):

    assert type in ['1dim', '2dim', '3dim', '4dim'], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(syn_data.columns), "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)

    def _sample_one_event(data: DataFrame) -> DataFrame:
        """
        Randomly sample one record for each id.
        """
        # seed numpy's RNG so the same rows are drawn across synthesizers
        # (np.random.rand below uses numpy's global RNG, which random.seed does not touch)
        np.random.seed(123)
        # determine sequence length for each id
        seq_lens = data.reset_index().groupby('id').size()
        # randomly draw a sequence_pos for each id
        draws = pd.DataFrame({'sequence_pos': np.floor(np.random.rand(len(seq_lens)) * seq_lens).astype(
            int)}).reset_index()
        # inner join with provided dataframe to filter to drawn records
        out = pd.merge(draws, data, on=['id', 'sequence_pos']).drop(columns='sequence_pos').set_index('id')
        return out

    tgt_sample = _sample_one_event(tgt_data)
    syn_sample = _sample_one_event(syn_data)
    tgt_binned, syn_binned = _bin_looped(tgt_sample,
                                         syn_sample,
                                         column_dict,
                                         number_of_bins)
    tgt_binned = tgt_binned.astype(str)
    syn_binned = syn_binned.astype(str)

    def calc_shares(df_binned, label):
        shares = df_binned.value_counts().reset_index()
        shares.columns = ['bins', label]
        shares[label] = shares[label] / shares[label].sum()
        return shares

    if type == '1dim':
        cross = pd.DataFrame({'col_1': tgt_data.columns.to_list()})
        cross['col_2'] = 'all'
        cross['col_3'] = 'all'
        cross['col_4'] = 'all'
    elif type == '2dim':
        cols_1 = pd.DataFrame({'col_1': tgt_data.columns.to_list(), 'key': 'xyz'})
        cols_2 = pd.DataFrame({'col_2': tgt_data.columns.to_list(), 'key': 'xyz'})
        cross = pd.merge(cols_1, cols_2, on='key').drop('key', axis=1)
        cross['col_3'] = 'all'
        cross['col_4'] = 'all'
    elif type == '3dim':
        cols_1 = pd.DataFrame({'col_1': tgt_data.columns.to_list(), 'key': 'xyz'})
        cols_2 = pd.DataFrame({'col_2': tgt_data.columns.to_list(), 'key': 'xyz'})
        cols_3 = pd.DataFrame({'col_3': tgt_data.columns.to_list(), 'key': 'xyz'})
        cross = pd.merge(pd.merge(cols_1, cols_2, on='key'), cols_3, on='key').drop('key', axis=1)
        cross['col_4'] = 'all'
    elif type == '4dim':
        cols_1 = pd.DataFrame({'col_1': tgt_data.columns.to_list(), 'key': 'xyz'})
        cols_2 = pd.DataFrame({'col_2': tgt_data.columns.to_list(), 'key': 'xyz'})
        cols_3 = pd.DataFrame({'col_3': tgt_data.columns.to_list(), 'key': 'xyz'})
        cols_4 = pd.DataFrame({'col_4': tgt_data.columns.to_list(), 'key': 'xyz'})
        cross = pd.merge(pd.merge(pd.merge(cols_1, cols_2, on='key'), cols_3, on='key'), cols_4, on='key').drop('key', axis=1)

    # fix random_state so the same variable combinations are considered across synthesizers
    # (DataFrame.sample draws from numpy's RNG, which random.seed does not touch)
    cross = cross.sample(min(cross.shape[0], max_combinations), random_state=123)
    tgt_binned['all'] = 'all'
    syn_binned['all'] = 'all'

    result = list()
    for _, row in cross.iterrows():
        col_1, col_2, col_3, col_4 = row['col_1'], row['col_2'], row['col_3'], row['col_4']
        col_label = col_1 + '|' + col_2 + '|' + col_3 + '|' + col_4
        tgt_values = tgt_binned[col_1] + '|' + \
                     tgt_binned[col_2] + '|' + \
                     tgt_binned[col_3] + '|' + \
                     tgt_binned[col_4]
        syn_values = syn_binned[col_1] + '|' + \
                     syn_binned[col_2] + '|' + \
                     syn_binned[col_3] + '|' + \
                     syn_binned[col_4]
        shares = pd.merge(calc_shares(tgt_values, 'tgt'),
                          calc_shares(syn_values, 'syn'),
                          how='left')
        shares['syn'] = shares['syn'].fillna(0)
        diff = np.abs(shares['tgt'] - shares['syn'])
        hell = np.sqrt(np.sum((np.sqrt(shares['tgt']) - np.sqrt(shares['syn'])) ** 2)) / np.sqrt(2)
        out = pd.DataFrame({'label': [col_label],
                            'type': type,
                            'tvd': np.max(diff),
                            'l1d': np.sum(diff),
                            'l2d': np.sqrt(np.sum(diff ** 2)),
                            'hellinger': hell})

        result.append(out)

    # reset numpy's global seed
    np.random.seed(None)

    return pd.concat(result)
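
A tiny worked check of the distance formulas above, on hypothetical share vectors:

import numpy as np

tgt_shares = np.array([0.5, 0.3, 0.2])
syn_shares = np.array([0.4, 0.4, 0.2])
diff = np.abs(tgt_shares - syn_shares)
print(diff.max())                  # tvd: 0.1
print(diff.sum())                  # l1d: 0.2
print(np.sqrt((diff ** 2).sum()))  # l2d: ~0.141
print(np.sqrt(((np.sqrt(tgt_shares) - np.sqrt(syn_shares)) ** 2).sum()) / np.sqrt(2))  # hellinger: ~0.08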
Example #7
    def transform(self, data):
        data_copy = data.copy(deep=True)
        dfs_to_merge = []

        df_category = data_copy.select_dtypes(include='category')

        if not df_category.empty:

            df_category_dummies = pd.get_dummies(df_category)
            df_category_wide = df_category_dummies.unstack().fillna(0)
            df_category_wide.columns = df_category_wide.columns.map(lambda x: '{}_{}'.format(x[1], x[0]))
            df_category_wide = df_category_wide.sort_index(axis=1)

            # sort for a deterministic column ordering across runs
            possible_sequence_lengths = sorted({int(x.split("_")[0]) for x in df_category_wide.columns})

            category_columns = df_category.columns
            category_column_mapping = {}
            target_dataframe_dict = {}
            first_pos = 0
            column_ordering = []
            column_idx_original = {}

            for category_column in category_columns:
                target_categories = []
                column_mapping = {}
                subset_category_columns = [x for x in df_category_wide.columns if category_column in x]
                for seq_length in possible_sequence_lengths:
                    columns = [x for x in subset_category_columns if seq_length == int(x.split("_")[0])]
                    column_ordering.extend(columns)
                    second_pos = first_pos + len(columns)
                    mapping = {col: idx for idx, col in enumerate(columns)}
                    column_mapping[str(seq_length)] = columns
                    df_map = pd.DataFrame(df_category_wide[columns].idxmax(axis="columns").map(mapping),
                                          columns=[f"idx_{first_pos}_{second_pos}"])
                    first_pos = second_pos
                    target_categories.append(df_map)

                category_column_mapping[category_column] = column_mapping
                category_target = pd.concat(target_categories, axis=1)
                target_dataframe_dict[category_column] = category_target
                column_idx_original[category_column] = [[pos, name] for pos, name in enumerate(category_target.columns)]

            # needed later to compute the loss
            self._category_column_mapping = category_column_mapping

            dfs_to_merge.append(df_category_wide.apply(lambda x: pd.Categorical(x))[column_ordering])

        df_numerics = data_copy.select_dtypes(include='number')

        if not df_numerics.empty:
            df_numerics_wide = df_numerics.unstack().fillna(0)
            df_numerics_wide.columns = df_numerics_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1]))
            self._transformer_mean = df_numerics_wide.mean()
            self._transformer_std = df_numerics_wide.std()
            df_numerics_wide_standard = (df_numerics_wide - self._transformer_mean) / self._transformer_std
            dfs_to_merge.append(df_numerics_wide_standard)

        if len(dfs_to_merge) > 1:
            transformed_data = functools.reduce(lambda left, right: pd.merge(left, right, on=['id']), dfs_to_merge)
        else:
            transformed_data = dfs_to_merge[0]

        self.original_column_mapping = _generate_column_type_dictionary(data)
        self.column_mapping = _generate_column_type_dictionary(transformed_data)
        self.idx_mapping = {idx: self.column_mapping[x] for idx, x in enumerate(transformed_data.columns)}
        self._last_index = None
        self._category_idx_real_data = None

        # data will need to have targets for loss
        if not df_category.empty:
            self._last_index = transformed_data.shape[1]
            self._category_idx_real_data = {}
            end_idx = transformed_data.shape[1]
            for cat_col_name, cat_target_dataframe in target_dataframe_dict.items():
                transformed_data = pd.concat([transformed_data, cat_target_dataframe], axis=1)
                pos_info = column_idx_original[cat_col_name]
                self._category_idx_real_data[cat_col_name] = {name: pos + end_idx for pos, name in pos_info}
                end_idx = end_idx + cat_target_dataframe.shape[1]

        return transformed_data
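
A sketch of driving this transform, assuming the enclosing class is a transformer over a frame with a (id, sequence_pos) MultiIndex, category-typed categorical columns, and numeric columns; SequenceTransformer is a hypothetical name:

transformer = SequenceTransformer()
wide = transformer.transform(tgt)  # one row per id: one-hot categories plus standardized numerics
print(transformer.idx_mapping)     # flat column index -> inferred column type, used downstream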