Example #1
    def __init__(self, datatype, metadata):
        assert datatype in [DataFrame,
                            ndarray], 'Unknown data type {}'.format(datatype)
        self.datatype = datatype
        self.nfeatures = 0

        self.cat_attr_idx = metadata['categorical_columns'] + metadata[
            'ordinal_columns']
        self.cat_attr_names = []
        self.category_codes = {}

        for cidx in self.cat_attr_idx:
            col = metadata['columns'][cidx]
            self.cat_attr_names.append(col['name'])
            self.category_codes[col['name']] = col['i2s']
            self.nfeatures += col['size'] - 1  # one-hot encoded with one category dropped

        self.num_attr_idx = metadata['continuous_columns']
        self.num_attr_names = []
        self.histogram_bins = {}

        for cidx in self.num_attr_idx:
            col = metadata['columns'][cidx]
            self.num_attr_names.append(col['name'])
            self.nfeatures += 1

        LOGGER.debug(f'Feature set will have length {self.nfeatures}')
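
For reference, a minimal sketch of the metadata layout this constructor consumes; the key names come from the code above, while the concrete columns and values are purely illustrative:

metadata = {
    'categorical_columns': [0],      # indices into metadata['columns']
    'ordinal_columns': [1],
    'continuous_columns': [2],
    'columns': [
        {'name': 'sex', 'size': 2, 'i2s': ['F', 'M']},          # 'i2s': bin index -> category label
        {'name': 'edu', 'size': 3, 'i2s': ['lo', 'mid', 'hi']},
        {'name': 'age'},                                        # continuous attribute
    ],
}
# nfeatures = (2 - 1) + (3 - 1) + 1 = 4: each categorical/ordinal attribute
# contributes size - 1 features (one-hot with one category dropped), and each
# numerical attribute contributes a single feature.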
Example #2
    def generate_samples(self, nsamples):
        """
        Samples synthetic data records from the fitted generative distribution

        :param nsamples: int: Number of synthetic records to generate
        :return: synthetic_dataset: DataFrame: A synthetic dataset
        """
        assert self.trained, "Model must be fitted to some real data first"

        LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')

        all_attributes = self.data_describer.metadata['attribute_list']
        synthetic_data = DataFrame(columns=all_attributes)

        # Get samples for attributes modelled in the Bayesian net
        encoded_dataset = self._generate_encoded_dataset(nsamples)

        for attr in all_attributes:
            column = self.data_describer.attr_dict[attr]
            if attr in encoded_dataset:
                synthetic_data[attr] = column.sample_values_from_binning_indices(encoded_dataset[attr])
            else:
                # For attributes not in BN use independent attribute mode
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(nsamples)
                synthetic_data[attr] = column.sample_values_from_binning_indices(binning_indices)

        return synthetic_data
Example #3
    def train(self, synA, labels):
        """ Train a membership inference attack on a labelled training set

         :param synA: list of ndarrays: A list of synthetic datasets
         :param labels: list: A list of labels that indicate whether target was in the training data (LABEL_IN=1) or not (LABEL_OUT=0)
         """

        if self.FeatureSet is not None:
            synA = stack([self.FeatureSet.extract(s) for s in synA])
        else:
            if isinstance(synA[0], DataFrame):
                synA = [self._impute_missing_values(s) for s in synA]
                synA = stack([
                    convert_df_to_array(s, self.metadata).flatten()
                    for s in synA
                ])
            else:
                synA = stack([s.flatten() for s in synA])
        if not isinstance(labels, ndarray):
            labels = array(labels)

        self.AttackClassifier.fit(synA, labels)

        LOGGER.debug('Finished training MIA distinguisher')
        self.trained = True

        del synA, labels
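
A hedged usage sketch: with synA and labels produced by one of the shadow-data helpers below (Examples #7 and #12), and attack denoting an object that exposes this train method with FeatureSet left as None so the raw datasets are flattened, training reduces to a single call. The variable names gm, target and rawA are hypothetical:

synA, labels = generate_mia_shadow_data_allin(gm, target, rawA,
                                              sizeSyn=1000, numCopies=10)
attack.train(synA, labels)   # fits attack.AttackClassifier on the flattened datasets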
Example #4
    def fit(self, rawTrain):
        """
        Fit a generative model of the training data distribution.
        The BayNet model first models the conditional independence structure of data attributes
        as a Bayesian network and then fits a set of conditional marginals to the training data.

        :param rawTrain: DataFrame: Training dataset
        :return: None
        """
        assert isinstance(rawTrain, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'
        LOGGER.debug(f'Start training BayesianNet on data of shape {rawTrain.shape}...')
        if self.trained:
            self.trained = False
            self.data_describer.data_description = {}
            self.bayesian_network = None
            self.conditional_probabilities = None

        self.data_describer.describe(rawTrain)

        encoded_df = DataFrame()
        for attr in self.data_describer.metadata['attribute_list_hist']:
            column = self.data_describer.attr_dict[attr]
            encoded_df[attr] = column.encode_values_into_bin_idx()
        if encoded_df.shape[1] < 2:
            raise ValueError("BayesianNet requires at least 2 attributes (i.e., columns) in the dataset.")

        if self.multiprocess:
            self.bayesian_network = self._greedy_bayes_multiprocess(encoded_df, self.k)
        else:
            self.bayesian_network = self._greedy_bayes_linear(encoded_df, self.k)
        self.conditional_probabilities = self._construct_conditional_probabilities(self.bayesian_network, encoded_df)

        LOGGER.debug('Finished training Bayesian net')
        self.trained = True
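
As a rough illustration of what fit produces, assuming model is an instance of the class this method belongs to and assuming the greedy search returns (child, parents) pairs with at most k parents per attribute (an assumption; the helper bodies are not shown here):

model.fit(raw_df)  # raw_df: DataFrame with >= 2 columns
# model.bayesian_network           -> e.g. [('B', ['A']), ('C', ['A', 'B'])] for k = 2
# model.conditional_probabilities  -> conditional marginals P(child | parents)
#                                     estimated from the bin-encoded training data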
Example #5
    def fit(self, data):
        """Fit a Gaussian mixture model to the input data. Input data is assumed to be of shape (n_samples, n_features).
        See https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture.fit for details"""
        LOGGER.debug(
            f'Start fitting GaussianMixtureModel to data of shape {data.shape}...'
        )
        self.gm.fit(data)
        LOGGER.debug('Finished fitting GMM')
        self.trained = True
Example #6
    def generate_samples(self, nsamples):
        """
        Samples synthetic data records from the fitted generative distribution

        :param nsamples: int: Number of synthetic records to generate
        :return: synData: DataFrame: A synthetic dataset
        """
        assert self.trained, "Model must first be fitted to some data."

        LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
        synData = self.synthesiser.sample(nsamples)

        return synData
Example #7
def generate_mia_shadow_data_allin(GenModel, target, rawA, sizeSyn, numCopies):
    """ Generate training data for the MIA from a *single* shadow model trained on the entire reference dataset at once

    :param GenModel: GenerativeModel: An object that implements a generative model training procedure
    :param target: ndarray or DataFrame: The target record
    :param rawA: ndarray or DataFrame: Attacker's reference dataset of size n_A
    :param sizeSyn: int: Size of the synthetic dataset the adversary will be given access to
    :param numCopies: int: Number of synthetic training datasets sampled from the shadow model

    :return synA: list of ndarrays or DataFrames: List of synthetic datasets
    :return labels: list: List of labels indicating whether target was in or out
    """
    assert isinstance(
        rawA, GenModel.datatype
    ), f"GM expects datatype {GenModel.datatype} but got {type(rawA)}"
    assert isinstance(
        target, type(rawA)
    ), "Mismatch of datatypes between target record and raw data"

    synA, labels = [], []

    LOGGER.debug(
        f'Start training shadow model of class {GenModel.__class__.__name__} on data of size {len(rawA)}'
    )

    # Fit GM to data without target's data
    GenModel.fit(rawA)

    # Generate synthetic sample for data without target
    synAout = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)]
    synA.extend(synAout)
    labels.extend([LABEL_OUT for _ in range(numCopies)])

    # Insert target record into the training data
    if isinstance(rawA, DataFrame):
        # NB: DataFrame.append was removed in pandas 2.0; use pandas.concat([rawA, target]) there
        rawAin = rawA.append(target)
    else:
        if len(target.shape) == 1:
            target = target.reshape(1, len(target))
        rawAin = concatenate([rawA, target])

    # Fit generative model to data including target
    GenModel.fit(rawAin)

    # Generate synthetic sample for data including target
    synAin = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)]
    synA.extend(synAin)
    labels.extend([LABEL_IN for _ in range(numCopies)])

    return synA, labels
Example #8
    def fit(self, rawTrain):
        """
        Fit a generative model of the training data distribution.
        The IndHist model extracts frequency counts from each attribute independently.

        :param rawTrain: DataFrame: Training dataset
        :return: None
        """
        assert isinstance(rawTrain, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'
        LOGGER.debug(f'Start fitting IndependentHistogram model to data of shape {rawTrain.shape}...')
        if self.trained:
            # Make sure to delete previous data description
            self.data_describer = DataDescriber(self.category_threshold, self.histogram_bins)
        self.data_describer.describe(rawTrain)
        LOGGER.debug('Finished fitting IndependentHistogram')
        self.trained = True
Example #9
    def train(self, synT):
        """
        Train a MLE attack to reconstruct an unknown sensitive value from a vector of known attributes

        :param synT: DataFrame: A synthetic dataset of shape (n, k + 1)
        """

        # Split data into known and sensitive
        if isinstance(synT, DataFrame):
            assert self.sensitiveAttribute in list(
                synT), f'DataFrame only contains columns {list(synT)}'

            synKnown = synT.drop(self.sensitiveAttribute, axis=1)
            synSensitive = synT[self.sensitiveAttribute]

            synKnown = convert_df_to_array(synKnown, self.metadata)
            synSensitive = convert_series_to_array(synSensitive, self.metadata)

        else:
            assert isinstance(synT, ndarray), f"Unknown data type {type(synT)}"

            # If input data is array assume that self.metadata is the schema of the array
            attrList = [c['name'] for c in self.metadata['columns']]
            sensitiveIdx = attrList.index(self.sensitiveAttribute)
            synKnown = synT[:, [
                i for i in range(len(attrList)) if i != sensitiveIdx
            ]]
            synSensitive = synT[:, sensitiveIdx]

        n, k = synKnown.shape

        # Centre independent variables for better regression performance
        self.scaleFactor = mean(synKnown, axis=0)
        synKnownScaled = synKnown - self.scaleFactor
        synKnownScaled = concatenate(
            [ones((len(synKnownScaled), 1)), synKnownScaled],
            axis=1)  # prepend a column of ones so the intercept is part of the coefficient vector

        # Get MLE for linear coefficients
        self.RegressionModel.fit(synKnownScaled, synSensitive)
        self.coefficients = self.RegressionModel.coef_
        # Residual variance estimate: RSS / (n - k)
        self.sigma = sum((synSensitive - synKnownScaled.dot(self.coefficients))
                         **2) / (n - k)

        LOGGER.debug('Finished training regression model')
        self.trained = True
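
The training above centres the features and folds the intercept into the coefficient vector, so prediction at attack time would plausibly look like the following sketch (not the class's actual predict method; predict_sensitive is a hypothetical helper):

from numpy import concatenate, ones

def predict_sensitive(attack, known):
    """known: ndarray of shape (m, k), columns ordered as during training."""
    centred = known - attack.scaleFactor                      # same centring as in train
    design = concatenate([ones((len(centred), 1)), centred], axis=1)
    return design.dot(attack.coefficients)                    # MLE point estimate
# attack.sigma holds the residual variance RSS / (n - k); it could
# parameterise a Gaussian likelihood around this point estimate.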
Example #10
    def generate_samples(self, nsamples):
        """
        Samples synthetic data records from the fitted generative distribution

        :param nsamples: int: Number of synthetic records to generate
        :return: synthetic_dataset: DataFrame: A synthetic dataset
        """
        assert self.trained, "Model must be fitted to some real data first"
        LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
        all_attributes = list(self.data_describer.metadata['attribute_list'])
        synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.data_describer.data_description[attr]
            column = parse_json(attr_info)
            binning_indices = column.sample_binning_indices_in_independent_attribute_mode(nsamples)
            synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
        LOGGER.debug(f'Generated synthetic dataset of size {nsamples}')
        return synthetic_dataset
Example #11
    def fit(self, rawTrain):
        """
        Fit a generative model of the training data distribution.
        See <https://github.com/sdv-dev/CTGAN> for details.

        :param rawTrain: DataFrame or ndarray: Training set
        """
        assert isinstance(
            rawTrain, self.datatype
        ), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'

        LOGGER.debug(
            f'Start fitting {self.__class__.__name__} to data of shape {rawTrain.shape}...'
        )
        self.synthesiser.fit(rawTrain, self.metadata)

        LOGGER.debug('Finished fitting')
        self.trained = True
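
Combining this with Example #6, a hedged end-to-end sketch; the wrapper class name CTGAN and its constructor signature are assumptions, only fit and generate_samples are taken from the code above:

model = CTGAN(metadata=metadata)       # hypothetical constructor wrapping sdv-dev/CTGAN
model.fit(raw_df)                      # raw_df must be an instance of model.datatype
syn_df = model.generate_samples(1000)  # returns a synthetic dataset of 1000 records
assert len(syn_df) == 1000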
Example #12
def generate_mia_shadow_data_shufflesplit(GenModel, target, rawA, sizeRaw,
                                          sizeSyn, numModels, numCopies):
    """ Procedure to train a set of shadow models on multiple training sets sampled from a reference dataset.

    :param GenModel: GenerativeModel: An object that implements a generative model training procedure
    :param target: ndarray or DataFrame: The target record
    :param rawA: ndarray or DataFrame: Attacker's reference dataset of size n_A
    :param sizeRaw: int: Size of the target training set
    :param sizeSyn: int: Size of the synthetic dataset the adversary will be given access to
    :param numModels: int: Number of shadow models to train
    :param numCopies: int: Number of synthetic training datasets sampled from each shadow model

    :return synA: list of ndarrays or DataFrames: List of synthetic datasets
    :return labels: list: List of labels indicating whether target was in or out
    """
    assert isinstance(
        rawA, GenModel.datatype
    ), f"GM expects datatype {GenModel.datatype} but got {type(rawA)}"
    assert isinstance(
        target, type(rawA)
    ), "Mismatch of datatypes between target record and raw data"

    kf = ShuffleSplit(n_splits=numModels, train_size=sizeRaw)
    synA, labels = [], []

    LOGGER.debug(
        f'Start training {numModels} shadow models of class {GenModel.__class__.__name__}'
    )

    for train_index, _ in kf.split(rawA):

        # Sample a new training set from the reference dataset
        if isinstance(rawA, DataFrame):
            rawAout = rawA.iloc[train_index]
        else:
            rawAout = rawA[train_index, :]

        # Fit GM to raw data without target
        GenModel.fit(rawAout)

        # Generate synthetic samples from model trained without target and label them as out
        synAout = [
            GenModel.generate_samples(sizeSyn) for _ in range(numCopies)
        ]
        synA.extend(synAout)
        labels.extend([LABEL_OUT for _ in range(numCopies)])

        # Insert target record into training data
        if isinstance(rawA, DataFrame):
            # NB: DataFrame.append was removed in pandas 2.0; use pandas.concat([rawAout, target]) there
            rawAin = rawAout.append(target)
        else:
            if len(target.shape) == 1:
                target = target.reshape(1, len(target))
            rawAin = concatenate([rawAout, target])

        # Fit generative model to raw data including target
        GenModel.fit(rawAin)

        # Generate synthetic samples from model trained on data including target and label them as in
        synAin = [
            GenModel.generate_samples(sizeSyn) for _ in range(numCopies)
        ]
        synA.extend(synAin)
        labels.extend([LABEL_IN for _ in range(numCopies)])

    return synA, labels
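
For sizing the attack training set: each of the numModels splits contributes numCopies 'out' and numCopies 'in' datasets, so the output grows as 2 * numModels * numCopies and the labels stay balanced (gm, target and rawA are hypothetical placeholders):

synA, labels = generate_mia_shadow_data_shufflesplit(
    gm, target, rawA, sizeRaw=1000, sizeSyn=1000, numModels=10, numCopies=5)
assert len(synA) == 2 * 10 * 5
assert labels.count(LABEL_IN) == labels.count(LABEL_OUT)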
Example #13
    def generate_samples(self, nsamples):
        """Generate random samples from the fitted Gaussian mixture model"""
        assert self.trained, "Model must first be fitted to some data."
        LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
        synthetic_data, _ = self.gm.sample(nsamples)
        return synthetic_data
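
Since Example #5's docstring points at sklearn.mixture.GaussianMixture, the fit/sample pair can be exercised directly against scikit-learn. A self-contained sketch (the toy data and n_components value are illustrative):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
data = rng.normal(size=(500, 3))                 # toy data, shape (n_samples, n_features)

gm = GaussianMixture(n_components=2).fit(data)   # what Example #5's self.gm.fit(data) delegates to
samples, _ = gm.sample(100)                      # what self.gm.sample(nsamples) returns: (X, labels)
assert samples.shape == (100, 3)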