def __init__(self, datatype, metadata):
    assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype)
    self.datatype = datatype
    self.nfeatures = 0

    self.cat_attr_idx = metadata['categorical_columns'] + metadata['ordinal_columns']
    self.cat_attr_names = []
    self.category_codes = {}

    for cidx in self.cat_attr_idx:
        col = metadata['columns'][cidx]
        self.cat_attr_names.append(col['name'])
        self.category_codes[col['name']] = col['i2s']
        # A categorical attribute with c categories adds c - 1 features
        self.nfeatures += col['size'] - 1

    self.num_attr_idx = metadata['continuous_columns']
    self.num_attr_names = []
    self.histogram_bins = {}

    for cidx in self.num_attr_idx:
        col = metadata['columns'][cidx]
        self.num_attr_names.append(col['name'])
        # Each continuous attribute adds a single feature
        self.nfeatures += 1

    LOGGER.debug(f'Feature set will have length {self.nfeatures}')
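# For reference, a minimal metadata dict matching the schema the constructor
# above reads. This is a sketch inferred from the key accesses in __init__,
# not a normative spec; the attribute names and codes are made up.
_EXAMPLE_METADATA = {
    'categorical_columns': [0],
    'ordinal_columns': [],
    'continuous_columns': [1],
    'columns': [
        {'name': 'sex', 'i2s': ['F', 'M'], 'size': 2},  # categorical: size - 1 features
        {'name': 'age'},                                # continuous: 1 feature
    ],
}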
def generate_samples(self, nsamples):
    """
    Samples synthetic data records from the fitted generative distribution

    :param nsamples: int: Number of synthetic records to generate
    :return: synthetic_data: DataFrame: A synthetic dataset
    """
    assert self.trained, "Model must be fitted to some real data first"
    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')

    all_attributes = self.data_describer.metadata['attribute_list']
    synthetic_data = DataFrame(columns=all_attributes)

    # Get samples for attributes modelled in the Bayesian net
    encoded_dataset = self._generate_encoded_dataset(nsamples)

    for attr in all_attributes:
        column = self.data_describer.attr_dict[attr]
        if attr in encoded_dataset:
            synthetic_data[attr] = column.sample_values_from_binning_indices(encoded_dataset[attr])
        else:
            # For attributes not in the Bayesian net, sample in independent attribute mode
            binning_indices = column.sample_binning_indices_in_independent_attribute_mode(nsamples)
            synthetic_data[attr] = column.sample_values_from_binning_indices(binning_indices)

    return synthetic_data
def train(self, synA, labels):
    """
    Train a membership inference attack on a labelled training set

    :param synA: list of ndarrays: A list of synthetic datasets
    :param labels: list: A list of labels that indicate whether the target was
        in the training data (LABEL_IN = 1) or not (LABEL_OUT = 0)
    """
    if self.FeatureSet is not None:
        synA = stack([self.FeatureSet.extract(s) for s in synA])
    else:
        if isinstance(synA[0], DataFrame):
            synA = [self._impute_missing_values(s) for s in synA]
            synA = stack([convert_df_to_array(s, self.metadata).flatten() for s in synA])
        else:
            synA = stack([s.flatten() for s in synA])

    if not isinstance(labels, ndarray):
        labels = array(labels)

    self.AttackClassifier.fit(synA, labels)

    LOGGER.debug('Finished training MIA distinguisher')
    self.trained = True

    del synA, labels
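# A minimal sketch of the distinguisher logic above, kept separate from the
# class for illustration. It assumes no FeatureSet: each synthetic dataset is
# flattened into a single feature vector and a binary classifier is fit on the
# IN/OUT labels. The classifier choice (RandomForestClassifier) is an
# assumption, not necessarily what self.AttackClassifier is configured with.
def _example_train_mia_distinguisher(synA, labels):
    from numpy import array, stack
    from sklearn.ensemble import RandomForestClassifier

    X = stack([s.flatten() for s in synA])  # one row per synthetic dataset
    y = array(labels)                       # LABEL_IN (1) / LABEL_OUT (0)
    return RandomForestClassifier(n_estimators=100).fit(X, y)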
def fit(self, rawTrain):
    """
    Fit a generative model of the training data distribution.

    The BayNet model first models the conditional independence structure of
    data attributes as a Bayesian network and then fits a set of conditional
    marginals to the training data.

    :param rawTrain: DataFrame: Training dataset
    :return: None
    """
    assert isinstance(rawTrain, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'
    LOGGER.debug(f'Start training BayesianNet on data of shape {rawTrain.shape}...')

    if self.trained:
        # Reset the model state before re-fitting
        self.trained = False
        self.data_describer.data_description = {}
        self.bayesian_network = None
        self.conditional_probabilities = None

    self.data_describer.describe(rawTrain)

    encoded_df = DataFrame()
    for attr in self.data_describer.metadata['attribute_list_hist']:
        column = self.data_describer.attr_dict[attr]
        encoded_df[attr] = column.encode_values_into_bin_idx()

    if encoded_df.shape[1] < 2:
        raise Exception("BayesianNet requires at least 2 attributes (i.e., columns) in the dataset.")

    if self.multiprocess:
        self.bayesian_network = self._greedy_bayes_multiprocess(encoded_df, self.k)
    else:
        self.bayesian_network = self._greedy_bayes_linear(encoded_df, self.k)

    self.conditional_probabilities = self._construct_conditional_probabilities(self.bayesian_network, encoded_df)

    LOGGER.debug('Finished training Bayesian net')
    self.trained = True
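# Usage sketch for the fit/sample cycle shared by the generative models in
# this module (illustrative only; `model` is any object exposing the
# fit/generate_samples interface defined here):
def _example_fit_then_sample(model, rawTrain, nsamples=1000):
    model.fit(rawTrain)                      # learns structure + conditionals
    return model.generate_samples(nsamples)  # DataFrame of synthetic records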
def fit(self, data):
    """Fit a Gaussian mixture model to the input data.

    Input data is assumed to be of shape (n_samples, n_features). See
    https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture.fit
    for details.
    """
    LOGGER.debug(f'Start fitting GaussianMixtureModel to data of shape {data.shape}...')

    self.gm.fit(data)

    LOGGER.debug('Finished fitting GMM')
    self.trained = True
def generate_samples(self, nsamples):
    """
    Samples synthetic data records from the fitted generative distribution

    :param nsamples: int: Number of synthetic records to generate
    :return: synData: DataFrame: A synthetic dataset
    """
    assert self.trained, "Model must first be fitted to some data."
    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')

    synData = self.synthesiser.sample(nsamples)

    return synData
def generate_mia_shadow_data_allin(GenModel, target, rawA, sizeSyn, numCopies):
    """
    Generate training data for the MIA from a *single* shadow model trained on
    the entire reference dataset at once

    :param GenModel: GenerativeModel: An object that implements a generative model training procedure
    :param target: ndarray or DataFrame: The target record
    :param rawA: ndarray or DataFrame: Attacker's reference dataset of size n_A
    :param sizeSyn: int: Size of the synthetic dataset the adversary will be given access to
    :param numCopies: int: Number of synthetic training datasets sampled from the shadow model

    :returns:
        synA: list of ndarrays or DataFrames: List of synthetic datasets
        labels: list: List of labels indicating whether the target was in or out
    """
    assert isinstance(rawA, GenModel.datatype), f"GM expects datatype {GenModel.datatype} but got {type(rawA)}"
    assert isinstance(target, type(rawA)), "Mismatch of datatypes between target record and raw data"

    synA, labels = [], []

    LOGGER.debug(f'Start training shadow model of class {GenModel.__class__.__name__} on data of size {len(rawA)}')

    # Fit GM to data without the target's record
    GenModel.fit(rawA)

    # Generate synthetic samples for data without the target
    synAout = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)]
    synA.extend(synAout)
    labels.extend([LABEL_OUT for _ in range(numCopies)])

    # Insert the target record into the training data
    if isinstance(rawA, DataFrame):
        rawAin = rawA.append(target)
    else:
        if len(target.shape) == 1:
            target = target.reshape(1, len(target))
        rawAin = concatenate([rawA, target])

    # Fit generative model to data including the target
    GenModel.fit(rawAin)

    # Generate synthetic samples for data including the target
    synAin = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)]
    synA.extend(synAin)
    labels.extend([LABEL_IN for _ in range(numCopies)])

    return synA, labels
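# Usage sketch for the single-shadow-model procedure above. `GenModel` is any
# GenerativeModel implementation from this codebase; `rawA` and `target` are
# assumed to be DataFrames (or arrays) sharing one schema, and the parameter
# values are illustrative assumptions.
def _example_shadow_data_allin(GenModel, target, rawA):
    synA, labels = generate_mia_shadow_data_allin(
        GenModel, target, rawA, sizeSyn=1000, numCopies=5)
    # One model fitted twice (without/with target) -> 5 OUT + 5 IN datasets
    assert len(synA) == len(labels) == 10
    return synA, labels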
def fit(self, rawTrain):
    """
    Fit a generative model of the training data distribution.

    The IndHist model extracts frequency counts from each attribute independently.

    :param rawTrain: DataFrame: Training dataset
    :return: None
    """
    assert isinstance(rawTrain, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'
    LOGGER.debug(f'Start fitting IndependentHistogram model to data of shape {rawTrain.shape}...')

    if self.trained:
        # Make sure to delete previous data description
        self.data_describer = DataDescriber(self.category_threshold, self.histogram_bins)

    self.data_describer.describe(rawTrain)

    LOGGER.debug('Finished fitting IndependentHistogram')
    self.trained = True
def train(self, synT):
    """
    Train an MLE attack to reconstruct an unknown sensitive value from a vector of known attributes

    :param synT: DataFrame: A synthetic dataset of shape (n, k + 1)
    """
    # Split data into known and sensitive attributes
    if isinstance(synT, DataFrame):
        assert self.sensitiveAttribute in list(synT), f'DataFrame only contains columns {list(synT)}'

        synKnown = synT.drop(self.sensitiveAttribute, axis=1)
        synSensitive = synT[self.sensitiveAttribute]

        synKnown = convert_df_to_array(synKnown, self.metadata)
        synSensitive = convert_series_to_array(synSensitive, self.metadata)
    else:
        assert isinstance(synT, ndarray), f"Unknown data type {type(synT)}"

        # If the input data is an array, assume that self.metadata is the schema of the array
        attrList = [c['name'] for c in self.metadata['columns']]
        sensitiveIdx = attrList.index(self.sensitiveAttribute)

        synKnown = synT[:, [i for i in range(len(attrList)) if i != sensitiveIdx]]
        synSensitive = synT[:, sensitiveIdx]

    n, k = synKnown.shape

    # Centre independent variables for better regression performance
    self.scaleFactor = mean(synKnown, axis=0)
    synKnownScaled = synKnown - self.scaleFactor
    # Append a column of ones to include the intercept in the beta vector
    synKnownScaled = concatenate([ones((len(synKnownScaled), 1)), synKnownScaled], axis=1)

    # Get the MLE for the linear coefficients
    self.RegressionModel.fit(synKnownScaled, synSensitive)
    self.coefficients = self.RegressionModel.coef_
    self.sigma = sum((synSensitive - synKnownScaled.dot(self.coefficients)) ** 2) / (n - k)

    LOGGER.debug('Finished training regression model')
    self.trained = True
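# A minimal numerical sketch of the MLE fit above using numpy's least-squares
# solver directly, assuming purely continuous attributes already converted to
# arrays. It mirrors the centring, the intercept column and the residual
# variance estimate; the function name is hypothetical.
def _example_linear_mle(synKnown, synSensitive):
    from numpy import concatenate, mean, ones
    from numpy.linalg import lstsq

    n, k = synKnown.shape
    X = concatenate([ones((n, 1)), synKnown - mean(synKnown, axis=0)], axis=1)
    beta, *_ = lstsq(X, synSensitive, rcond=None)                 # MLE coefficients
    sigma = ((synSensitive - X.dot(beta)) ** 2).sum() / (n - k)   # residual variance
    return beta, sigma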
def generate_samples(self, nsamples):
    """
    Samples synthetic data records from the fitted generative distribution

    :param nsamples: int: Number of synthetic records to generate
    :return: synthetic_dataset: DataFrame: A synthetic dataset
    """
    assert self.trained, "Model must be fitted to some real data first"
    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')

    all_attributes = list(self.data_describer.metadata['attribute_list'])
    synthetic_dataset = DataFrame(columns=all_attributes)

    for attr in all_attributes:
        attr_info = self.data_describer.data_description[attr]
        column = parse_json(attr_info)
        binning_indices = column.sample_binning_indices_in_independent_attribute_mode(nsamples)
        synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)

    LOGGER.debug(f'Generated synthetic dataset of size {nsamples}')

    return synthetic_dataset
def fit(self, rawTrain):
    """
    Fit a generative model of the training data distribution.

    See <https://github.com/sdv-dev/CTGAN> for details.

    :param rawTrain: DataFrame or ndarray: Training set
    """
    assert isinstance(rawTrain, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(rawTrain)}'
    LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {rawTrain.shape}...')

    self.synthesiser.fit(rawTrain, self.metadata)

    LOGGER.debug('Finished fitting')
    self.trained = True
def generate_mia_shadow_data_shufflesplit(GenModel, target, rawA, sizeRaw, sizeSyn, numModels, numCopies):
    """
    Procedure to train a set of shadow models on multiple training sets sampled from a reference dataset.

    :param GenModel: GenerativeModel: An object that implements a generative model training procedure
    :param target: ndarray or DataFrame: The target record
    :param rawA: ndarray or DataFrame: Attacker's reference dataset of size n_A
    :param sizeRaw: int: Size of the target training set
    :param sizeSyn: int: Size of the synthetic dataset the adversary will be given access to
    :param numModels: int: Number of shadow models to train
    :param numCopies: int: Number of synthetic training datasets sampled from each shadow model

    :returns:
        synA: list of ndarrays or DataFrames: List of synthetic datasets
        labels: list: List of labels indicating whether the target was in or out
    """
    assert isinstance(rawA, GenModel.datatype), f"GM expects datatype {GenModel.datatype} but got {type(rawA)}"
    assert isinstance(target, type(rawA)), "Mismatch of datatypes between target record and raw data"

    kf = ShuffleSplit(n_splits=numModels, train_size=sizeRaw)
    synA, labels = [], []

    LOGGER.debug(f'Start training {numModels} shadow models of class {GenModel.__class__.__name__}')

    for train_index, _ in kf.split(rawA):
        # Sample a new training set from the reference dataset
        if isinstance(rawA, DataFrame):
            rawAout = rawA.iloc[train_index]
        else:
            rawAout = rawA[train_index, :]

        # Fit GM to raw data without the target
        GenModel.fit(rawAout)

        # Generate synthetic samples from the model trained without the target and label them as out
        synAout = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)]
        synA.extend(synAout)
        labels.extend([LABEL_OUT for _ in range(numCopies)])

        # Insert the target record into the training data
        if isinstance(rawA, DataFrame):
            rawAin = rawAout.append(target)
        else:
            if len(target.shape) == 1:
                target = target.reshape(1, len(target))
            rawAin = concatenate([rawAout, target])

        # Fit generative model to raw data including the target
        GenModel.fit(rawAin)

        # Generate synthetic samples from the model trained on data including the target and label them as in
        synAin = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)]
        synA.extend(synAin)
        labels.extend([LABEL_IN for _ in range(numCopies)])

    return synA, labels
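# End-to-end usage sketch: generate labelled shadow data with the shuffle-split
# procedure above and feed it to an attack object exposing the train() method
# shown earlier in this module. All concrete parameter values are illustrative
# assumptions.
def _example_run_shadow_mia(GenModel, attack, target, rawA):
    synA, labels = generate_mia_shadow_data_shufflesplit(
        GenModel, target, rawA,
        sizeRaw=1000,   # records per shadow training set
        sizeSyn=1000,   # size of each synthetic dataset
        numModels=10,   # independent splits of the reference data
        numCopies=5)    # synthetic datasets per fitted shadow model
    assert len(synA) == 2 * 10 * 5  # numCopies IN + numCopies OUT per model
    attack.train(synA, labels)
    return attack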
def generate_samples(self, nsamples):
    """Generate random samples from the fitted Gaussian distribution"""
    assert self.trained, "Model must first be fitted to some data."
    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')

    synthetic_data, _ = self.gm.sample(nsamples)

    return synthetic_data
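# Sketch of the underlying scikit-learn calls wrapped by the GMM fit/sample
# methods above, assuming self.gm is a sklearn.mixture.GaussianMixture (as the
# documentation link in fit() suggests); the component count is an assumption.
def _example_gmm_roundtrip(data, nsamples=100):
    from sklearn.mixture import GaussianMixture

    gm = GaussianMixture(n_components=3)
    gm.fit(data)                      # data: (n_samples, n_features)
    samples, _ = gm.sample(nsamples)  # second return value is component labels
    return samples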