def infer_domain_of_string_attribute(self, attribute):
    datatype = self.attribute_to_datatype[attribute]
    column_values = self.input_dataset[attribute]
    column_dropna = column_values.dropna()
    column_value_lengths = column_dropna.astype(str).map(len)
    is_categorical_attribute = self.is_categorical(attribute)

    if is_categorical_attribute:
        distribution = column_dropna.value_counts()
        distribution.sort_index(inplace=True)
        distribution_probabilities = utils.normalize_given_distribution(distribution).tolist()
        distribution_bins = np.array(distribution.index).tolist()
    else:
        # Non-categorical strings are modelled by the distribution of their lengths.
        distribution = np.histogram(column_value_lengths, bins=self.histogram_size)
        distribution_probabilities = utils.normalize_given_distribution(distribution[0]).tolist()
        distribution_bins = distribution[1][:-1].tolist()
        # Nudge the first bin edge slightly left so the minimum value falls inside bin 0.
        distribution_bins[0] = distribution_bins[0] - 0.001 * (distribution_bins[1] - distribution_bins[0])

    attribute_info = {'datatype': datatype,
                      'is_categorical': is_categorical_attribute,
                      'min_length': int(column_value_lengths.min()),
                      'max_length': int(column_value_lengths.max()),
                      'distribution_bins': distribution_bins,
                      'distribution_probabilities': distribution_probabilities,
                      'missing_rate': column_values.isnull().sum() / column_values.index.size}

    return attribute_info

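# NOTE: these excerpts assume their modules' usual imports, which are not shown
# in this section: numpy as np (and its arange), pandas (pd, Series, DataFrame),
# dateutil.parser.parse, itertools.product, matplotlib.pyplot as plt, and a
# project utils module. They all rely on normalize_given_distribution (called
# both directly and as utils.normalize_given_distribution), whose body is not
# shown here. The sketch below is a hypothetical reconstruction, consistent
# with how the helper is used: clip negative (noisy) counts to zero, rescale to
# sum to 1, and fall back to uniform when all counts are zero.
import numpy as np

def normalize_given_distribution(frequencies):
    distribution = np.array(frequencies, dtype=float)
    distribution = distribution.clip(0)  # Laplace noise can make counts negative
    total = distribution.sum()
    if total > 0:
        return distribution / total
    # All-zero input: assume a uniform fallback.
    return np.full_like(distribution, 1 / distribution.size)
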
def infer_domain(self, column):
    assert isinstance(column, Series)
    self.data = column
    self.data_dropna = self.data.dropna()
    self.missing_rate = (self.data.size - self.data_dropna.size) / self.data.size

    # Represent datetimes as seconds since the Unix epoch.
    epoch_datetime = parse('1970-01-01')
    timestamps = self.data_dropna.map(lambda x: int((parse(x) - epoch_datetime).total_seconds()))
    self.min = float(timestamps.min())
    self.max = float(timestamps.max())

    if self.is_categorical:
        distribution = self.data_dropna.value_counts()
        distribution.sort_index(inplace=True)
        self.distribution_probabilities = normalize_given_distribution(distribution).tolist()
        self.distribution_bins = np.array(distribution.index).tolist()
    else:
        distribution = np.histogram(timestamps, bins=self.histogram_size)
        self.distribution_probabilities = normalize_given_distribution(distribution[0]).tolist()
        bins = distribution[1][:-1].tolist()
        # Nudge the first bin edge slightly left so the minimum timestamp falls inside bin 0.
        bins[0] = bins[0] - 0.001 * (bins[1] - bins[0])
        self.distribution_bins = bins

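# A minimal usage sketch of the datetime-to-timestamp conversion above; the
# column values are made up for illustration.
from dateutil.parser import parse
from pandas import Series

dates = Series(['1970-01-02', '2000-01-01', None])
epoch = parse('1970-01-01')
timestamps = dates.dropna().map(lambda x: int((parse(x) - epoch).total_seconds()))
print(timestamps.tolist())  # [86400, 946684800]
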
def infer_domain_of_numeric_attribute(self, attribute):
    datatype = self.attribute_to_datatype[attribute]
    column_values = self.input_dataset[attribute]
    column_dropna = column_values.dropna()

    # Use timestamps to represent datetimes.
    if datatype == 'datetime':
        column_dropna = column_dropna.map(lambda x: parse(x).timestamp())

    is_categorical_attr = self.is_categorical(attribute)
    if is_categorical_attr:
        distribution = column_dropna.value_counts()
        distribution.sort_index(inplace=True)
        distribution_probabilities = utils.normalize_given_distribution(distribution).tolist()
        distribution_bins = np.array(distribution.index).tolist()
    else:
        distribution = np.histogram(column_dropna, bins=self.histogram_size)
        distribution_probabilities = utils.normalize_given_distribution(distribution[0]).tolist()
        distribution_bins = distribution[1][:-1].tolist()
        # Nudge the first bin edge slightly left so the minimum value falls inside bin 0.
        distribution_bins[0] = distribution_bins[0] - 0.001 * (distribution_bins[1] - distribution_bins[0])

    attribute_info = {'datatype': datatype,
                      'is_categorical': is_categorical_attr,
                      'min': float(column_dropna.min()),
                      'max': float(column_dropna.max()),
                      'distribution_bins': distribution_bins,
                      'distribution_probabilities': distribution_probabilities,
                      'missing_rate': column_values.isnull().sum() / column_values.index.size}

    if datatype == 'integer':
        attribute_info['min'] = int(column_dropna.min())
        attribute_info['max'] = int(column_dropna.max())

    return attribute_info

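# A small sketch (made-up values) of the histogram inference above, showing
# why the first bin edge is nudged left: np.histogram returns bins+1 edges,
# only the left edges are kept, and shifting the first edge ensures the
# minimum value still lands in bin 0 under a right-open binning convention.
import numpy as np

values = np.array([1.0, 2.0, 2.5, 4.0, 9.0])
counts, edges = np.histogram(values, bins=4)   # edges: [1, 3, 5, 7, 9]
bins = edges[:-1].tolist()                     # keep left edges only
bins[0] = bins[0] - 0.001 * (bins[1] - bins[0])
print(counts.tolist())  # [3, 1, 0, 1]
print(bins)             # [0.998, 3.0, 5.0, 7.0]
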
def construct_noisy_conditional_distributions(bayesian_network, encoded_dataset, epsilon=0.1):
    """See Algorithm 1 in PrivBayes for details."""
    k = len(bayesian_network[-1][1])
    conditional_distributions = {}

    # The first k+1 attributes: the root plus the first k children.
    root = bayesian_network[0][1][0]
    kplus1_attributes = [root]
    for child, _ in bayesian_network[:k]:
        kplus1_attributes.append(child)

    noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes(
        kplus1_attributes, encoded_dataset, epsilon)

    # Generate the noisy distribution of the root attribute.
    root_stats = noisy_dist_of_kplus1_attributes.loc[:, [root, 'count']].groupby(root).sum()['count']
    conditional_distributions[root] = normalize_given_distribution(root_stats).tolist()

    for idx, (child, parents) in enumerate(bayesian_network):
        conditional_distributions[child] = {}

        if idx < k:
            # The first k children are covered by the joint distribution already computed.
            stats = noisy_dist_of_kplus1_attributes.copy().loc[:, parents + [child, 'count']]
        else:
            stats = get_noisy_distribution_of_attributes(parents + [child], encoded_dataset, epsilon)

        stats = DataFrame(stats.loc[:, parents + [child, 'count']].groupby(parents + [child]).sum())

        if len(parents) == 1:
            for parent_instance in stats.index.levels[0]:
                dist = normalize_given_distribution(stats.loc[parent_instance]['count']).tolist()
                conditional_distributions[child][str([parent_instance])] = dist
        else:
            for parents_instance in product(*stats.index.levels[:-1]):
                dist = normalize_given_distribution(stats.loc[parents_instance]['count']).tolist()
                conditional_distributions[child][str(list(parents_instance))] = dist

    return conditional_distributions

def infer_distribution(self):
    if self.is_categorical:
        distribution = self.data_dropna.value_counts()
        # Pad domain categories absent from this sample with zero counts,
        # so bins and probabilities stay aligned.
        for value in set(self.distribution_bins) - set(distribution.index):
            distribution[value] = 0
        distribution.sort_index(inplace=True)
        self.distribution_probabilities = utils.normalize_given_distribution(distribution)
        self.distribution_bins = np.array(distribution.index)
    else:
        # Non-categorical strings are modelled by the distribution of their lengths.
        distribution = np.histogram(self.data_dropna_len, bins=self.histogram_size)
        self.distribution_bins = distribution[1][:-1]
        self.distribution_probabilities = utils.normalize_given_distribution(distribution[0])

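# A quick sketch (made-up values) of the zero-count padding above: categories
# in the inferred domain that are absent from the current sample still need a
# slot in the distribution.
from pandas import Series

domain_bins = ['a', 'b', 'c']
sample = Series(['a', 'a', 'c'])
distribution = sample.value_counts()
for value in set(domain_bins) - set(distribution.index):
    distribution[value] = 0
distribution.sort_index(inplace=True)
print(distribution.to_dict())  # {'a': 2, 'b': 0, 'c': 1}
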
def construct_noisy_conditional_distributions(bayesian_network, encoded_dataset, epsilon=0.1):
    """See Algorithm 1 in PrivBayes for details."""
    k = len(bayesian_network[-1][1])
    conditional_distributions = {}

    # The first k+1 attributes: the root plus the first k children.
    root = bayesian_network[0][1][0]
    kplus1_attributes = [root]
    for child, _ in bayesian_network[:k]:
        kplus1_attributes.append(child)

    noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes(
        kplus1_attributes, encoded_dataset, epsilon)

    # Generate the noisy distribution of the root attribute.
    root_stats = noisy_dist_of_kplus1_attributes.loc[:, [root, 'count']].groupby(root).sum()['count']
    conditional_distributions[root] = normalize_given_distribution(root_stats).tolist()

    for idx, (child, parents) in enumerate(bayesian_network):
        conditional_distributions[child] = {}

        if idx <= k - 2:
            # Covered by the joint distribution of the first k+1 attributes;
            # marginalize out the attributes not in {parents, child}.
            stats = noisy_dist_of_kplus1_attributes.copy().loc[:, parents + [child, 'count']]
            stats = stats.groupby(parents + [child], as_index=False).sum()
        elif idx == k - 1:
            # The k-th child uses all k+1 attributes directly; no marginalization needed.
            stats = noisy_dist_of_kplus1_attributes.loc[:, parents + [child, 'count']]
        else:
            stats = get_noisy_distribution_of_attributes(parents + [child], encoded_dataset, epsilon)
            stats = stats.loc[:, parents + [child, 'count']]

        for parents_instance, stats_sub in stats.groupby(parents):
            stats_sub = stats_sub.sort_values(by=child)
            dist = normalize_given_distribution(stats_sub['count']).tolist()
            if len(parents) == 1:
                parents_key = str([parents_instance])
            else:
                parents_key = str(list(parents_instance))
            conditional_distributions[child][parents_key] = dist

    return conditional_distributions

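# A compact sketch of the per-parent normalization loop above, using a tiny
# hand-made counts table in place of get_noisy_distribution_of_attributes
# (which is defined elsewhere). It shows the dictionary key format produced:
# str(list_of_parent_values).
import pandas as pd

stats = pd.DataFrame({'parent': [0, 0, 1, 1],
                      'child':  [0, 1, 0, 1],
                      'count':  [3.0, 1.0, 2.0, 2.0]})
conditional = {}
for parent_instance, sub in stats.groupby('parent'):
    sub = sub.sort_values(by='child')
    conditional[str([parent_instance])] = (sub['count'] / sub['count'].sum()).tolist()
print(conditional)  # {'[0]': [0.75, 0.25], '[1]': [0.5, 0.5]}
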
def infer_distribution(self):
    if self.is_categorical:
        distribution = self.data_dropna.value_counts()
        for value in set(self.distribution_bins) - set(distribution.index):
            distribution[value] = 0
        distribution.sort_index(inplace=True)
        self.distribution_probabilities = normalize_given_distribution(distribution)
        self.distribution_bins = np.array(distribution.index)
    else:
        distribution = np.histogram(self.timestamps, bins=self.histogram_size,
                                    range=(self.min, self.max))
        self.distribution_probabilities = normalize_given_distribution(distribution[0])

def inject_laplace_noise(self, epsilon=0.1, num_valid_attributes=10):
    # The noise scale grows with the number of attributes sharing the budget
    # and shrinks with the dataset size.
    noisy_scale = num_valid_attributes / (epsilon * self.data.size)
    laplace_noises = np.random.laplace(0, scale=noisy_scale,
                                       size=len(self.distribution_probabilities))
    noisy_distribution = np.asarray(self.distribution_probabilities) + laplace_noises
    self.distribution_probabilities = utils.normalize_given_distribution(noisy_distribution).tolist()

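# A self-contained sketch of the Laplace-noise step with made-up numbers:
# draw one Laplace sample per probability, add, then renormalize (clipping
# probabilities that the noise drove negative).
import numpy as np

rng = np.random.default_rng(0)
probabilities = np.array([0.5, 0.3, 0.2])
scale = 10 / (0.1 * 1000)  # num_valid_attributes / (epsilon * n), as above
noisy = (probabilities + rng.laplace(0, scale, size=probabilities.size)).clip(0)
print(noisy / noisy.sum() if noisy.sum() > 0 else np.full_like(noisy, 1 / noisy.size))
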
def exponential_mechanism(dataset, mutual_info_list, epsilon=0.1):
    """Apply the exponential mechanism to sample outcomes."""
    num_tuples, num_attributes = dataset.shape
    mi_array = np.array(mutual_info_list)
    # Scale scores by 2 * delta (the sensitivity-derived scaling), exponentiate,
    # then normalize into sampling probabilities.
    mi_array = mi_array / (2 * delta(num_attributes, num_tuples, epsilon))
    mi_array = np.exp(mi_array)
    mi_array = normalize_given_distribution(mi_array)
    return mi_array

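# A sketch of how the returned probabilities would be used: one candidate
# (e.g. an attribute-parents pair) is sampled per step of the greedy network
# construction. The scores and delta value are made-up numbers.
import numpy as np

rng = np.random.default_rng(0)
mutual_info_scores = np.array([0.02, 0.10, 0.05])
delta_value = 0.01  # placeholder for the sensitivity-derived scaling
weights = np.exp(mutual_info_scores / (2 * delta_value))
probabilities = weights / weights.sum()
chosen = rng.choice(len(probabilities), p=probabilities)
print(probabilities, chosen)
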
def inject_laplace_noise_into_distribution_per_attribute(self, epsilon=0.1):
    h = self.input_dataset.columns.size
    # The scale is the same for every attribute, so compute it once.
    noisy_scale = h / (epsilon * self.input_dataset.shape[0])
    for attr in self.dataset_description['attribute_description']:
        distribution = self.dataset_description['attribute_description'][attr]['distribution_probabilities']
        laplace_noises = np.random.laplace(0, scale=noisy_scale, size=len(distribution))
        noisy_distribution = np.asarray(distribution) + laplace_noises
        noisy_distribution = utils.normalize_given_distribution(noisy_distribution).tolist()
        self.dataset_description['attribute_description'][attr]['distribution_probabilities'] = noisy_distribution

def infer_domain(self, column):
    """Infer the domain of an attribute: min, max, and a 1-D distribution."""
    assert isinstance(column, pd.Series)
    self.data = column
    self.data_dropna = self.data.dropna()
    self.missing_rate = (self.data.size - self.data_dropna.size) / self.data.size
    self.min = float(self.data_dropna.min())
    self.max = float(self.data_dropna.max())

    if self.is_categorical:
        distribution = self.data_dropna.value_counts()
        distribution.sort_index(inplace=True)
        self.distribution_probabilities = utils.normalize_given_distribution(distribution).tolist()
        self.distribution_bins = np.array(distribution.index).tolist()
    else:
        distribution = np.histogram(self.data_dropna, bins=self.histogram_size)
        self.distribution_probabilities = utils.normalize_given_distribution(distribution[0]).tolist()
        bins = distribution[1][:-1].tolist()
        # Nudge the first bin edge slightly left so the minimum value falls inside bin 0.
        bins[0] = bins[0] - 0.001 * (bins[1] - bins[0])
        self.distribution_bins = bins

def inject_laplace_noise(self, epsilon, num_valid_attributes):
    if epsilon > 0:
        # L1 sensitivity of a normalized histogram: replacing one of n tuples
        # changes two bins by 1/n each, hence 2/n.
        sensitivity = 2 / self.data.size
        privacy_budget = epsilon / num_valid_attributes
        noise_scale = sensitivity / privacy_budget
        laplace_noises = np.random.laplace(0, scale=noise_scale,
                                           size=len(self.distribution_probabilities))
        noisy_distribution = self.distribution_probabilities + laplace_noises
        self.distribution_probabilities = utils.normalize_given_distribution(noisy_distribution)

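# Worked numbers for the scale above (illustrative values): with n = 1000
# tuples, epsilon = 0.1, and 10 valid attributes, the per-attribute budget is
# 0.01 and the Laplace scale is (2/1000) / 0.01 = 0.2. Note this version makes
# the sensitivity factor of 2 explicit, unlike the earlier
# num_valid_attributes / (epsilon * n) scale.
n, epsilon, num_valid_attributes = 1000, 0.1, 10
sensitivity = 2 / n
privacy_budget = epsilon / num_valid_attributes
print(sensitivity / privacy_budget)  # 0.2
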
def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list, attr_to_is_binary,
                          num_tuples, num_attributes):
    """Apply the exponential mechanism to sample outcomes."""
    delta_array = []
    for (child, parents) in parents_pair_list:
        # Each candidate pair gets its own sensitivity, hence its own delta.
        sensitivity = calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary)
        delta = calculate_delta(num_attributes, sensitivity, epsilon)
        delta_array.append(delta)

    mi_array = np.array(mutual_info_list) / (2 * np.array(delta_array))
    mi_array = np.exp(mi_array)
    mi_array = normalize_given_distribution(mi_array)
    return mi_array

def describe_dataset_in_random_mode(self, dataset_file, attribute_to_datatype={},
                                    attribute_to_is_categorical={}, seed=0):
    self.describe_dataset_in_independent_attribute_mode(dataset_file,
                                                        attribute_to_datatype=attribute_to_datatype,
                                                        attribute_to_is_categorical=attribute_to_is_categorical,
                                                        seed=seed)
    # After running independent attribute mode:
    # 1) make all distributions uniform; 2) set the missing rate to zero.
    for attr in self.dataset_description['attribute_description']:
        distribution = self.dataset_description['attribute_description'][attr]['distribution_probabilities']
        uniform_distribution = np.ones_like(distribution)
        uniform_distribution = utils.normalize_given_distribution(uniform_distribution).tolist()
        self.dataset_description['attribute_description'][attr]['distribution_probabilities'] = uniform_distribution
        self.dataset_description['attribute_description'][attr]['missing_rate'] = 0

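# A standalone sketch (made-up description dict) of the uniformization step
# above: whatever the fitted probabilities were, they are replaced by a
# uniform distribution of the same length and the missing rate is zeroed.
import numpy as np

description = {'age': {'distribution_probabilities': [0.7, 0.2, 0.1],
                       'missing_rate': 0.05}}
for attr, info in description.items():
    uniform = np.ones_like(info['distribution_probabilities'], dtype=float)
    info['distribution_probabilities'] = (uniform / uniform.sum()).tolist()
    info['missing_rate'] = 0
print(description)
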
def compare_histograms(self, attribute):
    datatype = self.attribute_description[attribute]['data_type']
    is_categorical = self.attribute_description[attribute]['is_categorical']

    # Ignore datetime attributes, since they are converted into timestamps.
    if datatype == 'DateTime':
        return
    # Ignore non-categorical string attributes.
    elif datatype == 'String' and not is_categorical:
        return
    elif attribute in self.candidate_keys:
        return
    else:
        fig = plt.figure(figsize=(15, 5), dpi=120)
        ax1 = fig.add_subplot(121)
        ax2 = fig.add_subplot(122)

        if is_categorical:
            dist_priv = self.private_df[attribute].value_counts()
            dist_synt = self.synthetic_df[attribute].value_counts()

            # Align the two index sets so both bar charts share the same categories.
            # Series.iteritems() was removed in pandas 2.0; use items() instead.
            for idx, number in dist_priv.items():
                if idx not in dist_synt.index:
                    dist_synt.loc[idx] = 0
            for idx, number in dist_synt.items():
                if idx not in dist_priv.index:
                    dist_priv.loc[idx] = 0

            dist_priv.index = [str(i) for i in dist_priv.index]
            dist_synt.index = [str(i) for i in dist_synt.index]
            dist_priv.sort_index(inplace=True)
            dist_synt.sort_index(inplace=True)

            pos_priv = list(range(len(dist_priv)))
            pos_synt = list(range(len(dist_synt)))
            ax1.bar(pos_priv, normalize_given_distribution(dist_priv.values))
            ax2.bar(pos_synt, normalize_given_distribution(dist_synt.values))
            ax1.set_xticks(arange(min(pos_priv), max(pos_priv) + 1, 1.0))
            ax2.set_xticks(arange(min(pos_synt), max(pos_synt) + 1, 1.0))
            ax1.set_xticklabels(dist_priv.index.tolist(), fontsize=15)
            ax2.set_xticklabels(dist_synt.index.tolist(), fontsize=15)
        # The rest are non-categorical numeric attributes.
        else:
            ax1.hist(self.private_df[attribute].dropna(), bins=15, align='left', density=True)
            ax2.hist(self.synthetic_df[attribute].dropna(), bins=15, align='left', density=True)

        # Give both subplots identical axis limits so the two histograms are comparable.
        ax1_x_min, ax1_x_max = ax1.get_xlim()
        ax2_x_min, ax2_x_max = ax2.get_xlim()
        ax1_y_min, ax1_y_max = ax1.get_ylim()
        ax2_y_min, ax2_y_max = ax2.get_ylim()
        x_min = min(ax1_x_min, ax2_x_min)
        x_max = max(ax1_x_max, ax2_x_max)
        y_min = min(ax1_y_min, ax2_y_min)
        y_max = max(ax1_y_max, ax2_y_max)
        ax1.set_xlim([x_min, x_max])
        ax1.set_ylim([y_min, y_max])
        ax2.set_xlim([x_min, x_max])
        ax2.set_ylim([y_min, y_max])
        fig.autofmt_xdate()