def main():
    """Load the data set and display male-vs-female histograms.

    Reads ``../data/data.csv`` (relative to the working directory), splits
    it into age and height values with ``get_male_data`` and
    ``get_female_data``, then builds one histogram for age and one for
    height via ``create_histogram`` and calls ``show()`` on each result.
    """
    frame = pd.read_csv("../data/data.csv")
    ages_m, heights_m = get_male_data(frame)
    ages_f, heights_f = get_female_data(frame)

    # Rendered in this order: age first, then height.
    panels = (
        (ages_m, ages_f, 'Age'),
        (heights_m, heights_f, 'Height'),
    )
    for male_values, female_values, label in panels:
        chart = create_histogram(male_values, female_values, label)
        chart.show()
def set_optimal_threshold(self, x, add_syn=True, num_samples=100, n_dim=5):
    """Derive the threshold from the error histogram of ``x`` and store it.

    The errors returned by ``self.errors(x)`` are binned with a bin width
    chosen by the Freedman-Diaconis rule, ``htb`` is run on the bin counts,
    and each break it returns is mapped to a candidate threshold with
    ``fetch_threshold``. The largest candidate becomes
    ``self.optimal_threshold``.

    Args:
        x: Input passed to ``self.errors`` and, when ``add_syn`` is true,
            to ``self.synthesize``.
        add_syn: When true, the errors of ``self.synthesize(x, ...)`` are
            concatenated (along axis 0) to the errors of ``x`` before
            binning.
        num_samples: Forwarded to ``self.synthesize``.
        n_dim: Forwarded to ``self.synthesize``.

    Returns:
        The value stored in ``self.optimal_threshold``.

    Raises:
        ValueError: If ``htb`` yields no breaks, so there is no candidate
            threshold to choose from.
    """
    errors = self.errors(x)
    if add_syn:
        synthetic = self.synthesize(x, num_samples=num_samples, n_dim=n_dim)
        errors = np.concatenate((errors, self.errors(synthetic)), axis=0)

    # Freedman-Diaconis rule: width = 2 * IQR / n^(1/3). The exponent is
    # written with float literals so it is a true cube root regardless of
    # the interpreter's integer-division semantics.
    # NOTE(review): an IQR of zero makes bin_width zero, so the division
    # below is by zero; confirm how callers want that case handled.
    bin_width = 2 * iqr(errors) / np.power(len(errors), 1.0 / 3.0)
    # NOTE(review): num_bins is generally fractional; create_histogram is
    # assumed to accept that -- TODO confirm.
    num_bins = (np.max(errors) - np.min(errors)) / bin_width
    hist, bins = create_histogram(errors, num_bins=num_bins, step=bin_width)

    possible_thresholds = [fetch_threshold(bins, hist, b) for b in htb(hist)]
    if not possible_thresholds:
        # Same exception type max() would raise on an empty list, but with
        # a message that names the actual cause.
        raise ValueError(
            "cannot set optimal threshold: htb returned no breaks")

    self.optimal_threshold = max(possible_thresholds)
    return self.optimal_threshold
def set_optimal_threshold(self, x):
    """Derive the threshold from the error histogram of ``x`` and store it.

    The errors returned by ``self.errors(x)`` are binned with a bin width
    chosen by the Freedman-Diaconis rule, ``htb`` is run on the bin counts,
    and each break it returns is mapped to a candidate threshold with
    ``fetch_threshold``. The largest candidate becomes
    ``self.optimal_threshold``.

    Args:
        x: Input passed to ``self.errors``.

    Returns:
        The value stored in ``self.optimal_threshold``.

    Raises:
        ValueError: If ``htb`` yields no breaks, so there is no candidate
            threshold to choose from.
    """
    errors = self.errors(x)

    # Freedman-Diaconis rule: width = 2 * IQR / n^(1/3). The exponent is
    # written with float literals so it is a true cube root regardless of
    # the interpreter's integer-division semantics.
    # NOTE(review): an IQR of zero makes bin_width zero, so the division
    # below is by zero; confirm how callers want that case handled.
    bin_width = 2 * iqr(errors) / np.power(len(errors), 1.0 / 3.0)
    # NOTE(review): num_bins is generally fractional; create_histogram is
    # assumed to accept that -- TODO confirm.
    num_bins = (np.max(errors) - np.min(errors)) / bin_width
    hist, bins = create_histogram(errors, num_bins=num_bins, step=bin_width)

    possible_thresholds = [fetch_threshold(bins, hist, b) for b in htb(hist)]
    if not possible_thresholds:
        # Same exception type max() would raise on an empty list, but with
        # a message that names the actual cause.
        raise ValueError(
            "cannot set optimal threshold: htb returned no breaks")

    self.optimal_threshold = max(possible_thresholds)
    return self.optimal_threshold
def build(self, data):
    """Encode ``data``, synthesize perturbed samples and derive a threshold.

    Steps, in order:

    1. Encode and decode ``data`` with ``self.autoencoder`` and record the
       per-column mean and standard deviation of the encoded data.
    2. Pick ``self.s_dimension_count`` encoded columns at random and draw
       ``round(len(data) * self.relative_frequency)`` encoded rows (or a
       copy of all of them when that count is not positive).
    3. When ``self.n_samples > 0``, overwrite each picked column of each
       drawn row with a random element of ``self.sample_tails(mean, std,
       n_samples)``; entries for which ``sample_tails`` returns nothing are
       left unchanged.
    4. Decode the perturbed rows, pass them through
       ``self.autoencoder.predict`` and compute the per-row mean squared
       error between inputs (original + synthetic) and their
       reconstructions.
    5. Bin those errors with a Freedman-Diaconis bin width, run ``htb`` on
       the bin counts and keep the largest ``fetch_threshold`` candidate
       as ``self.optimal_threshold``.
    6. Build ``"low-high"`` text labels for consecutive bin edges.

    All intermediate results are stored as attributes on ``self``; the
    method itself returns ``None``.

    Args:
        data: Input samples. Must be accepted by
            ``self.autoencoder.encode`` and concatenable with the decoded
            synthetic data via ``np.concatenate``.

    Raises:
        ValueError: If ``htb`` yields no breaks, so there is no candidate
            threshold to choose from.
    """
    self.original_data = data
    self.encoded_data = self.autoencoder.encode(data)
    self.reconstructed_data = self.autoencoder.decode(self.encoded_data)

    self.df_encoded_data = pd.DataFrame(data=self.encoded_data)
    self.df_encoded_data_mean = self.df_encoded_data.mean(axis=0)
    self.df_encoded_data_std = self.df_encoded_data.std(axis=0)

    # Encoded columns that will be perturbed.
    self.stochastic_dimensions = random.sample(
        range(len(self.df_encoded_data.columns)), self.s_dimension_count)

    self.num_to_synthesize = round(len(data) * self.relative_frequency)
    if self.num_to_synthesize > 0:
        self.synthetic_latent_data = (
            self.df_encoded_data.sample(self.num_to_synthesize)
            .reset_index(drop=True)
            .copy())
    else:
        # Copy rather than alias: the loop below writes into
        # synthetic_latent_data in place and must not overwrite the stored
        # encodings in df_encoded_data.
        self.synthetic_latent_data = self.df_encoded_data.copy()

    if self.n_samples > 0:
        means = self.df_encoded_data_mean.values
        stds = self.df_encoded_data_std.values
        for index in self.synthetic_latent_data.index:
            for d in self.stochastic_dimensions:
                # Sampled anew for every cell, so each entry gets its own
                # draw of candidate values.
                tail_values = self.sample_tails(
                    means[d], stds[d], self.n_samples)
                # len() rather than truthiness: tail_values may be an
                # array, whose truth value is ambiguous.
                if len(tail_values) > 0:
                    self.synthetic_latent_data.at[index, d] = (
                        random.choice(tail_values))

    # Reconstruct using frozen weights
    self.synthetic_data = self.autoencoder.decode(
        self.synthetic_latent_data.values)
    self.df_synthetic_data = pd.DataFrame(data=self.synthetic_data)

    # Append a label column: 1 for synthetic rows, 0 for reconstructed rows.
    self.synthetic_data_with_labels = np.append(
        self.synthetic_data,
        np.ones((len(self.synthetic_data), 1)),
        axis=1)
    self.reconstructed_data_with_labels = np.append(
        self.reconstructed_data,
        np.zeros((len(self.reconstructed_data), 1)),
        axis=1)

    # Reconstructed synthetic data
    self.reconstructed_synthetic = self.autoencoder.predict(
        self.synthetic_data)

    self.X = np.concatenate((self.original_data, self.synthetic_data))
    self.Y = np.concatenate(
        (self.reconstructed_data, self.reconstructed_synthetic))
    self.errors = np.power(self.X - self.Y, 2)
    self.mean_sq_errors = np.mean(self.errors, axis=1)

    # Freedman-Diaconis rule: width = 2 * IQR / n^(1/3). The exponent is
    # written with float literals so it is a true cube root regardless of
    # the interpreter's integer-division semantics.
    # NOTE(review): an IQR of zero makes bin_width zero, so the division
    # below is by zero; confirm how callers want that case handled.
    bin_width = 2 * iqr(self.mean_sq_errors) / np.power(
        len(self.errors), 1.0 / 3.0)
    num_bins = (np.max(self.mean_sq_errors)
                - np.min(self.mean_sq_errors)) / bin_width
    self.hist, self.bins = create_histogram(
        self.mean_sq_errors, num_bins=num_bins, step=bin_width)
    # Bin counts converted to float data type.
    self.occurences = [float(count) for count in self.hist.tolist()]

    breaks = htb(self.hist)
    self.possible_thresholds = [
        fetch_threshold(self.bins, self.hist, b) for b in breaks]
    if not self.possible_thresholds:
        # Same exception type max() would raise on an empty list, but with
        # a message that names the actual cause.
        raise ValueError(
            "cannot set optimal threshold: htb returned no breaks")
    self.optimal_threshold = max(self.possible_thresholds)

    # Create labels for histogram rendering: one "low-high" string per
    # pair of consecutive bin edges, rounded to 4 decimals.
    self.labels = [
        str(round(low, 4)) + "-" + str(round(high, 4))
        for low, high in zip(self.bins[:-1], self.bins[1:])
    ]