class DPCTGAN(CTGANSynthesizer):
    """Differentially Private Conditional Table GAN Synthesizer.

    This code adds Differential Privacy to the CTGANSynthesizer from
    https://github.com/sdv-dev/CTGAN
    """

    def __init__(
        self,
        embedding_dim=128,
        gen_dim=(256, 256),
        dis_dim=(256, 256),
        l2scale=1e-6,
        batch_size=500,
        epochs=300,
        pack=1,
        log_frequency=True,
        disabled_dp=False,
        target_delta=None,
        sigma=5,
        max_per_sample_grad_norm=1.0,
        epsilon=1,
        verbose=True,
        loss="cross_entropy",
    ):
        # CTGAN model specific parameters
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.pack = pack
        self.log_frequency = log_frequency
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # opacus parameters
        self.sigma = sigma
        self.disabled_dp = disabled_dp
        self.target_delta = target_delta
        self.max_per_sample_grad_norm = max_per_sample_grad_norm
        self.epsilon = epsilon
        self.epsilon_list = []
        self.alpha_list = []
        self.loss_d_list = []
        self.loss_g_list = []
        self.verbose = verbose
        self.loss = loss

        if self.loss != "cross_entropy":
            # Monkeypatch the _create_or_extend_grad_sample function when calling opacus
            opacus.supported_layers_grad_samplers._create_or_extend_grad_sample = (
                _custom_create_or_extend_grad_sample
            )

    def train(self, data, categorical_columns=None, ordinal_columns=None, update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        self.transformer = DataTransformer()
        self.transformer.fit(data, discrete_columns=categorical_columns)
        train_data = self.transformer.transform(data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, self.log_frequency
        )

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim
        ).to(self.device)

        discriminator = Discriminator(
            data_dim + self.cond_generator.n_opt, self.dis_dim, self.loss, self.pack
        ).to(self.device)

        optimizer_g = optim.Adam(
            self.generator.parameters(),
            lr=2e-4,
            betas=(0.5, 0.9),
            weight_decay=self.l2scale,
        )
        optimizer_d = optim.Adam(discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

        privacy_engine = opacus.PrivacyEngine(
            discriminator,
            batch_size=self.batch_size,
            sample_size=train_data.shape[0],
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=self.sigma,
            max_grad_norm=self.max_per_sample_grad_norm,
            clip_per_layer=True,
        )

        if not self.disabled_dp:
            privacy_engine.attach(optimizer_d)

        one = torch.tensor(1, dtype=torch.float).to(self.device)
        mone = one * -1

        real_label = 1
        fake_label = 0
        criterion = nn.BCELoss()

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        steps_per_epoch = len(train_data) // self.batch_size
        for i in range(self.epochs):
            for id_ in range(steps_per_epoch):
                # train the discriminator
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype("float32")).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                optimizer_d.zero_grad()

                if self.loss == "cross_entropy":
                    y_fake = discriminator(fake_cat)
                    # print('y_fake is {}'.format(y_fake))

                    # train with fake
                    label_fake = torch.full(
                        (int(self.batch_size / self.pack),),
                        fake_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    # print('label_fake is {}'.format(label_fake))
                    error_d_fake = criterion(y_fake, label_fake)
                    error_d_fake.backward()
                    optimizer_d.step()

                    # train with real
                    label_true = torch.full(
                        (int(self.batch_size / self.pack),),
                        real_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    y_real = discriminator(real_cat)
                    error_d_real = criterion(y_real, label_true)
                    error_d_real.backward()
                    optimizer_d.step()

                    loss_d = error_d_real + error_d_fake
                else:
                    y_fake = discriminator(fake_cat)
                    mean_fake = torch.mean(y_fake)
                    mean_fake.backward(one)

                    y_real = discriminator(real_cat)
                    mean_real = torch.mean(y_real)
                    mean_real.backward(mone)

                    optimizer_d.step()
                    loss_d = -(mean_real - mean_fake)

                max_grad_norm = []
                for p in discriminator.parameters():
                    param_norm = p.grad.data.norm(2).item()
                    max_grad_norm.append(param_norm)

                # pen = calc_gradient_penalty(discriminator, real_cat, fake_cat, self.device)
                # pen.backward(retain_graph=True)
                # loss_d.backward()
                # optimizer_d.step()

                # train the generator
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                # if condvec is None:
                cross_entropy = 0
                # else:
                #     cross_entropy = self._cond_loss(fake, c1, m1)

                if self.loss == "cross_entropy":
                    label_g = torch.full(
                        (int(self.batch_size / self.pack),),
                        real_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    # label_g = torch.full(int(self.batch_size/self.pack,), 1, device=self.device)
                    loss_g = criterion(y_fake, label_g)
                    loss_g = loss_g + cross_entropy
                else:
                    loss_g = -torch.mean(y_fake) + cross_entropy

                optimizer_g.zero_grad()
                loss_g.backward()
                optimizer_g.step()

                if not self.disabled_dp:
                    # if self.loss == 'cross_entropy':
                    #     autograd_grad_sample.clear_backprops(discriminator)
                    # else:
                    for p in discriminator.parameters():
                        if hasattr(p, "grad_sample"):
                            del p.grad_sample

                    if self.target_delta is None:
                        self.target_delta = 1 / train_data.shape[0]

                    epsilon, best_alpha = optimizer_d.privacy_engine.get_privacy_spent(
                        self.target_delta
                    )

                    self.epsilon_list.append(epsilon)
                    self.alpha_list.append(best_alpha)
                    # if self.verbose:

            if not self.disabled_dp:
                if self.epsilon < epsilon:
                    break

            self.loss_d_list.append(loss_d)
            self.loss_g_list.append(loss_g)

            if self.verbose:
                print(
                    "Epoch %d, Loss G: %.4f, Loss D: %.4f"
                    % (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                    flush=True,
                )
                print("epsilon is {e}, alpha is {a}".format(e=epsilon, a=best_alpha))

        return self.loss_d_list, self.loss_g_list, self.epsilon_list, self.alpha_list

    def generate(self, n):
        self.generator.eval()

        # output_info = self.transformer.output_info
        steps = n // self.batch_size + 1
        data = []

        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)
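# A minimal usage sketch (not part of the original module): it assumes this file's
# imports are available and that "adult.csv" is a hypothetical dataset path whose
# listed categorical columns exist. train() fits the DataTransformer, attaches the
# opacus PrivacyEngine to the discriminator optimizer, and stops once the epsilon
# budget is spent; generate() draws synthetic rows.
if __name__ == "__main__":
    import pandas as pd  # assumption: pandas is installed alongside the module deps

    df = pd.read_csv("adult.csv")  # hypothetical input file
    synth = DPCTGAN(epsilon=1.0, sigma=5, batch_size=500, epochs=300, verbose=True)
    losses_d, losses_g, eps_hist, alpha_hist = synth.train(
        df, categorical_columns=["workclass", "education"]
    )
    synthetic_df = synth.generate(n=1000)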
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data
    using Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual
            Layer will be created for each one of the values provided. Defaults
            to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers.
            A Linear Layer will be created for each one of the values provided.
            Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. The WGAN paper
            default is 5; the default used here is 1 to match the original CTGAN
            implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        blackbox_model:
            Model that implements fit, predict and predict_proba.
    """

    def __init__(
        self,
        embedding_dim=128,
        gen_dim=(256, 256),
        dis_dim=(256, 256),
        l2scale=1e-6,
        batch_size=500,
        discriminator_steps=1,
        log_frequency=True,
        blackbox_model=None,
        preprocessing_pipeline=None,
        bb_loss="logloss",
    ):
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.log_frequency = log_frequency
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.trained_epoches = 0
        self.discriminator_steps = discriminator_steps
        self.blackbox_model = blackbox_model
        self.preprocessing_pipeline = preprocessing_pipeline
        self.confidence_level = -1  # will be set in fit
        self.bb_loss = bb_loss

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

        Args:
            logits: [..., num_features] unnormalized log probabilities
            tau: non-negative scalar temperature
            hard: if True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int): a dimension along which softmax will be computed. Default: -1.

        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """
        if version.parse(torch.__version__) < version.parse("1.2.0"):
            for i in range(10):
                transformed = functional.gumbel_softmax(
                    logits, tau=tau, hard=hard, eps=eps, dim=dim
                )
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError("gumbel_softmax returning NaN.")

        return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == "tanh":
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == "softmax":
                ed = st + item[0]
                transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                data_t.append(transformed)
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == "tanh":
                st += item[0]
                skip = True
            elif item[1] == "softmax":
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction="none",
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c
            else:
                assert 0

        loss = torch.stack(loss, dim=1)
        return (loss * m).sum() / data.size()[0]

    def fit(
        self,
        train_data,
        discrete_columns=tuple(),
        epochs=300,
        confidence_level=-1,
        verbose=True,
        gen_lr=2e-4,
    ):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
        """
        self.confidence_level = confidence_level
        loss_other_name = "loss_bb" if confidence_level != -1 else "loss_d"
        history = {"loss_g": [], loss_other_name: []}

        # Eli: add Mode-specific Normalization
        if not hasattr(self, "transformer"):
            self.transformer = DataTransformer()
            self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions

        if not hasattr(self, "cond_generator"):
            self.cond_generator = ConditionalGenerator(
                train_data, self.transformer.output_info, self.log_frequency
            )

        if not hasattr(self, "generator"):
            self.generator = Generator(
                self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim
            ).to(self.device)

        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                data_dim + self.cond_generator.n_opt, self.dis_dim
            ).to(self.device)

        if not hasattr(self, "optimizerG"):
            self.optimizerG = optim.Adam(
                self.generator.parameters(),
                lr=gen_lr,
                betas=(0.5, 0.9),
                weight_decay=self.l2scale,
            )

        if not hasattr(self, "optimizerD"):
            self.optimizerD = optim.Adam(
                self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9)
            )

        assert self.batch_size % 2 == 0

        # init mean to zero and std to one
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        # steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        steps_per_epoch = 10  # magic number decided with Gilad; feel free to change it

        # Eli: start training loop
        for i in range(epochs):
            self.trained_epoches += 1
            for id_ in range(steps_per_epoch):
                if self.confidence_level == -1:
                    # discriminator loop
                    for n in range(self.discriminator_steps):
                        fakez = torch.normal(mean=mean, std=std)

                        condvec = self.cond_generator.sample(self.batch_size)
                        if condvec is None:
                            c1, m1, col, opt = None, None, None, None
                            real = data_sampler.sample(self.batch_size, col, opt)
                        else:
                            c1, m1, col, opt = condvec
                            c1 = torch.from_numpy(c1).to(self.device)
                            m1 = torch.from_numpy(m1).to(self.device)
                            fakez = torch.cat([fakez, c1], dim=1)

                            perm = np.arange(self.batch_size)
                            np.random.shuffle(perm)
                            real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                            c2 = c1[perm]

                        fake = self.generator(fakez)
                        fakeact = self._apply_activate(fake)

                        real = torch.from_numpy(real.astype("float32")).to(self.device)

                        if c1 is not None:
                            fake_cat = torch.cat([fakeact, c1], dim=1)
                            real_cat = torch.cat([real, c2], dim=1)
                        else:
                            real_cat = real
                            fake_cat = fake

                        y_fake = self.discriminator(fake_cat)
                        y_real = self.discriminator(real_cat)

                        pen = self.discriminator.calc_gradient_penalty(
                            real_cat, fake_cat, self.device
                        )
                        loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                        if self.confidence_level == -1:  # without bb loss
                            self.optimizerD.zero_grad()
                            pen.backward(retain_graph=True)
                            loss_d.backward()
                            self.optimizerD.step()

                # generator step
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self.discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = self.discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                if self.confidence_level != -1:
                    # generate `batch_size` samples and score them with the blackbox model
                    gen_out = self.sample(self.batch_size)
                    loss_bb = self._calc_bb_confidence_loss(gen_out)
                    loss_g = loss_bb + cross_entropy
                else:
                    # original loss
                    loss_g = -torch.mean(y_fake) + cross_entropy

                self.optimizerG.zero_grad()
                loss_g.backward()
                self.optimizerG.step()

            loss_g_val = loss_g.detach().cpu()
            loss_other_val = locals()[loss_other_name].detach().cpu()
            history["loss_g"].append(loss_g.item())
            history[loss_other_name].append(loss_other_val.item())
            if verbose:
                print(
                    f"Epoch {self.trained_epoches}, Loss G: {loss_g_val},"
                    f" {loss_other_name}: {loss_other_val}",
                    flush=True,
                )

        return history

    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the
        probability of the discrete condition_value happening in the
        condition_column.

        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to
                increase the probability of happening.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value
            )
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size
                )
            )
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        # return data
        return self.transformer.inverse_transform(data, None)

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")

        # always save a cpu model
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model

    def _calc_bb_confidence_loss(self, gen_out):
        y_prob = self.blackbox_model.predict_proba(gen_out)
        y_conf_gen = y_prob[:, 0]  # confidence scores

        # create a vector of the same size as y_conf_gen filled with `confidence_level` values
        if isinstance(self.confidence_level, list):
            conf = np.random.choice(self.confidence_level)
        else:
            conf = self.confidence_level
        y_conf_wanted = np.full(len(y_conf_gen), conf)

        # to tensor
        y_conf_gen = torch.tensor(y_conf_gen, requires_grad=True).to(self.device)
        y_conf_wanted = torch.tensor(y_conf_wanted).to(self.device)

        # loss
        bb_loss = self._get_loss_by_name(self.bb_loss)
        bb_loss_val = bb_loss(y_conf_gen, y_conf_wanted)

        return bb_loss_val

    @staticmethod
    def _get_loss_by_name(loss_name):
        if loss_name in ("log", "logloss"):
            return torch.nn.BCELoss()
        elif loss_name == "l1":
            return torch.nn.L1Loss()
        elif loss_name == "l2":
            # "l2" maps to mean squared error
            return torch.nn.MSELoss()
        elif loss_name == "focal":
            return WeightedFocalLoss()
        else:
            raise ValueError(f"Unknown loss name '{loss_name}'")
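# A minimal usage sketch (not part of the original module). It assumes the file's
# imports are available, that "train.csv" is a hypothetical training table, and that
# "target_model.joblib" is a hypothetical pre-trained classifier exposing predict_proba
# over the synthesizer's output columns. The flow mirrors the fit()/sample() API above.
if __name__ == "__main__":
    import joblib         # assumption: the attacked classifier was persisted with joblib
    import pandas as pd   # assumption: pandas is installed

    df = pd.read_csv("train.csv")             # hypothetical input file
    bb = joblib.load("target_model.joblib")   # hypothetical blackbox model

    # Plain CTGAN training: confidence_level stays at -1, so the WGAN-GP critic is used.
    synth = CTGANSynthesizer(batch_size=500)
    history = synth.fit(df, discrete_columns=["workclass"], epochs=10)

    # Blackbox-guided training: the generator is steered so that the blackbox model's
    # confidence on generated rows approaches the requested confidence_level.
    guided = CTGANSynthesizer(blackbox_model=bb, bb_loss="l2")
    guided.fit(df, discrete_columns=["workclass"], epochs=10, confidence_level=0.9)

    rows = guided.sample(1000, condition_column="workclass", condition_value="Private")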
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data
    using Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual
            Layer will be created for each one of the values provided. Defaults
            to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers.
            A Linear Layer will be created for each one of the values provided.
            Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """

    def __init__(self, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256),
                 l2scale=1e-6, batch_size=500, patience=25):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.patience = patience

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True
            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction='none'
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c
            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(self, train_data, discrete_columns=tuple(), epochs=300, log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency
        )

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim
        ).to(self.device)

        discriminator = Discriminator(
            data_dim + self.cond_generator.n_opt, self.dis_dim
        ).to(self.device)

        optimizerG = optim.Adam(
            self.generator.parameters(), lr=2e-4, betas=(0.5, 0.9),
            weight_decay=self.l2scale
        )
        optimizerD = optim.Adam(discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        train_losses = []
        early_stopping = EarlyStopping(patience=self.patience, verbose=False)

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in range(epochs):
            for id_ in range(steps_per_epoch):
                # discriminator step
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                pen = discriminator.calc_gradient_penalty(real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))
                train_losses.append(loss_d.item())

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                optimizerD.step()

                # generator step
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy
                train_losses.append(loss_g.item())

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

            early_stopping(np.average(train_losses))
            if early_stopping.early_stop:
                print("GAN: Early stopping after epoch {}".format(i))
                break
            train_losses = []

            # print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
            #       (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
            #       flush=True)

    def sample(self, n):
        """Sample data similar to the training data.

        Args:
            n (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)
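# A minimal usage sketch (not part of the original module), assuming the file's imports
# are available and "data.csv" is a hypothetical dataset path. fit() stops early via
# EarlyStopping once the averaged losses stop improving for `patience` epochs.
if __name__ == "__main__":
    import pandas as pd  # assumption: pandas is installed

    df = pd.read_csv("data.csv")  # hypothetical input file
    synth = CTGANSynthesizer(batch_size=500, patience=25)
    synth.fit(df, discrete_columns=["category"], epochs=300)
    synthetic = synth.sample(len(df))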
class PATECTGAN(CTGANSynthesizer):

    def __init__(
        self,
        embedding_dim=128,
        gen_dim=(256, 256),
        dis_dim=(256, 256),
        l2scale=1e-6,
        epochs=300,
        pack=1,
        log_frequency=True,
        disabled_dp=False,
        target_delta=None,
        sigma=5,
        max_per_sample_grad_norm=1.0,
        verbose=False,
        loss="cross_entropy",  # losses supported: 'cross_entropy', 'wasserstein'
        regularization=None,   # regularizations supported: 'dragan'
        binary=False,
        batch_size=500,
        teacher_iters=5,
        student_iters=5,
        sample_per_teacher=1000,
        epsilon=8.0,
        delta=1e-5,
        noise_multiplier=1e-3,
        moments_order=100,
    ):
        # CTGAN model specific parameters
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.pack = pack
        self.log_frequency = log_frequency
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.verbose = verbose
        self.loss = loss
        self.regularization = regularization if self.loss != "wasserstein" else "dragan"

        # PATE parameters
        self.sample_per_teacher = sample_per_teacher
        self.noise_multiplier = noise_multiplier
        self.moments_order = moments_order
        self.binary = binary
        self.batch_size = batch_size
        self.teacher_iters = teacher_iters
        self.student_iters = student_iters
        self.epsilon = epsilon
        self.delta = delta
        self.pd_cols = None
        self.pd_index = None

    def train(self, data, categorical_columns=None, ordinal_columns=None, update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        sample_per_teacher = (
            self.sample_per_teacher if self.sample_per_teacher < len(data) else 1000
        )
        self.num_teachers = int(len(data) / sample_per_teacher) + 1

        self.transformer = DataTransformer()
        self.transformer.fit(data, discrete_columns=categorical_columns)
        data = self.transformer.transform(data)

        data_partitions = np.array_split(data, self.num_teachers)

        data_dim = self.transformer.output_dimensions

        self.cond_generator = ConditionalGenerator(
            data, self.transformer.output_info, self.log_frequency
        )

        # create a conditional generator for each teacher model
        cond_generator = [
            ConditionalGenerator(d, self.transformer.output_info, self.log_frequency)
            for d in data_partitions
        ]

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim
        ).to(self.device)

        discriminator = Discriminator(
            data_dim + self.cond_generator.n_opt, self.dis_dim, self.loss, self.pack
        ).to(self.device)

        student_disc = discriminator
        student_disc.apply(weights_init)

        teacher_disc = [discriminator for i in range(self.num_teachers)]
        for i in range(self.num_teachers):
            teacher_disc[i].apply(weights_init)

        optimizer_g = optim.Adam(
            self.generator.parameters(),
            lr=2e-4,
            betas=(0.5, 0.9),
            weight_decay=self.l2scale,
        )
        optimizer_s = optim.Adam(student_disc.parameters(), lr=2e-4, betas=(0.5, 0.9))
        optimizer_t = [
            optim.Adam(teacher_disc[i].parameters(), lr=2e-4, betas=(0.5, 0.9))
            for i in range(self.num_teachers)
        ]

        noise_multiplier = self.noise_multiplier
        alphas = torch.tensor([0.0 for i in range(self.moments_order)], device=self.device)
        l_list = 1 + torch.tensor(range(self.moments_order), device=self.device)
        eps = 0

        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        real_label = 1
        fake_label = 0

        criterion = nn.BCELoss() if (self.loss == "cross_entropy") else self.w_loss

        if self.verbose:
            print("using loss {} and regularization {}".format(self.loss, self.regularization))

        while eps < self.epsilon:
            # train teacher discriminators
            for t_2 in range(self.teacher_iters):
                for i in range(self.num_teachers):
                    partition_data = data_partitions[i]
                    data_sampler = Sampler(partition_data, self.transformer.output_info)
                    fakez = torch.normal(mean, std=std).to(self.device)

                    condvec = cond_generator[i].sample(self.batch_size)

                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = data_sampler.sample(self.batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self.device)
                        m1 = torch.from_numpy(m1).to(self.device)
                        fakez = torch.cat([fakez, c1], dim=1)

                        perm = np.arange(self.batch_size)
                        np.random.shuffle(perm)
                        real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                        c2 = c1[perm]

                    fake = self.generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype("float32")).to(self.device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fake

                    optimizer_t[i].zero_grad()

                    y_all = torch.cat([teacher_disc[i](fake_cat), teacher_disc[i](real_cat)])

                    label_fake = torch.full(
                        (int(self.batch_size / self.pack), 1),
                        fake_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    label_true = torch.full(
                        (int(self.batch_size / self.pack), 1),
                        real_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    labels = torch.cat([label_fake, label_true])

                    error_d = criterion(y_all, labels)
                    error_d.backward()

                    if self.regularization == "dragan":
                        pen = teacher_disc[i].dragan_penalty(real_cat, device=self.device)
                        pen.backward(retain_graph=True)

                    optimizer_t[i].step()

            # train student discriminator
            for t_3 in range(self.student_iters):
                data_sampler = Sampler(data, self.transformer.output_info)
                fakez = torch.normal(mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                else:
                    fake_cat = fake

                fake_data = fake_cat

                predictions, votes = pate(
                    fake_data, teacher_disc, noise_multiplier, device=self.device
                )

                output = student_disc(fake_data.detach())

                # update the moments accountant
                alphas = alphas + moments_acc(
                    self.num_teachers, votes, noise_multiplier, l_list, device=self.device
                )

                loss_s = criterion(output, predictions.float().to(self.device))

                optimizer_s.zero_grad()
                loss_s.backward()

                if self.regularization == "dragan":
                    vals = torch.cat([predictions, fake_data], axis=1)
                    ordered = vals[vals[:, 0].sort()[1]]
                    data_list = torch.split(
                        ordered, predictions.shape[0] - int(predictions.sum().item())
                    )
                    synth_cat = torch.cat(data_list[1:], axis=0)[:, 1:]
                    pen = student_disc.dragan_penalty(synth_cat, device=self.device)
                    pen.backward(retain_graph=True)

                optimizer_s.step()

                # print('iteration {i}, student discriminator loss is {j}'.format(i=t_3, j=loss_s))

            # train generator
            fakez = torch.normal(mean=mean, std=std)
            condvec = self.cond_generator.sample(self.batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = student_disc(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = student_disc(fakeact)

            if condvec is None:
                cross_entropy = 0
            else:
                cross_entropy = self._cond_loss(fake, c1, m1)

            if self.loss == "cross_entropy":
                label_g = torch.full(
                    (int(self.batch_size / self.pack), 1),
                    real_label,
                    dtype=torch.float,
                    device=self.device,
                )
                loss_g = criterion(y_fake, label_g.float())
                loss_g = loss_g + cross_entropy
            else:
                loss_g = -torch.mean(y_fake) + cross_entropy

            optimizer_g.zero_grad()
            loss_g.backward()
            optimizer_g.step()

            eps = min((alphas - math.log(self.delta)) / l_list)

            if self.verbose:
                print("eps: {:f} \t G: {:f} \t D: {:f}".format(
                    eps, loss_g.detach().cpu(), loss_s.detach().cpu()))

    def w_loss(self, output, labels):
        vals = torch.cat([labels, output], axis=1)
        ordered = vals[vals[:, 0].sort()[1]]
        data_list = torch.split(ordered, labels.shape[0] - int(labels.sum().item()))
        fake_score = data_list[0][:, 1]
        true_score = torch.cat(data_list[1:], axis=0)[:, 1]
        w_loss = -(torch.mean(true_score) - torch.mean(fake_score))
        return w_loss

    def generate(self, n):
        self.generator.eval()

        steps = n // self.batch_size + 1
        data = []

        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        generated_data = self.transformer.inverse_transform(data, None)

        return generated_data
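# A minimal usage sketch (not part of the original module), assuming the file's imports
# are available and "data.csv" is a hypothetical dataset path. train() runs until the
# moments-accountant estimate of epsilon reaches the requested budget, and generate()
# then returns synthetic rows in the original column space.
if __name__ == "__main__":
    import pandas as pd  # assumption: pandas is installed

    df = pd.read_csv("data.csv")  # hypothetical input file
    synth = PATECTGAN(epsilon=8.0, noise_multiplier=1e-3, batch_size=500)
    synth.train(df, categorical_columns=["workclass", "education"])
    synthetic_df = synth.generate(n=1000)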