def fit(self, train_data, discrete_columns=tuple(), epochs=300, log_frequency=True):
    """Fit the CTGAN Synthesizer models to the training data.

    Args:
        train_data (numpy.ndarray or pandas.DataFrame):
            Training Data. It must be a 2-dimensional numpy array or a
            pandas.DataFrame.
        discrete_columns (list-like):
            List of discrete columns to be used to generate the Conditional
            Vector. If ``train_data`` is a Numpy array, this list should
            contain the integer indices of the columns. Otherwise, if it is
            a ``pandas.DataFrame``, this list should contain the column names.
        epochs (int):
            Number of training epochs. Defaults to 300.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
    """
    self.discrete_columns = discrete_columns

    # Fit a one-hot encoder on the discrete columns.
    # Note: `sparse` was renamed to `sparse_output` in scikit-learn 1.2.
    self.oe = OneHotEncoder(sparse=False)
    if discrete_columns:
        if type(train_data) is np.ndarray:
            self.oe.fit(train_data[:, discrete_columns])
            self.cat_column_idxes = discrete_columns
        else:
            self.oe.fit(train_data[discrete_columns])
            features = train_data.columns
            # Positional indices of the discrete columns.
            self.cat_column_idxes = [features.get_loc(c) for c in discrete_columns]
    else:
        self.cat_column_idxes = []

    self.transformer = DataTransformer()
    self.transformer.fit(train_data, discrete_columns)
    train_data = self.transformer.transform(train_data)

    data_sampler = Sampler(train_data, self.transformer.output_info)
    data_dim = self.transformer.output_dimensions

    self.cond_generator = ConditionalGenerator(
        train_data, self.transformer.output_info, log_frequency)

    self.generator = Generator(
        self.embedding_dim + self.cond_generator.n_opt,
        self.gen_dim, data_dim).to(self.device)

    discriminator = Discriminator(
        data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device)

    optimizerG = optim.Adam(self.generator.parameters(), lr=self.gen_lr)
    optimizerD = optim.Adam(discriminator.parameters(), lr=self.dis_lr)

    assert self.batch_size % 2 == 0

    mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
    std = mean + 1

    steps_per_epoch = max(len(train_data) // self.batch_size, 1)
    for i in tqdm(range(epochs)):
        for id_ in range(steps_per_epoch):
            # Discriminator (critic) update.
            fakez = torch.normal(mean=mean, std=std)

            condvec = self.cond_generator.sample(self.batch_size)
            if condvec is None:
                c1, m1, col, opt = None, None, None, None
                real = data_sampler.sample(self.batch_size, col, opt)
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

                perm = np.arange(self.batch_size)
                np.random.shuffle(perm)
                real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                c2 = c1[perm]

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            real = torch.from_numpy(real.astype('float32')).to(self.device)

            if c1 is not None:
                fake_cat = torch.cat([fakeact, c1], dim=1)
                real_cat = torch.cat([real, c2], dim=1)
            else:
                real_cat = real
                fake_cat = fake

            y_fake = discriminator(fake_cat)
            y_real = discriminator(real_cat)

            pen = discriminator.calc_gradient_penalty(real_cat, fake_cat, self.device)
            loss_d = -(nanmean(y_real) - nanmean(y_fake))

            optimizerD.zero_grad()
            pen.backward(retain_graph=True)
            loss_d.backward()
            optimizerD.step()

            # Generator update.
            fakez = torch.normal(mean=mean, std=std)
            condvec = self.cond_generator.sample(self.batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = discriminator(fakeact)

            if condvec is None:
                cross_entropy = 0
            else:
                cross_entropy = self._cond_loss(fake, c1, m1)

            loss_g = -nanmean(y_fake) + cross_entropy

            optimizerG.zero_grad()
            loss_g.backward()
            optimizerG.step()

        if self.verbose:
            print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                  (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                  flush=True)

    self.discriminator = discriminator
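
# This first fit() variant scores the critic with a `nanmean` helper instead of
# torch.mean, but the helper itself is not shown anywhere in this snippet. The
# sketch below is an assumption about what such a helper could look like, not
# the original implementation (torch.nanmean, available in recent PyTorch
# releases, would serve the same purpose).
import torch

def nanmean(tensor):
    """Mean over the non-NaN entries of a tensor, guarding the WGAN losses
    against occasional NaN critic outputs."""
    mask = ~torch.isnan(tensor)
    return tensor[mask].mean()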
class CTGANSynthesizer(object): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. gen_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Resiudal Layer will be created for each one of the values provided. Defaults to (256, 256). dis_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). l2scale (float): Wheight Decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. """ def __init__(self, demand_col, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256), l2scale=1e-6, batch_size=500): self.embedding_dim = embedding_dim self.gen_dim = gen_dim self.dis_dim = dis_dim self.l2scale = l2scale self.batch_size = batch_size self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.demand_column = demand_col def _apply_activate(self, data): data_t = [] st = 0 for item in self.transformer.output_info: if item[1] == 'tanh': ed = st + item[0] data_t.append(torch.tanh(data[:, st:ed])) st = ed elif item[1] == 'softmax': ed = st + item[0] data_t.append( functional.gumbel_softmax(data[:, st:ed], tau=0.2)) st = ed else: assert 0 return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): loss = [] st = 0 st_c = 0 skip = False for item in self.transformer.output_info: if item[1] == 'tanh': st += item[0] skip = True elif item[1] == 'softmax': if skip: skip = False st += item[0] continue ed = st + item[0] ed_c = st_c + item[0] tmp = functional.cross_entropy(data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none') loss.append(tmp) st = ed st_c = ed_c else: assert 0 loss = torch.stack(loss, dim=1) return (loss * m).sum() / data.size()[0] def plot_grad_flow(self, named_parameters, axis, plot_title): '''Plots the gradients flowing through different layers in the net during training. Can be used for checking for possible gradient vanishing / exploding problems. Usage: Plug this function in Trainer class after loss.backwards() as "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow''' ave_grads = [] max_grads = [] layers = [] for n, p in named_parameters: if (p.requires_grad) and ("bias" not in n): layers.append(n) ave_grads.append(p.grad.abs().mean()) max_grads.append(p.grad.abs().max()) axis.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c") axis.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b") axis.hlines(0, 0, len(ave_grads) + 1, lw=2, color="k") axis.set_xticklabels(layers, rotation="vertical") axis.set_xlim(left=0, right=len(ave_grads)) axis.set_ylim(bottom=-0.001, top=1.) 
# zoom in on the lower gradient regions axis.set_xlabel("Layers") axis.set_ylabel("average gradient") axis.set_title(plot_title) plt.grid(True) axis.legend([ Line2D([0], [0], color="c", lw=4), Line2D([0], [0], color="b", lw=4), Line2D([0], [0], color="k", lw=4) ], ['max-gradient', 'mean-gradient', 'zero-gradient'], loc='upper left') def fit(self, train_data, eval_interval, eval, discrete_columns=tuple(), epochs=300, log_frequency=True): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. epochs (int): Number of training epochs. Defaults to 300. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. """ train = train_data self.transformer = DataTransformer() self.transformer.fit(train_data, discrete_columns) train_data = self.transformer.transform(train_data) data_sampler = Sampler(train_data, self.transformer.output_info) data_dim = self.transformer.output_dimensions self.cond_generator = ConditionalGenerator( train_data, self.transformer.output_info, log_frequency) self.generator = Generator( self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim).to(self.device) self.discriminator = Discriminator( data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device) optimizerG = optim.Adam(self.generator.parameters(), lr=2e-4, betas=(0.5, 0.9), weight_decay=self.l2scale) optimizerD = optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9)) assert self.batch_size % 2 == 0 mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device) std = mean + 1 # figure for plotting gradients; ax1 for generator and ax2 for discriminator #fig, (ax1, ax2) = plt.subplots(1, 2) steps_per_epoch = max(len(train_data) // self.batch_size, 1) for i in range(epochs): for id_ in range(steps_per_epoch): fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = data_sampler.sample(self.batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self.batch_size) np.random.shuffle(perm) real = data_sampler.sample(self.batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self.generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype('float32')).to(self.device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake y_fake = self.discriminator(fake_cat) y_real = self.discriminator(real_cat) pen = self.discriminator.calc_gradient_penalty( real_cat, fake_cat, self.device) loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() #plot gradients #self.plot_grad_flow(self.discriminator.named_parameters(), ax2, 'Gradient flow for D') optimizerD.step() fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = 
None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) fake = self.generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = self.discriminator(torch.cat([fakeact, c1], dim=1)) else: y_fake = self.discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) loss_g = -torch.mean(y_fake) + cross_entropy optimizerG.zero_grad() loss_g.backward() # plot gradients #self.plot_grad_flow(self.generator.named_parameters(), ax1, 'Gradient flow for G') optimizerG.step() print("Epoch %d, Loss G: %.4f, Loss D: %.4f" % (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()), flush=True) #check model results every x epochs #fig2, ax3 = plt.subplots(1) if eval: stats_real = train[self.demand_column].describe() stats_real_week = train.groupby('Weekday')[ self.demand_column].describe() stats_real_month = train.groupby('Month')[ self.demand_column].describe() if ((i + 1) % eval_interval == 0): eval_sample = self.sample(1000) sample = pd.DataFrame(eval_sample, columns=eval_sample.columns) #sample.loc[:, self.demand_column].hist(bins=50, alpha=0.4, label='fake') #ax3.hist(pd.DataFrame(train, columns=train.columns).loc[:, self.demand_column], bins=50, alpha=0.4, label='real') #fig2.legend() #fig2.show() print( (sample[self.demand_column].describe() - stats_real) / stats_real) print(' ') print(((sample.groupby('Weekday')[ self.demand_column].describe() - stats_real_week) / stats_real_week).T) print(' ') print(((sample.groupby('Month')[ self.demand_column].describe() - stats_real_month) / stats_real_month).T) plt.show() def sample(self, n, condition=None, seed=0): """Sample data similar to the training data. Args: n (int): Number of rows to sample. condvec (Tuple): Number of column and value of condition to sample seed (int): Seed to create result reproducibility Returns: numpy.ndarray or pandas.DataFrame """ self.generator.eval() steps = n // self.batch_size + 1 data = [] np.random.seed(seed) torch.manual_seed(seed) for i in range(steps): mean = torch.zeros(self.batch_size, self.embedding_dim) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self.device) if condition is None: #select random condition condvec = self.cond_generator.sample_zero(self.batch_size) else: condvec = self.cond_generator.sample_condition( self.batch_size, condition, self.transformer) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) fake = self.generator(fakez) fakeact = self._apply_activate(fake) data.append(fakeact.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:n] # sets model back to training mode self.generator.train() return self.transformer.inverse_transform(data, None)
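
# A hypothetical end-to-end usage of the demand-oriented class above. The CSV
# file and column names are illustrative assumptions; the exact format expected
# by ConditionalGenerator.sample_condition is not shown in the snippet, so the
# example sticks to unconditional sampling.
import pandas as pd

train_df = pd.read_csv("demand.csv")          # assumed to contain 'Demand', 'Weekday', 'Month'
synth = CTGANSynthesizer(demand_col="Demand")
synth.fit(train_df,
          eval_interval=25,                   # print sample-vs-real stats every 25 epochs
          eval=True,
          discrete_columns=["Weekday", "Month"],
          epochs=300)

# Seeded sampling; pass a condition tuple instead of None to bias the draws.
synthetic = synth.sample(1000, condition=None, seed=0)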
class CTGAN(object): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. gen_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256). dis_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). l2scale (float): Wheight Decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. """ def __init__(self, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256), l2scale=1e-6, batch_size=500, epochs=100): self.embedding_dim = embedding_dim self.gen_dim = gen_dim self.dis_dim = dis_dim self.l2scale = l2scale self.batch_size = batch_size self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.trained_epoches = 0 self.epochs = epochs def _apply_activate(self, data): data_t = [] st = 0 for item in self.transformer.output_info: if item[1] == 'tanh': ed = st + item[0] data_t.append(torch.tanh(data[:, st:ed])) st = ed elif item[1] == 'softmax': ed = st + item[0] data_t.append( functional.gumbel_softmax(data[:, st:ed], tau=0.2)) st = ed else: assert 0 return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): loss = [] st = 0 st_c = 0 skip = False for item in self.transformer.output_info: if item[1] == 'tanh': st += item[0] skip = True elif item[1] == 'softmax': if skip: skip = False st += item[0] continue ed = st + item[0] ed_c = st_c + item[0] tmp = functional.cross_entropy(data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none') loss.append(tmp) st = ed st_c = ed_c else: assert 0 loss = torch.stack(loss, dim=1) return (loss * m).sum() / data.size()[0] def train(self, train_data, categorical_columns=tuple(), epochs=-1, log_frequency=True, ordinal_columns=None, update_epsilon=None, verbose=False, mlflow=False): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. epochs (int): Number of training epochs. Defaults to init param. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. 
""" if (epochs == -1): epochs = self.epochs if not hasattr(self, "transformer"): self.transformer = DataTransformer() self.transformer.fit(train_data, categorical_columns) train_data = self.transformer.transform(train_data) data_sampler = Sampler(train_data, self.transformer.output_info) data_dim = self.transformer.output_dimensions if not hasattr(self, "cond_generator"): self.cond_generator = ConditionalGenerator( train_data, self.transformer.output_info, log_frequency) if not hasattr(self, "generator"): self.generator = Generator( self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim).to(self.device) if not hasattr(self, "discriminator"): self.discriminator = Discriminator( data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device) if not hasattr(self, "optimizerG"): self.optimizerG = optim.Adam(self.generator.parameters(), lr=2e-4, betas=(0.5, 0.9), weight_decay=self.l2scale) if not hasattr(self, "optimizerD"): self.optimizerD = optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9)) assert self.batch_size % 2 == 0 mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device) std = mean + 1 steps_per_epoch = max(len(train_data) // self.batch_size, 1) for i in range(epochs): self.trained_epoches += 1 for id_ in range(steps_per_epoch): fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = data_sampler.sample(self.batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self.batch_size) np.random.shuffle(perm) real = data_sampler.sample(self.batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self.generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype('float32')).to(self.device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake y_fake = self.discriminator(fake_cat) y_real = self.discriminator(real_cat) pen = self.discriminator.calc_gradient_penalty( real_cat, fake_cat, self.device) loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) self.optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() self.optimizerD.step() fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) fake = self.generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = self.discriminator(torch.cat([fakeact, c1], dim=1)) else: y_fake = self.discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) loss_g = -torch.mean(y_fake) + cross_entropy self.optimizerG.zero_grad() loss_g.backward() self.optimizerG.step() if (verbose): if (i % 50 == 0): print("Epoch %d, Loss G: %.4f, Loss D: %.4f" % (self.trained_epoches - 1, loss_g.detach().cpu(), loss_d.detach().cpu()), flush=True) if (mlflow): import mlflow mlflow.log_metric("loss_d", float(loss_d.detach().cpu()), step=self.trained_epoches - 1) mlflow.log_metric("loss_g", float(loss_g.detach().cpu()), step=self.trained_epoches - 1) def generate(self, n, condition_column=None, condition_value=None): """Sample data 
similar to the training data.

        Args:
            n (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self.cond_generator.generate_cond_from_condition_column_info(
                condition_info, self.batch_size)
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is not None:
                c1 = torch.from_numpy(condvec).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")

        # always save a cpu model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model
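
# A minimal round-trip sketch for the CTGAN class above: train, persist with
# save() (which pickles the whole synthesizer on CPU), reload, and generate.
# File and column names are illustrative assumptions.
import pandas as pd

df = pd.read_csv("customers.csv")             # hypothetical training table
model = CTGAN(batch_size=500, epochs=100)
model.train(df, categorical_columns=["gender", "city"], verbose=True)
model.save("ctgan_model.pt")

restored = CTGAN.load("ctgan_model.pt")
synthetic = restored.generate(500, condition_column="gender", condition_value="F")
# Because save() serializes the object with torch.save(self, path), loading
# requires the same class definition (and compatible torch/pickle versions)
# to be importable at load time.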
def fit(self, train_data, discrete_columns=(), epochs=300, log_frequency=True):
    """Fit the CTGAN Synthesizer models to the training data.

    Args:
        train_data (numpy.ndarray or pandas.DataFrame):
            Training Data. It must be a 2-dimensional numpy array or a
            pandas.DataFrame.
        discrete_columns (list-like):
            List of discrete columns to be used to generate the Conditional
            Vector. If ``train_data`` is a Numpy array, this list should
            contain the integer indices of the columns. Otherwise, if it is
            a ``pandas.DataFrame``, this list should contain the column names.
        epochs (int):
            Number of training epochs. Defaults to 300.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
    """
    self.transformer = DataTransformer()
    self.transformer.fit(train_data, discrete_columns)
    train_data = self.transformer.transform(train_data)

    data_sampler = Sampler(train_data, self.transformer.output_info)
    data_dim = self.transformer.output_dimensions

    self.cond_generator = ConditionalGenerator(
        train_data, self.transformer.output_info, log_frequency)

    self.generator = Generator(
        self.embedding_dim + self.cond_generator.n_opt,
        self.gen_dim, data_dim).to(self.device)

    discriminator = Discriminator(
        data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device)

    optimizerG = optim.Adam(self.generator.parameters(), lr=2e-4,
                            betas=(0.5, 0.9), weight_decay=self.l2scale)
    optimizerD = optim.Adam(discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

    assert self.batch_size % 2 == 0

    mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
    std = mean + 1

    train_losses = []
    early_stopping = EarlyStopping(patience=self.patience, verbose=False)

    steps_per_epoch = max(len(train_data) // self.batch_size, 1)
    for i in range(epochs):
        for id_ in range(steps_per_epoch):
            # Discriminator (critic) update.
            fakez = torch.normal(mean=mean, std=std)

            condvec = self.cond_generator.sample(self.batch_size)
            if condvec is None:
                c1, m1, col, opt = None, None, None, None
                real = data_sampler.sample(self.batch_size, col, opt)
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

                perm = np.arange(self.batch_size)
                np.random.shuffle(perm)
                real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                c2 = c1[perm]

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            real = torch.from_numpy(real.astype('float32')).to(self.device)

            if c1 is not None:
                fake_cat = torch.cat([fakeact, c1], dim=1)
                real_cat = torch.cat([real, c2], dim=1)
            else:
                real_cat = real
                fake_cat = fake

            y_fake = discriminator(fake_cat)
            y_real = discriminator(real_cat)

            pen = discriminator.calc_gradient_penalty(real_cat, fake_cat, self.device)
            loss_d = -(torch.mean(y_real) - torch.mean(y_fake))
            train_losses.append(loss_d.item())

            optimizerD.zero_grad()
            pen.backward(retain_graph=True)
            loss_d.backward()
            optimizerD.step()

            # Generator update.
            fakez = torch.normal(mean=mean, std=std)
            condvec = self.cond_generator.sample(self.batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = discriminator(fakeact)

            if condvec is None:
                cross_entropy = 0
            else:
                cross_entropy = self._cond_loss(fake, c1, m1)

            loss_g = -torch.mean(y_fake) + cross_entropy
            train_losses.append(loss_g.item())

            optimizerG.zero_grad()
            loss_g.backward()
            optimizerG.step()

        # Average the losses collected this epoch and check the stopping criterion.
        early_stopping(np.average(train_losses))
        if early_stopping.early_stop:
            print("GAN: Early stopping after epoch {}".format(i))
            break
        train_losses = []
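
# The fit() above relies on an EarlyStopping helper (and a `self.patience`
# attribute) that are not defined anywhere in this snippet. The sketch below is
# an assumed, minimal implementation that matches how it is used: called once
# per epoch with an averaged loss and exposing an `early_stop` flag.
class EarlyStopping:
    def __init__(self, patience=10, verbose=False, delta=0.0):
        self.patience = patience        # epochs to wait without improvement
        self.verbose = verbose
        self.delta = delta              # minimum change that counts as improvement
        self.best_loss = float("inf")
        self.counter = 0
        self.early_stop = False

    def __call__(self, loss):
        if loss < self.best_loss - self.delta:
            self.best_loss = loss
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
# Note that averaging generator and discriminator losses into a single early
# stopping signal is a heuristic; adversarial losses are not guaranteed to
# decrease monotonically as sample quality improves.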
class CTGANSynthesizer(object): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. gen_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256). dis_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). l2scale (float): Weight Decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. discriminator_steps (int): Number of discriminator updates to do for each generator update. From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper default is 5. Default used is 1 to match original CTGAN implementation. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. blackbox_model: Model that implements fit, predict, predict_proba """ def __init__( self, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256), l2scale=1e-6, batch_size=500, discriminator_steps=1, log_frequency=True, blackbox_model=None, preprocessing_pipeline=None, bb_loss="logloss", confidence_levels=[], #default value if no confidence given ): self.embedding_dim = embedding_dim self.gen_dim = gen_dim self.dis_dim = dis_dim self.l2scale = l2scale self.batch_size = batch_size self.log_frequency = log_frequency self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.trained_epoches = 0 self.discriminator_steps = discriminator_steps self.blackbox_model = blackbox_model self.preprocessing_pipeline = preprocessing_pipeline self.confidence_levels = confidence_levels #set here not in fit self.bb_loss = bb_loss #print("self.confidence_levels") @staticmethod def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1): """Deals with the instability of the gumbel_softmax for older versions of torch. For more details about the issue: https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing Args: logits: […, num_features] unnormalized log probabilities tau: non-negative scalar temperature hard: if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd dim (int): a dimension along which softmax will be computed. Default: -1. Returns: Sampled tensor of same shape as logits from the Gumbel-Softmax distribution. 
""" if version.parse(torch.__version__) < version.parse("1.2.0"): for i in range(10): transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) if not torch.isnan(transformed).any(): return transformed raise ValueError("gumbel_softmax returning NaN.") return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) def _apply_activate(self, data): data_t = [] st = 0 for item in self.transformer.output_info: if item[1] == "tanh": ed = st + item[0] data_t.append(torch.tanh(data[:, st:ed])) st = ed elif item[1] == "softmax": ed = st + item[0] transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2) data_t.append(transformed) st = ed else: assert 0 return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): loss = [] st = 0 st_c = 0 skip = False for item in self.transformer.output_info: if item[1] == "tanh": st += item[0] skip = True elif item[1] == "softmax": if skip: skip = False st += item[0] continue ed = st + item[0] ed_c = st_c + item[0] tmp = functional.cross_entropy( data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction="none", ) loss.append(tmp) st = ed st_c = ed_c else: assert 0 loss = torch.stack(loss, dim=1) return (loss * m).sum() / data.size()[0] def fit( self, train_data, discrete_columns=tuple(), epochs=300, verbose=True, gen_lr=2e-4, ): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. epochs (int): Number of training epochs. Defaults to 300. 
""" """ self.confidence_level = confidence_level loss_other_name = "loss_bb" if confidence_level != -1 else "loss_d" history = {"loss_g": [], loss_other_name: []} """ # Eli: add Mode-specific Normalization if not hasattr(self, "transformer"): self.transformer = DataTransformer() self.transformer.fit(train_data, discrete_columns) train_data = self.transformer.transform(train_data) data_sampler = Sampler(train_data, self.transformer.output_info) data_dim = self.transformer.output_dimensions if not hasattr(self, "cond_generator"): self.cond_generator = ConditionalGenerator( train_data, self.transformer.output_info, self.log_frequency) if not hasattr(self, "generator"): self.generator = Generator( self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim).to(self.device) #print(data_dim) #print(self.cond_generator.n_opt) if not hasattr(self, "discriminator"): self.discriminator = Discriminator( data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device) """ #after sample in fit gen_output is 120 not 80(cause gen allmost twice) if not hasattr(self, "discriminator"): self.discriminator = Discriminator( 24 + self.cond_generator.n_opt, self.dis_dim ).to(self.device) """ if not hasattr(self, "optimizerG"): self.optimizerG = optim.Adam( self.generator.parameters(), lr=gen_lr, betas=(0.5, 0.9), weight_decay=self.l2scale, ) if not hasattr(self, "optimizerD"): self.optimizerD = optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9)) assert self.batch_size % 2 == 0 # init mean to zero and std to one #keep one spot to confidence level, which will be add after normal dis will mean = torch.zeros(((self.batch_size * self.embedding_dim) - 1), device=self.device) std = mean + 1 # steps_per_epoch = max(len(train_data) // self.batch_size, 1) steps_per_epoch = 10 # magic number decided with Gilad. 
feel free to change it loss_other_name = "loss_bb" if self.confidence_levels != [] else "loss_d" allhist = {"confidence_levels_history": []} # Eli: start training loop for current_conf_level in self.confidence_levels: #need to change, if non confidence give, aka []) no loop will run #so if no conf, need to jump over conf loop history = { "confidence_level": current_conf_level, "loss_g": [], loss_other_name: [] } for i in range(epochs): self.trained_epoches += 1 for id_ in range(steps_per_epoch): #we examine confidence lelvel so no need parts which they does not show up """ if self.confidence_levels == []: # discriminator loop for n in range(self.discriminator_steps): fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = data_sampler.sample(self.batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self.batch_size) np.random.shuffle(perm) real = data_sampler.sample( self.batch_size, col[perm], opt[perm] ) c2 = c1[perm] fake = self.generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype("float32")).to(self.device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake y_fake = self.discriminator(fake_cat) y_real = self.discriminator(real_cat) pen = self.discriminator.calc_gradient_penalty( real_cat, fake_cat, self.device ) loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) if self.confidence_levels == []: # without bb loss self.optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() self.optimizerD.step() """ # we examine confidence lelvel so no need parts which they does not show up #as above(no confidence and uses discriminator, no need) fakez = torch.normal(mean=mean, std=std) #added here the part of adding conf to sample #not sure why we need this part in the code but debug shows it this usses this lines #ask gilad eli confi = current_conf_level.astype( np.float32) # generator excpect float conf = torch.tensor([confi]).to( self.device ) # change conf to conf input that will sent!! 
fakez = torch.cat([fakez, conf], dim=0) fakez = torch.reshape( fakez, (self.batch_size, self.embedding_dim)) #end added here the part of adding conf to sample condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) fake = self.generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = self.discriminator( torch.cat([fakeact, c1], dim=1)) else: y_fake = self.discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) if self.confidence_levels != []: # generate `batch_size` samples #samples of fit using yBB in its input #apply generator twice gen_out, gen_fakeacts = self.sample( self.batch_size, current_conf_level) """ gen_out=fakeact.detach().cpu().numpy() gen_out=self.transformer.inverse_transform(gen_out, None) """ loss_bb = self._calc_bb_confidence_loss( gen_out, current_conf_level ) #send specific confidence to loss computation #return conf bit vector input and bb_y_vec input bit #send to discriminate to connect gradient again y_fake = self.discriminator(gen_fakeacts) #find mean like in original ctgan #multiple by small number to get realy small value y_fake_mean = torch.mean(y_fake) * 0.00000000000001 y_fake_mean = y_fake_mean.to(self.device) #change to float because y_fake is float not double loss_bb = loss_bb.float() #add y_fake_mean to lossg like in origin ctgan #plus loss_bb loss_g = -y_fake_mean + loss_bb + cross_entropy #loss_g = -y_fake_mean + cross_entropy else: # original loss loss_g = -torch.mean(y_fake) + cross_entropy self.optimizerG.zero_grad() loss_g.backward() """ print("gradients\n") for p in self.generator.parameters(): print(p.grad) """ """ for name, param in self.generator.named_parameters(): #if param.requires_grad: print(name) print(param.grad) """ self.optimizerG.step() loss_g_val = loss_g.detach().cpu() loss_other_val = locals()[loss_other_name].detach().cpu() history["loss_g"].append(loss_g.item()) history[loss_other_name].append(loss_other_val.item()) if verbose: print( f"Epoch {self.trained_epoches}, Loss G: {loss_g_val}, {loss_other_name}: {loss_other_val}", flush=True, ) allhist["confidence_levels_history"].append(history) return allhist #special sample for fit/train part #extra bit added to input as y,apply gen twice #in second time with ybb after sent gen_out if first apply generator #first time with y zero bit second time with bb conf bit #not relevant function for now def fit_sample(self, n, confidence_level, condition_column=None, condition_value=None): """Sample data similar to the training data. Choosing a condition_column and condition_value will increase the probability of the discrete condition_value happening in the condition_column. Args: n (int): Number of rows to sample. condition_column (string): Name of a discrete column. condition_value (string): Name of the category in the condition_column which we wish to increase the probability of happening. 
Returns: numpy.ndarray or pandas.DataFrame """ if condition_column is not None and condition_value is not None: condition_info = self.transformer.covert_column_name_value_to_id( condition_column, condition_value) global_condition_vec = ( self.cond_generator.generate_cond_from_condition_column_info( condition_info, self.batch_size)) else: global_condition_vec = None steps = n // self.batch_size + 1 data = [] gen_input_layer = self.generator.seq[0].fc #get linearinpputlayer #now first sample part need to be #with out not gradients updates #so for all first part we do: with torch.no_grad(): for i in range(steps): #check if it is okay decrease noise by one for adding conf #add conf ass input to sample function, as number vector #give one vector place to adding y to gen noise mean = torch.zeros(self.batch_size, (self.embedding_dim - 2)) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self.device) #fakez = torch.reshape(fakez,(-1,)) confidence_level = confidence_level.astype( np.float32) #generator excpect float #create C column vector of confidence level C conf = torch.zeros(self.batch_size, 1) + confidence_level conf = conf.to( self.device) #change conf to conf input that will sent!! #first sample will geet zero as y column vector yzero = torch.zeros(self.batch_size, 1).to(self.device) #adding y bit to fakez gen noise,generator input fakez = torch.cat([fakez, conf, yzero], dim=1) fakez = torch.reshape(fakez, (self.batch_size, self.embedding_dim)) if global_condition_vec is not None: condvec = global_condition_vec.copy() else: condvec = self.cond_generator.sample_zero(self.batch_size) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) #conftens = torch.tensor([0.5]); #fakez = torch.cat([fakez, conftens], dim=1)#check to delelte!! 
#reset to zero weights of y_zero and his bias gen_input_layer.weight[-1] = 0 #-1 is last line, where I considered yzero weights to be #which contains all weights of last bit of input gen_input_layer.bias[-1] = 0 #-1 is last bias,where I considered yzero weights to be #in biases vector which contain the bias of last bit fake = self.generator(fakez) fakeact = self._apply_activate(fake) data.append(fakeact.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:n] # instead of return data # we will use it here to give it to BB and get new conf y gen_out = self.transformer.inverse_transform(data, None) y_prob_for_y_zero = self.blackbox_model.predict_proba(gen_out) y_conf_gen_for_y_zero = y_prob_for_y_zero[:, 0].astype(np.float32) #now we use this y conf and put it as y bit instead 0 and sample normally ####### #second part of samples with BB y conf bit ####### if condition_column is not None and condition_value is not None: condition_info = self.transformer.covert_column_name_value_to_id( condition_column, condition_value) global_condition_vec = ( self.cond_generator.generate_cond_from_condition_column_info( condition_info, self.batch_size)) else: global_condition_vec = None steps = n // self.batch_size + 1 data = [] for i in range(steps): #check if it is okay decrease noise by one for adding conf #add conf ass input to sample function, as vector cloumn #decrease by one more for y vec (-2 for y bit a conf level in input) mean = torch.zeros(self.batch_size, (self.embedding_dim - 2)) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self.device) #fakez = torch.reshape(fakez,(-1,)) confidence_level = confidence_level.astype( np.float32) #generator excpect float #conf = torch.tensor([confidence_level],requires_grad=True).to(self.device)#change conf to conf input that will sent!! conf = torch.zeros(self.batch_size, 1) + confidence_level #conf is target in BBCE loss function #target ccant have grdient descent True for some reason #conf.requires_grad = True conf.to(self.device) #fix y bb bit #y_conf_gen_for_y_zero = y_conf_gen_for_y_zero.astype(np.float32)#generator #y_BB_bit=torch.tensor([y_conf_gen_for_y_zero], requires_grad=True).to(self.device) #y_BB_column = torch.zeros(self.batch_size,1) + y_conf_gen_for_y_zero y_BB_column = torch.tensor([y_conf_gen_for_y_zero], requires_grad=True).to(self.device) #y_BB_column.requires_grad = True #take Transpose because shape is 1*50 - need 50*1 y_BB_column = y_BB_column.T y_BB_column.to(self.device) #put y_bb bit in fakez gen noise ,generator input fakez = torch.cat([fakez, conf, y_BB_column], dim=1) fakez = torch.reshape(fakez, (self.batch_size, self.embedding_dim)) #fakez = fakez.astype(np.float32) if global_condition_vec is not None: condvec = global_condition_vec.copy() else: condvec = self.cond_generator.sample_zero(self.batch_size) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) #conftens = torch.tensor([0.5]); #fakez = torch.cat([fakez, conftens], dim=1)#check to delelte!! 
    # Plain sampling used to create data for the experiments (no training involved).
    def sample(self, n, confidence_level, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.

        Args:
            n (int):
                Number of rows to sample.
            confidence_level (numpy.float32):
                Confidence level appended as an extra column of the generator input.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size))
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        our_fakeacts = []
        for i in range(steps):
            # Reserve one slot of the noise vector for the confidence level column.
            mean = torch.zeros(self.batch_size, self.embedding_dim - 1)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)
            confidence_level = confidence_level.astype(np.float32)  # generator expects float32
            # Column vector filled with the requested confidence level C.
            conf = torch.zeros(self.batch_size, 1) + confidence_level
            conf = conf.to(self.device)
            fakez = torch.cat([fakez, conf], dim=1)
            fakez = torch.reshape(fakez, (self.batch_size, self.embedding_dim))
            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is not None:
                c1 = torch.from_numpy(condvec).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)
            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            our_fakeacts.append(fakeact)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]
        gen_fakeacts = torch.cat(our_fakeacts, 0)
        return self.transformer.inverse_transform(data, None), gen_fakeacts
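# A minimal usage sketch for the sampling method above, assuming `synth` is an already
# fitted instance of this class and that the training frame had a discrete column named
# "label" with a category "1" (both names are hypothetical).
import numpy as np

synth_rows, fake_activations = synth.sample(
    n=1000,
    confidence_level=np.float32(0.8),   # passed as a numpy scalar because the method calls .astype
    condition_column="label",
    condition_value="1",
)
print(synth_rows.shape)                 # n rows in the original feature space
print(fake_activations.shape)           # un-trimmed generator activations (steps * batch_size rows)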
    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")
        # Always save a CPU model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)
        torch.save(self, path)
        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model

    def _calc_bb_confidence_loss(self, gen_out, conf_level):
        # Takes the specific confidence level as an input argument.
        y_prob = self.blackbox_model.predict_proba(gen_out)
        y_conf_gen = y_prob[:, 0]  # confidence scores
        # Create a vector of the same size as y_conf_gen filled with the wanted confidence level.
        if isinstance(conf_level, list):
            conf = np.random.choice(conf_level)
        else:
            conf = conf_level
        y_conf_wanted = np.full(len(y_conf_gen), conf)
        # To tensors.
        y_conf_gen = torch.tensor(y_conf_gen, requires_grad=True).to(self.device)
        y_conf_wanted = torch.tensor(y_conf_wanted).to(self.device)
        # Loss.
        bb_loss = self._get_loss_by_name(self.bb_loss)
        bb_loss_val = bb_loss(y_conf_gen, y_conf_wanted)
        return bb_loss_val

    @staticmethod
    def _get_loss_by_name(loss_name):
        if loss_name in ("log", "logloss"):
            return torch.nn.BCELoss()
        elif loss_name == "l1":
            return torch.nn.L1Loss()
        elif loss_name == "l2":
            return torch.nn.MSELoss()
        elif loss_name == "focal":
            return WeightedFocalLoss()
        else:
            raise ValueError(f"Unknown loss name '{loss_name}'")
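# Standalone sketch of the black-box confidence loss above, kept on CPU for clarity.
# `bb_model` is any classifier exposing predict_proba and `gen_out` is a decoded batch;
# both names are placeholders, and the L1 penalty here stands in for self.bb_loss.
import numpy as np
import torch

def bb_confidence_loss(bb_model, gen_out, conf_level, loss_fn=torch.nn.L1Loss()):
    """Penalize the gap between the black-box confidence and the requested level."""
    y_conf_gen = bb_model.predict_proba(gen_out)[:, 0]        # confidence of class 0
    y_conf_wanted = np.full(len(y_conf_gen), conf_level)
    return loss_fn(torch.tensor(y_conf_gen, dtype=torch.float64),
                   torch.tensor(y_conf_wanted, dtype=torch.float64))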
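# Hedged end-to-end sketch of the confidence-guided training call defined below, assuming
# `df` is a pandas.DataFrame, `clf` is a fitted classifier whose predict_proba accepts the
# decoded frame, and the column names are placeholders.
clf_guided = CTGANSynthesizer(blackbox_model=clf, bb_loss="l1")
history = clf_guided.fit(
    df,
    discrete_columns=["workclass", "education"],   # hypothetical discrete columns
    epochs=50,
    confidence_level=0.8,                          # -1 falls back to the plain CTGAN loss
)
print(history["loss_g"][-1], history["loss_bb"][-1])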
class CTGANSynthesizer(object): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. gen_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256). dis_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). l2scale (float): Weight Decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. discriminator_steps (int): Number of discriminator updates to do for each generator update. From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper default is 5. Default used is 1 to match original CTGAN implementation. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. blackbox_model: Model that implements fit, predict, predict_proba """ def __init__( self, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256), l2scale=1e-6, batch_size=500, discriminator_steps=1, log_frequency=True, blackbox_model=None, preprocessing_pipeline=None, bb_loss="logloss", ): self.embedding_dim = embedding_dim self.gen_dim = gen_dim self.dis_dim = dis_dim self.l2scale = l2scale self.batch_size = batch_size self.log_frequency = log_frequency self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.trained_epoches = 0 self.discriminator_steps = discriminator_steps self.blackbox_model = blackbox_model self.preprocessing_pipeline = preprocessing_pipeline self.confidence_level = -1 # will set in fit self.bb_loss = bb_loss @staticmethod def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1): """Deals with the instability of the gumbel_softmax for older versions of torch. For more details about the issue: https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing Args: logits: […, num_features] unnormalized log probabilities tau: non-negative scalar temperature hard: if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd dim (int): a dimension along which softmax will be computed. Default: -1. Returns: Sampled tensor of same shape as logits from the Gumbel-Softmax distribution. 
""" if version.parse(torch.__version__) < version.parse("1.2.0"): for i in range(10): transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) if not torch.isnan(transformed).any(): return transformed raise ValueError("gumbel_softmax returning NaN.") return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) def _apply_activate(self, data): data_t = [] st = 0 for item in self.transformer.output_info: if item[1] == "tanh": ed = st + item[0] data_t.append(torch.tanh(data[:, st:ed])) st = ed elif item[1] == "softmax": ed = st + item[0] transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2) data_t.append(transformed) st = ed else: assert 0 return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): loss = [] st = 0 st_c = 0 skip = False for item in self.transformer.output_info: if item[1] == "tanh": st += item[0] skip = True elif item[1] == "softmax": if skip: skip = False st += item[0] continue ed = st + item[0] ed_c = st_c + item[0] tmp = functional.cross_entropy( data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction="none", ) loss.append(tmp) st = ed st_c = ed_c else: assert 0 loss = torch.stack(loss, dim=1) return (loss * m).sum() / data.size()[0] def fit( self, train_data, discrete_columns=tuple(), epochs=300, confidence_level=-1, verbose=True, gen_lr=2e-4, ): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. epochs (int): Number of training epochs. Defaults to 300. """ self.confidence_level = confidence_level loss_other_name = "loss_bb" if confidence_level != -1 else "loss_d" history = {"loss_g": [], loss_other_name: []} # Eli: add Mode-specific Normalization if not hasattr(self, "transformer"): self.transformer = DataTransformer() self.transformer.fit(train_data, discrete_columns) train_data = self.transformer.transform(train_data) data_sampler = Sampler(train_data, self.transformer.output_info) data_dim = self.transformer.output_dimensions if not hasattr(self, "cond_generator"): self.cond_generator = ConditionalGenerator( train_data, self.transformer.output_info, self.log_frequency) if not hasattr(self, "generator"): self.generator = Generator( self.embedding_dim + self.cond_generator.n_opt, self.gen_dim, data_dim).to(self.device) if not hasattr(self, "discriminator"): self.discriminator = Discriminator( data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device) if not hasattr(self, "optimizerG"): self.optimizerG = optim.Adam( self.generator.parameters(), lr=gen_lr, betas=(0.5, 0.9), weight_decay=self.l2scale, ) if not hasattr(self, "optimizerD"): self.optimizerD = optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9)) assert self.batch_size % 2 == 0 # init mean to zero and std to one mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device) std = mean + 1 # steps_per_epoch = max(len(train_data) // self.batch_size, 1) steps_per_epoch = 10 # magic number decided with Gilad. 
feel free to change it # Eli: start training loop for i in range(epochs): self.trained_epoches += 1 for id_ in range(steps_per_epoch): if self.confidence_level == -1: # discriminator loop for n in range(self.discriminator_steps): fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = data_sampler.sample(self.batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self.batch_size) np.random.shuffle(perm) real = data_sampler.sample(self.batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self.generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype("float32")).to( self.device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake y_fake = self.discriminator(fake_cat) y_real = self.discriminator(real_cat) pen = self.discriminator.calc_gradient_penalty( real_cat, fake_cat, self.device) loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) if self.confidence_level == -1: # without bb loss self.optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() self.optimizerD.step() fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample(self.batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self.device) m1 = torch.from_numpy(m1).to(self.device) fakez = torch.cat([fakez, c1], dim=1) fake = self.generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = self.discriminator(torch.cat([fakeact, c1], dim=1)) else: y_fake = self.discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) if self.confidence_level != -1: # generate `batch_size` samples gen_out = self.sample(self.batch_size) loss_bb = self._calc_bb_confidence_loss(gen_out) loss_g = loss_bb + cross_entropy else: # original loss loss_g = -torch.mean(y_fake) + cross_entropy self.optimizerG.zero_grad() loss_g.backward() self.optimizerG.step() loss_g_val = loss_g.detach().cpu() loss_other_val = locals()[loss_other_name].detach().cpu() history["loss_g"].append(loss_g.item()) history[loss_other_name].append(loss_other_val.item()) if verbose: print( f"Epoch {self.trained_epoches}, Loss G: {loss_g_val}, {loss_other_name}: {loss_other_val}", flush=True, ) return history def sample(self, n, condition_column=None, condition_value=None): """Sample data similar to the training data. Choosing a condition_column and condition_value will increase the probability of the discrete condition_value happening in the condition_column. Args: n (int): Number of rows to sample. condition_column (string): Name of a discrete column. condition_value (string): Name of the category in the condition_column which we wish to increase the probability of happening. 
        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size))
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is not None:
                c1 = torch.from_numpy(condvec).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]
        return self.transformer.inverse_transform(data, None)

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")
        # Always save a CPU model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)
        torch.save(self, path)
        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model

    def _calc_bb_confidence_loss(self, gen_out):
        y_prob = self.blackbox_model.predict_proba(gen_out)
        y_conf_gen = y_prob[:, 0]  # confidence scores
        # Create a vector of the same size as y_conf_gen filled with the wanted confidence level.
        if isinstance(self.confidence_level, list):
            conf = np.random.choice(self.confidence_level)
        else:
            conf = self.confidence_level
        y_conf_wanted = np.full(len(y_conf_gen), conf)
        # To tensors.
        y_conf_gen = torch.tensor(y_conf_gen, requires_grad=True).to(self.device)
        y_conf_wanted = torch.tensor(y_conf_wanted).to(self.device)
        # Loss.
        bb_loss = self._get_loss_by_name(self.bb_loss)
        bb_loss_val = bb_loss(y_conf_gen, y_conf_wanted)
        return bb_loss_val

    @staticmethod
    def _get_loss_by_name(loss_name):
        if loss_name in ("log", "logloss"):
            return torch.nn.BCELoss()
        elif loss_name == "l1":
            return torch.nn.L1Loss()
        elif loss_name == "l2":
            return torch.nn.MSELoss()
        elif loss_name == "focal":
            return WeightedFocalLoss()
        else:
            raise ValueError(f"Unknown loss name '{loss_name}'")
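# Persistence round-trip sketch for the class above; the path and the fitted `clf_guided`
# instance from the earlier sketch are placeholders.
clf_guided.save("ctgan_bb.pt")                     # stored as a CPU model regardless of training device
restored = CTGANSynthesizer.load("ctgan_bb.pt")    # moved back to CUDA if available
samples = restored.sample(500)
print(samples.head() if hasattr(samples, "head") else samples[:5])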