def test_fit(self): """Test 'fit' on a pandas.DataFrame with one continuous and one discrete column. The 'fit' method should: - Set 'self.dataframe' to 'True' - Set 'self._column_raw_dtypes' to the appropriate dtypes - Use the appropriate '_fit' method for each column - Update 'self.output_info_list', 'self.output_dimensions' and 'self._column_transform_info_list' appropriately Setup: - Create DataTransformer - Mock _fit_discrete - Mock _fit_continuous Input: - raw_data = a table with one continuous and one discrete column. - discrete_columns = list with the name of the discrete column Output: - None Side Effects: - _fit_discrete and _fit_continuous should each be called once - Assigns 'self._column_raw_dtypes' the appropriate dtypes - Assigns 'self.output_info_list' the appropriate 'output_info'. - Assigns 'self.output_dimensions' the appropriate 'output_dimensions'. - Assigns 'self._column_transform_info_list' the appropriate 'column_transform_info'. """ data = pd.DataFrame({ "x": np.random.random(size=100), "y": np.random.choice(["yes", "no"], size=100) }) transformer = DataTransformer() transformer._fit_continuous = Mock() transformer._fit_continuous.return_value = ColumnTransformInfo( column_name="x", column_type="continuous", transform=None, transform_aux=None, output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')], output_dimensions=1 + 3) transformer._fit_discrete = Mock() transformer._fit_discrete.return_value = ColumnTransformInfo( column_name="y", column_type="discrete", transform=None, transform_aux=None, output_info=[SpanInfo(2, 'softmax')], output_dimensions=2) transformer.fit(data, discrete_columns=["y"]) transformer._fit_discrete.assert_called_once() transformer._fit_continuous.assert_called_once() assert transformer.output_dimensions == 6
def test_fit(self): """Test ``fit`` on a pandas.DataFrame with one continuous and one discrete column. The ``fit`` method should: - Set ``self.dataframe`` to ``True``. - Set ``self._column_raw_dtypes`` to the appropriate dtypes. - Use the appropriate ``_fit`` method for each column. - Update ``self.output_info_list``, ``self.output_dimensions`` and ``self._column_transform_info_list`` appropriately. Setup: - Create ``DataTransformer``. - Mock ``_fit_discrete``. - Mock ``_fit_continuous``. Input: - A table with one continuous and one discrete column. - A list with the name of the discrete column. Side Effects: - ``_fit_discrete`` and ``_fit_continuous`` should each be called once. - Assigns ``self._column_raw_dtypes`` the appropriate dtypes. - Assigns ``self.output_info_list`` the appropriate ``output_info``. - Assigns ``self.output_dimensions`` the appropriate ``output_dimensions``. - Assigns ``self._column_transform_info_list`` the appropriate ``column_transform_info``. """ # Setup transformer = DataTransformer() transformer._fit_continuous = Mock() transformer._fit_continuous.return_value = ColumnTransformInfo( column_name='x', column_type='continuous', transform=None, output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')], output_dimensions=1 + 3) transformer._fit_discrete = Mock() transformer._fit_discrete.return_value = ColumnTransformInfo( column_name='y', column_type='discrete', transform=None, output_info=[SpanInfo(2, 'softmax')], output_dimensions=2) data = pd.DataFrame({ 'x': np.random.random(size=100), 'y': np.random.choice(['yes', 'no'], size=100) }) # Run transformer.fit(data, discrete_columns=['y']) # Assert transformer._fit_discrete.assert_called_once() transformer._fit_continuous.assert_called_once() assert transformer.output_dimensions == 6
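# Illustrative helper (not part of the original test suite): the assertion above relies on
# ``fit`` accumulating the ``output_dimensions`` of every ``ColumnTransformInfo`` it collects,
# so the mocked continuous column contributes 1 + 3 = 4 and the discrete one contributes 2,
# giving the expected total of 6. A minimal sketch of that accumulation, assuming only the
# ``output_dimensions`` field used above:
def _expected_output_dimensions(column_transform_infos):
    """Sum the per-column output widths, mirroring what ``fit`` is asserted to compute."""
    return sum(info.output_dimensions for info in column_transform_infos)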
class TVAESynthesizer(BaseSynthesizer): """TVAESynthesizer.""" def __init__(self, embedding_dim=128, compress_dims=(128, 128), decompress_dims=(128, 128), l2scale=1e-5, batch_size=500, epochs=300): self.embedding_dim = embedding_dim self.compress_dims = compress_dims self.decompress_dims = decompress_dims self.l2scale = l2scale self.batch_size = batch_size self.loss_factor = 2 self.epochs = epochs self._device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") def fit(self, train_data, discrete_columns=tuple()): self.transformer = DataTransformer() self.transformer.fit(train_data, discrete_columns) train_data = self.transformer.transform(train_data) dataset = TensorDataset( torch.from_numpy(train_data.astype('float32')).to(self._device)) loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=True) data_dim = self.transformer.output_dimensions encoder = Encoder(data_dim, self.compress_dims, self.embedding_dim).to(self._device) self.decoder = Decoder(self.embedding_dim, self.decompress_dims, data_dim).to(self._device) optimizerAE = Adam(list(encoder.parameters()) + list(self.decoder.parameters()), weight_decay=self.l2scale) for i in range(self.epochs): for id_, data in enumerate(loader): optimizerAE.zero_grad() real = data[0].to(self._device) mu, std, logvar = encoder(real) eps = torch.randn_like(std) emb = eps * std + mu rec, sigmas = self.decoder(emb) loss_1, loss_2 = loss_function( rec, real, sigmas, mu, logvar, self.transformer.output_info_list, self.loss_factor) loss = loss_1 + loss_2 loss.backward() optimizerAE.step() self.decoder.sigma.data.clamp_(0.01, 1.0) def sample(self, samples): self.decoder.eval() steps = samples // self.batch_size + 1 data = [] for _ in range(steps): mean = torch.zeros(self.batch_size, self.embedding_dim) std = mean + 1 noise = torch.normal(mean=mean, std=std).to(self._device) fake, sigmas = self.decoder(noise) fake = torch.tanh(fake) data.append(fake.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:samples] return self.transformer.inverse_transform( data, sigmas.detach().cpu().numpy()) def set_device(self, device): self._device = device self.decoder.to(self._device)
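# Minimal usage sketch for the synthesizer above (illustrative, not from the original source).
# It assumes a small pandas DataFrame with one numeric and one categorical column; the column
# names are placeholders.
def _demo_tvae():
    import numpy as np
    import pandas as pd
    demo = pd.DataFrame({
        'age': np.random.randint(18, 90, size=1000),
        'employed': np.random.choice(['yes', 'no'], size=1000),
    })
    tvae = TVAESynthesizer(epochs=5)                # few epochs, just to smoke-test
    tvae.fit(demo, discrete_columns=('employed',))  # discrete columns passed by name
    return tvae.sample(100)                         # data comes back in the original format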
class CTGANSynthesizer(BaseSynthesizer): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. generator_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256). discriminator_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). generator_lr (float): Learning rate for the generator. Defaults to 2e-4. generator_decay (float): Generator weight decay for the Adam Optimizer. Defaults to 1e-6. discriminator_lr (float): Learning rate for the discriminator. Defaults to 2e-4. discriminator_decay (float): Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. discriminator_steps (int): Number of discriminator updates to do for each generator update. From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper default is 5. Default used is 1 to match original CTGAN implementation. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. verbose (boolean): Whether to have print statements for progress results. Defaults to ``False``. epochs (int): Number of training epochs. Defaults to 300. pack (int): Number of samples packed together when applying the discriminator. Defaults to 1. epsilon (float): Target differential privacy budget; training halts once it is spent. Defaults to 10. delta (float): Differential privacy delta parameter. Defaults to 1e-5. noise_multiplier (float): Noise multiplier passed to the Opacus ``PrivacyEngine``. Defaults to 2. max_grad_norm (float): Per-sample gradient clipping norm passed to the ``PrivacyEngine``. Defaults to 1. dp (boolean): Whether to train the discriminator with differential privacy. Defaults to ``True``. """ def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, discriminator_decay=0, pack=1, batch_size=500, discriminator_steps=1, log_frequency=True, verbose=False, epochs=300, epsilon=10, delta=1e-5, noise_multiplier=2, max_grad_norm=1, dp=True): assert batch_size % 2 == 0 self._embedding_dim = embedding_dim self._generator_dim = generator_dim self._discriminator_dim = discriminator_dim self._generator_lr = generator_lr self._generator_decay = generator_decay self._discriminator_lr = discriminator_lr self._discriminator_decay = discriminator_decay self._pack = pack  # packing option added on top of the original CTGAN implementation self._batch_size = batch_size self._discriminator_steps = discriminator_steps self._log_frequency = log_frequency self._verbose = verbose self._epochs = epochs self._epsilon = epsilon self._device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.trained_epochs = 0 self.trained_epsilon = 0 self._delta = delta self._noise_multiplier = noise_multiplier self.max_grad_norm = max_grad_norm self._dp = dp opacus.supported_layers_grad_samplers._create_or_extend_grad_sample = _custom_create_or_extend_grad_sample @staticmethod def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1): """Deals with the instability of the gumbel_softmax for older versions of torch. 
For more details about the issue: https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing Args: logits: […, num_features] unnormalized log probabilities tau: non-negative scalar temperature hard: if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd dim (int): a dimension along which softmax will be computed. Default: -1. Returns: Sampled tensor of same shape as logits from the Gumbel-Softmax distribution. """ if version.parse(torch.__version__) < version.parse("1.2.0"): for i in range(10): transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) if not torch.isnan(transformed).any(): return transformed raise ValueError("gumbel_softmax returning NaN.") return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) def _apply_activate(self, data): """Apply proper activation function to the output of the generator.""" data_t = [] st = 0 for column_info in self._transformer.output_info_list: for span_info in column_info: if span_info.activation_fn == 'tanh': ed = st + span_info.dim data_t.append(torch.tanh(data[:, st:ed])) st = ed elif span_info.activation_fn == 'softmax': ed = st + span_info.dim transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2) data_t.append(transformed) st = ed else: assert 0 return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): """Compute the cross entropy loss on the fixed discrete column.""" loss = [] st = 0 st_c = 0 for column_info in self._transformer.output_info_list: for span_info in column_info: if len(column_info ) != 1 or span_info.activation_fn != "softmax": # not discrete column st += span_info.dim else: ed = st + span_info.dim ed_c = st_c + span_info.dim tmp = functional.cross_entropy(data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none') loss.append(tmp) st = ed st_c = ed_c loss = torch.stack(loss, dim=1) return (loss * m).sum() / data.size()[0] def _validate_discrete_columns(self, train_data, discrete_columns): """Check whether ``discrete_columns`` exists in ``train_data``. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ if isinstance(train_data, pd.DataFrame): invalid_columns = set(discrete_columns) - set(train_data.columns) elif isinstance(train_data, np.ndarray): invalid_columns = [] for column in discrete_columns: if column < 0 or column >= train_data.shape[1]: invalid_columns.append(column) else: raise TypeError( '``train_data`` should be either pd.DataFrame or np.array.') if invalid_columns: raise ValueError( 'Invalid columns found: {}'.format(invalid_columns)) def fit(self, train_data, discrete_columns=tuple(), epochs=None, epsilon=None): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. 
Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ self._validate_discrete_columns(train_data, discrete_columns) if epochs is None: epochs = self._epochs if epsilon is None: epsilon = self._epsilon if not self._dp: self.trained_epsilon = float("inf") self._transformer = DataTransformer() self._transformer.fit(train_data, discrete_columns) train_data = self._transformer.transform(train_data) self._data_sampler = DataSampler(train_data, self._transformer.output_info_list, self._log_frequency) data_dim = self._transformer.output_dimensions self._generator = Generator( self._embedding_dim + self._data_sampler.dim_cond_vec(), self._generator_dim, data_dim).to(self._device) self._discriminator = Discriminator( data_dim + self._data_sampler.dim_cond_vec(), self._discriminator_dim, self._pack).to(self._device) self._optimizerG = optim.Adam(self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9), weight_decay=self._generator_decay) self._optimizerD = optim.Adam(self._discriminator.parameters(), lr=self._discriminator_lr, betas=(0.5, 0.9), weight_decay=self._discriminator_decay) if self._dp: self._privacy_engine = PrivacyEngine( self._discriminator, self._batch_size / self._pack, len(train_data), alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)), noise_multiplier=self._noise_multiplier, max_grad_norm=self.max_grad_norm, clip_per_layer=True, loss_reduction="sum", ) self._privacy_engine.attach(self._optimizerD) mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device) std = mean + 1 one = torch.tensor(1, dtype=torch.float).to(self._device) mone = one * -1 steps_per_epoch = max(len(train_data) // self._batch_size, 1) for i in range(epochs): self.trained_epochs += 1 if self._dp: if self.trained_epsilon >= self._epsilon: print( "Privacy budget of {:.2f} exhausted. Please specify a higher one in fit() to train more, or disable differential privacy." 
.format(self._epsilon)) return for id_ in range(steps_per_epoch): for n in range(self._discriminator_steps): fakez = torch.normal(mean=mean, std=std) condvec = self._data_sampler.sample_condvec( self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = self._data_sampler.sample_data( self._batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self._batch_size) np.random.shuffle(perm) real = self._data_sampler.sample_data( self._batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self._generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype('float32')).to( self._device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake self._optimizerD.zero_grad() y_fake = self._discriminator(fake_cat) y_real = self._discriminator(real_cat) if not self._dp: pen = self._discriminator.calc_gradient_penalty( real_cat, fake_cat, self._device) pen.backward(retain_graph=True) loss_d = -torch.mean(y_real) + torch.mean(y_fake) loss_d.backward() self._optimizerD.step() fakez = torch.normal(mean=mean, std=std) condvec = self._data_sampler.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = self._discriminator( torch.cat([fakeact, c1], dim=1)) else: y_fake = self._discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) loss_g = -torch.mean(y_fake) + cross_entropy self._optimizerG.zero_grad() loss_g.backward() self._optimizerG.step() if self._dp: for p in self._discriminator.parameters(): if hasattr(p, "grad_sample"): del p.grad_sample self.trained_epsilon, best_alpha = self._optimizerD.privacy_engine.get_privacy_spent( self._delta) if self.trained_epsilon >= epsilon: print( "Privacy budget of {:.2f} exhausted, training halted. Best alpha: {:.2f}" .format(epsilon, best_alpha)) return if self._verbose: print( f"Epoch {i+1}, epsilon {self.trained_epsilon: .2f}, Loss G: {loss_g.detach().cpu(): .4f}, " f"Loss D: {loss_d.detach().cpu(): .4f}", flush=True) if self._dp: self._privacy_engine.detach() def sample(self, n, condition_column=None, condition_value=None): """Sample data similar to the training data. Choosing a condition_column and condition_value will increase the probability of the discrete condition_value happening in the condition_column. Args: n (int): Number of rows to sample. condition_column (string): Name of a discrete column. condition_value (string): Name of the category in the condition_column which we wish to increase the probability of happening. 
Returns: numpy.ndarray or pandas.DataFrame """ if condition_column is not None and condition_value is not None: condition_info = self._transformer.convert_column_name_value_to_id( condition_column, condition_value) global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info( condition_info, self._batch_size) else: global_condition_vec = None steps = n // self._batch_size + 1 data = [] for i in range(steps): mean = torch.zeros(self._batch_size, self._embedding_dim) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self._device) if global_condition_vec is not None: condvec = global_condition_vec.copy() else: condvec = self._data_sampler.sample_original_condvec( self._batch_size) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) data.append(fakeact.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:n] return self._transformer.inverse_transform(data) def set_device(self, device): self._device = device if hasattr(self, '_generator'): self._generator.to(self._device) if hasattr(self, '_discriminator'): self._discriminator.to(self._device)
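# Illustrative usage sketch for the differentially private variant above (not part of the
# original code). ``train_df`` and ``discrete_cols`` are placeholders supplied by the caller;
# the epsilon/delta values are arbitrary examples.
def _demo_dp_ctgan(train_df, discrete_cols):
    synth = CTGANSynthesizer(epochs=50, epsilon=5, delta=1e-5, dp=True)
    # Training stops early once the moments accountant reports the budget is exhausted.
    synth.fit(train_df, discrete_columns=discrete_cols)
    print('epsilon spent so far: {:.2f}'.format(synth.trained_epsilon))
    return synth.sample(1000)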
class CTGANSynthesizer(BaseSynthesizer): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. generator_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256). discriminator_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). generator_lr (float): Learning rate for the generator. Defaults to 2e-4. generator_decay (float): Generator weight decay for the Adam Optimizer. Defaults to 1e-6. discriminator_lr (float): Learning rate for the discriminator. Defaults to 2e-4. discriminator_decay (float): Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. discriminator_steps (int): Number of discriminator updates to do for each generator update. From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper default is 5. Default used is 1 to match original CTGAN implementation. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. verbose (boolean): Whether to have print statements for progress results. Defaults to ``False``. epochs (int): Number of training epochs. Defaults to 300. pac (int): Number of samples to group together when applying the discriminator. Defaults to 10. cuda (bool): Whether to attempt to use cuda for GPU computation. If this is False or CUDA is not available, CPU will be used. Defaults to ``True``. """ def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, discriminator_decay=1e-6, batch_size=500, discriminator_steps=1, log_frequency=True, verbose=False, epochs=300, pac=10, cuda=True): assert batch_size % 2 == 0 self._embedding_dim = embedding_dim self._generator_dim = generator_dim self._discriminator_dim = discriminator_dim self._generator_lr = generator_lr self._generator_decay = generator_decay self._discriminator_lr = discriminator_lr self._discriminator_decay = discriminator_decay self._batch_size = batch_size self._discriminator_steps = discriminator_steps self._log_frequency = log_frequency self._verbose = verbose self._epochs = epochs self.pac = pac if not cuda or not torch.cuda.is_available(): device = 'cpu' elif isinstance(cuda, str): device = cuda else: device = 'cuda' self._device = torch.device(device) self._transformer = None self._data_sampler = None self._generator = None @staticmethod def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1): """Deals with the instability of the gumbel_softmax for older versions of torch. 
For more details about the issue: https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing Args: logits […, num_features]: Unnormalized log probabilities tau: Non-negative scalar temperature hard (bool): If True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd dim (int): A dimension along which softmax will be computed. Default: -1. Returns: Sampled tensor of same shape as logits from the Gumbel-Softmax distribution. """ if version.parse(torch.__version__) < version.parse('1.2.0'): for i in range(10): transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) if not torch.isnan(transformed).any(): return transformed raise ValueError('gumbel_softmax returning NaN.') return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) def _apply_activate(self, data): """Apply proper activation function to the output of the generator.""" data_t = [] st = 0 for column_info in self._transformer.output_info_list: for span_info in column_info: if span_info.activation_fn == 'tanh': ed = st + span_info.dim data_t.append(torch.tanh(data[:, st:ed])) st = ed elif span_info.activation_fn == 'softmax': ed = st + span_info.dim transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2) data_t.append(transformed) st = ed else: raise ValueError(f'Unexpected activation function {span_info.activation_fn}.') return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): """Compute the cross entropy loss on the fixed discrete column.""" loss = [] st = 0 st_c = 0 for column_info in self._transformer.output_info_list: for span_info in column_info: if len(column_info) != 1 or span_info.activation_fn != 'softmax': # not discrete column st += span_info.dim else: ed = st + span_info.dim ed_c = st_c + span_info.dim tmp = functional.cross_entropy( data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none' ) loss.append(tmp) st = ed st_c = ed_c loss = torch.stack(loss, dim=1) # noqa: PD013 return (loss * m).sum() / data.size()[0] def _validate_discrete_columns(self, train_data, discrete_columns): """Check whether ``discrete_columns`` exists in ``train_data``. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ if isinstance(train_data, pd.DataFrame): invalid_columns = set(discrete_columns) - set(train_data.columns) elif isinstance(train_data, np.ndarray): invalid_columns = [] for column in discrete_columns: if column < 0 or column >= train_data.shape[1]: invalid_columns.append(column) else: raise TypeError('``train_data`` should be either pd.DataFrame or np.array.') if invalid_columns: raise ValueError(f'Invalid columns found: {invalid_columns}') @random_state def fit(self, train_data, discrete_columns=(), epochs=None): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. 
Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ self._validate_discrete_columns(train_data, discrete_columns) if epochs is None: epochs = self._epochs else: warnings.warn( ('`epochs` argument in `fit` method has been deprecated and will be removed ' 'in a future version. Please pass `epochs` to the constructor instead'), DeprecationWarning ) self._transformer = DataTransformer() self._transformer.fit(train_data, discrete_columns) train_data = self._transformer.transform(train_data) self._data_sampler = DataSampler( train_data, self._transformer.output_info_list, self._log_frequency) data_dim = self._transformer.output_dimensions self._generator = Generator( self._embedding_dim + self._data_sampler.dim_cond_vec(), self._generator_dim, data_dim ).to(self._device) discriminator = Discriminator( data_dim + self._data_sampler.dim_cond_vec(), self._discriminator_dim, pac=self.pac ).to(self._device) optimizerG = optim.Adam( self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9), weight_decay=self._generator_decay ) optimizerD = optim.Adam( discriminator.parameters(), lr=self._discriminator_lr, betas=(0.5, 0.9), weight_decay=self._discriminator_decay ) mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device) std = mean + 1 steps_per_epoch = max(len(train_data) // self._batch_size, 1) for i in range(epochs): for id_ in range(steps_per_epoch): for n in range(self._discriminator_steps): fakez = torch.normal(mean=mean, std=std) condvec = self._data_sampler.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = self._data_sampler.sample_data(self._batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self._batch_size) np.random.shuffle(perm) real = self._data_sampler.sample_data( self._batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self._generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype('float32')).to(self._device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fakeact y_fake = discriminator(fake_cat) y_real = discriminator(real_cat) pen = discriminator.calc_gradient_penalty( real_cat, fake_cat, self._device, self.pac) loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() optimizerD.step() fakez = torch.normal(mean=mean, std=std) condvec = self._data_sampler.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = discriminator(torch.cat([fakeact, c1], dim=1)) else: y_fake = discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) loss_g = -torch.mean(y_fake) + cross_entropy optimizerG.zero_grad() loss_g.backward() optimizerG.step() if self._verbose: print(f'Epoch {i+1}, Loss G: {loss_g.detach().cpu(): .4f},' # noqa: T001 f'Loss D: {loss_d.detach().cpu(): .4f}', flush=True) @random_state def sample(self, n, condition_column=None, condition_value=None): """Sample data similar to the training data. 
Choosing a condition_column and condition_value will increase the probability of the discrete condition_value happening in the condition_column. Args: n (int): Number of rows to sample. condition_column (string): Name of a discrete column. condition_value (string): Name of the category in the condition_column which we wish to increase the probability of happening. Returns: numpy.ndarray or pandas.DataFrame """ if condition_column is not None and condition_value is not None: condition_info = self._transformer.convert_column_name_value_to_id( condition_column, condition_value) global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info( condition_info, self._batch_size) else: global_condition_vec = None steps = n // self._batch_size + 1 data = [] for i in range(steps): mean = torch.zeros(self._batch_size, self._embedding_dim) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self._device) if global_condition_vec is not None: condvec = global_condition_vec.copy() else: condvec = self._data_sampler.sample_original_condvec(self._batch_size) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) data.append(fakeact.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:n] return self._transformer.inverse_transform(data) def set_device(self, device): """Set the `device` to be used ('GPU' or 'CPU).""" self._device = device if self._generator is not None: self._generator.to(self._device)
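# Usage sketch for the class above (assumed example, not from the source): fit on a table
# that has a known discrete column, then bias sampling towards one of its categories.
# 'workclass' and 'Private' are hypothetical column/category names.
def _demo_ctgan(train_df):
    synth = CTGANSynthesizer(epochs=10, verbose=True)
    synth.fit(train_df, discrete_columns=['workclass'])
    unconditional = synth.sample(500)
    conditioned = synth.sample(500, condition_column='workclass', condition_value='Private')
    return unconditional, conditioned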
class CTGANSynthesizer(BaseSynthesizer): """Conditional Table GAN Synthesizer. This is the core class of the CTGAN project, where the different components are orchestrated together. For more details about the process, please check the [Modeling Tabular data using Conditional GAN](https://arxiv.org/abs/1907.00503) paper. Args: embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128. generator_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256). discriminator_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256). generator_lr (float): Learning rate for the generator. Defaults to 2e-4. generator_decay (float): Generator weight decay for the Adam Optimizer. Defaults to 1e-6. discriminator_lr (float): Learning rate for the discriminator. Defaults to 2e-4. discriminator_decay (float): Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6. batch_size (int): Number of data samples to process in each step. discriminator_steps (int): Number of discriminator updates to do for each generator update. From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper default is 5. Default used is 1 to match original CTGAN implementation. log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``. verbose (boolean): Whether to have print statements for progress results. Defaults to ``False``. epochs (int): Number of training epochs. Defaults to 300. """ def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, discriminator_decay=0, batch_size=500, discriminator_steps=1, log_frequency=True, verbose=False, epochs=300, external_eval_target=False, adaptive_training=True): assert batch_size % 2 == 0 self._embedding_dim = embedding_dim self._generator_dim = generator_dim self._discriminator_dim = discriminator_dim self._generator_lr = generator_lr self._generator_decay = generator_decay self._discriminator_lr = discriminator_lr self._discriminator_decay = discriminator_decay self._batch_size = batch_size self._discriminator_steps = discriminator_steps self._log_frequency = log_frequency self._verbose = verbose self._epochs = epochs self._device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self._external_eval = False if external_eval_target == False else { "best_score": -np.inf, "correlation_scores": [], "detection_scores": [], "ml_efficacy_scores": { "tree": [], "adaboost": [], "regression": [], "mlp": [], "ml_efficacy": [] }, "target": external_eval_target } self._adaptive_training = False if adaptive_training == False else { "r_d": np.random.random(), "r_g": np.random.random(), "prev_loss_g": np.random.random(), "prev_loss_d": np.random.random(), "lambda": 1 / 3, } @staticmethod def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1): """Deals with the instability of the gumbel_softmax for older versions of torch. 
For more details about the issue: https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing Args: logits: […, num_features] unnormalized log probabilities tau: non-negative scalar temperature hard: if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd dim (int): a dimension along which softmax will be computed. Default: -1. Returns: Sampled tensor of same shape as logits from the Gumbel-Softmax distribution. """ if version.parse(torch.__version__) < version.parse("1.2.0"): for i in range(10): transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) if not torch.isnan(transformed).any(): return transformed raise ValueError("gumbel_softmax returning NaN.") return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim) def _apply_activate(self, data): """Apply proper activation function to the output of the generator.""" data_t = [] st = 0 for column_info in self._transformer.output_info_list: for span_info in column_info: if span_info.activation_fn == 'tanh': ed = st + span_info.dim data_t.append(torch.tanh(data[:, st:ed])) st = ed elif span_info.activation_fn == 'softmax': ed = st + span_info.dim transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2) data_t.append(transformed) st = ed else: assert 0 return torch.cat(data_t, dim=1) def _cond_loss(self, data, c, m): """Compute the cross entropy loss on the fixed discrete column.""" loss = [] st = 0 st_c = 0 for column_info in self._transformer.output_info_list: for span_info in column_info: if len(column_info ) != 1 or span_info.activation_fn != "softmax": # not discrete column st += span_info.dim else: ed = st + span_info.dim ed_c = st_c + span_info.dim tmp = functional.cross_entropy(data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none') loss.append(tmp) st = ed st_c = ed_c loss = torch.stack(loss, dim=1) return (loss * m).sum() / data.size()[0] def _validate_discrete_columns(self, train_data, discrete_columns): """Check whether ``discrete_columns`` exists in ``train_data``. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ if isinstance(train_data, pd.DataFrame): invalid_columns = set(discrete_columns) - set(train_data.columns) elif isinstance(train_data, np.ndarray): invalid_columns = [] for column in discrete_columns: if column < 0 or column >= train_data.shape[1]: invalid_columns.append(column) else: raise TypeError( '``train_data`` should be either pd.DataFrame or np.array.') if invalid_columns: raise ValueError( 'Invalid columns found: {}'.format(invalid_columns)) def fit(self, train_data, discrete_columns=tuple(), epochs=None, metadata_top_layer=None): """Fit the CTGAN Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. 
Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ #self._validate_discrete_columns(train_data, discrete_columns) if epochs is None: epochs = self._epochs else: warnings.warn(( '`epochs` argument in `fit` method has been deprecated and will be removed ' 'in a future version. Please pass `epochs` to the constructor instead' ), DeprecationWarning) self._transformer = DataTransformer() self._transformer.fit(train_data, discrete_columns) # Data structures for the intermediate eval function original_training_data = train_data.copy() if self._external_eval: self._external_eval["eval_size"] = min(len(train_data), 10000) train_data = self._transformer.transform(train_data) self._data_sampler = DataSampler(train_data, self._transformer.output_info_list, self._log_frequency) data_dim = self._transformer.output_dimensions self._generator = Generator( self._embedding_dim + self._data_sampler.dim_cond_vec(), self._generator_dim, data_dim).to(self._device) self._discriminator = Discriminator( data_dim + self._data_sampler.dim_cond_vec(), self._discriminator_dim).to(self._device) self._optimizerG = optim.Adam(self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9), weight_decay=self._generator_decay) self._optimizerD = optim.Adam(self._discriminator.parameters(), lr=self._discriminator_lr, betas=(0.5, 0.9), weight_decay=self._discriminator_decay) mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device) std = mean + 1 steps_per_epoch = max(len(train_data) // self._batch_size, 1) for i in range(epochs): for id_ in range(steps_per_epoch): if self._adaptive_training: loss_g = self.calc_loss_g(mean, std) pen, loss_d = self.calc_loss_d(mean, std) if (self._adaptive_training["r_d"] >= (self._adaptive_training["lambda"] * self._adaptive_training["r_g"])): self._optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() self._optimizerD.step() else: self._optimizerG.zero_grad() loss_g.backward() self._optimizerG.step() loss_d = loss_d.detach().item() loss_g = loss_g.detach().item() self._adaptive_training["r_d"] = np.abs( (loss_d - self._adaptive_training["prev_loss_d"]) / self._adaptive_training["prev_loss_d"]) self._adaptive_training["r_g"] = np.abs( (loss_g - self._adaptive_training["prev_loss_g"]) / self._adaptive_training["prev_loss_g"]) self._adaptive_training["prev_loss_g"] = loss_g self._adaptive_training["prev_loss_d"] = loss_d else: for n in range(self._discriminator_steps): pen, loss_d = self.calc_loss_d(mean, std) self._optimizerD.zero_grad() pen.backward(retain_graph=True) loss_d.backward() self._optimizerD.step() loss_g = self.calc_loss_g(mean, std) self._optimizerG.zero_grad() loss_g.backward() self._optimizerG.step() if self._verbose: print("Epoch " + str(i + 1)) if self._external_eval != False: if i % 10 == 0: # Reverse data back to its original format to compute external eval scores real_data = original_training_data.sample( self._external_eval["eval_size"]).reset_index() real_data = metadata_top_layer.reverse_transform(real_data) synthetic_data = self.sample( self._external_eval["eval_size"]) synthetic_data = metadata_top_layer.reverse_transform( synthetic_data) self.evaluate(synthetic_data, real_data, i + 1) if self._external_eval != False: self._generator = self._external_eval["best_generator"] def calc_loss_d(self, mean, std): fakez = torch.normal(mean=mean, std=std) condvec = self._data_sampler.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = self._data_sampler.sample_data(self._batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = 
torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self._batch_size) np.random.shuffle(perm) real = self._data_sampler.sample_data(self._batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self._generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype('float32')).to(self._device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake y_fake = self._discriminator(fake_cat) y_real = self._discriminator(real_cat) pen = self._discriminator.calc_gradient_penalty( real_cat, fake_cat, self._device) loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) return pen, loss_d def calc_loss_g(self, mean, std): fakez = torch.normal(mean=mean, std=std) condvec = self._data_sampler.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = self._discriminator(torch.cat([fakeact, c1], dim=1)) else: y_fake = self._discriminator(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) loss_g = -torch.mean(y_fake) + cross_entropy return loss_g def sample(self, n, condition_column=None, condition_value=None): """Sample data similar to the training data. Choosing a condition_column and condition_value will increase the probability of the discrete condition_value happening in the condition_column. Args: n (int): Number of rows to sample. condition_column (string): Name of a discrete column. condition_value (string): Name of the category in the condition_column which we wish to increase the probability of happening. 
Returns: numpy.ndarray or pandas.DataFrame """ if condition_column is not None and condition_value is not None: condition_info = self._transformer.convert_column_name_value_to_id( condition_column, condition_value) global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info( condition_info, self._batch_size) else: global_condition_vec = None steps = n // self._batch_size + 1 data = [] for i in range(steps): mean = torch.zeros(self._batch_size, self._embedding_dim) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self._device) if global_condition_vec is not None: condvec = global_condition_vec.copy() else: condvec = self._data_sampler.sample_original_condvec( self._batch_size) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) data.append(fakeact.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:n] return self._transformer.inverse_transform(data) def evaluate(self, synthetic_data, real_data, epoch): categorical_cols = real_data.select_dtypes( "object").columns.values.tolist() correlation_score = self.compute_correlation_score( synthetic_data, real_data, categorical_cols) self._external_eval["correlation_scores"].append(correlation_score) ml_efficacy_score = self.compute_ml_efficacy( real_data, synthetic_data, self._external_eval["target"]) detection_score = self.compute_detection(real_data, synthetic_data) self._external_eval["detection_scores"].append(detection_score) overall_score = (detection_score * 0.5 + ml_efficacy_score * 0.5) if self._external_eval["best_score"] < overall_score: print("New max score!") print(str(overall_score)) self._external_eval["best_model_epoch"] = epoch self._external_eval["best_score"] = overall_score self._external_eval["best_generator"] = copy.deepcopy( self._generator) def get_problem_type(self, data, target): target_column = data[target] if target_column.dtypes == "object": unique_labels = np.unique(target_column) if len(unique_labels) == 2: problem_type = "binary_classification" else: problem_type = "multi_classification" else: raise AttributeError("Regression ml efficacy not yet implemented") return problem_type def compute_detection(self, real_data, synthetic_data, verbose=True): real_data = real_data.dropna() synthetic_data = synthetic_data.dropna() detection_score = LogisticDetection.compute(real_data, synthetic_data) if verbose: print("Detection score: " + str(detection_score)) return detection_score def compute_ml_efficacy(self, real_data, synthetic_data, target, verbose=True): dtypes = real_data.dtypes.tolist() problem_type = self.get_problem_type(real_data, target) scores = [] if problem_type == "binary_classification": (tree_score, _), _ = BinaryDecisionTreeClassifier.compute(real_data, synthetic_data, dtypes=dtypes.copy(), target=target) (adaboost_score, _), _ = BinaryAdaBoostClassifier.compute(real_data, synthetic_data, dtypes=dtypes.copy(), target=target) (regression_score, _), _ = BinaryLogisticRegression.compute(real_data, synthetic_data, dtypes=dtypes.copy(), target=target) (mlp_score, _), _ = BinaryMLPClassifier.compute(real_data, synthetic_data, dtypes=dtypes.copy(), target=target) scores.extend( [tree_score, adaboost_score, regression_score, mlp_score]) ml_efficacy_score = sum(scores) / len(scores) self._external_eval["ml_efficacy_scores"]["tree"].append( tree_score) self._external_eval["ml_efficacy_scores"]["adaboost"].append( adaboost_score) 
self._external_eval["ml_efficacy_scores"]["regression"].append( regression_score) self._external_eval["ml_efficacy_scores"]["mlp"].append(mlp_score) self._external_eval["ml_efficacy_scores"]["ml_efficacy"].append( ml_efficacy_score) if verbose: print("Tree score: " + str(tree_score)) print("Adaboost score: " + str(adaboost_score)) print("Regression score: " + str(regression_score)) print("Mlp score: " + str(mlp_score)) print("ML efficay score: " + str(ml_efficacy_score)) elif problem_type == "multi_classification": tree_score, _ = MulticlassDecisionTreeClassifier.compute( real_data, synthetic_data, dtypes=dtypes.copy(), target=target) mlp_score, _ = MulticlassMLPClassifier.compute( real_data, synthetic_data, dtypes=dtypes.copy(), target=target) scores.extend([tree_score, mlp_score]) ml_efficacy_score = sum(scores) / len(scores) self._external_eval["ml_efficacy_scores"]["tree"].append( tree_score) self._external_eval["ml_efficacy_scores"]["mlp"].append(mlp_score) self._external_eval["ml_efficacy_scores"]["ml_efficacy"].append( ml_efficacy_score) if verbose: print("Tree score: " + str(tree_score)) print("Mlp score: " + str(mlp_score)) print("ML efficay score: " + str(ml_efficacy_score)) return ml_efficacy_score def compute_correlation_score(self, synthetic_data, real_data, categorical_cols, verbose=True): table_eval = TableEvaluator(real_data, synthetic_data, cat_cols=categorical_cols, verbose=False) correlation_score = table_eval.correlation_distance(how='rmse') if verbose: print("Rmse correlation: " + str(correlation_score)) return correlation_score def get_metadata(self): meta_data = {} data_info = {} categorical_cols = [] dtypes = [] dtypes_mapping = {} for index, column in enumerate( self._transformer._column_transform_info_list): name = column[0] data_type = column[1] if data_type == "discrete": data_info[name] = {"type": "categorical"} categorical_cols.append(name) dtypes.append("object") dtypes_mapping[name] = "object" else: if self._transformer._column_raw_dtypes._selected_obj[ index] == "int64": data_info[name] = { "type": "numerical", "subtype": "integer" } dtypes.append(int) dtypes_mapping[name] = int else: data_info[name] = {"type": "numerical", "subtype": "float"} dtypes.append(np.float64) dtypes_mapping[name] = np.float64 meta_data["tables"] = {None: {"fields": data_info}} return meta_data, categorical_cols, dtypes, dtypes_mapping def set_device(self, device): self._device = device if hasattr(self, '_generator'): self._generator.to(self._device) if hasattr(self, '_discriminator'): self._discriminator.to(self._device)
class TVAESynthesizer(BaseSynthesizer): """TVAESynthesizer.""" def __init__(self, embedding_dim=128, compress_dims=(128, 128), decompress_dims=(128, 128), l2scale=1e-5, batch_size=500, epochs=300, loss_factor=2, cuda=True): self.embedding_dim = embedding_dim self.compress_dims = compress_dims self.decompress_dims = decompress_dims self.l2scale = l2scale self.batch_size = batch_size self.loss_factor = loss_factor self.epochs = epochs if not cuda or not torch.cuda.is_available(): device = 'cpu' elif isinstance(cuda, str): device = cuda else: device = 'cuda' self._device = torch.device(device) @random_state def fit(self, train_data, discrete_columns=()): """Fit the TVAE Synthesizer models to the training data. Args: train_data (numpy.ndarray or pandas.DataFrame): Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame. discrete_columns (list-like): List of discrete columns to be used to generate the Conditional Vector. If ``train_data`` is a Numpy array, this list should contain the integer indices of the columns. Otherwise, if it is a ``pandas.DataFrame``, this list should contain the column names. """ self.transformer = DataTransformer() self.transformer.fit(train_data, discrete_columns) train_data = self.transformer.transform(train_data) dataset = TensorDataset( torch.from_numpy(train_data.astype('float32')).to(self._device)) loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=False) data_dim = self.transformer.output_dimensions encoder = Encoder(data_dim, self.compress_dims, self.embedding_dim).to(self._device) self.decoder = Decoder(self.embedding_dim, self.decompress_dims, data_dim).to(self._device) optimizerAE = Adam(list(encoder.parameters()) + list(self.decoder.parameters()), weight_decay=self.l2scale) for i in range(self.epochs): for id_, data in enumerate(loader): optimizerAE.zero_grad() real = data[0].to(self._device) mu, std, logvar = encoder(real) eps = torch.randn_like(std) emb = eps * std + mu rec, sigmas = self.decoder(emb) loss_1, loss_2 = _loss_function( rec, real, sigmas, mu, logvar, self.transformer.output_info_list, self.loss_factor) loss = loss_1 + loss_2 loss.backward() optimizerAE.step() self.decoder.sigma.data.clamp_(0.01, 1.0) @random_state def sample(self, samples): """Sample data similar to the training data. Args: samples (int): Number of rows to sample. Returns: numpy.ndarray or pandas.DataFrame """ self.decoder.eval() steps = samples // self.batch_size + 1 data = [] for _ in range(steps): mean = torch.zeros(self.batch_size, self.embedding_dim) std = mean + 1 noise = torch.normal(mean=mean, std=std).to(self._device) fake, sigmas = self.decoder(noise) fake = torch.tanh(fake) data.append(fake.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:samples] return self.transformer.inverse_transform( data, sigmas.detach().cpu().numpy()) def set_device(self, device): """Set the `device` to be used ('GPU' or 'CPU).""" self._device = device self.decoder.to(self._device)
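# Usage sketch for the refactored TVAE above (illustrative, not from the original source).
# ``train_df`` and ``discrete_cols`` are placeholders; ``cuda=False`` simply forces CPU here.
def _demo_tvae_refactored(train_df, discrete_cols):
    tvae = TVAESynthesizer(epochs=50, loss_factor=2, cuda=False)
    tvae.fit(train_df, discrete_columns=discrete_cols)
    tvae.set_device('cpu')  # the decoder can also be moved to another device after training
    return tvae.sample(200)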
class PATECTGAN(CTGANSynthesizer): def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, discriminator_decay=1e-6, batch_size=500, discriminator_steps=1, log_frequency=False, verbose=False, epochs=300, pac=1, cuda=True, epsilon=1, binary=False, regularization=None, loss="cross_entropy", teacher_iters=5, student_iters=5, sample_per_teacher=1000, delta=None, noise_multiplier=1e-3, moments_order=100, category_epsilon_pct=0.1): assert batch_size % 2 == 0 self._embedding_dim = embedding_dim self._generator_dim = generator_dim self._discriminator_dim = discriminator_dim self._generator_lr = generator_lr self._generator_decay = generator_decay self._discriminator_lr = discriminator_lr self._discriminator_decay = discriminator_decay self._batch_size = batch_size self._discriminator_steps = discriminator_steps self._log_frequency = log_frequency self._verbose = verbose self._epochs = epochs self.pac = pac self.epsilon = epsilon self._category_epsilon_pct = category_epsilon_pct self.verbose = verbose self.loss = loss # PATE params self.regularization = regularization if self.loss != "wasserstein" else "dragan" self.teacher_iters = teacher_iters self.student_iters = student_iters self.pd_cols = None self.pd_index = None self.binary = binary self.sample_per_teacher = sample_per_teacher self.noise_multiplier = noise_multiplier self.moments_order = moments_order self.delta = delta if not cuda or not torch.cuda.is_available(): device = 'cpu' elif isinstance(cuda, str): device = cuda else: device = 'cuda' self._device = torch.device(device) if self._log_frequency: warnings.warn( "log_frequency is selected. This may result in oversampling frequent " "categories, which could cause privacy leaks.") def train(self, data, categorical_columns=None, ordinal_columns=None, update_epsilon=None): if update_epsilon: self.epsilon = update_epsilon for col in categorical_columns: if str(data[col].dtype).startswith('float'): raise ValueError( "It looks like you are passing in a vector of continuous values" f"to a categorical column at [{col}]." "Please discretize and pass in categorical columns with" "unsigned integer or string category names.") sample_per_teacher = (self.sample_per_teacher if self.sample_per_teacher < len(data) else 1000) self.num_teachers = int(len(data) / sample_per_teacher) + 1 self._transformer = DataTransformer() self._transformer.fit(data, discrete_columns=categorical_columns) for tinfo in self._transformer._column_transform_info_list: if tinfo.column_type == "continuous": raise ValueError( "We don't support continuous values on this synthesizer. Please discretize values." 
) train_data = self._transformer.transform(data) data_partitions = np.array_split(train_data, self.num_teachers) data_dim = self._transformer.output_dimensions sampler_eps = 0.0 if categorical_columns and self._category_epsilon_pct: sampler_eps = self.epsilon * self._category_epsilon_pct per_col_sampler_eps = sampler_eps / len(categorical_columns) self.epsilon = self.epsilon - sampler_eps else: per_col_sampler_eps = None self.cond_generator = DataSampler( train_data, self._transformer.output_info_list, self._log_frequency, per_column_epsilon=per_col_sampler_eps) spent = self.cond_generator.total_spent if (spent > sampler_eps and not np.isclose(spent, sampler_eps)): raise AssertionError( f"The data sampler used {spent} epsilon and was budgeted for {sampler_eps}" ) # create conditional generator for each teacher model # Note: Previously, there existed a ConditionalGenerator object in CTGAN # - that functionality has been subsumed by DataSampler, but switch is # essentially 1 for 1 # don't need to count eps for each teacher, because these are disjoint partitions cached_probs = self.cond_generator.discrete_column_category_prob cond_generator = [ DataSampler(d, self._transformer.output_info_list, self._log_frequency, per_column_epsilon=None, discrete_column_category_prob=cached_probs) for d in data_partitions ] self._generator = Generator( self._embedding_dim + self.cond_generator.dim_cond_vec(), self._generator_dim, data_dim).to(self._device) discriminator = Discriminator( data_dim + self.cond_generator.dim_cond_vec(), self._discriminator_dim, self.loss, self.pac).to(self._device) student_disc = discriminator student_disc.apply(weights_init) teacher_disc = [discriminator for i in range(self.num_teachers)] for i in range(self.num_teachers): teacher_disc[i].apply(weights_init) optimizerG = optim.Adam(self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9), weight_decay=self._generator_decay) optimizer_s = optim.Adam(student_disc.parameters(), lr=2e-4, betas=(0.5, 0.9)) optimizer_t = [ optim.Adam(teacher_disc[i].parameters(), lr=self._discriminator_lr, betas=(0.5, 0.9), weight_decay=self._discriminator_decay) for i in range(self.num_teachers) ] noise_multiplier = self.noise_multiplier alphas = torch.tensor([0.0 for i in range(self.moments_order)], device=self._device) l_list = 1 + torch.tensor(range(self.moments_order), device=self._device) eps = torch.zeros(1) mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device) std = mean + 1 real_label = 1 fake_label = 0 criterion = nn.BCELoss() if (self.loss == "cross_entropy") else self.w_loss if self.verbose: print("using loss {} and regularization {}".format( self.loss, self.regularization)) iteration = 0 if self.delta is None: self.delta = 1 / (train_data.shape[0] * np.sqrt(train_data.shape[0])) while eps.item() < self.epsilon: iteration += 1 eps = min((alphas - math.log(self.delta)) / l_list) if eps.item() > self.epsilon: if iteration == 1: raise ValueError( "Inputted epsilon parameter is too small to" + " create a private dataset. Try increasing epsilon and rerunning." 
) break # train teacher discriminators for t_2 in range(self.teacher_iters): for i in range(self.num_teachers): partition_data = data_partitions[i] data_sampler = DataSampler( partition_data, self._transformer.output_info_list, self._log_frequency, per_column_epsilon=None, discrete_column_category_prob=cached_probs) fakez = torch.normal(mean, std=std).to(self._device) condvec = cond_generator[i].sample_condvec( self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = data_sampler.sample_data( self._batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self._batch_size) np.random.shuffle(perm) real = data_sampler.sample_data( self._batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self._generator(fakez) fakeact = self._apply_activate(fake) real = torch.from_numpy(real.astype("float32")).to( self._device) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) real_cat = torch.cat([real, c2], dim=1) else: real_cat = real fake_cat = fake optimizer_t[i].zero_grad() y_all = torch.cat( [teacher_disc[i](fake_cat), teacher_disc[i](real_cat)]) label_fake = torch.full( (int(self._batch_size / self.pac), 1), fake_label, dtype=torch.float, device=self._device, ) label_true = torch.full( (int(self._batch_size / self.pac), 1), real_label, dtype=torch.float, device=self._device, ) labels = torch.cat([label_fake, label_true]) error_d = criterion(y_all.squeeze(), labels.squeeze()) error_d.backward() if self.regularization == "dragan": pen = teacher_disc[i].dragan_penalty( real_cat, device=self._device) pen.backward(retain_graph=True) optimizer_t[i].step() ### # train student discriminator for t_3 in range(self.student_iters): data_sampler = DataSampler( train_data, self._transformer.output_info_list, self._log_frequency, per_column_epsilon=None, discrete_column_category_prob=cached_probs) fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None real = data_sampler.sample_data(self._batch_size, col, opt) else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) perm = np.arange(self._batch_size) np.random.shuffle(perm) real = data_sampler.sample_data(self._batch_size, col[perm], opt[perm]) c2 = c1[perm] fake = self._generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: fake_cat = torch.cat([fakeact, c1], dim=1) else: fake_cat = fakeact fake_data = fake_cat ### predictions, votes = pate(fake_data, teacher_disc, noise_multiplier, device=self._device) output = student_disc(fake_data.detach()) # update moments accountant alphas = alphas + moments_acc(self.num_teachers, votes, noise_multiplier, l_list, device=self._device) loss_s = criterion( output.squeeze(), predictions.float().to(self._device).squeeze()) optimizer_s.zero_grad() loss_s.backward() if self.regularization == "dragan": vals = torch.cat([predictions, fake_data], axis=1) ordered = vals[vals[:, 0].sort()[1]] data_list = torch.split( ordered, predictions.shape[0] - int(predictions.sum().item())) synth_cat = torch.cat(data_list[1:], axis=0)[:, 1:] pen = student_disc.dragan_penalty(synth_cat, device=self._device) pen.backward(retain_graph=True) optimizer_s.step() # print ('iterator {i}, student discriminator loss is {j}'.format(i=t_3, j=loss_s)) # 
train generator fakez = torch.normal(mean=mean, std=std) condvec = self.cond_generator.sample_condvec(self._batch_size) if condvec is None: c1, m1, col, opt = None, None, None, None else: c1, m1, col, opt = condvec c1 = torch.from_numpy(c1).to(self._device) m1 = torch.from_numpy(m1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) if c1 is not None: y_fake = student_disc(torch.cat([fakeact, c1], dim=1)) else: y_fake = student_disc(fakeact) if condvec is None: cross_entropy = 0 else: cross_entropy = self._cond_loss(fake, c1, m1) if self.loss == "cross_entropy": label_g = torch.full( (int(self._batch_size / self.pac), 1), real_label, dtype=torch.float, device=self._device, ) loss_g = criterion(y_fake.squeeze(), label_g.float().squeeze()) loss_g = loss_g + cross_entropy else: loss_g = -torch.mean(y_fake) + cross_entropy optimizerG.zero_grad() loss_g.backward() optimizerG.step() if self.verbose: print("eps: {:f} \t G: {:f} \t D: {:f}".format( eps, loss_g.detach().cpu(), loss_s.detach().cpu())) def w_loss(self, output, labels): vals = torch.cat([labels[None, :], output[None, :]], axis=1) ordered = vals[vals[:, 0].sort()[1]] data_list = torch.split(ordered, labels.shape[0] - int(labels.sum().item())) fake_score = data_list[0][:, 1] true_score = torch.cat(data_list[1:], axis=0)[:, 1] w_loss = -(torch.mean(true_score) - torch.mean(fake_score)) return w_loss def generate(self, n, condition_column=None, condition_value=None): """ TODO: Add condition_column support from CTGAN """ self._generator.eval() # output_info = self._transformer.output_info steps = n // self._batch_size + 1 data = [] for i in range(steps): mean = torch.zeros(self._batch_size, self._embedding_dim) std = mean + 1 fakez = torch.normal(mean=mean, std=std).to(self._device) condvec = self.cond_generator.sample_original_condvec( self._batch_size) if condvec is None: pass else: c1 = condvec c1 = torch.from_numpy(c1).to(self._device) fakez = torch.cat([fakez, c1], dim=1) fake = self._generator(fakez) fakeact = self._apply_activate(fake) data.append(fakeact.detach().cpu().numpy()) data = np.concatenate(data, axis=0) data = data[:n] return self._transformer.inverse_transform(data)
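# Usage sketch for the PATE-CTGAN class above (illustrative only). PATE training here rejects
# continuous columns, so every column of ``train_df`` is assumed to be categorical or already
# discretized; ``categorical_cols`` lists those column names.
def _demo_pate_ctgan(train_df, categorical_cols):
    synth = PATECTGAN(epsilon=1.0, batch_size=64, teacher_iters=5, student_iters=5)
    synth.train(train_df, categorical_columns=categorical_cols)  # loops until epsilon is spent
    return synth.generate(1000)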