Example #1
    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        if isinstance(data, pd.DataFrame):
            for col in data.columns:
                # coerce to numeric where possible; errors="ignore" leaves
                # non-numeric columns unchanged (deprecated in newer pandas)
                data[col] = pd.to_numeric(data[col], errors="ignore")
            self.pd_cols = data.columns
            self.pd_index = data.index
            data = data.to_numpy()
        elif not isinstance(data, np.ndarray):
            raise ValueError("Data must be a numpy array or pandas dataframe")

        dataset = TensorDataset(
            torch.from_numpy(data.astype("float32")).to(self.device))
        dataloader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                shuffle=True,
                                drop_last=True)

        self.generator = Generator(self.latent_dim,
                                   data.shape[1],
                                   binary=self.binary).to(self.device)
        discriminator = Discriminator(data.shape[1]).to(self.device)
        optimizer_d = optim.Adam(discriminator.parameters(), lr=4e-4)

        privacy_engine = PrivacyEngine(
            discriminator,
            batch_size=self.batch_size,
            sample_size=len(data),
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=3.5,
            max_grad_norm=1.0,
            clip_per_layer=True,
        )
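        # the alphas grid above (Rényi orders 1.1-10.9 plus 12-63) is the
        # search space for the RDP accountant queried later via
        # get_privacy_spent()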

        privacy_engine.attach(optimizer_d)
        optimizer_g = optim.Adam(self.generator.parameters(), lr=1e-4)

        criterion = nn.BCELoss()

        for epoch in range(self.epochs):
            # name the loop variable `batch` so it does not shadow `data`,
            # which is still needed for the delta default below
            for i, batch in enumerate(dataloader):
                discriminator.zero_grad()

                real_data = batch[0].to(self.device)

                # train with fake data
                noise = torch.randn(self.batch_size,
                                    self.latent_dim,
                                    device=self.device)
                fake_data = self.generator(noise)
                label_fake = torch.full((self.batch_size, ),
                                        0,
                                        dtype=torch.float,
                                        device=self.device)
                output = discriminator(fake_data.detach())
                loss_d_fake = criterion(output, label_fake)
                loss_d_fake.backward()
                optimizer_d.step()

                # train with real data
                label_true = torch.full((self.batch_size, ),
                                        1,
                                        dtype=torch.float,
                                        device=self.device)
                output = discriminator(real_data.float())
                loss_d_real = criterion(output, label_true)
                loss_d_real.backward()
                optimizer_d.step()

                # record the observed per-layer gradient norms and overwrite
                # the engine's bounds with them, intended as an adaptive
                # per-layer clipping scheme (clip_per_layer=True expects one
                # bound per parameter tensor)
                max_grad_norm = []
                for p in discriminator.parameters():
                    param_norm = p.grad.data.norm(2).item()
                    max_grad_norm.append(param_norm)

                privacy_engine.max_grad_norm = max_grad_norm

                # train generator
                self.generator.zero_grad()
                label_g = torch.full((self.batch_size, ),
                                     1,
                                     dtype=torch.float,
                                     device=self.device)
                output_g = discriminator(fake_data)
                loss_g = criterion(output_g, label_g)
                loss_g.backward()
                optimizer_g.step()

                # manually clear gradients
                for p in discriminator.parameters():
                    if hasattr(p, "grad_sample"):
                        del p.grad_sample
                # autograd_grad_sample.clear_backprops(discriminator)

                if self.delta is None:
                    # default delta to 1/n, a common heuristic for a dataset
                    # of n rows
                    self.delta = 1 / data.shape[0]

                eps, best_alpha = optimizer_d.privacy_engine.get_privacy_spent(
                    self.delta)

            if self.epsilon < eps:
                break
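
The listing does not include the Generator and Discriminator classes. The sketch below is a minimal pair consistent with how they are called above (Generator(latent_dim, data_dim, binary=...), Discriminator(data_dim), sigmoid outputs for nn.BCELoss); the real implementations may differ.

import torch.nn as nn

class Generator(nn.Module):
    """Minimal sketch; the real class is defined elsewhere in the project."""

    def __init__(self, latent_dim, data_dim, binary=True):
        super().__init__()
        # assumption: sigmoid for binary-encoded columns, tanh otherwise
        out_activation = nn.Sigmoid() if binary else nn.Tanh()
        self.net = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, data_dim),
            out_activation,
        )

    def forward(self, noise):
        return self.net(noise)

class Discriminator(nn.Module):
    """Minimal sketch; emits probabilities in (0, 1) for nn.BCELoss."""

    def __init__(self, data_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(data_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        # output shape (batch, 1) matches Example #2's labels; Example #1
        # builds (batch_size,) labels and would need a squeeze here
        return self.net(x)
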
Example #2
    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None,
              verbose=False,
              mlflow=False):
        if update_epsilon:
            self.epsilon = update_epsilon

        if isinstance(data, pd.DataFrame):
            for col in data.columns:
                data[col] = pd.to_numeric(data[col], errors='ignore')
            self.pd_cols = data.columns
            self.pd_index = data.index
            data = data.to_numpy()
        elif not isinstance(data, np.ndarray):
            raise ValueError("Data must be a numpy array or pandas dataframe")

        dataset = TensorDataset(
            torch.from_numpy(data.astype('float32')).to(self.device))
        dataloader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                shuffle=True,
                                drop_last=True)

        if not hasattr(self, "generator"):
            self.generator = Generator(self.latent_dim,
                                       data.shape[1],
                                       binary=self.binary).to(self.device)
        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(data.shape[1]).to(self.device)

        self.optimizer_d = optim.Adam(self.discriminator.parameters(),
                                      lr=4e-4,
                                      betas=(0.5, 0.9))
        if hasattr(self, "state_dict"):
            self.optimizer_d.load_state_dict(self.state_dict)

        if not hasattr(self, "privacy_engine"):
            privacy_engine = PrivacyEngine(
                self.discriminator,
                batch_size=self.batch_size,
                sample_size=len(data),
                alphas=[1 + x / 10.0
                        for x in range(1, 100)] + list(range(12, 64)),
                noise_multiplier=3.5,
                max_grad_norm=1.0,
                clip_per_layer=True).to(self.device)
        else:
            privacy_engine = self.privacy_engine

        privacy_engine.attach(self.optimizer_d)
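        # the attached engine now intercepts self.optimizer_d.step():
        # per-sample gradients are clipped and noised before the update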

        if hasattr(self, "privacy_engine"):
            epsilon, best_alpha = self.optimizer_d.privacy_engine.get_privacy_spent(
                self.delta)
        else:
            epsilon = 0

        if not hasattr(self, "optimizer_g"):
            self.optimizer_g = optim.Adam(self.generator.parameters(), lr=1e-4)

        criterion = nn.BCELoss()

        for epoch in range(self.epochs):

            # a resumed call may already have spent the requested budget,
            # so check before processing any new batches
            if self.epsilon < epsilon:
                break

            for i, batch in enumerate(dataloader):
                self.discriminator.zero_grad()

                real_data = batch[0].to(self.device)

                # train with fake data
                noise = torch.randn(self.batch_size,
                                    self.latent_dim,
                                    device=self.device)
                fake_data = self.generator(noise)
                label_fake = torch.full((self.batch_size, 1),
                                        0,
                                        dtype=torch.float,
                                        device=self.device)
                output = self.discriminator(fake_data.detach())
                loss_d_fake = criterion(output, label_fake)
                loss_d_fake.backward()
                self.optimizer_d.step()

                # train with real data
                label_true = torch.full((self.batch_size, 1),
                                        1,
                                        dtype=torch.float,
                                        device=self.device)
                output = self.discriminator(real_data.float())
                loss_d_real = criterion(output, label_true)
                loss_d_real.backward()
                self.optimizer_d.step()

                loss_d = loss_d_real + loss_d_fake

                max_grad_norm = []
                for p in self.discriminator.parameters():
                    param_norm = p.grad.data.norm(2).item()
                    max_grad_norm.append(param_norm)

                privacy_engine.max_grad_norm = max_grad_norm

                # train generator
                self.generator.zero_grad()
                label_g = torch.full((self.batch_size, 1),
                                     1,
                                     dtype=torch.float,
                                     device=self.device)
                output_g = self.discriminator(fake_data)
                loss_g = criterion(output_g, label_g)
                loss_g.backward()
                self.optimizer_g.step()

                # manually clear gradients
                for p in self.discriminator.parameters():
                    if hasattr(p, "grad_sample"):
                        del p.grad_sample
                # autograd_grad_sample.clear_backprops(discriminator)

                if self.delta is None:
                    self.delta = 1 / data.shape[0]

                eps, best_alpha = self.optimizer_d.privacy_engine.get_privacy_spent(
                    self.delta)
                self.alpha = best_alpha

            if verbose:
                print('eps: {:f} \t alpha: {:f} \t G: {:f} \t D: {:f}'.format(
                    eps, best_alpha,
                    loss_g.detach().cpu(),
                    loss_d.detach().cpu()))

            if mlflow:
                import mlflow  # the module now shadows the boolean flag
                mlflow.log_metric("loss_g",
                                  float(loss_g.detach().cpu()),
                                  step=epoch)
                mlflow.log_metric("loss_d",
                                  float(loss_d.detach().cpu()),
                                  step=epoch)
                mlflow.log_metric("epsilon", float(eps), step=epoch)

            if self.epsilon < eps:
                break

        privacy_engine.detach()
        self.state_dict = self.optimizer_d.state_dict()
        self.privacy_engine = privacy_engine
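
A usage sketch for this stateful variant. The enclosing class is not shown in the listing, so the constructor name and arguments below are hypothetical; only the train() signature comes from the code above.

import numpy as np
import pandas as pd

# hypothetical constructor and arguments
synth = DPGANSynthesizer(epsilon=1.0, batch_size=64, epochs=100, latent_dim=64)

df = pd.DataFrame(np.random.randint(0, 2, size=(1000, 4)),
                  columns=["a", "b", "c", "d"])

synth.train(df, verbose=True)           # trains until the budget is spent
synth.train(df, update_epsilon=2.0)     # resumes: optimizer state and the
                                        # privacy engine carry over via the
                                        # hasattr checks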