def test_poisson_sampling(self):
    B = 1
    N = 10
    d = 10
    dataset = [(i, torch.randn(d), torch.randn(d)) for i in range(N)]
    model = nn.Linear(d, d)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    engine = PrivacyEngine(
        model,
        sample_rate=B / N,
        target_epsilon=1.0,
        epochs=10,
        poisson=True,
        max_grad_norm=1,
        sample_size=N,
    )
    engine.attach(optimizer)
    generator = torch.Generator()
    generator.manual_seed(7)
    sampler = UniformWithReplacementSampler(
        num_samples=N, sample_rate=B / N, generator=generator
    )
    dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler)

    # Sampler with seed=7 should generate [], [7], [], [], [9], [0], [], [], [1], [4]
    for _, x, y in dataloader:
        prediction = model(x)
        loss = torch.mean((prediction - y) ** 2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
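# A minimal sketch of what the sampler above produces. With Poisson sampling each
# element is included independently with probability sample_rate, so batch sizes
# vary and some batches are empty; `poisson=True` makes the engine's accounting
# match this regime. The import path below is the pre-1.0 Opacus location and is
# an assumption outside this test.
import torch
from opacus.utils.uniform_sampler import UniformWithReplacementSampler

generator = torch.Generator()
generator.manual_seed(7)
sampler = UniformWithReplacementSampler(num_samples=10, sample_rate=0.1,
                                        generator=generator)
for batch_indices in sampler:
    # e.g. [], [7], [], ... — sizes fluctuate around num_samples * sample_rate
    print(batch_indices)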
def train_model(net, trainloader, trainset, device, dp):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    if dp:
        print('adding privacy engine')
        # if we are training with differential privacy, create the engine;
        # sample_size must be the number of training examples (the original
        # passed len(trainloader), the number of batches, which looks like a bug)
        privacy_engine = PrivacyEngine(
            net,
            batch_size=4,
            sample_size=len(trainset),
            alphas=[1, 10, 100],
            noise_multiplier=1.3,
            max_grad_norm=1.0,
        )
        privacy_engine.attach(optimizer)
    for epoch in range(5):  # currently training for 5 epochs
        print(f'epoch: {epoch}')
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
def test_privacy_engine_virtual_step_example(self):
    # IMPORTANT: When changing this code you also need to update
    # the docstring for opacus.privacy_engine.PrivacyEngine.virtual_step()
    model = nn.Linear(16, 2)
    dataloader = []
    batch_size = 64
    sample_size = 256
    sample_rate = batch_size / sample_size
    for _ in range(64):
        data = torch.randn(4, 16)
        labels = torch.randint(0, 2, (4,))
        dataloader.append((data, labels))

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

    privacy_engine = PrivacyEngine(
        model,
        sample_rate=sample_rate,
        noise_multiplier=0.8,
        max_grad_norm=0.5,
    )
    privacy_engine.attach(optimizer)

    for i, (X, y) in enumerate(dataloader):
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        if i % 16 == 15:
            optimizer.step()  # this will call privacy engine's step()
            optimizer.zero_grad()
        else:
            optimizer.virtual_step()  # this will call privacy engine's virtual_step()
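# Why the test above steps on i % 16 == 15: each mini-batch holds 4 samples, so
# 16 virtual steps form one effective batch of 64 samples, exactly the
# sample_rate = 64 / 256 the engine was told to account for. The arithmetic in
# isolation (all values taken from the test):
mini_batch_size = 4
virtual_steps_per_step = 16
sample_size = 256
effective_batch_size = mini_batch_size * virtual_steps_per_step  # 64
assert effective_batch_size / sample_size == 64 / 256  # == the engine's sample_rate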
def main():
    run_results = []
    for _ in range(N_RUNS):
        model = Inception3(num_classes=10).to(DEVICE)
        optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0)
        if not DISABLE_DP:
            privacy_engine = PrivacyEngine(
                model,
                batch_size=BATCH_SIZE,
                sample_size=len(train_loader.dataset),
                alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
                noise_multiplier=SIGMA,
                max_grad_norm=GRAD_NORM,
                secure_rng=SECURE_RNG,
            )
            privacy_engine.attach(optimizer)
        for epoch in range(1, EPOCHS + 1):
            # train on the training set (the original passed test_loader here,
            # which looks like a bug)
            train(model, DEVICE, train_loader, optimizer, epoch)
        run_results.append(test(model, DEVICE, test_loader))

    if len(run_results) > 1:
        print(
            "Accuracy averaged over {} runs: {:.2f}% ± {:.2f}%".format(
                len(run_results), np.mean(run_results) * 100, np.std(run_results) * 100
            )
        )
def initialize_training(parameters, learning_rate, local_dp):
    """
    Initializes the model and optimizer and shares the parameters
    with all the workers in the group. This should be sent from
    server to all nodes.

    Args:
        parameters: The model parameters to optimize.
        learning_rate: The learning rate for training.
        local_dp: bool whether to apply local differential privacy or not.

    Returns:
        Returns the device, optimizer and model.
    """
    # Determine the device to train on
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Initialize model and send parameters of server to all workers
    model = Net().to(device)

    # Initialize the optimizer
    optimizer = optim.SGD(parameters, lr=learning_rate, momentum=0.5)

    if local_dp:
        privacy_engine = PrivacyEngine(
            model,
            batch_size=64,
            sample_size=60000,
            alphas=range(2, 32),
            noise_multiplier=1.3,
            max_grad_norm=1.0,
        )
        privacy_engine.attach(optimizer)

    # device, optimizer and model will be needed in train and test
    return device, optimizer, model
def main(args):
    print(args)
    assert args.dpsgd
    torch.backends.cudnn.benchmark = True

    mdict = model_dict.copy()
    mdict['lstm'] = LSTMNet

    train_data, train_labels = get_data(args)
    model = mdict[args.experiment](vocab_size=args.max_features,
                                   batch_size=args.batch_size).cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0)
    loss_function = (nn.CrossEntropyLoss()
                     if args.experiment != 'logreg' else nn.BCELoss())

    privacy_engine = PrivacyEngine(
        model,
        batch_size=args.batch_size,
        sample_size=len(train_data),
        alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
        noise_multiplier=args.sigma,
        max_grad_norm=args.max_per_sample_grad_norm,
    )
    privacy_engine.attach(optimizer)

    timings = []
    for epoch in range(1, args.epochs + 1):
        start = time.perf_counter()
        dataloader = data.dataloader(train_data, train_labels, args.batch_size)
        losses = []  # track per-batch losses so both print paths below work
        for batch_idx, (x, y) in enumerate(dataloader):
            x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True)
            model.zero_grad()
            outputs = model(x)
            loss = loss_function(outputs, y)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        torch.cuda.synchronize()
        duration = time.perf_counter() - start
        print("Time Taken for Epoch: ", duration)
        timings.append(duration)

        if args.dpsgd:
            epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent(args.delta)
            print(f"Train Epoch: {epoch} \t"
                  f"Loss: {np.mean(losses):.6f} "
                  f"(ε = {epsilon:.2f}, δ = {args.delta}) for α = {best_alpha}")
        else:
            print(f"Train Epoch: {epoch} \t Loss: {np.mean(losses):.6f}")

    if not args.no_save:
        utils.save_runtimes(__file__.split('.')[0], args, timings)
    else:
        print('Not saving!')
    print('Done!')
def demo_basic(rank, world_size, weight, dp, noise_multiplier=0, max_grad_norm=1e8):
    # We don't want the 2 GPUs to work on the same examples/labels in parallel
    torch.manual_seed(rank)
    batch_size = 32
    withdp = "with" + ("out" if not dp else "")
    print(f"Running basic DDP {withdp} differential privacy example on rank {rank}.")
    device = setup_and_get_device(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(device)
    print(f"Initial weight: {model.net1.weight.data}")

    # Freeze all the parameters except one, to ensure that the noise is the same
    # (the DDP hook does not browse the layers in the same order as the naive implementation)
    model.net1.bias.requires_grad = False
    model.net2.bias.requires_grad = False
    model.net2.weight.requires_grad = False

    if dp:
        ddp_model = DPDDP(model)
        engine = PrivacyEngine(
            ddp_model,
            batch_size=batch_size,
            sample_size=10 * batch_size,
            alphas=PRIVACY_ALPHAS,
            noise_multiplier=noise_multiplier,
            max_grad_norm=[max_grad_norm],
        )
        engine.random_number_generator = engine._set_seed(0)
    else:
        ddp_model = DDP(model, device_ids=[device])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=1)
    if dp:
        engine.attach(optimizer)

    optimizer.zero_grad()
    labels = torch.randn(batch_size, 5).to(device)
    outputs = ddp_model(torch.randn(batch_size, 10).to(device))
    loss_fn(outputs, labels).backward()
    optimizer.step()

    weight.copy_(model.net1.weight.data.cpu())

    cleanup()
def initialize_dp(model, optimizer, sample_rate, dp_sigma):
    privacy_engine = PrivacyEngine(
        model,
        sample_rate=sample_rate * N_ACCUMULATION_STEPS,
        # epochs=EPOCHS,
        # target_epsilon=EPSILON,
        target_delta=DELTA,
        noise_multiplier=dp_sigma,
        max_grad_norm=MAX_GRAD_NORM,
    )
    privacy_engine.attach(optimizer)
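# A minimal usage sketch for initialize_dp above. With gradient accumulation the
# engine must account for the *effective* batch, hence the
# sample_rate * N_ACCUMULATION_STEPS scaling. The module-level constants and the
# concrete numbers below are illustrative assumptions, not part of the snippet.
N_ACCUMULATION_STEPS = 4
DELTA = 1e-5
MAX_GRAD_NORM = 1.0

# e.g. mini-batches of 32 drawn from 50 000 examples, accumulated 4x:
per_batch_sample_rate = 32 / 50_000
# initialize_dp(model, optimizer, per_batch_sample_rate, dp_sigma=1.0)
# -> engine accounts for an effective sample_rate of 4 * 32 / 50_000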
def test_privacy_engine_class_example(self):
    # IMPORTANT: When changing this code you also need to update
    # the docstring for opacus.privacy_engine.PrivacyEngine
    model = torch.nn.Linear(16, 32)  # An example model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
    privacy_engine = PrivacyEngine(
        model,
        sample_rate=0.01,
        noise_multiplier=1.3,
        max_grad_norm=1.0,
    )
    privacy_engine.attach(optimizer)  # That's it! Now it's business as usual.
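# Continuing the minimal example above: after attach(), training proceeds as
# usual and the spent budget can be read back through the optimizer. A sketch
# with made-up data, assuming the same model/optimizer names and the pre-1.0
# Opacus accountant API used throughout these snippets:
for _ in range(10):
    x, y = torch.randn(8, 16), torch.randn(8, 32)
    loss = torch.nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()  # clips per-sample grads, adds noise, updates the model

epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent(1e-5)
print(f"(ε = {epsilon:.2f}, δ = 1e-05) at α = {best_alpha}")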
def test_model_validator(self):
    """
    Test that the privacy engine throws on attach
    if there are unsupported modules
    """
    privacy_engine = PrivacyEngine(
        models.resnet18(),
        sample_rate=self.SAMPLE_RATE,
        alphas=self.ALPHAS,
        noise_multiplier=1.3,
        max_grad_norm=1,
    )
    with self.assertRaises(IncompatibleModuleException):
        privacy_engine.attach(self.private_optimizer)
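# Why resnet18 fails validation above: BatchNorm mixes information across the
# samples of a batch, so per-sample gradients are ill-defined. A sketch of the
# usual fix, using the same convert_batchnorm_modules helper that appears in the
# CIFAR10 example later in this collection (pre-1.0 Opacus):
from opacus.utils.module_modification import convert_batchnorm_modules
from torchvision import models

model = convert_batchnorm_modules(models.resnet18())  # BatchNorm -> GroupNorm
# PrivacyEngine(model, ...).attach(optimizer) should now pass the validator.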
def client(cur_net, current_iter, current_server_rank_id, best_valid_loss,
           best_net_glob, server_flag):
    # local train
    cur_net.train()
    optimizer = get_optimizer(args, cur_net)
    loss_func = nn.CrossEntropyLoss()
    if args.dp:
        privacy_engine = PrivacyEngine(
            cur_net,
            batch_size=args.bs,
            # sample_size should be the number of training examples; the original
            # passed len(local_train_loader), i.e. the number of batches
            sample_size=len(local_train_loader.dataset),
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=0.3,
            max_grad_norm=1.2,
            secure_rng=args.secure_rng,
        )
        privacy_engine.attach(optimizer)
    current_state_dict, current_loss = normal_train(
        args, cur_net, optimizer, loss_func, local_train_loader, valid_loader
    )
    if args.dp:
        privacy_engine.detach()

    # send the state_dict to the current server
    if args.tphe:
        client_sockets[rank2idx[current_server_rank_id]].send(
            pickle.dumps([encrypt_torch_state_dict(pub_key, current_state_dict),
                          current_loss])
        )
    else:
        client_sockets[rank2idx[current_server_rank_id]].send(
            pickle.dumps([current_state_dict, current_loss])
        )

    # recv the aggregated state dict from the current server
    aggregated_state_dict = client_sockets[rank2idx[current_server_rank_id]].recv(
        int(args.buffer)
    )
    aggregated_state_dict = pickle.loads(aggregated_state_dict)

    # parse aggregated state_dict
    parse_aggregated_state_dict(aggregated_state_dict, cur_net)

    # recv metadata
    metadata_list_pkl = client_sockets[rank2idx[current_server_rank_id]].recv(
        int(args.buffer)
    )
    loss_avg, tmp_loss_valid, next_server_rank_id = pickle.loads(metadata_list_pkl)
    loss_train.append(loss_avg)
    loss_valid.append(tmp_loss_valid)
    print('Round{:3d}, Average loss {:.3f}'.format(current_iter, loss_avg))
    print('Round{:3d}, Validation loss {:.3f}'.format(current_iter, tmp_loss_valid))

    if tmp_loss_valid < best_valid_loss:
        best_valid_loss = tmp_loss_valid
        best_net_glob = copy.deepcopy(cur_net)
        print('SAVE BEST MODEL AT EPOCH {}'.format(current_iter))

    # update the metadata for server
    current_server_rank_id = next_server_rank_id
    if next_server_rank_id == args.rank:
        server_flag = True
    print("\33[31m\33[1m Current server rank id {} \33[0m".format(current_server_rank_id))

    return cur_net, current_server_rank_id, best_valid_loss, best_net_glob, server_flag
def setUpOptimizer(
    self, model: nn.Module, data_loader: DataLoader, privacy_engine: bool = False
):
    # sample parameter values
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    optimizer.zero_grad()
    if privacy_engine:
        pe = PrivacyEngine(
            model,
            sample_rate=data_loader.batch_size / len(data_loader.dataset),
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=1.3,
            max_grad_norm=1,
        )
        pe.attach(optimizer)
    return optimizer
def demo_ddp_hook(rank, world_size, weight, dp, noise_multiplier, max_grad_norm):
    torch.manual_seed(rank)
    batch_size = 32
    withdp = "with" + ("out" if not dp else "")
    print(f"Running DDP hook {withdp} differential privacy example on rank {rank}.")
    device = setup_and_get_device(rank, world_size, nonce=1)

    # create model and move it to GPU with id rank
    model = ToyModel().to(device)

    model.net1.bias.requires_grad = False
    model.net2.bias.requires_grad = False
    model.net2.weight.requires_grad = False

    ddp_model = DDP(model, device_ids=[device])

    if dp:
        engine = PrivacyEngine(
            ddp_model,
            batch_size=batch_size,
            sample_size=10 * batch_size,
            alphas=PRIVACY_ALPHAS,
            noise_multiplier=noise_multiplier,
            max_grad_norm=[max_grad_norm],
        )
        engine.random_number_generator = engine._set_seed(0)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=1)
    if dp:
        engine.attach(optimizer)

    optimizer.zero_grad()
    labels = torch.randn(batch_size, 5).to(device)
    outputs = ddp_model(torch.randn(batch_size, 10).to(device))
    loss_fn(outputs, labels).backward()
    optimizer.step()

    weight.copy_(model.net1.weight.data.cpu())

    del ddp_model
    cleanup()
def train_model(net, trainloader, trainset, device, dp):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(net.parameters(), lr=LR)
    # optimizer = torch.optim.SGD(net.parameters(), lr=.003, momentum=.9)
    if dp:
        print('adding privacy engine')
        # if we are training with differential privacy, create the engine
        privacy_engine = PrivacyEngine(
            net,
            batch_size=VIRTUAL_BATCH_SIZE,
            sample_size=len(trainset),
            alphas=range(2, 32),
            noise_multiplier=NOISE_MULTIPLIER,
            max_grad_norm=MAX_GRAD_NORM,
        )
        privacy_engine.attach(optimizer)
    for epoch in range(3):  # currently training for 3 epochs
        print(f'epoch: {epoch}')
        train(net, trainloader, optimizer, epoch, device, dp)
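# A sketch of the train() loop the setup above implies (train itself is not
# shown in this snippet): since the engine accounts for VIRTUAL_BATCH_SIZE, the
# loop presumably accumulates that many samples via virtual_step() before each
# real step. All names are taken from the snippet or assumed module constants.
def train(net, trainloader, optimizer, epoch, device, dp):
    criterion = nn.CrossEntropyLoss()
    steps_per_virtual_batch = VIRTUAL_BATCH_SIZE // trainloader.batch_size
    for i, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)
        loss = criterion(net(inputs), labels)
        loss.backward()
        if dp and (i + 1) % steps_per_virtual_batch != 0:
            optimizer.virtual_step()  # accumulate clipped grads, no update yet
        else:
            optimizer.step()
            optimizer.zero_grad()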
def demo_basic(rank, weight, world_size, dp):
    torch.manual_seed(world_size)
    batch_size = 32
    withdp = "with" + ("out" if not dp else "")
    print(f"Running basic DDP {withdp} differential privacy example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)

    if dp:
        ddp_model = DPDDP(model)
        engine = PrivacyEngine(
            ddp_model,
            batch_size=batch_size,
            sample_size=10 * batch_size,
            alphas=PRIVACY_ALPHAS,
            noise_multiplier=0,
            max_grad_norm=1e8,
        )
    else:
        ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=1)
    if dp:
        engine.attach(optimizer)

    # if rank == 0:
    #     print(model.net1.weight)
    optimizer.zero_grad()
    labels = torch.randn(batch_size, 5).to(rank)
    outputs = ddp_model(torch.randn(batch_size, 10).to(rank))
    loss_fn(outputs, labels).backward()
    optimizer.step()
    # if rank == 0:
    #     print(model.net1.weight)

    weight.copy_(model.net1.weight.data.cpu())
    cleanup()
def setUp_init_model(
    self, private=False, state_dict=None, model=None, **privacy_engine_kwargs
):
    model = model or SampleConvNet()
    optimizer = torch.optim.SGD(model.parameters(), lr=self.LR, momentum=0)
    if state_dict:
        model.load_state_dict(state_dict)

    if private:
        if len(privacy_engine_kwargs) == 0:
            privacy_engine_kwargs = self.privacy_default_params
        privacy_engine = PrivacyEngine(
            model,
            batch_size=self.BATCH_SIZE,
            sample_size=self.DATA_SIZE,
            alphas=self.ALPHAS,
            **privacy_engine_kwargs,
        )
        privacy_engine.attach(optimizer)

    return model, optimizer
def setUpOptimizer(
    self, model: nn.Module, data_loader: DataLoader, privacy_engine: bool = False
):
    # sample parameter values
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    optimizer.zero_grad()
    if privacy_engine:
        pe = PrivacyEngine(
            model,
            # pyre-fixme[6]: Expected `int` for 2nd param but got `Optional[int]`.
            batch_size=data_loader.batch_size,
            # pyre-fixme[6]: Expected `Sized` for 1st param but got
            #  `Dataset[typing.Any]`.
            sample_size=len(data_loader.dataset),
            # pyre-fixme[6]: `+` is not supported for operand types
            #  `List[float]` and `List[int]`.
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=1.3,
            max_grad_norm=1,
        )
        pe.attach(optimizer)
    return optimizer
def add_remove_ddp_hooks(rank, world_size, remaining_hooks, dp,
                         noise_multiplier=0, max_grad_norm=1e8):
    device = setup_and_get_device(rank, world_size, nonce=2)

    model = ToyModel().to(device)
    ddp_model = nn.parallel.DistributedDataParallel(model, device_ids=[device])

    engine = PrivacyEngine(
        ddp_model,
        batch_size=1,
        sample_size=10,
        alphas=PRIVACY_ALPHAS,
        noise_multiplier=noise_multiplier,
        max_grad_norm=[max_grad_norm],
    )
    optimizer = optim.SGD(ddp_model.parameters(), lr=1)
    engine.attach(optimizer)

    remaining_hooks["attached"] = {
        p: p._backward_hooks for p in engine.module.parameters() if p._backward_hooks
    }
    engine.detach()

    remaining_hooks["detached"] = {
        p: p._backward_hooks for p in engine.module.parameters() if p._backward_hooks
    }

    cleanup()
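# What the test above inspects: in pre-1.0 Opacus, attach() registers backward
# hooks on the wrapped module's trainable parameters to capture per-sample
# gradients, and detach() is expected to remove them again. The same check,
# expressed as a small helper (engine.module is the model the engine wraps):
def has_grad_sample_hooks(engine):
    return any(p._backward_hooks for p in engine.module.parameters())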
def main():
    args = parser.parse_args()
    device = torch.device(args.device)

    root = Path(args.data_root)
    all_filenames = list(root.glob("**/*.txt"))
    print(f"At root {root.absolute()}, found the following files: {all_filenames}")

    all_letters = string.ascii_letters + " .,;'#"
    n_letters = len(all_letters)
    category_lines, all_categories, n_categories = build_category_lines(
        all_filenames, all_letters)
    category_lines_train, category_lines_val = split_data_train_eval(
        category_lines, args.train_eval_split)

    rnn = CharNNClassifier(n_letters, args.n_hidden, n_categories, n_letters,
                           args.batch_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(rnn.parameters(), lr=args.learning_rate)

    if not args.disable_dp:
        privacy_engine = PrivacyEngine(
            rnn,
            batch_size=args.batch_size,
            sample_size=get_dataset_size(category_lines_train),
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=args.sigma,
            max_grad_norm=args.max_per_sample_grad_norm,
            batch_first=False,
        )
        privacy_engine.attach(optimizer)

    # Measure time elapsed for profiling training
    def time_since(since):
        now = time.time()
        s = now - since
        m = math.floor(s / 60)
        s -= m * 60
        return "%dm %ds" % (m, s)

    # Keep track of losses for tracking
    current_loss = 0
    start_time = time.time()

    for iteration in tqdm(range(1, args.iterations + 1)):
        # Get a random training input and target batch
        _, _, category_tensors, line_tensors = get_random_batch(
            category_lines_train,
            args.batch_size,
            all_categories,
            all_letters,
            n_letters,
            args,
            device,
        )
        output, loss = train(rnn, criterion, optimizer, category_tensors,
                             line_tensors, device)
        current_loss += loss

        # Print iteration number, loss, name and guess
        if iteration % print_every == 0:
            acc = get_eval_metrics(
                rnn,
                category_lines_val,
                all_categories,
                all_letters,
                n_letters,
                args.batch_size,
                args.max_seq_length,
                device,
            )
            time_elapsed = time_since(start_time)
            if not args.disable_dp:
                epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent(
                    args.delta)
                print(
                    f"Iteration={iteration} / Time elapsed: {time_elapsed} / Loss={loss:.4f} / "
                    f"Eval Accuracy:{acc*100:.2f} / "
                    f"(ε = {epsilon:.2f}, δ = {args.delta:.2f}) for α = {best_alpha:.2f}"
                )
            else:
                print(
                    f"Iteration={iteration} / Time elapsed: {time_elapsed} / "
                    f"Loss={loss:.4f} / Eval Accuracy:{acc*100:.2f}")
def main(dataset, augment=False, use_scattering=False, size=None,
         batch_size=2048, mini_batch_size=256, sample_batches=False,
         lr=1, optim="SGD", momentum=0.9, nesterov=False,
         noise_multiplier=1, max_grad_norm=0.1, epochs=100,
         input_norm=None, num_groups=None, bn_noise_multiplier=None,
         max_epsilon=None, logdir=None, early_stop=True, seed=0):
    torch.manual_seed(seed)
    logger = Logger(logdir)
    device = get_device()

    train_data, test_data = get_data(dataset, augment=augment)

    if use_scattering:
        scattering, K, _ = get_scatter_transform(dataset)
        scattering.to(device)
    else:
        scattering = None
        K = 3 if len(train_data.data.shape) == 4 else 1

    bs = batch_size
    assert bs % mini_batch_size == 0
    n_acc_steps = bs // mini_batch_size

    # Batch accumulation and data augmentation with Poisson sampling isn't implemented
    if sample_batches:
        assert n_acc_steps == 1
        assert not augment

    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=mini_batch_size, shuffle=True,
        num_workers=1, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=mini_batch_size, shuffle=False,
        num_workers=1, pin_memory=True)

    rdp_norm = 0
    if input_norm == "BN":
        # compute noisy data statistics or load from disk if pre-computed
        save_dir = f"bn_stats/{dataset}"
        os.makedirs(save_dir, exist_ok=True)
        bn_stats, rdp_norm = scatter_normalization(
            train_loader, scattering, K, device,
            len(train_data), len(train_data),
            noise_multiplier=bn_noise_multiplier,
            orders=ORDERS, save_dir=save_dir)
        model = CNNS[dataset](K, input_norm="BN", bn_stats=bn_stats, size=size)
    else:
        model = CNNS[dataset](K, input_norm=input_norm,
                              num_groups=num_groups, size=size)

    model.to(device)

    if use_scattering and augment:
        model = nn.Sequential(scattering, model)
        train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=mini_batch_size, shuffle=True,
            num_workers=1, pin_memory=True, drop_last=True)
    else:
        # pre-compute the scattering transform if necessary
        train_loader = get_scattered_loader(train_loader, scattering, device,
                                            drop_last=True,
                                            sample_batches=sample_batches)
        test_loader = get_scattered_loader(test_loader, scattering, device)

    print(f"model has {get_num_params(model)} parameters")

    if optim == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                                    momentum=momentum, nesterov=nesterov)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    privacy_engine = PrivacyEngine(
        model,
        batch_size=bs,
        sample_size=len(train_data),
        alphas=ORDERS,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )
    privacy_engine.attach(optimizer)

    best_acc = 0
    flat_count = 0
    results = dict(train_zeon=[], train_xent=[], test_zeon=[], test_xent=[], epoch=[])
    for epoch in range(0, epochs):
        print(f"\nEpoch: {epoch}")
        train_loss, train_acc = train(model, train_loader, optimizer,
                                      n_acc_steps=n_acc_steps)
        test_loss, test_acc = test(model, test_loader)
        results['train_zeon'].append(train_acc)
        results['train_xent'].append(train_loss)
        results['test_zeon'].append(test_acc)
        results['test_xent'].append(test_loss)
        results['epoch'].append(epoch)

        if noise_multiplier > 0:
            rdp_sgd = get_renyi_divergence(
                privacy_engine.sample_rate,
                privacy_engine.noise_multiplier) * privacy_engine.steps
            epsilon, _ = get_privacy_spent(rdp_norm + rdp_sgd)
            epsilon2, _ = get_privacy_spent(rdp_sgd)
            print(f"ε = {epsilon:.3f} (sgd only: ε = {epsilon2:.3f})")

            if max_epsilon is not None and epsilon >= max_epsilon:
                return
        else:
            epsilon = None

        logger.log_epoch(epoch, train_loss, train_acc, test_loss, test_acc, epsilon)
        logger.log_scalar("epsilon/train", epsilon, epoch)

        # stop if we're not making progress
        if test_acc > best_acc:
            best_acc = test_acc
            flat_count = 0
        else:
            flat_count += 1
            if flat_count >= 20 and early_stop:
                print("plateau...")
                break

    # Write to file.
    record = {**results, **{'best_acc': best_acc, 'seed': seed, 'dataset': dataset}}
    record_path = os.path.join('.', 'record', f'{dataset}-{seed}.json')
    os.makedirs(os.path.dirname(record_path), exist_ok=True)
    with open(record_path, 'w') as f:
        json.dump(record, f, indent=4)

    import logging
    logging.warning(f'Wrote to file: {record_path}')
def main():
    args = parser.parse_args()
    device = torch.device(args.device)

    ds = NamesDataset(args.data_root)
    train_len = int(args.train_split * len(ds))
    test_len = len(ds) - train_len

    print(f"{train_len} samples for training, {test_len} for testing")

    if args.secure_rng:
        try:
            import torchcsprng as prng
        except ImportError as e:
            msg = (
                "To use secure RNG, you must install the torchcsprng package! "
                "Check out the instructions here: https://github.com/pytorch/csprng#installation"
            )
            raise ImportError(msg) from e

        generator = prng.create_random_device_generator("/dev/urandom")
    else:
        generator = None

    train_ds, test_ds = torch.utils.data.random_split(
        ds, [train_len, test_len], generator=generator
    )

    model = CharNNClassifier(
        args.embedding_size,
        args.hidden_size,
        len(ds.labels),
        args.n_lstm_layers,
        args.bidirectional_lstm,
    )
    model = model.to(device)

    train_loader = DataLoader(
        train_ds,
        num_workers=8,
        pin_memory=True,
        generator=generator,
        batch_sampler=UniformWithReplacementSampler(
            num_samples=len(train_ds),
            sample_rate=args.sample_rate,
            generator=generator,
        ),
        collate_fn=padded_collate,
    )
    test_loader = DataLoader(
        test_ds,
        batch_size=args.batch_size_test,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
        collate_fn=padded_collate,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)

    if not args.disable_dp:
        privacy_engine = PrivacyEngine(
            model,
            sample_rate=args.sample_rate,
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=args.sigma,
            max_grad_norm=args.max_per_sample_grad_norm,
            target_delta=args.delta,
            secure_rng=args.secure_rng,
        )
        privacy_engine.attach(optimizer)
    else:
        privacy_engine = None

    print("Train stats: \n")
    for epoch in tqdm(range(args.epochs)):
        train(model, criterion, optimizer, train_loader, epoch, device=device)
        if args.test_every and epoch % args.test_every == 0:
            test(model, test_loader, privacy_engine, device=device)

    test(model, test_loader, privacy_engine, device=device)
def train(self, data, categorical_columns=None, ordinal_columns=None,
          update_epsilon=None):
    if update_epsilon:
        self.epsilon = update_epsilon

    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            data[col] = pd.to_numeric(data[col], errors="ignore")
        self.pd_cols = data.columns
        # the original accessed data.pd_index, which doesn't exist on a DataFrame
        self.pd_index = data.index
        data = data.to_numpy()
    elif not isinstance(data, np.ndarray):
        raise ValueError("Data must be a numpy array or pandas dataframe")

    dataset = TensorDataset(torch.from_numpy(data.astype("float32")).to(self.device))
    dataloader = DataLoader(dataset, batch_size=self.batch_size,
                            shuffle=True, drop_last=True)

    self.generator = Generator(self.latent_dim, data.shape[1],
                               binary=self.binary).to(self.device)
    discriminator = Discriminator(data.shape[1]).to(self.device)
    optimizer_d = optim.Adam(discriminator.parameters(), lr=4e-4)

    privacy_engine = PrivacyEngine(
        discriminator,
        batch_size=self.batch_size,
        sample_size=len(data),
        alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
        noise_multiplier=3.5,
        max_grad_norm=1.0,
        clip_per_layer=True,
    )
    privacy_engine.attach(optimizer_d)

    optimizer_g = optim.Adam(self.generator.parameters(), lr=1e-4)
    criterion = nn.BCELoss()

    for epoch in range(self.epochs):
        # name the mini-batch `batch` so we don't shadow `data`, which is still
        # needed below to derive delta from the dataset size
        for i, batch in enumerate(dataloader):
            discriminator.zero_grad()
            real_data = batch[0].to(self.device)

            # train with fake data
            noise = torch.randn(self.batch_size, self.latent_dim, 1, 1,
                                device=self.device)
            noise = noise.view(-1, self.latent_dim)
            fake_data = self.generator(noise)
            label_fake = torch.full((self.batch_size,), 0,
                                    dtype=torch.float, device=self.device)
            output = discriminator(fake_data.detach())
            loss_d_fake = criterion(output, label_fake)
            loss_d_fake.backward()
            optimizer_d.step()

            # train with real data
            label_true = torch.full((self.batch_size,), 1,
                                    dtype=torch.float, device=self.device)
            output = discriminator(real_data.float())
            loss_d_real = criterion(output, label_true)
            loss_d_real.backward()
            optimizer_d.step()

            # adapt the per-layer clipping bounds to the observed gradient norms
            max_grad_norm = []
            for p in discriminator.parameters():
                param_norm = p.grad.data.norm(2).item()
                max_grad_norm.append(param_norm)
            privacy_engine.max_grad_norm = max_grad_norm

            # train generator
            self.generator.zero_grad()
            label_g = torch.full((self.batch_size,), 1,
                                 dtype=torch.float, device=self.device)
            output_g = discriminator(fake_data)
            loss_g = criterion(output_g, label_g)
            loss_g.backward()
            optimizer_g.step()

            # manually clear gradients
            for p in discriminator.parameters():
                if hasattr(p, "grad_sample"):
                    del p.grad_sample
            # autograd_grad_sample.clear_backprops(discriminator)

            if self.delta is None:
                self.delta = 1 / data.shape[0]

            eps, best_alpha = optimizer_d.privacy_engine.get_privacy_spent(self.delta)

        if self.epsilon < eps:
            break
def main(): parser = argparse.ArgumentParser(description="PyTorch CIFAR10 DP Training") parser.add_argument( "-j", "--workers", default=2, type=int, metavar="N", help="number of data loading workers (default: 2)", ) parser.add_argument( "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run", ) parser.add_argument( "--start-epoch", default=1, type=int, metavar="N", help="manual epoch number (useful on restarts)", ) parser.add_argument( "-b", "--batch-size", # This should be 256, but that OOMs using the prototype. default=64, type=int, metavar="N", help="mini-batch size (default: 64), this is the total " "batch size of all GPUs on the current node when " "using Data Parallel or Distributed Data Parallel", ) parser.add_argument( "-na", "--n_accumulation_steps", default=1, type=int, metavar="N", help="number of mini-batches to accumulate into an effective batch", ) parser.add_argument( "--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate", dest="lr", ) parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="SGD momentum") parser.add_argument( "--wd", "--weight-decay", default=5e-4, type=float, metavar="W", help="SGD weight decay (default: 1e-4)", dest="weight_decay", ) parser.add_argument( "-p", "--print-freq", default=10, type=int, metavar="N", help="print frequency (default: 10)", ) parser.add_argument( "--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)", ) parser.add_argument( "-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set", ) parser.add_argument("--seed", default=None, type=int, help="seed for initializing training. ") parser.add_argument( "--device", type=str, default="cuda", help="GPU ID for this process (default: 'cuda')", ) parser.add_argument( "--sigma", type=float, default=1.0, metavar="S", help="Noise multiplier (default 1.0)", ) parser.add_argument( "-c", "--max-per-sample-grad_norm", type=float, default=1.0, metavar="C", help="Clip per-sample gradients to this norm (default 1.0)", ) parser.add_argument( "--disable-dp", action="store_true", default=False, help="Disable privacy training and just train with vanilla SGD", ) parser.add_argument( "--secure-rng", action="store_true", default=False, help= "Enable Secure RNG to have trustworthy privacy guarantees. Comes at a performance cost", ) parser.add_argument( "--delta", type=float, default=1e-5, metavar="D", help="Target delta (default: 1e-5)", ) parser.add_argument( "--checkpoint-file", type=str, default="checkpoint", help="path to save check points", ) parser.add_argument( "--data-root", type=str, default="../cifar10", help="Where CIFAR10 is/will be stored", ) parser.add_argument("--log-dir", type=str, default="", help="Where Tensorboard log will be stored") parser.add_argument( "--optim", type=str, default="Adam", help="Optimizer to use (Adam, RMSprop, SGD)", ) args = parser.parse_args() args.disable_dp = True if args.disable_dp and args.n_accumulation_steps > 1: raise ValueError("Virtual steps only works with enabled DP") # The following few lines, enable stats gathering about the run # 1. where the stats should be logged stats.set_global_summary_writer( tensorboard.SummaryWriter(os.path.join("/tmp/stat", args.log_dir))) # 2. 
enable stats stats.add( # stats about gradient norms aggregated for all layers stats.Stat(stats.StatType.GRAD, "AllLayers", frequency=0.1), # stats about gradient norms per layer stats.Stat(stats.StatType.GRAD, "PerLayer", frequency=0.1), # stats about clipping stats.Stat(stats.StatType.GRAD, "ClippingStats", frequency=0.1), # stats on training accuracy stats.Stat(stats.StatType.TRAIN, "accuracy", frequency=0.01), # stats on validation accuracy stats.Stat(stats.StatType.TEST, "accuracy"), ) # The following lines enable stat gathering for the clipping process # and set a default of per layer clipping for the Privacy Engine clipping = {"clip_per_layer": False, "enable_stat": True} if args.secure_rng: assert False try: import torchcsprng as prng except ImportError as e: msg = ( "To use secure RNG, you must install the torchcsprng package! " "Check out the instructions here: https://github.com/pytorch/csprng#installation" ) raise ImportError(msg) from e generator = prng.create_random_device_generator("/dev/urandom") else: generator = None augmentations = [ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), ] normalize = [ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ] train_transform = transforms.Compose( augmentations + normalize if args.disable_dp else normalize) test_transform = transforms.Compose(normalize) train_dataset = CIFAR10(root=args.data_root, train=True, download=True, transform=train_transform) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, drop_last=True, generator=generator, ) test_dataset = CIFAR10(root=args.data_root, train=False, download=True, transform=test_transform) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, ) best_acc1 = 0 device = torch.device(args.device) model = convert_batchnorm_modules(models.resnet18(num_classes=10)) # model = CIFAR10Model() model = model.to(device) if args.optim == "SGD": optimizer = optim.SGD( model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, ) elif args.optim == "RMSprop": optimizer = optim.RMSprop(model.parameters(), lr=args.lr) elif args.optim == "Adam": optimizer = optim.Adam(model.parameters(), lr=args.lr) else: raise NotImplementedError( "Optimizer not recognized. Please check spelling") if not args.disable_dp: privacy_engine = PrivacyEngine( model, batch_size=args.batch_size * args.n_accumulation_steps, sample_size=len(train_dataset), alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)), noise_multiplier=args.sigma, max_grad_norm=args.max_per_sample_grad_norm, secure_rng=args.secure_rng, **clipping, ) privacy_engine.attach(optimizer) for epoch in range(args.start_epoch, args.epochs + 1): train(args, model, train_loader, optimizer, epoch, device) top1_acc = test(args, model, test_loader, device) # remember best acc@1 and save checkpoint is_best = top1_acc > best_acc1 best_acc1 = max(top1_acc, best_acc1) save_checkpoint( { "epoch": epoch + 1, "arch": "ResNet18", "state_dict": model.state_dict(), "best_acc1": best_acc1, "optimizer": optimizer.state_dict(), }, is_best, filename=args.checkpoint_file + ".tar", )
class CTGANSynthesizer(BaseSynthesizer):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data
    using Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual
            Layer will be created for each one of the values provided.
            Defaults to (256, 256).
        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers.
            A Linear Layer will be created for each one of the values provided.
            Defaults to (256, 256).
        generator_lr (float):
            Learning rate for the generator. Defaults to 2e-4.
        generator_decay (float):
            Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        discriminator_lr (float):
            Learning rate for the discriminator. Defaults to 2e-4.
        discriminator_decay (float):
            Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        verbose (boolean):
            Whether to have print statements for progress results. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
    """

    def __init__(self, embedding_dim=128, generator_dim=(256, 256),
                 discriminator_dim=(256, 256), generator_lr=2e-4,
                 generator_decay=1e-6, discriminator_lr=2e-4,
                 discriminator_decay=0, pack=1, batch_size=500,
                 discriminator_steps=1, log_frequency=True, verbose=False,
                 epochs=300, epsilon=10, delta=1e-5, noise_multiplier=2,
                 max_grad_norm=1, dp=True):

        assert batch_size % 2 == 0

        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim

        self._generator_lr = generator_lr
        self._generator_decay = generator_decay
        self._discriminator_lr = discriminator_lr
        self._discriminator_decay = discriminator_decay

        self._pack = pack  # option added to the original CTGAN
        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self._epsilon = epsilon
        self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.trained_epochs = 0
        self.trained_epsilon = 0
        self._delta = delta
        self._noise_multiplier = noise_multiplier
        self.max_grad_norm = max_grad_norm
        self._dp = dp
        opacus.supported_layers_grad_samplers._create_or_extend_grad_sample = \
            _custom_create_or_extend_grad_sample

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

        Args:
            logits:
                […, num_features] unnormalized log probabilities.
            tau:
                Non-negative scalar temperature.
            hard:
                If True, the returned samples will be discretized as one-hot
                vectors, but will be differentiated as if it is the soft sample
                in autograd.
            dim (int):
                A dimension along which softmax will be computed. Default: -1.

        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """
        if version.parse(torch.__version__) < version.parse("1.2.0"):
            for i in range(10):
                transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard,
                                                        eps=eps, dim=dim)
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError("gumbel_softmax returning NaN.")

        return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

    def _apply_activate(self, data):
        """Apply proper activation function to the output of the generator."""
        data_t = []
        st = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if span_info.activation_fn == 'tanh':
                    ed = st + span_info.dim
                    data_t.append(torch.tanh(data[:, st:ed]))
                    st = ed
                elif span_info.activation_fn == 'softmax':
                    ed = st + span_info.dim
                    transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                    data_t.append(transformed)
                    st = ed
                else:
                    assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        """Compute the cross entropy loss on the fixed discrete column."""
        loss = []
        st = 0
        st_c = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if len(column_info) != 1 or span_info.activation_fn != "softmax":
                    # not discrete column
                    st += span_info.dim
                else:
                    ed = st + span_info.dim
                    ed_c = st_c + span_info.dim
                    tmp = functional.cross_entropy(
                        data[:, st:ed],
                        torch.argmax(c[:, st_c:ed_c], dim=1),
                        reduction='none')
                    loss.append(tmp)
                    st = ed
                    st_c = ed_c

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def _validate_discrete_columns(self, train_data, discrete_columns):
        """Check whether ``discrete_columns`` exists in ``train_data``.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        if isinstance(train_data, pd.DataFrame):
            invalid_columns = set(discrete_columns) - set(train_data.columns)
        elif isinstance(train_data, np.ndarray):
            invalid_columns = []
            for column in discrete_columns:
                if column < 0 or column >= train_data.shape[1]:
                    invalid_columns.append(column)
        else:
            raise TypeError('``train_data`` should be either pd.DataFrame or np.array.')

        if invalid_columns:
            raise ValueError('Invalid columns found: {}'.format(invalid_columns))

    def fit(self, train_data, discrete_columns=tuple(), epochs=None, epsilon=None):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self._validate_discrete_columns(train_data, discrete_columns)

        if epochs is None:
            epochs = self._epochs
        if epsilon is None:
            epsilon = self._epsilon
        if not self._dp:
            self.trained_epsilon = float("inf")

        self._transformer = DataTransformer()
        self._transformer.fit(train_data, discrete_columns)

        train_data = self._transformer.transform(train_data)

        self._data_sampler = DataSampler(
            train_data, self._transformer.output_info_list, self._log_frequency)

        data_dim = self._transformer.output_dimensions

        self._generator = Generator(
            self._embedding_dim + self._data_sampler.dim_cond_vec(),
            self._generator_dim, data_dim).to(self._device)

        self._discriminator = Discriminator(
            data_dim + self._data_sampler.dim_cond_vec(),
            self._discriminator_dim, self._pack).to(self._device)

        self._optimizerG = optim.Adam(
            self._generator.parameters(), lr=self._generator_lr,
            betas=(0.5, 0.9), weight_decay=self._generator_decay)

        self._optimizerD = optim.Adam(
            self._discriminator.parameters(), lr=self._discriminator_lr,
            betas=(0.5, 0.9), weight_decay=self._discriminator_decay)

        if self._dp:
            self._privacy_engine = PrivacyEngine(
                self._discriminator,
                self._batch_size / self._pack,
                len(train_data),
                alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
                noise_multiplier=self._noise_multiplier,
                max_grad_norm=self.max_grad_norm,
                clip_per_layer=True,
                loss_reduction="sum",
            )
            self._privacy_engine.attach(self._optimizerD)

        mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device)
        std = mean + 1
        one = torch.tensor(1, dtype=torch.float).to(self._device)
        mone = one * -1

        steps_per_epoch = max(len(train_data) // self._batch_size, 1)
        for i in range(epochs):
            self.trained_epochs += 1
            if self._dp and self.trained_epsilon >= self._epsilon:
                print("Privacy budget of {:.2f} exhausted. Please specify a "
                      "higher one in fit() to train more or disable "
                      "differential privacy.".format(self._epsilon))
                return

            for id_ in range(steps_per_epoch):
                for n in range(self._discriminator_steps):
                    fakez = torch.normal(mean=mean, std=std)

                    condvec = self._data_sampler.sample_condvec(self._batch_size)
                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = self._data_sampler.sample_data(
                            self._batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self._device)
                        m1 = torch.from_numpy(m1).to(self._device)
                        fakez = torch.cat([fakez, c1], dim=1)

                        perm = np.arange(self._batch_size)
                        np.random.shuffle(perm)
                        real = self._data_sampler.sample_data(
                            self._batch_size, col[perm], opt[perm])
                        c2 = c1[perm]

                    fake = self._generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype('float32')).to(self._device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fake

                    self._optimizerD.zero_grad()

                    y_fake = self._discriminator(fake_cat)
                    y_real = self._discriminator(real_cat)

                    if not self._dp:
                        pen = self._discriminator.calc_gradient_penalty(
                            real_cat, fake_cat, self._device)
                        pen.backward(retain_graph=True)

                    loss_d = -torch.mean(y_real) + torch.mean(y_fake)
                    loss_d.backward()
                    self._optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self._data_sampler.sample_condvec(self._batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self._device)
                    m1 = torch.from_numpy(m1).to(self._device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self._generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self._discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = self._discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                self._optimizerG.zero_grad()
                loss_g.backward()
                self._optimizerG.step()

                if self._dp:
                    for p in self._discriminator.parameters():
                        if hasattr(p, "grad_sample"):
                            del p.grad_sample

                    self.trained_epsilon, best_alpha = \
                        self._optimizerD.privacy_engine.get_privacy_spent(self._delta)
                    if self.trained_epsilon >= epsilon:
                        print("Privacy budget of {:.2f} exhausted, training "
                              "halted. Best alpha: {:.2f}".format(epsilon, best_alpha))
                        return

            if self._verbose:
                print(f"Epoch {i+1}, epsilon {self.trained_epsilon: .2f}, "
                      f"Loss G: {loss_g.detach().cpu(): .4f}, "
                      f"Loss D: {loss_d.detach().cpu(): .4f}", flush=True)

        if self._dp:
            self._privacy_engine.detach()

    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the
        probability of the discrete condition_value happening in the
        condition_column.

        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to
                increase the probability of happening.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self._transformer.convert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self._data_sampler.generate_cond_from_condition_column_info(
                    condition_info, self._batch_size))
        else:
            global_condition_vec = None

        steps = n // self._batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self._batch_size, self._embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self._device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self._data_sampler.sample_original_condvec(self._batch_size)

            if condvec is not None:
                c1 = torch.from_numpy(condvec).to(self._device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self._generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self._transformer.inverse_transform(data)

    def set_device(self, device):
        self._device = device
        if hasattr(self, '_generator'):
            self._generator.to(self._device)
        if hasattr(self, '_discriminator'):
            self._discriminator.to(self._device)
class GradientAccumulation_test(unittest.TestCase):
    def setUp(self):
        self.DATA_SIZE = 64
        self.BATCH_SIZE = 16
        self.SAMPLE_RATE = self.BATCH_SIZE / self.DATA_SIZE
        self.LR = 0  # we want to call optimizer.step() without modifying the model
        self.ALPHAS = [1 + x / 10.0 for x in range(1, 100, 10)]
        self.criterion = nn.CrossEntropyLoss()

        self.setUp_data()
        self.setUp_model_and_optimizer()

    def setUp_data(self):
        self.ds = FakeData(
            size=self.DATA_SIZE,
            image_size=(1, 35, 35),
            num_classes=10,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        )
        self.dl = DataLoader(self.ds, batch_size=self.BATCH_SIZE)

    def setUp_model_and_optimizer(self):
        self.model = SampleConvNet()
        self.optimizer = torch.optim.SGD(
            self.model.parameters(), lr=self.LR, momentum=0
        )

        self.optimizer.zero_grad()

        # accumulate .grad over the entire dataset
        for x, y in self.dl:
            logits = self.model(x)
            loss = self.criterion(logits, y)
            loss.backward()

        self.effective_batch_grad = torch.cat(
            [p.grad.reshape(-1) for p in self.model.parameters() if p.requires_grad]
        ) * (self.BATCH_SIZE / self.DATA_SIZE)

        self.optimizer.zero_grad()

    def setUp_privacy_engine(self, batch_size):
        self.privacy_engine = PrivacyEngine(
            self.model,
            sample_rate=batch_size / self.DATA_SIZE,
            alphas=self.ALPHAS,
            noise_multiplier=0,
            max_grad_norm=999,
        )
        self.privacy_engine.attach(self.optimizer)

    def calc_per_sample_grads(self, data_iter, num_steps=1):
        for x, y in data_iter:
            num_steps -= 1
            logits = self.model(x)
            loss = self.criterion(logits, y)
            loss.backward()
            if num_steps == 0:
                break

    def test_grad_sample_accumulation(self):
        """
        Calling loss.backward() multiple times should sum up the gradients
        in .grad and accumulate all the individual gradients in .grad_sample
        """
        self.setUp_privacy_engine(self.DATA_SIZE)
        data_iter = iter(self.dl)  # 4 batches of size 16 each
        self.calc_per_sample_grads(data_iter, num_steps=4)
        # should accumulate grads in .grad and .grad_sample

        # the accumulated per-sample gradients
        per_sample_grads = torch.cat(
            [
                p.grad_sample.reshape(self.DATA_SIZE, -1)
                for p in self.model.parameters()
                if p.requires_grad
            ],
            dim=-1,
        )
        # average up all the per-sample gradients
        accumulated_grad = torch.mean(per_sample_grads, dim=0)

        # the full data gradient accumulated in .grad
        grad = torch.cat(
            [p.grad.reshape(-1) for p in self.model.parameters() if p.requires_grad]
        ) * (self.BATCH_SIZE / self.DATA_SIZE)

        self.optimizer.step()

        # the accumulated gradients in .grad without any hooks
        orig_grad = self.effective_batch_grad

        self.assertTrue(
            torch.allclose(accumulated_grad, orig_grad, atol=10e-5, rtol=10e-3)
        )
        self.assertTrue(torch.allclose(grad, orig_grad, atol=10e-5, rtol=10e-3))

    def test_clipper_accumulation(self):
        """
        Calling optimizer.virtual_step() should accumulate clipped gradients
        to form one large batch.
        """
        self.setUp_privacy_engine(self.DATA_SIZE)
        data = iter(self.dl)  # 4 batches of size 16 each

        for _ in range(3):  # take 3 virtual steps
            self.calc_per_sample_grads(data, num_steps=1)
            self.optimizer.virtual_step()

        # accumulate on the last step
        self.calc_per_sample_grads(data, num_steps=1)
        self.optimizer.step()

        # .grad should contain the average gradient over the entire dataset
        accumulated_grad = torch.cat(
            [p.grad.reshape(-1) for p in self.model.parameters() if p.requires_grad]
        )

        # the accumulated gradients in .grad without any hooks
        orig_grad = self.effective_batch_grad

        self.assertTrue(
            torch.allclose(accumulated_grad, orig_grad, atol=10e-5, rtol=10e-3),
            f"Values are {accumulated_grad} vs {orig_grad}."
            f"MAD is {(orig_grad - accumulated_grad).abs().mean()}",
        )

    def test_mixed_accumulation(self):
        """
        Calling loss.backward() multiple times aggregates all per-sample
        gradients in .grad_sample. Then, calling optimizer.virtual_step()
        should clip all gradients and aggregate them into one large batch.
        """
        self.setUp_privacy_engine(self.DATA_SIZE)
        data = iter(self.dl)  # 4 batches of size 16 each

        # accumulate per-sample grads for two mini batches
        self.calc_per_sample_grads(data, num_steps=2)
        # take a virtual step
        self.optimizer.virtual_step()
        # accumulate another two mini batches
        self.calc_per_sample_grads(data, num_steps=2)
        # take a step
        self.optimizer.step()

        # .grad should contain the average gradient over the entire dataset
        accumulated_grad = torch.cat(
            [p.grad.reshape(-1) for p in self.model.parameters() if p.requires_grad]
        )

        # the accumulated gradients in .grad without any hooks
        orig_grad = self.effective_batch_grad

        self.assertTrue(
            torch.allclose(accumulated_grad, orig_grad, atol=10e-5, rtol=10e-3)
        )

    def test_grad_sample_erased(self):
        """
        Calling optimizer.step() should erase any accumulated per-sample gradients.
        """
        self.setUp_privacy_engine(2 * self.BATCH_SIZE)
        data = iter(self.dl)  # 4 batches of size 16 each

        for _ in range(2):
            # accumulate per-sample gradients for two mini-batches to form an
            # effective batch of size `2*BATCH_SIZE`. Once an effective batch
            # has been accumulated, we call `optimizer.step()` to clip and
            # average the per-sample gradients. This should erase the
            # `grad_sample` fields for each parameter
            self.calc_per_sample_grads(data, num_steps=2)
            self.optimizer.step()

            for param_name, param in self.model.named_parameters():
                if param.requires_grad:
                    self.assertFalse(
                        hasattr(param, "grad_sample"),
                        f"Per-sample gradients haven't been erased "
                        f"for {param_name}",
                    )

    def test_summed_grad_erased(self):
        """
        Calling optimizer.step() should erase any accumulated clipped gradients.
        """
        self.setUp_privacy_engine(2 * self.BATCH_SIZE)
        data = iter(self.dl)  # 4 batches of size 16 each

        for idx in range(4):
            self.calc_per_sample_grads(data, num_steps=1)

            if idx % 2 == 0:
                # perform a virtual step for each mini-batch
                # this will accumulate clipped gradients in each parameter's
                # `summed_grads` field.
                self.optimizer.virtual_step()
                for param_name, param in self.model.named_parameters():
                    if param.requires_grad:
                        self.assertTrue(
                            hasattr(param, "summed_grad"),
                            f"Clipped gradients aren't accumulated "
                            f"for {param_name}",
                        )
            else:
                # accumulate gradients for two mini-batches to form an
                # effective batch of size `2*BATCH_SIZE`. Once an effective
                # batch has been accumulated, we call `optimizer.step()` to
                # compute the average gradient for the entire batch. This
                # should erase the `summed_grads` fields for each parameter.
                self.optimizer.step()
                for param_name, param in self.model.named_parameters():
                    if param.requires_grad:
                        self.assertFalse(
                            hasattr(param, "summed_grad"),
                            f"Accumulated clipped gradients haven't been erased "
                            f"for {param_name}",
                        )
def main(): parser = argparse.ArgumentParser(description="PyTorch IMDB Example") parser.add_argument( "-b", "--batch-size-test", type=int, default=64, metavar="B", help="input batch size for test (default: 64)", ) parser.add_argument( "-sr", "--sample-rate", type=float, default=0.00256, metavar="SR", help="sample rate used for batch construction (default: 0.00256)", ) parser.add_argument( "-n", "--epochs", type=int, default=10, metavar="N", help="number of epochs to train (default: 10)", ) parser.add_argument( "--lr", type=float, default=0.02, metavar="LR", help="learning rate (default: .02)", ) parser.add_argument( "--sigma", type=float, default=0.56, metavar="S", help="Noise multiplier (default 0.56)", ) parser.add_argument( "-c", "--max-per-sample-grad_norm", type=float, default=1.0, metavar="C", help="Clip per-sample gradients to this norm (default 1.0)", ) parser.add_argument( "--delta", type=float, default=1e-5, metavar="D", help="Target delta (default: 1e-5)", ) parser.add_argument( "--max-sequence-length", type=int, default=256, metavar="SL", help="Longer sequences will be cut to this length (default: 256)", ) parser.add_argument( "--device", type=str, default="cuda", help="GPU ID for this process (default: 'cuda')", ) parser.add_argument( "--save-model", action="store_true", default=False, help="Save the trained model (default: false)", ) parser.add_argument( "--disable-dp", action="store_true", default=False, help="Disable privacy training and just train with vanilla optimizer", ) parser.add_argument( "--secure-rng", action="store_true", default=False, help="Enable Secure RNG to have trustworthy privacy guarantees. Comes at a performance cost", ) parser.add_argument( "--data-root", type=str, default="../imdb", help="Where IMDB is/will be stored" ) parser.add_argument( "-j", "--workers", default=2, type=int, metavar="N", help="number of data loading workers (default: 2)", ) args = parser.parse_args() device = torch.device(args.device) raw_dataset = load_dataset("imdb", cache_dir=args.data_root) tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased") dataset = raw_dataset.map( lambda x: tokenizer( x["text"], truncation=True, max_length=args.max_sequence_length ), batched=True, ) dataset.set_format(type="torch", columns=["input_ids", "label"]) train_dataset = dataset["train"] test_dataset = dataset["test"] if args.secure_rng: try: import torchcsprng as prng except ImportError as e: msg = ( "To use secure RNG, you must install the torchcsprng package! 
" "Check out the instructions here: https://github.com/pytorch/csprng#installation" ) raise ImportError(msg) from e generator = prng.create_random_device_generator("/dev/urandom") else: generator = None train_loader = DataLoader( train_dataset, num_workers=args.workers, generator=generator, batch_sampler=UniformWithReplacementSampler( num_samples=len(train_dataset), sample_rate=args.sample_rate, generator=generator, ), collate_fn=padded_collate, pin_memory=True, ) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=args.batch_size_test, shuffle=False, num_workers=args.workers, collate_fn=padded_collate, pin_memory=True, ) model = SampleNet(vocab_size=len(tokenizer)).to(device) optimizer = optim.Adam(model.parameters(), lr=args.lr) if not args.disable_dp: privacy_engine = PrivacyEngine( model, sample_rate=args.sample_rate, alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)), noise_multiplier=args.sigma, max_grad_norm=args.max_per_sample_grad_norm, secure_rng=args.secure_rng, ) privacy_engine.attach(optimizer) mean_accuracy = 0 for epoch in range(1, args.epochs + 1): train(args, model, train_loader, optimizer, epoch) mean_accuracy = evaluate(args, model, test_loader) torch.save(mean_accuracy, "run_results_imdb_classification.pt")
def main(tiny_images=None, model="cnn", augment=False, use_scattering=False,
         batch_size=2048, mini_batch_size=256, lr=1, lr_start=None, optim="SGD",
         momentum=0.9, noise_multiplier=1, max_grad_norm=0.1, epochs=100,
         bn_noise_multiplier=None, max_epsilon=None, data_size=550000,
         delta=1e-6, logdir=None):
    logger = Logger(logdir)
    device = get_device()

    bs = batch_size
    assert bs % mini_batch_size == 0
    n_acc_steps = bs // mini_batch_size

    train_data, test_data = get_data("cifar10", augment=augment)
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=100, shuffle=False, num_workers=4, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=100, shuffle=False, num_workers=4, pin_memory=True)

    if isinstance(tiny_images, torch.utils.data.Dataset):
        train_data_aug = tiny_images
    else:
        print("loading tiny images...")
        train_data_aug, _ = get_data("cifar10_500K", augment=augment,
                                     aux_data_filename=tiny_images)

    scattering, K, (h, w) = None, None, (None, None)
    pre_scattered = False
    if use_scattering:
        scattering, K, (h, w) = get_scatter_transform("cifar10_500K")
        scattering.to(device)

    # if the whole data fits in memory, pre-compute the scattering
    if use_scattering and data_size <= 50000:
        loader = torch.utils.data.DataLoader(
            train_data_aug, batch_size=100, shuffle=False, num_workers=4)
        train_data_aug = get_scattered_dataset(loader, scattering, device, data_size)
        pre_scattered = True

    assert data_size <= len(train_data_aug)
    num_sup = min(data_size, 50000)
    num_batches = int(np.ceil(50000 / mini_batch_size))  # cifar-10 equivalent

    train_batch_sampler = SemiSupervisedSampler(data_size, num_batches, mini_batch_size)
    train_loader_aug = torch.utils.data.DataLoader(
        train_data_aug,
        batch_sampler=train_batch_sampler,
        num_workers=0 if pre_scattered else 4,
        pin_memory=not pre_scattered)

    rdp_norm = 0
    if model == "cnn":
        if use_scattering:
            save_dir = "bn_stats/cifar10_500K"
            os.makedirs(save_dir, exist_ok=True)
            bn_stats, rdp_norm = scatter_normalization(
                train_loader, scattering, K, device, data_size, num_sup,
                noise_multiplier=bn_noise_multiplier, orders=ORDERS,
                save_dir=save_dir)
            model = CNNS["cifar10"](K, input_norm="BN", bn_stats=bn_stats)
            model = model.to(device)
            if not pre_scattered:
                model = nn.Sequential(scattering, model)
        else:
            model = CNNS["cifar10"](in_channels=3, internal_norm=False)
    elif model == "linear":
        save_dir = "bn_stats/cifar10_500K"
        os.makedirs(save_dir, exist_ok=True)
        bn_stats, rdp_norm = scatter_normalization(
            train_loader, scattering, K, device, data_size, num_sup,
            noise_multiplier=bn_noise_multiplier, orders=ORDERS,
            save_dir=save_dir)
        model = ScatterLinear(K, (h, w), input_norm="BN", bn_stats=bn_stats)
        model = model.to(device)
        if not pre_scattered:
            model = nn.Sequential(scattering, model)
    else:
        raise ValueError(f"Unknown model {model}")
    model.to(device)

    if pre_scattered:
        test_loader = get_scattered_loader(test_loader, scattering, device)

    print(f"model has {get_num_params(model)} parameters")

    if optim == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    privacy_engine = PrivacyEngine(
        model,
        bs,
        data_size,
        alphas=ORDERS,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )
    privacy_engine.attach(optimizer)

    best_acc = 0
    flat_count = 0

    for epoch in range(epochs):
        print(f"\nEpoch: {epoch} ({privacy_engine.steps} steps)")

        train_loss, train_acc = train(model, train_loader_aug, optimizer,
                                      n_acc_steps=n_acc_steps)
        test_loss, test_acc = test(model, test_loader)

        if noise_multiplier > 0:
            print(f"sample_rate={privacy_engine.sample_rate}, "
                  f"mul={privacy_engine.noise_multiplier}, "
                  f"steps={privacy_engine.steps}")
            rdp_sgd = get_renyi_divergence(
                privacy_engine.sample_rate, privacy_engine.noise_multiplier
            ) * privacy_engine.steps
            epsilon, _ = get_privacy_spent(rdp_norm + rdp_sgd, target_delta=delta)
            epsilon2, _ = get_privacy_spent(rdp_sgd, target_delta=delta)
            print(f"ε = {epsilon:.3f} (sgd only: ε = {epsilon2:.3f})")

            if max_epsilon is not None and epsilon >= max_epsilon:
                return
        else:
            epsilon = None

        logger.log_epoch(epoch, train_loss, train_acc, test_loss, test_acc, epsilon)
        logger.log_scalar("epsilon/train", epsilon, epoch)
        logger.log_scalar("cifar10k_loss/train", train_loss, epoch)
        logger.log_scalar("cifar10k_acc/train", train_acc, epoch)

        if test_acc > best_acc:
            best_acc = test_acc
            flat_count = 0
        else:
            flat_count += 1
            if flat_count >= 20:
                print("plateau...")
                return
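# Minimal standalone sketch of the additive RDP composition used above,
# assuming the pre-1.0 opacus.privacy_analysis module: compute_rdp returns
# the RDP curve of the subsampled Gaussian mechanism at each order, and the
# curves of independent mechanisms (here: noisy BN statistics + DP-SGD)
# simply add before conversion to an (eps, delta) guarantee. The q, sigma,
# steps, and delta values are assumed, for illustration only.
from opacus import privacy_analysis

orders = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))
rdp_sgd = privacy_analysis.compute_rdp(
    q=0.004,               # Poisson sampling rate (assumed)
    noise_multiplier=1.0,  # sigma (assumed)
    steps=2000,            # number of noisy gradient steps (assumed)
    orders=orders,
)
rdp_total = rdp_sgd + rdp_sgd  # e.g. compose two mechanisms with equal curves
eps, best_alpha = privacy_analysis.get_privacy_spent(orders, rdp_total, delta=1e-6)
print(f"ε = {eps:.3f} at α = {best_alpha}")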
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "-sr", "--sample-rate", type=float, default=0.001, metavar="SR",
        help="sample rate used for batch construction (default: 0.001)",
    )
    parser.add_argument(
        "--test-batch-size", type=int, default=1024, metavar="TB",
        help="input batch size for testing (default: 1024)",
    )
    parser.add_argument(
        "-n", "--epochs", type=int, default=10, metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "-r", "--n-runs", type=int, default=1, metavar="R",
        help="number of runs to average on (default: 1)",
    )
    parser.add_argument(
        "--lr", type=float, default=0.1, metavar="LR",
        help="learning rate (default: .1)",
    )
    parser.add_argument(
        "--sigma", type=float, default=1.0, metavar="S",
        help="Noise multiplier (default 1.0)",
    )
    parser.add_argument(
        "-c", "--max-per-sample-grad_norm", type=float, default=1.0, metavar="C",
        help="Clip per-sample gradients to this norm (default 1.0)",
    )
    parser.add_argument(
        "--delta", type=float, default=1e-5, metavar="D",
        help="Target delta (default: 1e-5)",
    )
    parser.add_argument(
        "--device", type=str, default="cuda",
        help="GPU ID for this process (default: 'cuda')",
    )
    parser.add_argument(
        "--save-model", action="store_true", default=False,
        help="Save the trained model (default: false)",
    )
    parser.add_argument(
        "--disable-dp", action="store_true", default=False,
        help="Disable privacy training and just train with vanilla SGD",
    )
    parser.add_argument(
        "--secure-rng", action="store_true", default=False,
        help="Enable Secure RNG to have trustworthy privacy guarantees. Comes at a performance cost",
    )
    parser.add_argument(
        "--data-root", type=str, default="../mnist",
        help="Where MNIST is/will be stored",
    )
    args = parser.parse_args()
    device = torch.device(args.device)
    kwargs = {"num_workers": 1, "pin_memory": True}

    if args.secure_rng:
        try:
            import torchcsprng as prng
        except ImportError as e:
            msg = (
                "To use secure RNG, you must install the torchcsprng package! "
                "Check out the instructions here: https://github.com/pytorch/csprng#installation"
            )
            raise ImportError(msg) from e

        generator = prng.create_random_device_generator("/dev/urandom")
    else:
        generator = None

    train_dataset = datasets.MNIST(
        args.data_root,
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((MNIST_MEAN,), (MNIST_STD,)),
        ]),
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        generator=generator,
        batch_sampler=UniformWithReplacementSampler(
            num_samples=len(train_dataset),
            sample_rate=args.sample_rate,
            generator=generator,
        ),
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            args.data_root,
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((MNIST_MEAN,), (MNIST_STD,)),
            ]),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,
    )

    run_results = []
    for _ in range(args.n_runs):
        model = SampleConvNet().to(device)
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0)
        if not args.disable_dp:
            privacy_engine = PrivacyEngine(
                model,
                sample_rate=args.sample_rate,
                alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
                noise_multiplier=args.sigma,
                max_grad_norm=args.max_per_sample_grad_norm,
                secure_rng=args.secure_rng,
            )
            privacy_engine.attach(optimizer)
        for epoch in range(1, args.epochs + 1):
            train(args, model, device, train_loader, optimizer, epoch)
        run_results.append(test(args, model, device, test_loader))

    if len(run_results) > 1:
        print("Accuracy averaged over {} runs: {:.2f}% ± {:.2f}%".format(
            len(run_results), np.mean(run_results) * 100, np.std(run_results) * 100))

    repro_str = (
        f"{model.name()}_{args.lr}_{args.sigma}_"
        f"{args.max_per_sample_grad_norm}_{args.sample_rate}_{args.epochs}")
    torch.save(run_results, f"run_results_{repro_str}.pt")

    if args.save_model:
        torch.save(model.state_dict(), f"mnist_cnn_{repro_str}.pt")
model = Net_embedder(embedders, hidden_dims, num_classes)
print(model)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), args.lr)

if args.epsilon is not None:
    max_epsilon, delta, sensitivity = get_priv_params(args.epsilon)
    privacy_engine = PrivacyEngine(
        model,
        batch_size=args.batch_size,
        sample_size=len(sampler),
        alphas=list(range(2, 32)),
        noise_multiplier=args.noise_multiplier,
        max_grad_norm=args.max_grad_norm,
        target_delta=delta,
    )
    privacy_engine.attach(optimizer)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, args.num_epochs * len(dataloader_train))
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5)

# Training loop
best_loss = np.inf
best_model = None
for i in range(args.num_epochs):
    loss = train(model, dataloader_train, optimizer, criterion, device, scheduler=scheduler)
    # loss = train(model, dataloader_train, optimizer, criterion, device, scheduler=None)
    # scheduler.step(loss)

    log = f"Train Epoch: {i}\tLoss: {loss:.6f}"
    if args.epsilon is not None:
        epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent(delta)
        log += f" (ε = {epsilon:.2f}, δ = {delta}) for α = {best_alpha}"
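# The snippet above relies on a project-specific helper, get_priv_params,
# whose definition is not shown. The following is a purely hypothetical
# sketch of what such a helper might look like; the parameter names, the
# num_samples default, and the delta-below-1/N heuristic are all assumptions,
# not the project's actual code.
def get_priv_params(target_epsilon, num_samples=60000, max_grad_norm=1.0):
    max_epsilon = target_epsilon      # budget at which training should stop
    delta = 1.0 / (10 * num_samples)  # common heuristic: delta well below 1/N
    sensitivity = max_grad_norm       # per-sample gradient clipping bound
    return max_epsilon, delta, sensitivity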
def main():
    parser = ArgParser()
    args = parser.parse_args()

    gen = Generator(args.latent_dim).to(args.device)
    disc = Discriminator().to(args.device)
    if args.device != 'cpu':
        gen = nn.DataParallel(gen, args.gpu_ids)
        disc = nn.DataParallel(disc, args.gpu_ids)

    # gen = gen.apply(weights_init)
    # disc = disc.apply(weights_init)

    gen_opt = torch.optim.RMSprop(gen.parameters(), lr=args.lr)
    disc_opt = torch.optim.RMSprop(disc.parameters(), lr=args.lr)
    gen_scheduler = torch.optim.lr_scheduler.LambdaLR(gen_opt, lr_lambda=lr_lambda(args.num_epochs))
    disc_scheduler = torch.optim.lr_scheduler.LambdaLR(disc_opt, lr_lambda=lr_lambda(args.num_epochs))

    disc_loss_fn = DiscriminatorLoss().to(args.device)
    gen_loss_fn = GeneratorLoss().to(args.device)

    # dataset = Dataset()
    dataset = MNISTDataset()
    loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers)

    logger = TrainLogger(args, len(loader), phase=None)
    logger.log_hparams(args)

    if args.privacy_noise_multiplier != 0:
        privacy_engine = PrivacyEngine(
            disc,
            batch_size=args.batch_size,
            sample_size=len(dataset),
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            # use the CLI flag rather than a hard-coded 0.8, since the same
            # flag gates whether DP is enabled at all
            noise_multiplier=args.privacy_noise_multiplier,
            max_grad_norm=0.02,
            batch_first=True,
        )
        privacy_engine.attach(disc_opt)
        privacy_engine.to(args.device)

    for epoch in range(args.num_epochs):
        logger.start_epoch()
        for cur_step, img in enumerate(tqdm(loader, dynamic_ncols=True)):
            logger.start_iter()
            img = img.to(args.device)

            fake, disc_loss = None, None
            for _ in range(args.step_train_discriminator):
                disc_opt.zero_grad()
                fake_noise = get_noise(args.batch_size, args.latent_dim, device=args.device)
                fake = gen(fake_noise)
                disc_loss = disc_loss_fn(img, fake, disc)
                disc_loss.backward()
                disc_opt.step()

            gen_opt.zero_grad()
            fake_noise_2 = get_noise(args.batch_size, args.latent_dim, device=args.device)
            fake_2 = gen(fake_noise_2)
            gen_loss = gen_loss_fn(img, fake_2, disc)
            gen_loss.backward()
            gen_opt.step()

            if args.privacy_noise_multiplier != 0:
                epsilon, best_alpha = privacy_engine.get_privacy_spent(args.privacy_delta)

            logger.log_iter_gan_from_latent_vector(
                img, fake, gen_loss, disc_loss,
                epsilon if args.privacy_noise_multiplier != 0 else 0)
            logger.end_iter()
        logger.end_epoch()
        gen_scheduler.step()
        disc_scheduler.step()
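# Minimal sketch (an assumed extension, not part of the example above): stop
# GAN training once the privacy budget is exhausted. Only the discriminator
# is trained with DP-SGD here; the generator never touches real data and
# learns only through the privatized discriminator, so by the post-processing
# property it enjoys the same (eps, delta) guarantee. Assumes
# `privacy_engine` and `args` from the example are in scope; MAX_EPSILON is
# an illustrative value.
MAX_EPSILON = 10.0
epsilon, _ = privacy_engine.get_privacy_spent(args.privacy_delta)
if epsilon >= MAX_EPSILON:
    print(f"privacy budget exhausted (ε = {epsilon:.2f}); stopping training")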