def run():
    options = parse_options()
    print(options)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    os.makedirs(options.data_dir, exist_ok=True)
    os.makedirs(options.output_dir, exist_ok=True)
    os.makedirs(options.model_dir, exist_ok=True)

    with open(os.path.join(options.output_dir, 'options.json'), 'w') as f:
        json.dump(vars(options), f, indent=4)

    if options.restore:
        generator = torch.load(os.path.join(options.model_dir, 'generator.pt'))
        critic = torch.load(os.path.join(options.model_dir, 'critic.pt'))
    else:
        generator = Generator(options.image_size, options.state_size)
        critic = Critic(options.image_size)
        generator.apply(init_weights)
        critic.apply(init_weights)

    generator = generator.to(device)
    critic = critic.to(device)

    transform = transforms.Compose([
        transforms.Resize((options.image_size, options.image_size)),
        transforms.CenterCrop(options.image_size),  # redundant?
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    if options.dataset == 'lsun':
        training_class = options.image_class + '_train'
        dataset = datasets.LSUN(options.data_dir, classes=[training_class], transform=transform)
    else:
        dataset = datasets.ImageFolder(root=options.data_dir, transform=transform)

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=options.batch_size,
                                             num_workers=4,
                                             shuffle=True,
                                             drop_last=True,
                                             pin_memory=True)

    train(generator, critic, dataloader, device, options)
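# A minimal sketch of the init_weights hook applied above; init_weights is not
# defined in this file, so the DCGAN-style N(0, 0.02) initialisation shown here
# is an assumption about what it does, not the repository's actual implementation.
def init_weights_sketch(module):
    classname = module.__class__.__name__
    if classname.find('Conv') != -1:
        torch.nn.init.normal_(module.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        torch.nn.init.normal_(module.weight.data, 1.0, 0.02)
        torch.nn.init.constant_(module.bias.data, 0.0)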
class TD3:
    def __init__(self, env, state_dim, action_dim, max_action, gamma=0.99,
                 tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.actor.to(self.device)
        self.actor_target.to(self.device)
        self.critic.to(self.device)
        self.critic_target.to(self.device)

        self.env = env
        self.total_it = 0

    def select_action(self, state, noise=0.1):
        action = self.actor(state.to(self.device)).data.cpu().numpy().flatten()
        if noise != 0:
            action = action + np.random.normal(0, noise, size=self.env.action_space.shape[0])
        return action.clip(self.env.action_space.low, self.env.action_space.high)

    def train(self, replay_buffer, batch_size=128):
        self.total_it += 1
        states, states_, actions, rewards, terminal = replay_buffer.sample_buffer(batch_size)

        with torch.no_grad():
            # target policy smoothing: add clipped noise to the target action
            noise = (torch.randn_like(actions.to(self.device)) *
                     self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(states_.to(self.device)) +
                           noise).clamp(-self.max_action, self.max_action)

            # compute the target Q value (clipped double Q-learning)
            target_q1, target_q2 = self.critic_target(states_.to(self.device),
                                                      next_action.to(self.device))
            target_q = torch.min(target_q1, target_q2)
            target_q = rewards + terminal * self.gamma * target_q[:, 0].cpu()

        # get current Q estimates
        current_q1, current_q2 = self.critic(states.to(self.device),
                                             actions.to(self.device))

        # compute critic loss
        critic_loss = F.mse_loss(current_q1[:, 0], target_q.to(self.device)) + \
            F.mse_loss(current_q2[:, 0], target_q.to(self.device))

        # optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # compute actor loss
            actor_loss = -self.critic.q1(states.to(self.device),
                                         self.actor(states.to(self.device))).mean()

            # optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def save(self, filename):
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")
        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

    def load(self, filename):
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
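# A minimal sketch of a replay buffer matching the sample_buffer() interface used by
# TD3.train() above; the actual buffer class is not shown in this file, so the field
# order and the "terminal" convention (1.0 = non-terminal mask) are assumptions.
import numpy as np
import torch


class ReplayBufferSketch:
    def __init__(self, max_size, state_dim, action_dim):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.states = np.zeros((max_size, state_dim), dtype=np.float32)
        self.next_states = np.zeros((max_size, state_dim), dtype=np.float32)
        self.actions = np.zeros((max_size, action_dim), dtype=np.float32)
        self.rewards = np.zeros(max_size, dtype=np.float32)
        self.not_done = np.zeros(max_size, dtype=np.float32)

    def store_transition(self, state, action, reward, next_state, done):
        self.states[self.ptr] = state
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.next_states[self.ptr] = next_state
        self.not_done[self.ptr] = 1.0 - float(done)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_buffer(self, batch_size):
        # sample a random batch and return torch tensors in the order expected by TD3.train()
        idx = np.random.randint(0, self.size, size=batch_size)
        return (torch.from_numpy(self.states[idx]),
                torch.from_numpy(self.next_states[idx]),
                torch.from_numpy(self.actions[idx]),
                torch.from_numpy(self.rewards[idx]),
                torch.from_numpy(self.not_done[idx]))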
class DDPGAgent:
    """
    Encapsulates the functioning of the DDPG agent
    """

    def __init__(self, state_dim, action_dim, max_action, device, memory_capacity=10000,
                 discount=0.99, tau=0.005, sigma=0.2, theta=0.15, actor_lr=1e-4,
                 critic_lr=1e-3, train_mode=True):
        self.train_mode = train_mode  # whether the agent is in training or testing mode

        self.state_dim = state_dim  # dimension of the state space
        self.action_dim = action_dim  # dimension of the action space
        self.device = device  # defines which cuda or cpu device is to be used to run the networks
        self.discount = discount  # denoted as gamma in the equation for computation of the Q-value
        self.tau = tau  # defines the factor used for Polyak averaging (i.e., soft updating of the target networks)
        self.max_action = max_action  # the max value of the range in the action space (assumes a symmetric range in the action space)

        # create an instance of the replay buffer
        self.memory = ReplayMemory(memory_capacity)

        # create an instance of the noise generating process
        self.ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(self.action_dim), sigma=sigma, theta=theta)

        # instances of the networks for the actor and the critic
        self.actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.critic = Critic(state_dim, action_dim, critic_lr)

        # instances of the target networks for the actor and the critic
        self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.target_critic = Critic(state_dim, action_dim, critic_lr)

        # initialise the targets to the same weights as their corresponding current networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # since we do not learn/train on the target networks
        self.target_actor.eval()
        self.target_critic.eval()

        # for test mode
        if not self.train_mode:
            self.actor.eval()
            self.critic.eval()
            self.ou_noise = None

        self.actor.to(self.device)
        self.critic.to(self.device)
        self.target_actor.to(self.device)
        self.target_critic.to(self.device)

    def select_action(self, state):
        """
        Function to return the appropriate action for the given state.
        During training, it adds a zero-mean OU noise to the action to encourage exploration.
        During testing, no noise is added to the action decision.

        Parameters
        ---
        state: vector or tensor
            The current state of the environment as observed by the agent

        Returns
        ---
        A numpy array representing the noisy action to be performed by the agent in the current state
        """

        if not torch.is_tensor(state):
            state = torch.tensor([state], dtype=torch.float32).to(self.device)

        self.actor.eval()
        act = self.actor(state).cpu().data.numpy().flatten()  # performs inference using the actor based on the current state as the input and returns the corresponding np array
        self.actor.train()

        noise = 0.0

        ## for adding Gaussian noise (to use, update the code to pass the exploration noise as input)
        # if self.train_mode:
        #     noise = np.random.normal(0.0, exploration_noise, size=act.shape)  # generate the zero-mean gaussian noise with standard deviation determined by exploration_noise

        # for adding OU noise
        if self.train_mode:
            noise = self.ou_noise.generate_noise()

        noisy_action = act + noise
        noisy_action = noisy_action.clip(min=-self.max_action, max=self.max_action)  # to ensure that the noisy action being returned is within the limit of "legal" actions afforded to the agent; assumes action range is symmetric

        return noisy_action

    def learn(self, batchsize):
        """
        Function to perform the updates on the 4 neural networks that run the DDPG algorithm.

        Parameters
        ---
        batchsize: int
            Number of experiences to be randomly sampled from the memory for the agent to learn from

        Returns
        ---
        none
        """

        if len(self.memory) < batchsize:
            return

        states, actions, next_states, rewards, dones = self.memory.sample(batchsize, self.device)  # a batch of experiences randomly sampled from the memory

        # ensure that the actions and rewards tensors have the appropriate shapes
        actions = actions.view(-1, self.action_dim)
        rewards = rewards.view(-1, 1)

        with torch.no_grad():
            # generate target actions
            target_action = self.target_actor(next_states)

            # calculate TD-Target
            target_q = self.target_critic(next_states, target_action)
            target_q[dones] = 0.0  # being in a terminal state implies there are no more future states that the agent would encounter in the given episode, so set the associated Q-value to 0
            y = rewards + self.discount * target_q

        current_q = self.critic(states, actions)

        critic_loss = F.mse_loss(current_q, y)
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # actor loss is calculated by gradient ascent along the critic, thus the negative sign is applied to convert it to a gradient descent
        pred_current_actions = self.actor(states)
        pred_current_q = self.critic(states, pred_current_actions)
        actor_loss = -pred_current_q.mean()

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # apply slow-update to the target networks
        self.soft_update_targets()

    def soft_update_net(self, source_net_params, target_net_params):
        """
        Function to perform Polyak averaging to update the parameters of the provided network

        Parameters
        ---
        source_net_params: list
            trainable parameters of the source, i.e. current version of the network
        target_net_params: list
            trainable parameters of the corresponding target network

        Returns
        ---
        none
        """

        for source_param, target_param in zip(source_net_params, target_net_params):
            target_param.data.copy_(self.tau * source_param.data + (1 - self.tau) * target_param.data)

    def soft_update_targets(self):
        """
        Function that calls Polyak averaging on both target networks

        Parameters
        ---
        none

        Returns
        ---
        none
        """

        self.soft_update_net(self.actor.parameters(), self.target_actor.parameters())
        self.soft_update_net(self.critic.parameters(), self.target_critic.parameters())

    def save(self, path, model_name):
        """
        Function to save the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is to be saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.save_model('{}/{}_actor'.format(path, model_name))
        self.critic.save_model('{}/{}_critic'.format(path, model_name))

    def load(self, path, model_name):
        """
        Function to load the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.load_model('{}/{}_actor'.format(path, model_name))
        self.critic.load_model('{}/{}_critic'.format(path, model_name))
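# A minimal sketch of the Ornstein-Uhlenbeck noise process used by DDPGAgent above;
# the OrnsteinUhlenbeckNoise class is not defined in this file, so the dt/x0 defaults
# here are assumptions, while the constructor arguments (mu, sigma, theta) and the
# generate_noise() name follow how the class is used above.
import numpy as np


class OrnsteinUhlenbeckNoiseSketch:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x_prev = x0 if x0 is not None else np.zeros_like(mu)

    def generate_noise(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt +
             self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x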
class TD3Agent:
    """
    Encapsulates the functioning of the TD3 agent
    """

    def __init__(self, state_dim, action_dim, max_action, device, memory_capacity=10000,
                 discount=0.99, update_freq=2, tau=0.005, policy_noise_std=0.2,
                 policy_noise_clip=0.5, actor_lr=1e-3, critic_lr=1e-3, train_mode=True):
        self.train_mode = train_mode  # whether the agent is in training or testing mode

        self.state_dim = state_dim  # dimension of the state space
        self.action_dim = action_dim  # dimension of the action space
        self.device = device  # defines which cuda or cpu device is to be used to run the networks
        self.discount = discount  # denoted as gamma in the equation for computation of the Q-value
        self.update_freq = update_freq  # defines how frequently the actor and the targets should be updated
        self.tau = tau  # defines the factor used for Polyak averaging (i.e., soft updating of the target networks)
        self.max_action = max_action  # the max value of the range in the action space (assumes a symmetric range in the action space)
        self.policy_noise_clip = policy_noise_clip  # max range within which the noise for target policy smoothing must be contained
        self.policy_noise_std = policy_noise_std  # standard deviation, i.e. sigma, of the Gaussian noise applied for target policy smoothing

        # create an instance of the replay buffer
        self.memory = ReplayMemory(memory_capacity)

        # instances of the networks for the actor and the two critics
        self.actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.critic = Critic(state_dim, action_dim, critic_lr)  # the critic class encapsulates two copies of the neural network for the two critics used in TD3

        # instances of the target networks for the actor and the two critics
        self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.target_critic = Critic(state_dim, action_dim, critic_lr)

        # initialise the targets to the same weights as their corresponding current networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # since we do not learn/train on the target networks
        self.target_actor.eval()
        self.target_critic.eval()

        # for test mode
        if not self.train_mode:
            self.actor.eval()
            self.critic.eval()

        self.actor.to(self.device)
        self.critic.to(self.device)
        self.target_actor.to(self.device)
        self.target_critic.to(self.device)

    def select_action(self, state, exploration_noise=0.1):
        """
        Function to return the appropriate action for the given state.
        During training, it adds a zero-mean Gaussian noise with std=exploration_noise
        to the action to encourage exploration. No noise is added to the action decision
        during testing mode.

        Parameters
        ---
        state: vector or tensor
            The current state of the environment as observed by the agent
        exploration_noise: float, optional
            Standard deviation, i.e. sigma, of the Gaussian noise to be added to the agent's action to encourage exploration

        Returns
        ---
        A numpy array representing the noisy action to be performed by the agent in the current state
        """

        if not torch.is_tensor(state):
            state = torch.tensor([state], dtype=torch.float32).to(self.device)

        act = self.actor(state).cpu().data.numpy().flatten()  # performs inference using the actor based on the current state as the input and returns the corresponding np array

        if not self.train_mode:
            exploration_noise = 0.0  # since we do not need noise to be added to the action during testing

        noise = np.random.normal(0.0, exploration_noise, size=act.shape)  # generate the zero-mean gaussian noise with standard deviation determined by exploration_noise

        noisy_action = act + noise
        noisy_action = noisy_action.clip(min=-self.max_action, max=self.max_action)  # to ensure that the noisy action being returned is within the limit of "legal" actions afforded to the agent; assumes action range is symmetric

        return noisy_action

    def learn(self, current_iteration, batchsize):
        """
        Function to perform the updates on the 6 neural networks that run the TD3 algorithm.

        Parameters
        ---
        current_iteration: int
            Total number of steps that have been performed by the agent
        batchsize: int
            Number of experiences to be randomly sampled from the memory for the agent to learn from

        Returns
        ---
        none
        """

        if len(self.memory) < batchsize:
            return

        states, actions, next_states, rewards, dones = self.memory.sample(batchsize, self.device)  # a batch of experiences randomly sampled from the memory

        # ensure that the actions and rewards tensors have the appropriate shapes
        actions = actions.view(-1, self.action_dim)
        rewards = rewards.view(-1, 1)

        # generate noisy target actions for target policy smoothing
        pred_action = self.target_actor(next_states)
        noise = torch.zeros_like(pred_action).normal_(0, self.policy_noise_std).to(self.device)
        noise = torch.clamp(noise, min=-self.policy_noise_clip, max=self.policy_noise_clip)
        noisy_pred_action = torch.clamp(pred_action + noise, min=-self.max_action, max=self.max_action)

        # calculate TD-Target using Clipped Double Q-learning
        target_q1, target_q2 = self.target_critic(next_states, noisy_pred_action)
        target_q = torch.min(target_q1, target_q2)
        target_q[dones] = 0.0  # being in a terminal state implies there are no more future states that the agent would encounter in the given episode, so set the associated Q-value to 0
        y = rewards + self.discount * target_q

        current_q1, current_q2 = self.critic(states, actions)  # the critic class encapsulates two copies of the neural network, thereby returning two Q values with each forward pass

        critic_loss = F.mse_loss(current_q1, y) + F.mse_loss(current_q2, y)  # the losses of the two critics are added as there is only one optimiser shared between the two networks

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # delayed policy and target updates
        if current_iteration % self.update_freq == 0:
            # actor loss is calculated by gradient ascent along critic 1, thus the negative sign is applied to convert it to a gradient descent
            pred_current_actions = self.actor(states)
            pred_current_q1, _ = self.critic(states, pred_current_actions)  # since we only need the Q-value from critic 1, we can ignore the second value obtained through the forward pass
            actor_loss = -pred_current_q1.mean()

            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # apply slow-update to all three target networks
            self.soft_update_targets()

    def soft_update_net(self, source_net_params, target_net_params):
        """
        Function to perform Polyak averaging to update the parameters of the provided network

        Parameters
        ---
        source_net_params: list
            trainable parameters of the source, i.e. current version of the network
        target_net_params: list
            trainable parameters of the corresponding target network

        Returns
        ---
        none
        """

        for source_param, target_param in zip(source_net_params, target_net_params):
            target_param.data.copy_(self.tau * source_param.data + (1 - self.tau) * target_param.data)

    def soft_update_targets(self):
        """
        Function that calls Polyak averaging on all three target networks

        Parameters
        ---
        none

        Returns
        ---
        none
        """

        self.soft_update_net(self.actor.parameters(), self.target_actor.parameters())
        self.soft_update_net(self.critic.parameters(), self.target_critic.parameters())

    def save(self, path, model_name):
        """
        Function to save the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is to be saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.save_model('{}/{}_actor'.format(path, model_name))
        self.critic.save_model('{}/{}_critic'.format(path, model_name))

    def load(self, path, model_name):
        """
        Function to load the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.load_model('{}/{}_actor'.format(path, model_name))
        self.critic.load_model('{}/{}_critic'.format(path, model_name))
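# A minimal sketch of the Actor network interface assumed by DDPGAgent and TD3Agent
# above: the constructor takes (state_dim, action_dim, max_action, lr), the network
# owns its optimizer (self.actor.optimizer is used in learn()), and it provides
# save_model()/load_model(). The layer sizes and tanh output scaling are assumptions;
# the repository's actual Actor and Critic classes are not shown in this file.
import torch
import torch.nn as nn
import torch.optim as optim


class ActorSketch(nn.Module):
    def __init__(self, state_dim, action_dim, max_action, lr):
        super().__init__()
        self.max_action = max_action
        self.net = nn.Sequential(
            nn.Linear(state_dim, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, action_dim), nn.Tanh(),
        )
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state):
        # scale the tanh output to the symmetric action range [-max_action, max_action]
        return self.max_action * self.net(state)

    def save_model(self, filename):
        torch.save(self.state_dict(), filename)

    def load_model(self, filename):
        self.load_state_dict(torch.load(filename))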
def adversarial_debiasing(model_state_dict, data, config, device):
    logger.info('Training Adversarial model.')
    actor = load_model(data.num_features, config.get('hyperparameters', {}))
    actor.load_state_dict(model_state_dict)
    actor.to(device)

    hid = config['hyperparameters']['hid'] if 'hyperparameters' in config else 32
    critic = Critic(hid * config['adversarial']['batch_size'],
                    num_deep=config['adversarial']['num_deep'],
                    hid=hid)
    critic.to(device)
    critic_optimizer = optim.Adam(critic.parameters())
    critic_loss_fn = torch.nn.MSELoss()

    actor_optimizer = optim.Adam(actor.parameters(), lr=config['adversarial']['lr'])
    actor_loss_fn = torch.nn.BCELoss()

    for epoch in range(config['adversarial']['epochs']):
        # train the critic to predict the actor's bias while the actor is frozen
        for param in critic.parameters():
            param.requires_grad = True
        for param in actor.parameters():
            param.requires_grad = False
        actor.eval()
        critic.train()

        for step in range(config['adversarial']['critic_steps']):
            critic_optimizer.zero_grad()
            indices = torch.randint(0, data.X_valid.size(0), (config['adversarial']['batch_size'],))
            cX_valid = data.X_valid_gpu[indices]
            cy_valid = data.y_valid[indices]
            cp_valid = data.p_valid[indices]
            with torch.no_grad():
                scores = actor(cX_valid)[:, 0].reshape(-1).cpu().numpy()
            bias = compute_bias(scores, cy_valid.numpy(), cp_valid, config['metric'])
            res = critic(actor.trunc_forward(cX_valid))
            loss = critic_loss_fn(torch.tensor([bias], device=device), res[0])
            loss.backward()
            train_loss = loss.item()
            critic_optimizer.step()
            if (epoch % 10 == 0) and (step % 100 == 0):
                logger.info(f'=======> Critic Epoch: {(epoch, step)} loss: {train_loss}')

        # train the actor against the frozen critic's bias estimate
        for param in critic.parameters():
            param.requires_grad = False
        for param in actor.parameters():
            param.requires_grad = True
        actor.train()
        critic.eval()

        for step in range(config['adversarial']['actor_steps']):
            actor_optimizer.zero_grad()
            indices = torch.randint(0, data.X_valid.size(0), (config['adversarial']['batch_size'],))
            cy_valid = data.y_valid_gpu[indices]
            cX_valid = data.X_valid_gpu[indices]
            pred_bias = critic(actor.trunc_forward(cX_valid))
            bceloss = actor_loss_fn(actor(cX_valid)[:, 0], cy_valid)
            # loss = lam*abs(pred_bias) + (1-lam)*loss
            objloss = max(
                1,
                config['adversarial']['lambda'] *
                (abs(pred_bias[0][0]) - config['objective']['epsilon'] +
                 config['adversarial']['margin']) + 1) * bceloss
            objloss.backward()
            train_loss = objloss.item()
            actor_optimizer.step()
            if (epoch % 10 == 0) and (step % 100 == 0):
                logger.info(f'=======> Actor Epoch: {(epoch, step)} loss: {train_loss}')

        if epoch % 10 == 0:
            with torch.no_grad():
                scores = actor(data.X_valid_gpu)[:, 0].reshape(-1, 1).cpu().numpy()
            _, best_adv_obj = get_best_thresh(scores, np.linspace(0, 1, 1001), data, config,
                                              valid=False, margin=config['adversarial']['margin'])
            logger.info(f'Objective: {best_adv_obj}')

    logger.info('Finding optimal threshold for Adversarial model.')
    with torch.no_grad():
        scores = actor(data.X_valid_gpu)[:, 0].reshape(-1, 1).cpu().numpy()
    best_adv_thresh, _ = get_best_thresh(scores, np.linspace(0, 1, 1001), data, config,
                                         valid=False, margin=config['adversarial']['margin'])

    logger.info('Evaluating Adversarial model on best threshold.')
    with torch.no_grad():
        labels = (actor(data.X_valid_gpu)[:, 0] > best_adv_thresh).reshape(-1, 1).cpu().numpy()
    results_valid = get_valid_objective(labels, data, config)
    logger.info(f'Results: {results_valid}')

    with torch.no_grad():
        labels = (actor(data.X_test_gpu)[:, 0] > best_adv_thresh).reshape(-1, 1).cpu().numpy()
    results_test = get_test_objective(labels, data, config)

    return results_valid, results_test
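# An illustrative sketch of the kind of group-fairness quantity compute_bias() above
# might return. The repository's actual compute_bias() and the options behind
# config['metric'] are not shown in this file, so this demographic-parity example is
# purely an assumption for illustration, not the project's metric.
import numpy as np


def demographic_parity_difference_sketch(scores, protected, thresh=0.5):
    """Difference in positive-prediction rates between the two protected groups."""
    preds = scores > thresh
    rate_a = preds[protected == 1].mean()
    rate_b = preds[protected == 0].mean()
    return rate_a - rate_b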