import os
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import Adam

# Project-local modules (Config, CnnDQN, ReplayBuffer, PrioritizedReplayBuffer,
# LinearSchedule, get_class_attr_val) are assumed to be importable from the
# surrounding repository.


class CnnDDQNAgent:
    """Double DQN agent with a convolutional Q-network and optional prioritized replay."""

    def __init__(self, config: Config):
        self.config = config
        self.is_training = True

        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            # Anneal beta over the configured number of iterations,
            # defaulting to the full training length.
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        """Select an action epsilon-greedily from the online network."""
        if epsilon is None:
            epsilon = self.config.epsilon_min

        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        """Sample a batch and perform one Double-DQN update; `fr` is the current frame count."""
        if self.config.prioritized_replay:
            experience = self.buffer.sample(
                self.config.batch_size, beta=self.beta_schedule.value(fr))
            (s0, a, r, s1, done, weights, batch_idxes) = experience
        else:
            s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)
            weights, batch_idxes = np.ones_like(r), None

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)
        weights = torch.tensor(weights, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()
            weights = weights.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        # Double DQN: the online network selects the greedy next action,
        # the target network evaluates it.
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        # TD error between the current estimate and the target,
        # used below to refresh the priorities.
        td_errors = q_value - expected_q_value

        # Detach expected_q_value so gradients do not flow into the target.
        loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none')
        loss = (loss * weights).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if self.config.prioritized_replay:
            new_priorities = (np.abs(td_errors.detach().cpu().numpy())
                              + self.config.prioritized_replay_eps)
            self.buffer.update_priorities(batch_idxes, new_priorities)

        # Periodically copy the online weights into the target network.
        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
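
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal training loop for
# CnnDDQNAgent, assuming a Gym-style Atari environment and a Config object
# exposing the fields read above (frames, batch_size, epsilon_min, ...).
# The `eps_decay` and `output` fields, the linear epsilon schedule, and the
# buffer's `add`/`size` method names are illustrative assumptions.
# ---------------------------------------------------------------------------
def train_cnn_ddqn(agent: "CnnDDQNAgent", env, config: "Config"):
    state = env.reset()
    for fr in range(1, config.frames + 1):
        # Assumed linear epsilon schedule from 1.0 down to epsilon_min.
        epsilon = max(config.epsilon_min, 1.0 - fr / config.eps_decay)
        action = agent.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)

        # Store the transition; the buffer's `add` signature is assumed here.
        agent.buffer.add(state, action, reward, next_state, done)
        state = next_state

        # Update the networks once enough transitions have been collected.
        if agent.buffer.size() > config.batch_size:
            agent.learning(fr)

        # Periodic checkpointing; the `output` config field is assumed.
        if fr % 10000 == 0:
            agent.save_checkpoint(fr, config.output)

        if done:
            state = env.reset()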
# Project-local modules (QNetwork, DuelingQNetwork, ReplayBuffer,
# PrioritizedReplayBuffer), the hyperparameter constants (BUFFER_SIZE,
# BATCH_SIZE, GAMMA, TAU, LEARNING_RATE, THRESHOLD), and the torch `device`
# are assumed to be defined elsewhere in the surrounding module.


class Agent():
    """
    Class implementation of a so-called "intelligent" agent.
    This agent interacts with and learns from the environment.
    """

    double_dqn = False
    """True to use the Double-DQN method."""

    dueling_network = False
    """True to use the Dueling Network (DN) method."""

    prioritized_replay = False
    """True to use the Prioritized Replay memory buffer."""

    def __init__(self, state_size, action_size, seed, lr_decay=9999e-4,
                 double_dqn=False, dueling_network=False, prioritized_replay=False):
        """
        Initialize an Agent instance.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            lr_decay (float): Multiplicative factor of learning rate decay
            double_dqn (bool): Toggle for using the Double-DQN method
            dueling_network (bool): Toggle for using the Dueling Network (DN) method
            prioritized_replay (bool): Toggle for using the Prioritized Replay method
        """
        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.dueling_network = dueling_network
        self.prioritized_replay = prioritized_replay

        # Q-Network hidden layers.
        hidden_layers = [128, 32]

        if self.dueling_network:
            # The Dueling Network (DN) additionally needs a state-value stream.
            hidden_state_value = [64, 32]
            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers, hidden_state_value).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers, hidden_state_value).to(device)
            self.qnetwork_target.eval()
        else:
            # Plain Deep Q-Network (DQN).
            self.qnetwork_local = QNetwork(
                state_size, action_size, seed, hidden_layers).to(device)
            self.qnetwork_target = QNetwork(
                state_size, action_size, seed, hidden_layers).to(device)
            self.qnetwork_target.eval()

        # Optimize with Adam and decay the learning rate exponentially.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LEARNING_RATE)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

        # Use the Prioritized Replay memory buffer if enabled.
        if self.prioritized_replay:
            self.memory = PrioritizedReplayBuffer(
                action_size, BUFFER_SIZE, BATCH_SIZE, seed, device,
                alpha=0.6, beta=0.4, beta_scheduler=1.0)
        else:
            # Otherwise use the plain Replay memory buffer.
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)

        # Initialize the time step (counts up to THRESHOLD before a learning step).
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """
        Save an experience and learn every THRESHOLD time steps.

        Params
        ======
            state (array_like): Current state
            action (int): Action taken
            reward (float): Reward received
            next_state (array_like): Next state
            done (bool): Whether the episode has terminated
        """
        # Save the experience in replay memory.
        self.memory.add(state, action, reward, next_state, done)

        # Learn every THRESHOLD time steps.
        self.t_step = (self.t_step + 1) % THRESHOLD
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn.
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """
        Return the action for a given state as per the current policy.

        Params
        ======
            state (array_like): Current state
            eps (float): Epsilon (ε), for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection.
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            # Put the network back in training mode.
            self.qnetwork_local.train()
            # Return the greedy action.
            return np.argmax(action_values.cpu().data.numpy())
        else:
            # Return a random action.
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """
        Update value parameters using a given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done, w) tensors
            gamma (float): Discount factor
        """
        # Unpack the batch.
        states, actions, rewards, next_states, dones, w = experiences

        # Compute the TD targets without tracking gradients.
        with torch.no_grad():
            if self.double_dqn:
                # Double-DQN: select the greedy actions with the local network ...
                next_action_rewards_local = self.qnetwork_local(next_states)
                greedy_actions_local = next_action_rewards_local.max(dim=1, keepdim=True)[1]
                # ... and evaluate them with the target network.
                next_action_rewards_target = self.qnetwork_target(next_states)
                target_rewards = next_action_rewards_target.gather(1, greedy_actions_local)
            else:
                # Fixed Q-target: take the maximum value predicted by the target network.
                next_action_rewards = self.qnetwork_target(next_states)
                target_rewards = next_action_rewards.max(dim=1, keepdim=True)[0]

            # Discounted targets; the bootstrap term is zeroed for terminal states.
            target_rewards = rewards + (gamma * target_rewards * (1 - dones))

        # Q-values predicted by the local network, shape [batch_size, action_size].
        expected_action_rewards = self.qnetwork_local(states)
        # Q-value of the action actually taken, shape [batch_size, 1].
        expected_rewards = expected_action_rewards.gather(1, actions)

        if self.prioritized_replay:
            # Prioritized Replay: weight the squared TD errors by the
            # importance-sampling weights and refresh the priorities with
            # the absolute TD errors.
            td_error = target_rewards - expected_rewards
            with torch.no_grad():
                self.memory.update_priorities(td_error.abs().squeeze())
            loss = (td_error.squeeze().pow(2) * w).mean()
        else:
            # Plain MSE loss against the fixed targets.
            loss = F.mse_loss(expected_rewards, target_rewards)

        # Perform the back-propagation and step the learning-rate scheduler.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters: θ_target = τ * θ_local + (1 - τ) * θ_target.

        Params
        ======
            local_model (PyTorch model): Weights will be copied from
            target_model (PyTorch model): Weights will be copied to
            tau (float): Interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
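
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal epsilon-greedy
# training loop for Agent, assuming a classic Gym-style environment with a
# discrete action space. The episode count and epsilon schedule values are
# illustrative assumptions, not taken from the original code.
# ---------------------------------------------------------------------------
def train_agent(agent, env, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0.0
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            # Store the transition and (every THRESHOLD steps) trigger a learning update.
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
        scores.append(score)
        # Decay epsilon after each episode.
        eps = max(eps_end, eps_decay * eps)
    return scores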