class DDPGAgent(Agent): """ The DDPGAgent class implements a trainable DDPG agent. Parameters ---------- logger: Logger The variable specifies a logger for model management, plotting and printing. obs_dim: int The variable specifies the dimension of observation space vector. action_space: ndarray The variable specifies the action space of environment. userconfig: The variable specifies the config settings. """ def __init__(self, logger, obs_dim, action_space, userconfig): super().__init__(logger=logger, obs_dim=obs_dim, action_dim=action_space.shape[0], userconfig=userconfig) self._observation_dim = obs_dim self._action_space = action_space self._action_n = action_space.shape[0] self._config = { "eps": 0.05, "discount": 0.95, "buffer_size": int(1e5), "batch_size": 128, "learning_rate_actor": 0.0002, "learning_rate_critic": 0.0002, "hidden_sizes": [256, 256], 'tau': 0.0001 } self._config.update(userconfig) self._eps = self._config['eps'] self._tau = self._config['tau'] self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') self.eval_mode = False if self._config['lr_milestones'] is None: raise ValueError( 'lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300' ) lr_milestones = [ int(x) for x in (self._config['lr_milestones'][0]).split(' ') ] # Critic self.critic = Critic( self._observation_dim, self._action_n, hidden_sizes=self._config['hidden_sizes'], learning_rate=self._config['learning_rate_critic'], lr_milestones=lr_milestones, lr_factor=self._config['lr_factor'], device=self._config['device']) self.critic_target = Critic( self._observation_dim, self._action_n, hidden_sizes=self._config['hidden_sizes'], learning_rate=self._config['learning_rate_critic'], lr_milestones=lr_milestones, lr_factor=self._config['lr_factor'], device=self._config['device']) # Actor self.actor = Actor(self._observation_dim, self._action_n, hidden_sizes=self._config['hidden_sizes'], learning_rate=self._config['learning_rate_actor'], lr_milestones=lr_milestones, lr_factor=self._config['lr_factor'], device=self._config['device']) self.actor_target = Actor( self._observation_dim, self._action_n, hidden_sizes=self._config['hidden_sizes'], learning_rate=self._config['learning_rate_actor'], lr_milestones=lr_milestones, lr_factor=self._config['lr_factor'], device=self._config['device']) def eval(self): self.eval_mode = True def train_mode(self): self.eval_mode = False def act(self, observation, eps=0, evaluation=False): state = torch.from_numpy(observation).float().to(self.device) if eps is None: eps = self._eps if np.random.random() > eps or evaluation: action = self.actor.forward(state) action = action.detach().cpu().numpy()[0] else: action = self._action_space.sample()[:4] return action def schedulers_step(self): self.critic.lr_scheduler.step() self.critic_target.lr_scheduler.step() self.actor.lr_scheduler.step() self.actor_target.lr_scheduler.step() def store_transition(self, transition): self.buffer.add_transition(transition) @staticmethod def load_model(fpath): with open(Path(fpath), 'rb') as inp: return pickle.load(inp) def train(self, total_step_counter, iter_fit=32): losses = [] for i in range(iter_fit): data = self.buffer.sample(batch_size=self._config['batch_size']) s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device) s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device) a = torch.FloatTensor(np.stack( data[:, 1])[:, None]).squeeze(dim=1).to(self.device) rew = torch.FloatTensor(np.stack( data[:, 2])[:, None]).squeeze(dim=1).to(self.device) done = 
torch.FloatTensor(np.stack( data[:, 4])[:, None]).squeeze(dim=1).to(self.device) # done flag Q_target = self.critic(s, a).squeeze(dim=1).to(self.device) a_next = self.actor_target.forward(s_next) Q_next = self.critic_target.forward( s_next, a_next).squeeze(dim=1).to(self.device) # target targets = rew + self._config['discount'] * Q_next * (1.0 - done) # optimize critic targets = targets.to(self.device) critic_loss = self.critic.loss(Q_target.float(), targets.float()) losses.append(critic_loss) self.critic.optimizer.zero_grad() critic_loss.backward() self.critic.optimizer.step() actions = self.actor.forward(s) actor_loss = -self.critic.forward(s, actions).mean() self.actor.optimizer.zero_grad() actor_loss.backward() self.actor.optimizer.step() # update if (total_step_counter) % self._config['update_target_every'] == 0: # optimize actor soft_update(self.critic_target, self.critic, self._tau) soft_update(self.actor_target, self.actor, self._tau) return losses
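# The `soft_update` helper called in `train` above is not shown in this section.
# A minimal sketch, assuming it performs the usual Polyak averaging of target
# parameters (the same formula the second agent below writes out inline); the
# function body here is illustrative, not the project's own implementation.
def soft_update(target_net, source_net, tau):
    """Polyak-average source parameters into the target network in place."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)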
import torch
import torch.nn.functional as F
import torch.optim as optim

# OUNoise, ExperienceReplayLog, Actor and Critic are provided by the surrounding project modules.


class DDPGAgent:

    def __init__(self, env, gamma, tau, buffer_maxlen,
                 critic_learning_rate, actor_learning_rate, max_action=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action

        # Debug output of action and observation dimensions
        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)

        # Copy network parameters into the critic and actor targets
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        # Set optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        # Add Ornstein-Uhlenbeck exploration noise when running in noisy mode
        if self.noisy:
            action = self.noise.get_action(action, self.iter)
            self.iter = self.iter + 1

        return action

    def update(self, batch_size):
        # Sample a batch of transitions
        state_batch, action_batch, reward_batch, next_state_batch, masks = \
            self.replay_buffer.sample(batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Q updates
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        # NOTE: the done signal (`masks`) is not applied to the bootstrap term,
        # so terminal transitions also bootstrap through the target critic.
        expected_Q = reward_batch + self.gamma * next_Q

        # Update critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # Update actor network
        policy_loss = -self.critic.forward(state_batch,
                                           self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Soft-update actor and critic target networks (Polyak averaging)
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))
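# A minimal usage sketch for the agent above, assuming a Gymnasium-style
# environment with the 5-tuple step API. The environment id, episode count,
# batch size, and the `push`/`len` interface of ExperienceReplayLog are
# illustrative assumptions, not part of the original code.
import gymnasium as gym

env = gym.make("Pendulum-v1")  # illustrative continuous-control environment
agent = DDPGAgent(env, gamma=0.99, tau=0.005, buffer_maxlen=100_000,
                  critic_learning_rate=1e-3, actor_learning_rate=1e-4)
agent.noisy = True  # enable OU exploration noise during training

for episode in range(100):
    obs, _ = env.reset()
    done = False
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Assumed replay-buffer interface: store the transition, then update
        # once enough samples have been collected.
        agent.replay_buffer.push(obs, action, reward, next_obs, done)
        obs = next_obs
        if len(agent.replay_buffer) > 128:
            agent.update(batch_size=128)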