def step(self, states):
    stats = Statistics()
    self._step += 1
    if not self.eval:
        self._sample_noise()
    states_tensor = torch.from_numpy(states).float().to(self._device)
    self._policy_net.train(False)
    with torch.no_grad():
        q_values = self._policy_net(states_tensor)
    policy = self._greedy_policy if self.eval else self._policy
    actions = policy.get_action(q_values.cpu().numpy())
    if not self.eval:
        # During training: do logging
        q = torch.max(q_values).detach()
        stats.set('q', q)
        self._policy_net.log_scalars(stats.set)
        try:
            stats.set('epsilon', self._policy.get_epsilon())
        except AttributeError:
            pass
    return actions

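# The policy objects used above are not shown here. Below is a minimal sketch
# of what `get_action` is assumed to do for the training policy: an
# epsilon-greedy choice over the Q values with a linearly decaying epsilon.
# The class name, decay schedule and parameters are illustrative assumptions,
# not the exact implementation.
class EpsilonGreedyPolicy:
    def __init__(self, n_actions, eps_start=1.0, eps_end=0.01,
                 eps_decay_steps=100000):
        self._n_actions = n_actions
        self._eps_start = eps_start
        self._eps_end = eps_end
        self._eps_decay_steps = eps_decay_steps
        self._step = 0

    def get_epsilon(self):
        # Linear decay from eps_start to eps_end
        frac = min(1.0, self._step / self._eps_decay_steps)
        return self._eps_start + (self._eps_end - self._eps_start) * frac

    def get_action(self, q_values):
        # q_values: array of shape (n_agents, n_actions)
        self._step += 1
        eps = self.get_epsilon()
        greedy = np.argmax(q_values, axis=1)
        random_actions = np.random.randint(self._n_actions,
                                           size=len(q_values))
        explore = np.random.rand(len(q_values)) < eps
        return np.where(explore, random_actions, greedy)
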
def transitions(self, states, actions, rewards, next_states, dones):
    stats = Statistics()
    assert not self.eval
    for idx in range(len(states)):
        self._buffer.push(state=states[idx],
                          action=actions[idx],
                          reward=rewards[idx],
                          next_state=next_states[idx],
                          done=dones[idx])
    stats.set("replay_buffer_size", len(self._buffer))
    if len(self._buffer) >= self._min_replay_buffer_size:
        t0 = time.time()  # time spent for optimization
        stats.set_all(self._optimize())
        stats.set("optimization_time", time.time() - t0)
    return stats

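# `self._buffer` is assumed to be a uniform replay buffer with roughly the
# interface used above (push / sample / __len__). The sketch below is
# illustrative only, not the buffer implementation used by the agent.
import collections
import random

class ReplayBuffer:
    def __init__(self, capacity):
        self._storage = collections.deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self._storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = map(
            np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self._storage)
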
def _run_one_iteration(self):
    stats = Statistics(self._summary_writer, self._iteration)
    phase_stats, agent_stats = self._run_one_phase(is_training=True)
    stats.set("training_episodes", phase_stats.sum("episodes"))
    stats.set("training_steps", phase_stats.sum("steps"))
    stats.set_all(phase_stats.get(["agent_time", "step_time", "env_time"]))
    stats.set_all(agent_stats)
    if self._evaluation_steps != 0:
        phase_stats, _ = self._run_one_phase(is_training=False)
        stats.set("eval_episodes", phase_stats.sum("episodes"))
    stats.set("episode_reward", phase_stats.get("rewards"))
    stats.set("episode_steps", phase_stats.get("steps"))
    return stats

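# `Statistics` is a small helper used throughout this code. The sketch below
# shows the interface it is assumed to provide (set / set_all / get / sum /
# avg, plus optional TensorBoard logging); the real class may differ in its
# details and aggregation semantics.
import collections

class Statistics:
    def __init__(self, summary_writer=None, iteration=None):
        self._values = collections.defaultdict(list)
        self._summary_writer = summary_writer
        self._iteration = iteration

    def set(self, key, value):
        self._values[key].append(value)
        if self._summary_writer is not None:
            self._summary_writer.add_scalar(key, value, self._iteration)

    def set_all(self, other):
        # Accepts another Statistics object or a plain dict of values
        items = other._values if isinstance(other, Statistics) else other
        for key, values in items.items():
            if not isinstance(values, list):
                values = [values]
            for value in values:
                self.set(key, value)

    def get(self, keys):
        if isinstance(keys, str):
            return self.avg(keys)
        return {key: self.avg(key) for key in keys}

    def sum(self, key):
        return sum(self._values[key])

    def avg(self, key):
        values = self._values[key]
        return sum(values) / len(values) if values else 0.0
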
def step(self, actions):
    stats = Statistics()
    t0 = time.time()
    next_states = []
    states = []
    rewards = []
    dones = []
    if isinstance(actions, np.ndarray):
        actions = actions.tolist()
    step_promises = []
    for env_idx, env in enumerate(self._envs):
        count = env.n_envs * env.n_agents
        start = env_idx * count
        end = start + count
        env_actions = actions[start:end]
        step_promises.append(env.step(env_actions))
    reset_states = []
    for env_idx, step_promise in enumerate(step_promises):
        env_next_states, env_rewards, env_dones = step_promise()
        env = self._envs[env_idx]
        env_states = env_next_states
        env_stat = self._env_stats[env]
        env_stat.set("steps", 1)
        env_stat.set("rewards", sum(env_rewards))
        if env_dones.any():
            stats.set("steps", env_stat.sum("steps"))
            avg_reward = env_stat.sum("rewards") / (
                env.n_envs * env.n_agents)
            stats.set("rewards", avg_reward)
            stats.set("episodes", 1)
            self._env_stats[env] = Statistics()
            reset_states.append((env_idx, env.reset()))
        next_states.append(env_next_states)
        states.append(env_states)
        rewards.append(env_rewards)
        dones.append(env_dones)
    for env_idx, step_promise in reset_states:
        states[env_idx] = step_promise()
    rewards = np.concatenate(rewards, axis=0)
    dones = np.concatenate(dones, axis=0)
    next_states = np.concatenate(next_states, axis=0)
    self.states = np.concatenate(states, axis=0)
    stats.set("env_time", time.time() - t0)
    return rewards, next_states, dones, stats

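# Each entry in `self._envs` is assumed to be a wrapper whose `step()` and
# `reset()` return a "promise": a callable that yields the result when
# invoked. That lets all sub-environments be stepped before any result is
# consumed. A minimal single-process sketch of that contract is shown below;
# the class name is an assumption, and the wrapped environment is assumed to
# already return batched (next_states, rewards, dones).
class PromiseEnv:
    def __init__(self, env, n_envs=1, n_agents=1):
        self._env = env
        self.n_envs = n_envs
        self.n_agents = n_agents

    def step(self, actions):
        # A multiprocess implementation would send the actions to a worker
        # and return immediately; here we compute eagerly and wrap the result.
        next_states, rewards, dones = self._env.step(actions)
        return lambda: (next_states, rewards, dones)

    def reset(self):
        states = self._env.reset()
        return lambda: states
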
def _run_one_phase(self, is_training):
    stats = Statistics()
    agent_stats = Statistics()
    self._agent.eval = not is_training
    min_steps = (self._training_steps if is_training
                 else self._evaluation_steps) * self._env.n_agents
    self._env.reset()
    while stats.sum("steps") < min_steps:
        step_time0 = time.time()
        states = np.copy(self._env.states)
        actions = self._agent.step(states)
        rewards, next_states, dones, env_stats = \
            self._env.step(actions)
        stats.set_all(env_stats)
        if self._traj_buffer is not None:
            self._traj_buffer.push(states, actions, rewards,
                                   next_states, dones)
        if is_training:
            t0 = time.time()
            agent_stats.set_all(
                self._agent.transitions(states, actions, rewards,
                                        next_states, dones))
            stats.set("agent_time", time.time() - t0)
        stats.set("step_time", time.time() - step_time0)
        sys.stdout.write(
            "Iteration {} ({}). ".format(
                self._iteration, "train" if is_training else "eval") +
            "Steps executed: {} ".format(stats.sum("steps")) +
            "Episode length: {} ".format(int(stats.avg("steps"))) +
            "Return: {:.4f} \r".format(stats.avg("rewards")))
        sys.stdout.flush()
    print()
    self._agent.episodes_end()
    return stats, agent_stats

def transitions(self, states, actions, rewards, next_states, term):
    stats = Statistics()
    if self.eval:
        return stats
    t0 = time.time()
    self._net.train(True)

    states = torch.from_numpy(states).float().to(self._device)
    actions = torch.tensor(actions).long().to(self._device)
    actions = torch.unsqueeze(actions, dim=1)
    rewards = torch.from_numpy(rewards).float().to(self._device)
    rewards = torch.unsqueeze(rewards, dim=1)
    term_mask = torch.from_numpy(term.astype(np.uint8)).to(self._device)
    term_mask = torch.unsqueeze(term_mask, dim=1)
    next_states = torch.from_numpy(next_states).float().to(self._device)

    _, v_next = self._net(next_states)
    v_next = v_next * (1 - term_mask).float()  # 0 -> terminal state

    action_logits, v = self._net(states)

    # The TD error delta is used as:
    # 1. the loss for the critic
    # 2. the advantage for the actor
    delta = rewards + self._gamma * v_next - v
    critic_loss = delta.abs().mean()

    log_softmax = torch.nn.LogSoftmax(dim=1)
    action_log_probs = log_softmax(action_logits)
    selected_log_probs = action_log_probs.gather(dim=1, index=actions)

    # The minus is here because the optimizer is going to *minimize* the
    # loss. If we were updating the weights manually (without the
    # optimizer), we would remove the -1. delta is detached so the actor
    # term does not backpropagate into the critic.
    actor_loss = -(delta.detach() * selected_log_probs).mean()
    loss = actor_loss + critic_loss

    # Optimize
    self._optimizer.zero_grad()
    loss.backward()
    self._optimizer.step()

    stats.set('loss', loss.detach())
    stats.set('loss_critic', critic_loss.detach())
    stats.set('loss_actor', actor_loss.detach())
    stats.set('optimization_time', time.time() - t0)

    # Log the entropy metric (the opposite of confidence)
    action_probs = torch.nn.Softmax(dim=1)(action_logits)
    entropy = -(action_probs * action_log_probs).sum(dim=1).mean()
    stats.set('entropy', entropy.detach())

    # Log gradients
    for p in self._net.parameters():
        if p.grad is not None:
            stats.set('grad_max', p.grad.abs().max().detach())
            stats.set('grad_mean', (p.grad**2).mean().sqrt().detach())

    # Log the Kullback-Leibler divergence between the new
    # and the old policy.
    new_action_logits, _ = self._net(states)
    new_action_probs = torch.nn.Softmax(dim=1)(new_action_logits)
    kl = -((new_action_probs / action_probs).log() *
           action_probs).sum(dim=1).mean()
    stats.set('kl', kl.detach())
    return stats

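# `self._net` above is assumed to be an actor-critic network whose forward
# pass returns (action_logits, state_value). A minimal sketch under that
# assumption; the class name and layer sizes are illustrative.
class ActorCriticNet(nn.Module):
    def __init__(self, obs_size, n_actions, hidden=128):
        super().__init__()
        self._body = nn.Sequential(
            nn.Linear(obs_size, hidden),
            nn.ReLU())
        self._actor_head = nn.Linear(hidden, n_actions)  # action logits
        self._critic_head = nn.Linear(hidden, 1)         # state value V(s)

    def forward(self, states):
        features = self._body(states)
        return self._actor_head(features), self._critic_head(features)
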
def _optimize(self):
    stats = Statistics()
    t0 = time.time()
    stats.set('replay_buffer_size', len(self._buffer))
    if not self._buffer.ready():
        return stats
    self.net.train(True)

    # Create tensors: state, action, value target, advantage
    states, actions, target_v, advantage = self._buffer.sample()
    batch_size = len(states)
    assert batch_size == self._buffer.capacity()

    states = torch.from_numpy(states).float().to(self._device)
    actions = torch.from_numpy(actions).to(self._device)
    if self._is_continous:
        actions_shape = (batch_size, ) + self._action_space.shape
        actions = actions.float()
    else:
        actions_shape = (batch_size, )
        actions = actions.long()
    assert actions.shape == actions_shape, actions.shape

    target_v = torch.from_numpy(target_v).float().to(self._device)
    target_v = torch.unsqueeze(target_v, dim=1)

    advantage = torch.from_numpy(advantage).float().to(self._device)
    advantage = (advantage - advantage.mean()) / advantage.std()
    advantage = advantage.detach()
    assert not torch.isnan(advantage).any(), advantage
    if self._is_continous:
        advantage = torch.unsqueeze(advantage, dim=1)
        assert advantage.shape == (batch_size, 1), advantage.shape
    else:
        assert advantage.shape == (batch_size, )
    stats.set('advantage', advantage.abs().mean())

    # Iteratively optimize the network
    critic_loss_fn = nn.MSELoss()
    # Action probabilities of the network before optimization
    old_log_probs = None
    old_dist = None
    for _ in range(self._epochs):
        # Calculate the actor loss
        if self._is_continous:
            actions_mu, actions_var, v = self.net(states)
            assert actions_var.shape == actions_shape, actions_var.shape
            assert actions_mu.shape == actions_shape, actions_mu.shape
            assert len(self._action_space.shape) == 1
            log_probs_arr = []
            for action_idx in range(self._action_space.shape[0]):
                action_mu = actions_mu[:, action_idx]
                action_var = actions_var[:, action_idx]
                assert action_mu.shape == (batch_size, ), action_mu.shape
                assert action_var.shape == (batch_size, ), action_var.shape
                dist = torch.distributions.Normal(action_mu, action_var)
                sub_actions = actions[:, action_idx]
                assert sub_actions.shape == (batch_size, )
                log_probs = dist.log_prob(sub_actions)
                log_probs_arr.append(log_probs)
            log_probs = torch.stack(log_probs_arr, dim=1)
        else:
            action_logits, _, v = self.net(states)
            assert action_logits.shape == (batch_size, self._action_space.n)
            dist = torch.distributions.categorical.Categorical(
                logits=action_logits)
            log_probs = dist.log_prob(actions)
        assert log_probs.shape == actions_shape, log_probs.shape

        if old_log_probs is None:
            old_log_probs = log_probs.detach()
            old_dist = dist

        r = (log_probs - old_log_probs).exp()
        assert not torch.isnan(r).any(), r
        assert r.shape == actions_shape, r.shape
        obj = torch.min(
            r * advantage,
            torch.clamp(r, 1. - self._epsilon, 1. + self._epsilon) * advantage)
        assert obj.shape == actions_shape, obj.shape
        # The minus is here because the optimizer is going to *minimize* the
        # loss. If we were updating the weights manually (without the
        # optimizer), we would remove the -1.
        actor_loss = -obj.mean()

        # Calculate the critic loss
        assert v.shape == (batch_size, 1)
        assert target_v.shape == (batch_size, 1)
        critic_loss = critic_loss_fn(v, target_v)

        # Optimize
        loss = critic_loss + actor_loss
        self._optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 30.0)
        self._optimizer.step()

        stats.set('loss_actor', actor_loss.detach())
        stats.set('loss_critic', critic_loss.detach())

        # Log gradients
        for p in self.net.parameters():
            if p.grad is not None:
                stats.set('grad_max', p.grad.abs().max().detach())
                stats.set('grad_mean', (p.grad**2).mean().sqrt().detach())

    self._buffer.reset()

    # Log stats
    stats.set('optimization_time', time.time() - t0)
    stats.set('ppo_optimization_epochs', self._epochs)
    stats.set('ppo_optimization_samples', batch_size)

    # Log the entropy metric (the opposite of confidence)
    if self._is_continous:
        action_mu, action_var, _ = self.net(states)
        stats.set('action_variance', action_var.mean().detach())
        stats.set('action_mu_mean', (action_mu**2).mean().sqrt().detach())
        stats.set('action_mu_max', action_mu.abs().max().detach())
    stats.set('entropy', dist.entropy().mean().detach())

    # Log the Kullback-Leibler divergence between the new
    # and the old policy.
    kl = torch.distributions.kl.kl_divergence(dist, old_dist)
    stats.set('kl', kl.mean().detach())
    return stats

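# `self._buffer.sample()` above is assumed to return, besides states and
# actions, the value targets and advantages computed from the collected
# rollout. Below is a minimal sketch of how those quantities could be derived
# with generalized advantage estimation (GAE); the function name, signature
# and default coefficients are assumptions, not the buffer's actual API.
def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    # rewards, values, dones: arrays of shape (T,); last_value: V(s_T)
    advantages = np.zeros_like(rewards, dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        gae = delta + gamma * lam * non_terminal * gae
        advantages[t] = gae
        next_value = values[t]
    target_v = advantages + values  # targets for the critic
    return target_v, advantages
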
def _optimize(self):
    stats = Statistics()
    if self.eval:
        return stats
    if self._step % self._train_freq != 0:
        return stats
    self._policy_net.train(True)

    # Anneal the replay buffer beta parameter from 0.4 to 1.0
    # (these numbers are taken from the Rainbow paper).
    beta0 = 0.4
    beta1 = 1.0
    bonus = min(1.0, self._optimization_step / self._beta_decay)
    beta = beta0 + (beta1 - beta0) * bonus
    try:
        self._buffer.set_beta(beta)
        stats.set('replay_beta', beta)
    except AttributeError:
        # In case it's not a PriorityReplayBuffer
        pass

    states, actions, rewards, next_states, term, ids = self._buffer.sample(
        self._batch_size)

    # Make the replay buffer values consumable by PyTorch
    states = torch.from_numpy(states).float().to(self._device)
    actions = torch.from_numpy(actions).long().to(self._device)
    actions = torch.unsqueeze(actions, dim=1)
    rewards = torch.from_numpy(rewards).float().to(self._device)
    rewards = torch.unsqueeze(rewards, dim=1)
    # For terminal states the Q value is calculated differently:
    # Q(term_state) = R
    term_mask = torch.from_numpy(term.astype(np.uint8)).to(self._device)
    term_mask = torch.unsqueeze(term_mask, dim=1)
    next_states = torch.from_numpy(next_states).float().to(self._device)

    # Calculate the TD target
    self._sample_noise()
    if self._double:
        # Double DQN: use target_net to estimate the Q values of the
        # next_state and policy_net to choose the action in the next_state.
        next_q_pnet = self._policy_net(next_states).detach()
        next_actions = torch.argmax(next_q_pnet, dim=1).unsqueeze(dim=1)
    else:
        next_q_tnet = self._target_net(next_states).detach()
        next_actions = torch.argmax(next_q_tnet, dim=1).unsqueeze(dim=1)
    self._sample_noise()
    next_q = self._target_net(next_states).gather(
        1, next_actions).detach()  # detach → don't backpropagate
    next_q = next_q * (1 - term_mask).float()  # 0 -> terminal state
    target_q = rewards + self._gamma * next_q

    self._sample_noise()
    q = self._policy_net(states).gather(dim=1, index=actions)

    loss = self._loss_fn(q, target_q)
    try:
        w = self._buffer.importance_sampling_weights(ids)
        w = torch.from_numpy(w).float().to(self._device)
        loss = w * loss
    except AttributeError:
        # Not a priority replay buffer
        pass
    loss = torch.mean(loss)
    stats.set('loss', loss.detach())

    self._optimizer.zero_grad()
    loss.backward()
    for param in self._policy_net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    self._optimizer.step()
    self._update_target_net()
    self._optimization_step += 1

    # Update priorities in the replay buffer
    with torch.no_grad():
        buffer_loss = self._buffer_loss_fn(q, target_q)
        buffer_loss = torch.squeeze(buffer_loss)
        buffer_loss = buffer_loss.cpu().numpy()
        try:
            self._buffer.update_priorities(ids, buffer_loss)
        except AttributeError:
            # That's not a priority replay buffer
            pass
    return stats

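# `_update_target_net` is called above but not shown. Two common variants are
# a hard copy every N optimization steps and Polyak (soft) averaging; a sketch
# of both is given below. The `self._soft_update`, `tau` and
# `self._target_update_freq` attributes are illustrative assumptions, not
# attributes defined elsewhere in this code.
def _update_target_net(self):
    if self._soft_update:
        # Polyak averaging: target <- tau * policy + (1 - tau) * target
        tau = 0.005
        for t_param, p_param in zip(self._target_net.parameters(),
                                    self._policy_net.parameters()):
            t_param.data.copy_(tau * p_param.data +
                               (1.0 - tau) * t_param.data)
    elif self._optimization_step % self._target_update_freq == 0:
        # Hard update: copy the policy network weights wholesale
        self._target_net.load_state_dict(self._policy_net.state_dict())
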