import torch


def learn_step(self, replay):
    # Bootstrap from the value of the state that follows the rollout.
    policy = self.policy(replay["next_states"][-1])
    last_value = policy["value"]

    # n-step discounted returns; detached so the critic target is a constant.
    discounted_rewards = discount(self.gamma, replay["rewards"], replay["dones"], last_value)
    discounted_rewards = discounted_rewards.detach()

    advantages = discounted_rewards - replay["values"]

    entropy_loss = replay["entropy"].mean()
    # Policy-gradient loss uses detached advantages so it only updates the actor.
    policy_loss = -(replay["log_probs"] * advantages.detach()).mean()
    # Value loss regresses the critic towards the discounted returns.
    value_loss = advantages.pow(2).mean()

    self.optimizer.zero_grad()
    # Single backward over the combined objective: equivalent to accumulating the
    # policy and value gradients separately, but avoids reusing a freed graph.
    (policy_loss - self.ent_coef * entropy_loss + value_loss).backward()
    torch.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_clip_norm)
    self.optimizer.step()
    self.num_updates += 1

    return {
        "scalars": {
            "loss/policy": policy_loss.item(),
            "loss/value": value_loss.item(),
            "loss/entropy": entropy_loss.item(),
            "env/advantage": advantages.mean().item(),
        }
    }
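# learn_step() relies on a discount helper that is not shown in this section.
# A minimal sketch of what it might look like, assuming rewards and dones are
# float tensors ordered by time and last_value is the critic's bootstrap
# estimate for the state following the rollout:
def discount(gamma, rewards, dones, last_value):
    # Walk the rollout backwards, bootstrapping from last_value and resetting
    # the running return wherever an episode terminated (done == 1).
    returns = []
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * (1.0 - dones[t]) * running
        returns.append(running)
    return torch.stack(returns[::-1])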
import numpy as np


def update(self, sess, analysis, gamma):
    """
    Updates the global network by applying gradients.

    Parameters
    ----------
    sess : tf.Session
        TensorFlow session used to run the update.
    analysis : array
        Rollout buffer; each row holds (state, matching, action, reward, value).
    gamma : float
        Discount factor for reinforcement learning.

    Returns
    -------
    value_loss / length, policy_loss / length, entropy / length, loss / length : float
        Per-step losses used to evaluate the model.
    """
    analysis = np.array(analysis)
    states = analysis[:, 0]
    matching = analysis[:, 1]
    actions = analysis[:, 2]
    rewards = analysis[:, 3]
    values = analysis[:, 4]

    # Discounted returns, bootstrapped with a terminal value of 0.
    rewards_plus = np.asarray(rewards.tolist() + [0.0])
    discounted_rewards = discount(rewards_plus, gamma)[:-1]

    # One-step TD-error advantages: r_t + gamma * V(s_{t+1}) - V(s_t).
    values_plus = np.asarray(values.tolist() + [0.0])
    advantages = rewards + gamma * values_plus[1:] - values_plus[:-1]

    feed_dict = {
        self.local_net.target_value: discounted_rewards,
        self.local_net.input_vector: np.vstack(states),
        self.local_net.matching_vector: np.vstack(matching),
        self.local_net.actions: actions,
        self.local_net.advantages: advantages,
    }
    value_loss, policy_loss, entropy, loss, _ = sess.run(
        [
            self.local_net.value_loss,
            self.local_net.policy_loss,
            self.local_net.entropy,
            self.local_net.loss,
            self.local_net.apply_grads,
        ],
        feed_dict=feed_dict,
    )

    length = len(analysis)
    return value_loss / length, policy_loss / length, entropy / length, loss / length
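# Both update() above and train() below call a discount(x, gamma) helper that
# is also not shown here. A minimal sketch of the usual implementation,
# assuming x is a 1-D float array ordered from the first to the last step:
import scipy.signal


def discount(x, gamma):
    # Discounted cumulative sum over the remainder of the trajectory:
    # y[t] = x[t] + gamma * x[t + 1] + gamma ** 2 * x[t + 2] + ...
    return scipy.signal.lfilter([1], [1, -gamma], np.asarray(x, dtype=np.float64)[::-1])[::-1]


# Example: discount([1.0, 1.0, 1.0], 0.5) -> [1.75, 1.5, 1.0]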
def train(self, observations, rewards, actions, values, sess, gamma, bootstrap_value):
    # Discounted returns, bootstrapped from the value estimate of the state
    # following the rollout (bootstrap_value).
    rewards_plus = np.asarray(rewards + [bootstrap_value])
    discounted_rewards = discount(rewards_plus, gamma)[:-1]

    # Generalized advantage estimation: discount the one-step TD errors
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    value_plus = np.asarray(values + [bootstrap_value])
    advs = np.array(rewards) + gamma * value_plus[1:] - value_plus[:-1]
    advs = discount(advs, gamma)

    feed_dict = {
        self.local_net.advantages: advs,
        self.local_net.inputs: observations,
        self.local_net.actions: actions,
        self.local_net.rewards: discounted_rewards,
        self.local_net.state_in[0]: self.batch_rnn_state[0],
        self.local_net.state_in[1]: self.batch_rnn_state[1],
    }
    v_l, p_l, e_l, g_n, v_n, self.batch_rnn_state, _ = sess.run(
        [
            self.local_net.value_loss,
            self.local_net.policy_loss,
            self.local_net.entropy,
            self.local_net.grad_norms,
            self.local_net.var_norms,
            self.local_net.state_out,
            self.local_net.apply_grads,
        ],
        feed_dict=feed_dict,
    )

    length = len(observations)
    return v_l / length, p_l / length, e_l / length, g_n, v_n
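# Note on the advantage computation in train(): discounting the one-step TD
# errors with gamma is generalized advantage estimation with lambda = 1, which
# telescopes to the bootstrapped return minus the value baseline, A_t = R_t - V_t.
# A quick numeric check, reusing the discount(x, gamma) helper sketched above
# (the rollout values here are made up purely for illustration):
rewards = [1.0, 0.0, 1.0]
values = [0.5, 0.4, 0.3]
bootstrap_value, gamma = 0.2, 0.99

returns = discount(np.asarray(rewards + [bootstrap_value]), gamma)[:-1]
value_plus = np.asarray(values + [bootstrap_value])
deltas = np.array(rewards) + gamma * value_plus[1:] - value_plus[:-1]
assert np.allclose(discount(deltas, gamma), returns - np.asarray(values))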