Example #1
    def learn_step(self, replay):
        # Bootstrap the return from the critic's estimate of the final next state.
        policy = self.policy(replay["next_states"][-1])
        last_value = policy["value"]

        discounted_rewards = discount(self.gamma, replay["rewards"],
                                      replay["dones"], last_value)
        # Returns are fixed targets; no gradient flows back through them.
        discounted_rewards = discounted_rewards.detach()

        advantages = discounted_rewards - replay["values"]

        entropy_loss = replay["entropy"].mean()
        # Detach the advantages so the policy gradient does not update the critic.
        policy_loss = -(replay["log_probs"] * advantages.detach()).mean()
        value_loss = advantages.pow(2).mean()

        self.optimizer.zero_grad()
        # Back-propagate the actor, critic, and entropy terms in a single pass;
        # two separate backward() calls would need retain_graph=True on the
        # shared computation graph.
        (policy_loss + value_loss - self.ent_coef * entropy_loss).backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                       self.max_clip_norm)
        self.optimizer.step()

        self.num_updates += 1

        return {
            "scalars": {
                "loss/policy": policy_loss.item(),
                "loss/value": value_loss.item(),
                "loss/entropy": entropy_loss.item(),
                "env/advantage": advantages.mean().item(),
            }
        }
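The discount helper called above is not shown in this example. A minimal sketch of what it might look like, assuming it computes bootstrapped n-step returns over the rollout (the four-argument signature, the done-masking, and the tensor shapes are assumptions, not the example's actual helper):

    import torch

    def discount(gamma, rewards, dones, last_value):
        # Assumed helper: walk the rollout backwards, accumulating
        # R_t = r_t + gamma * R_{t+1} * (1 - done_t),
        # bootstrapped from the critic's value of the final next state.
        returns = torch.zeros_like(rewards)
        running = last_value
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running * (1.0 - dones[t])
            returns[t] = running
        return returns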
Example #2
    def update(self, sess, analysis, gamma):
        """Updates the global network by applying gradients.

        Parameters
        ----------
        sess : tf.Session
            TensorFlow session used to run the update.
        analysis : array
            Rollout buffer; each row holds (state, matching, action, reward, value).
        gamma : float
            Discount factor for reinforcement learning.

        Returns
        -------
        tuple of float
            (value_loss, policy_loss, entropy, loss), each divided by the
            rollout length and used to evaluate the model.
        """

        analysis = np.array(analysis)

        states = analysis[:, 0]
        matching = analysis[:, 1]
        actions = analysis[:, 2]
        rewards = analysis[:, 3]
        values = analysis[:, 4]

        # Append a terminal bootstrap value of 0 and compute the discounted returns.
        rewards_plus = np.asarray(rewards.tolist() + [0.0])
        discounted_rewards = discount(rewards_plus, gamma)[:-1]

        # One-step TD errors serve as the advantage estimates.
        values_plus = np.asarray(values.tolist() + [0.0])
        advantages = rewards + gamma * values_plus[1:] - values_plus[:-1]

        feed_dict = {
            self.local_net.target_value: discounted_rewards,
            self.local_net.input_vector: np.vstack(states),
            self.local_net.matching_vector: np.vstack(matching),
            self.local_net.actions: actions,
            self.local_net.advantages: advantages
        }
        value_loss, policy_loss, entropy, loss, _ = sess.run(
            [
                self.local_net.value_loss, self.local_net.policy_loss,
                self.local_net.entropy, self.local_net.loss,
                self.local_net.apply_grads
            ],
            feed_dict=feed_dict)

        length = len(analysis)
        return value_loss / length, policy_loss / length, entropy / length, loss / length
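The discount helper used in this example (and the next) follows the common A3C convention of a discounted cumulative sum over the time axis. A minimal sketch, assuming the usual scipy-based implementation (the lfilter trick is an assumption about this codebase, not shown in the examples):

    import numpy as np
    import scipy.signal

    def discount(x, gamma):
        # Discounted cumulative sum: y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ...
        # Implemented as an IIR filter run over the reversed sequence.
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]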
Example #3
    def train(self, observations, rewards, actions, values, sess, gamma, bootstrap_value):
        # Bootstrap the rollout with the value estimate of the final state, then
        # compute discounted returns and discounted TD-error advantages.
        rewards_plus = np.asarray(rewards + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, gamma)[:-1]
        value_plus = np.asarray(values + [bootstrap_value])
        advs = np.array(rewards) + gamma * value_plus[1:] - value_plus[:-1]
        advs = discount(advs, gamma)

        feed_dict = {
            self.local_net.advantages: advs,
            self.local_net.inputs: observations,
            self.local_net.actions: actions,
            self.local_net.rewards: discounted_rewards,
            self.local_net.state_in[0]: self.batch_rnn_state[0],
            self.local_net.state_in[1]: self.batch_rnn_state[1],
        }
        # Run one gradient step and carry the recurrent state over to the next batch.
        v_l, p_l, e_l, g_n, v_n, self.batch_rnn_state, _ = sess.run(
            [
                self.local_net.value_loss,
                self.local_net.policy_loss,
                self.local_net.entropy,
                self.local_net.grad_norms,
                self.local_net.var_norms,
                self.local_net.state_out,
                self.local_net.apply_grads,
            ],
            feed_dict=feed_dict)

        length = len(observations)
        return v_l / length, p_l / length, e_l / length, g_n, v_n