Example #1
    def init_actor_loss(self):
        if self.policy_type == "GaussianPolicy":
            # Tanh-squashed Gaussian policy: recover the pre-squash action and
            # apply the change-of-variables correction to its log-probability
            mu, std = self.policy.mu_and_std(self.states_ph)
            norm_dist = tfd.Normal(loc=mu, scale=std)
            entropy = norm_dist.entropy()
            log_actions = norm_dist.log_prob(tf_v1.atanh(self.actions_ph))
            log_actions -= tf_v1.log(1.0 - self.actions_ph**2 + 1e-8)
            log_actions = tf_v1.reduce_sum(log_actions, axis=-1, keepdims=True)
            # log_actions = self.policy.log_prob(self.states_ph, self.actions_ph)
        elif self.policy_type == "DiscretePolicy":
            # Categorical policy: entropy and log-probability of the taken action
            action_probs = self.policy.model(self.states_ph)
            entropy = -tf_v1.reduce_sum(
                tf_v1.multiply(tf_v1.log(action_probs), action_probs), axis=-1)
            hot_encoded = tf_v1.one_hot(self.actions_ph, self.action_size)
            log_actions = tf_v1.log(
                tf_v1.reduce_sum(hot_encoded * action_probs, axis=-1))
        else:
            raise NotImplementedError(
                f"Received {self.policy_type}. This should never have happened!"
            )

        # Policy-gradient loss weighted by the targets, plus an entropy bonus
        log_loss = -log_actions * self.targets_ph
        entropy_loss = -self.alpha * entropy
        loss = log_loss + entropy_loss
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.policy.lr_ph)
        train_op = get_clipped_train_op(loss,
                                        optimizer,
                                        var_list=self.policy.trainable_vars,
                                        clip_norm=self.policy.clip_norm)
        self.policy.setup_loss(loss, train_op)
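
All of these examples route their gradients through get_clipped_train_op, which is defined elsewhere in the repository. As a point of reference, here is a minimal sketch of what such a helper typically looks like, assuming it clips gradients by global norm before applying them; the real helper may differ:

def get_clipped_train_op(loss, optimizer, var_list, clip_norm):
    # Hypothetical sketch: compute gradients, clip them by global norm,
    # then apply the clipped gradients with the given optimizer.
    grads_and_vars = optimizer.compute_gradients(loss, var_list=var_list)
    grads, variables = zip(*grads_and_vars)
    clipped_grads, _ = tf_v1.clip_by_global_norm(grads, clip_norm)
    return optimizer.apply_gradients(zip(clipped_grads, variables))
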
Example #2
    def init_critics_loss(self):
        # The same (gradient-stopped) target is used for every critic
        q_targets = tf_v1.stop_gradient(self.get_q_targets())
        for critic in self.critics:
            q_predictions = critic([self.states_ph, self.actions_ph])
            q_loss = tf_v1.reduce_mean(tf_v1.losses.huber_loss(labels=q_targets, predictions=q_predictions))
            # q_loss = tf_v1.losses.mean_squared_error(labels=q_targets, predictions=q_predictions, weights=0.5)
            optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.q_lr_ph)
            train_op = get_clipped_train_op(q_loss, optimizer=optimizer, var_list=critic.trainable_vars,
                                            clip_norm=self.clip_norm)
            critic.setup_loss(q_loss, train_op)
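
get_q_targets is also defined outside this snippet. For a SAC-style agent with twin critics (see Example #5), it would typically build a soft Bellman target. The sketch below is only an assumption, and the next_states_ph, rewards_ph, dones_ph, gamma and target_critics names are placeholders invented for illustration:

def get_q_targets(self):
    # Hypothetical soft Bellman target:
    # r + gamma * (1 - done) * (min Q'(s', a') - alpha * log pi(a'|s'))
    next_actions = self.policy.model(self.next_states_ph)
    next_log_pi = self.policy.log_prob(self.next_states_ph, next_actions)
    next_qs = tuple(Q([self.next_states_ph, next_actions]) for Q in self.target_critics)
    min_next_q = tf_v1.reduce_min(next_qs, axis=0)
    soft_value = min_next_q - self.alpha_tf * next_log_pi
    return self.rewards_ph + self.gamma * (1.0 - self.dones_ph) * soft_value
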
Example #3
    def init_actor_loss(self):
        # Maximize the critic's value of the policy's actions
        # (i.e. minimize the negative mean Q-value)
        actions = self.policy.model(self.states_ph)
        q_value = self.critic([self.states_ph, actions])
        loss = -tf_v1.reduce_mean(q_value)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.policy.lr_ph)
        train_op = get_clipped_train_op(loss,
                                        optimizer,
                                        var_list=self.policy.trainable_vars,
                                        clip_norm=self.policy.clip_norm)
        self.policy.setup_loss(loss, train_op)
Example #4
    def init_q(self):
        q_targets = self.get_q_target()
        q_predictions = self.q_net(self.states_ph)
        batch_size = tf_v1.shape(self.actions_ph)[0]
        indices = tf_v1.stack([tf_v1.range(batch_size), self.actions_ph],
                              axis=-1)
        # Replace only the Q-values of the taken actions with their targets;
        # every other entry keeps its predicted value and contributes zero error
        q_targets = tf_v1.tensor_scatter_nd_update(q_predictions, indices,
                                                   q_targets)
        q_loss = tf_v1.losses.mean_squared_error(labels=q_targets,
                                                 predictions=q_predictions)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.lr_ph)
        train_op = get_clipped_train_op(q_loss,
                                        optimizer=optimizer,
                                        var_list=self.q_net.trainable_vars,
                                        clip_norm=self.clip_norm)
        self.q_net.setup_loss(q_loss, train_op)
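
The tensor_scatter_nd_update trick builds a target tensor that differs from q_predictions only at the taken actions, so every other entry contributes zero to the squared error. A small self-contained illustration of that behaviour (the numbers are made up):

import tensorflow.compat.v1 as tf_v1
tf_v1.disable_eager_execution()

q_predictions = tf_v1.constant([[1.0, 2.0, 3.0],
                                [4.0, 5.0, 6.0]])
actions = tf_v1.constant([1, 2])                            # action index per row
indices = tf_v1.stack([tf_v1.range(2), actions], axis=-1)   # [[0, 1], [1, 2]]
q_targets = tf_v1.constant([10.0, 20.0])
updated = tf_v1.tensor_scatter_nd_update(q_predictions, indices, q_targets)

with tf_v1.Session() as sess:
    print(sess.run(updated))
    # [[ 1. 10.  3.]
    #  [ 4.  5. 20.]]
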
Example #5
    def init_actor(self):
        # Soft actor loss: maximize the minimum critic estimate minus alpha * log pi
        actions = self.policy.model(self.states_ph)
        q_values = tuple(Q([self.states_ph, actions]) for Q in self.critics)
        min_q = tf_v1.reduce_min(q_values, axis=0)
        log_actions = self.policy.log_prob(self.states_ph, actions)
        kl_loss = min_q - self.alpha_tf*log_actions
        policy_loss = -tf_v1.reduce_mean(kl_loss)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.policy.lr_ph, name="policy_optimizer")
        actor_train_op = get_clipped_train_op(policy_loss, optimizer, var_list=self.policy.trainable_vars,
                                              clip_norm=self.policy.clip_norm)
        self.policy.setup_loss(policy_loss, actor_train_op)

        if self.auto_ent:
            # Automatic entropy tuning: adjust alpha so the policy entropy tracks target_entropy
            alpha_loss = -self.alpha_tf*tf_v1.stop_gradient(log_actions + self.target_entropy)
            self.alpha_loss_tf = tf_v1.reduce_mean(alpha_loss)
            optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.alpha_lr_ph, name="alpha_optimizer")
            self.alpha_train_op = optimizer.minimize(self.alpha_loss_tf, var_list=[self.log_alpha_tf])
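
Example #5 uses alpha_tf, log_alpha_tf and target_entropy without showing where they come from. In SAC with automatic entropy tuning the temperature is usually parameterised through its logarithm so it stays positive; the lines below are a hypothetical sketch of how such attributes might be created in the agent's constructor, not the repository's actual code:

# Hypothetical initialisation (assumed to live in the agent's __init__):
self.log_alpha_tf = tf_v1.Variable(0.0, dtype=tf_v1.float32, name="log_alpha", trainable=True)
self.alpha_tf = tf_v1.exp(self.log_alpha_tf)
# A common heuristic sets the entropy target to minus the action dimensionality
self.target_entropy = -float(self.action_size)
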
Example #6
    def init_q(self):
        q_targets = self.get_q_target()
        batch_size = tf_v1.shape(self.actions_ph)[0]
        indices = tf_v1.stack([tf_v1.range(batch_size), self.actions_ph],
                              axis=-1)
        q_predictions = self.q_net(self.states_ph)
        q_targets = tf_v1.tensor_scatter_nd_update(q_predictions, indices,
                                                   q_targets)
        # q_predictions = tf_v1.gather_nd(q_predictions, indices)
        # q_loss = tf_v1.reduce_mean(tf_v1.losses.huber_loss(labels=q_targets, predictions=q_predictions))
        q_loss = tf_v1.losses.mean_squared_error(labels=q_targets,
                                                 predictions=q_predictions)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.lr_ph)
        train_op = get_clipped_train_op(q_loss,
                                        optimizer=optimizer,
                                        var_list=self.q_net.trainable_vars,
                                        clip_norm=self.clip_norm)
        self.q_net.setup_loss(q_loss, train_op)
Example #7
    def init_critic(self):
        loss = tf_v1.losses.mean_squared_error(labels=self.targets_ph, predictions=self.critic.output)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.lr_ph)
        train_op = get_clipped_train_op(loss, optimizer=optimizer, var_list=self.critic.trainable_vars,
                                        clip_norm=self.clip_norm)
        self.critic.setup_loss(loss, train_op)