Example #1
    def train_bc(self, expert_dataset_iter):
        """Performs a single training step of behavior clonning.

    The method optimizes MLE on the expert dataset.

    Args:
      expert_dataset_iter: An tensorflow graph iteratable object.
    """

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self.actor.variables)
            states, actions, _ = next(expert_dataset_iter)
            log_probs = self.actor.get_log_prob(states, actions)
            actor_loss = tf.reduce_mean(
                -log_probs) + keras_utils.orthogonal_regularization(
                    self.actor.trunk)

        actor_grads = tape.gradient(actor_loss, self.actor.variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grads, self.actor.variables))

        self.avg_actor_loss(actor_loss)

        if tf.equal(self.actor_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train bc/actor_loss',
                              self.avg_actor_loss.result(),
                              step=self.actor_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_actor_loss)
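
The behavior cloning step above draws a single expert batch per call, so it can be driven by any tf.data iterator that yields (states, actions, rewards) tuples. Below is a minimal sketch of such a driver; the `agent` instance, array shapes, and batch size are assumptions for illustration only, not part of the original code.

# Minimal sketch (not from the original repository) of feeding train_bc with a
# tf.data iterator. All arrays are random placeholder data; `agent` stands in
# for the trainer object that defines train_bc.
import numpy as np
import tensorflow as tf

expert_states = np.random.randn(1000, 17).astype(np.float32)   # placeholder state dim
expert_actions = np.random.randn(1000, 6).astype(np.float32)   # placeholder action dim
expert_rewards = np.zeros(1000, dtype=np.float32)               # unused by train_bc

expert_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (expert_states, expert_actions, expert_rewards))
    .shuffle(1000)
    .batch(256)
    .repeat())
expert_dataset_iter = iter(expert_dataset)

for _ in range(10):
    agent.train_bc(expert_dataset_iter)  # one MLE step per call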
Example #2
    def train(self,
              replay_buffer_iter,
              discount=0.99,
              tau=0.005,
              target_entropy=0,
              actor_update_freq=2):
        """Performs a single training step for critic and actor.

    Args:
      replay_buffer_iter: An tensorflow graph iteratable object.
      discount: A discount used to compute returns.
      tau: A soft updates discount.
      target_entropy: A target entropy for alpha.
      actor_update_freq: A frequency of the actor network updates.

    Returns:
      Actor and alpha losses.
    """
        states, actions, next_states, rewards, masks = next(
            replay_buffer_iter)[0]

        rewards = self.rewards_fn(states, actions, rewards)

        critic_loss = self.fit_critic(states, actions, next_states, rewards,
                                      masks, discount)

        self.avg_critic_loss(critic_loss)
        if tf.equal(self.critic_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train sac/critic_loss',
                              self.avg_critic_loss.result(),
                              step=self.critic_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_critic_loss)

        if tf.equal(self.critic_optimizer.iterations % actor_update_freq, 0):
            actor_loss, alpha_loss, entropy = self.fit_actor(
                states, target_entropy)
            soft_update(self.critic, self.critic_target, tau=tau)

            self.avg_actor_loss(actor_loss)
            self.avg_alpha_loss(alpha_loss)
            self.avg_actor_entropy(entropy)
            self.avg_alpha(self.alpha)
            if tf.equal(self.actor_optimizer.iterations % self.log_interval,
                        0):
                tf.summary.scalar('train sac/actor_loss',
                                  self.avg_actor_loss.result(),
                                  step=self.actor_optimizer.iterations)
                keras_utils.my_reset_states(self.avg_actor_loss)

                tf.summary.scalar('train sac/alpha_loss',
                                  self.avg_alpha_loss.result(),
                                  step=self.actor_optimizer.iterations)
                keras_utils.my_reset_states(self.avg_alpha_loss)

                tf.summary.scalar('train sac/actor entropy',
                                  self.avg_actor_entropy.result(),
                                  step=self.actor_optimizer.iterations)
                keras_utils.my_reset_states(self.avg_actor_entropy)

                tf.summary.scalar('train sac/alpha',
                                  self.avg_alpha.result(),
                                  step=self.actor_optimizer.iterations)
                keras_utils.my_reset_states(self.avg_alpha)
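
train() relies on a soft_update helper to move the target critic towards the online critic at rate tau. The helper itself is not shown in this example; the sketch below assumes it follows the standard Polyak-averaging formulation, which may differ in detail from the repository's own definition.

import tensorflow as tf

def soft_update(net, target_net, tau=0.005):
    """Polyak update: target <- tau * online + (1 - tau) * target."""
    for var, target_var in zip(net.variables, target_net.variables):
        target_var.assign(tau * var + (1.0 - tau) * target_var)

With a small tau such as 0.005, the target critic trails the online critic slowly, which keeps the bootstrapped targets used in fit_critic stable.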
Example #3
    def update(self,
               expert_dataset_iter,
               policy_dataset_iter,
               discount,
               replay_regularization=0.05,
               nu_reg=10.0):
        """A function that updates nu network.

    When replay regularization is non-zero, it learns
    (d_pi * (1 - replay_regularization) + d_rb * replay_regulazation) /
    (d_expert * (1 - replay_regularization) + d_rb * replay_regulazation)
    instead.

    Args:
      expert_dataset_iter: An tensorflow graph iteratable over expert data.
      policy_dataset_iter: An tensorflow graph iteratable over training policy
        data, used for regularization.
      discount: An MDP discount.
      replay_regularization: A fraction of samples to add from a replay buffer.
      nu_reg: A grad penalty regularization coefficient.
    """

        (expert_states, expert_actions,
         expert_next_states) = expert_dataset_iter.get_next()

        expert_initial_states = expert_states

        rb_states, rb_actions, rb_next_states, _, _ = policy_dataset_iter.get_next(
        )[0]

        with tf.GradientTape(watch_accessed_variables=False,
                             persistent=True) as tape:
            tape.watch(self.actor.variables)
            tape.watch(self.nu_net.variables)

            _, policy_next_actions, _ = self.actor(expert_next_states)
            _, rb_next_actions, rb_log_prob = self.actor(rb_next_states)

            _, policy_initial_actions, _ = self.actor(expert_initial_states)

            # Inputs for the linear part of DualDICE loss.
            expert_init_inputs = tf.concat(
                [expert_initial_states, policy_initial_actions], 1)

            expert_inputs = tf.concat([expert_states, expert_actions], 1)
            expert_next_inputs = tf.concat(
                [expert_next_states, policy_next_actions], 1)

            rb_inputs = tf.concat([rb_states, rb_actions], 1)
            rb_next_inputs = tf.concat([rb_next_states, rb_next_actions], 1)

            expert_nu_0 = self.nu_net(expert_init_inputs)
            expert_nu = self.nu_net(expert_inputs)
            expert_nu_next = self.nu_net(expert_next_inputs)

            rb_nu = self.nu_net(rb_inputs)
            rb_nu_next = self.nu_net(rb_next_inputs)

            expert_diff = expert_nu - discount * expert_nu_next
            rb_diff = rb_nu - discount * rb_nu_next

            linear_loss_expert = tf.reduce_mean(expert_nu_0 * (1 - discount))

            linear_loss_rb = tf.reduce_mean(rb_diff)

            rb_expert_diff = tf.concat([expert_diff, rb_diff], 0)
            rb_expert_weights = tf.concat([
                tf.ones(expert_diff.shape) * (1 - replay_regularization),
                tf.ones(rb_diff.shape) * replay_regularization
            ], 0)

            rb_expert_weights /= tf.reduce_sum(rb_expert_weights)
            non_linear_loss = tf.reduce_sum(
                tf.stop_gradient(
                    weighted_softmax(rb_expert_diff, rb_expert_weights,
                                     axis=0)) * rb_expert_diff)

            linear_loss = (linear_loss_expert * (1 - replay_regularization) +
                           linear_loss_rb * replay_regularization)

            loss = (non_linear_loss - linear_loss)

            alpha = tf.random.uniform(shape=(expert_inputs.shape[0], 1))

            nu_inter = alpha * expert_inputs + (1 - alpha) * rb_inputs
            nu_next_inter = alpha * expert_next_inputs + (
                1 - alpha) * rb_next_inputs

            nu_inter = tf.concat([nu_inter, nu_next_inter], 0)

            with tf.GradientTape(watch_accessed_variables=False) as tape2:
                tape2.watch(nu_inter)
                nu_output = self.nu_net(nu_inter)
            nu_grad = tape2.gradient(nu_output, [nu_inter])[0] + EPS
            nu_grad_penalty = tf.reduce_mean(
                tf.square(tf.norm(nu_grad, axis=-1, keepdims=True) - 1))

            nu_loss = loss + nu_grad_penalty * nu_reg
            pi_loss = -loss + keras_utils.orthogonal_regularization(
                self.actor.trunk)

        nu_grads = tape.gradient(nu_loss, self.nu_net.variables)
        pi_grads = tape.gradient(pi_loss, self.actor.variables)

        self.nu_optimizer.apply_gradients(zip(nu_grads, self.nu_net.variables))
        self.actor_optimizer.apply_gradients(
            zip(pi_grads, self.actor.variables))

        del tape

        self.avg_nu_expert(expert_nu)
        self.avg_nu_rb(rb_nu)

        self.nu_reg_metric(nu_grad_penalty)
        self.avg_loss(loss)

        self.avg_actor_loss(pi_loss)
        self.avg_actor_entropy(-rb_log_prob)

        if tf.equal(self.nu_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train dual dice/loss',
                              self.avg_loss.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_loss)

            tf.summary.scalar('train dual dice/nu expert',
                              self.avg_nu_expert.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_nu_expert)

            tf.summary.scalar('train dual dice/nu rb',
                              self.avg_nu_rb.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_nu_rb)

            tf.summary.scalar('train dual dice/nu reg',
                              self.nu_reg_metric.result(),
                              step=self.nu_optimizer.iterations)
            keras_utils.my_reset_states(self.nu_reg_metric)

        if tf.equal(self.actor_optimizer.iterations % self.log_interval, 0):
            tf.summary.scalar('train sac/actor_loss',
                              self.avg_actor_loss.result(),
                              step=self.actor_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_actor_loss)

            tf.summary.scalar('train sac/actor entropy',
                              self.avg_actor_entropy.result(),
                              step=self.actor_optimizer.iterations)
            keras_utils.my_reset_states(self.avg_actor_entropy)
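
The non-linear part of the nu loss above weights each Bellman difference by a softmax over the mixed expert/replay batch via weighted_softmax. That helper is not shown in this example; the sketch below assumes the conventional definition of a weighted softmax (exponentials scaled by the weights and renormalized), so the repository's version may differ in details such as numerical stabilization.

import tensorflow as tf

def weighted_softmax(x, weights, axis=0):
    """Softmax of x over `axis`, with each exponential scaled by `weights`."""
    x = x - tf.reduce_max(x, axis=axis, keepdims=True)  # stabilize the exponentials
    unnormalized = weights * tf.exp(x)
    return unnormalized / tf.reduce_sum(unnormalized, axis=axis, keepdims=True)

In the update above, the weights passed in are the normalized mixture built from rb_expert_weights, so expert and replay differences enter the softmax in the (1 - replay_regularization) / replay_regularization proportions described in the docstring.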