def _training(self):
        """Perform multiple training iterations of both policy and value baseline.

    Training on the episodes collected in the memory. Reset the memory
    afterwards. Always returns a summary string.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('training'):
            assert_full = tf.assert_equal(self._memory_index,
                                          self._config.update_every)
            with tf.control_dependencies([assert_full]):
                data = self._memory.data()
            (observ, action, old_mean, old_logstd, reward), length = data
            with tf.control_dependencies([tf.assert_greater(length, 0)]):
                length = tf.identity(length)
            observ = self._observ_filter.transform(observ)
            reward = self._reward_filter.transform(reward)
            update_summary = self._perform_update_steps(
                observ, action, old_mean, old_logstd, reward, length)
            with tf.control_dependencies([update_summary]):
                penalty_summary = self._adjust_penalty(observ, old_mean,
                                                       old_logstd, length)
            with tf.control_dependencies([penalty_summary]):
                clear_memory = tf.group(self._memory.clear(),
                                        self._memory_index.assign(0))
            with tf.control_dependencies([clear_memory]):
                weight_summary = utility.variable_summaries(
                    tf.trainable_variables(), self._config.weight_summaries)
                return tf.summary.merge(
                    [update_summary, penalty_summary, weight_summary])
 def _define_experience(self, agent_indices, observ, action, reward):
     """Implement the branch of experience() entered during training."""
     update_filters = tf.summary.merge([
         self._observ_filter.update(observ),
         self._reward_filter.update(reward)
     ])
     with tf.control_dependencies([update_filters]):
         if self._config.train_on_agent_action:
             # NOTE: Doesn't seem to change much.
             action = self._last_action
         batch = (observ, action, tf.gather(self._last_mean, agent_indices),
                  tf.gather(self._last_logstd, agent_indices), reward)
         append = self._episodes.append(batch, agent_indices)
     with tf.control_dependencies([append]):
         norm_observ = self._observ_filter.transform(observ)
         norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
         # pylint: disable=g-long-lambda
         summary = tf.cond(
             self._should_log, lambda: tf.summary.merge([
                 update_filters,
                 self._observ_filter.summary(),
                 self._reward_filter.summary(),
                 tf.summary.scalar('memory_size', self._memory_index),
                 tf.summary.histogram('normalized_observ', norm_observ),
                 tf.summary.histogram('action', self._last_action),
                 tf.summary.scalar('normalized_reward', norm_reward)
             ]), str)
         return summary
Exemplo n.º 3
0
    def update(self, value):
        """Update the mean and variance estimates.

    Args:
      value: Batch or single value tensor.

    Returns:
      Summary tensor.
    """
        with tf.name_scope(self._name + '/update'):
            if value.shape.ndims == self._mean.shape.ndims:
                # Add a batch dimension if necessary.
                value = value[None, ...]
            count = tf.shape(value)[0]
            with tf.control_dependencies([self._count.assign_add(count)]):
                step = tf.cast(self._count, tf.float32)
                mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
                new_mean = self._mean + mean_delta / step
                new_mean = tf.cond(self._count > 1, lambda: new_mean,
                                   lambda: value[0])
                var_delta = (value - self._mean[None, ...]) * (
                    value - new_mean[None, ...])
                new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
            with tf.control_dependencies([new_mean, new_var_sum]):
                update = self._mean.assign(new_mean), self._var_sum.assign(
                    new_var_sum)
            with tf.control_dependencies(update):
                if value.shape.ndims == 1:
                    value = tf.reduce_mean(value)
                return self._summary('value', tf.reduce_mean(value))
Exemplo n.º 4
0
    def replace(self, episodes, length, rows=None):
        """Replace full episodes.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        assert_capacity = tf.assert_less(rows,
                                         self._capacity,
                                         message='capacity exceeded')
        with tf.control_dependencies([assert_capacity]):
            assert_max_length = tf.assert_less_equal(
                length, self._max_length, message='max length exceeded')
        replace_ops = []
        with tf.control_dependencies([assert_max_length]):
            for buffer_, elements in zip(self._buffers, episodes):
                replace_op = tf.scatter_update(buffer_, rows, elements)
                replace_ops.append(replace_op)
        with tf.control_dependencies(replace_ops):
            return tf.scatter_update(self._length, rows, length)
    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        agent_indices = tf.range(len(batch_env))
        action, step_summary = algo.perform(agent_indices, prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            agent_indices = tf.range(len(batch_env))
            experience_summary = algo.experience(agent_indices, prevob,
                                                 batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])
Exemplo n.º 6
0
    def append(self, transitions, rows=None):
        """Append a batch of transitions to rows of the memory.

    Args:
      transitions: Tuple of transition quantities with batch dimension.
      rows: Episodes to append to, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        assert_capacity = tf.assert_less(rows,
                                         self._capacity,
                                         message='capacity exceeded')
        with tf.control_dependencies([assert_capacity]):
            assert_max_length = tf.assert_less(tf.gather(self._length, rows),
                                               self._max_length,
                                               message='max length exceeded')
        append_ops = []
        with tf.control_dependencies([assert_max_length]):
            for buffer_, elements in zip(self._buffers, transitions):
                timestep = tf.gather(self._length, rows)
                indices = tf.stack([rows, timestep], 1)
                append_ops.append(
                    tf.scatter_nd_update(buffer_, indices, elements))
        with tf.control_dependencies(append_ops):
            episode_mask = tf.reduce_sum(
                tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
            return self._length.assign_add(episode_mask)
  def _define_step(self, done, score, summary):
    """Combine operations of a phase.

    Keeps track of the mean score and when to report it.

    Args:
      done: Tensor indicating whether current score can be used.
      score: Tensor holding the current, possibly intermediate, score.
      summary: Tensor holding summary string to write if not an empty string.

    Returns:
      Tuple of summary tensor, mean score, and new global step. The mean score
      is zero for non reporting steps.
    """
    if done.shape.ndims == 0:
      done = done[None]
    if score.shape.ndims == 0:
      score = score[None]
    score_mean = streaming_mean.StreamingMean((), tf.float32)
    with tf.control_dependencies([done, score, summary]):
      done_score = tf.gather(score, tf.where(done)[:, 0])
      submit_score = tf.cond(tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
    with tf.control_dependencies([submit_score]):
      mean_score = tf.cond(self._report, score_mean.clear, float)
      steps_made = tf.shape(score)[0]
      next_step = self._step.assign_add(steps_made)
    with tf.control_dependencies([mean_score, next_step]):
      return tf.identity(summary), mean_score, next_step, steps_made
Exemplo n.º 8
0
 def clear(self):
     """Return the mean estimate and reset the streaming statistics."""
     value = self._sum / tf.cast(self._count, self._dtype)
     with tf.control_dependencies([value]):
         reset_value = self._sum.assign(tf.zeros_like(self._sum))
         reset_count = self._count.assign(0)
     with tf.control_dependencies([reset_value, reset_count]):
         return tf.identity(value)
Exemplo n.º 9
0
    def _update_policy_step(self, observ, action, old_mean, old_logstd,
                            advantage, length):
        """Compute the current policy loss and perform a gradient update step.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
        network = self._network(observ, length)
        loss, summary = self._policy_loss(network.mean, network.logstd,
                                          old_mean, old_logstd, action,
                                          advantage, length)
        gradients, variables = (zip(
            *self._policy_optimizer.compute_gradients(loss)))
        optimize = self._policy_optimizer.apply_gradients(
            zip(gradients, variables))
        summary = tf.summary.merge([
            summary,
            tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
            utility.gradient_summaries(zip(gradients, variables),
                                       dict(policy=r'.*'))
        ])
        with tf.control_dependencies([optimize]):
            return [tf.identity(loss), tf.identity(summary)]
Exemplo n.º 10
0
    def perform(self, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      observ: Tensor of a batch of observations for all algorithms.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            network = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                    self._last_state)
            action = tf.cond(self._is_training, network.policy.sample,
                             lambda: network.mean)
            logprob = network.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', network.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            with tf.control_dependencies([
                    utility.assign_nested_vars(self._last_state,
                                               network.state),
                    self._last_action.assign(action[:, 0]),
                    self._last_mean.assign(network.mean[:, 0]),
                    self._last_logstd.assign(network.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
 def _define_end_episode(self, agent_indices):
     """Implement the branch of end_episode() entered during training."""
     episodes, length = self._episodes.data(agent_indices)
     space_left = self._config.update_every - self._memory_index
     use_episodes = tf.range(
         tf.minimum(tf.shape(agent_indices)[0], space_left))
     episodes = [tf.gather(elem, use_episodes) for elem in episodes]
     append = self._memory.replace(episodes,
                                   tf.gather(length, use_episodes),
                                   use_episodes + self._memory_index)
     with tf.control_dependencies([append]):
         inc_index = self._memory_index.assign_add(
             tf.shape(use_episodes)[0])
     with tf.control_dependencies([inc_index]):
         memory_full = self._memory_index >= self._config.update_every
         return tf.cond(memory_full, self._training, str)
    def _adjust_penalty(self, observ, old_mean, old_logstd, length):
        """Adjust the KL policy between the behavioral and current policy.

    Compute how much the policy actually changed during the multiple
    update steps. Adjust the penalty strength for the next training phase if we
    overshot or undershot the target divergence too much.

    Args:
      observ: Sequences of observations.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('adjust_penalty'):
            network = self._network(observ, length)
            assert_change = tf.assert_equal(tf.reduce_all(
                tf.equal(network.mean, old_mean)),
                                            False,
                                            message='policy should change')
            print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
            with tf.control_dependencies([assert_change, print_penalty]):
                kl_change = tf.reduce_mean(
                    self._mask(
                        utility.diag_normal_kl(old_mean, old_logstd,
                                               network.mean, network.logstd),
                        length))
                kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
                maybe_increase = tf.cond(
                    kl_change > 1.3 * self._config.kl_target,
                    # pylint: disable=g-long-lambda
                    lambda: tf.Print(self._penalty.assign(self._penalty * 1.5),
                                     [0], 'increase penalty '),
                    float)
                maybe_decrease = tf.cond(
                    kl_change < 0.7 * self._config.kl_target,
                    # pylint: disable=g-long-lambda
                    lambda: tf.Print(self._penalty.assign(self._penalty / 1.5),
                                     [0], 'decrease penalty '),
                    float)
            with tf.control_dependencies([maybe_increase, maybe_decrease]):
                return tf.summary.merge([
                    tf.summary.scalar('kl_change', kl_change),
                    tf.summary.scalar('penalty', self._penalty)
                ])
Exemplo n.º 13
0
    def reset(self):
        """Reset the environment.

    Returns:
      Tensor of the current observation.
    """
        observ_dtype = self._parse_dtype(self._env.observation_space)
        observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
        observ = tf.check_numerics(observ, 'observ')
        with tf.control_dependencies([
                self._observ.assign(observ),
                self._reward.assign(0),
                self._done.assign(False)
        ]):
            return tf.identity(observ)
Exemplo n.º 14
0
    def begin_episode(self, agent_indices):
        """Reset the recurrent states and stored episode.

    Args:
      agent_indices: 1D tensor of batch indices for algorithms starting an episode.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('begin_episode/'):
            reset_state = utility.reinit_nested_vars(self._last_state,
                                                     agent_indices)
            reset_buffer = self._episodes.clear(agent_indices)
            with tf.control_dependencies([reset_state, reset_buffer]):
                return tf.constant('')
    def _perform_update_steps(self, observ, action, old_mean, old_logstd,
                              reward, length):
        """Perform multiple update steps of value function and policy.

    The advantage is computed once at the beginning and shared across
    iterations. We need to decide for the summary of one iteration, and thus
    choose the one after half of the iterations.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        return_ = utility.discounted_return(reward, length,
                                            self._config.discount)
        value = self._network(observ, length).value
        if self._config.gae_lambda:
            advantage = utility.lambda_return(reward, value, length,
                                              self._config.discount,
                                              self._config.gae_lambda)
        else:
            advantage = return_ - value
        mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
        advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
        advantage = tf.Print(advantage,
                             [tf.reduce_mean(return_),
                              tf.reduce_mean(value)], 'return and value: ')
        advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                             'normalized advantage: ')
        # pylint: disable=g-long-lambda
        value_loss, policy_loss, summary = tf.scan(
            lambda _1, _2: self._update_step(
                observ, action, old_mean, old_logstd, reward, advantage, length
            ),
            tf.range(self._config.update_epochs), [0., 0., ''],
            parallel_iterations=1)
        print_losses = tf.group(
            tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
            tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
        with tf.control_dependencies([value_loss, policy_loss, print_losses]):
            return summary[self._config.update_epochs // 2]
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            output = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                   state)
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
Exemplo n.º 17
0
    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)
    def begin_episode(self, agent_indices):
        """Reset the recurrent states and stored episode.

    Args:
      agent_indices: Tensor containing current batch indices.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('begin_episode/'):
            if self._last_state is None:
                reset_state = tf.no_op()
            else:
                reset_state = utility.reinit_nested_vars(
                    self._last_state, agent_indices)
            reset_buffer = self._episodes.clear(agent_indices)
            with tf.control_dependencies([reset_state, reset_buffer]):
                return tf.constant('')
Exemplo n.º 19
0
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)
    def _update_step(self, observ, action, old_mean, old_logstd, reward,
                     advantage, length):
        """Compute the current combined loss and perform a gradient update step.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of reward.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of value loss, policy loss, and summary tensor.
    """
        value_loss, value_summary = self._value_loss(observ, reward, length)
        network = self._network(observ, length)
        policy_loss, policy_summary = self._policy_loss(
            network.mean, network.logstd, old_mean, old_logstd, action,
            advantage, length)
        value_gradients, value_variables = (zip(
            *self._optimizer.compute_gradients(value_loss)))
        policy_gradients, policy_variables = (zip(
            *self._optimizer.compute_gradients(policy_loss)))
        all_gradients = value_gradients + policy_gradients
        all_variables = value_variables + policy_variables
        optimize = self._optimizer.apply_gradients(
            zip(all_gradients, all_variables))
        summary = tf.summary.merge([
            value_summary, policy_summary,
            tf.summary.scalar('value_gradient_norm',
                              tf.global_norm(value_gradients)),
            tf.summary.scalar('policy_gradient_norm',
                              tf.global_norm(policy_gradients)),
            utility.gradient_summaries(zip(value_gradients, value_variables),
                                       dict(value=r'.*')),
            utility.gradient_summaries(zip(policy_gradients, policy_variables),
                                       dict(policy=r'.*'))
        ])
        with tf.control_dependencies([optimize]):
            return [tf.identity(x) for x in (value_loss, policy_loss, summary)]
Exemplo n.º 21
0
    def _update_value(self, observ, reward, length):
        """Perform multiple update steps of the value baseline.

    We need to decide for the summary of one iteration, and thus choose the one
    after half of the iterations.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('update_value'):
            loss, summary = tf.scan(
                lambda _1, _2: self._update_value_step(observ, reward, length),
                tf.range(self._config.update_epochs_value), [0., ''],
                parallel_iterations=1)
            print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
            with tf.control_dependencies([loss, print_loss]):
                return summary[self._config.update_epochs_value // 2]
Exemplo n.º 22
0
    def reset(self, indices=None):
        """Reset the batch of environments.

    Args:
      indices: The batch indices of the environments to reset; defaults to all.

    Returns:
      Batch tensor of the new observations.
    """
        if indices is None:
            indices = tf.range(len(self._batch_env))
        observ_dtype = self._parse_dtype(self._batch_env.observation_space)
        observ = tf.py_func(self._batch_env.reset, [indices],
                            observ_dtype,
                            name='reset')
        observ = tf.check_numerics(observ, 'observ')
        reward = tf.zeros_like(indices, tf.float32)
        done = tf.zeros_like(indices, tf.bool)
        with tf.control_dependencies([
                tf.scatter_update(self._observ, indices, observ),
                tf.scatter_update(self._reward, indices, reward),
                tf.scatter_update(self._done, indices, done)
        ]):
            return tf.identity(observ)
Exemplo n.º 23
0
    def _update_value_step(self, observ, reward, length):
        """Compute the current value loss and perform a gradient update step.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
        loss, summary = self._value_loss(observ, reward, length)
        gradients, variables = (zip(
            *self._value_optimizer.compute_gradients(loss)))
        optimize = self._value_optimizer.apply_gradients(
            zip(gradients, variables))
        summary = tf.summary.merge([
            summary,
            tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
            utility.gradient_summaries(zip(gradients, variables),
                                       dict(value=r'.*'))
        ])
        with tf.control_dependencies([optimize]):
            return [tf.identity(loss), tf.identity(summary)]
Exemplo n.º 24
0
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vecrotized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary
    def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                     advantage, length):
        """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already changed more than twice the target
       amount, we activate a strong penalty discouraging further divergence.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
        with tf.name_scope('policy_loss'):
            entropy = utility.diag_normal_entropy(mean, logstd)
            kl = tf.reduce_mean(
                self._mask(
                    utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
                    length), 1)
            policy_gradient = tf.exp(
                utility.diag_normal_logpdf(mean, logstd, action) -
                utility.diag_normal_logpdf(old_mean, old_logstd, action))
            surrogate_loss = -tf.reduce_mean(
                self._mask(policy_gradient * tf.stop_gradient(advantage),
                           length), 1)
            kl_penalty = self._penalty * kl
            cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
            cutoff_count = tf.reduce_sum(
                tf.cast(kl > cutoff_threshold, tf.int32))
            with tf.control_dependencies([
                    tf.cond(cutoff_count > 0,
                            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
                            int)
            ]):
                kl_cutoff = (self._config.kl_cutoff_coef *
                             tf.cast(kl > cutoff_threshold, tf.float32) *
                             (kl - cutoff_threshold)**2)
            policy_loss = surrogate_loss + kl_penalty + kl_cutoff
            summary = tf.summary.merge([
                tf.summary.histogram('entropy', entropy),
                tf.summary.histogram('kl', kl),
                tf.summary.histogram('surrogate_loss', surrogate_loss),
                tf.summary.histogram('kl_penalty', kl_penalty),
                tf.summary.histogram('kl_cutoff', kl_cutoff),
                tf.summary.histogram('kl_penalty_combined',
                                     kl_penalty + kl_cutoff),
                tf.summary.histogram('policy_loss', policy_loss),
                tf.summary.scalar('avg_surr_loss',
                                  tf.reduce_mean(surrogate_loss)),
                tf.summary.scalar('avg_kl_penalty',
                                  tf.reduce_mean(kl_penalty)),
                tf.summary.scalar('avg_policy_loss',
                                  tf.reduce_mean(policy_loss))
            ])
            policy_loss = tf.reduce_mean(policy_loss, 0)
            return tf.check_numerics(policy_loss, 'policy_loss'), summary