def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        agent_indices = tf.range(len(batch_env))
        action, step_summary = algo.perform(agent_indices, prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            agent_indices = tf.range(len(batch_env))
            experience_summary = algo.experience(agent_indices, prevob,
                                                 batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])
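The `+ 0` on the first line is easy to miss: adding zero forces a read of the variable before simulate() mutates it. A minimal sketch of that copy trick, assuming TensorFlow 1.x graph semantics (the names below are illustrative only):

import tensorflow as tf

var = tf.Variable(1.0)
snapshot = var + 0  # forces a read of the current value into a new tensor
with tf.control_dependencies([snapshot]):
    update = var.assign(2.0)  # runs only after the snapshot has been taken
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([snapshot, update]))  # [1.0, 2.0]: the snapshot keeps the old value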
Example #2
    def append(self, transitions, rows=None):
        """Append a batch of transitions to rows of the memory.

    Args:
      transitions: Tuple of transition quantities with batch dimension.
      rows: Episodes to append to, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        assert_capacity = tf.assert_less(rows,
                                         self._capacity,
                                         message='capacity exceeded')
        with tf.control_dependencies([assert_capacity]):
            assert_max_length = tf.assert_less(tf.gather(self._length, rows),
                                               self._max_length,
                                               message='max length exceeded')
        append_ops = []
        with tf.control_dependencies([assert_max_length]):
            for buffer_, elements in zip(self._buffers, transitions):
                timestep = tf.gather(self._length, rows)
                indices = tf.stack([rows, timestep], 1)
                append_ops.append(
                    tf.scatter_nd_update(buffer_, indices, elements))
        with tf.control_dependencies(append_ops):
            episode_mask = tf.reduce_sum(
                tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
            return self._length.assign_add(episode_mask)
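A minimal sketch of the (row, timestep) indexing that append() relies on, assuming TensorFlow 1.x; the buffer shape and values below are made up for illustration:

import tensorflow as tf

buffer_ = tf.Variable(tf.zeros([3, 4]))   # capacity x max_length
rows = tf.constant([0, 2])                # episodes to extend
timestep = tf.constant([1, 0])            # current length of each episode
indices = tf.stack([rows, timestep], 1)   # [[0, 1], [2, 0]]
write = tf.scatter_nd_update(buffer_, indices, tf.constant([5.0, 7.0]))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(write))  # element [0, 1] is now 5.0 and [2, 0] is 7.0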
Example #3
 def _define_experience(self, observ, action, reward):
     """Implement the branch of experience() entered during training."""
     update_filters = tf.summary.merge([
         self._observ_filter.update(observ),
         self._reward_filter.update(reward)
     ])
     with tf.control_dependencies([update_filters]):
         if self._config.train_on_agent_action:
             # NOTE: Doesn't seem to change much.
             action = self._last_action
         batch = observ, action, self._last_mean, self._last_logstd, reward
         append = self._episodes.append(batch,
                                        tf.range(len(self._batch_env)))
     with tf.control_dependencies([append]):
         norm_observ = self._observ_filter.transform(observ)
         norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
         # pylint: disable=g-long-lambda
         summary = tf.cond(
             self._should_log, lambda: tf.summary.merge([
                 update_filters,
                 self._observ_filter.summary(),
                 self._reward_filter.summary(),
                 tf.summary.scalar('memory_size', self._memory_index),
                 tf.summary.histogram('normalized_observ', norm_observ),
                 tf.summary.histogram('action', self._last_action),
                 tf.summary.scalar('normalized_reward', norm_reward)
             ]), str)
         return summary
Example #4
    def replace(self, episodes, length, rows=None):
        """Replace full episodes.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        assert_capacity = tf.assert_less(rows,
                                         self._capacity,
                                         message='capacity exceeded')
        with tf.control_dependencies([assert_capacity]):
            assert_max_length = tf.assert_less_equal(
                length, self._max_length, message='max length exceeded')
        replace_ops = []
        with tf.control_dependencies([assert_max_length]):
            for buffer_, elements in zip(self._buffers, episodes):
                replace_op = tf.scatter_update(buffer_, rows, elements)
                replace_ops.append(replace_op)
        with tf.control_dependencies(replace_ops):
            return tf.scatter_update(self._length, rows, length)
Example #5
def discounted_return(reward, length, discount):
  """Discounted Monte-Carlo returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur + discount * agg,
                  tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
                  tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
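A quick numeric check of discounted_return(), assuming TensorFlow 1.x: with constant reward 1 and discount 0.9, the returns are 1 + 0.9 + 0.81, then 1 + 0.9, then 1.

import tensorflow as tf

reward = tf.constant([[1.0, 1.0, 1.0]])
length = tf.constant([3])
return_ = discounted_return(reward, length, discount=0.9)
with tf.Session() as sess:
    print(sess.run(return_))  # [[2.71, 1.9, 1.0]]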
Example #6
    def length(self, rows=None):
        """Tensor holding the current length of episodes.

    Args:
      rows: Episodes to select length from, defaults to all.

    Returns:
      Batch tensor of sequence lengths.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        return tf.gather(self._length, rows)
Example #7
def fixed_step_return(reward, value, length, discount, window):
  """N-step discounted return."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.zeros_like(reward)
  for _ in range(window):
    return_ += reward
    reward = discount * tf.concat([reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
  return_ += discount**window * tf.concat(
      [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
  return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
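A numeric check of fixed_step_return(), assuming TensorFlow 1.x: each entry is r_t + 0.5 * r_{t+1} + 0.25 * V_{t+2}, with rewards and values beyond the sequence treated as zero.

import tensorflow as tf

reward = tf.constant([[1.0, 1.0, 1.0]])
value = tf.constant([[10.0, 10.0, 10.0]])
length = tf.constant([3])
return_ = fixed_step_return(reward, value, length, discount=0.5, window=2)
with tf.Session() as sess:
    print(sess.run(return_))  # [[4.0, 1.5, 1.0]]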
Example #8
def lambda_advantage(reward, value, length, discount):
  """Generalized Advantage Estimation."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  advantage = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur + discount * agg,
                  tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]), tf.zeros_like(delta[:, -1]),
                  1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
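A quick sanity check of lambda_advantage(), assuming TensorFlow 1.x: with a zero value baseline the temporal differences equal the rewards, so the advantage reduces to the discounted return.

import tensorflow as tf

reward = tf.constant([[1.0, 1.0]])
value = tf.zeros_like(reward)
length = tf.constant([2])
advantage = lambda_advantage(reward, value, length, discount=0.9)
with tf.Session() as sess:
    print(sess.run(advantage))  # [[1.9, 1.0]]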
Example #9
def lambda_return(reward, value, length, discount, lambda_):
  """TD-lambda returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  sequence = mask * reward + discount * value * (1 - lambda_)
  discount = mask * discount * lambda_
  sequence = tf.stack([sequence, discount], 2)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur[0] + cur[1] * agg,
                  tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]), tf.zeros_like(value[:, -1]),
                  1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
Example #10
    def clear(self, rows=None):
        """Reset episodes in the memory.

    Internally, this only sets their lengths to zero. The memory entries will
    be overwritten by future calls to append() or replace().

    Args:
      rows: Episodes to clear, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
Example #11
 def _define_end_episode(self, agent_indices):
     """Implement the branch of end_episode() entered during training."""
     episodes, length = self._episodes.data(agent_indices)
     space_left = self._config.update_every - self._memory_index
     use_episodes = tf.range(
         tf.minimum(tf.shape(agent_indices)[0], space_left))
     episodes = [tf.gather(elem, use_episodes) for elem in episodes]
     append = self._memory.replace(episodes,
                                   tf.gather(length, use_episodes),
                                   use_episodes + self._memory_index)
     with tf.control_dependencies([append]):
         inc_index = self._memory_index.assign_add(
             tf.shape(use_episodes)[0])
     with tf.control_dependencies([inc_index]):
         memory_full = self._memory_index >= self._config.update_every
         return tf.cond(memory_full, self._training, str)
Example #12
    def _perform_update_steps(self, observ, action, old_mean, old_logstd,
                              reward, length):
        """Perform multiple update steps of value function and policy.

    The advantage is computed once at the beginning and shared across
    iterations. Only one iteration's summary can be returned, so we pick the
    one produced after half of the iterations.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        return_ = utility.discounted_return(reward, length,
                                            self._config.discount)
        value = self._network(observ, length).value
        if self._config.gae_lambda:
            advantage = utility.lambda_return(reward, value, length,
                                              self._config.discount,
                                              self._config.gae_lambda)
        else:
            advantage = return_ - value
        mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
        advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
        advantage = tf.Print(advantage,
                             [tf.reduce_mean(return_),
                              tf.reduce_mean(value)], 'return and value: ')
        advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                             'normalized advantage: ')
        # pylint: disable=g-long-lambda
        value_loss, policy_loss, summary = tf.scan(
            lambda _1, _2: self._update_step(
                observ, action, old_mean, old_logstd, reward, advantage, length
            ),
            tf.range(self._config.update_epochs), [0., 0., ''],
            parallel_iterations=1)
        print_losses = tf.group(
            tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
            tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
        with tf.control_dependencies([value_loss, policy_loss, print_losses]):
            return summary[self._config.update_epochs // 2]
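The advantage normalization above is a standard trick to keep the gradient scale stable; a minimal sketch of the moments-based standardization, assuming TensorFlow 1.x:

import tensorflow as tf

advantage = tf.constant([[1.0, 2.0], [3.0, 4.0]])
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
normalized = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
with tf.Session() as sess:
    print(sess.run(normalized))  # zero mean and unit variance over batch and time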
Example #13
    def _mask(self, tensor, length):
        """Set padding elements of a batch of sequences to zero.

    Useful to then safely sum along the time dimension.

    Args:
      tensor: Tensor of sequences.
      length: Batch of sequence lengths.

    Returns:
      Masked sequences.
    """
        with tf.name_scope('mask'):
            range_ = tf.range(tensor.shape[1].value)
            mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
            masked = tensor * mask
            return tf.check_numerics(masked, 'masked')
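A minimal usage sketch of the same masking pattern, assuming TensorFlow 1.x; entries past each sequence length are zeroed so sums over the time dimension ignore padding:

import tensorflow as tf

tensor = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
length = tf.constant([2, 3])
range_ = tf.range(tensor.shape[1].value)
mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
with tf.Session() as sess:
    print(sess.run(tensor * mask))  # [[1. 2. 0.] [4. 5. 6.]]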
Example #14
    def data(self, rows=None):
        """Access a batch of episodes from the memory.

    Padding elements after the length of each episode are unspecified and might
    contain old data.

    Args:
      rows: Episodes to select, defaults to all.

    Returns:
      Tuple containing a tuple of transition quantities with batch and time
      dimensions, and a batch of sequence lengths.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
        length = tf.gather(self._length, rows)
        return episode, length
Example #15
    def _update_value(self, observ, reward, length):
        """Perform multiple update steps of the value baseline.

    Only one iteration's summary can be returned, so we pick the one produced
    after half of the iterations.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
        with tf.name_scope('update_value'):
            loss, summary = tf.scan(
                lambda _1, _2: self._update_value_step(observ, reward, length),
                tf.range(self._config.update_epochs_value), [0., ''],
                parallel_iterations=1)
            print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
            with tf.control_dependencies([loss, print_loss]):
                return summary[self._config.update_epochs_value // 2]
Example #16
    def reset(self, indices=None):
        """Reset the batch of environments.

    Args:
      indices: The batch indices of the environments to reset; defaults to all.

    Returns:
      Batch tensor of the new observations.
    """
        if indices is None:
            indices = tf.range(len(self._batch_env))
        observ_dtype = self._parse_dtype(self._batch_env.observation_space)
        observ = tf.py_func(self._batch_env.reset, [indices],
                            observ_dtype,
                            name='reset')
        observ = tf.check_numerics(observ, 'observ')
        reward = tf.zeros_like(indices, tf.float32)
        done = tf.zeros_like(indices, tf.bool)
        with tf.control_dependencies([
                tf.scatter_update(self._observ, indices, observ),
                tf.scatter_update(self._reward, indices, reward),
                tf.scatter_update(self._done, indices, done)
        ]):
            return tf.identity(observ)
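The tf.py_func call is what bridges the in-graph reset to the external, non-graph environments. A minimal sketch of that pattern, assuming TensorFlow 1.x; _external_reset is a hypothetical stand-in for batch_env.reset:

import numpy as np
import tensorflow as tf

def _external_reset(indices):
    # hypothetical stand-in for batch_env.reset: one observation per index
    return np.zeros((len(indices), 4), dtype=np.float32)

indices = tf.range(2)
observ = tf.py_func(_external_reset, [indices], tf.float32, name='reset')
with tf.Session() as sess:
    print(sess.run(observ).shape)  # (2, 4)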
Example #17
File: simulate.py  Project: Gs-001/quad
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vecrotized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary
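Throughout simulate(), `str` appears as the false branch of tf.cond: called with no arguments it returns '', so both branches yield a string tensor (an empty summary). A minimal sketch of that idiom, assuming TensorFlow 1.x:

import tensorflow as tf

flag = tf.placeholder(tf.bool, shape=())
summary = tf.cond(flag, lambda: tf.summary.scalar('value', tf.constant(1.0)), str)
with tf.Session() as sess:
    print(sess.run(summary, {flag: False}))  # b'': no summary was produced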