Example #1
File: normalize.py  Project: Gs-001/quad
    def __init__(self,
                 template,
                 center=True,
                 scale=True,
                 clip=10,
                 name='normalize'):
        """Normalize tensors based on streaming estimates of mean and variance.

    Centering the value, scaling it by the standard deviation, and clipping
    outlier values are optional.

    Args:
      template: Example tensor providing shape and dtype of the value to track.
      center: Python boolean indicating whether to subtract mean from values.
      scale: Python boolean indicating whether to scale values by stddev.
      clip: Absolute value at which to clip normalized values, or None to
        disable clipping.
      name: Parent scope of operations provided by this class.
    """
        self._center = center
        self._scale = scale
        self._clip = clip
        self._name = name
        with tf.name_scope(name):
            self._count = tf.Variable(0, False)
            self._mean = tf.Variable(tf.zeros_like(template), False)
            self._var_sum = tf.Variable(tf.zeros_like(template), False)
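
A minimal NumPy sketch of the streaming update these three variables are meant to support, assuming the standard Chan et al. parallel merge of count, mean, and sum of squared deviations; the function name and signature are illustrative only, not this class's actual API:

import numpy as np

def streaming_update(count, mean, var_sum, batch):
    # Merge a batch of values (shape [n, ...]) into the running statistics.
    batch_count = len(batch)
    batch_mean = batch.mean(axis=0)
    batch_m2 = ((batch - batch_mean) ** 2).sum(axis=0)
    new_count = count + batch_count
    delta = batch_mean - mean
    new_mean = mean + delta * batch_count / new_count
    new_var_sum = (var_sum + batch_m2
                   + delta ** 2 * count * batch_count / new_count)
    # The variance estimate at any point is new_var_sum / new_count.
    return new_count, new_mean, new_var_sum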
Example #2
def fixed_step_return(reward, value, length, discount, window):
  """N-step discounted return."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.zeros_like(reward)
  for _ in range(window):
    return_ += reward
    reward = discount * tf.concat([reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
  return_ += discount**window * tf.concat(
      [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
  return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
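
For clarity, a plain-NumPy reference of the same quantity for a single sequence (the length mask is omitted here); purely illustrative:

import numpy as np

def fixed_step_return_np(reward, value, discount, window):
    # R_t = sum_{i < window} discount**i * r_{t+i} + discount**window * V_{t+window},
    # treating rewards and values past the end of the sequence as zero.
    horizon = len(reward)
    out = np.zeros(horizon)
    for t in range(horizon):
        for i in range(window):
            if t + i < horizon:
                out[t] += discount ** i * reward[t + i]
        if t + window < horizon:
            out[t] += discount ** window * value[t + window]
    return out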
Example #3
File: normalize.py  Project: Gs-001/quad
    def reset(self):
        """Reset the estimates of mean and variance.

    Resets the full state of this class.

    Returns:
      Operation.
    """
        with tf.name_scope(self._name + '/reset'):
            return tf.group(self._count.assign(0),
                            self._mean.assign(tf.zeros_like(self._mean)),
                            self._var_sum.assign(tf.zeros_like(self._var_sum)))
Example #4
def lambda_advantage(reward, value, length, discount):
  """Generalized Advantage Estimation."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  advantage = tf.reverse(
      tf.transpose(
          tf.scan(
              lambda agg, cur: cur + discount * agg,
              tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
              tf.zeros_like(delta[:, -1]), 1, False),
          [1, 0]),
      [1])
  return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
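
The reverse tf.scan above is a vectorized form of the usual backward recursion over TD errors; a single-sequence NumPy reference (length mask omitted, illustrative only):

import numpy as np

def advantage_np(reward, value, discount):
    # delta_t = r_t + discount * V_{t+1} - V_t, with V past the end taken as zero;
    # A_t = delta_t + discount * A_{t+1}, accumulated from the back.
    next_value = np.append(value[1:], 0.0)
    delta = reward + discount * next_value - value
    advantage = np.zeros(len(reward))
    carry = 0.0
    for t in reversed(range(len(reward))):
        carry = delta[t] + discount * carry
        advantage[t] = carry
    return advantage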
Example #5
    def clear(self):
        """Return the mean estimate and reset the streaming statistics."""
        value = self._sum / tf.cast(self._count, self._dtype)
        with tf.control_dependencies([value]):
            reset_value = self._sum.assign(tf.zeros_like(self._sum))
            reset_count = self._count.assign(0)
        with tf.control_dependencies([reset_value, reset_count]):
            return tf.identity(value)
Example #6
File: simulate.py  Project: Gs-001/quad
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)
Example #7
def discounted_return(reward, length, discount):
  """Discounted Monte-Carlo returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(
              lambda agg, cur: cur + discount * agg,
              tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
              tf.zeros_like(reward[:, -1]), 1, False),
          [1, 0]),
      [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
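
A hypothetical usage sketch, assuming TF1 graph mode and [batch, time] shaped inputs; the numbers follow the recursion R_t = r_t + discount * R_{t+1} over the masked steps:

import tensorflow as tf

reward = tf.constant([[1.0, 1.0, 1.0, 0.0]])  # one episode, padded to length 4
length = tf.constant([3])                     # only the first 3 steps are valid
return_op = discounted_return(reward, length, discount=0.99)

with tf.Session() as sess:
  print(sess.run(return_op))  # approximately [[2.9701, 1.99, 1.0, 0.0]]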
Example #8
def lambda_return(reward, value, length, discount, lambda_):
  """TD-lambda returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  sequence = mask * reward + discount * value * (1 - lambda_)
  discount = mask * discount * lambda_
  sequence = tf.stack([sequence, discount], 2)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(
              lambda agg, cur: cur[0] + cur[1] * agg,
              tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
              tf.zeros_like(value[:, -1]), 1, False),
          [1, 0]),
      [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
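
Written out, the stacked sequence/discount trick above implements the backward recursion below; a single-sequence NumPy reference (length mask omitted, illustrative only):

import numpy as np

def lambda_return_np(reward, value, discount, lambda_):
    # G_t = r_t + discount * (1 - lambda_) * v_t + discount * lambda_ * G_{t+1},
    # with G past the end of the sequence taken as zero.
    out = np.zeros(len(reward))
    carry = 0.0
    for t in reversed(range(len(reward))):
        carry = (reward[t] + discount * (1 - lambda_) * value[t]
                 + discount * lambda_ * carry)
        out[t] = carry
    return out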
Example #9
    def clear(self, rows=None):
        """Reset episodes in the memory.

    Internally, this only sets their lengths to zero. The memory entries will
    be overwritten by future calls to append() or replace().

    Args:
      rows: Episodes to clear, defaults to all.

    Returns:
      Operation.
    """
        rows = tf.range(self._capacity) if rows is None else rows
        assert rows.shape.ndims == 1
        return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
Example #10
    def reset(self, indices=None):
        """Reset the batch of environments.

    Args:
      indices: The batch indices of the environments to reset; defaults to all.

    Returns:
      Batch tensor of the new observations.
    """
        if indices is None:
            indices = tf.range(len(self._batch_env))
        observ_dtype = self._parse_dtype(self._batch_env.observation_space)
        observ = tf.py_func(self._batch_env.reset, [indices],
                            observ_dtype,
                            name='reset')
        observ = tf.check_numerics(observ, 'observ')
        reward = tf.zeros_like(indices, tf.float32)
        done = tf.zeros_like(indices, tf.bool)
        with tf.control_dependencies([
                tf.scatter_update(self._observ, indices, observ),
                tf.scatter_update(self._reward, indices, reward),
                tf.scatter_update(self._done, indices, done)
        ]):
            return tf.identity(observ)
Example #11
def reinit_nested_vars(variables, indices=None):
  """Reset all variables in a nested tuple to zeros.

  Args:
    variables: Nested tuple or list of variables.
    indices: Indices along the first dimension to reset, defaults to all.

  Returns:
    Operation.
  """
  if isinstance(variables, (tuple, list)):
    return tf.group(*[reinit_nested_vars(variable, indices) for variable in variables])
  if indices is None:
    return variables.assign(tf.zeros_like(variables))
  else:
    zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
    return tf.scatter_update(variables, indices, zeros)
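
A hypothetical usage sketch, assuming TF1 graph mode and a nested tuple of non-trainable variables (for example a recurrent state kept across steps); the variable shapes here are made up for illustration:

import tensorflow as tf

state = (tf.Variable(tf.zeros([4, 8]), trainable=False),
         tf.Variable(tf.zeros([4, 8]), trainable=False))
reset_all = reinit_nested_vars(state)                        # zero every row
reset_rows = reinit_nested_vars(state, tf.constant([0, 2]))  # zero selected rows

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run([reset_all, reset_rows])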
Example #12
    def __init__(self, batch_env, step, is_training, should_log, config):
        """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
        self._batch_env = batch_env
        self._step = step
        self._is_training = is_training
        self._should_log = should_log
        self._config = config
        self._observ_filter = normalize.StreamingNormalize(
            self._batch_env.observ[0],
            center=True,
            scale=True,
            clip=5,
            name='normalize_observ')
        self._reward_filter = normalize.StreamingNormalize(
            self._batch_env.reward[0],
            center=False,
            scale=True,
            clip=10,
            name='normalize_reward')
        # Memory stores tuple of observ, action, mean, logstd, reward.
        template = (self._batch_env.observ[0], self._batch_env.action[0],
                    self._batch_env.action[0], self._batch_env.action[0],
                    self._batch_env.reward[0])
        self._memory = memory.EpisodeMemory(template, config.update_every,
                                            config.max_length, 'memory')
        self._memory_index = tf.Variable(0, False)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            # Create network variables for later calls to reuse.
            action_size = self._batch_env.action.shape[1].value
            self._network = tf.make_template(
                'network',
                functools.partial(config.network, config, action_size))
            output = self._network(
                tf.zeros_like(self._batch_env.observ)[:, None],
                tf.ones(len(self._batch_env)))
            with tf.variable_scope('ppo_temporary'):
                self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                      config.max_length,
                                                      'episodes')
                if output.state is None:
                    self._last_state = None
                else:
                    # Ensure the batch dimension is set.
                    tf.contrib.framework.nest.map_structure(
                        lambda x: x.set_shape(
                            [len(batch_env)] + x.shape.as_list()[1:]),
                        output.state)
                    # pylint: disable=undefined-variable
                    self._last_state = tf.contrib.framework.nest.map_structure(
                        lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
                        output.state)
                self._last_action = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_action')
                self._last_mean = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                              False,
                                              name='last_mean')
                self._last_logstd = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_logstd')
        self._penalty = tf.Variable(self._config.kl_init_penalty,
                                    False,
                                    dtype=tf.float32)
        self._optimizer = self._config.optimizer(self._config.learning_rate)
Example #13
    def __init__(self, batch_env, step, is_training, should_log, config):
        """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
        self._batch_env = batch_env
        self._step = step
        self._is_training = is_training
        self._should_log = should_log
        self._config = config
        self._observ_filter = normalize.StreamingNormalize(
            self._batch_env.observ[0],
            center=True,
            scale=True,
            clip=5,
            name='normalize_observ')
        self._reward_filter = normalize.StreamingNormalize(
            self._batch_env.reward[0],
            center=False,
            scale=True,
            clip=10,
            name='normalize_reward')
        # Memory stores tuple of observ, action, mean, logstd, reward.
        template = (self._batch_env.observ[0], self._batch_env.action[0],
                    self._batch_env.action[0], self._batch_env.action[0],
                    self._batch_env.reward[0])
        self._memory = memory.EpisodeMemory(template, config.update_every,
                                            config.max_length, 'memory')
        self._memory_index = tf.Variable(0, False)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            # Create network variables for later calls to reuse.
            self._network(tf.zeros_like(self._batch_env.observ)[:, None],
                          tf.ones(len(self._batch_env)),
                          reuse=None)
            cell = self._config.network(self._batch_env.action.shape[1].value)
            with tf.variable_scope('ppo_temporary'):
                self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                      config.max_length,
                                                      'episodes')
                self._last_state = utility.create_nested_vars(
                    cell.zero_state(len(batch_env), tf.float32))
                self._last_action = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_action')
                self._last_mean = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                              False,
                                              name='last_mean')
                self._last_logstd = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_logstd')
        self._penalty = tf.Variable(self._config.kl_init_penalty,
                                    False,
                                    dtype=tf.float32)
        self._policy_optimizer = self._config.policy_optimizer(
            self._config.policy_lr, name='policy_optimizer')
        self._value_optimizer = self._config.value_optimizer(
            self._config.value_lr, name='value_optimizer')