def recurrent_gaussian(config, action_size, observations, length, state=None):
  """Independent recurrent policy and feed forward value networks.

  The policy network outputs the mean action and the log standard deviation
  is learned as independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.

  Args:
    config: Configuration object.
    action_size: Length of the action vector.
    observations: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.

  Returns:
    NetworkOutput tuple.
  """
  mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer(
      factor=config.init_mean_factor)
  logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
  cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])
  flat_observations = tf.reshape(observations, [
      tf.shape(observations)[0],
      tf.shape(observations)[1],
      functools.reduce(operator.mul,
                       observations.shape.as_list()[2:], 1)
  ])
  with tf.variable_scope('policy'):
    x = flat_observations
    for size in config.policy_layers[:-1]:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    x, state = tf.nn.dynamic_rnn(cell, x, length, state, tf.float32)
    mean = tf.contrib.layers.fully_connected(x,
                                             action_size,
                                             tf.tanh,
                                             weights_initializer=mean_weights_initializer)
    logstd = tf.get_variable('logstd', mean.shape[2:], tf.float32, logstd_initializer)
    logstd = tf.tile(logstd[None, None],
                     [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
  with tf.variable_scope('value'):
    x = flat_observations
    for size in config.value_layers:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0]
  mean = tf.check_numerics(mean, 'mean')
  logstd = tf.check_numerics(logstd, 'logstd')
  value = tf.check_numerics(value, 'value')
  policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
  # assert state.shape.as_list()[0] is not None
  return NetworkOutput(policy, mean, logstd, value, state)
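A minimal usage sketch, assuming a hypothetical configuration object with the attributes referenced above (in the repository the configuration and the NetworkOutput tuple come from the agents modules, and the factory is bound with functools.partial before being handed to the algorithm):

# Usage sketch; ExampleConfig and its values are illustrative, not the real config.
import functools

class ExampleConfig(object):
  init_mean_factor = 0.1      # scale of the mean-layer weight initializer
  init_logstd = -1.0          # initial log standard deviation of the policy
  policy_layers = (200, 100)  # the last entry is the size of the GRU cell
  value_layers = (200, 100)

network = functools.partial(recurrent_gaussian, ExampleConfig(), 6)
# observations: float32 tensor of shape [batch, time, observation_size]
# length: int32 tensor of shape [batch]
# output = network(observations, length)  # NetworkOutput(policy, mean, logstd, value, state)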
Example #2
    def update(self, value):
        """Update the mean and variance estimates.

    Args:
      value: Batch or single value tensor.

    Returns:
      Summary tensor.
    """
        with tf.name_scope(self._name + '/update'):
            if value.shape.ndims == self._mean.shape.ndims:
                # Add a batch dimension if necessary.
                value = value[None, ...]
            count = tf.shape(value)[0]
            with tf.control_dependencies([self._count.assign_add(count)]):
                step = tf.cast(self._count, tf.float32)
                mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
                new_mean = self._mean + mean_delta / step
                new_mean = tf.cond(self._count > 1, lambda: new_mean,
                                   lambda: value[0])
                var_delta = (value - self._mean[None, ...]) * (
                    value - new_mean[None, ...])
                new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
            with tf.control_dependencies([new_mean, new_var_sum]):
                update = self._mean.assign(new_mean), self._var_sum.assign(
                    new_var_sum)
            with tf.control_dependencies(update):
                if value.shape.ndims == 1:
                    value = tf.reduce_mean(value)
                return self._summary('value', tf.reduce_mean(value))
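The assignments above implement a standard streaming (Welford-style) mean and variance-sum update. A plain NumPy sketch of the same recurrence, for intuition only and not part of the original class:

import numpy as np

def streaming_update(mean, var_sum, count, batch):
    # NumPy version of the recurrence above; `batch` stacks values along axis 0.
    count += len(batch)
    new_mean = batch[0] if count == 1 else mean + np.sum(batch - mean, 0) / count
    # var_sum accumulates sum((x - old_mean) * (x - new_mean)); dividing it by
    # (count - 1) later gives the usual sample variance estimate.
    var_sum = var_sum + np.sum((batch - mean) * (batch - new_mean), 0)
    return new_mean, var_sum, count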
  def perform(self, agent_indices, unused_observ):
    shape = (tf.shape(agent_indices)[0],) + self._envs[0].action_space.shape
    low = self._envs[0].action_space.low
    high = self._envs[0].action_space.high
    action = tf.random_uniform(shape) * (high - low) + low
    return action, tf.constant('')
Example #4
  def submit(self, value):
    """Submit a single or batch tensor to refine the streaming mean."""
    # Add a batch dimension if necessary.
    if value.shape.ndims == self._sum.shape.ndims:
      value = value[None, ...]
    return tf.group(
        self._sum.assign_add(tf.reduce_sum(value, 0)),
        self._count.assign_add(tf.shape(value)[0]))
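The streaming mean itself is just the accumulated sum divided by the count. A minimal sketch of how such an accessor could look, assuming only the `_sum` and `_count` variables used above (the actual class exposes this through its own methods):

  def value(self):
    """Current estimate: accumulated sum divided by the number of submissions."""
    return self._sum / tf.cast(self._count, self._sum.dtype)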
  def _define_end_episode(self, agent_indices):
    """Implement the branch of end_episode() entered during training."""
    episodes, length = self._episodes.data(agent_indices)
    space_left = self._config.update_every - self._memory_index
    use_episodes = tf.range(
        tf.minimum(tf.shape(agent_indices)[0], space_left))
    episodes = [tf.gather(elem, use_episodes) for elem in episodes]
    append = self._memory.replace(
        episodes, tf.gather(length, use_episodes),
        use_episodes + self._memory_index)
    with tf.control_dependencies([append]):
      inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
    with tf.control_dependencies([inc_index]):
      memory_full = self._memory_index >= self._config.update_every
      return tf.cond(memory_full, self._training, str)
  def _define_step(self, done, score, summary):
    """Combine operations of a phase.

    Keeps track of the mean score and when to report it.

    Args:
      done: Tensor indicating whether current score can be used.
      score: Tensor holding the current, possibly intermediate, score.
      summary: Tensor holding summary string to write if not an empty string.

    Returns:
      Tuple of summary tensor, mean score, new global step, and number of steps
      made. The mean score is zero for non-reporting steps.
    """
    if done.shape.ndims == 0:
      done = done[None]
    if score.shape.ndims == 0:
      score = score[None]
    score_mean = streaming_mean.StreamingMean((), tf.float32)
    with tf.control_dependencies([done, score, summary]):
      done_score = tf.gather(score, tf.where(done)[:, 0])
      submit_score = tf.cond(
          tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
    with tf.control_dependencies([submit_score]):
      mean_score = tf.cond(self._report, score_mean.clear, float)
      steps_made = tf.shape(score)[0]
      next_step = self._step.assign_add(steps_made)
    with tf.control_dependencies([mean_score, next_step]):
      return tf.identity(summary), mean_score, next_step, steps_made
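`tf.where(done)[:, 0]` turns the boolean `done` vector into the indices of finished episodes, so only their scores enter the streaming mean. A tiny illustration with made-up values:

done = tf.constant([False, True, False, True])
score = tf.constant([1.0, 2.0, 3.0, 4.0])
finished = tf.gather(score, tf.where(done)[:, 0])  # evaluates to [2.0, 4.0]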
Example #7
    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = 1 if (
            self.ACTOR_INIT_OUTPUT_SCALE_KEY
            not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
        self.tar_val_tf = tf.placeholder(tf.float32,
                                         shape=[None],
                                         name="tar_val")
        self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
        self.g_tf = tf.placeholder(
            tf.float32,
            shape=([None, g_size] if self.has_goal() else None),
            name="g")
        self.old_logp_tf = tf.placeholder(tf.float32,
                                          shape=[None],
                                          name="old_logp")
        self.exp_mask_tf = tf.placeholder(tf.float32,
                                          shape=[None],
                                          name="exp_mask")

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.a_mean_tf = self._build_net_actor(
                    actor_net_name, actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if self.a_mean_tf is not None:
            Logger.print2('Built actor net: ' + actor_net_name)

        if self.critic_tf is not None:
            Logger.print2('Built critic net: ' + critic_net_name)

        self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
        norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(
            shape=tf.shape(self.a_mean_tf))
        norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
        self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
        self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(
            x_tf=norm_a_noise_tf, mean_tf=None, std_tf=self.norm_a_std_tf)

        return
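The sampling above draws exploration noise in the normalized action space, masks it per sample with exp_mask, and scales it back through the action normalizer's standard deviation before adding it to the mean action. A NumPy sketch under assumed shapes, illustrative only:

import numpy as np

a_mean = np.zeros((2, 3))            # normalized mean actions from the actor
norm_std = 0.2 * np.ones(3)          # exploration noise std (exp_params_curr.noise)
exp_mask = np.array([[1.0], [0.0]])  # second sample acts deterministically
a_norm_std = np.ones(3)              # std of the action normalizer (a_norm.std_tf)

norm_noise = norm_std * np.random.randn(2, 3) * exp_mask
sample_a = a_mean + norm_noise * a_norm_std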
def calc_logp_gaussian(x_tf, mean_tf, std_tf):
    dim = tf.to_float(tf.shape(x_tf)[-1])

    if mean_tf is None:
        diff_tf = x_tf
    else:
        diff_tf = x_tf - mean_tf

    logp_tf = -0.5 * tf.reduce_sum(tf.square(diff_tf / std_tf), axis=-1)
    logp_tf += -0.5 * dim * np.log(2 * np.pi) - tf.reduce_sum(tf.log(std_tf),
                                                              axis=-1)

    return logp_tf
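This is the log-density of a diagonal Gaussian: logp = -0.5 * sum(((x - mean) / std) ** 2) - 0.5 * dim * log(2 * pi) - sum(log(std)). A quick cross-check against SciPy, assuming a TF1 session is available (illustrative only):

import numpy as np
from scipy.stats import multivariate_normal

x = np.array([0.3, -1.2, 0.5], dtype=np.float32)
mean = np.zeros(3, dtype=np.float32)
std = np.array([0.5, 1.0, 2.0], dtype=np.float32)

reference = multivariate_normal.logpdf(x, mean=mean, cov=np.diag(std ** 2))
# with tf.Session() as sess:
#     assert np.allclose(sess.run(calc_logp_gaussian(x, mean, std)), reference, atol=1e-5)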
Example #9
  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      mean = tf.contrib.layers.fully_connected(
          x, self._action_size, tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable(
          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
      logstd = tf.tile(
          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state
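The returned mean and logstd are typically combined into the same diagonal Gaussian policy as in the recurrent example above, for instance:

policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))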
Example #10
def reinit_nested_vars(variables, indices=None):
  """Reset all variables in a nested tuple to zeros.

  Args:
    variables: Nested tuple or list of variables.
    indices: Indices along the first dimension to reset, defaults to all.

  Returns:
    Operation.
  """
  if isinstance(variables, (tuple, list)):
    return tf.group(*[reinit_nested_vars(variable, indices) for variable in variables])
  if indices is None:
    return variables.assign(tf.zeros_like(variables))
  else:
    zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
    return tf.scatter_update(variables, indices, zeros)
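A small usage sketch with made-up variables: zero out the recurrent state of the agents whose episodes just ended, leaving the other rows untouched.

state = (tf.Variable(tf.zeros([4, 8]), trainable=False),
         tf.Variable(tf.zeros([4, 8]), trainable=False))
ended = tf.constant([1, 3])  # batch indices whose episodes ended
reset_op = reinit_nested_vars(state, ended)
# sess.run(reset_op) zeros rows 1 and 3 of every variable in the tuple.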
Example #11
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vecrotized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary
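A hedged usage sketch of wiring the combined op into a run loop, assuming batch_env and algo objects that provide the operations used above:

done, score, summary = simulate(batch_env, algo, log=True, reset=False)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     while True:
#         done_value, score_value, summary_value = sess.run([done, score, summary])
#         # write summary_value to a tf.summary.FileWriter and decide when to stop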