Example No. 1
  def sample_action(self, state, stddev=1.0):
    """Returns the action for the state with additive noise.

    Args:
      state: A [num_state_dims] tensor representing a state.
      stddev: Standard deviation of the additive Gaussian noise.
    Returns:
      A [num_action_dims] action tensor.
    """
    agent_action = self.action(state)
    agent_action += tf.random_normal(tf.shape(agent_action)) * stddev
    return utils.clip_to_spec(agent_action, self._action_spec)
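
The method above computes the deterministic action, perturbs it with zero-mean Gaussian noise scaled by stddev, and clips the result to the action spec. Below is a minimal NumPy sketch of the same add-noise-and-clip pattern (illustrative only, not from the source; sample_action_np, spec_min, and spec_max are hypothetical names, and it assumes clip_to_spec clips element-wise to the spec bounds):

import numpy as np

def sample_action_np(action, stddev, spec_min, spec_max, rng=np.random):
  # Add zero-mean Gaussian noise, then clip element-wise to the action bounds
  # (the assumed behaviour of utils.clip_to_spec above).
  noisy = action + rng.normal(size=np.shape(action)) * stddev
  return np.clip(noisy, spec_min, spec_max)

# Example: a 2-D action bounded in [-1, 1].
print(sample_action_np(np.array([0.9, -0.2]), stddev=0.3,
                       spec_min=-1.0, spec_max=1.0))
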
Example No. 2
  def sample(self, states, next_states, num_samples, orig_goals, sc=0.5):
    """Samples candidate goals around the observed state change.

    Goals are drawn from a Normal centred at (next_states - states), restricted
    to the goal dimensions, with scale `sc` times half the goal-spec range.  For
    num_samples > 1, the candidate set also includes that state change itself
    and the original goals, and is clipped to the spec.
    """
    goal_dim = orig_goals.shape[-1]
    # Half the width of the goal spec, per goal dimension.
    spec_range = (self._spec.maximum - self._spec.minimum) / 2 * tf.ones([goal_dim])
    loc = tf.cast(next_states - states, tf.float32)[:, :goal_dim]
    scale = sc * tf.tile(tf.reshape(spec_range, [1, goal_dim]),
                         [tf.shape(states)[0], 1])
    dist = tf.distributions.Normal(loc, scale)
    if num_samples == 1:
      return dist.sample()
    samples = tf.concat([dist.sample(num_samples - 2),
                         tf.expand_dims(loc, 0),
                         tf.expand_dims(orig_goals, 0)], 0)
    return uvf_utils.clip_to_spec(samples, self._spec)
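
For reference, here is a self-contained NumPy sketch of the num_samples > 1 branch of the candidate-goal construction above (illustrative, not from the source; sample_candidate_goals_np, goal_min, and goal_max are hypothetical names, and scalar goal bounds are assumed):

import numpy as np

def sample_candidate_goals_np(states, next_states, orig_goals, num_samples,
                              goal_min, goal_max, sc=0.5, rng=np.random):
  goal_dim = orig_goals.shape[-1]
  # Centre the distribution on the observed state change in the goal dimensions.
  loc = (next_states - states)[:, :goal_dim]
  scale = sc * (goal_max - goal_min) / 2.0
  random_goals = loc + rng.normal(size=(num_samples - 2,) + loc.shape) * scale
  # Candidate set: Gaussian samples, the state change itself, and the original
  # goals, clipped to the goal bounds.
  candidates = np.concatenate([random_goals, loc[None], orig_goals[None]], 0)
  return np.clip(candidates, goal_min, goal_max)

# Example: batch of 4 states with 3 goal dimensions, 10 candidates each.
states = np.zeros((4, 5))
next_states = np.random.randn(4, 5)
orig_goals = np.random.randn(4, 3)
print(sample_candidate_goals_np(states, next_states, orig_goals, 10,
                                goal_min=-1.0, goal_max=1.0).shape)
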
Example No. 3
  def noisy_action_fn(state, context=None):
    """Computes the action for `state` and perturbs it with Gaussian noise."""
    # `action_fn`, `debug`, `stddev` and `clip` are captured from the enclosing scope.
    action = action_fn(state, context)
    if debug:
      action = uvf_utils.tf_print(
          action, [action],
          message='[add_noise_fn] pre-noise action',
          first_n=100)
    noise_dist = tf.distributions.Normal(tf.zeros_like(action),
                                         tf.ones_like(action) * stddev)
    noise = noise_dist.sample()
    action += noise
    if debug:
      action = uvf_utils.tf_print(
          action, [action],
          message='[add_noise_fn] post-noise action',
          first_n=100)
    if clip:
      action = uvf_utils.clip_to_spec(action, self._action_spec)
    return action
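
The message strings suggest this closure is the inner function of a noise-wrapping factory (likely named add_noise_fn) that returns a noisy version of an existing policy. Below is a self-contained NumPy sketch of that wrapper pattern (illustrative only; add_gaussian_noise and its parameters are hypothetical names):

import numpy as np

def add_gaussian_noise(action_fn, stddev, low, high, rng=np.random):
  # Returns a new policy that perturbs action_fn's output with Gaussian noise
  # and clips it to [low, high].
  def noisy_action_fn(state, context=None):
    action = action_fn(state, context)
    action = action + rng.normal(size=np.shape(action)) * stddev
    return np.clip(action, low, high)
  return noisy_action_fn

# Example: wrap a trivial policy that always outputs zeros.
policy = add_gaussian_noise(lambda s, c=None: np.zeros(2),
                            stddev=0.1, low=-1.0, high=1.0)
print(policy(np.zeros(4)))
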
Example No. 4
def get_eval_step(uvf_agent,
                  state_preprocess,
                  tf_env,
                  action_fn,
                  meta_action_fn,
                  environment_steps,
                  num_episodes,
                  mode='eval'):
  """Get one-step policy/env stepping ops.

  Args:
    uvf_agent: A UVF agent.
    state_preprocess: A callable mapping observations to state representations.
    tf_env: A TFEnvironment.
    action_fn: A function to produce actions given current state.
    meta_action_fn: A function to produce meta actions given current state.
    environment_steps: A variable to count the number of steps in the tf_env.
    num_episodes: A variable to count the number of episodes.
    mode: A string representing the mode, one of [train, explore, eval].

  Returns:
    A step_fn that, given a tf.Session, executes one action in the environment
    and returns the evaluation fetches for that step.
  """

  tf_env.start_collect()
  state = tf_env.current_obs()
  action = action_fn(state, context=None)
  state_repr = state_preprocess(state)

  action_spec = tf_env.action_spec()
  action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)
  with tf.control_dependencies([state]):
    transition_type, reward, discount = tf_env.step(action_ph)

  def increment_step():
    return environment_steps.assign_add(1)

  def increment_episode():
    return num_episodes.assign_add(1)

  def no_op_int():
    return tf.constant(0, dtype=tf.int64)

  step_cond = uvf_agent.step_cond_fn(state, action,
                                     transition_type,
                                     environment_steps, num_episodes)
  reset_episode_cond = uvf_agent.reset_episode_cond_fn(
      state, action,
      transition_type, environment_steps, num_episodes)
  reset_env_cond = uvf_agent.reset_env_cond_fn(state, action,
                                               transition_type,
                                               environment_steps, num_episodes)

  increment_step_op = tf.cond(step_cond, increment_step, no_op_int)
  with tf.control_dependencies([increment_step_op]):
    increment_episode_op = tf.cond(reset_episode_cond, increment_episode,
                                   no_op_int)

  with tf.control_dependencies([reward, discount]):
    next_state = tf_env.current_obs()
    next_state_repr = state_preprocess(next_state)

  with tf.control_dependencies([increment_episode_op]):
    post_reward, post_meta_reward = uvf_agent.cond_begin_episode_op(
        tf.logical_not(reset_episode_cond),
        [state, action_ph, reward, next_state,
         state_repr, next_state_repr],
        mode=mode, meta_action_fn=meta_action_fn)

  # Important: do manual reset after getting the final reward from the
  # unreset environment.
  with tf.control_dependencies([post_reward, post_meta_reward]):
    cond_reset_op = tf.cond(reset_env_cond,
                            tf_env.reset,
                            tf_env.current_time_step)

  # Add a dummy control dependency to force the reset_op to run
  with tf.control_dependencies(cond_reset_op):
    post_reward, post_meta_reward = map(tf.identity,
                                        [post_reward, post_meta_reward])

  eval_step = [
      next_state, action_ph, transition_type, post_reward, post_meta_reward,
      discount, uvf_agent.context_vars, state_repr
  ]

  if callable(action):
    def step_fn(sess):
      action_value = action(sess)
      return sess.run(eval_step, feed_dict={action_ph: action_value})
  else:
    action = uvf_utils.clip_to_spec(action, action_spec)
    def step_fn(sess):
      action_value = sess.run(action)
      return sess.run(eval_step, feed_dict={action_ph: action_value})

  return step_fn
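
The returned step_fn runs in two phases: first it evaluates (or calls) the action, then it feeds the concrete value back through action_ph so the environment-step ops consume exactly that action. A minimal, self-contained sketch of this placeholder round-trip (illustrative, not from the source; uses the TF 1.x API as in the examples above, with stand-in tensors in place of the agent and environment):

import tensorflow as tf

action = tf.random_normal([3])                     # stands in for action_fn(state)
action_ph = tf.placeholder(tf.float32, shape=[3])  # fed with the computed action
step_result = action_ph * 2.0                      # stands in for tf_env.step(action_ph)

def step_fn(sess):
  # Phase 1: compute the action; phase 2: feed it to the stepping ops.
  action_value = sess.run(action)
  return sess.run(step_result, feed_dict={action_ph: action_value})

with tf.Session() as sess:
  print(step_fn(sess))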