def sample_action(self, state, stddev=1.0):
  """Returns the action for the state with additive noise.

  Args:
    state: A [num_state_dims] tensor representing a state.
    stddev: Standard deviation of the additive Gaussian noise.
  Returns:
    A [num_action_dims] action tensor.
  """
  agent_action = self.action(state)
  # Add zero-mean Gaussian noise, then clip the result to the action spec.
  agent_action += tf.random_normal(tf.shape(agent_action)) * stddev
  return utils.clip_to_spec(agent_action, self._action_spec)
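# A minimal standalone sketch of the same noise-and-clip pattern used in
# sample_action above, assuming TF1-style graph execution and explicit action
# bounds in place of the agent's action spec (names here are illustrative,
# not part of the agent class):
import tensorflow as tf

def noisy_clipped_action(deterministic_action, action_min, action_max,
                         stddev=1.0):
  """Adds zero-mean Gaussian noise to an action and clips it to its bounds."""
  noisy = deterministic_action + tf.random_normal(
      tf.shape(deterministic_action)) * stddev
  return tf.clip_by_value(noisy, action_min, action_max)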
def sample(self, states, next_states, num_samples, orig_goals, sc=0.5):
  """Samples candidate goals around the observed state change.

  With num_samples == 1, returns a single draw from a Gaussian centered at
  (next_states - states). Otherwise returns num_samples candidates per
  transition: (num_samples - 2) Gaussian draws plus the state change itself
  and the original goals, clipped to the goal spec.
  """
  goal_dim = orig_goals.shape[-1]
  spec_range = (
      (self._spec.maximum - self._spec.minimum) / 2 * tf.ones([goal_dim]))
  loc = tf.cast(next_states - states, tf.float32)[:, :goal_dim]
  scale = sc * tf.tile(tf.reshape(spec_range, [1, goal_dim]),
                       [tf.shape(states)[0], 1])
  dist = tf.distributions.Normal(loc, scale)
  if num_samples == 1:
    return dist.sample()
  samples = tf.concat([dist.sample(num_samples - 2),
                       tf.expand_dims(loc, 0),
                       tf.expand_dims(orig_goals, 0)], 0)
  return uvf_utils.clip_to_spec(samples, self._spec)
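# Usage sketch for sample() above (illustrative; assumes a 2-D goal spec and
# a batch of 4 transitions):
#
#   states, next_states = ...  # each [4, state_dim]
#   orig_goals = ...           # [4, 2]
#   candidates = sampler.sample(states, next_states, num_samples=10,
#                               orig_goals=orig_goals)
#   # candidates has shape [10, 4, 2]: 8 Gaussian draws around the
#   # per-transition state change, the state change itself, and the
#   # original goals, clipped to the goal spec.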
def noisy_action_fn(state, context=None):
  """Computes the base action and adds Gaussian exploration noise."""
  action = action_fn(state, context)
  if debug:
    action = uvf_utils.tf_print(
        action, [action],
        message='[add_noise_fn] pre-noise action',
        first_n=100)
  noise_dist = tf.distributions.Normal(tf.zeros_like(action),
                                       tf.ones_like(action) * stddev)
  noise = noise_dist.sample()
  action += noise
  if debug:
    action = uvf_utils.tf_print(
        action, [action],
        message='[add_noise_fn] post-noise action',
        first_n=100)
  if clip:
    action = uvf_utils.clip_to_spec(action, self._action_spec)
  return action
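# Standalone sketch of the wrapper pattern implied by the closure above,
# which captures action_fn, stddev, debug, and clip from an enclosing
# add_noise_fn. This version drops the agent-specific debug printing and spec
# clipping, taking clip bounds explicitly (names are illustrative):
import tensorflow as tf

def add_gaussian_noise(action_fn, stddev, clip_min=None, clip_max=None):
  """Wraps action_fn so each action gets zero-mean Gaussian noise added."""
  def noisy_action_fn(state, context=None):
    action = action_fn(state, context)
    action += tf.random_normal(tf.shape(action)) * stddev
    if clip_min is not None and clip_max is not None:
      action = tf.clip_by_value(action, clip_min, clip_max)
    return action
  return noisy_action_fn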
def get_eval_step(uvf_agent,
                  state_preprocess,
                  tf_env,
                  action_fn,
                  meta_action_fn,
                  environment_steps,
                  num_episodes,
                  mode='eval'):
  """Get one-step policy/env stepping ops.

  Args:
    uvf_agent: A UVF agent.
    state_preprocess: A module mapping states to state representations.
    tf_env: A TFEnvironment.
    action_fn: A function to produce actions given current state.
    meta_action_fn: A function to produce meta actions given current state.
    environment_steps: A variable to count the number of steps in the tf_env.
    num_episodes: A variable to count the number of episodes.
    mode: A string, one of 'train', 'explore', or 'eval'.
  Returns:
    A step_fn that, given a tf.Session, executes one action in the
    environment and returns the resulting transition.
  """
  tf_env.start_collect()
  state = tf_env.current_obs()
  action = action_fn(state, context=None)
  state_repr = state_preprocess(state)

  action_spec = tf_env.action_spec()
  action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)
  with tf.control_dependencies([state]):
    transition_type, reward, discount = tf_env.step(action_ph)

  def increment_step():
    return environment_steps.assign_add(1)

  def increment_episode():
    return num_episodes.assign_add(1)

  def no_op_int():
    return tf.constant(0, dtype=tf.int64)

  step_cond = uvf_agent.step_cond_fn(state, action, transition_type,
                                     environment_steps, num_episodes)
  reset_episode_cond = uvf_agent.reset_episode_cond_fn(
      state, action, transition_type, environment_steps, num_episodes)
  reset_env_cond = uvf_agent.reset_env_cond_fn(state, action, transition_type,
                                               environment_steps, num_episodes)

  increment_step_op = tf.cond(step_cond, increment_step, no_op_int)
  with tf.control_dependencies([increment_step_op]):
    increment_episode_op = tf.cond(reset_episode_cond, increment_episode,
                                   no_op_int)

  with tf.control_dependencies([reward, discount]):
    next_state = tf_env.current_obs()
    next_state_repr = state_preprocess(next_state)

  with tf.control_dependencies([increment_episode_op]):
    post_reward, post_meta_reward = uvf_agent.cond_begin_episode_op(
        tf.logical_not(reset_episode_cond),
        [state, action_ph, reward, next_state, state_repr, next_state_repr],
        mode=mode,
        meta_action_fn=meta_action_fn)

  # Important: do manual reset after getting the final reward from the
  # unreset environment.
  with tf.control_dependencies([post_reward, post_meta_reward]):
    cond_reset_op = tf.cond(reset_env_cond, tf_env.reset,
                            tf_env.current_time_step)

  # Add a dummy control dependency to force cond_reset_op to run.
  with tf.control_dependencies(cond_reset_op):
    post_reward, post_meta_reward = map(tf.identity,
                                        [post_reward, post_meta_reward])

  eval_step = [
      next_state, action_ph, transition_type, post_reward, post_meta_reward,
      discount, uvf_agent.context_vars, state_repr
  ]

  if callable(action):

    def step_fn(sess):
      action_value = action(sess)
      return sess.run(eval_step, feed_dict={action_ph: action_value})
  else:
    action = uvf_utils.clip_to_spec(action, action_spec)

    def step_fn(sess):
      action_value = sess.run(action)
      return sess.run(eval_step, feed_dict={action_ph: action_value})

  return step_fn
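# Usage sketch for the returned step_fn, assuming the graph above has been
# built and a standard TF1 session is available (run_eval_loop is
# illustrative, not part of this module):
def run_eval_loop(sess, step_fn, num_steps):
  """Drives the environment for num_steps via step_fn, collecting rewards."""
  rewards = []
  for _ in range(num_steps):
    # Each call feeds one action through action_ph and runs eval_step.
    (_, _, _, post_reward, _, _, _, _) = step_fn(sess)
    rewards.append(post_reward)
  return rewards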