def __init__(self,
             actions: types.NestedTensor,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             policy_info: types.NestedTensorSpec = (),
             info_spec: types.NestedTensorSpec = (),
             name: Optional[Text] = None):
  """A policy which always returns a fixed action.

  Args:
    actions: A Tensor, or a nested dict, list or tuple of Tensors
      corresponding to `action_spec()`.
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    policy_info: A policy info to be returned in PolicyStep.
    info_spec: A policy info spec.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.
  """
  super(FixedPolicy, self).__init__(
      time_step_spec,
      action_spec,
      clip=False,
      info_spec=info_spec,
      name=name,
      emit_log_probability=True)
  nest_utils.assert_same_structure(self._action_spec, actions)

  def convert(action, spec):
    return tf.convert_to_tensor(value=action, dtype=spec.dtype)

  self._action_value = tf.nest.map_structure(convert, actions,
                                             self._action_spec)
  log_probability = tf.nest.map_structure(
      lambda t: tf.constant(0.0, tf.float32), self._action_spec)
  self._policy_info = policy_step.set_log_probability(
      policy_info, log_probability)  # pytype: disable=wrong-arg-types
def distribution(self, time_step, policy_state=()):
  """Generates the distribution over next actions given the time_step.

  Args:
    time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
    policy_state: A Tensor, or a nested dict, list or tuple of Tensors
      representing the previous policy_state.

  Returns:
    A `PolicyStep` named tuple containing:
      `action`: A tf.distribution capturing the distribution of next actions.
      `state`: A policy state tensor for the next call to distribution.
      `info`: Optional side information such as action log probabilities.
  """
  tf.nest.assert_same_structure(time_step, self._time_step_spec)
  tf.nest.assert_same_structure(policy_state, self._policy_state_spec)
  if self._automatic_state_reset:
    policy_state = self._maybe_reset_state(time_step, policy_state)
  step = self._distribution(time_step=time_step, policy_state=policy_state)
  if self.emit_log_probability:
    # This is set only for compatibility with the info_spec in the
    # constructor.
    info = policy_step.set_log_probability(
        step.info,
        tf.nest.map_structure(
            lambda _: tf.constant(0., dtype=tf.float32),
            policy_step.get_log_probability(self._info_spec)))
    step = step._replace(info=info)
  tf.nest.assert_same_structure(step, self._policy_step_spec)
  return step
def _action(self, time_step, policy_state, seed):
  """Implementation of `action`.

  Args:
    time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
    policy_state: A Tensor, or a nested dict, list or tuple of Tensors
      representing the previous policy_state.
    seed: Seed to use if action performs sampling (optional).

  Returns:
    A `PolicyStep` named tuple containing:
      `action`: An action Tensor matching the `action_spec`.
      `state`: A policy state tensor to be fed into the next call to action.
      `info`: Optional side information such as action log probabilities.
  """
  seed_stream = tfp.util.SeedStream(seed=seed, salt='tf_agents_tf_policy')
  distribution_step = self._distribution(time_step, policy_state)
  actions = tf.nest.map_structure(
      lambda d: reparameterized_sampling.sample(d, seed=seed_stream()),
      distribution_step.action)
  info = distribution_step.info
  if self.emit_log_probability:
    try:
      log_probability = tf.nest.map_structure(lambda a, d: d.log_prob(a),
                                              actions,
                                              distribution_step.action)
      info = policy_step.set_log_probability(info, log_probability)
    except Exception as e:
      raise TypeError('%s does not support emitting log-probabilities.' %
                      type(self).__name__) from e
  return distribution_step._replace(action=actions, info=info)
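# --- Hedged example (not from the source) ---
# A minimal standalone sketch of the sample-then-log_prob pattern used in
# `_action` above, with a plain TensorFlow Probability distribution standing
# in for a policy's distribution step. All names and values are illustrative.
import tensorflow as tf
import tensorflow_probability as tfp

dist = tfp.distributions.Categorical(logits=tf.zeros([2, 4]))  # batch of 2, 4 actions
seed_stream = tfp.util.SeedStream(seed=0, salt='example')
actions = dist.sample(seed=seed_stream())   # shape (2,)
log_probability = dist.log_prob(actions)    # shape (2,), each == log(1/4)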
def _action(self, time_step, policy_state, seed):
  observation_and_action_constraint_splitter = (
      self.observation_and_action_constraint_splitter)
  outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  if observation_and_action_constraint_splitter is not None:
    observation, mask = observation_and_action_constraint_splitter(
        time_step.observation)
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                      self.action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if self.action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)
  else:
    observation = time_step.observation
    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)

  if self._accepts_per_arm_features:

    def _gather_fn(t):
      return tf.gather(params=t, indices=action_, batch_dims=1)

    chosen_arm_features = tf.nest.map_structure(_gather_fn,
                                                observation['per_arm'])
    policy_info = policy_info._replace(chosen_arm_features=chosen_arm_features)

  # TODO(b/78181147): Investigate why this control dependency is required.
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)

  if self.emit_log_probability:
    if observation_and_action_constraint_splitter is not None:
      log_probability = masked_categorical.log_prob(
          action_ - self.action_spec.minimum)
    else:
      action_probability = tf.nest.map_structure(_uniform_probability,
                                                 self._action_spec)
      log_probability = tf.nest.map_structure(tf.math.log, action_probability)
    policy_info = policy_step.set_log_probability(policy_info, log_probability)

  step = policy_step.PolicyStep(action_, policy_state, policy_info)
  return step
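# --- Hedged sketch (not from the source) ---
# `_uniform_probability`, which the snippet above maps over the action specs,
# is defined elsewhere in its module and not shown here. A plausible
# implementation for a bounded integer spec is the reciprocal of the number
# of actions in [minimum, maximum].
import tensorflow as tf

def _uniform_probability(action_spec):
  # Probability of drawing any single action uniformly from the spec's range.
  num_actions = tf.cast(action_spec.maximum - action_spec.minimum + 1,
                        tf.float32)
  return 1.0 / num_actions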
def __init__(self,
             time_step_spec,
             action_spec,
             policy_state_spec=(),
             info_spec=(),
             clip=True,
             emit_log_probability=False,
             automatic_state_reset=True,
             name=None):
  """Initialization of Base class.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
      provided by the user to the subclass.
    action_spec: A nest of BoundedTensorSpec representing the actions.
      Usually provided by the user to the subclass.
    policy_state_spec: A nest of TensorSpec representing the policy_state.
      Provided by the subclass, not directly by the user.
    info_spec: A nest of TensorSpec representing the policy info. Provided by
      the subclass, not directly by the user.
    clip: Whether to clip actions to spec before returning them. Default
      True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
      continuous actions for training.
    emit_log_probability: Emit log-probabilities of actions, if supported. If
      True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
      Please consult utility methods provided in policy_step for setting and
      retrieving these. When working with custom policies, either provide a
      dictionary info_spec or a namedtuple with the field 'log_probability'.
    automatic_state_reset: If `True`, then `get_initial_policy_state` is used
      to clear state in `action()` and `distribution()` for time steps where
      `time_step.is_first()`.
    name: A name for this module. Defaults to the class name.
  """
  super(Base, self).__init__(name=name)
  common.assert_members_are_not_overridden(base_cls=Base, instance=self)
  self._time_step_spec = time_step_spec
  self._action_spec = action_spec
  self._policy_state_spec = policy_state_spec
  self._emit_log_probability = emit_log_probability
  if emit_log_probability:
    log_probability_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.float32,
        maximum=0,
        minimum=-float('inf'),
        name='log_probability')
    log_probability_spec = tf.nest.map_structure(
        lambda _: log_probability_spec, action_spec)
    info_spec = policy_step.set_log_probability(info_spec,
                                                log_probability_spec)
  self._info_spec = info_spec
  self._setup_specs()
  self._clip = clip
  self._action_fn = common.function_in_tf1()(self._action)
  self._automatic_state_reset = automatic_state_reset
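# --- Hedged example (not from the source) ---
# The policy_step helpers referenced in the docstring above, as used
# throughout this section: set_log_probability stores a log-probability in
# the info field (creating an info namedtuple when info is empty), and
# get_log_probability reads it back.
import tensorflow as tf
from tf_agents.trajectories import policy_step

info = policy_step.set_log_probability((), tf.constant(-0.69))
log_prob = policy_step.get_log_probability(info)  # tf.Tensor(-0.69, ...)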
def _get_policy_info_and_action(self, time_step):
  outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  log_probability = tf.nest.map_structure(
      lambda _: tf.zeros(outer_shape, tf.float32), self._action_spec)
  policy_info = policy_step.set_log_probability(
      self._policy_info, log_probability=log_probability)
  action = tf.nest.map_structure(lambda t: common.replicate(t, outer_shape),
                                 self._action_value)
  return policy_info, action
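# --- Hedged example (not from the source) ---
# What replicating the fixed action across the batch amounts to:
# `common.replicate(t, outer_shape)` above is assumed to tile `t` so it
# gains the leading batch dimensions in `outer_shape`. A plain-TF equivalent:
import tensorflow as tf

action_value = tf.constant(2)   # the fixed action
outer_shape = tf.constant([3])  # batch of 3
batched = tf.broadcast_to(
    action_value, tf.concat([outer_shape, tf.shape(action_value)], axis=0))
# batched == [2, 2, 2]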
def distribution(
    self, time_step: ts.TimeStep, policy_state: types.NestedTensor = ()
) -> policy_step.PolicyStep:
  """Generates the distribution over next actions given the time_step.

  Args:
    time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
    policy_state: A Tensor, or a nested dict, list or tuple of Tensors
      representing the previous policy_state.

  Returns:
    A `PolicyStep` named tuple containing:
      `action`: A tf.distribution capturing the distribution of next actions.
      `state`: A policy state tensor for the next call to distribution.
      `info`: Optional side information such as action log probabilities.

  Raises:
    ValueError or TypeError: If `validate_args is True` and inputs or outputs
      do not match `time_step_spec`, `policy_state_spec`, or
      `policy_step_spec`.
  """
  if self._validate_args:
    time_step = nest_utils.prune_extra_keys(self._time_step_spec, time_step)
    policy_state = nest_utils.prune_extra_keys(self._policy_state_spec,
                                               policy_state)
    nest_utils.assert_same_structure(
        time_step,
        self._time_step_spec,
        message='time_step and time_step_spec structures do not match')
    nest_utils.assert_same_structure(
        policy_state,
        self._policy_state_spec,
        message='policy_state and policy_state_spec structures do not match')
  if self._automatic_state_reset:
    policy_state = self._maybe_reset_state(time_step, policy_state)
  step = self._distribution(time_step=time_step, policy_state=policy_state)
  if self.emit_log_probability:
    # This is set only for compatibility with the info_spec in the
    # constructor.
    info = policy_step.set_log_probability(
        step.info,
        tf.nest.map_structure(
            lambda _: tf.constant(0., dtype=tf.float32),
            policy_step.get_log_probability(self._info_spec)))
    step = step._replace(info=info)
  if self._validate_args:
    nest_utils.assert_same_structure(
        step,
        self._policy_step_spec,
        message=('distribution output and policy_step_spec structures '
                 'do not match'))
  return step
def _distribution2action(self, distribution_step, seed_stream):
  """Converts a distribution step to an action step.

  Args:
    distribution_step: A `PolicyStep` whose `action` is a nest of
      distributions.
    seed_stream: A `tfp.util.SeedStream` used to seed sampling.

  Returns:
    action_step: A `PolicyStep` with sampled (and possibly clipped) actions.
  """
  actions = tf.nest.map_structure(lambda d: d.sample(seed=seed_stream()),
                                  distribution_step.action)
  info = distribution_step.info
  if self.emit_log_probability:
    try:
      log_probability = tf.nest.map_structure(lambda a, d: d.log_prob(a),
                                              actions,
                                              distribution_step.action)
      info = policy_step.set_log_probability(info, log_probability)
    except Exception as e:
      raise TypeError('%s does not support emitting log-probabilities.' %
                      type(self).__name__) from e
  step = distribution_step._replace(action=actions, info=info)

  def clip_action(action, action_spec):
    if isinstance(action_spec, tensor_spec.BoundedTensorSpec):
      return common.clip_to_spec(action, action_spec)
    return action

  if self._clip:
    clipped_actions = tf.nest.map_structure(clip_action, step.action,
                                            self._action_spec)
    step = step._replace(action=clipped_actions)
  tf.nest.assert_same_structure(step, self._policy_step_spec)

  def compare_to_spec(value, spec):
    return value.dtype.is_compatible_with(spec.dtype)

  compatibility = tf.nest.flatten(
      tf.nest.map_structure(compare_to_spec, step.action, self.action_spec))
  if not all(compatibility):
    get_dtype = lambda x: x.dtype
    action_dtypes = tf.nest.map_structure(get_dtype, step.action)
    spec_dtypes = tf.nest.map_structure(get_dtype, self.action_spec)
    raise TypeError(
        "Policy produced an action with a dtype that doesn't match its "
        'action_spec. Got action: %s with action_spec: %s' %
        (action_dtypes, spec_dtypes))
  return step
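# --- Hedged example (not from the source) ---
# What `common.clip_to_spec` above is assumed to do for a bounded spec:
# clamp each action element to [spec.minimum, spec.maximum], here shown with
# plain tf.clip_by_value.
import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = tensor_spec.BoundedTensorSpec((), tf.float32, minimum=-1.0, maximum=1.0)
action = tf.constant([-2.0, 0.3, 5.0])
clipped = tf.clip_by_value(action, spec.minimum, spec.maximum)
# clipped == [-1.0, 0.3, 1.0]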
def __init__(self,
             actions: types.NestedTensor,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             emit_log_probability: bool = True,
             policy_info: types.NestedTensorSpec = (),
             info_spec: types.NestedTensorSpec = (),
             name: Optional[Text] = None):
  """A policy which always returns a fixed action.

  Args:
    actions: A Tensor, or a nested dict, list or tuple of Tensors
      corresponding to `action_spec()`.
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    emit_log_probability: Emit log-probabilities of actions, if supported. If
      True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
      Please consult utility methods provided in policy_step for setting and
      retrieving these. When working with custom policies, either provide a
      dictionary info_spec or a namedtuple with the field 'log_probability'.
    policy_info: A policy info to be returned in PolicyStep.
    info_spec: A policy info spec.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.
  """
  super(FixedPolicy, self).__init__(
      time_step_spec,
      action_spec,
      clip=False,
      info_spec=info_spec,
      name=name,
      emit_log_probability=emit_log_probability)
  nest_utils.assert_same_structure(self._action_spec, actions)

  def convert(action, spec):
    return tf.convert_to_tensor(value=action, dtype=spec.dtype)

  self._action_value = tf.nest.map_structure(convert, actions,
                                             self._action_spec)
  if self._emit_log_probability:
    log_probability = tf.nest.map_structure(
        lambda t: tf.constant(0.0, tf.float32), self._action_spec)
    self._policy_info = policy_step.set_log_probability(
        policy_info, log_probability)  # pytype: disable=wrong-arg-types
  else:
    self._policy_info = policy_info
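# --- Hedged usage example (not from the source) ---
# Constructing the FixedPolicy defined above for a trivial discrete action
# spec. Assumes the full FixedPolicy class is in scope (its `_action` /
# `_distribution` overrides are not shown in this section).
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec((2,), tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)

policy = FixedPolicy(
    actions=tf.constant(2),
    time_step_spec=time_step_spec,
    action_spec=action_spec)
time_step = ts.restart(tf.zeros((1, 2)), batch_size=1)
action_step = policy.action(time_step)  # action_step.action is always 2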
def _action(self, time_step, policy_state, seed):
  mask = time_step.observation['mask']
  if mask is not None:
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                      self.action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if self.action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
  else:
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)
  policy_info = tensor_spec.sample_spec_nest(self._info_spec)
  if self.emit_log_probability:
    if mask is not None:
      log_probability = masked_categorical.log_prob(
          action_ - self.action_spec.minimum)
    else:
      # Each action is drawn uniformly at random, so its probability is
      # 1 / (number of actions in the spec's range).
      def _uniform_probability(spec):
        num_actions = tf.cast(spec.maximum - spec.minimum + 1, tf.float32)
        return 1.0 / num_actions

      action_probability = tf.nest.map_structure(_uniform_probability,
                                                 self._action_spec)
      log_probability = tf.nest.map_structure(tf.math.log, action_probability)
    policy_info = policy_step.set_log_probability(policy_info, log_probability)
  step = policy_step.PolicyStep(action_, policy_state, policy_info)
  return step
def __init__(self,
             time_step_spec,
             action_spec,
             policy_state_spec=(),
             info_spec=(),
             clip=True,
             emit_log_probability=False,
             automatic_state_reset=True,
             observation_and_action_constraint_splitter=None,
             validate_args=True,
             name=None):
  """Initialization of TFPolicy class.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
      provided by the user to the subclass.
    action_spec: A nest of BoundedTensorSpec representing the actions.
      Usually provided by the user to the subclass.
    policy_state_spec: A nest of TensorSpec representing the policy_state.
      Provided by the subclass, not directly by the user.
    info_spec: A nest of TensorSpec representing the policy info. Provided by
      the subclass, not directly by the user.
    clip: Whether to clip actions to spec before returning them. Default
      True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
      continuous actions for training.
    emit_log_probability: Emit log-probabilities of actions, if supported. If
      True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
      Please consult utility methods provided in policy_step for setting and
      retrieving these. When working with custom policies, either provide a
      dictionary info_spec or a namedtuple with the field 'log_probability'.
    automatic_state_reset: If `True`, then `get_initial_policy_state` is used
      to clear state in `action()` and `distribution()` for time steps where
      `time_step.is_first()`.
    observation_and_action_constraint_splitter: A function used to process
      observations with action constraints. These constraints can indicate,
      for example, a mask of valid/invalid actions for a given state of the
      environment. The function takes in a full observation and returns a
      tuple consisting of 1) the part of the observation intended as input to
      the network and 2) the constraint. An example
      `observation_and_action_constraint_splitter` could be as simple as:
      ```
      def observation_and_action_constraint_splitter(observation):
        return observation['network_input'], observation['constraint']
      ```
      *Note*: when using `observation_and_action_constraint_splitter`, make
      sure the provided `q_network` is compatible with the network-specific
      half of the output of the `observation_and_action_constraint_splitter`.
      In particular, `observation_and_action_constraint_splitter` will be
      called on the observation before passing to the network. If
      `observation_and_action_constraint_splitter` is None, action
      constraints are not applied.
    validate_args: Python bool. Whether to verify inputs to, and outputs of,
      functions like `action` and `distribution` against spec structures,
      dtypes, and shapes. Research code may prefer to set this value to
      `False` to allow iterating on input and output structures without being
      hamstrung by overly rigid checking (at the cost of harder-to-debug
      errors). See also `TFAgent.validate_args`.
    name: A name for this module. Defaults to the class name.
  """
  super(TFPolicy, self).__init__(name=name)
  common.check_tf1_allowed()
  common.tf_agents_gauge.get_cell('TFAPolicy').set(True)
  common.assert_members_are_not_overridden(base_cls=TFPolicy, instance=self)
  if not isinstance(time_step_spec, ts.TimeStep):
    raise ValueError(
        'The `time_step_spec` must be an instance of `TimeStep`, but is '
        '`{}`.'.format(type(time_step_spec)))
  self._time_step_spec = time_step_spec
  self._action_spec = action_spec
  self._policy_state_spec = policy_state_spec
  self._emit_log_probability = emit_log_probability
  self._validate_args = validate_args
  if emit_log_probability:
    log_probability_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.float32,
        maximum=0,
        minimum=-float('inf'),
        name='log_probability')
    log_probability_spec = tf.nest.map_structure(
        lambda _: log_probability_spec, action_spec)
    info_spec = policy_step.set_log_probability(info_spec,
                                                log_probability_spec)
  self._info_spec = info_spec
  self._setup_specs()
  self._clip = clip
  self._action_fn = common.function_in_tf1()(self._action)
  self._automatic_state_reset = automatic_state_reset
  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
def _action(self, time_step, policy_state, seed):
  observation_and_action_constraint_splitter = (
      self.observation_and_action_constraint_splitter)
  outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  if observation_and_action_constraint_splitter is not None:
    observation, mask = observation_and_action_constraint_splitter(
        time_step.observation)
    action_spec = tensor_spec.from_spec(self.action_spec)
    action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    action_ = tf.cast(masked_categorical.sample() + action_spec.minimum,
                      action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)
  else:
    observation = time_step.observation
    action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)
    if self._accepts_per_arm_features:
      max_num_arms = action_spec.maximum - action_spec.minimum + 1
      batch_size = tf.shape(time_step.step_type)[0]
      num_actions = observation.get(
          bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY,
          tf.ones(shape=(batch_size,), dtype=tf.int32) * max_num_arms)
      mask = tf.sequence_mask(num_actions, max_num_arms)
      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(zero_logits, mask)
      action_ = tf.nest.map_structure(
          lambda t: tf.cast(masked_categorical.sample() + t.minimum, t.dtype),
          action_spec)
    else:
      action_ = tensor_spec.sample_spec_nest(
          self._action_spec, seed=seed, outer_dims=outer_dims)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)

  # Update policy info with chosen arm features.
  if self._accepts_per_arm_features:

    def _gather_fn(t):
      return tf.gather(params=t, indices=action_, batch_dims=1)

    chosen_arm_features = tf.nest.map_structure(
        _gather_fn, observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])
    if policy_utilities.has_chosen_arm_features(self._info_spec):
      policy_info = policy_info._replace(
          chosen_arm_features=chosen_arm_features)

  # TODO(b/78181147): Investigate why this control dependency is required.
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)

  if self.emit_log_probability:
    if (self._accepts_per_arm_features
        or observation_and_action_constraint_splitter is not None):
      action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)
      log_probability = masked_categorical.log_prob(action_ -
                                                    action_spec.minimum)
    else:
      log_probability = tf.nest.map_structure(
          lambda s: _calculate_log_probability(outer_dims, s),
          self._action_spec)
    policy_info = policy_step.set_log_probability(policy_info, log_probability)
  step = policy_step.PolicyStep(action_, policy_state, policy_info)
  return step
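# --- Hedged example (not from the source) ---
# The num_actions -> mask construction used above, sketched with plain TF
# ops instead of tf_agents' masked.MaskedCategorical. Values are
# illustrative.
import tensorflow as tf

num_actions = tf.constant([2, 4])  # valid arms per batch element
max_num_arms = 4
mask = tf.sequence_mask(num_actions, max_num_arms)
# [[True, True, False, False],
#  [True, True, True,  True ]]
logits = tf.where(mask, 0.0, -1e9)  # invalid arms get ~zero probability
samples = tf.random.categorical(logits, num_samples=1)  # only valid arms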
def _get_step(self) -> EnvStep:
  if self._start_on_next_step:
    self._start_new_episode()

  if StepType.is_last(self._step_type):
    # This is the last (terminating) observation of the environment.
    self._start_on_next_step = True
    self._num_total_steps += 1
    self._num_episodes += 1
    # The policy is not run on the terminal step, so we just carry over the
    # reward, action, and policy_info from the previous step.
    return EnvStep(self._step_type,
                   tf.cast(self._cur_step_num, dtype=tf.int64),
                   self._time_step.observation, self._action,
                   self._time_step.reward, self._time_step.discount,
                   self._policy_info, {}, {})

  # Sample action from policy.
  self._action, self._policy_state, self._policy_info = self._policy.action(
      self._time_step, self._policy_state)
  # Cast log-probs to tf.float32; works around a dtype quirk in TF-Agents.
  if hasattr(self._policy_info, 'log_probability'):
    self._policy_info = policy_step.set_log_probability(
        self._policy_info,
        tf.cast(self._policy_info.log_probability, tf.float32))

  # Batch the action if the environment is batched.
  env_action = self._action
  if self._env.batch_size is not None:
    env_action = nest_utils.batch_nested_tensors(env_action)

  # Sample next step from environment.
  self._next_time_step = self._env.step(env_action)
  if self._env.batch_size is not None:
    self._next_time_step = nest_utils.unbatch_nested_tensors(
        self._next_time_step)
  self._next_step_type = self._next_time_step.step_type

  self._cur_step_num += 1
  if (self._episode_step_limit and
      self._cur_step_num >= self._episode_step_limit):
    self._next_step_type = tf.convert_to_tensor(  # Overwrite step type.
        value=StepType.LAST, dtype=self._first_step_type.dtype)
    self._next_step_type = tf.reshape(self._next_step_type,
                                      tf.shape(self._first_step_type))

  step = EnvStep(
      self._step_type,
      tf.cast(self._cur_step_num - 1, tf.int64),
      self._time_step.observation,
      self._action,
      # Immediate reward given by next time step.
      self._next_time_step.reward,
      self._time_step.discount,
      self._policy_info,
      {},
      {})

  self._num_steps += 1
  self._num_total_steps += 1
  if StepType.is_first(self._step_type):
    self._num_total_episodes += 1

  self._time_step = self._next_time_step
  self._step_type = self._next_step_type
  return step
def main(_):
  # setting up
  start_time = time.time()
  tf.compat.v1.enable_resource_variables()
  tf.compat.v1.disable_eager_execution()
  logging.set_verbosity(logging.INFO)
  global observation_omit_size, goal_coord, sample_count, iter_count, episode_size_buffer, episode_return_buffer

  root_dir = os.path.abspath(os.path.expanduser(FLAGS.logdir))
  if not tf.io.gfile.exists(root_dir):
    tf.io.gfile.makedirs(root_dir)
  log_dir = os.path.join(root_dir, FLAGS.environment)
  if not tf.io.gfile.exists(log_dir):
    tf.io.gfile.makedirs(log_dir)
  save_dir = os.path.join(log_dir, "models")
  if not tf.io.gfile.exists(save_dir):
    tf.io.gfile.makedirs(save_dir)

  print("directory for recording experiment data:", log_dir)

  # in case training is paused and resumed, so state can be restored
  try:
    sample_count = np.load(os.path.join(log_dir, "sample_count.npy")).tolist()
    iter_count = np.load(os.path.join(log_dir, "iter_count.npy")).tolist()
    episode_size_buffer = np.load(
        os.path.join(log_dir, "episode_size_buffer.npy")).tolist()
    episode_return_buffer = np.load(
        os.path.join(log_dir, "episode_return_buffer.npy")).tolist()
  except Exception:
    sample_count = 0
    iter_count = 0
    episode_size_buffer = []
    episode_return_buffer = []

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      os.path.join(log_dir, "train", "in_graph_data"),
      flush_millis=10 * 1000)
  train_summary_writer.set_as_default()
  global_step = tf.compat.v1.train.get_or_create_global_step()

  with tf.compat.v2.summary.record_if(True):
    # environment related stuff
    env = do.get_environment(env_name=FLAGS.environment)
    py_env = wrap_env(
        skill_wrapper.SkillWrapper(
            env,
            num_latent_skills=FLAGS.num_skills,
            skill_type=FLAGS.skill_type,
            preset_skill=None,
            min_steps_before_resample=FLAGS.min_steps_before_resample,
            resample_prob=FLAGS.resample_prob,
        ),
        max_episode_steps=FLAGS.max_env_steps,
    )

    # all specifications required for all networks and agents
    py_action_spec = py_env.action_spec()
    tf_action_spec = tensor_spec.from_spec(
        py_action_spec)  # policy, critic action spec
    env_obs_spec = py_env.observation_spec()
    py_env_time_step_spec = ts.time_step_spec(
        env_obs_spec)  # replay buffer time_step spec
    if observation_omit_size > 0:
      agent_obs_spec = array_spec.BoundedArraySpec(
          (env_obs_spec.shape[0] - observation_omit_size,),
          env_obs_spec.dtype,
          minimum=env_obs_spec.minimum,
          maximum=env_obs_spec.maximum,
          name=env_obs_spec.name,
      )  # policy, critic observation spec
    else:
      agent_obs_spec = env_obs_spec
    py_agent_time_step_spec = ts.time_step_spec(
        agent_obs_spec)  # policy, critic time_step spec
    tf_agent_time_step_spec = tensor_spec.from_spec(py_agent_time_step_spec)

    if not FLAGS.reduced_observation:
      skill_dynamics_observation_size = (
          py_env_time_step_spec.observation.shape[0] - FLAGS.num_skills)
    else:
      skill_dynamics_observation_size = FLAGS.reduced_observation

    # TODO(architsh): Shift co-ordinate hiding to actor_net and critic_net
    # (good for further image-based processing as well).
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_agent_time_step_spec.observation,
        tf_action_spec,
        fc_layer_params=(FLAGS.hidden_layer_size,) * 2,
        continuous_projection_net=do._normal_projection_net,
    )
    critic_net = critic_network.CriticNetwork(
        (tf_agent_time_step_spec.observation, tf_action_spec),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(FLAGS.hidden_layer_size,) * 2,
    )

    if (FLAGS.skill_dynamics_relabel_type is not None and
        "importance_sampling" in FLAGS.skill_dynamics_relabel_type and
        FLAGS.is_clip_eps > 1.0):
      reweigh_batches_flag = True
    else:
      reweigh_batches_flag = False

    agent = dads_agent.DADSAgent(
        # DADS parameters
        save_dir,
        skill_dynamics_observation_size,
        observation_modify_fn=do.process_observation,
        restrict_input_size=observation_omit_size,
        latent_size=FLAGS.num_skills,
        latent_prior=FLAGS.skill_type,
        prior_samples=FLAGS.random_skills,
        fc_layer_params=(FLAGS.hidden_layer_size,) * 2,
        normalize_observations=FLAGS.normalize_data,
        network_type=FLAGS.graph_type,
        num_mixture_components=FLAGS.num_components,
        fix_variance=FLAGS.fix_variance,
        reweigh_batches=reweigh_batches_flag,
        skill_dynamics_learning_rate=FLAGS.skill_dynamics_lr,
        # SAC parameters
        time_step_spec=tf_agent_time_step_spec,
        action_spec=tf_action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        target_update_tau=0.005,
        target_update_period=1,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=FLAGS.agent_lr),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=FLAGS.agent_lr),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=FLAGS.agent_lr),
        td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
        gamma=FLAGS.agent_gamma,
        reward_scale_factor=1.0 / (FLAGS.agent_entropy + 1e-12),
        gradient_clipping=None,
        debug_summaries=FLAGS.debug,
        train_step_counter=global_step,
    )

    # evaluation policy
    eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

    # collection policy
    if FLAGS.collect_policy == "default":
      collect_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)
    elif FLAGS.collect_policy == "ou_noise":
      collect_policy = py_tf_policy.PyTFPolicy(
          ou_noise_policy.OUNoisePolicy(
              agent.collect_policy, ou_stddev=0.2, ou_damping=0.15))

    # relabelling policy deals with batches of data, unlike collect and eval
    relabel_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)

    # constructing a replay buffer, need a python spec
    policy_step_spec = policy_step.PolicyStep(
        action=py_action_spec, state=(), info=())
    if (FLAGS.skill_dynamics_relabel_type is not None and
        "importance_sampling" in FLAGS.skill_dynamics_relabel_type and
        FLAGS.is_clip_eps > 1.0):
      policy_step_spec = policy_step_spec._replace(
          info=policy_step.set_log_probability(
              policy_step_spec.info,
              array_spec.ArraySpec(
                  shape=(), dtype=np.float32, name="action_log_prob"),
          ))
    trajectory_spec = from_transition(py_env_time_step_spec, policy_step_spec,
                                      py_env_time_step_spec)
    capacity = FLAGS.replay_buffer_capacity
    # for all the data collected
    rbuffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
        capacity=capacity, data_spec=trajectory_spec)

    if FLAGS.train_skill_dynamics_on_policy:
      # for on-policy data (if something special is required)
      on_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
          capacity=FLAGS.initial_collect_steps + FLAGS.collect_steps + 10,
          data_spec=trajectory_spec,
      )

    # insert experience manually with relabelled rewards and skills
    agent.build_agent_graph()
    agent.build_skill_dynamics_graph()
    agent.create_savers()

    # saving this way requires the saver to be outside the object
    train_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(save_dir, "agent"),
        agent=agent,
        global_step=global_step,
    )
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(save_dir, "policy"),
        policy=agent.policy,
        global_step=global_step,
    )
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(save_dir, "replay_buffer"),
        max_to_keep=1,
        replay_buffer=rbuffer,
    )

    setup_time = time.time() - start_time
    print("Setup time:", setup_time)

    with tf.compat.v1.Session().as_default() as sess:
      eval_policy.session = sess
      eval_policy.initialize(None)
      eval_policy.restore(os.path.join(FLAGS.logdir, "models", "policy"))
      plotdir = os.path.join(FLAGS.logdir, "plots")
      if not os.path.exists(plotdir):
        os.mkdir(plotdir)
      do.FLAGS = FLAGS
      do.eval_loop(eval_dir=plotdir, eval_policy=eval_policy,
                   plot_name="plot")
def _get_action_step(action, log_prob):
  step = policy_step.PolicyStep(action=tf.convert_to_tensor(action))
  return step._replace(
      info=policy_step.set_log_probability(step.info,
                                           tf.convert_to_tensor(log_prob)))
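# --- Hedged usage example (not from the source) ---
# _get_action_step above packs an externally chosen action and its
# log-probability into a PolicyStep. Assumes the snippet's module imports
# (tf, policy_step); values are illustrative.
step = _get_action_step(action=[1], log_prob=[-0.69])
chosen = step.action                                # tf.Tensor([1], ...)
log_p = policy_step.get_log_probability(step.info)  # tf.Tensor([-0.69], ...)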
def _action(self, time_step, policy_state, seed):
  seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
  greedy_action = self._greedy_policy.action(time_step, policy_state)
  random_action = self._random_policy.action(time_step, (), seed_stream())

  outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  rng = tf.random.uniform(
      outer_shape, maxval=1.0, seed=seed_stream(), name='epsilon_rng')
  cond = tf.greater_equal(rng, self._get_epsilon())

  # Selects the action/info from the random policy with probability epsilon.
  # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
  # either a scalar or a vector. Use tf.compat.v2 so that it can support any
  # condition whose leading dimensions are the same as the other operands of
  # tf.where.
  outer_ndims = int(outer_shape.shape[0])
  if outer_ndims >= 2:
    raise ValueError(
        'Only supports batched time steps with a single batch dimension')
  action = tf.nest.map_structure(lambda g, r: tf.compat.v1.where(cond, g, r),
                                 greedy_action.action, random_action.action)

  if greedy_action.info:
    if not random_action.info:
      raise ValueError('Incompatible info field')
    # Note that the objects in PolicyInfo may have different shapes, so we
    # need to call nest_utils.where() on each type of object.
    info = tf.nest.map_structure(lambda x, y: nest_utils.where(cond, x, y),
                                 greedy_action.info, random_action.info)
    if self._emit_log_probability:
      # At this point, info.log_probability contains the log prob of the
      # action chosen, conditioned on the policy that was chosen. We want to
      # emit the full log probability of the action, so we'll add in the log
      # probability of choosing the policy.
      random_log_prob = tf.nest.map_structure(
          lambda t: tf.math.log(tf.zeros_like(t) + self._get_epsilon()),
          info.log_probability)
      greedy_log_prob = tf.nest.map_structure(
          lambda t: tf.math.log(tf.ones_like(t) - self._get_epsilon()),
          random_log_prob)
      log_prob_of_chosen_policy = nest_utils.where(cond, greedy_log_prob,
                                                   random_log_prob)
      log_prob = tf.nest.map_structure(lambda a, b: a + b,
                                       log_prob_of_chosen_policy,
                                       info.log_probability)
      info = policy_step.set_log_probability(info, log_prob)

    # Overwrite bandit policy info type.
    if policy_utilities.has_bandit_policy_type(info, check_for_tensor=True):
      # Generate a mask of the same shape as bandit_policy_type
      # (batch_size, 1). This is the opposite of `cond`, which is a 1-D bool
      # tensor (batch_size,) that is true when the greedy policy was used,
      # and false otherwise.
      random_policy_mask = tf.reshape(
          tf.logical_not(cond),
          tf.shape(info.bandit_policy_type))  # pytype: disable=attribute-error
      bandit_policy_type = policy_utilities.bandit_policy_uniform_mask(
          info.bandit_policy_type,
          mask=random_policy_mask)  # pytype: disable=attribute-error
      info = policy_utilities.set_bandit_policy_type(info, bandit_policy_type)
  else:
    if random_action.info:
      raise ValueError('Incompatible info field')
    info = ()

  # The state of the epsilon-greedy policy is the state of the underlying
  # greedy policy (the random policy carries no state). It is commonly
  # assumed that the new policy state depends only on the previous state and
  # "time_step"; the action (be it the greedy one or the random one) does
  # not influence the new policy state.
  state = greedy_action.state

  return policy_step.PolicyStep(action, state, info)
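# --- Hedged numeric sketch (not from the source) ---
# The mixture log-probability computed above: the emitted value is
# log p(branch) + log p(action | branch). With epsilon = 0.1, a random branch
# that picks uniformly among 4 actions, and a deterministic greedy policy
# that reports log prob 0 for its argmax action:
import math

epsilon = 0.1
log_p_random = math.log(epsilon) + math.log(0.25)  # random branch chosen
log_p_greedy = math.log(1.0 - epsilon) + 0.0       # greedy branch chosen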
def load(self):
  # setting up
  tf.compat.v1.enable_resource_variables()
  tf.compat.v1.disable_eager_execution()

  root_dir = os.path.abspath(os.path.expanduser(self.flags.logdir))
  if not tf.io.gfile.exists(root_dir):
    tf.io.gfile.makedirs(root_dir)
  log_dir = os.path.join(root_dir, self.flags.environment)
  if not tf.io.gfile.exists(log_dir):
    tf.io.gfile.makedirs(log_dir)
  save_dir = os.path.join(log_dir, "models")
  if not tf.io.gfile.exists(save_dir):
    tf.io.gfile.makedirs(save_dir)

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      os.path.join(log_dir, "train", "in_graph_data"),
      flush_millis=10 * 1000)
  train_summary_writer.set_as_default()
  global_step = tf.compat.v1.train.get_or_create_global_step()

  with tf.compat.v2.summary.record_if(True):
    # environment related stuff
    env = do.get_environment(env_name=self.flags.environment)
    py_env = wrap_env(
        skill_wrapper.SkillWrapper(
            env,
            num_latent_skills=self.flags.num_skills,
            skill_type=self.flags.skill_type,
            preset_skill=None,
            min_steps_before_resample=self.flags.min_steps_before_resample,
            resample_prob=self.flags.resample_prob,
        ),
        max_episode_steps=self.flags.max_env_steps,
    )

    # all specifications required for all networks and agents
    py_action_spec = py_env.action_spec()
    tf_action_spec = tensor_spec.from_spec(
        py_action_spec)  # policy, critic action spec
    env_obs_spec = py_env.observation_spec()
    py_env_time_step_spec = ts.time_step_spec(
        env_obs_spec)  # replay buffer time_step spec
    if self.flags.observation_omission_size > 0:
      agent_obs_spec = array_spec.BoundedArraySpec(
          (env_obs_spec.shape[0] - self.flags.observation_omission_size,),
          env_obs_spec.dtype,
          minimum=env_obs_spec.minimum,
          maximum=env_obs_spec.maximum,
          name=env_obs_spec.name,
      )  # policy, critic observation spec
    else:
      agent_obs_spec = env_obs_spec
    py_agent_time_step_spec = ts.time_step_spec(
        agent_obs_spec)  # policy, critic time_step spec
    tf_agent_time_step_spec = tensor_spec.from_spec(py_agent_time_step_spec)

    if not self.flags.reduced_observation:
      skill_dynamics_observation_size = (
          py_env_time_step_spec.observation.shape[0] - self.flags.num_skills)
    else:
      skill_dynamics_observation_size = self.flags.reduced_observation

    # TODO(architsh): Shift co-ordinate hiding to actor_net and critic_net
    # (good for further image-based processing as well).
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_agent_time_step_spec.observation,
        tf_action_spec,
        fc_layer_params=(self.flags.hidden_layer_size,) * 2,
        continuous_projection_net=do._normal_projection_net,
    )
    critic_net = critic_network.CriticNetwork(
        (tf_agent_time_step_spec.observation, tf_action_spec),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(self.flags.hidden_layer_size,) * 2,
    )

    if (self.flags.skill_dynamics_relabel_type is not None and
        "importance_sampling" in self.flags.skill_dynamics_relabel_type and
        self.flags.is_clip_eps > 1.0):
      reweigh_batches_flag = True
    else:
      reweigh_batches_flag = False

    agent = dads_agent.DADSAgent(
        # DADS parameters
        save_dir,
        skill_dynamics_observation_size,
        observation_modify_fn=self.process_observation,
        restrict_input_size=self.flags.observation_omission_size,
        latent_size=self.flags.num_skills,
        latent_prior=self.flags.skill_type,
        prior_samples=self.flags.random_skills,
        fc_layer_params=(self.flags.hidden_layer_size,) * 2,
        normalize_observations=self.flags.normalize_data,
        network_type=self.flags.graph_type,
        num_mixture_components=self.flags.num_components,
        fix_variance=self.flags.fix_variance,
        reweigh_batches=reweigh_batches_flag,
        skill_dynamics_learning_rate=self.flags.skill_dynamics_lr,
        # SAC parameters
        time_step_spec=tf_agent_time_step_spec,
        action_spec=tf_action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        target_update_tau=0.005,
        target_update_period=1,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=self.flags.agent_lr),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=self.flags.agent_lr),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=self.flags.agent_lr),
        td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
        gamma=self.flags.agent_gamma,
        reward_scale_factor=1.0 / (self.flags.agent_entropy + 1e-12),
        gradient_clipping=None,
        debug_summaries=self.flags.debug,
        train_step_counter=global_step,
    )

    # evaluation policy
    eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

    # constructing a replay buffer, need a python spec
    policy_step_spec = policy_step.PolicyStep(
        action=py_action_spec, state=(), info=())
    if (self.flags.skill_dynamics_relabel_type is not None and
        "importance_sampling" in self.flags.skill_dynamics_relabel_type and
        self.flags.is_clip_eps > 1.0):
      policy_step_spec = policy_step_spec._replace(
          info=policy_step.set_log_probability(
              policy_step_spec.info,
              array_spec.ArraySpec(
                  shape=(), dtype=np.float32, name="action_log_prob"),
          ))

    # insert experience manually with relabelled rewards and skills
    agent.build_agent_graph()
    agent.build_skill_dynamics_graph()

    with tf.compat.v1.Session().as_default() as sess:
      eval_policy.session = sess
      eval_policy.initialize(None)
      eval_policy.restore(
          os.path.join(self.flags.logdir, "models", "policy"))
      self.policy = eval_policy