def __init__(self,
               actions: types.NestedTensor,
               time_step_spec: ts.TimeStep,
               action_spec: types.NestedTensorSpec,
               policy_info: types.NestedTensorSpec = (),
               info_spec: types.NestedTensorSpec = (),
               name: Optional[Text] = None):
    """A policy which always returns a fixed action.

    Args:
      actions: A Tensor, or a nested dict, list or tuple of Tensors
        corresponding to `action_spec()`.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      policy_info: A policy info to be returned in PolicyStep.
      info_spec: A policy info spec.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.
    """
    super(FixedPolicy, self).__init__(time_step_spec, action_spec, clip=False,
                                      info_spec=info_spec,
                                      name=name, emit_log_probability=True)
    nest_utils.assert_same_structure(self._action_spec, actions)

    def convert(action, spec):
      return tf.convert_to_tensor(value=action, dtype=spec.dtype)

    self._action_value = tf.nest.map_structure(convert, actions,
                                               self._action_spec)
    log_probability = tf.nest.map_structure(
        lambda t: tf.constant(0.0, tf.float32), self._action_spec)
    self._policy_info = policy_step.set_log_probability(policy_info,
                                                        log_probability)  # pytype: disable=wrong-arg-types
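# A minimal usage sketch for the constructor above, assuming the class is
# importable as `FixedPolicy`; the spec shapes and the fixed action value are
# arbitrary placeholders.
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)
time_step_spec = ts.time_step_spec(tensor_spec.TensorSpec((4,), tf.float32))
policy = FixedPolicy(actions=tf.constant(2, dtype=tf.int32),
                     time_step_spec=time_step_spec,
                     action_spec=action_spec)
# A batched first time step with a dummy observation.
time_step = ts.restart(tf.zeros([1, 4]), batch_size=1)
action_step = policy.action(time_step)
# action_step.action is the fixed value (here 2); since the policy emits
# log-probabilities, action_step.info is expected to carry log_probability == 0.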
# Example 2
  def distribution(self, time_step, policy_state=()):
    """Generates the distribution over next actions given the time_step.

    Args:
      time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
      policy_state: A Tensor, or a nested dict, list or tuple of Tensors
        representing the previous policy_state.

    Returns:
      A `PolicyStep` named tuple containing:

        `action`: A tf.distribution capturing the distribution of next actions.
        `state`: A policy state tensor for the next call to distribution.
        `info`: Optional side information such as action log probabilities.
    """
    tf.nest.assert_same_structure(time_step, self._time_step_spec)
    tf.nest.assert_same_structure(policy_state, self._policy_state_spec)
    if self._automatic_state_reset:
      policy_state = self._maybe_reset_state(time_step, policy_state)
    step = self._distribution(time_step=time_step, policy_state=policy_state)
    if self.emit_log_probability:
      # This is set only for compatibility with the info_spec in the constructor.
      info = policy_step.set_log_probability(
          step.info,
          tf.nest.map_structure(
              lambda _: tf.constant(0., dtype=tf.float32),
              policy_step.get_log_probability(self._info_spec)))
      step = step._replace(info=info)
    tf.nest.assert_same_structure(step, self._policy_step_spec)
    return step
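# A minimal sketch of the policy_step helpers used above. set_log_probability
# accepts an empty info (as in the first constructor above) or a dict/namedtuple
# with a 'log_probability' field; get_log_probability reads it back.
import tensorflow as tf
from tf_agents.trajectories import policy_step

info = policy_step.set_log_probability((), tf.constant(0.0))
log_prob = policy_step.get_log_probability(info)   # recovers the 0.0 tensor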
# Example 3
    def _action(self, time_step, policy_state, seed):
        """Implementation of `action`.

    Args:
      time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
      policy_state: A Tensor, or a nested dict, list or tuple of Tensors
        representing the previous policy_state.
      seed: Seed to use if action performs sampling (optional).

    Returns:
      A `PolicyStep` named tuple containing:
        `action`: An action Tensor matching the `action_spec`.
        `state`: A policy state tensor to be fed into the next call to action.
        `info`: Optional side information such as action log probabilities.
    """
        seed_stream = tfp.util.SeedStream(seed=seed,
                                          salt='tf_agents_tf_policy')
        distribution_step = self._distribution(time_step, policy_state)
        actions = tf.nest.map_structure(
            lambda d: reparameterized_sampling.sample(d, seed=seed_stream()),
            distribution_step.action)
        info = distribution_step.info
        if self.emit_log_probability:
            try:
                log_probability = tf.nest.map_structure(
                    lambda a, d: d.log_prob(a), actions,
                    distribution_step.action)
                info = policy_step.set_log_probability(info, log_probability)
            except Exception as e:
                raise TypeError(
                    '%s does not support emitting log-probabilities.' %
                    type(self).__name__) from e

        return distribution_step._replace(action=actions, info=info)
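# A minimal sketch of the sample-then-score pattern used in `_action` above,
# shown with a plain TFP distribution; the seed and salt values are arbitrary.
import tensorflow as tf
import tensorflow_probability as tfp

seed_stream = tfp.util.SeedStream(seed=123, salt='example_salt')
dist = tfp.distributions.Categorical(logits=tf.zeros([2, 4]))  # batch of 2, 4 actions
action = dist.sample(seed=seed_stream())
log_prob = dist.log_prob(action)   # what set_log_probability stores in `info`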
# Example 4
    def _action(self, time_step, policy_state, seed):
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)

        outer_dims = nest_utils.get_outer_shape(time_step,
                                                self._time_step_spec)
        if observation_and_action_constraint_splitter is not None:
            observation, mask = observation_and_action_constraint_splitter(
                time_step.observation)

            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            action_ = tf.cast(
                masked_categorical.sample() + self.action_spec.minimum,
                self.action_spec.dtype)

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if self.action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)
        else:
            observation = time_step.observation

            action_ = tensor_spec.sample_spec_nest(self._action_spec,
                                                   seed=seed,
                                                   outer_dims=outer_dims)
            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)
        if self._accepts_per_arm_features:

            def _gather_fn(t):
                return tf.gather(params=t, indices=action_, batch_dims=1)

            chosen_arm_features = tf.nest.map_structure(
                _gather_fn, observation['per_arm'])
            policy_info = policy_info._replace(
                chosen_arm_features=chosen_arm_features)

        # TODO(b/78181147): Investigate why this control dependency is required.
        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)

        if self.emit_log_probability:
            if observation_and_action_constraint_splitter is not None:
                log_probability = masked_categorical.log_prob(
                    action_ - self.action_spec.minimum)
            else:
                action_probability = tf.nest.map_structure(
                    _uniform_probability, self._action_spec)
                log_probability = tf.nest.map_structure(
                    tf.math.log, action_probability)
            policy_info = policy_step.set_log_probability(
                policy_info, log_probability)

        step = policy_step.PolicyStep(action_, policy_state, policy_info)
        return step
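# A minimal sketch of an observation_and_action_constraint_splitter of the kind
# consumed above, plus the masked sampling it enables; the dict keys, shapes,
# and mask values are illustrative only.
import tensorflow as tf
from tf_agents.distributions import masked

def splitter(observation):
    # Return (network input, action mask).
    return observation['network_input'], observation['mask']

observation = {'network_input': tf.zeros([2, 5]),
               'mask': tf.constant([[1, 0, 1], [0, 1, 1]], dtype=tf.int32)}
_, mask = splitter(observation)
zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
masked_categorical = masked.MaskedCategorical(zero_logits, mask)
sample = masked_categorical.sample()   # only indices where mask == 1 are drawn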
# Example 5
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 policy_state_spec=(),
                 info_spec=(),
                 clip=True,
                 emit_log_probability=False,
                 automatic_state_reset=True,
                 name=None):
        """Initialization of Base class.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
        provided by the user to the subclass.
      action_spec: A nest of BoundedTensorSpec representing the actions. Usually
        provided by the user to the subclass.
      policy_state_spec: A nest of TensorSpec representing the policy_state.
        Provided by the subclass, not directly by the user.
      info_spec: A nest of TensorSpec representing the policy info. Provided by
        the subclass, not directly by the user.
      clip: Whether to clip actions to spec before returning them.  Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      emit_log_probability: Emit log-probabilities of actions, if supported. If
        True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
        Please consult utility methods provided in policy_step for setting and
        retrieving these. When working with custom policies, either provide a
        dictionary info_spec or a namedtuple with the field 'log_probability'.
      automatic_state_reset:  If `True`, then `get_initial_policy_state` is used
        to clear state in `action()` and `distribution()` for time steps
        where `time_step.is_first()`.
      name: A name for this module. Defaults to the class name.
    """
        super(Base, self).__init__(name=name)
        common.assert_members_are_not_overridden(base_cls=Base, instance=self)

        self._time_step_spec = time_step_spec
        self._action_spec = action_spec
        self._policy_state_spec = policy_state_spec
        self._emit_log_probability = emit_log_probability
        if emit_log_probability:
            log_probability_spec = tensor_spec.BoundedTensorSpec(
                shape=(),
                dtype=tf.float32,
                maximum=0,
                minimum=-float('inf'),
                name='log_probability')
            log_probability_spec = tf.nest.map_structure(
                lambda _: log_probability_spec, action_spec)
            info_spec = policy_step.set_log_probability(
                info_spec, log_probability_spec)

        self._info_spec = info_spec
        self._setup_specs()
        self._clip = clip
        self._action_fn = common.function_in_tf1()(self._action)
        self._automatic_state_reset = automatic_state_reset
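# A minimal sketch of the info_spec produced by the emit_log_probability branch
# above, for a single scalar action spec.
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step

action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=4)
log_probability_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.float32, maximum=0, minimum=-float('inf'),
    name='log_probability')
info_spec = policy_step.set_log_probability(
    (), tf.nest.map_structure(lambda _: log_probability_spec, action_spec))
# info_spec now carries a 'log_probability' spec mirroring the action structure.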
  def _get_policy_info_and_action(self, time_step):
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    log_probability = tf.nest.map_structure(
        lambda _: tf.zeros(outer_shape, tf.float32), self._action_spec)
    policy_info = policy_step.set_log_probability(
        self._policy_info, log_probability=log_probability)
    action = tf.nest.map_structure(lambda t: common.replicate(t, outer_shape),
                                   self._action_value)
    return policy_info, action
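# A minimal sketch of how the fixed action is broadcast to the batch above.
# With an outer (batch) shape of [3], a scalar action of 2 is expected to
# become [2, 2, 2], with a matching zero log-probability vector.
import tensorflow as tf
from tf_agents.utils import common

outer_shape = tf.constant([3])
replicated_action = common.replicate(tf.constant(2), outer_shape)  # [2, 2, 2]
log_probability = tf.zeros(outer_shape, tf.float32)                # [0., 0., 0.]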
# Example 7
    def distribution(
        self, time_step: ts.TimeStep, policy_state: types.NestedTensor = ()
    ) -> policy_step.PolicyStep:
        """Generates the distribution over next actions given the time_step.

    Args:
      time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
      policy_state: A Tensor, or a nested dict, list or tuple of Tensors
        representing the previous policy_state.

    Returns:
      A `PolicyStep` named tuple containing:

        `action`: A tf.distribution capturing the distribution of next actions.
        `state`: A policy state tensor for the next call to distribution.
        `info`: Optional side information such as action log probabilities.

    Raises:
      ValueError or TypeError: If `validate_args is True` and inputs or
        outputs do not match `time_step_spec`, `policy_state_spec`,
        or `policy_step_spec`.
    """
        if self._validate_args:
            time_step = nest_utils.prune_extra_keys(self._time_step_spec,
                                                    time_step)
            policy_state = nest_utils.prune_extra_keys(self._policy_state_spec,
                                                       policy_state)
            nest_utils.assert_same_structure(
                time_step,
                self._time_step_spec,
                message='time_step and time_step_spec structures do not match')
            nest_utils.assert_same_structure(
                policy_state,
                self._policy_state_spec,
                message=
                'policy_state and policy_state_spec structures do not match')
        if self._automatic_state_reset:
            policy_state = self._maybe_reset_state(time_step, policy_state)
        step = self._distribution(time_step=time_step,
                                  policy_state=policy_state)
        if self.emit_log_probability:
            # This is set only for compatibility with the info_spec in the constructor.
            info = policy_step.set_log_probability(
                step.info,
                tf.nest.map_structure(
                    lambda _: tf.constant(0., dtype=tf.float32),
                    policy_step.get_log_probability(self._info_spec)))
            step = step._replace(info=info)
        if self._validate_args:
            nest_utils.assert_same_structure(
                step,
                self._policy_step_spec,
                message=('distribution output and policy_step_spec structures '
                         'do not match'))
        return step
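# A minimal sketch of the validate_args helpers used above. prune_extra_keys is
# expected to drop entries of the value nest that the spec does not declare
# (behavior summarized here, not asserted), and assert_same_structure raises
# with the given message on a mismatch.
from tf_agents.utils import nest_utils

spec = {'observation': 0, 'reward': 0}
value = {'observation': 1, 'reward': 2, 'extra_debug_info': 3}
pruned = nest_utils.prune_extra_keys(spec, value)   # expected: observation, reward only
nest_utils.assert_same_structure(
    pruned, spec, message='value and spec structures do not match')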
# Example 8
    def _distribution2action(self, distribution_step, seed_stream):
        """
        Convert distribution_step to action_step
        :param distribution_step:
        :param seed_stream:
        :return: action_step
        """

        actions = tf.nest.map_structure(lambda d: d.sample(seed=seed_stream()),
                                        distribution_step.action)
        info = distribution_step.info
        if self.emit_log_probability:
            try:
                log_probability = tf.nest.map_structure(
                    lambda a, d: d.log_prob(a), actions,
                    distribution_step.action)
                info = policy_step.set_log_probability(info, log_probability)
            except Exception as e:
                raise TypeError(
                    '%s does not support emitting log-probabilities.' %
                    type(self).__name__) from e

        step = tf.nest.map_structure(lambda x: x, distribution_step)
        step = step._replace(action=actions, info=info)

        def clip_action(action, action_spec):
            if isinstance(action_spec, tensor_spec.BoundedTensorSpec):
                return common.clip_to_spec(action, action_spec)
            return action

        if self._clip:
            clipped_actions = tf.nest.map_structure(clip_action, step.action,
                                                    self._action_spec)
            step = step._replace(action=clipped_actions)

        tf.nest.assert_same_structure(step, self._policy_step_spec)

        def compare_to_spec(value, spec):
            return value.dtype.is_compatible_with(spec.dtype)

        compatibility = tf.nest.flatten(
            tf.nest.map_structure(compare_to_spec, step.action,
                                  self.action_spec))

        if not all(compatibility):
            get_dtype = lambda x: x.dtype
            action_dtypes = tf.nest.map_structure(get_dtype, step.action)
            spec_dtypes = tf.nest.map_structure(get_dtype, self.action_spec)

            raise TypeError(
                'Policy produced an action with a dtype that doesn\'t '
                'match its action_spec. Got action: %s with '
                'action_spec: %s' % (action_dtypes, spec_dtypes))

        return step
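# A minimal sketch of the clipping helper used above, shown standalone; values
# outside the spec bounds are clipped to [minimum, maximum].
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, minimum=-1.0, maximum=1.0)
action = tf.constant([[-3.0, 0.5]])
clipped = common.clip_to_spec(action, spec)   # -> [[-1.0, 0.5]]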
# Example 9
    def __init__(self,
                 actions: types.NestedTensor,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 emit_log_probability: bool = True,
                 policy_info: types.NestedTensorSpec = (),
                 info_spec: types.NestedTensorSpec = (),
                 name: Optional[Text] = None):
        """A policy which always returns a fixed action.

    Args:
      actions: A Tensor, or a nested dict, list or tuple of Tensors
        corresponding to `action_spec()`.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      emit_log_probability: Emit log-probabilities of actions, if supported. If
        True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
        Please consult utility methods provided in policy_step for setting and
        retrieving these. When working with custom policies, either provide a
        dictionary info_spec or a namedtuple with the field 'log_probability'.
      policy_info: A policy info to be returned in PolicyStep.
      info_spec: A policy info spec.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        super(FixedPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             clip=False,
                             info_spec=info_spec,
                             name=name,
                             emit_log_probability=emit_log_probability)
        nest_utils.assert_same_structure(self._action_spec, actions)

        def convert(action, spec):
            return tf.convert_to_tensor(value=action, dtype=spec.dtype)

        self._action_value = tf.nest.map_structure(convert, actions,
                                                   self._action_spec)
        if self._emit_log_probability:
            log_probability = tf.nest.map_structure(
                lambda t: tf.constant(0.0, tf.float32), self._action_spec)
            self._policy_info = policy_step.set_log_probability(
                policy_info, log_probability)  # pytype: disable=wrong-arg-types
        else:
            self._policy_info = policy_info
# Example 10
    def _action(self, time_step, policy_state, seed):
        if time_step.observation['mask'] is not None:

            mask = time_step.observation['mask']

            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                                self.action_spec.dtype)

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if self.action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
        else:
            outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

            action_ = tensor_spec.sample_spec_nest(
                self._action_spec, seed=seed, outer_dims=outer_dims)

        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)

        policy_info = tensor_spec.sample_spec_nest(self._info_spec)

        if self.emit_log_probability:
            if time_step.observation['mask'] is not None:
                log_probability = masked_categorical.log_prob(
                    action_ - self.action_spec.minimum)
            else:
                # Uniform probability over the actions allowed by each action
                # spec (assumes bounded, discrete specs).
                def _uniform_probability(spec):
                    num_actions = tf.cast(
                        spec.maximum - spec.minimum + 1, tf.float32)
                    return 1.0 / num_actions

                action_probability = tf.nest.map_structure(
                    _uniform_probability, self._action_spec)
                log_probability = tf.nest.map_structure(
                    tf.math.log, action_probability)
            policy_info = policy_step.set_log_probability(policy_info, log_probability)

        step = policy_step.PolicyStep(action_, policy_state, policy_info)
        return step
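# A quick sanity check of the uniform log-probability used in the unmasked
# branch above: for a bounded spec with 4 actions, log(1/4) ≈ -1.386.
import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)
num_actions = tf.cast(spec.maximum - spec.minimum + 1, tf.float32)
uniform_log_prob = tf.math.log(1.0 / num_actions)   # ≈ -1.3862944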
# Example 11
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 policy_state_spec=(),
                 info_spec=(),
                 clip=True,
                 emit_log_probability=False,
                 automatic_state_reset=True,
                 observation_and_action_constraint_splitter=None,
                 validate_args=True,
                 name=None):
        """Initialization of TFPolicy class.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
        provided by the user to the subclass.
      action_spec: A nest of BoundedTensorSpec representing the actions. Usually
        provided by the user to the subclass.
      policy_state_spec: A nest of TensorSpec representing the policy_state.
        Provided by the subclass, not directly by the user.
      info_spec: A nest of TensorSpec representing the policy info. Provided by
        the subclass, not directly by the user.
      clip: Whether to clip actions to spec before returning them.  Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      emit_log_probability: Emit log-probabilities of actions, if supported. If
        True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
        Please consult utility methods provided in policy_step for setting and
        retrieving these. When working with custom policies, either provide a
        dictionary info_spec or a namedtuple with the field 'log_probability'.
      automatic_state_reset:  If `True`, then `get_initial_policy_state` is used
        to clear state in `action()` and `distribution()` for time steps
        where `time_step.is_first()`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment. The function takes in a full observation and returns a
        tuple consisting of 1) the part of the observation intended as input to
        the network and 2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
          sure the provided `q_network` is compatible with the network-specific
          half of the output of the
          `observation_and_action_constraint_splitter`. In particular,
          `observation_and_action_constraint_splitter` will be called on the
          observation before passing to the network. If
          `observation_and_action_constraint_splitter` is None, action
          constraints are not applied.
      validate_args: Python bool.  Whether to verify inputs to, and outputs of,
        functions like `action` and `distribution` against spec structures,
        dtypes, and shapes.

        Research code may prefer to set this value to `False` to allow iterating
        on input and output structures without being hamstrung by overly
        rigid checking (at the cost of harder-to-debug errors).

        See also `TFAgent.validate_args`.
      name: A name for this module. Defaults to the class name.
    """
        super(TFPolicy, self).__init__(name=name)
        common.check_tf1_allowed()
        common.tf_agents_gauge.get_cell('TFAPolicy').set(True)
        common.assert_members_are_not_overridden(base_cls=TFPolicy,
                                                 instance=self)
        if not isinstance(time_step_spec, ts.TimeStep):
            raise ValueError(
                'The `time_step_spec` must be an instance of `TimeStep`, but is `{}`.'
                .format(type(time_step_spec)))

        self._time_step_spec = time_step_spec
        self._action_spec = action_spec
        self._policy_state_spec = policy_state_spec
        self._emit_log_probability = emit_log_probability
        self._validate_args = validate_args

        if emit_log_probability:
            log_probability_spec = tensor_spec.BoundedTensorSpec(
                shape=(),
                dtype=tf.float32,
                maximum=0,
                minimum=-float('inf'),
                name='log_probability')
            log_probability_spec = tf.nest.map_structure(
                lambda _: log_probability_spec, action_spec)
            info_spec = policy_step.set_log_probability(
                info_spec, log_probability_spec)

        self._info_spec = info_spec
        self._setup_specs()
        self._clip = clip
        self._action_fn = common.function_in_tf1()(self._action)
        self._automatic_state_reset = automatic_state_reset
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
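# A minimal sketch wiring a splitter of the form described in the docstring
# above into a concrete policy. RandomTFPolicy is used only as a stand-in and
# is assumed to forward the splitter kwarg to this base class; the specs are
# illustrative.
import tensorflow as tf
from tf_agents.policies import random_tf_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = {
    'network_input': tensor_spec.TensorSpec((4,), tf.float32),
    'constraint': tensor_spec.TensorSpec((3,), tf.int32),
}
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=2)

def observation_and_action_constraint_splitter(observation):
    return observation['network_input'], observation['constraint']

policy = random_tf_policy.RandomTFPolicy(
    ts.time_step_spec(observation_spec), action_spec,
    observation_and_action_constraint_splitter=(
        observation_and_action_constraint_splitter))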
# Example 12
    def _action(self, time_step, policy_state, seed):
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)

        outer_dims = nest_utils.get_outer_shape(time_step,
                                                self._time_step_spec)
        if observation_and_action_constraint_splitter is not None:
            observation, mask = observation_and_action_constraint_splitter(
                time_step.observation)

            action_spec = tensor_spec.from_spec(self.action_spec)
            action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)
            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            action_ = tf.cast(
                masked_categorical.sample() + action_spec.minimum,
                action_spec.dtype)

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)
        else:
            observation = time_step.observation
            action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)

            if self._accepts_per_arm_features:
                max_num_arms = action_spec.maximum - action_spec.minimum + 1
                batch_size = tf.shape(time_step.step_type)[0]
                num_actions = observation.get(
                    bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY,
                    tf.ones(shape=(batch_size, ), dtype=tf.int32) *
                    max_num_arms)
                mask = tf.sequence_mask(num_actions, max_num_arms)
                zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
                masked_categorical = masked.MaskedCategorical(
                    zero_logits, mask)
                action_ = tf.nest.map_structure(
                    lambda t: tf.cast(masked_categorical.sample() + t.minimum,
                                      t.dtype), action_spec)
            else:
                action_ = tensor_spec.sample_spec_nest(self._action_spec,
                                                       seed=seed,
                                                       outer_dims=outer_dims)

            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)

        # Update policy info with chosen arm features.
        if self._accepts_per_arm_features:

            def _gather_fn(t):
                return tf.gather(params=t, indices=action_, batch_dims=1)

            chosen_arm_features = tf.nest.map_structure(
                _gather_fn, observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])

            if policy_utilities.has_chosen_arm_features(self._info_spec):
                policy_info = policy_info._replace(
                    chosen_arm_features=chosen_arm_features)

        # TODO(b/78181147): Investigate why this control dependency is required.
        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)

        if self.emit_log_probability:
            if (self._accepts_per_arm_features
                    or observation_and_action_constraint_splitter is not None):
                action_spec = cast(tensor_spec.BoundedTensorSpec,
                                   self.action_spec)
                log_probability = masked_categorical.log_prob(
                    action_ - action_spec.minimum)
            else:
                log_probability = tf.nest.map_structure(
                    lambda s: _calculate_log_probability(outer_dims, s),
                    self._action_spec)
            policy_info = policy_step.set_log_probability(
                policy_info, log_probability)

        step = policy_step.PolicyStep(action_, policy_state, policy_info)
        return step
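# A minimal sketch of the per-arm feature gather used above: for each batch
# element, the row of per-arm features at the chosen action index is selected.
import tensorflow as tf

per_arm_features = tf.constant([[[1., 1.], [2., 2.], [3., 3.]],
                                [[4., 4.], [5., 5.], [6., 6.]]])  # [batch, arms, dim]
chosen_actions = tf.constant([2, 0])                              # [batch]
chosen = tf.gather(per_arm_features, chosen_actions, batch_dims=1)
# -> [[3., 3.], [4., 4.]]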
    def _get_step(self) -> EnvStep:
        if self._start_on_next_step:
            self._start_new_episode()

        if StepType.is_last(self._step_type):
            # This is the last (terminating) observation of the environment.
            self._start_on_next_step = True
            self._num_total_steps += 1
            self._num_episodes += 1
            # The policy is not run on the terminal step, so we just carry over the
            # reward, action, and policy_info from the previous step.
            return EnvStep(self._step_type,
                           tf.cast(self._cur_step_num, dtype=tf.int64),
                           self._time_step.observation, self._action,
                           self._time_step.reward, self._time_step.discount,
                           self._policy_info, {}, {})

        self._action, self._policy_state, self._policy_info = self._policy.action(
            self._time_step, self._policy_state)

        # Cast the log-probs to tf.float32 to work around a dtype inconsistency in TF-Agents.
        if hasattr(self._policy_info, 'log_probability'):
            self._policy_info = policy_step.set_log_probability(
                self._policy_info,
                tf.cast(self._policy_info.log_probability, tf.float32))

        # Sample action from policy.
        env_action = self._action
        if self._env.batch_size is not None:
            env_action = nest_utils.batch_nested_tensors(env_action)

        # Sample next step from environment.
        self._next_time_step = self._env.step(env_action)
        if self._env.batch_size is not None:
            self._next_time_step = nest_utils.unbatch_nested_tensors(
                self._next_time_step)
        self._next_step_type = self._next_time_step.step_type
        self._cur_step_num += 1
        if (self._episode_step_limit
                and self._cur_step_num >= self._episode_step_limit):
            self._next_step_type = tf.convert_to_tensor(  # Overwrite step type.
                value=StepType.LAST,
                dtype=self._first_step_type.dtype)
            self._next_step_type = tf.reshape(self._next_step_type,
                                              tf.shape(self._first_step_type))

        step = EnvStep(
            self._step_type,
            tf.cast(self._cur_step_num - 1, tf.int64),
            self._time_step.observation,
            self._action,
            # Immediate reward given by next time step.
            self._next_time_step.reward,
            self._time_step.discount,
            self._policy_info,
            {},
            {})

        self._num_steps += 1
        self._num_total_steps += 1
        if StepType.is_first(self._step_type):
            self._num_total_episodes += 1

        self._time_step = self._next_time_step
        self._step_type = self._next_step_type

        return step
# Example 14
def main(_):
    # setting up
    start_time = time.time()
    tf.compat.v1.enable_resource_variables()
    tf.compat.v1.disable_eager_execution()
    logging.set_verbosity(logging.INFO)
    global observation_omit_size, goal_coord, sample_count, iter_count, episode_size_buffer, episode_return_buffer

    root_dir = os.path.abspath(os.path.expanduser(FLAGS.logdir))
    if not tf.io.gfile.exists(root_dir):
        tf.io.gfile.makedirs(root_dir)
    log_dir = os.path.join(root_dir, FLAGS.environment)

    if not tf.io.gfile.exists(log_dir):
        tf.io.gfile.makedirs(log_dir)
    save_dir = os.path.join(log_dir, "models")
    if not tf.io.gfile.exists(save_dir):
        tf.io.gfile.makedirs(save_dir)

    print("directory for recording experiment data:", log_dir)

    # If training was paused and is being resumed, restore the saved counters and buffers.
    try:
        sample_count = np.load(os.path.join(log_dir,
                                            "sample_count.npy")).tolist()
        iter_count = np.load(os.path.join(log_dir, "iter_count.npy")).tolist()
        episode_size_buffer = np.load(
            os.path.join(log_dir, "episode_size_buffer.npy")).tolist()
        episode_return_buffer = np.load(
            os.path.join(log_dir, "episode_return_buffer.npy")).tolist()
    except Exception:  # Nothing saved yet; start counters and buffers fresh.
        sample_count = 0
        iter_count = 0
        episode_size_buffer = []
        episode_return_buffer = []

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        os.path.join(log_dir, "train", "in_graph_data"),
        flush_millis=10 * 1000)
    train_summary_writer.set_as_default()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(True):
        # environment related stuff
        env = do.get_environment(env_name=FLAGS.environment)
        py_env = wrap_env(
            skill_wrapper.SkillWrapper(
                env,
                num_latent_skills=FLAGS.num_skills,
                skill_type=FLAGS.skill_type,
                preset_skill=None,
                min_steps_before_resample=FLAGS.min_steps_before_resample,
                resample_prob=FLAGS.resample_prob,
            ),
            max_episode_steps=FLAGS.max_env_steps,
        )

        # all specifications required for all networks and agents
        py_action_spec = py_env.action_spec()
        tf_action_spec = tensor_spec.from_spec(
            py_action_spec)  # policy, critic action spec
        env_obs_spec = py_env.observation_spec()
        py_env_time_step_spec = ts.time_step_spec(
            env_obs_spec)  # replay buffer time_step spec
        if observation_omit_size > 0:
            agent_obs_spec = array_spec.BoundedArraySpec(
                (env_obs_spec.shape[0] - observation_omit_size, ),
                env_obs_spec.dtype,
                minimum=env_obs_spec.minimum,
                maximum=env_obs_spec.maximum,
                name=env_obs_spec.name,
            )  # policy, critic observation spec
        else:
            agent_obs_spec = env_obs_spec
        py_agent_time_step_spec = ts.time_step_spec(
            agent_obs_spec)  # policy, critic time_step spec
        tf_agent_time_step_spec = tensor_spec.from_spec(
            py_agent_time_step_spec)

        if not FLAGS.reduced_observation:
            skill_dynamics_observation_size = (
                py_env_time_step_spec.observation.shape[0] - FLAGS.num_skills)
        else:
            skill_dynamics_observation_size = FLAGS.reduced_observation

        # TODO(architsh): Shift coordinate hiding to actor_net and critic_net (good for further image-based processing as well)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tf_agent_time_step_spec.observation,
            tf_action_spec,
            fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
            continuous_projection_net=do._normal_projection_net,
        )

        critic_net = critic_network.CriticNetwork(
            (tf_agent_time_step_spec.observation, tf_action_spec),
            observation_fc_layer_params=None,
            action_fc_layer_params=None,
            joint_fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
        )

        if (FLAGS.skill_dynamics_relabel_type is not None
                and "importance_sampling" in FLAGS.skill_dynamics_relabel_type
                and FLAGS.is_clip_eps > 1.0):
            reweigh_batches_flag = True
        else:
            reweigh_batches_flag = False

        agent = dads_agent.DADSAgent(
            # DADS parameters
            save_dir,
            skill_dynamics_observation_size,
            observation_modify_fn=do.process_observation,
            restrict_input_size=observation_omit_size,
            latent_size=FLAGS.num_skills,
            latent_prior=FLAGS.skill_type,
            prior_samples=FLAGS.random_skills,
            fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
            normalize_observations=FLAGS.normalize_data,
            network_type=FLAGS.graph_type,
            num_mixture_components=FLAGS.num_components,
            fix_variance=FLAGS.fix_variance,
            reweigh_batches=reweigh_batches_flag,
            skill_dynamics_learning_rate=FLAGS.skill_dynamics_lr,
            # SAC parameters
            time_step_spec=tf_agent_time_step_spec,
            action_spec=tf_action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            target_update_tau=0.005,
            target_update_period=1,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gamma=FLAGS.agent_gamma,
            reward_scale_factor=1.0 / (FLAGS.agent_entropy + 1e-12),
            gradient_clipping=None,
            debug_summaries=FLAGS.debug,
            train_step_counter=global_step,
        )

        # evaluation policy
        eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

        # collection policy
        if FLAGS.collect_policy == "default":
            collect_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)
        elif FLAGS.collect_policy == "ou_noise":
            collect_policy = py_tf_policy.PyTFPolicy(
                ou_noise_policy.OUNoisePolicy(agent.collect_policy,
                                              ou_stddev=0.2,
                                              ou_damping=0.15))

        # relabelling policy deals with batches of data, unlike collect and eval
        relabel_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)

        # constructing a replay buffer, need a python spec
        policy_step_spec = policy_step.PolicyStep(action=py_action_spec,
                                                  state=(),
                                                  info=())

        if (FLAGS.skill_dynamics_relabel_type is not None
                and "importance_sampling" in FLAGS.skill_dynamics_relabel_type
                and FLAGS.is_clip_eps > 1.0):
            policy_step_spec = policy_step_spec._replace(
                info=policy_step.set_log_probability(
                    policy_step_spec.info,
                    array_spec.ArraySpec(
                        shape=(), dtype=np.float32, name="action_log_prob"),
                ))

        trajectory_spec = from_transition(py_env_time_step_spec,
                                          policy_step_spec,
                                          py_env_time_step_spec)
        capacity = FLAGS.replay_buffer_capacity
        # for all the data collected
        rbuffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
            capacity=capacity, data_spec=trajectory_spec)

        if FLAGS.train_skill_dynamics_on_policy:
            # for on-policy data (if something special is required)
            on_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
                capacity=FLAGS.initial_collect_steps + FLAGS.collect_steps +
                10,
                data_spec=trajectory_spec,
            )

        # insert experience manually with relabelled rewards and skills
        agent.build_agent_graph()
        agent.build_skill_dynamics_graph()
        agent.create_savers()

        # Saving this way requires the saver to live outside the object.
        train_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "agent"),
            agent=agent,
            global_step=global_step,
        )
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "policy"),
            policy=agent.policy,
            global_step=global_step,
        )
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "replay_buffer"),
            max_to_keep=1,
            replay_buffer=rbuffer,
        )

        setup_time = time.time() - start_time
        print("Setup time:", setup_time)

        with tf.compat.v1.Session().as_default() as sess:
            eval_policy.session = sess
            eval_policy.initialize(None)
            eval_policy.restore(os.path.join(FLAGS.logdir, "models", "policy"))

            plotdir = os.path.join(FLAGS.logdir, "plots")
            if not os.path.exists(plotdir):
                os.mkdir(plotdir)
            do.FLAGS = FLAGS
            do.eval_loop(eval_dir=plotdir,
                         eval_policy=eval_policy,
                         plot_name="plot")
def _get_action_step(action, log_prob):
    step = policy_step.PolicyStep(action=tf.convert_to_tensor(action))
    return step._replace(info=policy_step.set_log_probability(
        step.info, tf.convert_to_tensor(log_prob)))
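# A minimal sketch exercising the helper above; log(0.5) ≈ -0.693 for two
# equally likely actions.
import tensorflow as tf
from tf_agents.trajectories import policy_step

step = _get_action_step(action=[0, 1], log_prob=[-0.693, -0.693])
recovered_log_prob = policy_step.get_log_probability(step.info)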
# Example 16
  def _action(self, time_step, policy_state, seed):
    seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
    greedy_action = self._greedy_policy.action(time_step, policy_state)
    random_action = self._random_policy.action(time_step, (), seed_stream())

    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    rng = tf.random.uniform(
        outer_shape, maxval=1.0, seed=seed_stream(), name='epsilon_rng')
    cond = tf.greater_equal(rng, self._get_epsilon())

    # Selects the action/info from the random policy with probability epsilon.
    # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
    # either a scalar or a vector. Use tf.compat.v2 so that it can support any
    # condition whose leading dimensions are the same as the other operands of
    # tf.where.
    outer_ndims = int(outer_shape.shape[0])
    if outer_ndims >= 2:
      raise ValueError(
          'Only supports batched time steps with a single batch dimension')
    action = tf.nest.map_structure(lambda g, r: tf.compat.v1.where(cond, g, r),
                                   greedy_action.action, random_action.action)

    if greedy_action.info:
      if not random_action.info:
        raise ValueError('Incompatible info field')
      # Note that the objects in PolicyInfo may have different shapes, so we
      # need to call nest_utils.where() on each type of object.
      info = tf.nest.map_structure(lambda x, y: nest_utils.where(cond, x, y),
                                   greedy_action.info, random_action.info)
      if self._emit_log_probability:
        # At this point, info.log_probability contains the log prob of the
        # action chosen, conditioned on the policy that was chosen. We want to
        # emit the full log probability of the action, so we'll add in the log
        # probability of choosing the policy.
        random_log_prob = tf.nest.map_structure(
            lambda t: tf.math.log(tf.zeros_like(t) + self._get_epsilon()),
            info.log_probability)
        greedy_log_prob = tf.nest.map_structure(
            lambda t: tf.math.log(tf.ones_like(t) - self._get_epsilon()),
            random_log_prob)
        log_prob_of_chosen_policy = nest_utils.where(cond, greedy_log_prob,
                                                     random_log_prob)
        log_prob = tf.nest.map_structure(lambda a, b: a + b,
                                         log_prob_of_chosen_policy,
                                         info.log_probability)
        info = policy_step.set_log_probability(info, log_prob)
      # Overwrite bandit policy info type.
      if policy_utilities.has_bandit_policy_type(info, check_for_tensor=True):
        # Generate a mask of the same shape as bandit_policy_type (batch_size, 1).
        # It is the logical negation of `cond`, the 1-D bool tensor (batch_size,)
        # that is true where the greedy policy was used.
        random_policy_mask = tf.reshape(tf.logical_not(cond),
                                        tf.shape(info.bandit_policy_type))  # pytype: disable=attribute-error
        bandit_policy_type = policy_utilities.bandit_policy_uniform_mask(
            info.bandit_policy_type, mask=random_policy_mask)  # pytype: disable=attribute-error
        info = policy_utilities.set_bandit_policy_type(
            info, bandit_policy_type)
    else:
      if random_action.info:
        raise ValueError('Incompatible info field')
      info = ()

    # The state of the epsilon greedy policy is the state of the underlying
    # greedy policy (the random policy carries no state).
    # It is commonly assumed that the new policy state depends only on the
    # previous state and the "time_step"; the action (be it the greedy one
    # or the random one) does not influence the new policy state.
    state = greedy_action.state

    return policy_step.PolicyStep(action, state, info)
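# The log-probability bookkeeping above with concrete numbers, as a minimal
# sketch: with epsilon = 0.1, a greedy action contributes log(0.9) plus its own
# log-prob, and a random action contributes log(0.1) plus the uniform log-prob
# (here over 4 actions).
import numpy as np

epsilon = 0.1
greedy_log_prob = 0.0              # deterministic greedy action
random_log_prob = np.log(1.0 / 4)  # uniform over 4 actions
log_prob_if_greedy = np.log(1.0 - epsilon) + greedy_log_prob   # ≈ -0.105
log_prob_if_random = np.log(epsilon) + random_log_prob         # ≈ -3.689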
# Example 17
    def load(self):
        # setting up
        tf.compat.v1.enable_resource_variables()
        tf.compat.v1.disable_eager_execution()

        root_dir = os.path.abspath(os.path.expanduser(self.flags.logdir))
        if not tf.io.gfile.exists(root_dir):
            tf.io.gfile.makedirs(root_dir)
        log_dir = os.path.join(root_dir, self.flags.environment)

        if not tf.io.gfile.exists(log_dir):
            tf.io.gfile.makedirs(log_dir)
        save_dir = os.path.join(log_dir, "models")
        if not tf.io.gfile.exists(save_dir):
            tf.io.gfile.makedirs(save_dir)

        train_summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(log_dir, "train", "in_graph_data"),
            flush_millis=10 * 1000)
        train_summary_writer.set_as_default()

        global_step = tf.compat.v1.train.get_or_create_global_step()
        with tf.compat.v2.summary.record_if(True):
            # environment related stuff
            env = do.get_environment(env_name=self.flags.environment)
            py_env = wrap_env(
                skill_wrapper.SkillWrapper(
                    env,
                    num_latent_skills=self.flags.num_skills,
                    skill_type=self.flags.skill_type,
                    preset_skill=None,
                    min_steps_before_resample=self.flags.
                    min_steps_before_resample,
                    resample_prob=self.flags.resample_prob,
                ),
                max_episode_steps=self.flags.max_env_steps,
            )

            # all specifications required for all networks and agents
            py_action_spec = py_env.action_spec()
            tf_action_spec = tensor_spec.from_spec(
                py_action_spec)  # policy, critic action spec
            env_obs_spec = py_env.observation_spec()
            py_env_time_step_spec = ts.time_step_spec(
                env_obs_spec)  # replay buffer time_step spec
            if self.flags.observation_omission_size > 0:
                agent_obs_spec = array_spec.BoundedArraySpec(
                    (env_obs_spec.shape[0] -
                     self.flags.observation_omission_size),
                    env_obs_spec.dtype,
                    minimum=env_obs_spec.minimum,
                    maximum=env_obs_spec.maximum,
                    name=env_obs_spec.name,
                )  # policy, critic observation spec
            else:
                agent_obs_spec = env_obs_spec
            py_agent_time_step_spec = ts.time_step_spec(
                agent_obs_spec)  # policy, critic time_step spec
            tf_agent_time_step_spec = tensor_spec.from_spec(
                py_agent_time_step_spec)

            if not self.flags.reduced_observation:
                skill_dynamics_observation_size = (
                    py_env_time_step_spec.observation.shape[0] -
                    self.flags.num_skills)
            else:
                skill_dynamics_observation_size = self.flags.reduced_observation

            # TODO(architsh): Shift coordinate hiding to actor_net and critic_net (good for further image-based processing as well)
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_agent_time_step_spec.observation,
                tf_action_spec,
                fc_layer_params=(self.flags.hidden_layer_size, ) * 2,
                continuous_projection_net=do._normal_projection_net,
            )

            critic_net = critic_network.CriticNetwork(
                (tf_agent_time_step_spec.observation, tf_action_spec),
                observation_fc_layer_params=None,
                action_fc_layer_params=None,
                joint_fc_layer_params=(self.flags.hidden_layer_size, ) * 2,
            )

            if (self.flags.skill_dynamics_relabel_type is not None
                    and "importance_sampling"
                    in self.flags.skill_dynamics_relabel_type
                    and self.flags.is_clip_eps > 1.0):
                reweigh_batches_flag = True
            else:
                reweigh_batches_flag = False

            agent = dads_agent.DADSAgent(
                # DADS parameters
                save_dir,
                skill_dynamics_observation_size,
                observation_modify_fn=self.process_observation,
                restrict_input_size=self.flags.observation_omission_size,
                latent_size=self.flags.num_skills,
                latent_prior=self.flags.skill_type,
                prior_samples=self.flags.random_skills,
                fc_layer_params=(self.flags.hidden_layer_size, ) * 2,
                normalize_observations=self.flags.normalize_data,
                network_type=self.flags.graph_type,
                num_mixture_components=self.flags.num_components,
                fix_variance=self.flags.fix_variance,
                reweigh_batches=reweigh_batches_flag,
                skill_dynamics_learning_rate=self.flags.skill_dynamics_lr,
                # SAC parameters
                time_step_spec=tf_agent_time_step_spec,
                action_spec=tf_action_spec,
                actor_network=actor_net,
                critic_network=critic_net,
                target_update_tau=0.005,
                target_update_period=1,
                actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.flags.agent_lr),
                critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.flags.agent_lr),
                alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.flags.agent_lr),
                td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
                gamma=self.flags.agent_gamma,
                reward_scale_factor=1.0 / (self.flags.agent_entropy + 1e-12),
                gradient_clipping=None,
                debug_summaries=self.flags.debug,
                train_step_counter=global_step,
            )

            # evaluation policy
            eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

            # constructing a replay buffer, need a python spec
            policy_step_spec = policy_step.PolicyStep(action=py_action_spec,
                                                      state=(),
                                                      info=())

            if (self.flags.skill_dynamics_relabel_type is not None
                    and "importance_sampling"
                    in self.flags.skill_dynamics_relabel_type
                    and self.flags.is_clip_eps > 1.0):
                policy_step_spec = policy_step_spec._replace(
                    info=policy_step.set_log_probability(
                        policy_step_spec.info,
                        array_spec.ArraySpec(
                            shape=(
                            ), dtype=np.float32, name="action_log_prob"),
                    ))

            # insert experience manually with relabelled rewards and skills
            agent.build_agent_graph()
            agent.build_skill_dynamics_graph()

            with tf.compat.v1.Session().as_default() as sess:
                eval_policy.session = sess
                eval_policy.initialize(None)
                eval_policy.restore(
                    os.path.join(self.flags.logdir, "models", "policy"))
                self.policy = eval_policy