Example #1
    def __init__(self,
                 action_spec,
                 actor_network: Network,
                 critic_network: Network,
                 critic_loss=None,
                 target_entropy=None,
                 initial_log_alpha=0.0,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 gradient_clipping=None,
                 train_step_counter=None,
                 debug_summaries=False,
                 name="SacAlgorithm"):
        """Create a SacAlgorithm

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network (Network): The network will be called with
                call(observation, step_type).
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            critic_loss (None|OneStepTDLoss): an object for calculating critic loss.
                If None, a default OneStepTDLoss will be used.
            target_entropy (float|None): The target average policy entropy, for updating alpha.
            initial_log_alpha (float): initial value for variable log_alpha.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
            gradient_clipping (float): Norm length to clip gradients.
            train_step_counter (tf.Variable): An optional counter to increment
                every time a new iteration is started. If None, it will use
                tf.summary.experimental.get_step(). If this is still None, a
                counter will be created.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        critic_network1 = critic_network
        critic_network2 = critic_network.copy(name='CriticNetwork2')
        log_alpha = tfa_common.create_variable(name='log_alpha',
                                               initial_value=initial_log_alpha,
                                               dtype=tf.float32,
                                               trainable=True)
        super().__init__(
            action_spec,
            train_state_spec=SacState(
                share=SacShareState(actor=actor_network.state_spec),
                actor=SacActorState(critic1=critic_network.state_spec,
                                    critic2=critic_network.state_spec),
                critic=SacCriticState(
                    critic1=critic_network.state_spec,
                    critic2=critic_network.state_spec,
                    target_critic1=critic_network.state_spec,
                    target_critic2=critic_network.state_spec)),
            action_distribution_spec=actor_network.output_spec,
            predict_state_spec=actor_network.state_spec,
            optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
            get_trainable_variables_func=[
                lambda: actor_network.trainable_variables,
                lambda: (critic_network1.trainable_variables +
                         critic_network2.trainable_variables),
                lambda: [log_alpha]
            ],
            gradient_clipping=gradient_clipping,
            train_step_counter=train_step_counter,
            debug_summaries=debug_summaries,
            name=name)

        self._log_alpha = log_alpha
        self._actor_network = actor_network
        self._critic_network1 = critic_network1
        self._critic_network2 = critic_network2
        self._target_critic_network1 = self._critic_network1.copy(
            name='TargetCriticNetwork1')
        self._target_critic_network2 = self._critic_network2.copy(
            name='TargetCriticNetwork2')
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer

        if critic_loss is None:
            critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
        self._critic_loss = critic_loss

        flat_action_spec = tf.nest.flatten(self._action_spec)
        self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
        if target_entropy is None:
            target_entropy = np.sum(
                list(
                    map(dist_utils.calc_default_target_entropy,
                        flat_action_spec)))
        self._target_entropy = target_entropy

        self._dqda_clipping = dqda_clipping

        self._update_target = common.get_target_updater(
            models=[self._critic_network1, self._critic_network2],
            target_models=[
                self._target_critic_network1, self._target_critic_network2
            ],
            tau=target_update_tau,
            period=target_update_period)

        tfa_common.soft_variables_update(
            self._critic_network1.variables,
            self._target_critic_network1.variables,
            tau=1.0)

        tfa_common.soft_variables_update(
            self._critic_network2.variables,
            self._target_critic_network2.variables,
            tau=1.0)
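
A minimal instantiation sketch (not part of the source): the network factories and spec variables below are hypothetical placeholders for whatever the surrounding framework provides; only the SacAlgorithm keyword arguments follow the signature documented above.

import tensorflow as tf

# observation_spec / action_spec are assumed to come from the environment.
# Hypothetical helpers standing in for concrete actor/critic Network subclasses.
actor_net = make_actor_network(observation_spec, action_spec)
critic_net = make_critic_network(observation_spec, action_spec)

algorithm = SacAlgorithm(
    action_spec=action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    target_update_tau=0.005,
    target_update_period=1,
    actor_optimizer=tf.optimizers.Adam(learning_rate=3e-4),
    critic_optimizer=tf.optimizers.Adam(learning_rate=3e-4),
    alpha_optimizer=tf.optimizers.Adam(learning_rate=3e-4))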
Example #2
    def __init__(self,
                 action_spec,
                 actor_network: Network,
                 critic_network: Network,
                 ou_stddev=0.2,
                 ou_damping=0.15,
                 critic_loss=None,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 gradient_clipping=None,
                 train_step_counter=None,
                 debug_summaries=False,
                 name="DdpgAlgorithm"):
        """
        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network (Network):  The network will be called with
                call(observation, step_type).
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            ou_stddev (float): Standard deviation for the Ornstein-Uhlenbeck
                (OU) noise added in the default collect policy.
            ou_damping (float): Damping factor for the OU noise added in the
                default collect policy.
            critic_loss (None|OneStepTDLoss): an object for calculating critic
                loss. If None, a default OneStepTDLoss will be used.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            gradient_clipping (float): Norm length to clip gradients.
            train_step_counter (tf.Variable): An optional counter to increment
                every time a new iteration is started. If None, it will use
                tf.summary.experimental.get_step(). If this is still None, a
                counter will be created.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        train_state_spec = DdpgState(
            actor=DdpgActorState(actor=actor_network.state_spec,
                                 critic=critic_network.state_spec),
            critic=DdpgCriticState(critic=critic_network.state_spec,
                                   target_actor=actor_network.state_spec,
                                   target_critic=critic_network.state_spec))

        super().__init__(action_spec,
                         train_state_spec=train_state_spec,
                         action_distribution_spec=action_spec,
                         optimizer=[actor_optimizer, critic_optimizer],
                         get_trainable_variables_func=[
                             lambda: actor_network.trainable_variables,
                             lambda: critic_network.trainable_variables
                         ],
                         gradient_clipping=gradient_clipping,
                         train_step_counter=train_step_counter,
                         debug_summaries=debug_summaries,
                         name=name)

        self._actor_network = actor_network
        self._critic_network = critic_network
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        self._target_actor_network = actor_network.copy(
            name='target_actor_network')
        self._target_critic_network = critic_network.copy(
            name='target_critic_network')

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping

        if critic_loss is None:
            critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
        self._critic_loss = critic_loss

        self._ou_process = self._create_ou_process(ou_stddev, ou_damping)

        self._update_target = common.get_target_updater(
            models=[self._actor_network, self._critic_network],
            target_models=[
                self._target_actor_network, self._target_critic_network
            ],
            tau=target_update_tau,
            period=target_update_period)

        self._dqda_clipping = dqda_clipping

        tfa_common.soft_variables_update(self._critic_network.variables,
                                         self._target_critic_network.variables,
                                         tau=1.0)
        tfa_common.soft_variables_update(self._actor_network.variables,
                                         self._target_actor_network.variables,
                                         tau=1.0)
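
A minimal instantiation sketch (not part of the source), analogous to the one after Example #1: the network factories and spec variables are hypothetical placeholders, and only the DdpgAlgorithm keyword arguments follow the documented signature.

import tensorflow as tf

# observation_spec / action_spec are assumed to come from the environment.
actor_net = make_actor_network(observation_spec, action_spec)    # hypothetical helper
critic_net = make_critic_network(observation_spec, action_spec)  # hypothetical helper

algorithm = DdpgAlgorithm(
    action_spec=action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    ou_stddev=0.2,
    ou_damping=0.15,
    target_update_tau=0.05,
    target_update_period=5,
    actor_optimizer=tf.optimizers.Adam(learning_rate=1e-4),
    critic_optimizer=tf.optimizers.Adam(learning_rate=1e-3))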
Example #3
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 actor_network: network.Network,
                 critic_network: network.Network,
                 actor_optimizer: Optional[types.Optimizer] = None,
                 critic_optimizer: Optional[types.Optimizer] = None,
                 ou_stddev: types.Float = 1.0,
                 ou_damping: types.Float = 1.0,
                 target_actor_network: Optional[network.Network] = None,
                 target_critic_network: Optional[network.Network] = None,
                 target_update_tau: types.Float = 1.0,
                 target_update_period: types.Int = 1,
                 dqda_clipping: Optional[types.Float] = None,
                 td_errors_loss_fn: Optional[types.LossFn] = None,
                 gamma: types.Float = 1.0,
                 reward_scale_factor: types.Float = 1.0,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):
        """Creates a DDPG Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type[, policy_state])
        and should return (action, new_state).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call((observation, action), step_type[,
        policy_state]) and should return (q_value, new_state).
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The optimizer to use for the critic network.
      ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
        in the default collect policy.
      ou_damping: Damping factor for the OU noise added in the default collect
        policy.
      target_actor_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the actor target network during Q learning.  Every
        `target_update_period` train steps, the weights from `actor_network` are
        copied (possibly with smoothing via `target_update_tau`) to
        `target_actor_network`.

        If `target_actor_network` is not provided, it is created by making a
        copy of `actor_network`, which initializes a new network with the same
        structure and its own layers and weights.

        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or
        when the network is sharing layers with another).  In these cases, it is
        up to you to build a copy having weights that are not
        shared with the original `actor_network`, so that this can be used as a
        target network.  If you provide a `target_actor_network` that shares any
        weights with `actor_network`, a warning will be logged but no exception
        is thrown.
      target_critic_network: (Optional.) A network similar to
        `target_actor_network`, but for the critic. See the documentation for
        `target_actor_network`.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      dqda_clipping: when computing the actor loss, clips the gradient dqda
        element-wise between [-dqda_clipping, dqda_clipping]. Does not perform
        clipping if dqda_clipping == 0.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._actor_network = actor_network
        actor_network.create_variables()
        if target_actor_network:
            target_actor_network.create_variables()
        self._target_actor_network = common.maybe_copy_target_network_with_checks(
            self._actor_network, target_actor_network, 'TargetActorNetwork')
        self._critic_network = critic_network
        critic_network.create_variables()
        if target_critic_network:
            target_critic_network.create_variables()
        self._target_critic_network = common.maybe_copy_target_network_with_checks(
            self._critic_network, target_critic_network, 'TargetCriticNetwork')

        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._dqda_clipping = dqda_clipping
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._gradient_clipping = gradient_clipping

        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)

        policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                          action_spec=action_spec,
                                          actor_network=self._actor_network,
                                          clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(DdpgAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=2
                             if not self._actor_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
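
A minimal usage sketch (not part of the source), assuming the stock DDPG actor/critic networks from tf_agents.agents.ddpg and a TFEnvironment named tf_env that provides the specs; layer sizes and learning rates are illustrative only.

import tensorflow as tf
from tf_agents.agents.ddpg import actor_network as actor_net_lib
from tf_agents.agents.ddpg import critic_network as critic_net_lib

# tf_env is assumed to be an existing tf_agents TFEnvironment.
actor_net = actor_net_lib.ActorNetwork(
    tf_env.observation_spec(), tf_env.action_spec(),
    fc_layer_params=(400, 300))
critic_net = critic_net_lib.CriticNetwork(
    (tf_env.observation_spec(), tf_env.action_spec()),
    joint_fc_layer_params=(400, 300))

agent = DdpgAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    critic_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    ou_stddev=0.2,
    ou_damping=0.15,
    target_update_tau=0.05,
    target_update_period=5)
agent.initialize()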
Example #4
  def __init__(self,
               time_step_spec: ts.TimeStep,
               action_spec: types.NestedTensorSpec,
               actor_network: network.Network,
               critic_network: network.Network,
               actor_optimizer: types.Optimizer,
               critic_optimizer: types.Optimizer,
               exploration_noise_std: types.Float = 0.1,
               critic_network_2: Optional[network.Network] = None,
               target_actor_network: Optional[network.Network] = None,
               target_critic_network: Optional[network.Network] = None,
               target_critic_network_2: Optional[network.Network] = None,
               target_update_tau: types.Float = 1.0,
               target_update_period: types.Int = 1,
               actor_update_period: types.Int = 1,
               td_errors_loss_fn: Optional[types.LossFn] = None,
               gamma: types.Float = 1.0,
               reward_scale_factor: types.Float = 1.0,
               target_policy_noise: types.Float = 0.2,
               target_policy_noise_clip: types.Float = 0.5,
               gradient_clipping: Optional[types.Float] = None,
               debug_summaries: bool = False,
               summarize_grads_and_vars: bool = False,
               train_step_counter: Optional[tf.Variable] = None,
               name: Optional[Text] = None):
    """Creates a Td3Agent Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, action, step_type).
      actor_optimizer: The default optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      exploration_noise_std: Scale factor on exploration policy noise.
      critic_network_2: (Optional.)  A `tf_agents.network.Network` to be used as
        the second critic network during Q learning.  The weights from
        `critic_network` are copied if this is not provided.
      target_actor_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the target actor network during Q learning. Every
        `target_update_period` train steps, the weights from `actor_network` are
        copied (possibly with smoothing via `target_update_tau`) to
        `target_actor_network`.  If `target_actor_network` is not provided, it is
        created by making a copy of `actor_network`, which initializes a new
        network with the same structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or when
        the network is sharing layers with another).  In these cases, it is up
        to you to build a copy having weights that are not shared with the
        original `actor_network`, so that this can be used as a target network.
        If you provide a `target_actor_network` that shares any weights with
        `actor_network`, a warning will be logged but no exception is thrown.
      target_critic_network: (Optional.) A network similar to
        `target_actor_network`, but for the critic. See the documentation for
        `target_actor_network`.
      target_critic_network_2: (Optional.) A network similar to
        `target_actor_network`, but for `critic_network_2`. See the documentation
        for `target_actor_network`. Will only be used if `critic_network_2` is
        also specified.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      actor_update_period: Period for the optimization step on actor network.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      target_policy_noise: Scale factor on target action noise.
      target_policy_noise_clip: Value to clip noise.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)
    self._actor_network = actor_network
    actor_network.create_variables()
    if target_actor_network:
      target_actor_network.create_variables()
    self._target_actor_network = common.maybe_copy_target_network_with_checks(
        self._actor_network, target_actor_network, 'TargetActorNetwork')

    self._critic_network_1 = critic_network
    critic_network.create_variables()
    if target_critic_network:
      target_critic_network.create_variables()
    self._target_critic_network_1 = (
        common.maybe_copy_target_network_with_checks(self._critic_network_1,
                                                     target_critic_network,
                                                     'TargetCriticNetwork1'))

    if critic_network_2 is not None:
      self._critic_network_2 = critic_network_2
    else:
      self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
      # Do not use target_critic_network_2 if critic_network_2 is None.
      target_critic_network_2 = None
    self._critic_network_2.create_variables()
    if target_critic_network_2:
      target_critic_network_2.create_variables()
    self._target_critic_network_2 = (
        common.maybe_copy_target_network_with_checks(self._critic_network_2,
                                                     target_critic_network_2,
                                                     'TargetCriticNetwork2'))

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer

    self._exploration_noise_std = exploration_noise_std
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._actor_update_period = actor_update_period
    self._td_errors_loss_fn = (
        td_errors_loss_fn or common.element_wise_huber_loss)
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_policy_noise = target_policy_noise
    self._target_policy_noise_clip = target_policy_noise_clip
    self._gradient_clipping = gradient_clipping

    self._update_target = self._get_target_updater(
        target_update_tau, target_update_period)

    policy = actor_policy.ActorPolicy(
        time_step_spec=time_step_spec, action_spec=action_spec,
        actor_network=self._actor_network, clip=True)
    collect_policy = actor_policy.ActorPolicy(
        time_step_spec=time_step_spec, action_spec=action_spec,
        actor_network=self._actor_network, clip=False)
    collect_policy = gaussian_policy.GaussianPolicy(
        collect_policy,
        scale=self._exploration_noise_std,
        clip=True)

    train_sequence_length = 2 if not self._actor_network.state_spec else None
    super(Td3Agent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False
    )

    self._as_transition = data_converter.AsTransition(
        self.data_context, squeeze_time_dim=(train_sequence_length == 2))
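
A minimal usage sketch (not part of the source), reusing the DDPG-style networks from tf_agents.agents.ddpg; tf_env is an assumed TFEnvironment and the hyperparameters are illustrative.

import tensorflow as tf
from tf_agents.agents.ddpg import actor_network as actor_net_lib
from tf_agents.agents.ddpg import critic_network as critic_net_lib

# tf_env is assumed to be an existing tf_agents TFEnvironment.
actor_net = actor_net_lib.ActorNetwork(
    tf_env.observation_spec(), tf_env.action_spec(),
    fc_layer_params=(400, 300))
critic_net = critic_net_lib.CriticNetwork(
    (tf_env.observation_spec(), tf_env.action_spec()),
    joint_fc_layer_params=(400, 300))

agent = Td3Agent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    critic_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    exploration_noise_std=0.1,
    target_update_tau=0.005,
    actor_update_period=2,   # delayed actor updates
    target_policy_noise=0.2,
    target_policy_noise_clip=0.5)
agent.initialize()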
Example #5
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 q_network: network.Network,
                 sampler: cem_actions_sampler.ActionsSampler,
                 init_mean: types.NestedArray,
                 init_var: types.NestedArray,
                 actor_policy: Optional[tf_policy.TFPolicy] = None,
                 minimal_var: float = 0.0001,
                 info_spec: types.NestedSpecTensorOrArray = (),
                 num_samples: int = 32,
                 num_elites: int = 4,
                 num_iterations: int = 32,
                 emit_log_probability: bool = False,
                 preprocess_state_action: bool = True,
                 training: bool = False,
                 weights: types.NestedTensorOrArray = None,
                 name: Optional[str] = None):
        """Builds a CEM-Policy given a network and a sampler.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: An instance of a `tf_agents.network.Network`, callable via
        `network(observation, step_type) -> (output, final_state)`.
      sampler: Samples the actions needed for the CEM.
      init_mean: A list or tuple or scalar, representing the initial mean for actions.
      init_var: A list or tuple or scalar, representing the initial variance for actions.
      actor_policy: Optional actor policy.
      minimal_var: Minimal variance to prevent the CEM distribution from collapsing.
      info_spec: A policy info spec.
      num_samples: Number of samples to sample each round.
      num_elites: Number of best actions each round to refit the distribution
        with.
      num_iterations: Number of iterations to run the CEM loop.
      emit_log_probability: Whether to emit log-probs in info of `PolicyStep`.
      preprocess_state_action: The shape of state is (B, ...) and the shape of
        action is (B, N, A). When preprocess_state_action is enabled, the state
        will be tile_batched to be (BxN, ...) and the action will be reshaped
        to be (BxN, A). When preprocess_state_action is not enabled, the same
        operation needs to be done inside the network. This is helpful when the
        input has large memory requirements and the replication of state could
        happen after a few layers inside the network.
      training: Whether it is in training mode or inference mode.
      weights: A nested structure of weights w/ the same structure as action.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `q_network.action_spec` exists and is not compatible with
        `action_spec`.
    """
        network_action_spec = getattr(q_network, 'action_spec', None)

        if network_action_spec is not None:
            if not action_spec.is_compatible_with(network_action_spec):
                raise ValueError(
                    'action_spec must be compatible with q_network.action_spec; '
                    'instead got action_spec=%s, q_network.action_spec=%s' %
                    (action_spec, network_action_spec))

        if q_network:
            network_utils.check_single_floating_network_output(
                q_network.create_variables(),
                expected_output_shape=(),
                label=str(q_network))
            policy_state_spec = q_network.state_spec
        else:
            policy_state_spec = ()

        self._actor_policy = actor_policy
        self._q_network = q_network
        self._init_mean = init_mean
        self._init_var = init_var
        self._minimal_var = minimal_var
        self._num_samples = num_samples  # N
        self._num_elites = num_elites  # M
        self._num_iterations = num_iterations
        self._actions_sampler = sampler
        self._observation_spec = time_step_spec.observation
        self._training = training
        self._preprocess_state_action = preprocess_state_action
        self._weights = weights

        super(CEMPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             info_spec=info_spec,
                             policy_state_spec=policy_state_spec,
                             clip=False,
                             emit_log_probability=emit_log_probability,
                             name=name)
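
A minimal usage sketch (not part of the source): the q-network and sampler constructors below are hypothetical placeholders, and only the CEMPolicy arguments follow the signature documented above.

# time_step_spec / observation_spec / action_spec are assumed to come from the environment.
# Hypothetical helpers; substitute a real q_network and a cem_actions_sampler.ActionsSampler.
q_net = make_q_network(observation_spec, action_spec)
sampler = make_actions_sampler(action_spec)

policy = CEMPolicy(
    time_step_spec,
    action_spec,
    q_network=q_net,
    sampler=sampler,
    init_mean=0.0,
    init_var=1.0,
    num_samples=64,
    num_elites=6,
    num_iterations=3,
    training=False)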
Example #6
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 critic_network: network.Network,
                 actor_network: network.Network,
                 actor_optimizer: types.Optimizer,
                 critic_optimizer: types.Optimizer,
                 alpha_optimizer: types.Optimizer,
                 actor_loss_weight: types.Float = 1.0,
                 critic_loss_weight: types.Float = 0.5,
                 alpha_loss_weight: types.Float = 1.0,
                 actor_policy_ctor: Callable[
                     ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy,
                 critic_network_2: Optional[network.Network] = None,
                 target_critic_network: Optional[network.Network] = None,
                 target_critic_network_2: Optional[network.Network] = None,
                 target_update_tau: types.Float = 1.0,
                 target_update_period: types.Int = 1,
                 td_errors_loss_fn: types.LossFn = tf.math.squared_difference,
                 gamma: types.Float = 1.0,
                 reward_scale_factor: types.Float = 1.0,
                 initial_log_alpha: types.Float = 0.0,
                 use_log_alpha_in_alpha_loss: bool = True,
                 target_entropy: Optional[types.Float] = None,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):
        """Creates a SAC Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      critic_network: A function critic_network((observations, actions)) that
        returns the q_values for each observation and action.
      actor_network: A function actor_network(observation, action_spec) that
        returns action distribution.
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      alpha_optimizer: The default optimizer to use for the alpha variable.
      actor_loss_weight: The weight on actor loss.
      critic_loss_weight: The weight on critic loss.
      alpha_loss_weight: The weight on alpha loss.
      actor_policy_ctor: The policy class to use.
      critic_network_2: (Optional.)  A `tf_agents.network.Network` to be used as
        the second critic network during Q learning.  The weights from
        `critic_network` are copied if this is not provided.
      target_critic_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the target critic network during Q learning. Every
        `target_update_period` train steps, the weights from `critic_network`
        are copied (possibly with smoothing via `target_update_tau`) to
        `target_critic_network`.  If `target_critic_network` is not provided, it
        is created by making a copy of `critic_network`, which initializes a new
        network with the same structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or when
        the network is sharing layers with another).  In these cases, it is up
        to you to build a copy having weights that are not shared with the
        original `critic_network`, so that this can be used as a target network.
        If you provide a `target_critic_network` that shares any weights with
        `critic_network`, a warning will be logged but no exception is thrown.
      target_critic_network_2: (Optional.) A network similar to
        `target_critic_network`, but for `critic_network_2`. See the
        documentation for `target_critic_network`. Will only be used if
        `critic_network_2` is also specified.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn:  A function for computing the elementwise TD errors
        loss.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      initial_log_alpha: Initial value for log_alpha.
      use_log_alpha_in_alpha_loss: A boolean, whether using log_alpha or alpha
        in alpha loss. Certain implementations of SAC use log_alpha as log
        values are generally nicer to work with.
      target_entropy: The target average policy entropy, for updating alpha. The
        default value is the negative of the total number of actions.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)

        self._check_action_spec(action_spec)

        self._critic_network_1 = critic_network
        self._critic_network_1.create_variables(
            (time_step_spec.observation, action_spec))
        if target_critic_network:
            target_critic_network.create_variables(
                (time_step_spec.observation, action_spec))
        self._target_critic_network_1 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_1, target_critic_network,
                'TargetCriticNetwork1'))

        if critic_network_2 is not None:
            self._critic_network_2 = critic_network_2
        else:
            self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
            # Do not use target_critic_network_2 if critic_network_2 is None.
            target_critic_network_2 = None
        self._critic_network_2.create_variables(
            (time_step_spec.observation, action_spec))
        if target_critic_network_2:
            target_critic_network_2.create_variables(
                (time_step_spec.observation, action_spec))
        self._target_critic_network_2 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_2, target_critic_network_2,
                'TargetCriticNetwork2'))

        if actor_network:
            actor_network.create_variables(time_step_spec.observation)
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network,
                                   training=False)

        self._train_policy = actor_policy_ctor(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            training=True)

        self._log_alpha = common.create_variable(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        if target_entropy is None:
            target_entropy = self._get_default_target_entropy(action_spec)

        self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._actor_loss_weight = actor_loss_weight
        self._critic_loss_weight = critic_loss_weight
        self._alpha_loss_weight = alpha_loss_weight
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars
        self._update_target = self._get_target_updater(
            tau=self._target_update_tau, period=self._target_update_period)

        train_sequence_length = 2 if not critic_network.state_spec else None

        super(SacAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=train_sequence_length,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             validate_args=False)

        self._as_transition = data_converter.AsTransition(
            self.data_context, squeeze_time_dim=(train_sequence_length == 2))
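
A minimal usage sketch (not part of the source), assuming the stock TF-Agents actor-distribution and DDPG-style critic networks; tf_env is an assumed TFEnvironment and the hyperparameters are illustrative.

import tensorflow as tf
from tf_agents.agents.ddpg import critic_network as critic_net_lib
from tf_agents.networks import actor_distribution_network

# tf_env is assumed to be an existing tf_agents TFEnvironment.
critic_net = critic_net_lib.CriticNetwork(
    (tf_env.observation_spec(), tf_env.action_spec()),
    joint_fc_layer_params=(256, 256))
actor_net = actor_distribution_network.ActorDistributionNetwork(
    tf_env.observation_spec(), tf_env.action_spec(),
    fc_layer_params=(256, 256))

agent = SacAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    critic_network=critic_net,
    actor_network=actor_net,
    actor_optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    critic_optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    alpha_optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    target_update_tau=0.005,
    td_errors_loss_fn=tf.math.squared_difference,
    gamma=0.99,
    reward_scale_factor=1.0)
agent.initialize()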
Example #7
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 q_network: network.Network,
                 min_q_value: float,
                 max_q_value: float,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 temperature: types.Float = 1.0):
        """Builds a categorical Q-policy given a categorical Q-network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A `BoundedTensorSpec` representing the actions.
      q_network: A network.Network to use for our policy.
      min_q_value: A float specifying the minimum Q-value, used for setting up
        the support.
      max_q_value: A float specifying the maximum Q-value, used for setting up
        the support.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `q_network` is compatible with the network-specific
        half of the output of the `observation_and_action_constraint_splitter`.
        In particular, `observation_and_action_constraint_splitter` will be
        called on the observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      temperature: temperature for sampling; as it approaches 0.0, sampling approaches arg_max.

    Raises:
      ValueError: if `q_network` does not have property `num_atoms`.
      TypeError: if `action_spec` is not a `BoundedTensorSpec`.
    """
        network_action_spec = getattr(q_network, 'action_spec', None)

        if network_action_spec is not None:
            action_spec = cast(tf.TypeSpec, action_spec)
            if not action_spec.is_compatible_with(network_action_spec):
                raise ValueError(
                    'action_spec must be compatible with q_network.action_spec; '
                    'instead got action_spec=%s, q_network.action_spec=%s' %
                    (action_spec, network_action_spec))

        if not isinstance(action_spec, tensor_spec.BoundedTensorSpec):
            raise TypeError(
                'action_spec must be a BoundedTensorSpec. Got: %s' %
                (action_spec, ))

        action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)
        if action_spec.minimum != 0:
            raise ValueError(
                'Action specs should have minimum of 0, but saw: {0}.  If collecting '
                'from a python environment, consider using '
                'tf_agents.environments.wrappers.ActionOffsetWrapper.'.format(
                    action_spec))

        num_actions = action_spec.maximum - action_spec.minimum + 1
        try:
            num_atoms = q_network.num_atoms
        except AttributeError:
            raise ValueError(
                'Expected q_network to have property `num_atoms`, but '
                'it doesn\'t. (Note: you likely want to use a '
                'CategoricalQNetwork.) Network is: %s' % q_network)
        self._num_atoms = num_atoms

        network_utils.check_single_floating_network_output(
            q_network.create_variables(), (num_actions, num_atoms),
            str(q_network))

        super(CategoricalQPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=q_network.state_spec,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter))

        self._temperature = tf.convert_to_tensor(temperature, dtype=tf.float32)
        self._q_network = q_network

        # Generate support in numpy so that we can assign it to a constant and avoid
        # having a tensor property.
        support = np.linspace(min_q_value,
                              max_q_value,
                              self._num_atoms,
                              dtype=np.float32)
        self._support = tf.constant(support, dtype=tf.float32)
        self._action_dtype = action_spec.dtype
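
A minimal usage sketch (not part of the source), assuming the stock CategoricalQNetwork from TF-Agents and a TFEnvironment named tf_env providing the specs; the Q-value range and layer sizes are illustrative.

from tf_agents.networks import categorical_q_network

# tf_env is assumed to be an existing tf_agents TFEnvironment.
q_net = categorical_q_network.CategoricalQNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    num_atoms=51,
    fc_layer_params=(100,))

policy = CategoricalQPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    min_q_value=-10.0,
    max_q_value=10.0,
    temperature=1.0)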
Example #8
    def __init__(
            self,
            time_step_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            q_network: network.Network,
            optimizer: types.Optimizer,
            observation_and_action_constraint_splitter: Optional[
                types.Splitter] = None,
            epsilon_greedy: types.Float = 0.1,
            n_step_update: int = 1,
            boltzmann_temperature: Optional[types.Int] = None,
            emit_log_probability: bool = False,
            # Params for target network updates
            target_q_network: Optional[network.Network] = None,
            target_update_tau: types.Float = 1.0,
            target_update_period: int = 1,
            # Params for training.
            td_errors_loss_fn: Optional[types.LossFn] = None,
            gamma: types.Float = 1.0,
            reward_scale_factor: types.Float = 1.0,
            gradient_clipping: Optional[types.Float] = None,
            # Params for debugging
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None,
            entropy_tau: types.Float = 0.9,
            alpha: types.Float = 0.3):

        tf.Module.__init__(self, name=name)

        self._check_action_spec(action_spec)

        if epsilon_greedy is not None and boltzmann_temperature is not None:
            raise ValueError(
                'Configured both epsilon_greedy value {} and temperature {}, '
                'however only one of them can be used for exploration.'.format(
                    epsilon_greedy, boltzmann_temperature))

        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._q_network = q_network
        net_observation_spec = time_step_spec.observation
        if observation_and_action_constraint_splitter:
            net_observation_spec, _ = observation_and_action_constraint_splitter(
                net_observation_spec)
        q_network.create_variables(net_observation_spec)
        if target_q_network:
            target_q_network.create_variables(net_observation_spec)
        self._target_q_network = common.maybe_copy_target_network_with_checks(
            self._q_network,
            target_q_network,
            input_spec=net_observation_spec,
            name='TargetQNetwork')

        self._check_network_output(self._q_network, 'q_network')
        self._check_network_output(self._target_q_network, 'target_q_network')

        self._epsilon_greedy = epsilon_greedy
        self._n_step_update = n_step_update
        self._boltzmann_temperature = boltzmann_temperature
        self._optimizer = optimizer
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._gradient_clipping = gradient_clipping
        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)
        self.entropy_tau = entropy_tau
        self.alpha = alpha

        policy, collect_policy = self._setup_policy(time_step_spec,
                                                    action_spec,
                                                    boltzmann_temperature,
                                                    emit_log_probability)

        if q_network.state_spec and n_step_update != 1:
            raise NotImplementedError(
                'DqnAgent does not currently support n-step updates with stateful '
                'networks (i.e., RNNs), but n_step_update = {}'.format(
                    n_step_update))

        train_sequence_length = (
            n_step_update + 1 if not q_network.state_spec else None)

        super(dqn_agent.DqnAgent, self).__init__(
            time_step_spec,
            action_spec,
            policy,
            collect_policy,
            train_sequence_length=train_sequence_length,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter,
            validate_args=False,
        )

        if q_network.state_spec:
            # AsNStepTransition does not support emitting [B, T, ...] tensors,
            # which we need for DQN-RNN.
            self._as_transition = data_converter.AsTransition(
                self.data_context, squeeze_time_dim=False)
        else:
            # This reduces the n-step return and removes the extra time dimension,
            # allowing the rest of the computations to be independent of the
            # n-step parameter.
            self._as_transition = data_converter.AsNStepTransition(
                self.data_context, gamma=gamma, n=n_step_update)
Example #9
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 actor_network: network.Network,
                 value_network: network.Network,
                 observation_normalizer: Optional[
                     tensor_normalizer.TensorNormalizer] = None,
                 clip: bool = True,
                 collect: bool = True,
                 compute_value_and_advantage_in_train: bool = False):
        """Builds a PPO Policy given network Templates or functions.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: An instance of a tf_agents.networks.network.Network, with
        call(observation, step_type, network_state).  Network should
        return one of the following: 1. a nested tuple of tfp.distributions
          objects matching action_spec, or 2. a nested tuple of tf.Tensors
          representing actions.
      value_network:  An instance of a tf_agents.networks.network.Network, with
        call(observation, step_type, network_state).  Network should return
        value predictions for the input state.
      observation_normalizer: An object to use for observation normalization.
      clip: Whether to clip actions to spec before returning them.  Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      collect: If True, creates ops for actions_log_prob, value_preds, and
        action_distribution_params. (default True)
      compute_value_and_advantage_in_train: A bool to indicate where value
        prediction and advantage calculation happen.  If True, both happen in
        agent.train(), therefore no need to save the value prediction inside of
        policy info. If False, value prediction is computed during data
        collection. This argument must be set to `False` if mini batch learning
        is enabled.

    Raises:
      TypeError: if `actor_network` or `value_network` is not of type
        `tf_agents.networks.Network`.
      ValueError: if `actor_network` or `value_network` do not emit
        valid outputs.
    """
        if not isinstance(actor_network, network.Network):
            raise TypeError('actor_network is not of type network.Network')
        if not isinstance(value_network, network.Network):
            raise TypeError('value_network is not of type network.Network')

        actor_output_spec = actor_network.create_variables(
            time_step_spec.observation)
        value_output_spec = value_network.create_variables(
            time_step_spec.observation)

        nest_utils.assert_value_spec(value_output_spec, 'value_network')

        distribution_utils.assert_specs_are_compatible(
            actor_output_spec, action_spec,
            'actor_network output spec does not match action spec')

        self._compute_value_and_advantage_in_train = (
            compute_value_and_advantage_in_train)

        if collect:
            # TODO(oars): Cleanup how we handle non distribution networks.
            if isinstance(actor_network, network.DistributionNetwork):
                network_output_spec = actor_network.output_spec
            else:
                network_output_spec = tf.nest.map_structure(
                    distribution_spec.deterministic_distribution_from_spec,
                    action_spec)
            info_spec = {
                'dist_params':
                tf.nest.map_structure(lambda spec: spec.input_params_spec,
                                      network_output_spec)
            }

            if not self._compute_value_and_advantage_in_train:
                info_spec['value_prediction'] = tensor_spec.TensorSpec(
                    shape=[], dtype=tf.float32)
        else:
            info_spec = ()

        policy_state_spec = {}
        if actor_network.state_spec:
            policy_state_spec['actor_network_state'] = actor_network.state_spec
        if (collect and value_network.state_spec
                and not self._compute_value_and_advantage_in_train):
            policy_state_spec['value_network_state'] = value_network.state_spec
        if not policy_state_spec:
            policy_state_spec = ()

        super(PPOPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy_state_spec=policy_state_spec,
                             info_spec=info_spec,
                             actor_network=actor_network,
                             observation_normalizer=observation_normalizer,
                             clip=clip)

        self._collect = collect
        if value_network is not None:
            value_network.create_variables()
        self._value_network = value_network
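
A minimal usage sketch (not part of the source), assuming the stock TF-Agents actor-distribution and value networks and a TFEnvironment named tf_env; layer sizes are illustrative.

from tf_agents.networks import actor_distribution_network, value_network

# tf_env is assumed to be an existing tf_agents TFEnvironment.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    tf_env.observation_spec(), tf_env.action_spec(),
    fc_layer_params=(64, 64))
value_net = value_network.ValueNetwork(
    tf_env.observation_spec(), fc_layer_params=(64, 64))

policy = PPOPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    actor_network=actor_net,
    value_network=value_net,
    clip=False,     # policy-gradient training usually uses unclipped actions
    collect=True)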
Example #10
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 actor_network: network.Network,
                 value_network: network.Network,
                 observation_normalizer: Optional[
                     tensor_normalizer.TensorNormalizer] = None,
                 clip: bool = True,
                 collect: bool = True,
                 compute_value_and_advantage_in_train: bool = False):
        """Builds a PPO Policy given network Templates or functions.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: An instance of a tf_agents.networks.network.Network, with
        call(observation, step_type, network_state).  Network should
        return one of the following: 1. a nested tuple of tfp.distributions
          objects matching action_spec, or 2. a nested tuple of tf.Tensors
          representing actions.
      value_network:  An instance of a tf_agents.networks.network.Network, with
        call(observation, step_type, network_state).  Network should return
        value predictions for the input state.
      observation_normalizer: An object to use for observation normalization.
      clip: Whether to clip actions to spec before returning them.  Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      collect: If True, creates ops for actions_log_prob, value_preds, and
        action_distribution_params. (default True)
      compute_value_and_advantage_in_train: A bool to indicate where value
        prediction and advantage calculation happen.  If True, both happen in
        agent.train(), therefore no need to save the value prediction inside of
        policy info. If False, value prediction is computed during data
        collection. This argument must be set to `False` if mini batch learning
        is enabled.

    Raises:
      TypeError: if `actor_network` or `value_network` is not of type
        `tf_agents.networks.Network`.
      ValueError: if `actor_network` or `value_network` do not emit
        valid outputs.  For example, `actor_network` must either be
        a (legacy style) `DistributionNetwork`, or explicitly emit
        a nest of `tfp.distribution.Distribution` objects.
    """
        if not isinstance(actor_network, network.Network):
            raise TypeError('actor_network is not of type network.Network')
        if not isinstance(value_network, network.Network):
            raise TypeError('value_network is not of type network.Network')

        actor_output_spec = actor_network.create_variables(
            time_step_spec.observation)

        value_output_spec = value_network.create_variables(
            time_step_spec.observation)

        nest_utils.assert_value_spec(value_output_spec, 'value_network')

        distribution_utils.assert_specs_are_compatible(
            actor_output_spec, action_spec,
            'actor_network output spec does not match action spec')

        self._compute_value_and_advantage_in_train = (
            compute_value_and_advantage_in_train)

        if collect:
            if isinstance(actor_network, network.DistributionNetwork):
                # Legacy DistributionNetwork case.  New code can just provide a regular
                # Network that emits a Distribution object; and we use a different
                # code path using DistributionSpecV2 for that.
                network_output_spec = actor_network.output_spec
                info_spec = {
                    'dist_params':
                    tf.nest.map_structure(lambda spec: spec.input_params_spec,
                                          network_output_spec)
                }
            else:
                # We have a Network that emits a nest of distributions.
                def nested_dist_params(spec):
                    if not isinstance(spec,
                                      distribution_utils.DistributionSpecV2):
                        raise ValueError(
                            'Unexpected output from `actor_network`.  Expected '
                            '`Distribution` objects, but saw output spec: {}'.
                            format(actor_output_spec))
                    return distribution_utils.parameters_to_dict(
                        spec.parameters, tensors_only=True)

                info_spec = {
                    'dist_params':
                    tf.nest.map_structure(nested_dist_params,
                                          actor_output_spec)
                }

            if not self._compute_value_and_advantage_in_train:
                info_spec['value_prediction'] = tensor_spec.TensorSpec(
                    shape=[], dtype=tf.float32)
        else:
            info_spec = ()

        policy_state_spec = {}
        if actor_network.state_spec:
            policy_state_spec['actor_network_state'] = actor_network.state_spec
        if (collect and value_network.state_spec
                and not self._compute_value_and_advantage_in_train):
            policy_state_spec['value_network_state'] = value_network.state_spec
        if not policy_state_spec:
            policy_state_spec = ()

        super(PPOPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy_state_spec=policy_state_spec,
                             info_spec=info_spec,
                             actor_network=actor_network,
                             observation_normalizer=observation_normalizer,
                             clip=clip)

        self._collect = collect
        self._value_network = value_network
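
A minimal construction sketch (not part of the listed source) for the `PPOPolicy` above, assuming it behaves like tf_agents' `ppo_policy.PPOPolicy`; the specs, layer sizes, and option values are illustrative assumptions.

```python
import tensorflow as tf
from tf_agents.agents.ppo import ppo_policy
from tf_agents.networks import actor_distribution_network, value_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Illustrative specs: a 4-dim float observation and a binary discrete action.
observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1)
time_step_spec = ts.time_step_spec(observation_spec)

actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec, action_spec, fc_layer_params=(32,))
value_net = value_network.ValueNetwork(observation_spec, fc_layer_params=(32,))

# With collect=True and compute_value_and_advantage_in_train=False, the policy
# stores 'dist_params' and a scalar 'value_prediction' in its info_spec.
policy = ppo_policy.PPOPolicy(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    actor_network=actor_net,
    value_network=value_net,
    collect=True,
    compute_value_and_advantage_in_train=False)
```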
Example #11
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 actor_network: network.Network,
                 policy_state_spec: types.NestedTensorSpec = (),
                 info_spec: types.NestedTensorSpec = (),
                 observation_normalizer: Optional[
                     tensor_normalizer.TensorNormalizer] = None,
                 clip: bool = True,
                 training: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 name: Optional[Text] = None):
        """Builds an Actor Policy given an actor network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      actor_network: An instance of a `tf_agents.networks.network.Network` to be
        used by the policy. The network will be called with `call(observation,
        step_type, policy_state)` and should return `(actions_or_distributions,
        new_state)`.
      policy_state_spec: A nest of TensorSpec representing the policy_state.
        If not set, defaults to actor_network.state_spec.
      info_spec: A nest of `TensorSpec` representing the policy info.
      observation_normalizer: An object to use for observation normalization.
      clip: Whether to clip actions to spec before returning them. Default True.
        Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      training: Whether the network should be called in training mode.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `actor_network` is compatible with the
        network-specific half of the output of the
        `observation_and_action_constraint_splitter`. In particular,
        `observation_and_action_constraint_splitter` will be called on the
        observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: if `actor_network` is not of type `network.Network`.
      NotImplementedError: if `observation_and_action_constraint_splitter` is
        not None but `action_spec` is not discrete.
    """
        if not isinstance(actor_network, network.Network):
            raise ValueError('actor_network must be a network.Network. Found '
                             '{}.'.format(type(actor_network)))
        actor_network.create_variables()
        self._actor_network = actor_network
        self._observation_normalizer = observation_normalizer
        self._training = training

        if observation_and_action_constraint_splitter is not None:
            if len(tf.nest.flatten(action_spec)) > 1 or (
                    not tensor_spec.is_discrete(action_spec)):
                raise NotImplementedError(
                    'Action constraints for ActorPolicy are currently only supported '
                    'for a single spec of discrete actions. Got action_spec {}'
                    .format(action_spec))

        if not policy_state_spec:
            policy_state_spec = actor_network.state_spec

        super(ActorPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy_state_spec=policy_state_spec,
                             info_spec=info_spec,
                             clip=clip,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
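
A minimal usage sketch (not from the listed source) for the `ActorPolicy` above, assuming it matches tf_agents' `actor_policy.ActorPolicy`; the continuous action spec and network sizes are illustrative.

```python
import tensorflow as tf
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import actor_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec(
    [2], tf.float32, minimum=-1.0, maximum=1.0)

actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec, action_spec, fc_layer_params=(32,))

policy = actor_policy.ActorPolicy(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    actor_network=actor_net,
    clip=True)

# One batched step through the policy; sampled actions are clipped to [-1, 1].
time_step = ts.restart(tf.zeros([1, 4]), batch_size=1)
action_step = policy.action(time_step)
```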
Example #12
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 critic_network: network.Network,
                 actor_network: network.Network,
                 actor_optimizer: types.Optimizer,
                 critic_optimizer: types.Optimizer,
                 alpha_optimizer: types.Optimizer,
                 actor_loss_weight: types.Float = 1.0,
                 critic_loss_weight: types.Float = 0.5,
                 alpha_loss_weight: types.Float = 1.0,
                 actor_policy_ctor: Callable[
                     ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy,
                 critic_network_2: Optional[network.Network] = None,
                 target_critic_network: Optional[network.Network] = None,
                 target_critic_network_2: Optional[network.Network] = None,
                 target_update_tau: types.Float = 1.0,
                 target_update_period: types.Int = 1,
                 td_errors_loss_fn: types.LossFn = tf.math.squared_difference,
                 gamma: types.Float = 1.0,
                 sigma: types.Float = 0.9,
                 reward_scale_factor: types.Float = 1.0,
                 initial_log_alpha: types.Float = 0.0,
                 use_log_alpha_in_alpha_loss: bool = True,
                 target_entropy: Optional[types.Float] = None,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):

        tf.Module.__init__(self, name=name)

        self._check_action_spec(action_spec)

        net_observation_spec = time_step_spec.observation
        critic_spec = (net_observation_spec, action_spec)

        self._critic_network_1 = critic_network

        if critic_network_2 is not None:
            self._critic_network_2 = critic_network_2
        else:
            self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
            # Do not use target_critic_network_2 if critic_network_2 is None.
            target_critic_network_2 = None

        # Wait until critic_network_2 has been copied from critic_network_1 before
        # creating variables on both.
        self._critic_network_1.create_variables(critic_spec)
        self._critic_network_2.create_variables(critic_spec)

        if target_critic_network:
            target_critic_network.create_variables(critic_spec)

        self._target_critic_network_1 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_1,
                target_critic_network,
                input_spec=critic_spec,
                name='TargetCriticNetwork1'))

        if target_critic_network_2:
            target_critic_network_2.create_variables(critic_spec)
        self._target_critic_network_2 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_2,
                target_critic_network_2,
                input_spec=critic_spec,
                name='TargetCriticNetwork2'))

        if actor_network:
            actor_network.create_variables(net_observation_spec)
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network,
                                   training=False)

        self._train_policy = actor_policy_ctor(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            training=True)

        self._log_alpha = common.create_variable(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        if target_entropy is None:
            target_entropy = self._get_default_target_entropy(action_spec)

        self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._actor_loss_weight = actor_loss_weight
        self._critic_loss_weight = critic_loss_weight
        self._alpha_loss_weight = alpha_loss_weight
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars
        self._update_target = self._get_target_updater(
            tau=self._target_update_tau, period=self._target_update_period)

        self.sigma = sigma

        train_sequence_length = 2 if not critic_network.state_spec else None

        super(sac_agent.SacAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=train_sequence_length,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             validate_args=False)

        self._as_transition = data_converter.AsTransition(
            self.data_context, squeeze_time_dim=(train_sequence_length == 2))
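
The constructor above mirrors tf_agents' `SacAgent` (it calls `super(sac_agent.SacAgent, self).__init__`) and adds a `sigma` field. A minimal sketch against the standard `sac_agent.SacAgent`, with illustrative specs, networks, and optimizers:

```python
import tensorflow as tf
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import tanh_normal_projection_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec(
    [2], tf.float32, minimum=-1.0, maximum=1.0)
time_step_spec = ts.time_step_spec(observation_spec)

# Tanh-squashed Gaussian actor and a joint (observation, action) critic.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec, action_spec, fc_layer_params=(32,),
    continuous_projection_net=(
        tanh_normal_projection_network.TanhNormalProjectionNetwork))
critic_net = critic_network.CriticNetwork(
    (observation_spec, action_spec), joint_fc_layer_params=(32,))

agent = sac_agent.SacAgent(
    time_step_spec,
    action_spec,
    critic_network=critic_net,
    actor_network=actor_net,
    actor_optimizer=tf.keras.optimizers.Adam(3e-4),
    critic_optimizer=tf.keras.optimizers.Adam(3e-4),
    alpha_optimizer=tf.keras.optimizers.Adam(3e-4),
    target_update_tau=0.005,
    target_update_period=1,
    gamma=0.99)
agent.initialize()
```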
Example #13
    def __init__(self,
                 output_dim,
                 noise_dim=32,
                 input_tensor_spec=None,
                 hidden_layers=(256, ),
                 net: Network = None,
                 net_moving_average_rate=None,
                 entropy_regularization=0.,
                 kernel_sharpness=2.,
                 mi_weight=None,
                 mi_estimator_cls=MIEstimator,
                 optimizer: tf.optimizers.Optimizer = None,
                 name="Generator"):
        """Create a Generator.

        Args:
            output_dim (int): dimension of output
            noise_dim (int): dimension of noise
            input_tensor_spec (nested TensorSpec): spec of inputs. If there is
                no inputs, this should be None.
            hidden_layers (tuple): size of hidden layers.
            net (Network): network for generating outputs from [noise, inputs]
                or noise (if inputs is None). If None, a default one with
                hidden_layers will be created
            net_moving_average_rate (float): If provided, use a moving average
                version of net to do prediction. This has been shown to be
                effective for GAN training (arXiv:1907.02544, arXiv:1812.04948).
            entropy_regularization (float): weight of entropy regularization
            kernel_sharpness (float): Used only for entropy_regularization > 0.
                We calculate the kernel in SVGD as:
                    exp(-kernel_sharpness * reduce_mean((x-y)^2/width)),
                where width is the elementwise moving average of (x-y)^2
            mi_weight (float|None): weight for the mutual information term. If
                None, no mutual information estimator is created.
            mi_estimator_cls (type): the class of mutual information estimator
                for maximizing the mutual information between [noise, inputs]
                and [outputs, inputs].
            optimizer (tf.optimizers.Optimizer): optimizer (optional)
            name (str): name of this generator
        """
        super().__init__(train_state_spec=(), optimizer=optimizer, name=name)
        self._noise_dim = noise_dim
        self._entropy_regularization = entropy_regularization
        if entropy_regularization == 0:
            self._grad_func = self._ml_grad
        else:
            self._grad_func = self._stein_grad
            self._kernel_width_averager = AdaptiveAverager(
                tensor_spec=tf.TensorSpec(shape=(output_dim, )))
            self._kernel_sharpness = kernel_sharpness

        noise_spec = tf.TensorSpec(shape=[noise_dim])

        if net is None:
            net_input_spec = noise_spec
            if input_tensor_spec is not None:
                net_input_spec = [net_input_spec, input_tensor_spec]
            net = EncodingNetwork(
                name="Generator",
                input_tensor_spec=net_input_spec,
                fc_layer_params=hidden_layers,
                last_layer_size=output_dim)

        self._mi_estimator = None
        self._input_tensor_spec = input_tensor_spec
        if mi_weight is not None:
            x_spec = noise_spec
            y_spec = tf.TensorSpec((output_dim, ))
            if input_tensor_spec is not None:
                x_spec = [x_spec, input_tensor_spec]
            self._mi_estimator = mi_estimator_cls(
                x_spec, y_spec, sampler='shift')
            self._mi_weight = mi_weight
        self._net = net
        self._predict_net = None
        self._net_moving_average_rate = net_moving_average_rate
        if net_moving_average_rate:
            self._predict_net = net.copy(name="Genrator_average")
            tfa_common.soft_variables_update(
                self._net.variables, self._predict_net.variables, tau=1.0)
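
A hedged usage sketch based only on the constructor listed above; the import path (`alf.algorithms.generator`) and all parameter values are assumptions for illustration.

```python
import tensorflow as tf
from alf.algorithms.generator import Generator  # import path assumed

# 8-dim outputs from 32-dim noise; entropy_regularization > 0 selects the
# SVGD-based gradient function set up in the constructor above.
generator = Generator(
    output_dim=8,
    noise_dim=32,
    hidden_layers=(256, ),
    entropy_regularization=0.1,
    kernel_sharpness=2.0,
    optimizer=tf.optimizers.Adam(learning_rate=1e-4))
```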
Example #14
    def __init__(
            self,
            time_step_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            cloning_network: network.Network,
            optimizer: types.Optimizer,
            num_outer_dims: Literal[1, 2] = 1,  # pylint: disable=bad-whitespace
            epsilon_greedy: types.Float = 0.1,
            loss_fn: Optional[Callable[[types.NestedTensor, bool],
                                       types.Tensor]] = None,
            gradient_clipping: Optional[types.Float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates an instance of a Behavioral Cloning agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      cloning_network: A `tf_agents.networks.Network` to be used by the agent.
        The network will be called as

          ```
          network(observation, step_type=step_type, network_state=initial_state)
          ```
        and must return a 2-tuple with elements `(output, next_network_state)`
      optimizer: The optimizer to use for training.
      num_outer_dims: The number of outer dimensions for the agent. Must be
        either 1 or 2. If 2, training will require both a batch_size and time
        dimension on every Tensor; if 1, training will require only a batch_size
        outer dimension.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if actions are discrete)
      loss_fn: A function for computing the error between the output of the
        cloning network and the action that was taken. If None, the loss
        depends on the action dtype. The `loss_fn` is called with parameters:
        `(experience, training)`, and must return a loss value for each element
        of the batch.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._cloning_network = cloning_network
        self._optimizer = optimizer
        self._gradient_clipping = gradient_clipping

        action_spec = tensor_spec.from_spec(action_spec)
        flat_action_spec = tf.nest.flatten(action_spec)
        continuous_specs = [
            tensor_spec.is_continuous(s) for s in flat_action_spec
        ]

        if not flat_action_spec:
            raise ValueError(
                'The `action_spec` must contain at least one action.')

        single_discrete_scalar_action = (
            len(flat_action_spec) == 1 and flat_action_spec[0].shape.rank == 0
            and not tensor_spec.is_continuous(flat_action_spec[0]))
        single_continuous_action = (len(flat_action_spec) == 1
                                    and tensor_spec.is_continuous(
                                        flat_action_spec[0]))

        if (not loss_fn and not single_discrete_scalar_action
                and not single_continuous_action):
            raise ValueError(
                'A `loss_fn` must be provided unless there is a single, scalar '
                'discrete action or a single (scalar or non-scalar) continuous '
                'action.')

        self._network_output_spec = cloning_network.create_variables(
            time_step_spec.observation)

        # If there is a mix of continuous and discrete actions, we use an actor
        # policy via the `_setup_as_continuous` method, as long as the user
        # provided a custom loss_fn (which we verified above).
        if any(continuous_specs):
            policy, collect_policy = self._setup_as_continuous(
                time_step_spec, action_spec, loss_fn)
        else:
            policy, collect_policy = self._setup_as_discrete(
                time_step_spec, action_spec, loss_fn, epsilon_greedy)

        super(BehavioralCloningAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)

        self._as_trajectory = data_converter.AsTrajectory(
            self.data_context,
            sequence_length=None,
            num_outer_dims=num_outer_dims)
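
A minimal construction sketch (not from the listed source) for the agent above, assuming the standard tf_agents `BehavioralCloningAgent`; specs and layer sizes are illustrative.

```python
import tensorflow as tf
from tf_agents.agents.behavioral_cloning import behavioral_cloning_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
# A single scalar discrete action, so no custom loss_fn is required.
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=2)
time_step_spec = ts.time_step_spec(observation_spec)

# A QNetwork emits one logit per action, matching the discrete setup path.
cloning_net = q_network.QNetwork(
    observation_spec, action_spec, fc_layer_params=(32,))

agent = behavioral_cloning_agent.BehavioralCloningAgent(
    time_step_spec,
    action_spec,
    cloning_network=cloning_net,
    optimizer=tf.keras.optimizers.Adam(1e-3))
agent.initialize()
```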
Example #15
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.TensorSpec,
                 actor_network: network.Network,
                 optimizer: types.Optimizer,
                 value_network: Optional[network.Network] = None,
                 value_estimation_loss_coef: types.Float = 0.2,
                 advantage_fn: Optional[AdvantageFnType] = None,
                 use_advantage_loss: bool = True,
                 gamma: types.Float = 1.0,
                 normalize_returns: bool = True,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 entropy_regularization: Optional[types.Float] = None,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):
        """Creates a REINFORCE Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: Optimizer for the actor network.
      value_network: (Optional) A `tf_agents.network.Network` to be used by the
        agent. The network will be called with call(observation, step_type) and
        returns a floating point value tensor.
      value_estimation_loss_coef: (Optional) Multiplier for value prediction
        loss to balance with policy gradient loss.
      advantage_fn: A function `A(returns, value_preds)` that takes returns and
        value function predictions as input and returns advantages. The default
        is `A(returns, value_preds) = returns - value_preds` if a value network
        is specified and `use_advantage_loss=True`, otherwise `A(returns,
        value_preds) = returns`.
      use_advantage_loss: Whether to use value function predictions for
        computing returns. `use_advantage_loss=False` is equivalent to setting
        `advantage_fn=lambda returns, value_preds: returns`.
      gamma: A discount factor for future rewards.
      normalize_returns: Whether to normalize returns across episodes when
        computing the loss.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      entropy_regularization: Coefficient for entropy regularization loss term.
      train_step_counter: An optional counter to increment every time the train
        op is run. Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)

        actor_network.create_variables()
        self._actor_network = actor_network
        if value_network:
            value_network.create_variables()
        self._value_network = value_network

        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=True)

        policy = greedy_policy.GreedyPolicy(collect_policy)

        self._optimizer = optimizer
        self._gamma = gamma
        self._normalize_returns = normalize_returns
        self._gradient_clipping = gradient_clipping
        self._entropy_regularization = entropy_regularization
        self._value_estimation_loss_coef = value_estimation_loss_coef
        self._baseline = self._value_network is not None
        self._advantage_fn = advantage_fn
        if self._advantage_fn is None:
            if use_advantage_loss and self._baseline:
                self._advantage_fn = lambda returns, value_preds: returns - value_preds
            else:
                self._advantage_fn = lambda returns, _: returns

        super(ReinforceAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context)
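
A minimal construction sketch (not from the listed source) for the `ReinforceAgent` above; specs, layer sizes, and hyperparameters are illustrative.

```python
import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.networks import actor_distribution_network, value_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1)
time_step_spec = ts.time_step_spec(observation_spec)

actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec, action_spec, fc_layer_params=(32,))
# Optional baseline; with use_advantage_loss=True the default advantage_fn
# becomes returns - value_preds.
value_net = value_network.ValueNetwork(observation_spec, fc_layer_params=(32,))

agent = reinforce_agent.ReinforceAgent(
    time_step_spec,
    action_spec,
    actor_network=actor_net,
    optimizer=tf.keras.optimizers.Adam(1e-3),
    value_network=value_net,
    gamma=0.99,
    normalize_returns=True)
agent.initialize()
```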
Example #16
  def __init__(
      self,
      time_step_spec: ts.TimeStep,
      action_spec: types.NestedTensorSpec,
      q_network: network.Network,
      optimizer: types.Optimizer,
      observation_and_action_constraint_splitter: Optional[
          types.Splitter] = None,
      epsilon_greedy: types.Float = 0.1,
      n_step_update: int = 1,
      boltzmann_temperature: Optional[types.Int] = None,
      emit_log_probability: bool = False,
      # Params for target network updates
      target_q_network: Optional[network.Network] = None,
      target_update_tau: types.Float = 1.0,
      target_update_period: int = 1,
      # Params for training.
      td_errors_loss_fn: Optional[types.LossFn] = None,
      gamma: types.Float = 1.0,
      reward_scale_factor: types.Float = 1.0,
      gradient_clipping: Optional[types.Float] = None,
      # Params for debugging
      debug_summaries: bool = False,
      summarize_grads_and_vars: bool = False,
      train_step_counter: Optional[tf.Variable] = None,
      name: Optional[Text] = None):
    """Creates a DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with `call(observation, step_type)` and should
        emit logits over the action space.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `q_network` is compatible with the network-specific
        half of the output of the `observation_and_action_constraint_splitter`.
        In particular, `observation_and_action_constraint_splitter` will be
        called on the observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      emit_log_probability: Whether policies emit log probabilities or not.
      target_q_network: (Optional.)  A `tf_agents.network.Network`
        to be used as the target network during Q learning.  Every
        `target_update_period` train steps, the weights from
        `q_network` are copied (possibly with smoothing via
        `target_update_tau`) to `target_q_network`.

        If `target_q_network` is not provided, it is created by
        making a copy of `q_network`, which initializes a new
        network with the same structure and its own layers and weights.

        Network copying is performed via the `Network.copy` superclass method,
        and may inadvertently cause the resulting network to share weights
        with the original.  This can happen if, for example, the original
        network accepted a pre-built Keras layer in its `__init__`, or
        accepted a Keras layer that wasn't built, but neglected to create
        a new copy.

        In these cases, it is up to you to provide a target Network having
        weights that are not shared with the original `q_network`.
        If you provide a `target_q_network` that shares any
        weights with `q_network`, a warning will be logged but
        no exception is thrown.

        Note: shallow copies of Keras layers may be built via the code:

        ```python
        new_layer = type(layer).from_config(layer.get_config())
        ```
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `action_spec` contains more than one action or action
        spec minimum is not equal to 0.
      ValueError: If the q networks do not emit floating point outputs with
        inner shape matching `action_spec`.
      NotImplementedError: If `q_network` has non-empty `state_spec` (i.e., an
        RNN is provided) and `n_step_update > 1`.
    """
    tf.Module.__init__(self, name=name)

    self._check_action_spec(action_spec)

    if epsilon_greedy is not None and boltzmann_temperature is not None:
      raise ValueError(
          'Configured both epsilon_greedy value {} and temperature {}, '
          'however only one of them can be used for exploration.'.format(
              epsilon_greedy, boltzmann_temperature))

    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._q_network = q_network
    net_observation_spec = time_step_spec.observation
    if observation_and_action_constraint_splitter:
      net_observation_spec, _ = observation_and_action_constraint_splitter(
          net_observation_spec)
    q_network.create_variables(net_observation_spec)
    if target_q_network:
      target_q_network.create_variables(net_observation_spec)
    self._target_q_network = common.maybe_copy_target_network_with_checks(
        self._q_network, target_q_network, input_spec=net_observation_spec,
        name='TargetQNetwork')

    self._check_network_output(self._q_network, 'q_network')
    self._check_network_output(self._target_q_network, 'target_q_network')

    self._epsilon_greedy = epsilon_greedy
    self._n_step_update = n_step_update
    self._boltzmann_temperature = boltzmann_temperature
    self._optimizer = optimizer
    self._td_errors_loss_fn = (
        td_errors_loss_fn or common.element_wise_huber_loss)
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(
        target_update_tau, target_update_period)

    policy, collect_policy = self._setup_policy(time_step_spec, action_spec,
                                                boltzmann_temperature,
                                                emit_log_probability)

    if q_network.state_spec and n_step_update != 1:
      raise NotImplementedError(
          'DqnAgent does not currently support n-step updates with stateful '
          'networks (i.e., RNNs), but n_step_update = {}'.format(n_step_update))

    train_sequence_length = (
        n_step_update + 1 if not q_network.state_spec else None)

    super(DqnAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False,
    )

    if q_network.state_spec:
      # AsNStepTransition does not support emitting [B, T, ...] tensors,
      # which we need for DQN-RNN.
      self._as_transition = data_converter.AsTransition(
          self.data_context, squeeze_time_dim=False)
    else:
      # This reduces the n-step return and removes the extra time dimension,
      # allowing the rest of the computations to be independent of the
      # n-step parameter.
      self._as_transition = data_converter.AsNStepTransition(
          self.data_context, gamma=gamma, n=n_step_update)
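
A minimal construction sketch (not from the listed source) for the `DqnAgent` above; specs, layer sizes, and hyperparameters are illustrative.

```python
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import common

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1)
time_step_spec = ts.time_step_spec(observation_spec)

q_net = q_network.QNetwork(observation_spec, action_spec, fc_layer_params=(32,))

# n_step_update=1 with a feed-forward q_network, so train() expects
# trajectories with a time dimension of n_step_update + 1 = 2.
agent = dqn_agent.DqnAgent(
    time_step_spec,
    action_spec,
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(1e-3),
    epsilon_greedy=0.1,
    n_step_update=1,
    target_update_tau=1.0,
    target_update_period=100,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=0.99)
agent.initialize()
```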
Example #17
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_network: DistributionNetwork,
                 critic_network: Network,
                 gamma=0.99,
                 ou_stddev=0.2,
                 ou_damping=0.15,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 target_update_tau=0.05,
                 target_update_period=10,
                 dqda_clipping=None,
                 gradient_clipping=None,
                 debug_summaries=False,
                 name="SarsaAlgorithm"):
        """Create an SarsaAlgorithm.

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            observation_spec (nested TensorSpec): spec for observation.
            actor_network (Network|DistributionNetwork):  The network will be
                called with call(observation, step_type). If it is a
                DistributionNetwork, an action will be sampled.
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            gamma (float): discount rate for reward
            ou_stddev (float): Only used for DDPG. Standard deviation for the
                Ornstein-Uhlenbeck (OU) noise added in the default collect policy.
            ou_damping (float): Only used for DDPG. Damping factor for the OU
                noise added in the default collect policy.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            gradient_clipping (float): Norm length to clip gradients.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        if isinstance(actor_network, DistributionNetwork):
            self._action_distribution_spec = actor_network.output_spec
        elif isinstance(actor_network, Network):
            self._action_distribution_spec = action_spec
        else:
            raise ValueError("Expect DistributionNetwork or Network for"
                             " `actor_network`, got %s" % type(actor_network))

        super().__init__(observation_spec,
                         action_spec,
                         predict_state_spec=SarsaState(
                             prev_observation=observation_spec,
                             prev_step_type=tf.TensorSpec((), tf.int32),
                             actor=actor_network.state_spec),
                         train_state_spec=SarsaState(
                             prev_observation=observation_spec,
                             prev_step_type=tf.TensorSpec((), tf.int32),
                             actor=actor_network.state_spec,
                             target_actor=actor_network.state_spec,
                             critic=critic_network.state_spec,
                             target_critic=critic_network.state_spec,
                         ),
                         optimizer=[actor_optimizer, critic_optimizer],
                         trainable_module_sets=[[actor_network],
                                                [critic_network]],
                         gradient_clipping=gradient_clipping,
                         debug_summaries=debug_summaries,
                         name=name)
        self._actor_network = actor_network
        self._critic_network = critic_network
        self._target_actor_network = actor_network.copy(
            name='target_actor_network')
        self._target_critic_network = critic_network.copy(
            name='target_critic_network')
        self._update_target = common.get_target_updater(
            models=[self._actor_network, self._critic_network],
            target_models=[
                self._target_actor_network, self._target_critic_network
            ],
            tau=target_update_tau,
            period=target_update_period)
        self._dqda_clipping = dqda_clipping
        self._gamma = gamma
        self._ou_process = create_ou_process(action_spec, ou_stddev,
                                             ou_damping)
Example #18
  def __init__(
      self,
      time_step_spec: ts.TimeStep,
      action_spec: types.NestedTensorSpec,
      q_network: network.Network,
      emit_log_probability: bool = False,
      observation_and_action_constraint_splitter: Optional[
          types.Splitter] = None,
      validate_action_spec_and_network: bool = True,
      name: Optional[Text] = None):
    """Builds a Q-Policy given a q_network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      emit_log_probability: Whether to emit log-probs in info of `PolicyStep`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `q_network` is compatible with the network-specific
        half of the output of the `observation_and_action_constraint_splitter`.
        In particular, `observation_and_action_constraint_splitter` will be
        called on the observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      validate_action_spec_and_network: If `True` (default),
        action_spec is checked to make sure it is a single scalar spec
        with a minimum of zero.  Also validates that the network's output
        matches the spec.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `q_network.action_spec` exists and is not compatible with
        `action_spec`.
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec`.
    """
    action_spec = tensor_spec.from_spec(action_spec)
    time_step_spec = tensor_spec.from_spec(time_step_spec)

    network_action_spec = getattr(q_network, 'action_spec', None)

    if network_action_spec is not None:
      action_spec = cast(tf.TypeSpec, action_spec)
      if not action_spec.is_compatible_with(network_action_spec):
        raise ValueError(
            'action_spec must be compatible with q_network.action_spec; '
            'instead got action_spec=%s, q_network.action_spec=%s' % (
                action_spec, network_action_spec))

    flat_action_spec = tf.nest.flatten(action_spec)
    if len(flat_action_spec) > 1:
      raise ValueError(
          'Only scalar actions are supported now, but action spec is: {}'
          .format(action_spec))
    if validate_action_spec_and_network:
      spec = flat_action_spec[0]
      if spec.shape.rank > 0:
        raise ValueError(
            'Only scalar actions are supported now, but action spec is: {}'
            .format(action_spec))

      if spec.minimum != 0:
        raise ValueError(
            'Action specs should have minimum of 0, but saw: {0}'.format(spec))

      num_actions = spec.maximum - spec.minimum + 1
      network_utils.check_single_floating_network_output(
          q_network.create_variables(), (num_actions,), str(q_network))

    # We need to maintain the flat action spec for dtype, shape and range.
    self._flat_action_spec = flat_action_spec[0]

    self._q_network = q_network
    super(QPolicy, self).__init__(
        time_step_spec,
        action_spec,
        policy_state_spec=q_network.state_spec,
        clip=False,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        name=name)
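
A minimal usage sketch (not from the listed source) for the `QPolicy` above; the specs and network are illustrative.

```python
import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.policies import q_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([4], tf.float32)
# A single scalar discrete spec with minimum 0, as the validation above requires.
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=2)

q_net = q_network.QNetwork(observation_spec, action_spec, fc_layer_params=(32,))

policy = q_policy.QPolicy(
    ts.time_step_spec(observation_spec),
    action_spec,
    q_network=q_net)

# One batched action step; the underlying distribution is a Categorical
# over the network's Q-values.
time_step = ts.restart(tf.zeros([1, 4]), batch_size=1)
action_step = policy.action(time_step)
```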
Example #19
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 q_network: network.Network,
                 emit_log_probability: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 name: Optional[Text] = None):
        """Builds a Q-Policy given a q_network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      emit_log_probability: Whether to emit log-probs in info of `PolicyStep`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `q_network` is compatible with the network-specific
        half of the output of the `observation_and_action_constraint_splitter`.
        In particular, `observation_and_action_constraint_splitter` will be
        called on the observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `q_network.action_spec` exists and is not compatible with
        `action_spec`.
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec`.
    """
        network_action_spec = getattr(q_network, 'action_spec', None)

        if network_action_spec is not None:
            if not action_spec.is_compatible_with(network_action_spec):
                raise ValueError(
                    'action_spec must be compatible with q_network.action_spec; '
                    'instead got action_spec=%s, q_network.action_spec=%s' %
                    (action_spec, network_action_spec))

        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')
        # We need to maintain the flat action spec for dtype, shape and range.
        self._flat_action_spec = flat_action_spec[0]
        q_network.create_variables()
        self._q_network = q_network
        super(QPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=q_network.state_spec,
                             clip=False,
                             emit_log_probability=emit_log_probability,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)