def __init__(self,
             action_spec,
             actor_network: Network,
             critic_network: Network,
             critic_loss=None,
             target_entropy=None,
             initial_log_alpha=0.0,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             gradient_clipping=None,
             train_step_counter=None,
             debug_summaries=False,
             name="SacAlgorithm"):
    """Build a SacAlgorithm from one actor network and one critic network.

    The supplied critic is used as the first critic and a renamed copy of
    it as the second. A target copy is made of each critic, and
    ``soft_variables_update`` is invoked with ``tau=1.0`` from every critic
    to its target at the end of construction.

    Args:
        action_spec (nested BoundedTensorSpec): spec of the actions.
        actor_network (Network): called as ``call(observation, step_type)``.
        critic_network (Network): called as
            ``call(observation, action, step_type)``.
        critic_loss (None|OneStepTDLoss): object computing the critic loss.
            A ``OneStepTDLoss`` is created when this is None.
        target_entropy (float|None): target average policy entropy used to
            update alpha. When None, it is the sum of
            ``dist_utils.calc_default_target_entropy`` over the flattened
            action spec.
        initial_log_alpha (float): initial value of the ``log_alpha``
            variable.
        target_update_tau (float): factor for the soft update of the target
            networks.
        target_update_period (int): period of the soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, the gradient
            dqda is clipped element-wise to
            ``[-dqda_clipping, dqda_clipping]``. No clipping is performed
            if ``dqda_clipping == 0``.
        actor_optimizer (tf.optimizers.Optimizer): optimizer for the actor.
        critic_optimizer (tf.optimizers.Optimizer): optimizer for the
            critics.
        alpha_optimizer (tf.optimizers.Optimizer): optimizer for alpha.
        gradient_clipping (float): norm length used to clip gradients.
        train_step_counter (tf.Variable): optional counter incremented each
            time a new iteration starts. If None,
            ``tf.summary.experimental.get_step()`` is used; if that is also
            None, a counter is created.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): name of this algorithm.
    """
    # Twin critics: the caller's network plus a renamed copy of it.
    first_critic = critic_network
    second_critic = critic_network.copy(name='CriticNetwork2')
    log_alpha = tfa_common.create_variable(
        name='log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)

    actor_state_spec = actor_network.state_spec
    critic_state_spec = critic_network.state_spec
    train_state_spec = SacState(
        share=SacShareState(actor=actor_state_spec),
        actor=SacActorState(
            critic1=critic_state_spec,
            critic2=critic_state_spec),
        critic=SacCriticState(
            critic1=critic_state_spec,
            critic2=critic_state_spec,
            target_critic1=critic_state_spec,
            target_critic2=critic_state_spec))

    # Listed in the same actor / critic / alpha order as the optimizers
    # handed to the base class below.
    trainable_variable_getters = [
        lambda: actor_network.trainable_variables,
        lambda: (first_critic.trainable_variables +
                 second_critic.trainable_variables),
        lambda: [log_alpha],
    ]

    super().__init__(
        action_spec,
        train_state_spec=train_state_spec,
        action_distribution_spec=actor_network.output_spec,
        predict_state_spec=actor_state_spec,
        optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
        get_trainable_variables_func=trainable_variable_getters,
        gradient_clipping=gradient_clipping,
        train_step_counter=train_step_counter,
        debug_summaries=debug_summaries,
        name=name)

    self._log_alpha = log_alpha
    self._actor_network = actor_network
    self._critic_network1 = first_critic
    self._critic_network2 = second_critic
    self._target_critic_network1 = self._critic_network1.copy(
        name='TargetCriticNetwork1')
    self._target_critic_network2 = self._critic_network2.copy(
        name='TargetCriticNetwork2')

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    self._dqda_clipping = dqda_clipping

    if critic_loss is None:
        critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
    self._critic_loss = critic_loss

    flat_action_spec = tf.nest.flatten(self._action_spec)
    # Only the first flattened action spec is inspected here.
    self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])

    if target_entropy is None:
        target_entropy = np.sum([
            dist_utils.calc_default_target_entropy(spec)
            for spec in flat_action_spec
        ])
    self._target_entropy = target_entropy

    self._update_target = common.get_target_updater(
        models=[self._critic_network1, self._critic_network2],
        target_models=[
            self._target_critic_network1, self._target_critic_network2
        ],
        tau=target_update_tau,
        period=target_update_period)

    # tau=1.0 update from each critic to its freshly created target.
    critic_target_pairs = (
        (self._critic_network1, self._target_critic_network1),
        (self._critic_network2, self._target_critic_network2),
    )
    for source_net, target_net in critic_target_pairs:
        tfa_common.soft_variables_update(
            source_net.variables, target_net.variables, tau=1.0)
def __init__(self,
             action_spec,
             actor_network: Network,
             critic_network: Network,
             ou_stddev=0.2,
             ou_damping=0.15,
             critic_loss=None,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             gradient_clipping=None,
             train_step_counter=None,
             debug_summaries=False,
             name="DdpgAlgorithm"):
    """Build a DdpgAlgorithm from an actor network and a critic network.

    Target copies of both networks are created, and
    ``soft_variables_update`` is invoked with ``tau=1.0`` from each network
    to its target at the end of construction.

    Args:
        action_spec (nested BoundedTensorSpec): spec of the actions.
        actor_network (Network): called as ``call(observation, step_type)``.
        critic_network (Network): called as
            ``call(observation, action, step_type)``.
        ou_stddev (float): standard deviation of the Ornstein-Uhlenbeck
            (OU) noise added in the default collect policy.
        ou_damping (float): damping factor of the OU noise added in the
            default collect policy.
        critic_loss (None|OneStepTDLoss): object computing the critic loss.
            A ``OneStepTDLoss`` is created when this is None.
        target_update_tau (float): factor for the soft update of the target
            networks.
        target_update_period (int): period of the soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, the gradient
            dqda is clipped element-wise to
            ``[-dqda_clipping, dqda_clipping]``. No clipping is performed
            if ``dqda_clipping == 0``.
        actor_optimizer (tf.optimizers.Optimizer): optimizer for the actor.
        critic_optimizer (tf.optimizers.Optimizer): optimizer for the
            critic.
        gradient_clipping (float): norm length used to clip gradients.
        train_step_counter (tf.Variable): optional counter incremented each
            time a new iteration starts. If None,
            ``tf.summary.experimental.get_step()`` is used; if that is also
            None, a counter is created.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): name of this algorithm.
    """
    actor_state_spec = actor_network.state_spec
    critic_state_spec = critic_network.state_spec
    train_state_spec = DdpgState(
        actor=DdpgActorState(
            actor=actor_state_spec,
            critic=critic_state_spec),
        critic=DdpgCriticState(
            critic=critic_state_spec,
            target_actor=actor_state_spec,
            target_critic=critic_state_spec))

    # Listed in the same actor / critic order as the optimizers handed to
    # the base class below.
    trainable_variable_getters = [
        lambda: actor_network.trainable_variables,
        lambda: critic_network.trainable_variables,
    ]

    super().__init__(
        action_spec,
        train_state_spec=train_state_spec,
        action_distribution_spec=action_spec,
        optimizer=[actor_optimizer, critic_optimizer],
        get_trainable_variables_func=trainable_variable_getters,
        gradient_clipping=gradient_clipping,
        train_step_counter=train_step_counter,
        debug_summaries=debug_summaries,
        name=name)

    self._actor_network = actor_network
    self._critic_network = critic_network
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._dqda_clipping = dqda_clipping

    self._target_actor_network = actor_network.copy(
        name='target_actor_network')
    self._target_critic_network = critic_network.copy(
        name='target_critic_network')

    self._ou_stddev = ou_stddev
    self._ou_damping = ou_damping

    if critic_loss is None:
        critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
    self._critic_loss = critic_loss

    self._ou_process = self._create_ou_process(ou_stddev, ou_damping)

    self._update_target = common.get_target_updater(
        models=[self._actor_network, self._critic_network],
        target_models=[
            self._target_actor_network, self._target_critic_network
        ],
        tau=target_update_tau,
        period=target_update_period)

    # tau=1.0 update from each network to its freshly created target;
    # the critic is handled before the actor.
    network_target_pairs = (
        (self._critic_network, self._target_critic_network),
        (self._actor_network, self._target_actor_network),
    )
    for source_net, target_net in network_target_pairs:
        tfa_common.soft_variables_update(
            source_net.variables, target_net.variables, tau=1.0)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             actor_network: network.Network,
             critic_network: network.Network,
             actor_optimizer: Optional[types.Optimizer] = None,
             critic_optimizer: Optional[types.Optimizer] = None,
             ou_stddev: types.Float = 1.0,
             ou_damping: types.Float = 1.0,
             target_actor_network: Optional[network.Network] = None,
             target_critic_network: Optional[network.Network] = None,
             target_update_tau: types.Float = 1.0,
             target_update_period: types.Int = 1,
             dqda_clipping: Optional[types.Float] = None,
             td_errors_loss_fn: Optional[types.LossFn] = None,
             gamma: types.Float = 1.0,
             reward_scale_factor: types.Float = 1.0,
             gradient_clipping: Optional[types.Float] = None,
             debug_summaries: bool = False,
             summarize_grads_and_vars: bool = False,
             train_step_counter: Optional[tf.Variable] = None,
             name: Optional[Text] = None):
    """Creates a DDPG Agent.

    Args:
        time_step_spec: A `TimeStep` spec of the expected time_steps.
        action_spec: A nest of BoundedTensorSpec representing the actions.
        actor_network: A tf_agents.network.Network used by the agent. It is
            called with call(observation, step_type[, policy_state]) and
            should return (action, new_state).
        critic_network: A tf_agents.network.Network used by the agent. It
            is called with call((observation, action), step_type[,
            policy_state]) and should return (q_value, new_state).
        actor_optimizer: The optimizer to use for the actor network.
        critic_optimizer: The optimizer to use for the critic network.
        ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise
            added in the default collect policy.
        ou_damping: Damping factor for the OU noise added in the default
            collect policy.
        target_actor_network: (Optional.) A `tf_agents.network.Network` to
            be used as the actor target network during Q learning. Every
            `target_update_period` train steps, the weights from
            `actor_network` are copied (possibly with smoothing via
            `target_update_tau`) to `target_actor_network`.

            If `target_actor_network` is not provided, it is created by
            copying `actor_network`, which initializes a new network with
            the same structure and its own layers and weights.

            `Network.copy` does not work when the network instance already
            has trainable parameters (e.g., it has already been built, or
            it shares layers with another network). In those cases it is
            up to you to build a copy whose weights are not shared with the
            original `actor_network`, so that it can serve as a target
            network. If you provide a `target_actor_network` that shares
            any weights with `actor_network`, a warning is logged but no
            exception is thrown.
        target_critic_network: (Optional.) Same as `target_actor_network`,
            but for `critic_network`. See the documentation of
            `target_actor_network`.
        target_update_tau: Factor for soft update of the target networks.
        target_update_period: Period for soft update of the target
            networks.
        dqda_clipping: When computing the actor loss, clips the gradient
            dqda element-wise between [-dqda_clipping, dqda_clipping]. Does
            not perform clipping if dqda_clipping == 0.
        td_errors_loss_fn: A function for computing the TD errors loss. If
            None (or otherwise falsy), `common.element_wise_huber_loss` is
            used.
        gamma: A discount factor for future rewards.
        reward_scale_factor: Multiplicative scale for the reward.
        gradient_clipping: Norm length to clip gradients.
        debug_summaries: A bool to gather debug summaries.
        summarize_grads_and_vars: If True, gradient and network variable
            summaries will be written during training.
        train_step_counter: An optional counter to increment every time the
            train op is run. Defaults to the global_step.
        name: The name of this agent. All variables in this module will
            fall under that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)

    # Actor and its target.
    self._actor_network = actor_network
    actor_network.create_variables()
    if target_actor_network:
        target_actor_network.create_variables()
    self._target_actor_network = (
        common.maybe_copy_target_network_with_checks(
            self._actor_network,
            target_actor_network,
            'TargetActorNetwork'))

    # Critic and its target.
    self._critic_network = critic_network
    critic_network.create_variables()
    if target_critic_network:
        target_critic_network.create_variables()
    self._target_critic_network = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network,
            target_critic_network,
            'TargetCriticNetwork'))

    if not td_errors_loss_fn:
        td_errors_loss_fn = common.element_wise_huber_loss

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._ou_stddev = ou_stddev
    self._ou_damping = ou_damping
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._dqda_clipping = dqda_clipping
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(
        target_update_tau, target_update_period)

    # Both policies wrap the same actor network; only `clip` differs.
    shared_policy_kwargs = dict(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network)
    policy = actor_policy.ActorPolicy(clip=True, **shared_policy_kwargs)
    unclipped_policy = actor_policy.ActorPolicy(
        clip=False, **shared_policy_kwargs)
    # Clipping for collection is applied by the OU-noise wrapper instead.
    collect_policy = ou_noise_policy.OUNoisePolicy(
        unclipped_policy,
        ou_stddev=self._ou_stddev,
        ou_damping=self._ou_damping,
        clip=True)

    # An empty actor state spec gives sequence length 2, otherwise None.
    train_sequence_length = (
        None if self._actor_network.state_spec else 2)

    super(DdpgAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensor,
             actor_network: network.Network,
             critic_network: network.Network,
             actor_optimizer: types.Optimizer,
             critic_optimizer: types.Optimizer,
             exploration_noise_std: types.Float = 0.1,
             critic_network_2: Optional[network.Network] = None,
             target_actor_network: Optional[network.Network] = None,
             target_critic_network: Optional[network.Network] = None,
             target_critic_network_2: Optional[network.Network] = None,
             target_update_tau: types.Float = 1.0,
             target_update_period: types.Int = 1,
             actor_update_period: types.Int = 1,
             td_errors_loss_fn: Optional[types.LossFn] = None,
             gamma: types.Float = 1.0,
             reward_scale_factor: types.Float = 1.0,
             target_policy_noise: types.Float = 0.2,
             target_policy_noise_clip: types.Float = 0.5,
             gradient_clipping: Optional[types.Float] = None,
             debug_summaries: bool = False,
             summarize_grads_and_vars: bool = False,
             train_step_counter: Optional[tf.Variable] = None,
             name: Text = None):
    """Creates a Td3Agent Agent.

    Args:
        time_step_spec: A `TimeStep` spec of the expected time_steps.
        action_spec: A nest of BoundedTensorSpec representing the actions.
        actor_network: A tf_agents.network.Network used by the agent. It is
            called with call(observation, step_type).
        critic_network: A tf_agents.network.Network used by the agent. It
            is called with call(observation, action, step_type).
        actor_optimizer: The default optimizer to use for the actor
            network.
        critic_optimizer: The default optimizer to use for the critic
            network.
        exploration_noise_std: Scale factor on exploration policy noise.
        critic_network_2: (Optional.) A `tf_agents.network.Network` to be
            used as the second critic network during Q learning. When not
            provided, it is created via
            `critic_network.copy(name='CriticNetwork2')`.
        target_actor_network: (Optional.) A `tf_agents.network.Network` to
            be used as the target actor network during Q learning. Every
            `target_update_period` train steps, the weights from
            `actor_network` are copied (possibly with smoothing via
            `target_update_tau`) to `target_actor_network`.

            If `target_actor_network` is not provided, it is created by
            copying `actor_network`, which initializes a new network with
            the same structure and its own layers and weights.

            `Network.copy` does not work when the network instance already
            has trainable parameters (e.g., it has already been built, or
            it shares layers with another network). In those cases it is
            up to you to build a copy whose weights are not shared with the
            original `actor_network`, so that it can serve as a target
            network. If you provide a `target_actor_network` that shares
            any weights with `actor_network`, a warning is logged but no
            exception is thrown.
        target_critic_network: (Optional.) Same as `target_actor_network`,
            but for `critic_network`. See the documentation of
            `target_actor_network`.
        target_critic_network_2: (Optional.) Same as
            `target_actor_network`, but for `critic_network_2`. Only used
            if `critic_network_2` is also specified.
        target_update_tau: Factor for soft update of the target networks.
        target_update_period: Period for soft update of the target
            networks.
        actor_update_period: Period for the optimization step on the actor
            network.
        td_errors_loss_fn: A function for computing the TD errors loss. If
            None (or otherwise falsy), `common.element_wise_huber_loss` is
            used.
        gamma: A discount factor for future rewards.
        reward_scale_factor: Multiplicative scale for the reward.
        target_policy_noise: Scale factor on target action noise.
        target_policy_noise_clip: Value to clip noise.
        gradient_clipping: Norm length to clip gradients.
        debug_summaries: A bool to gather debug summaries.
        summarize_grads_and_vars: If True, gradient and network variable
            summaries will be written during training.
        train_step_counter: An optional counter to increment every time the
            train op is run. Defaults to the global_step.
        name: The name of this agent. All variables in this module will
            fall under that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)

    # Actor and its target.
    self._actor_network = actor_network
    actor_network.create_variables()
    if target_actor_network:
        target_actor_network.create_variables()
    self._target_actor_network = (
        common.maybe_copy_target_network_with_checks(
            self._actor_network,
            target_actor_network,
            'TargetActorNetwork'))

    # First critic and its target.
    self._critic_network_1 = critic_network
    critic_network.create_variables()
    if target_critic_network:
        target_critic_network.create_variables()
    self._target_critic_network_1 = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network_1,
            target_critic_network,
            'TargetCriticNetwork1'))

    # Second critic and its target.
    if critic_network_2 is None:
        critic_network_2 = critic_network.copy(name='CriticNetwork2')
        # Do not use target_critic_network_2 if critic_network_2 is None.
        target_critic_network_2 = None
    self._critic_network_2 = critic_network_2
    self._critic_network_2.create_variables()
    if target_critic_network_2:
        target_critic_network_2.create_variables()
    self._target_critic_network_2 = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network_2,
            target_critic_network_2,
            'TargetCriticNetwork2'))

    if not td_errors_loss_fn:
        td_errors_loss_fn = common.element_wise_huber_loss

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._exploration_noise_std = exploration_noise_std
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._actor_update_period = actor_update_period
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_policy_noise = target_policy_noise
    self._target_policy_noise_clip = target_policy_noise_clip
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(
        target_update_tau, target_update_period)

    # Both policies wrap the same actor network; only `clip` differs.
    shared_policy_kwargs = dict(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network)
    policy = actor_policy.ActorPolicy(clip=True, **shared_policy_kwargs)
    unclipped_policy = actor_policy.ActorPolicy(
        clip=False, **shared_policy_kwargs)
    # Clipping for collection is applied by the Gaussian wrapper instead.
    collect_policy = gaussian_policy.GaussianPolicy(
        unclipped_policy,
        scale=self._exploration_noise_std,
        clip=True)

    # An empty actor state spec gives sequence length 2, otherwise None.
    train_sequence_length = (
        None if self._actor_network.state_spec else 2)

    super(Td3Agent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False)

    # The time dimension is squeezed only for length-2 sequences.
    self._as_transition = data_converter.AsTransition(
        self.data_context,
        squeeze_time_dim=(train_sequence_length == 2))
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             q_network: network.Network,
             sampler: cem_actions_sampler.ActionsSampler,
             init_mean: types.NestedArray,
             init_var: types.NestedArray,
             actor_policy: Optional[tf_policy.TFPolicy] = None,
             minimal_var: float = 0.0001,
             info_spec: types.NestedSpecTensorOrArray = (),
             num_samples: int = 32,
             num_elites: int = 4,
             num_iterations: int = 32,
             emit_log_probability: bool = False,
             preprocess_state_action: bool = True,
             training: bool = False,
             weights: types.NestedTensorOrArray = None,
             name: Optional[str] = None):
    """Builds a CEM-Policy given a network and a sampler.

    Args:
        time_step_spec: A `TimeStep` spec of the expected time_steps.
        action_spec: A nest of BoundedTensorSpec representing the actions.
        q_network: An instance of a `tf_agents.network.Network`, callable
            via `network(observation, step_type) -> (output, final_state)`.
            When it is falsy, the policy state spec is `()` and no output
            check is run.
        sampler: Samples the actions needed for the CEM.
        init_mean: A list, tuple or scalar representing the initial mean
            for actions.
        init_var: A list, tuple or scalar representing the initial variance
            for actions.
        actor_policy: Optional actor policy.
        minimal_var: Minimal variance, to prevent the CEM distribution from
            collapsing.
        info_spec: A policy info spec.
        num_samples: Number of samples to draw each round.
        num_elites: Number of best actions each round used to refit the
            distribution.
        num_iterations: Number of iterations to run the CEM loop.
        emit_log_probability: Whether to emit log-probs in the info of
            `PolicyStep`.
        preprocess_state_action: The shape of state is (B, ...) and the
            shape of action is (B, N, A). When enabled, the state is
            tile_batched to (BxN, ...) and the action is reshaped to
            (BxN, A). When disabled, the same operation has to be done
            inside the network; this helps when the input has large memory
            requirements and the state replication can happen after a few
            layers inside the network.
        training: Whether it is in training mode or inference mode.
        weights: A nested structure of weights with the same structure as
            action.
        name: The name of this policy. All variables in this module will
            fall under that name. Defaults to the class name.

    Raises:
        ValueError: If `q_network.action_spec` exists and is not compatible
            with `action_spec`.
    """
    network_action_spec = getattr(q_network, 'action_spec', None)
    # Networks that do not expose an action spec skip this check.
    if (network_action_spec is not None and
            not action_spec.is_compatible_with(network_action_spec)):
        raise ValueError(
            'action_spec must be compatible with q_network.action_spec; '
            'instead got action_spec=%s, q_network.action_spec=%s' %
            (action_spec, network_action_spec))

    policy_state_spec = ()
    if q_network:
        # The check is given an empty expected output shape.
        network_utils.check_single_floating_network_output(
            q_network.create_variables(),
            expected_output_shape=(),
            label=str(q_network))
        policy_state_spec = q_network.state_spec

    self._q_network = q_network
    self._actor_policy = actor_policy
    self._actions_sampler = sampler
    self._observation_spec = time_step_spec.observation

    # Parameters of the CEM distribution and loop.
    self._init_mean = init_mean
    self._init_var = init_var
    self._minimal_var = minimal_var
    self._num_samples = num_samples  # N
    self._num_elites = num_elites  # M
    self._num_iterations = num_iterations

    self._training = training
    self._preprocess_state_action = preprocess_state_action
    self._weights = weights

    super(CEMPolicy, self).__init__(
        time_step_spec,
        action_spec,
        info_spec=info_spec,
        policy_state_spec=policy_state_spec,
        clip=False,
        emit_log_probability=emit_log_probability,
        name=name)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             critic_network: network.Network,
             actor_network: network.Network,
             actor_optimizer: types.Optimizer,
             critic_optimizer: types.Optimizer,
             alpha_optimizer: types.Optimizer,
             actor_loss_weight: types.Float = 1.0,
             critic_loss_weight: types.Float = 0.5,
             alpha_loss_weight: types.Float = 1.0,
             actor_policy_ctor: Callable[
                 ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy,
             critic_network_2: Optional[network.Network] = None,
             target_critic_network: Optional[network.Network] = None,
             target_critic_network_2: Optional[network.Network] = None,
             target_update_tau: types.Float = 1.0,
             target_update_period: types.Int = 1,
             td_errors_loss_fn: types.LossFn = tf.math.squared_difference,
             gamma: types.Float = 1.0,
             reward_scale_factor: types.Float = 1.0,
             initial_log_alpha: types.Float = 0.0,
             use_log_alpha_in_alpha_loss: bool = True,
             target_entropy: Optional[types.Float] = None,
             gradient_clipping: Optional[types.Float] = None,
             debug_summaries: bool = False,
             summarize_grads_and_vars: bool = False,
             train_step_counter: Optional[tf.Variable] = None,
             name: Optional[Text] = None):
    """Creates a SAC Agent.

    Args:
        time_step_spec: A `TimeStep` spec of the expected time_steps.
        action_spec: A nest of BoundedTensorSpec representing the actions.
        critic_network: A function critic_network((observations, actions))
            that returns the q_values for each observation and action.
        actor_network: A function actor_network(observation, action_spec)
            that returns an action distribution.
        actor_optimizer: The optimizer to use for the actor network.
        critic_optimizer: The default optimizer to use for the critic
            network.
        alpha_optimizer: The default optimizer to use for the alpha
            variable.
        actor_loss_weight: The weight on actor loss.
        critic_loss_weight: The weight on critic loss.
        alpha_loss_weight: The weight on alpha loss.
        actor_policy_ctor: The policy class to use. It is called twice,
            once with `training=False` and once with `training=True`.
        critic_network_2: (Optional.) A `tf_agents.network.Network` to be
            used as the second critic network during Q learning. When not
            provided, it is created via
            `critic_network.copy(name='CriticNetwork2')`.
        target_critic_network: (Optional.) A `tf_agents.network.Network` to
            be used as the target critic network during Q learning. Every
            `target_update_period` train steps, the weights from
            `critic_network` are copied (possibly with smoothing via
            `target_update_tau`) to `target_critic_network`.

            If `target_critic_network` is not provided, it is created by
            copying `critic_network`, which initializes a new network with
            the same structure and its own layers and weights.

            `Network.copy` does not work when the network instance already
            has trainable parameters (e.g., it has already been built, or
            it shares layers with another network). In those cases it is
            up to you to build a copy whose weights are not shared with the
            original `critic_network`, so that it can serve as a target
            network. If you provide a `target_critic_network` that shares
            any weights with `critic_network`, a warning is logged but no
            exception is thrown.
        target_critic_network_2: (Optional.) Same as
            `target_critic_network`, but for `critic_network_2`. Only used
            if `critic_network_2` is also specified.
        target_update_tau: Factor for soft update of the target networks.
        target_update_period: Period for soft update of the target
            networks.
        td_errors_loss_fn: A function for computing the elementwise TD
            errors loss.
        gamma: A discount factor for future rewards.
        reward_scale_factor: Multiplicative scale for the reward.
        initial_log_alpha: Initial value for log_alpha.
        use_log_alpha_in_alpha_loss: Whether to use log_alpha or alpha in
            the alpha loss. Certain implementations of SAC use log_alpha,
            as log values are generally nicer to work with.
        target_entropy: The target average policy entropy, for updating
            alpha. When None, it is obtained from
            `self._get_default_target_entropy(action_spec)`; the default
            value is the negative of the total number of actions.
        gradient_clipping: Norm length to clip gradients.
        debug_summaries: A bool to gather debug summaries.
        summarize_grads_and_vars: If True, gradient and network variable
            summaries will be written during training.
        train_step_counter: An optional counter to increment every time the
            train op is run. Defaults to the global_step.
        name: The name of this agent. All variables in this module will
            fall under that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)

    self._check_action_spec(action_spec)

    # Every critic is built from an (observation, action) spec pair.
    critic_input_spec = (time_step_spec.observation, action_spec)

    # First critic and its target.
    self._critic_network_1 = critic_network
    self._critic_network_1.create_variables(critic_input_spec)
    if target_critic_network:
        target_critic_network.create_variables(critic_input_spec)
    self._target_critic_network_1 = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network_1,
            target_critic_network,
            'TargetCriticNetwork1'))

    # Second critic and its target.
    if critic_network_2 is None:
        critic_network_2 = critic_network.copy(name='CriticNetwork2')
        # Do not use target_critic_network_2 if critic_network_2 is None.
        target_critic_network_2 = None
    self._critic_network_2 = critic_network_2
    self._critic_network_2.create_variables(critic_input_spec)
    if target_critic_network_2:
        target_critic_network_2.create_variables(critic_input_spec)
    self._target_critic_network_2 = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network_2,
            target_critic_network_2,
            'TargetCriticNetwork2'))

    if actor_network:
        actor_network.create_variables(time_step_spec.observation)
    self._actor_network = actor_network

    # Two policies over the same actor network; only `training` differs.
    shared_policy_kwargs = dict(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network)
    policy = actor_policy_ctor(training=False, **shared_policy_kwargs)
    self._train_policy = actor_policy_ctor(
        training=True, **shared_policy_kwargs)

    self._log_alpha = common.create_variable(
        'initial_log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)

    if target_entropy is None:
        target_entropy = self._get_default_target_entropy(action_spec)

    self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    self._actor_loss_weight = actor_loss_weight
    self._critic_loss_weight = critic_loss_weight
    self._alpha_loss_weight = alpha_loss_weight
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_entropy = target_entropy
    self._gradient_clipping = gradient_clipping
    self._debug_summaries = debug_summaries
    self._summarize_grads_and_vars = summarize_grads_and_vars
    self._update_target = self._get_target_updater(
        tau=self._target_update_tau,
        period=self._target_update_period)

    # An empty critic state spec gives sequence length 2, otherwise None.
    train_sequence_length = None if critic_network.state_spec else 2

    # The same policy object serves as both policy and collect policy.
    super(SacAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False)

    # The time dimension is squeezed only for length-2 sequences.
    self._as_transition = data_converter.AsTransition(
        self.data_context,
        squeeze_time_dim=(train_sequence_length == 2))
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             q_network: network.Network,
             min_q_value: float,
             max_q_value: float,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             temperature: types.Float = 1.0):
    """Creates a Q-policy backed by a categorical Q-network.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected time_steps.
      action_spec: A `BoundedTensorSpec` describing the actions. Its minimum
        must be 0.
      q_network: The `network.Network` queried by this policy. It must expose
        a `num_atoms` property.
      min_q_value: Lower end of the value support (a float).
      max_q_value: Upper end of the value support (a float).
      observation_and_action_constraint_splitter: Optional function for
        handling observations that carry action constraints (for example a
        mask of valid/invalid actions in the current state). It receives the
        full observation and returns a tuple of 1) the part of the
        observation that is fed to the network and 2) the constraint. A
        minimal example:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: the splitter is applied to the observation before the network
        is called, so `q_network` must accept the network-specific half of
        the splitter output. When this argument is None no action
        constraints are applied.
      temperature: Sampling temperature; values close to 0.0 behave like
        arg_max.

    Raises:
      ValueError: If `q_network` exposes an `action_spec` that `action_spec`
        is not compatible with, if `action_spec.minimum` is not 0, or if
        `q_network` has no `num_atoms` property.
      TypeError: If `action_spec` is not a `BoundedTensorSpec`.
    """
    # Networks that declare their own action spec must agree with ours.
    q_net_action_spec = getattr(q_network, 'action_spec', None)
    if q_net_action_spec is not None:
        action_spec = cast(tf.TypeSpec, action_spec)
        compatible = action_spec.is_compatible_with(q_net_action_spec)
        if not compatible:
            mismatch_template = (
                'action_spec must be compatible with q_network.action_spec; '
                'instead got action_spec=%s, q_network.action_spec=%s')
            raise ValueError(mismatch_template %
                             (action_spec, q_net_action_spec))

    if not isinstance(action_spec, tensor_spec.BoundedTensorSpec):
        raise TypeError('action_spec must be a BoundedTensorSpec. Got: %s' %
                        (action_spec, ))
    action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)

    if action_spec.minimum != 0:
        offset_template = (
            'Action specs should have minimum of 0, but saw: {0}. If collecting '
            'from a python environment, consider using '
            'tf_agents.environments.wrappers.ActionOffsetWrapper.')
        raise ValueError(offset_template.format(action_spec))

    action_count = action_spec.maximum - action_spec.minimum + 1

    try:
        atom_count = q_network.num_atoms
    except AttributeError:
        raise ValueError(
            'Expected q_network to have property `num_atoms`, but '
            'it doesn\'t. (Note: you likely want to use a '
            'CategoricalQNetwork.) Network is: %s' % q_network)
    self._num_atoms = atom_count

    # The network must emit a single floating tensor shaped
    # (number of actions, number of atoms).
    q_output_spec = q_network.create_variables()
    network_utils.check_single_floating_network_output(
        q_output_spec, (action_count, atom_count), str(q_network))

    super(CategoricalQPolicy, self).__init__(
        time_step_spec,
        action_spec,
        policy_state_spec=q_network.state_spec,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    self._temperature = tf.convert_to_tensor(temperature, dtype=tf.float32)
    self._q_network = q_network

    # Build the support with numpy first so it can be stored as a constant
    # rather than as a tensor-valued property.
    atom_values = np.linspace(
        min_q_value, max_q_value, self._num_atoms, dtype=np.float32)
    self._support = tf.constant(atom_values, dtype=tf.float32)
    self._action_dtype = action_spec.dtype
def __init__(
        self,
        time_step_spec: ts.TimeStep,
        action_spec: types.NestedTensorSpec,
        q_network: network.Network,
        optimizer: types.Optimizer,
        observation_and_action_constraint_splitter: Optional[
            types.Splitter] = None,
        epsilon_greedy: types.Float = 0.1,
        n_step_update: int = 1,
        boltzmann_temperature: Optional[types.Int] = None,
        emit_log_probability: bool = False,
        # Params for target network updates
        target_q_network: Optional[network.Network] = None,
        target_update_tau: types.Float = 1.0,
        target_update_period: int = 1,
        # Params for training.
        td_errors_loss_fn: Optional[types.LossFn] = None,
        gamma: types.Float = 1.0,
        reward_scale_factor: types.Float = 1.0,
        gradient_clipping: Optional[types.Float] = None,
        # Params for debugging
        debug_summaries: bool = False,
        summarize_grads_and_vars: bool = False,
        train_step_counter: Optional[tf.Variable] = None,
        name: Optional[Text] = None,
        entropy_tau: types.Float = 0.9,
        alpha: types.Float = 0.3):
    """Creates the agent, its target network and its policies.

    This constructor performs the DQN set-up itself and then calls the
    initializer that follows `dqn_agent.DqnAgent` in the MRO, i.e. it does not
    run `DqnAgent.__init__`.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: Spec of the actions; validated by `_check_action_spec`.
      q_network: The Q-network. Its variables are created from the observation
        spec (after the splitter, if any, has been applied).
      optimizer: Optimizer stored for training.
      observation_and_action_constraint_splitter: Optional callable that maps
        the observation spec to `(network_input_spec, constraint_spec)`.
      epsilon_greedy: Exploration epsilon. Must be None when
        `boltzmann_temperature` is given.
      n_step_update: Number of steps per update. Must be 1 when `q_network`
        has a non-empty `state_spec`.
      boltzmann_temperature: Exploration temperature. Must be None when
        `epsilon_greedy` is given.
      emit_log_probability: Forwarded to `_setup_policy`.
      target_q_network: Optional target network; when given its variables are
        created, and it is passed to
        `common.maybe_copy_target_network_with_checks`.
      target_update_tau: Forwarded to `_get_target_updater`.
      target_update_period: Forwarded to `_get_target_updater`.
      td_errors_loss_fn: TD-error loss; a falsy value selects
        `common.element_wise_huber_loss`.
      gamma: Discount factor; also handed to the n-step transition converter.
      reward_scale_factor: Stored for training.
      gradient_clipping: Stored for training.
      debug_summaries: Forwarded to the parent initializer.
      summarize_grads_and_vars: Forwarded to the parent initializer.
      train_step_counter: Forwarded to the parent initializer.
      name: Name of the `tf.Module`.
      entropy_tau: Stored as `self.entropy_tau`; it is not used in this
        constructor (NOTE(review): presumably consumed by the loss - confirm).
      alpha: Stored as `self.alpha`; it is not used in this constructor
        (NOTE(review): presumably consumed by the loss - confirm).

    Raises:
      ValueError: If both `epsilon_greedy` and `boltzmann_temperature` are
        not None.
      NotImplementedError: If `q_network` is stateful and
        `n_step_update != 1`.
    """
    tf.Module.__init__(self, name=name)

    self._check_action_spec(action_spec)

    # Only one exploration mechanism may be configured at a time.
    both_configured = (epsilon_greedy is not None
                       and boltzmann_temperature is not None)
    if both_configured:
        exploration_message = (
            'Configured both epsilon_greedy value {} and temperature {}, '
            'however only one of them can be used for exploration.')
        raise ValueError(
            exploration_message.format(epsilon_greedy, boltzmann_temperature))

    splitter = observation_and_action_constraint_splitter
    self._observation_and_action_constraint_splitter = splitter
    self._q_network = q_network

    # The network only sees the network-specific half of the observation.
    network_input_spec = time_step_spec.observation
    if splitter:
        network_input_spec, _ = splitter(network_input_spec)

    q_network.create_variables(network_input_spec)
    if target_q_network:
        target_q_network.create_variables(network_input_spec)
    self._target_q_network = common.maybe_copy_target_network_with_checks(
        self._q_network,
        target_q_network,
        input_spec=network_input_spec,
        name='TargetQNetwork')

    self._check_network_output(self._q_network, 'q_network')
    self._check_network_output(self._target_q_network, 'target_q_network')

    self._epsilon_greedy = epsilon_greedy
    self._n_step_update = n_step_update
    self._boltzmann_temperature = boltzmann_temperature
    self._optimizer = optimizer
    if td_errors_loss_fn:
        self._td_errors_loss_fn = td_errors_loss_fn
    else:
        self._td_errors_loss_fn = common.element_wise_huber_loss
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(target_update_tau,
                                                   target_update_period)
    self.entropy_tau = entropy_tau
    self.alpha = alpha

    policy, collect_policy = self._setup_policy(
        time_step_spec, action_spec, boltzmann_temperature,
        emit_log_probability)

    uses_network_state = bool(q_network.state_spec)
    if uses_network_state and n_step_update != 1:
        raise NotImplementedError(
            'DqnAgent does not currently support n-step updates with stateful '
            'networks (i.e., RNNs), but n_step_update = {}'.format(
                n_step_update))

    if uses_network_state:
        train_sequence_length = None
    else:
        train_sequence_length = n_step_update + 1

    super(dqn_agent.DqnAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False,
    )

    if uses_network_state:
        # AsNStepTransition cannot emit [B, T, ...] tensors, which the
        # recurrent case needs, so the plain transition converter is used.
        converter = data_converter.AsTransition(
            self.data_context, squeeze_time_dim=False)
    else:
        # Collapsing the n-step return here drops the extra time dimension,
        # so downstream computations do not depend on the n-step parameter.
        converter = data_converter.AsNStepTransition(
            self.data_context, gamma=gamma, n=n_step_update)
    self._as_transition = converter
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             actor_network: network.Network,
             value_network: network.Network,
             observation_normalizer: Optional[
                 tensor_normalizer.TensorNormalizer] = None,
             clip: bool = True,
             collect: bool = True,
             compute_value_and_advantage_in_train: bool = False):
    """Builds a PPO Policy given network Templates or functions.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: An instance of a tf_agents.networks.network.Network, with
        call(observation, step_type, network_state). Network should return one
        of the following: 1. a nested tuple of tfp.distributions objects
        matching action_spec, or 2. a nested tuple of tf.Tensors representing
        actions.
      value_network: An instance of a tf_agents.networks.network.Network, with
        call(observation, step_type, network_state). Network should return
        value predictions for the input state.
      observation_normalizer: An object to use for observation normalization.
      clip: Whether to clip actions to spec before returning them. Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      collect: If True, creates ops for actions_log_prob, value_preds, and
        action_distribution_params. (default True)
      compute_value_and_advantage_in_train: A bool to indicate where value
        prediction and advantage calculation happen. If True, both happen in
        agent.train(), therefore no need to save the value prediction inside
        of policy info. If False, value prediction is computed during data
        collection. This argument must be set to `False` if mini batch
        learning is enabled.

    Raises:
      TypeError: if `actor_network` or `value_network` is not of type
        `tf_agents.networks.Network`.
      ValueError: if `actor_network` or `value_network` do not emit valid
        outputs.
    """
    if not isinstance(actor_network, network.Network):
        raise TypeError('actor_network is not of type network.Network')
    if not isinstance(value_network, network.Network):
        raise TypeError('value_network is not of type network.Network')

    # Both networks get their variables created here, from the observation
    # spec; the returned output specs are validated right away.
    actor_output_spec = actor_network.create_variables(
        time_step_spec.observation)
    value_output_spec = value_network.create_variables(
        time_step_spec.observation)

    nest_utils.assert_value_spec(value_output_spec, 'value_network')

    distribution_utils.assert_specs_are_compatible(
        actor_output_spec, action_spec,
        'actor_network output spec does not match action spec')

    self._compute_value_and_advantage_in_train = (
        compute_value_and_advantage_in_train)

    if collect:
        # TODO(oars): Cleanup how we handle non distribution networks.
        if isinstance(actor_network, network.DistributionNetwork):
            network_output_spec = actor_network.output_spec
        else:
            # Non-distribution actors are described through deterministic
            # distribution specs derived from the action spec.
            network_output_spec = tf.nest.map_structure(
                distribution_spec.deterministic_distribution_from_spec,
                action_spec)
        info_spec = {
            'dist_params':
                tf.nest.map_structure(lambda spec: spec.input_params_spec,
                                      network_output_spec)
        }

        # The value prediction is only recorded in the policy info when it is
        # not recomputed during training.
        if not self._compute_value_and_advantage_in_train:
            info_spec['value_prediction'] = tensor_spec.TensorSpec(
                shape=[], dtype=tf.float32)
    else:
        info_spec = ()

    policy_state_spec = {}
    if actor_network.state_spec:
        policy_state_spec['actor_network_state'] = actor_network.state_spec
    if (collect and value_network.state_spec
            and not self._compute_value_and_advantage_in_train):
        policy_state_spec['value_network_state'] = value_network.state_spec
    if not policy_state_spec:
        # An empty dict is replaced by the empty-tuple "no state" spec.
        policy_state_spec = ()

    super(PPOPolicy, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        policy_state_spec=policy_state_spec,
        info_spec=info_spec,
        actor_network=actor_network,
        observation_normalizer=observation_normalizer,
        clip=clip)

    self._collect = collect
    # `value_network` is guaranteed to be a Network (checked above) and its
    # variables were already created from the observation spec, so no second
    # `create_variables()` call is needed before storing it.
    self._value_network = value_network
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             actor_network: network.Network,
             value_network: network.Network,
             observation_normalizer: Optional[
                 tensor_normalizer.TensorNormalizer] = None,
             clip: bool = True,
             collect: bool = True,
             compute_value_and_advantage_in_train: bool = False):
    """Creates a PPO policy from an actor network and a value network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec describing the actions.
      actor_network: A tf_agents.networks.network.Network invoked as
        call(observation, step_type, network_state). It should return either
        1. a nested tuple of tfp.distributions objects matching action_spec,
        or 2. a nested tuple of tf.Tensors representing actions.
      value_network: A tf_agents.networks.network.Network invoked as
        call(observation, step_type, network_state) that returns value
        predictions for the input state.
      observation_normalizer: Object used to normalize observations.
      clip: Whether actions are clipped to the spec before being returned
        (default True). Most policy-based algorithms (PCL, PPO, REINFORCE)
        train on unclipped continuous actions.
      collect: When True (the default), ops for actions_log_prob,
        value_preds and action_distribution_params are created.
      compute_value_and_advantage_in_train: Selects where value prediction
        and advantage computation take place. True: both happen in
        agent.train(), so the value prediction is not stored in the policy
        info. False: the value prediction is computed while collecting data.
        Must be `False` when mini batch learning is enabled.

    Raises:
      TypeError: if `actor_network` or `value_network` is not of type
        `tf_agents.networks.Network`.
      ValueError: if `actor_network` or `value_network` do not emit valid
        outputs. For example, `actor_network` must either be a (legacy
        style) `DistributionNetwork`, or explicitly emit a nest of
        `tfp.distribution.Distribution` objects.
    """
    if not isinstance(actor_network, network.Network):
        raise TypeError('actor_network is not of type network.Network')
    if not isinstance(value_network, network.Network):
        raise TypeError('value_network is not of type network.Network')

    observation_spec = time_step_spec.observation
    actor_output_spec = actor_network.create_variables(observation_spec)
    value_output_spec = value_network.create_variables(observation_spec)

    nest_utils.assert_value_spec(value_output_spec, 'value_network')
    distribution_utils.assert_specs_are_compatible(
        actor_output_spec, action_spec,
        'actor_network output spec does not match action spec')

    self._compute_value_and_advantage_in_train = (
        compute_value_and_advantage_in_train)

    def _legacy_dist_params(spec):
        # Legacy distribution specs expose their parameter spec directly.
        return spec.input_params_spec

    def _nested_dist_params(spec):
        # New-style networks must describe their outputs as distributions.
        if not isinstance(spec, distribution_utils.DistributionSpecV2):
            raise ValueError(
                'Unexpected output from `actor_network`. Expected '
                '`Distribution` objects, but saw output spec: {}'.format(
                    actor_output_spec))
        return distribution_utils.parameters_to_dict(
            spec.parameters, tensors_only=True)

    info_spec = ()
    if collect:
        if isinstance(actor_network, network.DistributionNetwork):
            # Legacy DistributionNetwork path. Newer code simply supplies a
            # regular Network that emits Distribution objects, which is
            # handled by the DistributionSpecV2 path below.
            dist_params = tf.nest.map_structure(_legacy_dist_params,
                                                actor_network.output_spec)
        else:
            # The network emits a nest of distributions.
            dist_params = tf.nest.map_structure(_nested_dist_params,
                                                actor_output_spec)
        info_spec = {'dist_params': dist_params}

        if not self._compute_value_and_advantage_in_train:
            info_spec['value_prediction'] = tensor_spec.TensorSpec(
                shape=[], dtype=tf.float32)

    state_entries = {}
    if actor_network.state_spec:
        state_entries['actor_network_state'] = actor_network.state_spec
    keep_value_state = (collect and value_network.state_spec
                        and not self._compute_value_and_advantage_in_train)
    if keep_value_state:
        state_entries['value_network_state'] = value_network.state_spec
    # With nothing to track, the spec is the empty tuple instead of a dict.
    policy_state_spec = state_entries or ()

    super(PPOPolicy, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        policy_state_spec=policy_state_spec,
        info_spec=info_spec,
        actor_network=actor_network,
        observation_normalizer=observation_normalizer,
        clip=clip)

    self._collect = collect
    self._value_network = value_network
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             actor_network: network.Network,
             policy_state_spec: types.NestedTensorSpec = (),
             info_spec: types.NestedTensorSpec = (),
             observation_normalizer: Optional[
                 tensor_normalizer.TensorNormalizer] = None,
             clip: bool = True,
             training: bool = False,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             name: Optional[Text] = None):
    """Creates a policy that acts through an actor network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` describing the actions.
      actor_network: The `tf_agents.networks.network.Network` driving the
        policy. It is invoked as `call(observation, step_type, policy_state)`
        and should return `(actions_or_distributions, new_state)`.
      policy_state_spec: A nest of TensorSpec describing the policy_state.
        When left empty, `actor_network.state_spec` is used.
      info_spec: A nest of `TensorSpec` describing the policy info.
      observation_normalizer: Object used to normalize observations.
      clip: Whether actions are clipped to the spec before being returned
        (default True). Most policy-based algorithms (PCL, PPO, REINFORCE)
        train on unclipped continuous actions.
      training: Whether the network is to be called in training mode.
      observation_and_action_constraint_splitter: Optional function for
        handling observations that carry action constraints (for example a
        mask of valid/invalid actions in the current state). It receives the
        full observation and returns a tuple of 1) the part of the
        observation that is fed to the network and 2) the constraint. A
        minimal example:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: the splitter is applied to the observation before the network
        is called, so `actor_network` must accept the network-specific half
        of the splitter output. When this argument is None no action
        constraints are applied.
      name: Name of this policy; all variables in the module are placed under
        it. Defaults to the class name.

    Raises:
      ValueError: if `actor_network` is not of type `network.Network`.
      NotImplementedError: if `observation_and_action_constraint_splitter` is
        not None but `action_spec` is not discrete.
    """
    if not isinstance(actor_network, network.Network):
        found_type = type(actor_network)
        raise ValueError('actor_network must be a network.Network. Found '
                         '{}.'.format(found_type))

    actor_network.create_variables()
    self._actor_network = actor_network
    self._observation_normalizer = observation_normalizer
    self._training = training

    if observation_and_action_constraint_splitter is not None:
        # Constraints are limited to exactly one discrete action spec.
        is_single_spec = len(tf.nest.flatten(action_spec)) <= 1
        if not (is_single_spec and tensor_spec.is_discrete(action_spec)):
            raise NotImplementedError(
                'Action constraints for ActorPolicy are currently only supported '
                'for a single spec of discrete actions. Got action_spec {}'
                .format(action_spec))

    # Fall back to the network's own state spec when none was supplied.
    policy_state_spec = policy_state_spec or actor_network.state_spec

    super(ActorPolicy, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        policy_state_spec=policy_state_spec,
        info_spec=info_spec,
        clip=clip,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        name=name)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             critic_network: network.Network,
             actor_network: network.Network,
             actor_optimizer: types.Optimizer,
             critic_optimizer: types.Optimizer,
             alpha_optimizer: types.Optimizer,
             actor_loss_weight: types.Float = 1.0,
             critic_loss_weight: types.Float = 0.5,
             alpha_loss_weight: types.Float = 1.0,
             actor_policy_ctor: Callable[
                 ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy,
             critic_network_2: Optional[network.Network] = None,
             target_critic_network: Optional[network.Network] = None,
             target_critic_network_2: Optional[network.Network] = None,
             target_update_tau: types.Float = 1.0,
             target_update_period: types.Int = 1,
             td_errors_loss_fn: types.LossFn = tf.math.squared_difference,
             gamma: types.Float = 1.0,
             sigma: types.Float = 0.9,
             reward_scale_factor: types.Float = 1.0,
             initial_log_alpha: types.Float = 0.0,
             use_log_alpha_in_alpha_loss: bool = True,
             target_entropy: Optional[types.Float] = None,
             gradient_clipping: Optional[types.Float] = None,
             debug_summaries: bool = False,
             summarize_grads_and_vars: bool = False,
             train_step_counter: Optional[tf.Variable] = None,
             name: Optional[Text] = None):
    """Creates the agent, its critic/target networks and its policies.

    The SAC set-up is done in this constructor, which then calls the
    initializer that follows `sac_agent.SacAgent` in the MRO (so
    `SacAgent.__init__` itself is not run).

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: Spec of the actions; validated by `_check_action_spec`.
      critic_network: First critic. Its variables are created from
        `(observation_spec, action_spec)`.
      actor_network: Actor used by both the collect/eval policy and the
        training policy.
      actor_optimizer: Optimizer stored for the actor.
      critic_optimizer: Optimizer stored for the critics.
      alpha_optimizer: Optimizer stored for alpha.
      actor_loss_weight: Stored weight for the actor loss.
      critic_loss_weight: Stored weight for the critic loss.
      alpha_loss_weight: Stored weight for the alpha loss.
      actor_policy_ctor: Callable used to build the two policies; called with
        `time_step_spec`, `action_spec`, `actor_network` and `training`.
      critic_network_2: Optional second critic. When None, a copy of
        `critic_network` named 'CriticNetwork2' is used and
        `target_critic_network_2` is ignored.
      target_critic_network: Optional target for the first critic.
      target_critic_network_2: Optional target for the second critic.
      target_update_tau: Passed as `tau` to `_get_target_updater`.
      target_update_period: Passed as `period` to `_get_target_updater`.
      td_errors_loss_fn: Stored TD-error loss function.
      gamma: Stored discount factor.
      sigma: Stored as `self.sigma`; it is not used in this constructor
        (NOTE(review): meaning not visible here - confirm against the loss).
      reward_scale_factor: Stored reward multiplier.
      initial_log_alpha: Initial value of the trainable `log_alpha` variable.
      use_log_alpha_in_alpha_loss: Stored flag.
      target_entropy: Target entropy; when None,
        `_get_default_target_entropy(action_spec)` supplies it.
      gradient_clipping: Stored value.
      debug_summaries: Stored and forwarded to the parent initializer.
      summarize_grads_and_vars: Stored and forwarded to the parent
        initializer.
      train_step_counter: Forwarded to the parent initializer.
      name: Name of the `tf.Module`.
    """
    tf.Module.__init__(self, name=name)

    self._check_action_spec(action_spec)

    net_observation_spec = time_step_spec.observation
    # Critics take the (observation, action) pair as input.
    critic_spec = (net_observation_spec, action_spec)

    self._critic_network_1 = critic_network

    if critic_network_2 is not None:
        self._critic_network_2 = critic_network_2
    else:
        self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
        # Do not use target_critic_network_2 if critic_network_2 is None.
        target_critic_network_2 = None

    # Wait until critic_network_2 has been copied from critic_network_1 before
    # creating variables on both.
    self._critic_network_1.create_variables(critic_spec)
    self._critic_network_2.create_variables(critic_spec)

    if target_critic_network:
        target_critic_network.create_variables(critic_spec)

    self._target_critic_network_1 = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network_1,
            target_critic_network,
            input_spec=critic_spec,
            name='TargetCriticNetwork1'))

    if target_critic_network_2:
        target_critic_network_2.create_variables(critic_spec)

    self._target_critic_network_2 = (
        common.maybe_copy_target_network_with_checks(
            self._critic_network_2,
            target_critic_network_2,
            input_spec=critic_spec,
            name='TargetCriticNetwork2'))

    if actor_network:
        actor_network.create_variables(net_observation_spec)
    self._actor_network = actor_network

    # Two policies share the same actor: one built with training=False
    # (handed to the parent as both policy and collect policy) and one built
    # with training=True that is kept on the agent.
    policy = actor_policy_ctor(time_step_spec=time_step_spec,
                               action_spec=action_spec,
                               actor_network=self._actor_network,
                               training=False)

    self._train_policy = actor_policy_ctor(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        training=True)

    self._log_alpha = common.create_variable(
        'initial_log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)

    if target_entropy is None:
        target_entropy = self._get_default_target_entropy(action_spec)

    self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    self._actor_loss_weight = actor_loss_weight
    self._critic_loss_weight = critic_loss_weight
    self._alpha_loss_weight = alpha_loss_weight
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_entropy = target_entropy
    self._gradient_clipping = gradient_clipping
    self._debug_summaries = debug_summaries
    self._summarize_grads_and_vars = summarize_grads_and_vars
    self._update_target = self._get_target_updater(
        tau=self._target_update_tau, period=self._target_update_period)
    self.sigma = sigma

    # A critic without state trains on length-2 sequences; a critic with a
    # state spec leaves the sequence length unspecified.
    train_sequence_length = 2 if not critic_network.state_spec else None

    # Deliberately starts the MRO lookup after sac_agent.SacAgent.
    super(sac_agent.SacAgent, self).__init__(time_step_spec,
                                             action_spec,
                                             policy=policy,
                                             collect_policy=policy,
                                             train_sequence_length=train_sequence_length,
                                             debug_summaries=debug_summaries,
                                             summarize_grads_and_vars=summarize_grads_and_vars,
                                             train_step_counter=train_step_counter,
                                             validate_args=False)

    # The time dimension is squeezed only in the length-2 case.
    self._as_transition = data_converter.AsTransition(
        self.data_context, squeeze_time_dim=(train_sequence_length == 2))
def __init__(self,
             output_dim,
             noise_dim=32,
             input_tensor_spec=None,
             hidden_layers=(256, ),
             net: Network = None,
             net_moving_average_rate=None,
             entropy_regularization=0.,
             kernel_sharpness=2.,
             mi_weight=None,
             mi_estimator_cls=MIEstimator,
             optimizer: tf.optimizers.Optimizer = None,
             name="Generator"):
    """Create a Generator.

    Args:
        output_dim (int): size of the generated output
        noise_dim (int): size of the noise vector
        input_tensor_spec (nested TensorSpec): spec of the inputs; None when
            the generator takes no inputs.
        hidden_layers (tuple): sizes of the hidden layers of the default
            network.
        net (Network): network producing outputs from [noise, inputs], or
            from noise alone when there are no inputs. When None, a default
            network using ``hidden_layers`` is built.
        net_moving_average_rate (float): if provided, a moving-average copy
            of ``net`` is kept for prediction. This has been shown to be
            effective for GAN training (arXiv:1907.02544, arXiv:1812.04948).
        entropy_regularization (float): weight of the entropy regularization
        kernel_sharpness (float): only used when entropy_regularization is
            non-zero. The SVGD kernel is computed as
            exp(-kernel_sharpness * reduce_mean((x-y)^2/width)), where width
            is the elementwise moving average of (x-y)^2
        mi_weight (float): if not None, a mutual information estimator is
            created and this value is stored as its weight.
        mi_estimator_cls (type): class of the mutual information estimator
            used for maximizing the mutual information between
            [noise, inputs] and [outputs, inputs].
        optimizer (tf.optimizers.Optimizer): optimizer (optional)
        name (str): name of this generator
    """
    super().__init__(train_state_spec=(), optimizer=optimizer, name=name)

    self._noise_dim = noise_dim
    self._entropy_regularization = entropy_regularization

    if entropy_regularization != 0:
        # Entropy regularization relies on the Stein gradient and its kernel
        # statistics.
        self._grad_func = self._stein_grad
        self._kernel_width_averager = AdaptiveAverager(
            tensor_spec=tf.TensorSpec(shape=(output_dim, )))
        self._kernel_sharpness = kernel_sharpness
    else:
        self._grad_func = self._ml_grad

    has_inputs = input_tensor_spec is not None
    z_spec = tf.TensorSpec(shape=[noise_dim])

    if net is None:
        # The default network consumes the noise, plus the inputs when any.
        default_input_spec = z_spec
        if has_inputs:
            default_input_spec = [z_spec, input_tensor_spec]
        net = EncodingNetwork(
            name="Generator",
            input_tensor_spec=default_input_spec,
            fc_layer_params=hidden_layers,
            last_layer_size=output_dim)

    self._mi_estimator = None
    self._input_tensor_spec = input_tensor_spec
    if mi_weight is not None:
        estimator_x_spec = z_spec
        estimator_y_spec = tf.TensorSpec((output_dim, ))
        if has_inputs:
            estimator_x_spec = [z_spec, input_tensor_spec]
        self._mi_estimator = mi_estimator_cls(
            estimator_x_spec, estimator_y_spec, sampler='shift')
        self._mi_weight = mi_weight

    self._net = net
    self._predict_net = None
    self._net_moving_average_rate = net_moving_average_rate
    if net_moving_average_rate:
        # Start the averaged network as an exact copy (tau=1.0) of ``net``.
        averaged_net = net.copy(name="Genrator_average")
        self._predict_net = averaged_net
        tfa_common.soft_variables_update(
            self._net.variables, self._predict_net.variables, tau=1.0)
def __init__(
        self,
        time_step_spec: ts.TimeStep,
        action_spec: types.NestedTensorSpec,
        cloning_network: network.Network,
        optimizer: types.Optimizer,
        num_outer_dims: Literal[1, 2] = 1,  # pylint: disable=bad-whitespace
        epsilon_greedy: types.Float = 0.1,
        loss_fn: Optional[Callable[[types.NestedTensor, bool],
                                   types.Tensor]] = None,
        gradient_clipping: Optional[types.Float] = None,
        # Params for debugging.
        debug_summaries: bool = False,
        summarize_grads_and_vars: bool = False,
        train_step_counter: Optional[tf.Variable] = None,
        name: Optional[Text] = None):
    """Constructs a Behavioral Cloning agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec describing the actions.
      cloning_network: The `tf_agents.networks.Network` used by the agent. It
        is invoked as
        ```
        network(observation, step_type=step_type, network_state=initial_state)
        ```
        and must return the 2-tuple `(output, next_network_state)`.
      optimizer: Optimizer used for training.
      num_outer_dims: Number of outer dimensions of the agent, either 1 or 2.
        With 2, every training Tensor needs both a batch_size and a time
        dimension; with 1, only a batch_size outer dimension is required.
      epsilon_greedy: Probability of picking a random action in the default
        epsilon-greedy collect policy (only used for discrete actions).
      loss_fn: Function computing the error between the cloning network
        output and the action that was taken. It is called with
        `(experience, training)` and must return one loss value per batch
        element. When None, the loss depends on the action dtype.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: Whether debug summaries are gathered.
      summarize_grads_and_vars: When True, gradient and network variable
        summaries are written during training.
      train_step_counter: Optional counter incremented every time the train
        op is run. Defaults to the global_step.
      name: Name of this agent; all variables in the module are placed under
        it. Defaults to the class name.

    Raises:
      ValueError: If `action_spec` contains no action, or if no `loss_fn` is
        given and the action is neither a single scalar discrete action nor
        a single continuous action.
    """
    tf.Module.__init__(self, name=name)
    self._cloning_network = cloning_network
    self._optimizer = optimizer
    self._gradient_clipping = gradient_clipping

    action_spec = tensor_spec.from_spec(action_spec)
    flat_actions = tf.nest.flatten(action_spec)
    continuous_flags = [
        tensor_spec.is_continuous(spec) for spec in flat_actions
    ]

    if not flat_actions:
        raise ValueError(
            'The `action_spec` must contain at least one action.')

    # A default loss only exists for one scalar discrete action or for one
    # continuous action of any shape.
    exactly_one_action = len(flat_actions) == 1
    single_discrete_scalar_action = (exactly_one_action
                                     and flat_actions[0].shape.rank == 0
                                     and not continuous_flags[0])
    single_continuous_action = exactly_one_action and continuous_flags[0]

    if not (loss_fn or single_discrete_scalar_action
            or single_continuous_action):
        raise ValueError(
            'A `loss_fn` must be provided unless there is a single, scalar '
            'discrete action or a single (scalar or non-scalar) continuous '
            'action.')

    self._network_output_spec = cloning_network.create_variables(
        time_step_spec.observation)

    # Any continuous action (including a continuous/discrete mix, for which a
    # custom loss_fn was enforced above) is handled by the continuous,
    # actor-policy based set-up.
    if any(continuous_flags):
        policies = self._setup_as_continuous(time_step_spec, action_spec,
                                             loss_fn)
    else:
        policies = self._setup_as_discrete(time_step_spec, action_spec,
                                           loss_fn, epsilon_greedy)
    policy, collect_policy = policies

    super(BehavioralCloningAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)

    self._as_trajectory = data_converter.AsTrajectory(
        self.data_context, sequence_length=None,
        num_outer_dims=num_outer_dims)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.TensorSpec,
             actor_network: network.Network,
             optimizer: types.Optimizer,
             value_network: Optional[network.Network] = None,
             value_estimation_loss_coef: types.Float = 0.2,
             advantage_fn: Optional[AdvantageFnType] = None,
             use_advantage_loss: bool = True,
             gamma: types.Float = 1.0,
             normalize_returns: bool = True,
             gradient_clipping: Optional[types.Float] = None,
             debug_summaries: bool = False,
             summarize_grads_and_vars: bool = False,
             entropy_regularization: Optional[types.Float] = None,
             train_step_counter: Optional[tf.Variable] = None,
             name: Optional[Text] = None):
    """Creates a REINFORCE Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent.
        The network will be called with call(observation, step_type).
      optimizer: Optimizer for the actor network.
      value_network: (Optional) A `tf_agents.network.Network` to be used by
        the agent. The network will be called with
        call(observation, step_type) and returns a floating point value
        tensor. A baseline is used whenever this is not None.
      value_estimation_loss_coef: (Optional) Multiplier for value prediction
        loss to balance with policy gradient loss.
      advantage_fn: A function `A(returns, value_preds)` that takes returns
        and value function predictions as input and returns advantages. The
        default is `A(returns, value_preds) = returns - value_preds` if a
        value network is specified and `use_advantage_loss=True`, otherwise
        `A(returns, value_preds) = returns`.
      use_advantage_loss: Whether to use value function predictions for
        computing returns. `use_advantage_loss=False` is equivalent to
        setting `advantage_fn=lambda returns, value_preds: returns`.
      gamma: A discount factor for future rewards.
      normalize_returns: Whether to normalize returns across episodes when
        computing the loss.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable
        summaries will be written during training.
      entropy_regularization: Coefficient for entropy regularization loss
        term.
      train_step_counter: An optional counter to increment every time the
        train op is run. Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)

    actor_network.create_variables()
    self._actor_network = actor_network

    # Test against None (not truthiness) so that variable creation is gated
    # by exactly the same condition as `self._baseline` below.
    if value_network is not None:
        value_network.create_variables()
    self._value_network = value_network

    collect_policy = actor_policy.ActorPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        clip=True)
    policy = greedy_policy.GreedyPolicy(collect_policy)

    self._optimizer = optimizer
    self._gamma = gamma
    self._normalize_returns = normalize_returns
    self._gradient_clipping = gradient_clipping
    self._entropy_regularization = entropy_regularization
    self._value_estimation_loss_coef = value_estimation_loss_coef
    self._baseline = self._value_network is not None

    # Fall back to a default advantage function only when the caller did not
    # supply one: subtract the baseline if there is one and it is wanted,
    # otherwise use the raw returns.
    self._advantage_fn = advantage_fn
    if self._advantage_fn is None:
        if use_advantage_loss and self._baseline:
            self._advantage_fn = (
                lambda returns, value_preds: returns - value_preds)
        else:
            self._advantage_fn = lambda returns, _: returns

    super(ReinforceAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)

    self._as_trajectory = data_converter.AsTrajectory(self.data_context)
def __init__(
        self,
        time_step_spec: ts.TimeStep,
        action_spec: types.NestedTensorSpec,
        q_network: network.Network,
        optimizer: types.Optimizer,
        observation_and_action_constraint_splitter: Optional[
            types.Splitter] = None,
        epsilon_greedy: Optional[types.Float] = 0.1,
        n_step_update: int = 1,
        boltzmann_temperature: Optional[types.Float] = None,
        emit_log_probability: bool = False,
        # Params for target network updates
        target_q_network: Optional[network.Network] = None,
        target_update_tau: types.Float = 1.0,
        target_update_period: int = 1,
        # Params for training.
        td_errors_loss_fn: Optional[types.LossFn] = None,
        gamma: types.Float = 1.0,
        reward_scale_factor: types.Float = 1.0,
        gradient_clipping: Optional[types.Float] = None,
        # Params for debugging
        debug_summaries: bool = False,
        summarize_grads_and_vars: bool = False,
        train_step_counter: Optional[tf.Variable] = None,
        name: Optional[Text] = None):
    """Creates a DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with `call(observation, step_type)` and
        should emit logits over the action space.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can
        indicate, for example, a mask of valid/invalid actions for a given
        state of the environment. The function takes in a full observation
        and returns a tuple consisting of 1) the part of the observation
        intended as input to the network and 2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:

        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```

        *Note*: when using `observation_and_action_constraint_splitter`,
        make sure the provided `q_network` is compatible with the
        network-specific half of the output of the
        `observation_and_action_constraint_splitter`. In particular,
        `observation_and_action_constraint_splitter` will be called on the
        observation before passing to the network. If
        `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      epsilon_greedy: Probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not
        provided to the collect_policy method). Must be set to None when
        `boltzmann_temperature` is given.
      n_step_update: The number of steps to consider when computing TD error
        and TD loss. Defaults to single-step updates. Note that this
        requires the user to call train on Trajectory objects with a time
        dimension of `n_step_update + 1`. However, note that we do not yet
        support `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling
        of the actions during data collection. The closer to 0.0, the
        higher the probability of choosing the best action.
      emit_log_probability: Whether policies emit log probabilities or not.
      target_q_network: (Optional.) A `tf_agents.network.Network` to be used
        as the target network during Q learning. Every
        `target_update_period` train steps, the weights from `q_network`
        are copied (possibly with smoothing via `target_update_tau`) to
        `target_q_network`.

        If `target_q_network` is not provided, it is created by making a
        copy of `q_network`, which initializes a new network with the same
        structure and its own layers and weights.

        Network copying is performed via the `Network.copy` superclass
        method, and may inadvertently lead to the resulting network to
        share weights with the original. This can happen if, for example,
        the original network accepted a pre-built Keras layer in its
        `__init__`, or accepted a Keras layer that wasn't built, but
        neglected to create a new copy.

        In these cases, it is up to you to provide a target Network having
        weights that are not shared with the original `q_network`. If you
        provide a `target_q_network` that shares any weights with
        `q_network`, a warning will be logged but no exception is thrown.

        Note: shallow copies of Keras layers may be built via the code:

        ```python
        new_layer = type(layer).from_config(layer.get_config())
        ```
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If
        None, a default value of element_wise_huber_loss is used. This
        function takes as input the target and the estimated Q values and
        returns the loss for each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable
        summaries will be written during training.
      train_step_counter: An optional counter to increment every time the
        train op is run. Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `action_spec` contains more than one action or action
        spec minimum is not equal to 0.
      ValueError: If both `epsilon_greedy` and `boltzmann_temperature` are
        not None; only one exploration scheme may be configured.
      ValueError: If the q networks do not emit floating point outputs with
        inner shape matching `action_spec`.
      NotImplementedError: If `q_network` has non-empty `state_spec` (i.e.,
        an RNN is provided) and `n_step_update > 1`.
    """
    tf.Module.__init__(self, name=name)

    self._check_action_spec(action_spec)

    if epsilon_greedy is not None and boltzmann_temperature is not None:
        raise ValueError(
            'Configured both epsilon_greedy value {} and temperature {}, '
            'however only one of them can be used for exploration.'.format(
                epsilon_greedy, boltzmann_temperature))

    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._q_network = q_network

    # The networks only see the network-input half of the observation when
    # a splitter is configured.
    net_observation_spec = time_step_spec.observation
    if observation_and_action_constraint_splitter is not None:
        net_observation_spec, _ = observation_and_action_constraint_splitter(
            net_observation_spec)

    q_network.create_variables(net_observation_spec)
    if target_q_network is not None:
        target_q_network.create_variables(net_observation_spec)
    self._target_q_network = common.maybe_copy_target_network_with_checks(
        self._q_network,
        target_q_network,
        input_spec=net_observation_spec,
        name='TargetQNetwork')

    self._check_network_output(self._q_network, 'q_network')
    self._check_network_output(self._target_q_network, 'target_q_network')

    self._epsilon_greedy = epsilon_greedy
    self._n_step_update = n_step_update
    self._boltzmann_temperature = boltzmann_temperature
    self._optimizer = optimizer
    self._td_errors_loss_fn = (
        td_errors_loss_fn or common.element_wise_huber_loss)
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(
        target_update_tau, target_update_period)

    policy, collect_policy = self._setup_policy(
        time_step_spec, action_spec, boltzmann_temperature,
        emit_log_probability)

    if q_network.state_spec and n_step_update != 1:
        raise NotImplementedError(
            'DqnAgent does not currently support n-step updates with stateful '
            'networks (i.e., RNNs), but n_step_update = {}'.format(
                n_step_update))

    # Stateless networks train on fixed-length windows of n_step_update + 1
    # steps; stateful networks leave the sequence length unconstrained.
    train_sequence_length = (
        n_step_update + 1 if not q_network.state_spec else None)

    super(DqnAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter,
        validate_args=False,
    )

    if q_network.state_spec:
        # AsNStepTransition does not support emitting [B, T, ...] tensors,
        # which we need for DQN-RNN.
        self._as_transition = data_converter.AsTransition(
            self.data_context, squeeze_time_dim=False)
    else:
        # This reduces the n-step return and removes the extra time
        # dimension, allowing the rest of the computations to be independent
        # of the n-step parameter.
        self._as_transition = data_converter.AsNStepTransition(
            self.data_context, gamma=gamma, n=n_step_update)
def __init__(self,
             observation_spec,
             action_spec,
             actor_network: DistributionNetwork,
             critic_network: Network,
             gamma=0.99,
             ou_stddev=0.2,
             ou_damping=0.15,
             actor_optimizer=None,
             critic_optimizer=None,
             target_update_tau=0.05,
             target_update_period=10,
             dqda_clipping=None,
             gradient_clipping=None,
             debug_summaries=False,
             name="SarsaAlgorithm"):
    """Create a SarsaAlgorithm.

    Args:
        observation_spec (nested TensorSpec): spec for observation.
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network (Network|DistributionNetwork): The network will be
            called with call(observation, step_type). If it is
            DistributionNetwork an action will be sampled.
        critic_network (Network): The network will be called with
            call(observation, action, step_type).
        gamma (float): discount rate for reward
        ou_stddev (float): Only used for DDPG. Standard deviation for the
            Ornstein-Uhlenbeck (OU) noise added in the default collect
            policy.
        ou_damping (float): Only used for DDPG. Damping factor for the OU
            noise added in the default collect policy.
        actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
        critic_optimizer (tf.optimizers.Optimizer): The optimizer for
            critic.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            [-dqda_clipping, dqda_clipping]. Does not perform clipping if
            dqda_clipping == 0.
        gradient_clipping (float): Norm length to clip gradients.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.

    Raises:
        ValueError: If `actor_network` is neither a DistributionNetwork nor
            a Network.
    """
    # DistributionNetwork is tested before Network, as in the original
    # ordering of the checks.
    if isinstance(actor_network, DistributionNetwork):
        distribution_spec = actor_network.output_spec
    elif isinstance(actor_network, Network):
        distribution_spec = action_spec
    else:
        raise ValueError("Expect DistributionNetwork or Network for"
                         " `actor_network`, got %s" % type(actor_network))
    self._action_distribution_spec = distribution_spec

    # State carried while predicting: only the actor's own state plus the
    # previous observation / step type.
    predict_state = SarsaState(
        prev_observation=observation_spec,
        prev_step_type=tf.TensorSpec((), tf.int32),
        actor=actor_network.state_spec)
    # Training additionally tracks the critic and both target networks.
    train_state = SarsaState(
        prev_observation=observation_spec,
        prev_step_type=tf.TensorSpec((), tf.int32),
        actor=actor_network.state_spec,
        target_actor=actor_network.state_spec,
        critic=critic_network.state_spec,
        target_critic=critic_network.state_spec,
    )

    super().__init__(
        observation_spec,
        action_spec,
        predict_state_spec=predict_state,
        train_state_spec=train_state,
        optimizer=[actor_optimizer, critic_optimizer],
        trainable_module_sets=[[actor_network], [critic_network]],
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        name=name)

    self._actor_network = actor_network
    self._critic_network = critic_network

    target_actor = actor_network.copy(name='target_actor_network')
    target_critic = critic_network.copy(name='target_critic_network')
    self._target_actor_network = target_actor
    self._target_critic_network = target_critic

    self._update_target = common.get_target_updater(
        models=[self._actor_network, self._critic_network],
        target_models=[
            self._target_actor_network, self._target_critic_network
        ],
        tau=target_update_tau,
        period=target_update_period)

    self._dqda_clipping = dqda_clipping
    self._gamma = gamma
    self._ou_process = create_ou_process(action_spec, ou_stddev, ou_damping)
def __init__(
        self,
        time_step_spec: ts.TimeStep,
        action_spec: types.NestedTensorSpec,
        q_network: network.Network,
        emit_log_probability: bool = False,
        observation_and_action_constraint_splitter: Optional[
            types.Splitter] = None,
        validate_action_spec_and_network: bool = True,
        name: Optional[Text] = None):
    """Builds a Q-Policy given a q_network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: An instance of a `tf_agents.network.Network`, callable via
        `network(observation, step_type) -> (output, final_state)`.
      emit_log_probability: Whether to emit log-probs in info of
        `PolicyStep`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can
        indicate, for example, a mask of valid/invalid actions for a given
        state of the environment. The function takes in a full observation
        and returns a tuple consisting of 1) the part of the observation
        intended as input to the network and 2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:

        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```

        *Note*: when using `observation_and_action_constraint_splitter`,
        make sure the provided `q_network` is compatible with the
        network-specific half of the output of the
        `observation_and_action_constraint_splitter`. In particular,
        `observation_and_action_constraint_splitter` will be called on the
        observation before passing to the network. If
        `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      validate_action_spec_and_network: If `True` (default), action_spec is
        checked to make sure it is a single scalar spec with a minimum of
        zero. Also validates that the network's output matches the spec.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `q_network.action_spec` exists and is not compatible
        with `action_spec`.
      ValueError: If `action_spec` contains more than one spec.
      ValueError: If validation is enabled and the action spec is not a
        scalar or its minimum is not 0.
    """
    action_spec = tensor_spec.from_spec(action_spec)
    time_step_spec = tensor_spec.from_spec(time_step_spec)

    # Only networks that advertise an `action_spec` are cross-checked.
    declared_spec = getattr(q_network, 'action_spec', None)
    if declared_spec is not None:
        action_spec = cast(tf.TypeSpec, action_spec)
        if not action_spec.is_compatible_with(declared_spec):
            raise ValueError(
                'action_spec must be compatible with q_network.action_spec; '
                'instead got action_spec=%s, q_network.action_spec=%s' % (
                    action_spec, declared_spec))

    scalar_only_template = (
        'Only scalar actions are supported now, but action spec is: {}')

    flat_specs = tf.nest.flatten(action_spec)
    if len(flat_specs) > 1:
        raise ValueError(scalar_only_template.format(action_spec))

    if validate_action_spec_and_network:
        spec = flat_specs[0]
        if spec.shape.rank > 0:
            raise ValueError(scalar_only_template.format(action_spec))
        if spec.minimum != 0:
            raise ValueError(
                'Action specs should have minimum of 0, but saw: {0}'.format(
                    spec))

        # The network must emit one floating point value per action.
        num_actions = spec.maximum - spec.minimum + 1
        network_output_spec = q_network.create_variables()
        network_utils.check_single_floating_network_output(
            network_output_spec, (num_actions,), str(q_network))

    # We need to maintain the flat action spec for dtype, shape and range.
    self._flat_action_spec = flat_specs[0]
    self._q_network = q_network

    super(QPolicy, self).__init__(
        time_step_spec,
        action_spec,
        policy_state_spec=q_network.state_spec,
        clip=False,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        name=name)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             q_network: network.Network,
             emit_log_probability: bool = False,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             name: Optional[Text] = None):
    """Builds a Q-Policy given a q_network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: An instance of a `tf_agents.network.Network`, callable via
        `network(observation, step_type) -> (output, final_state)`.
      emit_log_probability: Whether to emit log-probs in info of
        `PolicyStep`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can
        indicate, for example, a mask of valid/invalid actions for a given
        state of the environment. The function takes in a full observation
        and returns a tuple consisting of 1) the part of the observation
        intended as input to the network and 2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:

        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```

        *Note*: when using `observation_and_action_constraint_splitter`,
        make sure the provided `q_network` is compatible with the
        network-specific half of the output of the
        `observation_and_action_constraint_splitter`. In particular,
        `observation_and_action_constraint_splitter` will be called on the
        observation before passing to the network. If
        `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `q_network.action_spec` exists and is not compatible
        with `action_spec`.
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec`.
    """
    # Only networks that advertise an `action_spec` are cross-checked.
    declared_spec = getattr(q_network, 'action_spec', None)
    incompatible = (
        declared_spec is not None
        and not action_spec.is_compatible_with(declared_spec))
    if incompatible:
        raise ValueError(
            'action_spec must be compatible with q_network.action_spec; '
            'instead got action_spec=%s, q_network.action_spec=%s' %
            (action_spec, declared_spec))

    flat_specs = tf.nest.flatten(action_spec)
    if len(flat_specs) > 1:
        raise NotImplementedError(
            'action_spec can only contain a single BoundedTensorSpec.')

    # We need to maintain the flat action spec for dtype, shape and range.
    self._flat_action_spec = flat_specs[0]

    q_network.create_variables()
    self._q_network = q_network

    super(QPolicy, self).__init__(
        time_step_spec,
        action_spec,
        policy_state_spec=q_network.state_spec,
        clip=False,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        name=name)