def _setup_policy(self, time_step_spec, action_spec, boltzmann_temperature,
                  emit_log_probability):
  policy = categorical_q_policy.CategoricalQPolicy(
      time_step_spec,
      action_spec,
      self._q_network,
      self._min_q_value,
      self._max_q_value,
      observation_and_action_constraint_splitter=(
          self._observation_and_action_constraint_splitter))

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  policy = greedy_policy.GreedyPolicy(policy)

  target_policy = categorical_q_policy.CategoricalQPolicy(
      time_step_spec,
      action_spec,
      self._target_q_network,
      self._min_q_value,
      self._max_q_value,
      observation_and_action_constraint_splitter=(
          self._observation_and_action_constraint_splitter))
  self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

  return policy, collect_policy
def _setup_policy(self, time_step_spec, action_spec, boltzmann_temperature,
                  emit_log_probability):
  policy = q_policy.QPolicy(
      time_step_spec,
      action_spec,
      q_network=self._q_network,
      emit_log_probability=emit_log_probability,
      observation_and_action_constraint_splitter=(
          self._observation_and_action_constraint_splitter))

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  policy = greedy_policy.GreedyPolicy(policy)

  # Create self._target_greedy_policy in order to compute target Q-values.
  target_policy = q_policy.QPolicy(
      time_step_spec,
      action_spec,
      q_network=self._target_q_network,
      observation_and_action_constraint_splitter=(
          self._observation_and_action_constraint_splitter))
  self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

  return policy, collect_policy
def testBuild(self):
  wrapped = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  policy = boltzmann_policy.BoltzmannPolicy(wrapped, temperature=0.9)
  self.assertEqual(policy.time_step_spec, self._time_step_spec)
  self.assertEqual(policy.action_spec, self._action_spec)
def testDistribution(self):
  tf.compat.v1.set_random_seed(1)
  wrapped = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  policy = boltzmann_policy.BoltzmannPolicy(wrapped, temperature=0.9)
  observations = tf.constant([[1, 2]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=1)
  distribution_step = policy.distribution(time_step)
  mode = distribution_step.action.mode()
  self.evaluate(tf.compat.v1.global_variables_initializer())
  # The weights of index 0 are all 1 and the weights of index 1 are all 1.5,
  # so the Q values of index 1 will be higher.
  self.assertAllEqual([[1]], self.evaluate(mode))
def testAction(self):
  tf.compat.v1.set_random_seed(1)
  wrapped = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  policy = boltzmann_policy.BoltzmannPolicy(wrapped, temperature=0.9)
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2, 1])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [[1], [1]])
def testLogits(self):
  tf.compat.v1.set_random_seed(1)
  wrapped = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  policy = boltzmann_policy.BoltzmannPolicy(wrapped, temperature=0.5)
  observations = tf.constant([[1, 2]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=1)
  distribution_step = policy.distribution(time_step)
  logits = distribution_step.action.logits
  original_logits = wrapped.distribution(time_step).action.logits
  self.evaluate(tf.compat.v1.global_variables_initializer())
  # The untempered logits are 4 and 5.5, because they are (1, 2) . (1, 1) + 1
  # and (1, 2) . (1.5, 1.5) + 1. With a temperature of 0.5, the tempered
  # logits are double that.
  self.assertAllEqual([[[4., 5.5]]], self.evaluate(original_logits))
  self.assertAllEqual([[[8., 11.]]], self.evaluate(logits))
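The tests above rely on a DummyNet fixture that is defined elsewhere in the test module. For readability, here is a minimal, hypothetical sketch of what such a fixture could look like, chosen only so that it reproduces the Q-values described in the test comments (Q(obs=[1, 2]) == [4.0, 5.5]); the spec and layer details are assumptions, not the actual fixture.

# Hypothetical sketch of a DummyNet consistent with the test comments above.
import tensorflow as tf
from tf_agents.networks import network
from tf_agents.specs import tensor_spec


class DummyNet(network.Network):

  def __init__(self, name=None):
    super(DummyNet, self).__init__(
        input_tensor_spec=tensor_spec.TensorSpec([2], tf.float32),
        state_spec=(),
        name=name)
    # Action 0 uses weights of 1, action 1 uses weights of 1.5, bias is 1,
    # so an observation of [1, 2] yields Q-values [4.0, 5.5].
    self._layer = tf.keras.layers.Dense(
        2,
        kernel_initializer=tf.constant_initializer([[1.0, 1.5], [1.0, 1.5]]),
        bias_initializer=tf.constant_initializer([1.0, 1.0]))

  def call(self, observations, step_type=None, network_state=()):
    del step_type  # Unused by this dummy network.
    return self._layer(observations), network_state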
def _setup_policy(self, time_step_spec, action_spec, boltzmann_temperature,
                  emit_log_probability):
  policy = q_policy.QPolicy(
      time_step_spec,
      action_spec,
      q_network=self._q_network,
      emit_log_probability=emit_log_probability)

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  policy = greedy_policy.GreedyPolicy(policy)

  return policy, collect_policy
def __init__(
    self,
    time_step_spec,
    action_spec,
    q_network,
    optimizer,
    epsilon_greedy=0.1,
    boltzmann_temperature=None,
    # Params for target network updates
    target_update_tau=1.0,
    target_update_period=1,
    # Params for training.
    td_errors_loss_fn=None,
    gamma=1.0,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for debugging
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    name=None):
  """Creates a DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    q_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, step_type).
    optimizer: The optimizer to use for training.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling of
      the actions during data collection. The closer to 0.0, the higher the
      probability of choosing the best action.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of element_wise_huber_loss is used. This function takes
      as input the target and the estimated Q values and returns the loss for
      each element of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    ValueError: If the action spec contains more than one action or action
      spec minimum is not equal to 0.
  """
  tf.Module.__init__(self, name=name)

  flat_action_spec = tf.nest.flatten(action_spec)
  self._num_actions = [
      spec.maximum - spec.minimum + 1 for spec in flat_action_spec
  ]

  # TODO(oars): Get DQN working with more than one dim in the actions.
  if len(flat_action_spec) > 1 or flat_action_spec[0].shape.ndims > 1:
    raise ValueError('Only one dimensional actions are supported now.')

  if not all(spec.minimum == 0 for spec in flat_action_spec):
    raise ValueError(
        'Action specs should have minimum of 0, but saw: {0}'.format(
            [spec.minimum for spec in flat_action_spec]))

  if epsilon_greedy is not None and boltzmann_temperature is not None:
    raise ValueError(
        'Configured both epsilon_greedy value {} and temperature {}, '
        'however only one of them can be used for exploration.'.format(
            epsilon_greedy, boltzmann_temperature))

  self._q_network = q_network
  self._target_q_network = self._q_network.copy(name='TargetQNetwork')
  self._epsilon_greedy = epsilon_greedy
  self._boltzmann_temperature = boltzmann_temperature
  self._optimizer = optimizer
  self._td_errors_loss_fn = td_errors_loss_fn or element_wise_huber_loss
  self._gamma = gamma
  self._reward_scale_factor = reward_scale_factor
  self._gradient_clipping = gradient_clipping
  self._update_target = self._get_target_updater(
      target_update_tau, target_update_period)

  policy = q_policy.QPolicy(
      time_step_spec, action_spec, q_network=self._q_network)

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  policy = greedy_policy.GreedyPolicy(policy)

  super(DqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=2 if not q_network.state_spec else None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)
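A constructor like the one above is typically exercised as in the following minimal sketch. The specs, layer sizes, and learning rate are illustrative assumptions, not values taken from the agent code.

# Illustrative only: constructing a DqnAgent with a small Q-network.
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tf.TensorSpec([4], tf.float32, name='observation')
action_spec = tensor_spec.BoundedTensorSpec(
    [], tf.int32, minimum=0, maximum=1, name='action')
time_step_spec = ts.time_step_spec(observation_spec)

q_net = q_network.QNetwork(
    observation_spec, action_spec, fc_layer_params=(32,))

agent = dqn_agent.DqnAgent(
    time_step_spec,
    action_spec,
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    # epsilon_greedy and boltzmann_temperature are mutually exclusive.
    epsilon_greedy=0.1)
agent.initialize()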
def __init__(
    self,
    time_step_spec,
    action_spec,
    categorical_q_network,
    optimizer,
    min_q_value=-10.0,
    max_q_value=10.0,
    epsilon_greedy=0.1,
    n_step_update=1,
    boltzmann_temperature=None,
    # Params for target network updates
    target_update_tau=1.0,
    target_update_period=1,
    # Params for training.
    td_errors_loss_fn=None,
    gamma=1.0,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for debugging
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    name=None):
  """Creates a Categorical DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A `BoundedTensorSpec` representing the actions.
    categorical_q_network: A categorical_q_network.CategoricalQNetwork that
      returns the q_distribution for each action.
    optimizer: The optimizer to use for training.
    min_q_value: A float specifying the minimum Q-value, used for setting up
      the support.
    max_q_value: A float specifying the maximum Q-value, used for setting up
      the support.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Defaults to single-step updates. Note that this requires
      the user to call train on Trajectory objects with a time dimension of
      `n_step_update + 1`. However, note that we do not yet support
      `n_step_update > 1` in the case of RNNs (i.e., non-empty
      `q_network.state_spec`).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling of
      the actions during data collection. The closer to 0.0, the higher the
      probability of choosing the best action.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of element_wise_huber_loss is used. This function takes
      as input the target and the estimated Q values and returns the loss for
      each element of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    TypeError: If the action spec contains more than one action.
  """
  num_atoms = getattr(categorical_q_network, 'num_atoms', None)
  if num_atoms is None:
    raise TypeError(
        'Expected categorical_q_network to have property '
        '`num_atoms`, but it doesn\'t (note: you likely want to '
        'use a CategoricalQNetwork). Network is: %s' %
        (categorical_q_network,))

  self._num_atoms = num_atoms
  self._min_q_value = min_q_value
  self._max_q_value = max_q_value
  self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

  super(CategoricalDqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      categorical_q_network,
      optimizer,
      epsilon_greedy=epsilon_greedy,
      n_step_update=n_step_update,
      boltzmann_temperature=boltzmann_temperature,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      td_errors_loss_fn=td_errors_loss_fn,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      gradient_clipping=gradient_clipping,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter,
      name=name)

  policy = categorical_q_policy.CategoricalQPolicy(
      min_q_value, max_q_value, self._q_network, self._action_spec)

  if boltzmann_temperature is not None:
    self._collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  self._policy = greedy_policy.GreedyPolicy(policy)
def __init__(
    self,
    time_step_spec,
    action_spec,
    categorical_q_network,
    optimizer,
    min_q_value=-10.0,
    max_q_value=10.0,
    epsilon_greedy=0.1,
    n_step_update=1,
    boltzmann_temperature=None,
    # Params for target network updates
    target_categorical_q_network=None,
    target_update_tau=1.0,
    target_update_period=1,
    # Params for training.
    td_errors_loss_fn=None,
    gamma=1.0,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for debugging
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    name=None):
  """Creates a Categorical DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A `BoundedTensorSpec` representing the actions.
    categorical_q_network: A categorical_q_network.CategoricalQNetwork that
      returns the q_distribution for each action.
    optimizer: The optimizer to use for training.
    min_q_value: A float specifying the minimum Q-value, used for setting up
      the support.
    max_q_value: A float specifying the maximum Q-value, used for setting up
      the support.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Defaults to single-step updates. Note that this requires
      the user to call train on Trajectory objects with a time dimension of
      `n_step_update + 1`. However, note that we do not yet support
      `n_step_update > 1` in the case of RNNs (i.e., non-empty
      `q_network.state_spec`).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling of
      the actions during data collection. The closer to 0.0, the higher the
      probability of choosing the best action.
    target_categorical_q_network: (Optional.) A `tf_agents.network.Network`
      to be used as the target network during Q learning. Every
      `target_update_period` train steps, the weights from
      `categorical_q_network` are copied (possibly with smoothing via
      `target_update_tau`) to `target_categorical_q_network`. If
      `target_categorical_q_network` is not provided, it is created by making
      a copy of `categorical_q_network`, which initializes a new network with
      the same structure and its own layers and weights. Network copying is
      performed via the `Network.copy` superclass method, and may
      inadvertently lead to the resulting network sharing weights with the
      original. This can happen if, for example, the original network
      accepted a pre-built Keras layer in its `__init__`, or accepted a Keras
      layer that wasn't built, but neglected to create a new copy. In these
      cases, it is up to you to provide a target Network having weights that
      are not shared with the original `categorical_q_network`. If you
      provide a `target_categorical_q_network` that shares any weights with
      `categorical_q_network`, a warning will be logged but no exception is
      thrown. Note: shallow copies of Keras layers may be built via the code:

      ```python
      new_layer = type(layer).from_config(layer.get_config())
      ```
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of huber_loss is used. This function takes as input the
      target and the estimated Q values and returns the loss for each element
      of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    TypeError: If the action spec contains more than one action.
  """
  super(CategoricalDqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      categorical_q_network,
      optimizer,
      epsilon_greedy=epsilon_greedy,
      n_step_update=n_step_update,
      boltzmann_temperature=boltzmann_temperature,
      target_q_network=target_categorical_q_network,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      td_errors_loss_fn=td_errors_loss_fn,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      gradient_clipping=gradient_clipping,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter,
      name=name)

  def check_atoms(net, label):
    num_atoms = getattr(net, 'num_atoms', None)
    if num_atoms is None:
      raise TypeError(
          'Expected {} to have property `num_atoms`, but it '
          'doesn\'t (note: you likely want to use a '
          'CategoricalQNetwork). Network is: {}'.format(label, net))
    return num_atoms

  num_atoms = check_atoms(self._q_network, 'categorical_q_network')
  target_num_atoms = check_atoms(
      self._target_q_network, 'target_categorical_q_network')
  if num_atoms != target_num_atoms:
    raise ValueError(
        'categorical_q_network and target_categorical_q_network have '
        'different numbers of atoms: {} vs. {}'.format(
            num_atoms, target_num_atoms))
  self._num_atoms = num_atoms

  self._min_q_value = min_q_value
  self._max_q_value = max_q_value
  self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

  policy = categorical_q_policy.CategoricalQPolicy(
      min_q_value, max_q_value, self._q_network, self._action_spec)

  if boltzmann_temperature is not None:
    self._collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  self._policy = greedy_policy.GreedyPolicy(policy)

  # Create self._target_greedy_policy in order to compute target Q-values.
  target_policy = categorical_q_policy.CategoricalQPolicy(
      min_q_value, max_q_value, self._target_q_network, self._action_spec)
  self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)
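For intuition about the fixed support created with `tf.linspace` above, the sketch below (illustrative shapes and values only, not taken from the agent code) shows how a categorical distribution over `num_atoms` return values is collapsed into a scalar Q-value per action, which is what the greedy policy then maximizes.

# Illustrative sketch of the C51-style support and expected Q-values.
import tensorflow as tf

min_q_value, max_q_value, num_atoms = -10.0, 10.0, 51
support = tf.linspace(min_q_value, max_q_value, num_atoms)  # shape [51]

# Hypothetical network output: logits over atoms for each of 3 actions.
logits = tf.random.normal([2, 3, num_atoms])        # [batch, actions, atoms]
probabilities = tf.nn.softmax(logits, axis=-1)

# Expected return per action: sum over atoms of p(atom) * atom value.
q_values = tf.reduce_sum(probabilities * support, axis=-1)  # [batch, actions]
greedy_actions = tf.argmax(q_values, axis=-1)               # [batch]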
def __init__(
    self,
    time_step_spec,
    action_spec,
    q_network,
    optimizer,
    epsilon_greedy=0.1,
    n_step_update=1,
    boltzmann_temperature=None,
    emit_log_probability=False,
    update_period=None,
    # Params for target network updates
    target_update_tau=1.0,
    target_update_period=1,
    # Params for training.
    td_errors_loss_fn=None,
    gamma=1.0,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for debugging
    debug_summaries=False,
    enable_functions=True,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    name=None):
  """Creates a DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    q_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, step_type).
    optimizer: The optimizer to use for training.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Defaults to single-step updates. Note that this requires
      the user to call train on Trajectory objects with a time dimension of
      `n_step_update + 1`. However, note that we do not yet support
      `n_step_update > 1` in the case of RNNs (i.e., non-empty
      `q_network.state_spec`).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling of
      the actions during data collection. The closer to 0.0, the higher the
      probability of choosing the best action.
    emit_log_probability: Whether policies emit log probabilities or not.
    update_period: Update period.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of element_wise_huber_loss is used. This function takes
      as input the target and the estimated Q values and returns the loss for
      each element of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    enable_functions: A bool to decide whether or not to enable tf function.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    ValueError: If the action spec contains more than one action or action
      spec minimum is not equal to 0.
    NotImplementedError: If `q_network` has non-empty `state_spec` (i.e., an
      RNN is provided) and `n_step_update > 1`.
  """
  tf.Module.__init__(self, name=name)

  flat_action_spec = tf.nest.flatten(action_spec)
  self._num_actions = [
      spec.maximum - spec.minimum + 1 for spec in flat_action_spec
  ]

  if len(flat_action_spec) > 1 or flat_action_spec[0].shape.ndims > 1:
    raise ValueError('Only one dimensional actions are supported now.')

  if not all(spec.minimum == 0 for spec in flat_action_spec):
    raise ValueError(
        'Action specs should have minimum of 0, but saw: {0}'.format(
            [spec.minimum for spec in flat_action_spec]))

  if epsilon_greedy is not None and boltzmann_temperature is not None:
    raise ValueError(
        'Configured both epsilon_greedy value {} and temperature {}, '
        'however only one of them can be used for exploration.'.format(
            epsilon_greedy, boltzmann_temperature))

  self._q_network = q_network
  self._target_q_network = self._q_network.copy(name='TargetQNetwork')
  self._epsilon_greedy = epsilon_greedy
  self._n_step_update = n_step_update
  self._boltzmann_temperature = boltzmann_temperature
  self._optimizer = optimizer
  self._td_errors_loss_fn = td_errors_loss_fn or element_wise_huber_loss
  self._gamma = gamma
  self._reward_scale_factor = reward_scale_factor
  self._gradient_clipping = gradient_clipping
  self._update_target = self._get_target_updater(
      target_update_tau, target_update_period)

  policy = q_policy.QPolicy(
      time_step_spec,
      action_spec,
      q_network=self._q_network,
      emit_log_probability=emit_log_probability)

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        policy, temperature=self._boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        policy, epsilon=self._epsilon_greedy)
  policy = greedy_policy.GreedyPolicy(policy)

  if q_network.state_spec and n_step_update != 1:
    raise NotImplementedError(
        'DqnAgent does not currently support n-step updates with stateful '
        'networks (i.e., RNNs), but n_step_update = {}'.format(n_step_update))

  train_sequence_length = (
      n_step_update + 1 if not q_network.state_spec else None)

  super(DqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=train_sequence_length,
      update_period=update_period,
      debug_summaries=debug_summaries,
      enable_functions=enable_functions,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)

  tf.compat.v1.summary.scalar(
      'epsilon/' + self.name,
      self._epsilon_greedy,
      collections=['train_' + self.name])
def __init__(self,
             time_step_spec,
             action_spec,
             actor_network,
             q_network,
             actor_optimizer,
             critic_optimizer,
             exploration_noise_std=0.1,
             boltzmann_temperature=None,
             epsilon_greedy=0.1,
             q_network_2=None,
             target_actor_network=None,
             target_q_network=None,
             target_q_network_2=None,
             target_update_tau=1.0,
             target_update_period=1,
             actor_update_period=1,
             dqda_clipping=None,
             td_errors_loss_fn=None,
             gamma=1.0,
             reward_scale_factor=1.0,
             target_policy_noise=0.2,
             target_policy_noise_clip=0.5,
             gradient_clipping=None,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             train_step_counter=None,
             action_params_mask=None,
             n_step_update=1,
             name=None):
  """Creates a Td3Agent Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A namedtuple of nested BoundedTensorSpec representing the
      actions.
    actor_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, step_type).
    q_network: A tf_agents.network.Network to be used by the agent. The
      network will be called with call(observation, action, step_type).
    actor_optimizer: The default optimizer to use for the actor network.
    critic_optimizer: The default optimizer to use for the critic network.
    exploration_noise_std: Scale factor on exploration policy noise.
    q_network_2: (Optional.) A `tf_agents.network.Network` to be used as the
      second critic network during Q learning. The weights from `q_network`
      are copied if this is not provided.
    target_actor_network: (Optional.) A `tf_agents.network.Network` to be
      used as the target actor network during Q learning. Every
      `target_update_period` train steps, the weights from `actor_network`
      are copied (possibly with smoothing via `target_update_tau`) to
      `target_actor_network`. If `target_actor_network` is not provided, it
      is created by making a copy of `actor_network`, which initializes a new
      network with the same structure and its own layers and weights.
      Performing a `Network.copy` does not work when the network instance
      already has trainable parameters (e.g., has already been built, or when
      the network is sharing layers with another). In these cases, it is up
      to you to build a copy having weights that are not shared with the
      original `actor_network`, so that this can be used as a target network.
      If you provide a `target_actor_network` that shares any weights with
      `actor_network`, a warning will be logged but no exception is thrown.
    target_q_network: (Optional.) Similar network as target_actor_network but
      for the q_network. See documentation for target_actor_network.
    target_q_network_2: (Optional.) Similar network as target_actor_network
      but for the q_network_2. See documentation for target_actor_network.
      Will only be used if `q_network_2` is also specified.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    actor_update_period: Period for the optimization step on actor network.
    dqda_clipping: A scalar or float that clips the gradient dqda
      element-wise between [-dqda_clipping, dqda_clipping]. Default is None,
      representing no clipping.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of elementwise huber_loss is used.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    target_policy_noise: Scale factor on target action noise.
    target_policy_noise_clip: Value to clip noise.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    action_params_mask: A mask of continuous parameter actions for discrete
      actions.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.
  """
  tf.Module.__init__(self, name=name)
  self._actor_network = actor_network
  self._target_actor_network = common.maybe_copy_target_network_with_checks(
      self._actor_network, target_actor_network, 'TargetActorNetwork')

  # The critic networks here are Q-networks.
  self._q_network_1 = q_network
  self._target_q_network_1 = (
      common.maybe_copy_target_network_with_checks(
          self._q_network_1, target_q_network, 'TargetCriticNetwork1'))

  if q_network_2 is not None:
    self._q_network_2 = q_network_2
  else:
    self._q_network_2 = q_network.copy(name='CriticNetwork2')
    # Do not use target_q_network_2 if q_network_2 is None.
    target_q_network_2 = None
  self._target_q_network_2 = (
      common.maybe_copy_target_network_with_checks(
          self._q_network_2, target_q_network_2, 'TargetCriticNetwork2'))

  self._actor_optimizer = actor_optimizer
  self._critic_optimizer = critic_optimizer
  self._exploration_noise_std = exploration_noise_std
  self._epsilon_greedy = epsilon_greedy
  self._boltzmann_temperature = boltzmann_temperature
  self._target_update_tau = target_update_tau
  self._target_update_period = target_update_period
  self._actor_update_period = actor_update_period
  self._dqda_clipping = dqda_clipping
  self._td_errors_loss_fn = (
      td_errors_loss_fn or common.element_wise_huber_loss)
  self._gamma = gamma
  self._reward_scale_factor = reward_scale_factor
  self._target_policy_noise = target_policy_noise
  self._target_policy_noise_clip = target_policy_noise_clip
  self._gradient_clipping = gradient_clipping

  self._update_target = self._get_target_updater(
      target_update_tau, target_update_period)

  policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec.actor_network,
      actor_network=self._actor_network,
      clip=True)
  policy = mixed_q_policy.MixedQPolicy(
      policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec.q_network,
      q_network=q_network)

  collect_policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec.actor_network,
      actor_network=self._actor_network,
      clip=False)
  collect_policy = gaussian_policy.GaussianPolicy(
      collect_policy,
      scale=self._exploration_noise_std,
      clip=True)
  collect_policy = mixed_q_policy.MixedQPolicy(
      collect_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec.q_network,
      q_network=q_network)

  if boltzmann_temperature is not None:
    collect_policy = boltzmann_policy.BoltzmannPolicy(
        collect_policy, temperature=self._boltzmann_temperature)
  else:
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        collect_policy, epsilon=self._epsilon_greedy)

  # Create self._target_greedy_policy in order to compute target Q-values.
  target_policy = actor_policy.ActorPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec.actor_network,
      actor_network=self._target_actor_network,
      clip=True)
  target_policy = mixed_q_policy.MixedQPolicy(
      target_policy,
      time_step_spec=time_step_spec,
      action_spec=action_spec.q_network,
      q_network=self._target_q_network_1)
  self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

  self._action_params_mask = action_params_mask
  self._n_step_update = n_step_update

  if action_spec.actor_network is not None and action_params_mask is None:
    raise ValueError('action_params_mask is required for actor network')

  super(MixedTd3Agent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=2 if not self._actor_network.state_spec else None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)